/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1 by wakaba, Tue Oct 14 02:27:58 2008 UTC revision 1.15 by wakaba, Sat Oct 18 08:05:29 2008 UTC
# Line 2  package Whatpm::HTML::Tokenizer; Line 2  package Whatpm::HTML::Tokenizer;
2  use strict;  use strict;
3  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    
5    BEGIN {
6      require Exporter;
7      push our @ISA, 'Exporter';
8    
9      our @EXPORT_OK = qw(
10        DOCTYPE_TOKEN
11        COMMENT_TOKEN
12        START_TAG_TOKEN
13        END_TAG_TOKEN
14        END_OF_FILE_TOKEN
15        CHARACTER_TOKEN
16        PI_TOKEN
17        ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19        ATTLIST_TOKEN
20        ELEMENT_TOKEN
21        GENERAL_ENTITY_TOKEN
22        PARAMETER_ENTITY_TOKEN
23        NOTATION_TOKEN
24      );
25      
26      our %EXPORT_TAGS = (
27        token => [qw(
28          DOCTYPE_TOKEN
29          COMMENT_TOKEN
30          START_TAG_TOKEN
31          END_TAG_TOKEN
32          END_OF_FILE_TOKEN
33          CHARACTER_TOKEN
34          PI_TOKEN
35          ABORT_TOKEN
36          END_OF_DOCTYPE_TOKEN
37          ATTLIST_TOKEN
38          ELEMENT_TOKEN
39          GENERAL_ENTITY_TOKEN
40          PARAMETER_ENTITY_TOKEN
41          NOTATION_TOKEN
42        )],
43      );
44    }
45    
46    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48    ## Token types
49    
50    sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51    sub COMMENT_TOKEN () { 2 }
52    sub START_TAG_TOKEN () { 3 }
53    sub END_TAG_TOKEN () { 4 }
54    sub END_OF_FILE_TOKEN () { 5 }
55    sub CHARACTER_TOKEN () { 6 }
56    sub PI_TOKEN () { 7 } ## NOTE: XML only.
57    sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59    sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60    sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61    sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62    sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63    sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64    
65    ## XML5: XML5 has "empty tag token".  In this implementation, it is
66    ## represented as a start tag token with $self->{self_closing} flag
67    ## set to true.
68    
69    ## XML5: XML5 has "short end tag token".  In this implementation, it
70    ## is represented as an end tag token with $token->{tag_name} set
71    ## to an empty string.
72    
73  package Whatpm::HTML;  package Whatpm::HTML;
74    
75    BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77  ## Content model flags  ## Content model flags
78    
79  sub CM_ENTITY () { 0b001 } # & markup in data  sub CM_ENTITY () { 0b001 } # & markup in data
# Line 72  sub HEXREF_HEX_STATE () { 48 } Line 142  sub HEXREF_HEX_STATE () { 48 }
142  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
143  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145  ## Token types  ## XML-only states
146    sub PI_STATE () { 51 }
147  sub DOCTYPE_TOKEN () { 1 }  sub PI_TARGET_STATE () { 52 }
148  sub COMMENT_TOKEN () { 2 }  sub PI_TARGET_AFTER_STATE () { 53 }
149  sub START_TAG_TOKEN () { 3 }  sub PI_DATA_STATE () { 54 }
150  sub END_TAG_TOKEN () { 4 }  sub PI_AFTER_STATE () { 55 }
151  sub END_OF_FILE_TOKEN () { 5 }  sub PI_DATA_AFTER_STATE () { 56 }
152  sub CHARACTER_TOKEN () { 6 }  sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155    sub DOCTYPE_TAG_STATE () { 60 }
156    sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157    sub MD_ATTLIST_STATE () { 62 }
158    sub MD_E_STATE () { 63 }
159    sub MD_ELEMENT_STATE () { 64 }
160    sub MD_ENTITY_STATE () { 65 }
161    sub MD_NOTATION_STATE () { 66 }
162    sub DOCTYPE_MD_STATE () { 67 }
163    sub BEFORE_MD_NAME_STATE () { 68 }
164    sub MD_NAME_STATE () { 69 }
165    sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166    sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171    sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172    sub ALLOWED_TOKEN_STATE () { 77 }
173    sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174    sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175    sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179    sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180    
181  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
182  ## list and descriptions)  ## list and descriptions)
# Line 142  sub _initialize_tokenizer ($) { Line 239  sub _initialize_tokenizer ($) {
239    #$self->{level}    #$self->{level}
240    #$self->{set_nc}    #$self->{set_nc}
241    #$self->{parse_error}    #$self->{parse_error}
242      #$self->{is_xml} (if XML)
243    
244    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
245    #$self->{s_kwd}; # state keyword - initialized when used    $self->{s_kwd} = ''; # Data state keyword
246      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
247    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
248    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
249    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 164  sub _initialize_tokenizer ($) { Line 263  sub _initialize_tokenizer ($) {
263    
264  ## A token has:  ## A token has:
265  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
266  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
267  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
268  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
269    ##   ->{target} (PI_TOKEN)
270  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
271  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
272  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 174  sub _initialize_tokenizer ($) { Line 274  sub _initialize_tokenizer ($) {
274  ##        ->{name}  ##        ->{name}
275  ##        ->{value}  ##        ->{value}
276  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
277  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
278    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
279    ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
280    ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
281    ##   ->{has_internal_subset} == 1 or 0 (DOCTYPE_TOKEN)
282    
283  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
284  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
285  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 194  my $is_space = { Line 299  my $is_space = {
299    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
300    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
301    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
302    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
303    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
304    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
305  };  };
# Line 278  sub _get_next_token ($) { Line 383  sub _get_next_token ($) {
383          }          }
384        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
385          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
386            $self->{s_kwd} .= '-';            if ($self->{s_kwd} eq '<!-') {
             
           if ($self->{s_kwd} eq '<!--') {  
387              !!!cp (3);              !!!cp (3);
388              $self->{escape} = 1; # unless $self->{escape};              $self->{escape} = 1; # unless $self->{escape};
389              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
390              #              #
391            } elsif ($self->{s_kwd} eq '---') {            } elsif ($self->{s_kwd} eq '-') {
392              !!!cp (4);              !!!cp (4);
393              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
394              #              #
395              } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
396                !!!cp (4.1);
397                $self->{s_kwd} .= '-';
398                #
399            } else {            } else {
400              !!!cp (5);              !!!cp (5);
401                $self->{s_kwd} = '-';
402              #              #
403            }            }
404          }          }
# Line 326  sub _get_next_token ($) { Line 434  sub _get_next_token ($) {
434            if ($self->{s_kwd} eq '--') {            if ($self->{s_kwd} eq '--') {
435              !!!cp (8);              !!!cp (8);
436              delete $self->{escape};              delete $self->{escape};
437                #
438            } else {            } else {
439              !!!cp (9);              !!!cp (9);
440                #
441            }            }
442            } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
443              !!!cp (9.1);
444              !!!parse-error (type => 'unmatched mse', ## TODO: type
445                              line => $self->{line_prev},
446                              column => $self->{column_prev} - 1);
447              #
448          } else {          } else {
449            !!!cp (10);            !!!cp (10);
450              #
451          }          }
452                    
453          $self->{s_kwd} = '';          $self->{s_kwd} = '';
454          #          #
455          } elsif ($self->{nc} == 0x005D) { # ]
456            if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
457              !!!cp (10.1);
458              $self->{s_kwd} .= ']';
459            } elsif ($self->{s_kwd} eq ']]') {
460              !!!cp (10.2);
461              #
462            } else {
463              !!!cp (10.3);
464              $self->{s_kwd} = '';
465            }
466            #
467        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
468          !!!cp (11);          !!!cp (11);
469          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 352  sub _get_next_token ($) { Line 481  sub _get_next_token ($) {
481                     data => chr $self->{nc},                     data => chr $self->{nc},
482                     line => $self->{line}, column => $self->{column},                     line => $self->{line}, column => $self->{column},
483                    };                    };
484        if ($self->{read_until}->($token->{data}, q[-!<>&],        if ($self->{read_until}->($token->{data}, q{-!<>&\]},
485                                  length $token->{data})) {                                  length $token->{data})) {
486          $self->{s_kwd} = '';          $self->{s_kwd} = '';
487        }        }
488    
489        ## Stay in the data state.        ## Stay in the data state.
490        if ($self->{content_model} == PCDATA_CONTENT_MODEL) {        if (not $self->{is_xml} and
491              $self->{content_model} == PCDATA_CONTENT_MODEL) {
492          !!!cp (13);          !!!cp (13);
493          $self->{state} = PCDATA_STATE;          $self->{state} = PCDATA_STATE;
494        } else {        } else {
# Line 369  sub _get_next_token ($) { Line 499  sub _get_next_token ($) {
499        !!!emit ($token);        !!!emit ($token);
500        redo A;        redo A;
501      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
502          ## XML5: "tag state".
503    
504        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
505          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
506            !!!cp (15);            !!!cp (15);
# Line 377  sub _get_next_token ($) { Line 509  sub _get_next_token ($) {
509            redo A;            redo A;
510          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
511            !!!cp (15.1);            !!!cp (15.1);
512            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
513            #            #
514          } else {          } else {
515            !!!cp (16);            !!!cp (16);
516              $self->{s_kwd} = '';
517            #            #
518          }          }
519    
# Line 407  sub _get_next_token ($) { Line 540  sub _get_next_token ($) {
540            !!!cp (19);            !!!cp (19);
541            $self->{ct}            $self->{ct}
542              = {type => START_TAG_TOKEN,              = {type => START_TAG_TOKEN,
543                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
544                 line => $self->{line_prev},                 line => $self->{line_prev},
545                 column => $self->{column_prev}};                 column => $self->{column_prev}};
546            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
# Line 429  sub _get_next_token ($) { Line 562  sub _get_next_token ($) {
562                            line => $self->{line_prev},                            line => $self->{line_prev},
563                            column => $self->{column_prev});                            column => $self->{column_prev});
564            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
565              $self->{s_kwd} = '';
566            !!!next-input-character;            !!!next-input-character;
567    
568            !!!emit ({type => CHARACTER_TOKEN, data => '<>',            !!!emit ({type => CHARACTER_TOKEN, data => '<>',
# Line 438  sub _get_next_token ($) { Line 572  sub _get_next_token ($) {
572    
573            redo A;            redo A;
574          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
575            !!!cp (22);            if ($self->{is_xml}) {
576            !!!parse-error (type => 'pio',              !!!cp (22.1);
577                            line => $self->{line_prev},              $self->{state} = PI_STATE;
578                            column => $self->{column_prev});              !!!next-input-character;
579            $self->{state} = BOGUS_COMMENT_STATE;              redo A;
580            $self->{ct} = {type => COMMENT_TOKEN, data => '',            } else {
581                                      line => $self->{line_prev},              !!!cp (22);
582                                      column => $self->{column_prev},              !!!parse-error (type => 'pio',
583                                     };                              line => $self->{line_prev},
584            ## $self->{nc} is intentionally left as is                              column => $self->{column_prev});
585            redo A;              $self->{state} = BOGUS_COMMENT_STATE;
586          } else {              $self->{ct} = {type => COMMENT_TOKEN, data => '',
587                               line => $self->{line_prev},
588                               column => $self->{column_prev},
589                              };
590                ## $self->{nc} is intentionally left as is
591                redo A;
592              }
593            } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
594            !!!cp (23);            !!!cp (23);
595            !!!parse-error (type => 'bare stago',            !!!parse-error (type => 'bare stago',
596                            line => $self->{line_prev},                            line => $self->{line_prev},
597                            column => $self->{column_prev});                            column => $self->{column_prev});
598            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
599              $self->{s_kwd} = '';
600            ## reconsume            ## reconsume
601    
602            !!!emit ({type => CHARACTER_TOKEN, data => '<',            !!!emit ({type => CHARACTER_TOKEN, data => '<',
# Line 463  sub _get_next_token ($) { Line 605  sub _get_next_token ($) {
605                     });                     });
606    
607            redo A;            redo A;
608            } else {
609              ## XML5: "<:" is a parse error.
610              !!!cp (23.1);
611              $self->{ct} = {type => START_TAG_TOKEN,
612                                        tag_name => chr ($self->{nc}),
613                                        line => $self->{line_prev},
614                                        column => $self->{column_prev}};
615              $self->{state} = TAG_NAME_STATE;
616              !!!next-input-character;
617              redo A;
618          }          }
619        } else {        } else {
620          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
# Line 471  sub _get_next_token ($) { Line 623  sub _get_next_token ($) {
623        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
624        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
625    
626          ## XML5: "end tag state".
627    
628        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
629        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
630          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
631            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
632            $self->{s_kwd} = '';            $self->{kwd} = '';
633            ## Reconsume.            ## Reconsume.
634            redo A;            redo A;
635          } else {          } else {
# Line 483  sub _get_next_token ($) { Line 637  sub _get_next_token ($) {
637            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
638            !!!cp (28);            !!!cp (28);
639            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
640              $self->{s_kwd} = '';
641            ## Reconsume.            ## Reconsume.
642            !!!emit ({type => CHARACTER_TOKEN, data => '</',            !!!emit ({type => CHARACTER_TOKEN, data => '</',
643                      line => $l, column => $c,                      line => $l, column => $c,
# Line 496  sub _get_next_token ($) { Line 651  sub _get_next_token ($) {
651          !!!cp (29);          !!!cp (29);
652          $self->{ct}          $self->{ct}
653              = {type => END_TAG_TOKEN,              = {type => END_TAG_TOKEN,
654                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
655                 line => $l, column => $c};                 line => $l, column => $c};
656          $self->{state} = TAG_NAME_STATE;          $self->{state} = TAG_NAME_STATE;
657          !!!next-input-character;          !!!next-input-character;
# Line 511  sub _get_next_token ($) { Line 666  sub _get_next_token ($) {
666          !!!next-input-character;          !!!next-input-character;
667          redo A;          redo A;
668        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (31);  
669          !!!parse-error (type => 'empty end tag',          !!!parse-error (type => 'empty end tag',
670                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
671                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
672          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
673          !!!next-input-character;          $self->{s_kwd} = '';
674            if ($self->{is_xml}) {
675              !!!cp (31);
676              ## XML5: No parse error.
677              
678              ## NOTE: This parser raises a parse error, since it supports
679              ## XML1, not XML5.
680    
681              ## NOTE: A short end tag token.
682              my $ct = {type => END_TAG_TOKEN,
683                        tag_name => '',
684                        line => $self->{line_prev},
685                        column => $self->{column_prev} - 1,
686                       };
687              !!!next-input-character;
688              !!!emit ($ct);
689            } else {
690              !!!cp (31.1);
691              !!!next-input-character;
692            }
693          redo A;          redo A;
694        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
695          !!!cp (32);          !!!cp (32);
696          !!!parse-error (type => 'bare etago');          !!!parse-error (type => 'bare etago');
697            $self->{s_kwd} = '';
698          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
699          # reconsume          # reconsume
700    
# Line 529  sub _get_next_token ($) { Line 703  sub _get_next_token ($) {
703                   });                   });
704    
705          redo A;          redo A;
706        } else {        } elsif (not $self->{is_xml} or
707                   $is_space->{$self->{nc}}) {
708          !!!cp (33);          !!!cp (33);
709          !!!parse-error (type => 'bogus end tag');          !!!parse-error (type => 'bogus end tag',
710                            line => $self->{line_prev}, # "<" of "</"
711                            column => $self->{column_prev} - 1);
712          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
713          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
714                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 544  sub _get_next_token ($) { Line 721  sub _get_next_token ($) {
721          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
722          ## "bogus comment state" entry.          ## "bogus comment state" entry.
723          redo A;          redo A;
724          } else {
725            ## XML5: "</:" is a parse error.
726            !!!cp (30.1);
727            $self->{ct} = {type => END_TAG_TOKEN,
728                           tag_name => chr ($self->{nc}),
729                           line => $l, column => $c};
730            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
731            !!!next-input-character;
732            redo A;
733        }        }
734      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
735        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
736        if (length $ch) {        if (length $ch) {
737          my $CH = $ch;          my $CH = $ch;
738          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 554  sub _get_next_token ($) { Line 740  sub _get_next_token ($) {
740          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
741            !!!cp (24);            !!!cp (24);
742            ## Stay in the state.            ## Stay in the state.
743            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
744            !!!next-input-character;            !!!next-input-character;
745            redo A;            redo A;
746          } else {          } else {
747            !!!cp (25);            !!!cp (25);
748            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
749              $self->{s_kwd} = '';
750            ## Reconsume.            ## Reconsume.
751            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
752                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
753                      line => $self->{line_prev},                      line => $self->{line_prev},
754                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
755                     });                     });
756            redo A;            redo A;
757          }          }
# Line 578  sub _get_next_token ($) { Line 765  sub _get_next_token ($) {
765            !!!cp (26);            !!!cp (26);
766            ## Reconsume.            ## Reconsume.
767            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
768              $self->{s_kwd} = '';
769            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
770                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
771                      line => $self->{line_prev},                      line => $self->{line_prev},
772                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
773                     });                     });
774            redo A;            redo A;
775          } else {          } else {
# Line 590  sub _get_next_token ($) { Line 778  sub _get_next_token ($) {
778                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
779                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
780                   line => $self->{line_prev},                   line => $self->{line_prev},
781                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
782            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
783            ## Reconsume.            ## Reconsume.
784            redo A;            redo A;
# Line 619  sub _get_next_token ($) { Line 807  sub _get_next_token ($) {
807            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
808          }          }
809          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
810            $self->{s_kwd} = '';
811          !!!next-input-character;          !!!next-input-character;
812    
813          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 627  sub _get_next_token ($) { Line 816  sub _get_next_token ($) {
816        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
817                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
818          !!!cp (38);          !!!cp (38);
819          $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);          $self->{ct}->{tag_name}
820                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
821            # start tag or end tag            # start tag or end tag
822          ## Stay in this state          ## Stay in this state
823          !!!next-input-character;          !!!next-input-character;
# Line 650  sub _get_next_token ($) { Line 840  sub _get_next_token ($) {
840            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
841          }          }
842          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
843            $self->{s_kwd} = '';
844          # reconsume          # reconsume
845    
846          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 669  sub _get_next_token ($) { Line 860  sub _get_next_token ($) {
860          redo A;          redo A;
861        }        }
862      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
863          ## XML5: "Tag attribute name before state".
864    
865        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
866          !!!cp (45);          !!!cp (45);
867          ## Stay in the state          ## Stay in the state
# Line 690  sub _get_next_token ($) { Line 883  sub _get_next_token ($) {
883            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
884          }          }
885          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
886            $self->{s_kwd} = '';
887          !!!next-input-character;          !!!next-input-character;
888    
889          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 699  sub _get_next_token ($) { Line 893  sub _get_next_token ($) {
893                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
894          !!!cp (49);          !!!cp (49);
895          $self->{ca}          $self->{ca}
896              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
897                 value => '',                 value => '',
898                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
899          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 727  sub _get_next_token ($) { Line 921  sub _get_next_token ($) {
921            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
922          }          }
923          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
924            $self->{s_kwd} = '';
925          # reconsume          # reconsume
926    
927          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 739  sub _get_next_token ($) { Line 934  sub _get_next_token ($) {
934               0x003D => 1, # =               0x003D => 1, # =
935              }->{$self->{nc}}) {              }->{$self->{nc}}) {
936            !!!cp (55);            !!!cp (55);
937              ## XML5: Not a parse error.
938            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
939          } else {          } else {
940            !!!cp (56);            !!!cp (56);
941              ## XML5: ":" raises a parse error and is ignored.
942          }          }
943          $self->{ca}          $self->{ca}
944              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 752  sub _get_next_token ($) { Line 949  sub _get_next_token ($) {
949          redo A;          redo A;
950        }        }
951      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
952          ## XML5: "Tag attribute name state".
953    
954        my $before_leave = sub {        my $before_leave = sub {
955          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
956              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 762  sub _get_next_token ($) { Line 961  sub _get_next_token ($) {
961            !!!cp (58);            !!!cp (58);
962            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
963              = $self->{ca};              = $self->{ca};
964              $self->{ca}->{index} = ++$self->{ct}->{last_index};
965          }          }
966        }; # $before_leave        }; # $before_leave
967    
# Line 778  sub _get_next_token ($) { Line 978  sub _get_next_token ($) {
978          !!!next-input-character;          !!!next-input-character;
979          redo A;          redo A;
980        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
981            if ($self->{is_xml}) {
982              !!!cp (60.1);
983              ## XML5: Not a parse error.
984              !!!parse-error (type => 'no attr value'); ## TODO: type
985            } else {
986              !!!cp (60.2);
987            }
988    
989          $before_leave->();          $before_leave->();
990          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
991            !!!cp (61);            !!!cp (61);
# Line 792  sub _get_next_token ($) { Line 1000  sub _get_next_token ($) {
1000            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1001          }          }
1002          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1003            $self->{s_kwd} = '';
1004          !!!next-input-character;          !!!next-input-character;
1005    
1006          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 800  sub _get_next_token ($) { Line 1009  sub _get_next_token ($) {
1009        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
1010                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1011          !!!cp (63);          !!!cp (63);
1012          $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);          $self->{ca}->{name}
1013                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1014          ## Stay in the state          ## Stay in the state
1015          !!!next-input-character;          !!!next-input-character;
1016          redo A;          redo A;
1017        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1018          !!!cp (64);          if ($self->{is_xml}) {
1019              !!!cp (64);
1020              ## XML5: Not a parse error.
1021              !!!parse-error (type => 'no attr value'); ## TODO: type
1022            } else {
1023              !!!cp (64.1);
1024            }
1025            
1026          $before_leave->();          $before_leave->();
1027          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1028          !!!next-input-character;          !!!next-input-character;
# Line 829  sub _get_next_token ($) { Line 1046  sub _get_next_token ($) {
1046            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1047          }          }
1048          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1049            $self->{s_kwd} = '';
1050          # reconsume          # reconsume
1051    
1052          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 838  sub _get_next_token ($) { Line 1056  sub _get_next_token ($) {
1056          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1057              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1058            !!!cp (69);            !!!cp (69);
1059              ## XML5: Not a parse error.
1060            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
1061          } else {          } else {
1062            !!!cp (70);            !!!cp (70);
# Line 848  sub _get_next_token ($) { Line 1067  sub _get_next_token ($) {
1067          redo A;          redo A;
1068        }        }
1069      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1070          ## XML5: "Tag attribute name after state".
1071          
1072        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1073          !!!cp (71);          !!!cp (71);
1074          ## Stay in the state          ## Stay in the state
# Line 859  sub _get_next_token ($) { Line 1080  sub _get_next_token ($) {
1080          !!!next-input-character;          !!!next-input-character;
1081          redo A;          redo A;
1082        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1083            if ($self->{is_xml}) {
1084              !!!cp (72.1);
1085              ## XML5: Not a parse error.
1086              !!!parse-error (type => 'no attr value'); ## TODO: type
1087            } else {
1088              !!!cp (72.2);
1089            }
1090    
1091          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1092            !!!cp (73);            !!!cp (73);
1093            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 875  sub _get_next_token ($) { Line 1104  sub _get_next_token ($) {
1104            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1105          }          }
1106          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1107            $self->{s_kwd} = '';
1108          !!!next-input-character;          !!!next-input-character;
1109    
1110          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 884  sub _get_next_token ($) { Line 1114  sub _get_next_token ($) {
1114                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1115          !!!cp (76);          !!!cp (76);
1116          $self->{ca}          $self->{ca}
1117              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1118                 value => '',                 value => '',
1119                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
1120          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
1121          !!!next-input-character;          !!!next-input-character;
1122          redo A;          redo A;
1123        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1124          !!!cp (77);          if ($self->{is_xml}) {
1125              !!!cp (77);
1126              ## XML5: Not a parse error.
1127              !!!parse-error (type => 'no attr value'); ## TODO: type
1128            } else {
1129              !!!cp (77.1);
1130            }
1131            
1132          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1133          !!!next-input-character;          !!!next-input-character;
1134          redo A;          redo A;
# Line 912  sub _get_next_token ($) { Line 1149  sub _get_next_token ($) {
1149          } else {          } else {
1150            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1151          }          }
1152            $self->{s_kwd} = '';
1153          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1154          # reconsume          # reconsume
1155    
# Line 919  sub _get_next_token ($) { Line 1157  sub _get_next_token ($) {
1157    
1158          redo A;          redo A;
1159        } else {        } else {
1160            if ($self->{is_xml}) {
1161              !!!cp (78.1);
1162              ## XML5: Not a parse error.
1163              !!!parse-error (type => 'no attr value'); ## TODO: type
1164            } else {
1165              !!!cp (78.2);
1166            }
1167    
1168          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1169              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1170            !!!cp (78);            !!!cp (78);
1171              ## XML5: Not a parse error.
1172            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
1173          } else {          } else {
1174            !!!cp (82);            !!!cp (82);
# Line 935  sub _get_next_token ($) { Line 1182  sub _get_next_token ($) {
1182          redo A;                  redo A;        
1183        }        }
1184      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1185          ## XML5: "Tag attribute value before state".
1186    
1187        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1188          !!!cp (83);          !!!cp (83);
1189          ## Stay in the state          ## Stay in the state
# Line 973  sub _get_next_token ($) { Line 1222  sub _get_next_token ($) {
1222            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1223          }          }
1224          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1225            $self->{s_kwd} = '';
1226          !!!next-input-character;          !!!next-input-character;
1227    
1228          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 996  sub _get_next_token ($) { Line 1246  sub _get_next_token ($) {
1246            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1247          }          }
1248          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1249            $self->{s_kwd} = '';
1250          ## reconsume          ## reconsume
1251    
1252          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1004  sub _get_next_token ($) { Line 1255  sub _get_next_token ($) {
1255        } else {        } else {
1256          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D) { # =
1257            !!!cp (93);            !!!cp (93);
1258              ## XML5: Not a parse error.
1259            !!!parse-error (type => 'bad attribute value');            !!!parse-error (type => 'bad attribute value');
1260            } elsif ($self->{is_xml}) {
1261              !!!cp (93.1);
1262              ## XML5: No parse error.
1263              !!!parse-error (type => 'unquoted attr value'); ## TODO
1264          } else {          } else {
1265            !!!cp (94);            !!!cp (94);
1266          }          }
# Line 1014  sub _get_next_token ($) { Line 1270  sub _get_next_token ($) {
1270          redo A;          redo A;
1271        }        }
1272      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1273          ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1274          ## ATTLIST attribute value double quoted state".
1275          
1276        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1277          !!!cp (95);          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1278          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            !!!cp (95.1);
1279              ## XML5: "DOCTYPE ATTLIST name after state".
1280              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1281              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1282            } else {
1283              !!!cp (95);
1284              ## XML5: "Tag attribute name before state".
1285              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1286            }
1287          !!!next-input-character;          !!!next-input-character;
1288          redo A;          redo A;
1289        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1290          !!!cp (96);          !!!cp (96);
1291            ## XML5: Not defined yet.
1292    
1293          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1294          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1295          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1035  sub _get_next_token ($) { Line 1304  sub _get_next_token ($) {
1304          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1305            !!!cp (97);            !!!cp (97);
1306            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1307    
1308              $self->{state} = DATA_STATE;
1309              $self->{s_kwd} = '';
1310              ## reconsume
1311              !!!emit ($self->{ct}); # start tag
1312              redo A;
1313          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1314            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1315            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1044  sub _get_next_token ($) { Line 1319  sub _get_next_token ($) {
1319              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1320              !!!cp (99);              !!!cp (99);
1321            }            }
1322    
1323              $self->{state} = DATA_STATE;
1324              $self->{s_kwd} = '';
1325              ## reconsume
1326              !!!emit ($self->{ct}); # end tag
1327              redo A;
1328            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1329              ## XML5: No parse error above; not defined yet.
1330              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1331              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1332              ## Reconsume.
1333              !!!emit ($self->{ct}); # ATTLIST
1334              redo A;
1335          } else {          } else {
1336            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1337          }          }
         $self->{state} = DATA_STATE;  
         ## reconsume  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1338        } else {        } else {
1339          !!!cp (100);          ## XML5 [ATTLIST]: Not defined yet.
1340            if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1341              !!!cp (100);
1342              ## XML5: Not a parse error.
1343              !!!parse-error (type => 'lt in attr value'); ## TODO: type
1344            } else {
1345              !!!cp (100.1);
1346            }
1347          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1348          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1349                                q["&],                                q["&<],
1350                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1351    
1352          ## Stay in the state          ## Stay in the state
# Line 1065  sub _get_next_token ($) { Line 1354  sub _get_next_token ($) {
1354          redo A;          redo A;
1355        }        }
1356      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1357          ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1358          ## ATTLIST attribute value single quoted state".
1359    
1360        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1361          !!!cp (101);          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1362          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            !!!cp (101.1);
1363              ## XML5: "DOCTYPE ATTLIST name after state".
1364              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1365              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1366            } else {
1367              !!!cp (101);
1368              ## XML5: "Before attribute name state" (sic).
1369              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1370            }
1371          !!!next-input-character;          !!!next-input-character;
1372          redo A;          redo A;
1373        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1374          !!!cp (102);          !!!cp (102);
1375            ## XML5: Not defined yet.
1376    
1377          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1378          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1379          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1086  sub _get_next_token ($) { Line 1388  sub _get_next_token ($) {
1388          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1389            !!!cp (103);            !!!cp (103);
1390            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1391    
1392              $self->{state} = DATA_STATE;
1393              $self->{s_kwd} = '';
1394              ## reconsume
1395              !!!emit ($self->{ct}); # start tag
1396              redo A;
1397          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1398            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1399            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1095  sub _get_next_token ($) { Line 1403  sub _get_next_token ($) {
1403              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1404              !!!cp (105);              !!!cp (105);
1405            }            }
1406    
1407              $self->{state} = DATA_STATE;
1408              $self->{s_kwd} = '';
1409              ## reconsume
1410              !!!emit ($self->{ct}); # end tag
1411              redo A;
1412            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1413              ## XML5: No parse error above; not defined yet.
1414              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1415              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1416              ## Reconsume.
1417              !!!emit ($self->{ct}); # ATTLIST
1418              redo A;
1419          } else {          } else {
1420            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1421          }          }
         $self->{state} = DATA_STATE;  
         ## reconsume  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1422        } else {        } else {
1423          !!!cp (106);          ## XML5 [ATTLIST]: Not defined yet.
1424            if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1425              !!!cp (106);
1426              ## XML5: Not a parse error.
1427              !!!parse-error (type => 'lt in attr value'); ## TODO: type
1428            } else {
1429              !!!cp (106.1);
1430            }
1431          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1432          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1433                                q['&],                                q['&<],
1434                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1435    
1436          ## Stay in the state          ## Stay in the state
# Line 1116  sub _get_next_token ($) { Line 1438  sub _get_next_token ($) {
1438          redo A;          redo A;
1439        }        }
1440      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1441          ## XML5: "Tag attribute value unquoted state".
1442    
1443        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1444          !!!cp (107);          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1445          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;            !!!cp (107.1);
1446              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1447              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1448            } else {
1449              !!!cp (107);
1450              ## XML5: "Tag attribute name before state".
1451              $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1452            }
1453          !!!next-input-character;          !!!next-input-character;
1454          redo A;          redo A;
1455        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1456          !!!cp (108);          !!!cp (108);
1457    
1458            ## XML5: Not defined yet.
1459    
1460          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1461          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1462          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1136  sub _get_next_token ($) { Line 1470  sub _get_next_token ($) {
1470          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1471            !!!cp (109);            !!!cp (109);
1472            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1473    
1474              $self->{state} = DATA_STATE;
1475              $self->{s_kwd} = '';
1476              !!!next-input-character;
1477              !!!emit ($self->{ct}); # start tag
1478              redo A;
1479          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1480            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1481            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1145  sub _get_next_token ($) { Line 1485  sub _get_next_token ($) {
1485              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1486              !!!cp (111);              !!!cp (111);
1487            }            }
1488    
1489              $self->{state} = DATA_STATE;
1490              $self->{s_kwd} = '';
1491              !!!next-input-character;
1492              !!!emit ($self->{ct}); # end tag
1493              redo A;
1494            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1495              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1496              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1497              !!!next-input-character;
1498              !!!emit ($self->{ct}); # ATTLIST
1499              redo A;
1500          } else {          } else {
1501            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1502          }          }
         $self->{state} = DATA_STATE;  
         !!!next-input-character;  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1503        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!parse-error (type => 'unclosed tag');  
1504          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1505            !!!cp (112);            !!!cp (112);
1506              !!!parse-error (type => 'unclosed tag');
1507            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1508    
1509              $self->{state} = DATA_STATE;
1510              $self->{s_kwd} = '';
1511              ## reconsume
1512              !!!emit ($self->{ct}); # start tag
1513              redo A;
1514          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1515              !!!parse-error (type => 'unclosed tag');
1516            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1517            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
1518              !!!cp (113);              !!!cp (113);
# Line 1168  sub _get_next_token ($) { Line 1521  sub _get_next_token ($) {
1521              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1522              !!!cp (114);              !!!cp (114);
1523            }            }
1524    
1525              $self->{state} = DATA_STATE;
1526              $self->{s_kwd} = '';
1527              ## reconsume
1528              !!!emit ($self->{ct}); # end tag
1529              redo A;
1530            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1531              !!!parse-error (type => 'unclosed md'); ## TODO: type
1532              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1533              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1534              ## Reconsume.
1535              !!!emit ($self->{ct}); # ATTLIST
1536              redo A;
1537          } else {          } else {
1538            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1539          }          }
         $self->{state} = DATA_STATE;  
         ## reconsume  
   
         !!!emit ($self->{ct}); # start tag or end tag  
   
         redo A;  
1540        } else {        } else {
1541          if ({          if ({
1542               0x0022 => 1, # "               0x0022 => 1, # "
# Line 1184  sub _get_next_token ($) { Line 1544  sub _get_next_token ($) {
1544               0x003D => 1, # =               0x003D => 1, # =
1545              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1546            !!!cp (115);            !!!cp (115);
1547              ## XML5: Not a parse error.
1548            !!!parse-error (type => 'bad attribute value');            !!!parse-error (type => 'bad attribute value');
1549          } else {          } else {
1550            !!!cp (116);            !!!cp (116);
# Line 1220  sub _get_next_token ($) { Line 1581  sub _get_next_token ($) {
1581            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1582          }          }
1583          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1584            $self->{s_kwd} = '';
1585          !!!next-input-character;          !!!next-input-character;
1586    
1587          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1247  sub _get_next_token ($) { Line 1609  sub _get_next_token ($) {
1609            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1610          }          }
1611          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1612            $self->{s_kwd} = '';
1613          ## Reconsume.          ## Reconsume.
1614          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1615          redo A;          redo A;
# Line 1258  sub _get_next_token ($) { Line 1621  sub _get_next_token ($) {
1621          redo A;          redo A;
1622        }        }
1623      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1624          ## XML5: "Empty tag state".
1625    
1626        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1627          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
1628            !!!cp ('124.2');            !!!cp ('124.2');
# Line 1277  sub _get_next_token ($) { Line 1642  sub _get_next_token ($) {
1642          }          }
1643    
1644          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1645            $self->{s_kwd} = '';
1646          !!!next-input-character;          !!!next-input-character;
1647    
1648          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1298  sub _get_next_token ($) { Line 1664  sub _get_next_token ($) {
1664          } else {          } else {
1665            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1666          }          }
1667            ## XML5: "Tag attribute name before state".
1668          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1669            $self->{s_kwd} = '';
1670          ## Reconsume.          ## Reconsume.
1671          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1672          redo A;          redo A;
# Line 1311  sub _get_next_token ($) { Line 1679  sub _get_next_token ($) {
1679          redo A;          redo A;
1680        }        }
1681      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1682        ## (only happen if PCDATA state)        ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1683    
1684        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
1685        ## consumes characters one-by-one basis.        ## consumes characters one-by-one basis.
1686                
1687        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1688          !!!cp (124);          if ($self->{in_subset}) {
1689          $self->{state} = DATA_STATE;            !!!cp (123);
1690              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1691            } else {
1692              !!!cp (124);
1693              $self->{state} = DATA_STATE;
1694              $self->{s_kwd} = '';
1695            }
1696          !!!next-input-character;          !!!next-input-character;
1697    
1698          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1699          redo A;          redo A;
1700        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1701          !!!cp (125);          if ($self->{in_subset}) {
1702          $self->{state} = DATA_STATE;            !!!cp (125.1);
1703              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1704            } else {
1705              !!!cp (125);
1706              $self->{state} = DATA_STATE;
1707              $self->{s_kwd} = '';
1708            }
1709          ## reconsume          ## reconsume
1710    
1711          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1342  sub _get_next_token ($) { Line 1722  sub _get_next_token ($) {
1722          redo A;          redo A;
1723        }        }
1724      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1725        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state".
1726                
1727        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1728          !!!cp (133);          !!!cp (133);
# Line 1354  sub _get_next_token ($) { Line 1734  sub _get_next_token ($) {
1734          ## ASCII case-insensitive.          ## ASCII case-insensitive.
1735          !!!cp (130);          !!!cp (130);
1736          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
1737          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
1738          !!!next-input-character;          !!!next-input-character;
1739          redo A;          redo A;
1740        } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1741                 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and                   $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1742                    $self->{is_xml}) and
1743                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1744          !!!cp (135.4);                          !!!cp (135.4);                
1745          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
1746          $self->{s_kwd} = '[';          $self->{kwd} = '[';
1747          !!!next-input-character;          !!!next-input-character;
1748          redo A;          redo A;
1749        } else {        } else {
# Line 1386  sub _get_next_token ($) { Line 1767  sub _get_next_token ($) {
1767                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1768                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
1769                                   };                                   };
1770          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1771          !!!next-input-character;          !!!next-input-character;
1772          redo A;          redo A;
1773        } else {        } else {
# Line 1412  sub _get_next_token ($) { Line 1793  sub _get_next_token ($) {
1793              0x0054, # T              0x0054, # T
1794              0x0059, # Y              0x0059, # Y
1795              0x0050, # P              0x0050, # P
1796            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
1797            $self->{nc} == [            $self->{nc} == [
1798              undef,              undef,
1799              0x006F, # o              0x006F, # o
# Line 1420  sub _get_next_token ($) { Line 1801  sub _get_next_token ($) {
1801              0x0074, # t              0x0074, # t
1802              0x0079, # y              0x0079, # y
1803              0x0070, # p              0x0070, # p
1804            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
1805          !!!cp (131);          !!!cp (131);
1806          ## Stay in the state.          ## Stay in the state.
1807          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
1808          !!!next-input-character;          !!!next-input-character;
1809          redo A;          redo A;
1810        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
1811                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
1812                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
1813          !!!cp (129);          if ($self->{is_xml} and
1814                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1815              !!!cp (129);
1816              ## XML5: case-sensitive.
1817              !!!parse-error (type => 'lowercase keyword', ## TODO
1818                              text => 'DOCTYPE',
1819                              line => $self->{line_prev},
1820                              column => $self->{column_prev} - 5);
1821            } else {
1822              !!!cp (129.1);
1823            }
1824          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
1825          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
1826                                    quirks => 1,                                    quirks => 1,
# Line 1442  sub _get_next_token ($) { Line 1833  sub _get_next_token ($) {
1833          !!!cp (132);                  !!!cp (132);        
1834          !!!parse-error (type => 'bogus comment',          !!!parse-error (type => 'bogus comment',
1835                          line => $self->{line_prev},                          line => $self->{line_prev},
1836                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
1837          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
1838          ## Reconsume.          ## Reconsume.
1839          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
1840                                    data => $self->{s_kwd},                                    data => $self->{kwd},
1841                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1842                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
1843                                   };                                   };
1844          redo A;          redo A;
1845        }        }
# Line 1459  sub _get_next_token ($) { Line 1850  sub _get_next_token ($) {
1850              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
1851              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
1852              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
1853            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
1854          !!!cp (135.1);          !!!cp (135.1);
1855          ## Stay in the state.          ## Stay in the state.
1856          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
1857          !!!next-input-character;          !!!next-input-character;
1858          redo A;          redo A;
1859        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
1860                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1861          !!!cp (135.2);          if ($self->{is_xml} and
1862                not $self->{tainted} and
1863                @{$self->{open_elements} or []} == 0) {
1864              !!!cp (135.2);
1865              !!!parse-error (type => 'cdata outside of root element',
1866                              line => $self->{line_prev},
1867                              column => $self->{column_prev} - 7);
1868              $self->{tainted} = 1;
1869            } else {
1870              !!!cp (135.21);
1871            }
1872    
1873          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
1874                                    data => '',                                    data => '',
1875                                    line => $self->{line_prev},                                    line => $self->{line_prev},
# Line 1479  sub _get_next_token ($) { Line 1881  sub _get_next_token ($) {
1881          !!!cp (135.3);          !!!cp (135.3);
1882          !!!parse-error (type => 'bogus comment',          !!!parse-error (type => 'bogus comment',
1883                          line => $self->{line_prev},                          line => $self->{line_prev},
1884                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
1885          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
1886          ## Reconsume.          ## Reconsume.
1887          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
1888                                    data => $self->{s_kwd},                                    data => $self->{kwd},
1889                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1890                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
1891                                   };                                   };
1892          redo A;          redo A;
1893        }        }
# Line 1496  sub _get_next_token ($) { Line 1898  sub _get_next_token ($) {
1898          !!!next-input-character;          !!!next-input-character;
1899          redo A;          redo A;
1900        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (138);  
1901          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1902          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1903              !!!cp (138.1);
1904              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1905            } else {
1906              !!!cp (138);
1907              $self->{state} = DATA_STATE;
1908              $self->{s_kwd} = '';
1909            }
1910          !!!next-input-character;          !!!next-input-character;
1911    
1912          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1913    
1914          redo A;          redo A;
1915        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (139);  
1916          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1917          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1918              !!!cp (139.1);
1919              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1920            } else {
1921              !!!cp (139);
1922              $self->{state} = DATA_STATE;
1923              $self->{s_kwd} = '';
1924            }
1925          ## reconsume          ## reconsume
1926    
1927          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1528  sub _get_next_token ($) { Line 1942  sub _get_next_token ($) {
1942          !!!next-input-character;          !!!next-input-character;
1943          redo A;          redo A;
1944        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (142);  
1945          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1946          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1947              !!!cp (142.1);
1948              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1949            } else {
1950              !!!cp (142);
1951              $self->{state} = DATA_STATE;
1952              $self->{s_kwd} = '';
1953            }
1954          !!!next-input-character;          !!!next-input-character;
1955    
1956          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1957    
1958          redo A;          redo A;
1959        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (143);  
1960          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1961          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1962              !!!cp (143.1);
1963              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1964            } else {
1965              !!!cp (143);
1966              $self->{state} = DATA_STATE;
1967              $self->{s_kwd} = '';
1968            }
1969          ## reconsume          ## reconsume
1970    
1971          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1554  sub _get_next_token ($) { Line 1980  sub _get_next_token ($) {
1980          redo A;          redo A;
1981        }        }
1982      } elsif ($self->{state} == COMMENT_STATE) {      } elsif ($self->{state} == COMMENT_STATE) {
1983          ## XML5: "Comment state" and "DOCTYPE comment state".
1984    
1985        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1986          !!!cp (145);          !!!cp (145);
1987          $self->{state} = COMMENT_END_DASH_STATE;          $self->{state} = COMMENT_END_DASH_STATE;
1988          !!!next-input-character;          !!!next-input-character;
1989          redo A;          redo A;
1990        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (146);  
1991          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1992          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1993              !!!cp (146.1);
1994              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1995            } else {
1996              !!!cp (146);
1997              $self->{state} = DATA_STATE;
1998              $self->{s_kwd} = '';
1999            }
2000          ## reconsume          ## reconsume
2001    
2002          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1580  sub _get_next_token ($) { Line 2014  sub _get_next_token ($) {
2014          redo A;          redo A;
2015        }        }
2016      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2017          ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2018    
2019        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2020          !!!cp (148);          !!!cp (148);
2021          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
2022          !!!next-input-character;          !!!next-input-character;
2023          redo A;          redo A;
2024        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (149);  
2025          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
2026          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2027              !!!cp (149.1);
2028              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2029            } else {
2030              !!!cp (149);
2031              $self->{state} = DATA_STATE;
2032              $self->{s_kwd} = '';
2033            }
2034          ## reconsume          ## reconsume
2035    
2036          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1602  sub _get_next_token ($) { Line 2044  sub _get_next_token ($) {
2044          redo A;          redo A;
2045        }        }
2046      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE) {
2047          ## XML5: "Comment end state" and "DOCTYPE comment end state".
2048    
2049        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2050          !!!cp (151);          if ($self->{in_subset}) {
2051          $self->{state} = DATA_STATE;            !!!cp (151.1);
2052              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2053            } else {
2054              !!!cp (151);
2055              $self->{state} = DATA_STATE;
2056              $self->{s_kwd} = '';
2057            }
2058          !!!next-input-character;          !!!next-input-character;
2059    
2060          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1612  sub _get_next_token ($) { Line 2062  sub _get_next_token ($) {
2062          redo A;          redo A;
2063        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
2064          !!!cp (152);          !!!cp (152);
2065            ## XML5: Not a parse error.
2066          !!!parse-error (type => 'dash in comment',          !!!parse-error (type => 'dash in comment',
2067                          line => $self->{line_prev},                          line => $self->{line_prev},
2068                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 1620  sub _get_next_token ($) { Line 2071  sub _get_next_token ($) {
2071          !!!next-input-character;          !!!next-input-character;
2072          redo A;          redo A;
2073        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (153);  
2074          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
2075          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2076              !!!cp (153.1);
2077              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2078            } else {
2079              !!!cp (153);
2080              $self->{state} = DATA_STATE;
2081              $self->{s_kwd} = '';
2082            }
2083          ## reconsume          ## reconsume
2084    
2085          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1630  sub _get_next_token ($) { Line 2087  sub _get_next_token ($) {
2087          redo A;          redo A;
2088        } else {        } else {
2089          !!!cp (154);          !!!cp (154);
2090            ## XML5: Not a parse error.
2091          !!!parse-error (type => 'dash in comment',          !!!parse-error (type => 'dash in comment',
2092                          line => $self->{line_prev},                          line => $self->{line_prev},
2093                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 1646  sub _get_next_token ($) { Line 2104  sub _get_next_token ($) {
2104          redo A;          redo A;
2105        } else {        } else {
2106          !!!cp (156);          !!!cp (156);
2107            ## XML5: Unless EOF, switch to the bogus comment state.
2108          !!!parse-error (type => 'no space before DOCTYPE name');          !!!parse-error (type => 'no space before DOCTYPE name');
2109          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2110          ## reconsume          ## reconsume
2111          redo A;          redo A;
2112        }        }
2113      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2114          ## XML5: "DOCTYPE root name before state".
2115    
2116        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2117          !!!cp (157);          !!!cp (157);
2118          ## Stay in the state          ## Stay in the state
# Line 1659  sub _get_next_token ($) { Line 2120  sub _get_next_token ($) {
2120          redo A;          redo A;
2121        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2122          !!!cp (158);          !!!cp (158);
2123            ## XML5: No parse error.
2124          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
2125          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2126            $self->{s_kwd} = '';
2127          !!!next-input-character;          !!!next-input-character;
2128    
2129          !!!emit ($self->{ct}); # DOCTYPE (quirks)          !!!emit ($self->{ct}); # DOCTYPE (quirks)
# Line 1670  sub _get_next_token ($) { Line 2133  sub _get_next_token ($) {
2133          !!!cp (159);          !!!cp (159);
2134          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
2135          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2136            $self->{s_kwd} = '';
2137          ## reconsume          ## reconsume
2138    
2139          !!!emit ($self->{ct}); # DOCTYPE (quirks)          !!!emit ($self->{ct}); # DOCTYPE (quirks)
2140    
2141          redo A;          redo A;
2142          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2143            !!!cp (159.1);
2144            !!!parse-error (type => 'no DOCTYPE name');
2145            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2146            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2147            $self->{in_subset} = 1;
2148            !!!next-input-character;
2149            !!!emit ($self->{ct}); # DOCTYPE
2150            redo A;
2151        } else {        } else {
2152          !!!cp (160);          !!!cp (160);
2153          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 1684  sub _get_next_token ($) { Line 2157  sub _get_next_token ($) {
2157          redo A;          redo A;
2158        }        }
2159      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2160  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
2161    
2162          ## ISSUE: Redundant "First," in the spec.
2163    
2164        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2165          !!!cp (161);          !!!cp (161);
2166          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 1693  sub _get_next_token ($) { Line 2169  sub _get_next_token ($) {
2169        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2170          !!!cp (162);          !!!cp (162);
2171          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2172            $self->{s_kwd} = '';
2173          !!!next-input-character;          !!!next-input-character;
2174    
2175          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 1702  sub _get_next_token ($) { Line 2179  sub _get_next_token ($) {
2179          !!!cp (163);          !!!cp (163);
2180          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2181          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2182            $self->{s_kwd} = '';
2183          ## reconsume          ## reconsume
2184    
2185          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2186          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2187    
2188          redo A;          redo A;
2189          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2190            !!!cp (163.1);
2191            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2192            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2193            $self->{in_subset} = 1;
2194            !!!next-input-character;
2195            !!!emit ($self->{ct}); # DOCTYPE
2196            redo A;
2197        } else {        } else {
2198          !!!cp (164);          !!!cp (164);
2199          $self->{ct}->{name}          $self->{ct}->{name}
# Line 1717  sub _get_next_token ($) { Line 2203  sub _get_next_token ($) {
2203          redo A;          redo A;
2204        }        }
2205      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2206          ## XML5: Corresponding to XML5's "DOCTYPE root name after
2207          ## state", but implemented differently.
2208    
2209        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2210          !!!cp (165);          !!!cp (165);
2211          ## Stay in the state          ## Stay in the state
# Line 1725  sub _get_next_token ($) { Line 2214  sub _get_next_token ($) {
2214        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2215          !!!cp (166);          !!!cp (166);
2216          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2217            $self->{s_kwd} = '';
2218          !!!next-input-character;          !!!next-input-character;
2219    
2220          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 1734  sub _get_next_token ($) { Line 2224  sub _get_next_token ($) {
2224          !!!cp (167);          !!!cp (167);
2225          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2226          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2227            $self->{s_kwd} = '';
2228          ## reconsume          ## reconsume
2229    
2230          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1742  sub _get_next_token ($) { Line 2233  sub _get_next_token ($) {
2233          redo A;          redo A;
2234        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
2235                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
2236            !!!cp (167.1);
2237          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
2238          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2239          !!!next-input-character;          !!!next-input-character;
2240          redo A;          redo A;
2241        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
2242                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
2243            !!!cp (167.2);
2244          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
2245          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2246          !!!next-input-character;          !!!next-input-character;
2247          redo A;          redo A;
2248          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2249            !!!cp (167.3);
2250            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2251            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2252            $self->{in_subset} = 1;
2253            !!!next-input-character;
2254            !!!emit ($self->{ct}); # DOCTYPE
2255            redo A;
2256        } else {        } else {
2257          !!!cp (180);          !!!cp (180);
2258          !!!parse-error (type => 'string after DOCTYPE name');          !!!parse-error (type => 'string after DOCTYPE name');
# Line 1769  sub _get_next_token ($) { Line 2270  sub _get_next_token ($) {
2270              0x0042, # B              0x0042, # B
2271              0x004C, # L              0x004C, # L
2272              0x0049, # I              0x0049, # I
2273            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2274            $self->{nc} == [            $self->{nc} == [
2275              undef,              undef,
2276              0x0075, # u              0x0075, # u
2277              0x0062, # b              0x0062, # b
2278              0x006C, # l              0x006C, # l
2279              0x0069, # i              0x0069, # i
2280            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2281          !!!cp (175);          !!!cp (175);
2282          ## Stay in the state.          ## Stay in the state.
2283          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2284          !!!next-input-character;          !!!next-input-character;
2285          redo A;          redo A;
2286        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
2287                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
2288                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
2289          !!!cp (168);          if ($self->{is_xml} and
2290                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2291              !!!cp (168.1);
2292              !!!parse-error (type => 'lowercase keyword', ## TODO: type
2293                              text => 'PUBLIC',
2294                              line => $self->{line_prev},
2295                              column => $self->{column_prev} - 4);
2296            } else {
2297              !!!cp (168);
2298            }
2299          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2300          !!!next-input-character;          !!!next-input-character;
2301          redo A;          redo A;
# Line 1793  sub _get_next_token ($) { Line 2303  sub _get_next_token ($) {
2303          !!!cp (169);          !!!cp (169);
2304          !!!parse-error (type => 'string after DOCTYPE name',          !!!parse-error (type => 'string after DOCTYPE name',
2305                          line => $self->{line_prev},                          line => $self->{line_prev},
2306                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2307          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2308    
2309          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
# Line 1808  sub _get_next_token ($) { Line 2318  sub _get_next_token ($) {
2318              0x0053, # S              0x0053, # S
2319              0x0054, # T              0x0054, # T
2320              0x0045, # E              0x0045, # E
2321            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2322            $self->{nc} == [            $self->{nc} == [
2323              undef,              undef,
2324              0x0079, # y              0x0079, # y
2325              0x0073, # s              0x0073, # s
2326              0x0074, # t              0x0074, # t
2327              0x0065, # e              0x0065, # e
2328            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2329          !!!cp (170);          !!!cp (170);
2330          ## Stay in the state.          ## Stay in the state.
2331          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2332          !!!next-input-character;          !!!next-input-character;
2333          redo A;          redo A;
2334        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
2335                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
2336                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
2337          !!!cp (171);          if ($self->{is_xml} and
2338                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2339              !!!cp (171.1);
2340              !!!parse-error (type => 'lowercase keyword', ## TODO: type
2341                              text => 'SYSTEM',
2342                              line => $self->{line_prev},
2343                              column => $self->{column_prev} - 4);
2344            } else {
2345              !!!cp (171);
2346            }
2347          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2348          !!!next-input-character;          !!!next-input-character;
2349          redo A;          redo A;
# Line 1832  sub _get_next_token ($) { Line 2351  sub _get_next_token ($) {
2351          !!!cp (172);          !!!cp (172);
2352          !!!parse-error (type => 'string after DOCTYPE name',          !!!parse-error (type => 'string after DOCTYPE name',
2353                          line => $self->{line_prev},                          line => $self->{line_prev},
2354                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2355          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2356    
2357          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
# Line 1862  sub _get_next_token ($) { Line 2381  sub _get_next_token ($) {
2381          !!!parse-error (type => 'no PUBLIC literal');          !!!parse-error (type => 'no PUBLIC literal');
2382    
2383          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2384            $self->{s_kwd} = '';
2385          !!!next-input-character;          !!!next-input-character;
2386    
2387          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1873  sub _get_next_token ($) { Line 2393  sub _get_next_token ($) {
2393          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2394    
2395          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2396            $self->{s_kwd} = '';
2397          ## reconsume          ## reconsume
2398    
2399          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2400          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2401    
2402          redo A;          redo A;
2403          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2404            !!!cp (186.1);
2405            !!!parse-error (type => 'no PUBLIC literal');
2406            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2407            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2408            $self->{in_subset} = 1;
2409            !!!next-input-character;
2410            !!!emit ($self->{ct}); # DOCTYPE
2411            redo A;
2412        } else {        } else {
2413          !!!cp (186);          !!!cp (186);
2414          !!!parse-error (type => 'string after PUBLIC');          !!!parse-error (type => 'string after PUBLIC');
# Line 1899  sub _get_next_token ($) { Line 2429  sub _get_next_token ($) {
2429          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2430    
2431          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2432            $self->{s_kwd} = '';
2433          !!!next-input-character;          !!!next-input-character;
2434    
2435          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1910  sub _get_next_token ($) { Line 2441  sub _get_next_token ($) {
2441          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2442    
2443          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2444            $self->{s_kwd} = '';
2445          ## reconsume          ## reconsume
2446    
2447          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1938  sub _get_next_token ($) { Line 2470  sub _get_next_token ($) {
2470          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2471    
2472          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2473            $self->{s_kwd} = '';
2474          !!!next-input-character;          !!!next-input-character;
2475    
2476          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1949  sub _get_next_token ($) { Line 2482  sub _get_next_token ($) {
2482          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2483    
2484          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2485            $self->{s_kwd} = '';
2486          ## reconsume          ## reconsume
2487    
2488          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1985  sub _get_next_token ($) { Line 2519  sub _get_next_token ($) {
2519          !!!next-input-character;          !!!next-input-character;
2520          redo A;          redo A;
2521        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2522          !!!cp (198);          if ($self->{is_xml}) {
2523              !!!cp (198.1);
2524              !!!parse-error (type => 'no SYSTEM literal');
2525            } else {
2526              !!!cp (198);
2527            }
2528          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2529            $self->{s_kwd} = '';
2530          !!!next-input-character;          !!!next-input-character;
2531    
2532          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 1997  sub _get_next_token ($) { Line 2537  sub _get_next_token ($) {
2537          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2538    
2539          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2540            $self->{s_kwd} = '';
2541          ## reconsume          ## reconsume
2542    
2543          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2544          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2545    
2546          redo A;          redo A;
2547          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2548            !!!cp (200.1);
2549            !!!parse-error (type => 'no SYSTEM literal');
2550            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2551            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2552            $self->{in_subset} = 1;
2553            !!!next-input-character;
2554            !!!emit ($self->{ct}); # DOCTYPE
2555            redo A;
2556        } else {        } else {
2557          !!!cp (200);          !!!cp (200);
2558          !!!parse-error (type => 'string after PUBLIC literal');          !!!parse-error (type => 'string after PUBLIC literal');
# Line 2034  sub _get_next_token ($) { Line 2584  sub _get_next_token ($) {
2584          !!!cp (204);          !!!cp (204);
2585          !!!parse-error (type => 'no SYSTEM literal');          !!!parse-error (type => 'no SYSTEM literal');
2586          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2587            $self->{s_kwd} = '';
2588          !!!next-input-character;          !!!next-input-character;
2589    
2590          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2045  sub _get_next_token ($) { Line 2596  sub _get_next_token ($) {
2596          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2597    
2598          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2599            $self->{s_kwd} = '';
2600          ## reconsume          ## reconsume
2601    
2602          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2603          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2604    
2605          redo A;          redo A;
2606          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2607            !!!cp (206.1);
2608            !!!parse-error (type => 'no SYSTEM literal');
2609    
2610            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2611            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2612            $self->{in_subset} = 1;
2613            !!!next-input-character;
2614            !!!emit ($self->{ct}); # DOCTYPE
2615            redo A;
2616        } else {        } else {
2617          !!!cp (206);          !!!cp (206);
2618          !!!parse-error (type => 'string after SYSTEM');          !!!parse-error (type => 'string after SYSTEM');
# Line 2066  sub _get_next_token ($) { Line 2628  sub _get_next_token ($) {
2628          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2629          !!!next-input-character;          !!!next-input-character;
2630          redo A;          redo A;
2631        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2632          !!!cp (208);          !!!cp (208);
2633          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2634    
2635          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2636            $self->{s_kwd} = '';
2637          !!!next-input-character;          !!!next-input-character;
2638    
2639          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2082  sub _get_next_token ($) { Line 2645  sub _get_next_token ($) {
2645          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2646    
2647          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2648            $self->{s_kwd} = '';
2649          ## reconsume          ## reconsume
2650    
2651          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2105  sub _get_next_token ($) { Line 2669  sub _get_next_token ($) {
2669          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2670          !!!next-input-character;          !!!next-input-character;
2671          redo A;          redo A;
2672        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2673          !!!cp (212);          !!!cp (212);
2674          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2675    
2676          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2677            $self->{s_kwd} = '';
2678          !!!next-input-character;          !!!next-input-character;
2679    
2680          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2121  sub _get_next_token ($) { Line 2686  sub _get_next_token ($) {
2686          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2687    
2688          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2689            $self->{s_kwd} = '';
2690          ## reconsume          ## reconsume
2691    
2692          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2147  sub _get_next_token ($) { Line 2713  sub _get_next_token ($) {
2713        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2714          !!!cp (216);          !!!cp (216);
2715          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2716            $self->{s_kwd} = '';
2717          !!!next-input-character;          !!!next-input-character;
2718    
2719          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 2156  sub _get_next_token ($) { Line 2723  sub _get_next_token ($) {
2723          !!!cp (217);          !!!cp (217);
2724          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2725          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2726            $self->{s_kwd} = '';
2727          ## reconsume          ## reconsume
2728    
2729          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2730          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2731    
2732          redo A;          redo A;
2733          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2734            !!!cp (218.1);
2735            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2736            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2737            $self->{in_subset} = 1;
2738            !!!next-input-character;
2739            !!!emit ($self->{ct}); # DOCTYPE
2740            redo A;
2741        } else {        } else {
2742          !!!cp (218);          !!!cp (218);
2743          !!!parse-error (type => 'string after SYSTEM literal');          !!!parse-error (type => 'string after SYSTEM literal');
# Line 2175  sub _get_next_token ($) { Line 2751  sub _get_next_token ($) {
2751        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2752          !!!cp (219);          !!!cp (219);
2753          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2754            $self->{s_kwd} = '';
2755          !!!next-input-character;          !!!next-input-character;
2756    
2757          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2758    
2759          redo A;          redo A;
2760          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2761            !!!cp (220.1);
2762            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2763            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2764            $self->{in_subset} = 1;
2765            !!!next-input-character;
2766            !!!emit ($self->{ct}); # DOCTYPE
2767            redo A;
2768        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2769          !!!cp (220);          !!!cp (220);
2770          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2771            $self->{s_kwd} = '';
2772          ## reconsume          ## reconsume
2773    
2774          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 2191  sub _get_next_token ($) { Line 2777  sub _get_next_token ($) {
2777        } else {        } else {
2778          !!!cp (221);          !!!cp (221);
2779          my $s = '';          my $s = '';
2780          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
2781    
2782          ## Stay in the state          ## Stay in the state
2783          !!!next-input-character;          !!!next-input-character;
# Line 2201  sub _get_next_token ($) { Line 2787  sub _get_next_token ($) {
2787        ## NOTE: "CDATA section state" in the state is jointly implemented        ## NOTE: "CDATA section state" in the state is jointly implemented
2788        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2789        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
2790    
2791          ## XML5: "CDATA state".
2792                
2793        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
2794          !!!cp (221.1);          !!!cp (221.1);
# Line 2208  sub _get_next_token ($) { Line 2796  sub _get_next_token ($) {
2796          !!!next-input-character;          !!!next-input-character;
2797          redo A;          redo A;
2798        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2799            if ($self->{is_xml}) {
2800              !!!cp (221.11);
2801              !!!parse-error (type => 'no mse'); ## TODO: type
2802            } else {
2803              !!!cp (221.12);
2804            }
2805    
2806          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2807          !!!next-input-character;          $self->{s_kwd} = '';
2808            ## Reconsume.
2809          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
2810            !!!cp (221.2);            !!!cp (221.2);
2811            !!!emit ($self->{ct}); # character            !!!emit ($self->{ct}); # character
# Line 2232  sub _get_next_token ($) { Line 2828  sub _get_next_token ($) {
2828    
2829        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
2830      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2831          ## XML5: "CDATA bracket state".
2832    
2833        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
2834          !!!cp (221.5);          !!!cp (221.5);
2835          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 2239  sub _get_next_token ($) { Line 2837  sub _get_next_token ($) {
2837          redo A;          redo A;
2838        } else {        } else {
2839          !!!cp (221.6);          !!!cp (221.6);
2840            ## XML5: If EOF, "]" is not appended and changed to the data state.
2841          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
2842          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
2843          ## Reconsume.          ## Reconsume.
2844          redo A;          redo A;
2845        }        }
2846      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2847          ## XML5: "CDATA end state".
2848    
2849        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2850          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2851            $self->{s_kwd} = '';
2852          !!!next-input-character;          !!!next-input-character;
2853          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
2854            !!!cp (221.7);            !!!cp (221.7);
# Line 2266  sub _get_next_token ($) { Line 2868  sub _get_next_token ($) {
2868          !!!cp (221.11);          !!!cp (221.11);
2869          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
2870          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
2871          ## Reconsume.          ## Reconsume. ## XML5: Emit.
2872          redo A;          redo A;
2873        }        }
2874      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {
# Line 2283  sub _get_next_token ($) { Line 2885  sub _get_next_token ($) {
2885        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
2886          !!!cp (999);          !!!cp (999);
2887          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
2888          $self->{s_kwd} = '#';          $self->{kwd} = '#';
2889          !!!next-input-character;          !!!next-input-character;
2890          redo A;          redo A;
2891        } elsif ((0x0041 <= $self->{nc} and        } elsif ((0x0041 <= $self->{nc} and
# Line 2293  sub _get_next_token ($) { Line 2895  sub _get_next_token ($) {
2895          !!!cp (998);          !!!cp (998);
2896          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
2897          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
2898          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2899          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
2900          $self->{entity__match} = 0;          $self->{entity__match} = 0;
2901          !!!next-input-character;          !!!next-input-character;
2902          redo A;          redo A;
# Line 2314  sub _get_next_token ($) { Line 2916  sub _get_next_token ($) {
2916        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
2917          !!!cp (997);          !!!cp (997);
2918          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2919            $self->{s_kwd} = '';
2920          ## Reconsume.          ## Reconsume.
2921          !!!emit ({type => CHARACTER_TOKEN, data => '&',          !!!emit ({type => CHARACTER_TOKEN, data => '&',
2922                    line => $self->{line_prev},                    line => $self->{line_prev},
# Line 2324  sub _get_next_token ($) { Line 2927  sub _get_next_token ($) {
2927          !!!cp (996);          !!!cp (996);
2928          $self->{ca}->{value} .= '&';          $self->{ca}->{value} .= '&';
2929          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2930            $self->{s_kwd} = '';
2931          ## Reconsume.          ## Reconsume.
2932          redo A;          redo A;
2933        }        }
# Line 2332  sub _get_next_token ($) { Line 2936  sub _get_next_token ($) {
2936            $self->{nc} == 0x0058) { # X            $self->{nc} == 0x0058) { # X
2937          !!!cp (995);          !!!cp (995);
2938          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
2939          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2940          !!!next-input-character;          !!!next-input-character;
2941          redo A;          redo A;
2942        } elsif (0x0030 <= $self->{nc} and        } elsif (0x0030 <= $self->{nc} and
2943                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
2944          !!!cp (994);          !!!cp (994);
2945          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
2946          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
2947          !!!next-input-character;          !!!next-input-character;
2948          redo A;          redo A;
2949        } else {        } else {
# Line 2354  sub _get_next_token ($) { Line 2958  sub _get_next_token ($) {
2958          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
2959            !!!cp (1019);            !!!cp (1019);
2960            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2961              $self->{s_kwd} = '';
2962            ## Reconsume.            ## Reconsume.
2963            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
2964                      data => '&#',                      data => '&#',
# Line 2365  sub _get_next_token ($) { Line 2970  sub _get_next_token ($) {
2970            !!!cp (993);            !!!cp (993);
2971            $self->{ca}->{value} .= '&#';            $self->{ca}->{value} .= '&#';
2972            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2973              $self->{s_kwd} = '';
2974            ## Reconsume.            ## Reconsume.
2975            redo A;            redo A;
2976          }          }
# Line 2373  sub _get_next_token ($) { Line 2979  sub _get_next_token ($) {
2979        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
2980            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
2981          !!!cp (1012);          !!!cp (1012);
2982          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
2983          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
2984                    
2985          ## Stay in the state.          ## Stay in the state.
2986          !!!next-input-character;          !!!next-input-character;
# Line 2390  sub _get_next_token ($) { Line 2996  sub _get_next_token ($) {
2996          #          #
2997        }        }
2998    
2999        my $code = $self->{s_kwd};        my $code = $self->{kwd};
3000        my $l = $self->{line_prev};        my $l = $self->{line_prev};
3001        my $c = $self->{column_prev};        my $c = $self->{column_prev};
3002        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 2410  sub _get_next_token ($) { Line 3016  sub _get_next_token ($) {
3016        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
3017          !!!cp (992);          !!!cp (992);
3018          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
3019            $self->{s_kwd} = '';
3020          ## Reconsume.          ## Reconsume.
3021          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3022                      has_reference => 1,
3023                    line => $l, column => $c,                    line => $l, column => $c,
3024                   });                   });
3025          redo A;          redo A;
# Line 2420  sub _get_next_token ($) { Line 3028  sub _get_next_token ($) {
3028          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
3029          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
3030          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
3031            $self->{s_kwd} = '';
3032          ## Reconsume.          ## Reconsume.
3033          redo A;          redo A;
3034        }        }
# Line 2430  sub _get_next_token ($) { Line 3039  sub _get_next_token ($) {
3039          # 0..9, A..F, a..f          # 0..9, A..F, a..f
3040          !!!cp (990);          !!!cp (990);
3041          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
3042          $self->{s_kwd} = 0;          $self->{kwd} = 0;
3043          ## Reconsume.          ## Reconsume.
3044          redo A;          redo A;
3045        } else {        } else {
# Line 2445  sub _get_next_token ($) { Line 3054  sub _get_next_token ($) {
3054          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
3055            !!!cp (1005);            !!!cp (1005);
3056            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
3057              $self->{s_kwd} = '';
3058            ## Reconsume.            ## Reconsume.
3059            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
3060                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
3061                      line => $self->{line_prev},                      line => $self->{line_prev},
3062                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
3063                     });                     });
3064            redo A;            redo A;
3065          } else {          } else {
3066            !!!cp (989);            !!!cp (989);
3067            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
3068            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
3069              $self->{s_kwd} = '';
3070            ## Reconsume.            ## Reconsume.
3071            redo A;            redo A;
3072          }          }
# Line 2464  sub _get_next_token ($) { Line 3075  sub _get_next_token ($) {
3075        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3076          # 0..9          # 0..9
3077          !!!cp (1002);          !!!cp (1002);
3078          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
3079          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
3080          ## Stay in the state.          ## Stay in the state.
3081          !!!next-input-character;          !!!next-input-character;
3082          redo A;          redo A;
3083        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
3084                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
3085          !!!cp (1003);          !!!cp (1003);
3086          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
3087          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
3088          ## Stay in the state.          ## Stay in the state.
3089          !!!next-input-character;          !!!next-input-character;
3090          redo A;          redo A;
3091        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
3092                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
3093          !!!cp (1004);          !!!cp (1004);
3094          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
3095          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
3096          ## Stay in the state.          ## Stay in the state.
3097          !!!next-input-character;          !!!next-input-character;
3098          redo A;          redo A;
# Line 2498  sub _get_next_token ($) { Line 3109  sub _get_next_token ($) {
3109          #          #
3110        }        }
3111    
3112        my $code = $self->{s_kwd};        my $code = $self->{kwd};
3113        my $l = $self->{line_prev};        my $l = $self->{line_prev};
3114        my $c = $self->{column_prev};        my $c = $self->{column_prev};
3115        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 2518  sub _get_next_token ($) { Line 3129  sub _get_next_token ($) {
3129        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
3130          !!!cp (988);          !!!cp (988);
3131          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
3132            $self->{s_kwd} = '';
3133          ## Reconsume.          ## Reconsume.
3134          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3135                      has_reference => 1,
3136                    line => $l, column => $c,                    line => $l, column => $c,
3137                   });                   });
3138          redo A;          redo A;
# Line 2528  sub _get_next_token ($) { Line 3141  sub _get_next_token ($) {
3141          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
3142          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
3143          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
3144            $self->{s_kwd} = '';
3145          ## Reconsume.          ## Reconsume.
3146          redo A;          redo A;
3147        }        }
3148      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
3149        if (length $self->{s_kwd} < 30 and        if (length $self->{kwd} < 30 and
3150            ## NOTE: Some number greater than the maximum length of entity name            ## NOTE: Some number greater than the maximum length of entity name
3151            ((0x0041 <= $self->{nc} and # a            ((0x0041 <= $self->{nc} and # a
3152              $self->{nc} <= 0x005A) or # x              $self->{nc} <= 0x005A) or # x
# Line 2542  sub _get_next_token ($) { Line 3156  sub _get_next_token ($) {
3156              $self->{nc} <= 0x0039) or # 9              $self->{nc} <= 0x0039) or # 9
3157             $self->{nc} == 0x003B)) { # ;             $self->{nc} == 0x003B)) { # ;
3158          our $EntityChar;          our $EntityChar;
3159          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3160          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}}) {
3161            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
3162              !!!cp (1020);              !!!cp (1020);
3163              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
3164              $self->{entity__match} = 1;              $self->{entity__match} = 1;
3165              !!!next-input-character;              !!!next-input-character;
3166              #              #
3167            } else {            } else {
3168              !!!cp (1021);              !!!cp (1021);
3169              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
3170              $self->{entity__match} = -1;              $self->{entity__match} = -1;
3171              ## Stay in the state.              ## Stay in the state.
3172              !!!next-input-character;              !!!next-input-character;
# Line 2580  sub _get_next_token ($) { Line 3194  sub _get_next_token ($) {
3194          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
3195              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
3196            !!!cp (1024);            !!!cp (1024);
3197            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
3198            #            #
3199          } else {          } else {
3200            !!!cp (1025);            !!!cp (1025);
# Line 2592  sub _get_next_token ($) { Line 3206  sub _get_next_token ($) {
3206          !!!cp (1026);          !!!cp (1026);
3207          !!!parse-error (type => 'bare ero',          !!!parse-error (type => 'bare ero',
3208                          line => $self->{line_prev},                          line => $self->{line_prev},
3209                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
3210          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
3211          #          #
3212        }        }
3213        
# Line 2610  sub _get_next_token ($) { Line 3224  sub _get_next_token ($) {
3224        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
3225          !!!cp (986);          !!!cp (986);
3226          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
3227            $self->{s_kwd} = '';
3228          ## Reconsume.          ## Reconsume.
3229          !!!emit ({type => CHARACTER_TOKEN,          !!!emit ({type => CHARACTER_TOKEN,
3230                    data => $data,                    data => $data,
3231                      has_reference => $has_ref,
3232                    line => $self->{line_prev},                    line => $self->{line_prev},
3233                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
3234                   });                   });
3235          redo A;          redo A;
3236        } else {        } else {
# Line 2622  sub _get_next_token ($) { Line 3238  sub _get_next_token ($) {
3238          $self->{ca}->{value} .= $data;          $self->{ca}->{value} .= $data;
3239          $self->{ca}->{has_reference} = 1 if $has_ref;          $self->{ca}->{has_reference} = 1 if $has_ref;
3240          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
3241            $self->{s_kwd} = '';
3242            ## Reconsume.
3243            redo A;
3244          }
3245    
3246        ## XML-only states
3247    
3248        } elsif ($self->{state} == PI_STATE) {
3249          ## XML5: "Pi state" and "DOCTYPE pi state".
3250    
3251          if ($is_space->{$self->{nc}} or
3252              $self->{nc} == 0x003F or # ?
3253              $self->{nc} == -1) {
3254            ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3255            ## pi state": Switch to the "DOCTYPE pi after state".  EOF:
3256            ## "DOCTYPE pi state": Parse error, switch to the "data
3257            ## state".
3258            !!!parse-error (type => 'bare pio', ## TODO: type
3259                            line => $self->{line_prev},
3260                            column => $self->{column_prev}
3261                                - 1 * ($self->{nc} != -1));
3262            $self->{state} = BOGUS_COMMENT_STATE;
3263            ## Reconsume.
3264            $self->{ct} = {type => COMMENT_TOKEN,
3265                           data => '?',
3266                           line => $self->{line_prev},
3267                           column => $self->{column_prev}
3268                               - 1 * ($self->{nc} != -1),
3269                          };
3270            redo A;
3271          } else {
3272            ## XML5: "DOCTYPE pi state": Stay in the state.
3273            $self->{ct} = {type => PI_TOKEN,
3274                           target => chr $self->{nc},
3275                           data => '',
3276                           line => $self->{line_prev},
3277                           column => $self->{column_prev} - 1,
3278                          };
3279            $self->{state} = PI_TARGET_STATE;
3280            !!!next-input-character;
3281            redo A;
3282          }
3283        } elsif ($self->{state} == PI_TARGET_STATE) {
3284          if ($is_space->{$self->{nc}}) {
3285            $self->{state} = PI_TARGET_AFTER_STATE;
3286            !!!next-input-character;
3287            redo A;
3288          } elsif ($self->{nc} == -1) {
3289            !!!parse-error (type => 'no pic'); ## TODO: type
3290            if ($self->{in_subset}) {
3291              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3292            } else {
3293              $self->{state} = DATA_STATE;
3294              $self->{s_kwd} = '';
3295            }
3296            ## Reconsume.
3297            !!!emit ($self->{ct}); # pi
3298            redo A;
3299          } elsif ($self->{nc} == 0x003F) { # ?
3300            $self->{state} = PI_AFTER_STATE;
3301            !!!next-input-character;
3302            redo A;
3303          } else {
3304            ## XML5: typo ("tag name" -> "target")
3305            $self->{ct}->{target} .= chr $self->{nc}; # pi
3306            !!!next-input-character;
3307            redo A;
3308          }
3309        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3310          if ($is_space->{$self->{nc}}) {
3311            ## Stay in the state.
3312            !!!next-input-character;
3313            redo A;
3314          } else {
3315            $self->{state} = PI_DATA_STATE;
3316            ## Reprocess.
3317            redo A;
3318          }
3319        } elsif ($self->{state} == PI_DATA_STATE) {
3320          if ($self->{nc} == 0x003F) { # ?
3321            $self->{state} = PI_DATA_AFTER_STATE;
3322            !!!next-input-character;
3323            redo A;
3324          } elsif ($self->{nc} == -1) {
3325            !!!parse-error (type => 'no pic'); ## TODO: type
3326            if ($self->{in_subset}) {
3327              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3328            } else {
3329              $self->{state} = DATA_STATE;
3330              $self->{s_kwd} = '';
3331            }
3332            ## Reprocess.
3333            !!!emit ($self->{ct}); # pi
3334            redo A;
3335          } else {
3336            $self->{ct}->{data} .= chr $self->{nc}; # pi
3337            $self->{read_until}->($self->{ct}->{data}, q[?],
3338                                  length $self->{ct}->{data});
3339            ## Stay in the state.
3340            !!!next-input-character;
3341            ## Reprocess.
3342            redo A;
3343          }
3344        } elsif ($self->{state} == PI_AFTER_STATE) {
3345          ## XML5: Part of "Pi after state".
3346    
3347          if ($self->{nc} == 0x003E) { # >
3348            if ($self->{in_subset}) {
3349              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3350            } else {
3351              $self->{state} = DATA_STATE;
3352              $self->{s_kwd} = '';
3353            }
3354            !!!next-input-character;
3355            !!!emit ($self->{ct}); # pi
3356            redo A;
3357          } elsif ($self->{nc} == 0x003F) { # ?
3358            !!!parse-error (type => 'no s after target', ## TODO: type
3359                            line => $self->{line_prev},
3360                            column => $self->{column_prev}); ## XML5: no error
3361            $self->{ct}->{data} .= '?';
3362            $self->{state} = PI_DATA_AFTER_STATE;
3363            !!!next-input-character;
3364            redo A;
3365          } else {
3366            !!!parse-error (type => 'no s after target', ## TODO: type
3367                            line => $self->{line_prev},
3368                            column => $self->{column_prev}
3369                                + 1 * ($self->{nc} == -1)); ## XML5: no error
3370            $self->{ct}->{data} .= '?'; ## XML5: not appended
3371            $self->{state} = PI_DATA_STATE;
3372            ## Reprocess.
3373            redo A;
3374          }
3375        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3376          ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3377    
3378          if ($self->{nc} == 0x003E) { # >
3379            if ($self->{in_subset}) {
3380              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3381            } else {
3382              $self->{state} = DATA_STATE;
3383              $self->{s_kwd} = '';
3384            }
3385            !!!next-input-character;
3386            !!!emit ($self->{ct}); # pi
3387            redo A;
3388          } elsif ($self->{nc} == 0x003F) { # ?
3389            $self->{ct}->{data} .= '?';
3390            ## Stay in the state.
3391            !!!next-input-character;
3392            redo A;
3393          } else {
3394            $self->{ct}->{data} .= '?'; ## XML5: not appended
3395            $self->{state} = PI_DATA_STATE;
3396            ## Reprocess.
3397            redo A;
3398          }
3399    
3400        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3401          if ($self->{nc} == 0x003C) { # <
3402            $self->{state} = DOCTYPE_TAG_STATE;
3403            !!!next-input-character;
3404            redo A;
3405          } elsif ($self->{nc} == 0x0025) { # %
3406            ## XML5: Not defined yet.
3407    
3408            ## TODO:
3409            !!!next-input-character;
3410            redo A;
3411          } elsif ($self->{nc} == 0x005D) { # ]
3412            delete $self->{in_subset};
3413            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3414            !!!next-input-character;
3415            redo A;
3416          } elsif ($is_space->{$self->{nc}}) {
3417            ## Stay in the state.
3418            !!!next-input-character;
3419            redo A;
3420          } elsif ($self->{nc} == -1) {
3421            !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3422            delete $self->{in_subset};
3423            $self->{state} = DATA_STATE;
3424            $self->{s_kwd} = '';
3425          ## Reconsume.          ## Reconsume.
3426            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3427            redo A;
3428          } else {
3429            unless ($self->{internal_subset_tainted}) {
3430              ## XML5: No parse error.
3431              !!!parse-error (type => 'string in internal subset');
3432              $self->{internal_subset_tainted} = 1;
3433            }
3434            ## Stay in the state.
3435            !!!next-input-character;
3436          redo A;          redo A;
3437        }        }
3438        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3439          if ($self->{nc} == 0x003E) { # >
3440            $self->{state} = DATA_STATE;
3441            $self->{s_kwd} = '';
3442            !!!next-input-character;
3443            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3444            redo A;
3445          } elsif ($self->{nc} == -1) {
3446            !!!parse-error (type => 'unclosed DOCTYPE');
3447            $self->{state} = DATA_STATE;
3448            $self->{s_kwd} = '';
3449            ## Reconsume.
3450            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3451            redo A;
3452          } else {
3453            ## XML5: No parse error and stay in the state.
3454            !!!parse-error (type => 'string after internal subset'); ## TODO: type
3455    
3456            $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3457            !!!next-input-character;
3458            redo A;
3459          }
3460        } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3461          if ($self->{nc} == 0x003E) { # >
3462            $self->{state} = DATA_STATE;
3463            $self->{s_kwd} = '';
3464            !!!next-input-character;
3465            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3466            redo A;
3467          } elsif ($self->{nc} == -1) {
3468            $self->{state} = DATA_STATE;
3469            $self->{s_kwd} = '';
3470            ## Reconsume.
3471            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3472            redo A;
3473          } else {
3474            ## Stay in the state.
3475            !!!next-input-character;
3476            redo A;
3477          }
3478        } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3479          if ($self->{nc} == 0x0021) { # !
3480            $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3481            !!!next-input-character;
3482            redo A;
3483          } elsif ($self->{nc} == 0x003F) { # ?
3484            $self->{state} = PI_STATE;
3485            !!!next-input-character;
3486            redo A;
3487          } elsif ($self->{nc} == -1) {
3488            !!!parse-error (type => 'bare stago');
3489            $self->{state} = DATA_STATE;
3490            $self->{s_kwd} = '';
3491            ## Reconsume.
3492            redo A;
3493          } else {
3494            !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3495                            line => $self->{line_prev},
3496                            column => $self->{column_prev});
3497            $self->{state} = BOGUS_COMMENT_STATE;
3498            $self->{ct} = {type => COMMENT_TOKEN,
3499                           data => '',
3500                          }; ## NOTE: Will be discarded.
3501            !!!next-input-character;
3502            redo A;
3503          }
3504        } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3505          ## XML5: "DOCTYPE markup declaration state".
3506          
3507          if ($self->{nc} == 0x002D) { # -
3508            $self->{state} = MD_HYPHEN_STATE;
3509            !!!next-input-character;
3510            redo A;
3511          } elsif ($self->{nc} == 0x0045) { # E
3512            $self->{state} = MD_E_STATE;
3513            $self->{kwd} = chr $self->{nc};
3514            !!!next-input-character;
3515            redo A;
3516          } elsif ($self->{nc} == 0x0041) { # A
3517            $self->{state} = MD_ATTLIST_STATE;
3518            $self->{kwd} = chr $self->{nc};
3519            !!!next-input-character;
3520            redo A;
3521          } elsif ($self->{nc} == 0x004E) { # N
3522            $self->{state} = MD_NOTATION_STATE;
3523            $self->{kwd} = chr $self->{nc};
3524            !!!next-input-character;
3525            redo A;
3526          } else {
3527            #
3528          }
3529          
3530          ## XML5: No parse error.
3531          !!!parse-error (type => 'bogus comment',
3532                          line => $self->{line_prev},
3533                          column => $self->{column_prev} - 1);
3534          ## Reconsume.
3535          $self->{state} = BOGUS_COMMENT_STATE;
3536          $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3537          redo A;
3538        } elsif ($self->{state} == MD_E_STATE) {
3539          if ($self->{nc} == 0x004E) { # N
3540            $self->{state} = MD_ENTITY_STATE;
3541            $self->{kwd} .= chr $self->{nc};
3542            !!!next-input-character;
3543            redo A;
3544          } elsif ($self->{nc} == 0x004C) { # L
3545            ## XML5: <!ELEMENT> not supported.
3546            $self->{state} = MD_ELEMENT_STATE;
3547            $self->{kwd} .= chr $self->{nc};
3548            !!!next-input-character;
3549            redo A;
3550          } else {
3551            ## XML5: No parse error.
3552            !!!parse-error (type => 'bogus comment',
3553                            line => $self->{line_prev},
3554                            column => $self->{column_prev} - 2
3555                                + 1 * ($self->{nc} == -1));
3556            ## Reconsume.
3557            $self->{state} = BOGUS_COMMENT_STATE;
3558            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3559            redo A;
3560          }
3561        } elsif ($self->{state} == MD_ENTITY_STATE) {
3562          if ($self->{nc} == {
3563                'EN' => 0x0054, # T
3564                'ENT' => 0x0049, # I
3565                'ENTI' => 0x0054, # T
3566              }->{$self->{kwd}}) {
3567            ## Stay in the state.
3568            $self->{kwd} .= chr $self->{nc};
3569            !!!next-input-character;
3570            redo A;
3571          } elsif ($self->{kwd} eq 'ENTIT' and
3572                   $self->{nc} == 0x0059) { # Y
3573            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '', text => '',
3574                           line => $self->{line_prev},
3575                           column => $self->{column_prev} - 6};
3576            $self->{state} = DOCTYPE_MD_STATE;
3577            !!!next-input-character;
3578            redo A;
3579          } else {
3580            !!!parse-error (type => 'bogus comment',
3581                            line => $self->{line_prev},
3582                            column => $self->{column_prev} - 1
3583                                - (length $self->{kwd})
3584                                + 1 * ($self->{nc} == -1));
3585            $self->{state} = BOGUS_COMMENT_STATE;
3586            ## Reconsume.
3587            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3588            redo A;
3589          }
3590        } elsif ($self->{state} == MD_ELEMENT_STATE) {
3591          if ($self->{nc} == {
3592                'EL' => 0x0045, # E
3593                'ELE' => 0x004D, # M
3594                'ELEM' => 0x0045, # E
3595                'ELEME' => 0x004E, # N
3596              }->{$self->{kwd}}) {
3597            ## Stay in the state.
3598            $self->{kwd} .= chr $self->{nc};
3599            !!!next-input-character;
3600            redo A;
3601          } elsif ($self->{kwd} eq 'ELEMEN' and
3602                   $self->{nc} == 0x0054) { # T
3603            $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3604                           line => $self->{line_prev},
3605                           column => $self->{column_prev} - 6};
3606            $self->{state} = DOCTYPE_MD_STATE;
3607            !!!next-input-character;
3608            redo A;
3609          } else {
3610            !!!parse-error (type => 'bogus comment',
3611                            line => $self->{line_prev},
3612                            column => $self->{column_prev} - 1
3613                                - (length $self->{kwd})
3614                                + 1 * ($self->{nc} == -1));
3615            $self->{state} = BOGUS_COMMENT_STATE;
3616            ## Reconsume.
3617            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3618            redo A;
3619          }
3620        } elsif ($self->{state} == MD_ATTLIST_STATE) {
3621          if ($self->{nc} == {
3622                'A' => 0x0054, # T
3623                'AT' => 0x0054, # T
3624                'ATT' => 0x004C, # L
3625                'ATTL' => 0x0049, # I
3626                'ATTLI' => 0x0053, # S
3627              }->{$self->{kwd}}) {
3628            ## Stay in the state.
3629            $self->{kwd} .= chr $self->{nc};
3630            !!!next-input-character;
3631            redo A;
3632          } elsif ($self->{kwd} eq 'ATTLIS' and
3633                   $self->{nc} == 0x0054) { # T
3634            $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3635                           attrdefs => [],
3636                           line => $self->{line_prev},
3637                           column => $self->{column_prev} - 6};
3638            $self->{state} = DOCTYPE_MD_STATE;
3639            !!!next-input-character;
3640            redo A;
3641          } else {
3642            !!!parse-error (type => 'bogus comment',
3643                            line => $self->{line_prev},
3644                            column => $self->{column_prev} - 1
3645                                 - (length $self->{kwd})
3646                                 + 1 * ($self->{nc} == -1));
3647            $self->{state} = BOGUS_COMMENT_STATE;
3648            ## Reconsume.
3649            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3650            redo A;
3651          }
3652        } elsif ($self->{state} == MD_NOTATION_STATE) {
3653          if ($self->{nc} == {
3654                'N' => 0x004F, # O
3655                'NO' => 0x0054, # T
3656                'NOT' => 0x0041, # A
3657                'NOTA' => 0x0054, # T
3658                'NOTAT' => 0x0049, # I
3659                'NOTATI' => 0x004F, # O
3660              }->{$self->{kwd}}) {
3661            ## Stay in the state.
3662            $self->{kwd} .= chr $self->{nc};
3663            !!!next-input-character;
3664            redo A;
3665          } elsif ($self->{kwd} eq 'NOTATIO' and
3666                   $self->{nc} == 0x004E) { # N
3667            $self->{ct} = {type => NOTATION_TOKEN, name => '',
3668                           line => $self->{line_prev},
3669                           column => $self->{column_prev} - 6};
3670            $self->{state} = DOCTYPE_MD_STATE;
3671            !!!next-input-character;
3672            redo A;
3673          } else {
3674            !!!parse-error (type => 'bogus comment',
3675                            line => $self->{line_prev},
3676                            column => $self->{column_prev} - 1
3677                                - (length $self->{kwd})
3678                                + 1 * ($self->{nc} == -1));
3679            $self->{state} = BOGUS_COMMENT_STATE;
3680            ## Reconsume.
3681            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3682            redo A;
3683          }
3684        } elsif ($self->{state} == DOCTYPE_MD_STATE) {
3685          ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
3686          ## "DOCTYPE NOTATION state".
3687    
3688          if ($is_space->{$self->{nc}}) {
3689            ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
3690            $self->{state} = BEFORE_MD_NAME_STATE;
3691            !!!next-input-character;
3692            redo A;
3693          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3694                   $self->{nc} == 0x0025) { # %
3695            ## XML5: Switch to the "DOCTYPE bogus comment state".
3696            !!!parse-error (type => 'no space before md name'); ## TODO: type
3697            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3698            !!!next-input-character;
3699            redo A;
3700          } elsif ($self->{nc} == -1) {
3701            !!!parse-error (type => 'unclosed md'); ## TODO: type
3702            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3703            ## Reconsume.
3704            redo A;
3705          } elsif ($self->{nc} == 0x003E) { # >
3706            ## XML5: Switch to the "DOCTYPE bogus comment state".
3707            !!!parse-error (type => 'no md name'); ## TODO: type
3708            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3709            !!!next-input-character;
3710            redo A;
3711          } else {
3712            ## XML5: Switch to the "DOCTYPE bogus comment state".
3713            !!!parse-error (type => 'no space before md name'); ## TODO: type
3714            $self->{state} = BEFORE_MD_NAME_STATE;
3715            redo A;
3716          }
3717        } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
3718          ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
3719          ## before state", "DOCTYPE ATTLIST name before state".
3720    
3721          if ($is_space->{$self->{nc}}) {
3722            ## Stay in the state.
3723            !!!next-input-character;
3724            redo A;
3725          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3726                   $self->{nc} == 0x0025) { # %
3727            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3728            !!!next-input-character;
3729            redo A;
3730          } elsif ($self->{nc} == 0x003E) { # >
3731            ## XML5: Same as "Anything else".
3732            !!!parse-error (type => 'no md name'); ## TODO: type
3733            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3734            !!!next-input-character;
3735            redo A;
3736          } elsif ($self->{nc} == -1) {
3737            !!!parse-error (type => 'unclosed md'); ## TODO: type
3738            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3739            ## Reconsume.
3740            redo A;
3741          } else {
3742            ## XML5: [ATTLIST] Not defined yet.
3743            $self->{ct}->{name} .= chr $self->{nc};
3744            $self->{state} = MD_NAME_STATE;
3745            !!!next-input-character;
3746            redo A;
3747          }
3748        } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
3749          if ($is_space->{$self->{nc}}) {
3750            ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
3751            $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
3752            $self->{state} = BEFORE_MD_NAME_STATE;
3753            !!!next-input-character;
3754            redo A;
3755          } elsif ($self->{nc} == 0x003E) { # >
3756            ## XML5: Same as "Anything else".
3757            !!!parse-error (type => 'no md name'); ## TODO: type
3758            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3759            !!!next-input-character;
3760            redo A;
3761          } elsif ($self->{nc} == -1) {
3762            !!!parse-error (type => 'unclosed md');
3763            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3764            ## Reconsume.
3765            redo A;
3766          } else {
3767            ## XML5: No parse error.
3768            !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
3769            $self->{state} = BOGUS_COMMENT_STATE;
3770            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3771            ## Reconsume.
3772            redo A;
3773          }
3774        } elsif ($self->{state} == MD_NAME_STATE) {
3775          ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
3776          
3777          if ($is_space->{$self->{nc}}) {
3778            ## TODO:
3779            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
3780            !!!next-input-character;
3781            redo A;
3782          } elsif ($self->{nc} == 0x003E) { # >
3783            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
3784              #
3785            } else {
3786              !!!parse-error (type => 'no md body'); ## TODO: type
3787            }
3788            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3789            !!!next-input-character;
3790            !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
3791            redo A;
3792          } elsif ($self->{nc} == -1) {
3793            ## XML5: [ATTLIST] No parse error.
3794            !!!parse-error (type => 'unclosed md');
3795            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3796            ## Reconsume.
3797            !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
3798            redo A;
3799          } else {
3800            ## XML5: [ATTLIST] Not defined yet.
3801            $self->{ct}->{name} .= chr $self->{nc};
3802            ## Stay in the state.
3803            !!!next-input-character;
3804            redo A;
3805          }
3806        } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
3807          if ($is_space->{$self->{nc}}) {
3808            ## Stay in the state.
3809            !!!next-input-character;
3810            redo A;
3811          } elsif ($self->{nc} == 0x003E) { # >
3812            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3813            !!!next-input-character;
3814            !!!emit ($self->{ct}); # ATTLIST
3815            redo A;
3816          } elsif ($self->{nc} == -1) {
3817            ## XML5: No parse error.
3818            !!!parse-error (type => 'unclosed md'); ## TODO: type
3819            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3820            !!!emit ($self->{ct});
3821            redo A;
3822          } else {
3823            ## XML5: Not defined yet.
3824            $self->{ca} = {name => chr ($self->{nc}), # attrdef
3825                           tokens => [],
3826                           line => $self->{line}, column => $self->{column}};
3827            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
3828            !!!next-input-character;
3829            redo A;
3830          }
3831        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
3832          if ($is_space->{$self->{nc}}) {
3833            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
3834            !!!next-input-character;
3835            redo A;
3836          } elsif ($self->{nc} == 0x003E) { # >
3837            ## XML5: Same as "anything else".
3838            !!!parse-error (type => 'no attr type'); ## TODO: type
3839            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3840            !!!next-input-character;
3841            !!!emit ($self->{ct}); # ATTLIST
3842            redo A;
3843          } elsif ($self->{nc} == 0x0028) { # (
3844            ## XML5: Same as "anything else".
3845            !!!parse-error (type => 'no space before paren'); ## TODO: type
3846            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
3847            !!!next-input-character;
3848            redo A;
3849          } elsif ($self->{nc} == -1) {
3850            ## XML5: No parse error.
3851            !!!parse-error (type => 'unclosed md'); ## TODO: type
3852            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3853            !!!next-input-character;
3854            !!!emit ($self->{ct}); # ATTLIST
3855            redo A;
3856          } else {
3857            ## XML5: Not defined yet.
3858            $self->{ca}->{name} .= chr $self->{nc};
3859            ## Stay in the state.
3860            !!!next-input-character;
3861            redo A;
3862          }
3863        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
3864          if ($is_space->{$self->{nc}}) {
3865            ## Stay in the state.
3866            !!!next-input-character;
3867            redo A;
3868          } elsif ($self->{nc} == 0x003E) { # >
3869            ## XML5: Same as "anything else".
3870            !!!parse-error (type => 'no attr type'); ## TODO: type
3871            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3872            !!!next-input-character;
3873            !!!emit ($self->{ct}); # ATTLIST
3874            redo A;
3875          } elsif ($self->{nc} == 0x0028) { # (
3876            ## XML5: Same as "anything else".
3877            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
3878            !!!next-input-character;
3879            redo A;
3880          } elsif ($self->{nc} == -1) {
3881            ## XML5: No parse error.
3882            !!!parse-error (type => 'unclosed md'); ## TODO: type
3883            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3884            !!!next-input-character;
3885            !!!emit ($self->{ct});
3886            redo A;
3887          } else {
3888            ## XML5: Not defined yet.
3889            $self->{ca}->{type} = chr $self->{nc};
3890            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
3891            !!!next-input-character;
3892            redo A;
3893          }
3894        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
3895          if ($is_space->{$self->{nc}}) {
3896            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
3897            !!!next-input-character;
3898            redo A;
3899          } elsif ($self->{nc} == 0x0023) { # #
3900            ## XML5: Same as "anything else".
3901            !!!parse-error (type => 'no space before default value'); ## TODO: type
3902            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
3903            !!!next-input-character;
3904            redo A;
3905          } elsif ($self->{nc} == 0x0022) { # "
3906            ## XML5: Same as "anything else".
3907            !!!parse-error (type => 'no space before default value'); ## TODO: type
3908            $self->{ca}->{value} = '';
3909            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
3910            !!!next-input-character;
3911            redo A;
3912          } elsif ($self->{nc} == 0x0027) { # '
3913            ## XML5: Same as "anything else".
3914            !!!parse-error (type => 'no space before default value'); ## TODO: type
3915            $self->{ca}->{value} = '';
3916            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
3917            !!!next-input-character;
3918            redo A;
3919          } elsif ($self->{nc} == 0x003E) { # >
3920            ## XML5: Same as "anything else".
3921            !!!parse-error (type => 'no attr default'); ## TODO: type
3922            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3923            !!!next-input-character;
3924            !!!emit ($self->{ct}); # ATTLIST
3925            redo A;
3926          } elsif ($self->{nc} == 0x0028) { # (
3927            ## XML5: Same as "anything else".
3928            !!!parse-error (type => 'no space before paren'); ## TODO: type
3929            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
3930            !!!next-input-character;
3931            redo A;
3932          } elsif ($self->{nc} == -1) {
3933            ## XML5: No parse error.
3934            !!!parse-error (type => 'unclosed md'); ## TODO: type
3935            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3936            !!!next-input-character;
3937            !!!emit ($self->{ct});
3938            redo A;
3939          } else {
3940            ## XML5: Not defined yet.
3941            $self->{ca}->{type} .= chr $self->{nc};
3942            ## Stay in the state.
3943            !!!next-input-character;
3944            redo A;
3945          }
3946        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
3947          if ($is_space->{$self->{nc}}) {
3948            ## Stay in the state.
3949            !!!next-input-character;
3950            redo A;
3951          } elsif ($self->{nc} == 0x0028) { # (
3952            ## XML5: Same as "anything else".
3953            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
3954            !!!next-input-character;
3955            redo A;
3956          } elsif ($self->{nc} == 0x0023) { # #
3957            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
3958            !!!next-input-character;
3959            redo A;
3960          } elsif ($self->{nc} == 0x0022) { # "
3961            ## XML5: Same as "anything else".
3962            $self->{ca}->{value} = '';
3963            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
3964            !!!next-input-character;
3965            redo A;
3966          } elsif ($self->{nc} == 0x0027) { # '
3967            ## XML5: Same as "anything else".
3968            $self->{ca}->{value} = '';
3969            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
3970            !!!next-input-character;
3971            redo A;
3972          } elsif ($self->{nc} == 0x003E) { # >
3973            ## XML5: Same as "anything else".
3974            !!!parse-error (type => 'no attr default'); ## TODO: type
3975            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3976            !!!next-input-character;
3977            !!!emit ($self->{ct}); # ATTLIST
3978            redo A;
3979          } elsif ($self->{nc} == -1) {
3980            ## XML5: No parse error.
3981            !!!parse-error (type => 'unclosed md'); ## TODO: type
3982            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3983            !!!next-input-character;
3984            !!!emit ($self->{ct});
3985            redo A;
3986          } else {
3987            ## XML5: Switch to the "DOCTYPE bogus comment state".
3988            !!!parse-error (type => 'unquoted attr value'); ## TODO: type
3989            $self->{ca}->{value} = '';
3990            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
3991            ## Reconsume.
3992            redo A;
3993          }
3994        } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
3995          if ($is_space->{$self->{nc}}) {
3996            ## Stay in the state.
3997            !!!next-input-character;
3998            redo A;
3999          } elsif ($self->{nc} == 0x007C) { # |
4000            !!!parse-error (type => 'empty allowed token'); ## TODO: type
4001            ## Stay in the state.
4002            !!!next-input-character;
4003            redo A;
4004          } elsif ($self->{nc} == 0x0029) { # )
4005            !!!parse-error (type => 'empty allowed token'); ## TODO: type
4006            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4007            !!!next-input-character;
4008            redo A;
4009          } elsif ($self->{nc} == 0x003E) { # >
4010            !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4011            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4012            !!!next-input-character;
4013            !!!emit ($self->{ct}); # ATTLIST
4014            redo A;
4015          } elsif ($self->{nc} == -1) {
4016            ## XML5: No parse error.
4017            !!!parse-error (type => 'unclosed md'); ## TODO: type
4018            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4019            !!!next-input-character;
4020            !!!emit ($self->{ct});
4021            redo A;
4022          } else {
4023            push @{$self->{ca}->{tokens}}, chr $self->{nc};
4024            $self->{state} = ALLOWED_TOKEN_STATE;
4025            !!!next-input-character;
4026            redo A;
4027          }
4028        } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
4029          if ($is_space->{$self->{nc}}) {
4030            $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
4031            !!!next-input-character;
4032            redo A;
4033          } elsif ($self->{nc} == 0x007C) { # |
4034            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4035            !!!next-input-character;
4036            redo A;
4037          } elsif ($self->{nc} == 0x0029) { # )
4038            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4039            !!!next-input-character;
4040            redo A;
4041          } elsif ($self->{nc} == 0x003E) { # >
4042            !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4043            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4044            !!!next-input-character;
4045            !!!emit ($self->{ct}); # ATTLIST
4046            redo A;
4047          } elsif ($self->{nc} == -1) {
4048            ## XML5: No parse error.
4049            !!!parse-error (type => 'unclosed md'); ## TODO: type
4050            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4051            !!!next-input-character;
4052            !!!emit ($self->{ct});
4053            redo A;
4054          } else {
4055            $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
4056            ## Stay in the state.
4057            !!!next-input-character;
4058            redo A;
4059          }
4060        } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
4061          if ($is_space->{$self->{nc}}) {
4062            ## Stay in the state.
4063            !!!next-input-character;
4064            redo A;
4065          } elsif ($self->{nc} == 0x007C) { # |
4066            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
4067            !!!next-input-character;
4068            redo A;
4069          } elsif ($self->{nc} == 0x0029) { # )
4070            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
4071            !!!next-input-character;
4072            redo A;
4073          } elsif ($self->{nc} == 0x003E) { # >
4074            !!!parse-error (type => 'unclosed allowed tokens'); ## TODO: type
4075            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4076            !!!next-input-character;
4077            !!!emit ($self->{ct}); # ATTLIST
4078            redo A;
4079          } elsif ($self->{nc} == -1) {
4080            ## XML5: No parse error.
4081            !!!parse-error (type => 'unclosed md'); ## TODO: type
4082            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4083            !!!next-input-character;
4084            !!!emit ($self->{ct});
4085            redo A;
4086          } else {
4087            !!!parse-error (type => 'space in allowed token', ## TODO: type
4088                            line => $self->{line_prev},
4089                            column => $self->{column_prev});
4090            $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
4091            $self->{state} = ALLOWED_TOKEN_STATE;
4092            !!!next-input-character;
4093            redo A;
4094          }
4095        } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
4096          if ($is_space->{$self->{nc}}) {
4097            $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
4098            !!!next-input-character;
4099            redo A;
4100          } elsif ($self->{nc} == 0x0023) { # #
4101            !!!parse-error (type => 'no space before default value'); ## TODO: type
4102            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4103            !!!next-input-character;
4104            redo A;
4105          } elsif ($self->{nc} == 0x0022) { # "
4106            !!!parse-error (type => 'no space before default value'); ## TODO: type
4107            $self->{ca}->{value} = '';
4108            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4109            !!!next-input-character;
4110            redo A;
4111          } elsif ($self->{nc} == 0x0027) { # '
4112            !!!parse-error (type => 'no space before default value'); ## TODO: type
4113            $self->{ca}->{value} = '';
4114            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4115            !!!next-input-character;
4116            redo A;
4117          } elsif ($self->{nc} == 0x003E) { # >
4118            !!!parse-error (type => 'no attr default'); ## TODO: type
4119            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4120            !!!next-input-character;
4121            !!!emit ($self->{ct}); # ATTLIST
4122            redo A;
4123          } elsif ($self->{nc} == -1) {
4124            !!!parse-error (type => 'unclosed md'); ## TODO: type
4125            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4126            !!!next-input-character;
4127            !!!emit ($self->{ct});
4128            redo A;
4129          } else {
4130            !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4131            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4132            ## Reconsume.
4133            redo A;
4134          }
4135        } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
4136          if ($is_space->{$self->{nc}}) {
4137            ## Stay in the state.
4138            !!!next-input-character;
4139            redo A;
4140          } elsif ($self->{nc} == 0x0023) { # #
4141            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
4142            !!!next-input-character;
4143            redo A;
4144          } elsif ($self->{nc} == 0x0022) { # "
4145            $self->{ca}->{value} = '';
4146            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4147            !!!next-input-character;
4148            redo A;
4149          } elsif ($self->{nc} == 0x0027) { # '
4150            $self->{ca}->{value} = '';
4151            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4152            !!!next-input-character;
4153            redo A;
4154          } elsif ($self->{nc} == 0x003E) { # >
4155            !!!parse-error (type => 'no attr default'); ## TODO: type
4156            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4157            !!!next-input-character;
4158            !!!emit ($self->{ct}); # ATTLIST
4159            redo A;
4160          } elsif ($self->{nc} == -1) {
4161            !!!parse-error (type => 'unclosed md'); ## TODO: type
4162            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4163            !!!next-input-character;
4164            !!!emit ($self->{ct});
4165            redo A;
4166          } else {
4167            !!!parse-error (type => 'unquoted attr value'); ## TODO: type
4168            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4169            ## Reconsume.
4170            redo A;
4171          }
4172        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
4173          if ($is_space->{$self->{nc}}) {
4174            ## XML5: No parse error.
4175            !!!parse-error (type => 'no default type'); ## TODO: type
4176            $self->{state} = BOGUS_COMMENT_STATE;
4177            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
4178            ## Reconsume.
4179            redo A;
4180          } elsif ($self->{nc} == 0x0022) { # "
4181            ## XML5: Same as "anything else".
4182            $self->{ca}->{value} = '';
4183            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4184            !!!next-input-character;
4185            redo A;
4186          } elsif ($self->{nc} == 0x0027) { # '
4187            ## XML5: Same as "anything else".
4188            $self->{ca}->{value} = '';
4189            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4190            !!!next-input-character;
4191            redo A;
4192          } elsif ($self->{nc} == 0x003E) { # >
4193            ## XML5: Same as "anything else".
4194            !!!parse-error (type => 'no attr default'); ## TODO: type
4195            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4196            !!!next-input-character;
4197            !!!emit ($self->{ct}); # ATTLIST
4198            redo A;
4199          } elsif ($self->{nc} == -1) {
4200            ## XML5: No parse error.
4201            !!!parse-error (type => 'unclosed md'); ## TODO: type
4202            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4203            !!!next-input-character;
4204            !!!emit ($self->{ct});
4205            redo A;
4206          } else {
4207            $self->{ca}->{default} = chr $self->{nc};
4208            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
4209            !!!next-input-character;
4210            redo A;
4211          }
4212        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
4213          if ($is_space->{$self->{nc}}) {
4214            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
4215            !!!next-input-character;
4216            redo A;
4217          } elsif ($self->{nc} == 0x0022) { # "
4218            ## XML5: Same as "anything else".
4219            !!!parse-error (type => 'no space before default value'); ## TODO: type
4220            $self->{ca}->{value} = '';
4221            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4222            !!!next-input-character;
4223            redo A;
4224          } elsif ($self->{nc} == 0x0027) { # '
4225            ## XML5: Same as "anything else".
4226            !!!parse-error (type => 'no space before default value'); ## TODO: type
4227            $self->{ca}->{value} = '';
4228            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4229            !!!next-input-character;
4230            redo A;
4231          } elsif ($self->{nc} == 0x003E) { # >
4232            ## XML5: Same as "anything else".
4233            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4234            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4235            !!!next-input-character;
4236            !!!emit ($self->{ct}); # ATTLIST
4237            redo A;
4238          } elsif ($self->{nc} == -1) {
4239            ## XML5: No parse error.
4240            !!!parse-error (type => 'unclosed md'); ## TODO: type
4241            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4242            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4243            !!!next-input-character;
4244            !!!emit ($self->{ct});
4245            redo A;
4246          } else {
4247            $self->{ca}->{default} .= chr $self->{nc};
4248            ## Stay in the state.
4249            !!!next-input-character;
4250            redo A;
4251          }
4252        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
4253          if ($is_space->{$self->{nc}}) {
4254            ## Stay in the state.
4255            !!!next-input-character;
4256            redo A;
4257          } elsif ($self->{nc} == 0x0022) { # "
4258            $self->{ca}->{value} = '';
4259            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
4260            !!!next-input-character;
4261            redo A;
4262          } elsif ($self->{nc} == 0x0027) { # '
4263            $self->{ca}->{value} = '';
4264            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
4265            !!!next-input-character;
4266            redo A;
4267          } elsif ($self->{nc} == 0x003E) { # >
4268            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4269            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4270            !!!next-input-character;
4271            !!!emit ($self->{ct}); # ATTLIST
4272            redo A;
4273          } elsif ($self->{nc} == -1) {
4274            ## XML5: No parse error.
4275            !!!parse-error (type => 'unclosed md'); ## TODO: type
4276            push @{$self->{ct}->{attrdefs}}, $self->{ca};
4277            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
4278            !!!next-input-character;
4279            !!!emit ($self->{ct});
4280            redo A;
4281          } else {
4282            ## XML5: Not defined yet.
4283            if ($self->{ca}->{default} eq 'FIXED') {
4284              $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
4285            } else {
4286              push @{$self->{ct}->{attrdefs}}, $self->{ca};
4287              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4288            }
4289            ## Reconsume.
4290            redo A;
4291          }
4292        } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
4293          if ($is_space->{$self->{nc}} or
4294              $self->{nc} == -1 or
4295              $self->{nc} == 0x003E) { # >
4296            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4297            ## Reconsume.
4298            redo A;
4299          } else {
4300            !!!parse-error (type => 'no space before attr name'); ## TODO: type
4301            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
4302            ## Reconsume.
4303            redo A;
4304          }      
4305      } else {      } else {
4306        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
4307      }      }
# Line 2635  sub _get_next_token ($) { Line 4312  sub _get_next_token ($) {
4312    
4313  1;  1;
4314  ## $Date$  ## $Date$
4315                                    

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.15

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24