/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3 by wakaba, Tue Oct 14 05:34:05 2008 UTC revision 1.14 by wakaba, Fri Oct 17 07:14:29 2008 UTC
# Line 15  BEGIN { Line 15  BEGIN {
15      CHARACTER_TOKEN      CHARACTER_TOKEN
16      PI_TOKEN      PI_TOKEN
17      ABORT_TOKEN      ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19        ATTLIST_TOKEN
20        ELEMENT_TOKEN
21        GENERAL_ENTITY_TOKEN
22        PARAMETER_ENTITY_TOKEN
23        NOTATION_TOKEN
24    );    );
25        
26    our %EXPORT_TAGS = (    our %EXPORT_TAGS = (
# Line 27  BEGIN { Line 33  BEGIN {
33        CHARACTER_TOKEN        CHARACTER_TOKEN
34        PI_TOKEN        PI_TOKEN
35        ABORT_TOKEN        ABORT_TOKEN
36          END_OF_DOCTYPE_TOKEN
37          ATTLIST_TOKEN
38          ELEMENT_TOKEN
39          GENERAL_ENTITY_TOKEN
40          PARAMETER_ENTITY_TOKEN
41          NOTATION_TOKEN
42      )],      )],
43    );    );
44  }  }
45    
46    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48  ## Token types  ## Token types
49    
50  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
52  sub START_TAG_TOKEN () { 3 }  sub START_TAG_TOKEN () { 3 }
53  sub END_TAG_TOKEN () { 4 }  sub END_TAG_TOKEN () { 4 }
54  sub END_OF_FILE_TOKEN () { 5 }  sub END_OF_FILE_TOKEN () { 5 }
55  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
56  sub PI_TOKEN () { 7 } # XML5  sub PI_TOKEN () { 7 } ## NOTE: XML only.
57  sub ABORT_TOKEN () { 8 } # Not a token actually  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59    sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60    sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61    sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62    sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63    sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64    
65    ## XML5: XML5 has "empty tag token".  In this implementation, it is
66    ## represented as a start tag token with $self->{self_closing} flag
67    ## set to true.
68    
69    ## XML5: XML5 has "short end tag token".  In this implementation, it
70    ## is represented as an end tag token with $token->{tag_name} set
71    ## to an empty string.
72    
73  package Whatpm::HTML;  package Whatpm::HTML;
74    
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 142  sub HEXREF_HEX_STATE () { 48 }
142  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
143  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145    ## XML-only states
146    sub PI_STATE () { 51 }
147    sub PI_TARGET_STATE () { 52 }
148    sub PI_TARGET_AFTER_STATE () { 53 }
149    sub PI_DATA_STATE () { 54 }
150    sub PI_AFTER_STATE () { 55 }
151    sub PI_DATA_AFTER_STATE () { 56 }
152    sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155    sub DOCTYPE_TAG_STATE () { 60 }
156    sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157    sub MD_ATTLIST_STATE () { 62 }
158    sub MD_E_STATE () { 63 }
159    sub MD_ELEMENT_STATE () { 64 }
160    sub MD_ENTITY_STATE () { 65 }
161    sub MD_NOTATION_STATE () { 66 }
162    sub DOCTYPE_MD_STATE () { 67 }
163    sub BEFORE_MD_NAME_STATE () { 68 }
164    sub MD_NAME_STATE () { 69 }
165    sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166    sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167    
168  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
169  ## list and descriptions)  ## list and descriptions)
170    
# Line 178  sub _initialize_tokenizer ($) { Line 229  sub _initialize_tokenizer ($) {
229    #$self->{is_xml} (if XML)    #$self->{is_xml} (if XML)
230    
231    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
232    #$self->{s_kwd}; # state keyword - initialized when used    $self->{s_kwd} = ''; # Data state keyword
233      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
234    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
235    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
236    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 198  sub _initialize_tokenizer ($) { Line 250  sub _initialize_tokenizer ($) {
250    
251  ## A token has:  ## A token has:
252  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
253  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
254  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
255  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
256    ##   ->{target} (PI_TOKEN)
257  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
258  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
259  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 208  sub _initialize_tokenizer ($) { Line 261  sub _initialize_tokenizer ($) {
261  ##        ->{name}  ##        ->{name}
262  ##        ->{value}  ##        ->{value}
263  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
264  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
265    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
266    ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
267    ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
268    ##   ->{has_internal_subset} == 1 or 0 (DOCTYPE_TOKEN)
269    
270  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
271  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
272  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 228  my $is_space = { Line 286  my $is_space = {
286    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
287    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
288    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
289    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
290    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
291    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
292  };  };
# Line 312  sub _get_next_token ($) { Line 370  sub _get_next_token ($) {
370          }          }
371        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
372          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
373            $self->{s_kwd} .= '-';            if ($self->{s_kwd} eq '<!-') {
             
           if ($self->{s_kwd} eq '<!--') {  
374              !!!cp (3);              !!!cp (3);
375              $self->{escape} = 1; # unless $self->{escape};              $self->{escape} = 1; # unless $self->{escape};
376              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
377              #              #
378            } elsif ($self->{s_kwd} eq '---') {            } elsif ($self->{s_kwd} eq '-') {
379              !!!cp (4);              !!!cp (4);
380              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
381              #              #
382              } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
383                !!!cp (4.1);
384                $self->{s_kwd} .= '-';
385                #
386            } else {            } else {
387              !!!cp (5);              !!!cp (5);
388                $self->{s_kwd} = '-';
389              #              #
390            }            }
391          }          }
# Line 360  sub _get_next_token ($) { Line 421  sub _get_next_token ($) {
421            if ($self->{s_kwd} eq '--') {            if ($self->{s_kwd} eq '--') {
422              !!!cp (8);              !!!cp (8);
423              delete $self->{escape};              delete $self->{escape};
424                #
425            } else {            } else {
426              !!!cp (9);              !!!cp (9);
427                #
428            }            }
429            } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
430              !!!cp (9.1);
431              !!!parse-error (type => 'unmatched mse', ## TODO: type
432                              line => $self->{line_prev},
433                              column => $self->{column_prev} - 1);
434              #
435          } else {          } else {
436            !!!cp (10);            !!!cp (10);
437              #
438          }          }
439                    
440          $self->{s_kwd} = '';          $self->{s_kwd} = '';
441          #          #
442          } elsif ($self->{nc} == 0x005D) { # ]
443            if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
444              !!!cp (10.1);
445              $self->{s_kwd} .= ']';
446            } elsif ($self->{s_kwd} eq ']]') {
447              !!!cp (10.2);
448              #
449            } else {
450              !!!cp (10.3);
451              $self->{s_kwd} = '';
452            }
453            #
454        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
455          !!!cp (11);          !!!cp (11);
456          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 386  sub _get_next_token ($) { Line 468  sub _get_next_token ($) {
468                     data => chr $self->{nc},                     data => chr $self->{nc},
469                     line => $self->{line}, column => $self->{column},                     line => $self->{line}, column => $self->{column},
470                    };                    };
471        if ($self->{read_until}->($token->{data}, q[-!<>&],        if ($self->{read_until}->($token->{data}, q{-!<>&\]},
472                                  length $token->{data})) {                                  length $token->{data})) {
473          $self->{s_kwd} = '';          $self->{s_kwd} = '';
474        }        }
475    
476        ## Stay in the data state.        ## Stay in the data state.
477        if ($self->{content_model} == PCDATA_CONTENT_MODEL) {        if (not $self->{is_xml} and
478              $self->{content_model} == PCDATA_CONTENT_MODEL) {
479          !!!cp (13);          !!!cp (13);
480          $self->{state} = PCDATA_STATE;          $self->{state} = PCDATA_STATE;
481        } else {        } else {
# Line 403  sub _get_next_token ($) { Line 486  sub _get_next_token ($) {
486        !!!emit ($token);        !!!emit ($token);
487        redo A;        redo A;
488      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
489          ## XML5: "tag state".
490    
491        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
492          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
493            !!!cp (15);            !!!cp (15);
# Line 411  sub _get_next_token ($) { Line 496  sub _get_next_token ($) {
496            redo A;            redo A;
497          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
498            !!!cp (15.1);            !!!cp (15.1);
499            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
500            #            #
501          } else {          } else {
502            !!!cp (16);            !!!cp (16);
503              $self->{s_kwd} = '';
504            #            #
505          }          }
506    
# Line 441  sub _get_next_token ($) { Line 527  sub _get_next_token ($) {
527            !!!cp (19);            !!!cp (19);
528            $self->{ct}            $self->{ct}
529              = {type => START_TAG_TOKEN,              = {type => START_TAG_TOKEN,
530                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
531                 line => $self->{line_prev},                 line => $self->{line_prev},
532                 column => $self->{column_prev}};                 column => $self->{column_prev}};
533            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
# Line 463  sub _get_next_token ($) { Line 549  sub _get_next_token ($) {
549                            line => $self->{line_prev},                            line => $self->{line_prev},
550                            column => $self->{column_prev});                            column => $self->{column_prev});
551            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
552              $self->{s_kwd} = '';
553            !!!next-input-character;            !!!next-input-character;
554    
555            !!!emit ({type => CHARACTER_TOKEN, data => '<>',            !!!emit ({type => CHARACTER_TOKEN, data => '<>',
# Line 472  sub _get_next_token ($) { Line 559  sub _get_next_token ($) {
559    
560            redo A;            redo A;
561          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
562            !!!cp (22);            if ($self->{is_xml}) {
563            !!!parse-error (type => 'pio',              !!!cp (22.1);
564                            line => $self->{line_prev},              $self->{state} = PI_STATE;
565                            column => $self->{column_prev});              !!!next-input-character;
566            $self->{state} = BOGUS_COMMENT_STATE;              redo A;
567            $self->{ct} = {type => COMMENT_TOKEN, data => '',            } else {
568                                      line => $self->{line_prev},              !!!cp (22);
569                                      column => $self->{column_prev},              !!!parse-error (type => 'pio',
570                                     };                              line => $self->{line_prev},
571            ## $self->{nc} is intentionally left as is                              column => $self->{column_prev});
572            redo A;              $self->{state} = BOGUS_COMMENT_STATE;
573          } else {              $self->{ct} = {type => COMMENT_TOKEN, data => '',
574                               line => $self->{line_prev},
575                               column => $self->{column_prev},
576                              };
577                ## $self->{nc} is intentionally left as is
578                redo A;
579              }
580            } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
581            !!!cp (23);            !!!cp (23);
582            !!!parse-error (type => 'bare stago',            !!!parse-error (type => 'bare stago',
583                            line => $self->{line_prev},                            line => $self->{line_prev},
584                            column => $self->{column_prev});                            column => $self->{column_prev});
585            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
586              $self->{s_kwd} = '';
587            ## reconsume            ## reconsume
588    
589            !!!emit ({type => CHARACTER_TOKEN, data => '<',            !!!emit ({type => CHARACTER_TOKEN, data => '<',
# Line 497  sub _get_next_token ($) { Line 592  sub _get_next_token ($) {
592                     });                     });
593    
594            redo A;            redo A;
595            } else {
596              ## XML5: "<:" is a parse error.
597              !!!cp (23.1);
598              $self->{ct} = {type => START_TAG_TOKEN,
599                                        tag_name => chr ($self->{nc}),
600                                        line => $self->{line_prev},
601                                        column => $self->{column_prev}};
602              $self->{state} = TAG_NAME_STATE;
603              !!!next-input-character;
604              redo A;
605          }          }
606        } else {        } else {
607          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
# Line 505  sub _get_next_token ($) { Line 610  sub _get_next_token ($) {
610        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
611        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
612    
613          ## XML5: "end tag state".
614    
615        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
616        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
617          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
618            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
619            $self->{s_kwd} = '';            $self->{kwd} = '';
620            ## Reconsume.            ## Reconsume.
621            redo A;            redo A;
622          } else {          } else {
# Line 517  sub _get_next_token ($) { Line 624  sub _get_next_token ($) {
624            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
625            !!!cp (28);            !!!cp (28);
626            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
627              $self->{s_kwd} = '';
628            ## Reconsume.            ## Reconsume.
629            !!!emit ({type => CHARACTER_TOKEN, data => '</',            !!!emit ({type => CHARACTER_TOKEN, data => '</',
630                      line => $l, column => $c,                      line => $l, column => $c,
# Line 530  sub _get_next_token ($) { Line 638  sub _get_next_token ($) {
638          !!!cp (29);          !!!cp (29);
639          $self->{ct}          $self->{ct}
640              = {type => END_TAG_TOKEN,              = {type => END_TAG_TOKEN,
641                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
642                 line => $l, column => $c};                 line => $l, column => $c};
643          $self->{state} = TAG_NAME_STATE;          $self->{state} = TAG_NAME_STATE;
644          !!!next-input-character;          !!!next-input-character;
# Line 545  sub _get_next_token ($) { Line 653  sub _get_next_token ($) {
653          !!!next-input-character;          !!!next-input-character;
654          redo A;          redo A;
655        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (31);  
656          !!!parse-error (type => 'empty end tag',          !!!parse-error (type => 'empty end tag',
657                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
658                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
659          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
660          !!!next-input-character;          $self->{s_kwd} = '';
661            if ($self->{is_xml}) {
662              !!!cp (31);
663              ## XML5: No parse error.
664              
665              ## NOTE: This parser raises a parse error, since it supports
666              ## XML1, not XML5.
667    
668              ## NOTE: A short end tag token.
669              my $ct = {type => END_TAG_TOKEN,
670                        tag_name => '',
671                        line => $self->{line_prev},
672                        column => $self->{column_prev} - 1,
673                       };
674              !!!next-input-character;
675              !!!emit ($ct);
676            } else {
677              !!!cp (31.1);
678              !!!next-input-character;
679            }
680          redo A;          redo A;
681        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
682          !!!cp (32);          !!!cp (32);
683          !!!parse-error (type => 'bare etago');          !!!parse-error (type => 'bare etago');
684            $self->{s_kwd} = '';
685          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
686          # reconsume          # reconsume
687    
# Line 563  sub _get_next_token ($) { Line 690  sub _get_next_token ($) {
690                   });                   });
691    
692          redo A;          redo A;
693        } else {        } elsif (not $self->{is_xml} or
694                   $is_space->{$self->{nc}}) {
695          !!!cp (33);          !!!cp (33);
696          !!!parse-error (type => 'bogus end tag');          !!!parse-error (type => 'bogus end tag',
697                            line => $self->{line_prev}, # "<" of "</"
698                            column => $self->{column_prev} - 1);
699          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
700          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
701                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 578  sub _get_next_token ($) { Line 708  sub _get_next_token ($) {
708          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
709          ## "bogus comment state" entry.          ## "bogus comment state" entry.
710          redo A;          redo A;
711          } else {
712            ## XML5: "</:" is a parse error.
713            !!!cp (30.1);
714            $self->{ct} = {type => END_TAG_TOKEN,
715                           tag_name => chr ($self->{nc}),
716                           line => $l, column => $c};
717            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
718            !!!next-input-character;
719            redo A;
720        }        }
721      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
722        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
723        if (length $ch) {        if (length $ch) {
724          my $CH = $ch;          my $CH = $ch;
725          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 588  sub _get_next_token ($) { Line 727  sub _get_next_token ($) {
727          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
728            !!!cp (24);            !!!cp (24);
729            ## Stay in the state.            ## Stay in the state.
730            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
731            !!!next-input-character;            !!!next-input-character;
732            redo A;            redo A;
733          } else {          } else {
734            !!!cp (25);            !!!cp (25);
735            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
736              $self->{s_kwd} = '';
737            ## Reconsume.            ## Reconsume.
738            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
739                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
740                      line => $self->{line_prev},                      line => $self->{line_prev},
741                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
742                     });                     });
743            redo A;            redo A;
744          }          }
# Line 612  sub _get_next_token ($) { Line 752  sub _get_next_token ($) {
752            !!!cp (26);            !!!cp (26);
753            ## Reconsume.            ## Reconsume.
754            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
755              $self->{s_kwd} = '';
756            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
757                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
758                      line => $self->{line_prev},                      line => $self->{line_prev},
759                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
760                     });                     });
761            redo A;            redo A;
762          } else {          } else {
# Line 624  sub _get_next_token ($) { Line 765  sub _get_next_token ($) {
765                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
766                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
767                   line => $self->{line_prev},                   line => $self->{line_prev},
768                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
769            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
770            ## Reconsume.            ## Reconsume.
771            redo A;            redo A;
# Line 653  sub _get_next_token ($) { Line 794  sub _get_next_token ($) {
794            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
795          }          }
796          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
797            $self->{s_kwd} = '';
798          !!!next-input-character;          !!!next-input-character;
799    
800          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 661  sub _get_next_token ($) { Line 803  sub _get_next_token ($) {
803        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
804                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
805          !!!cp (38);          !!!cp (38);
806          $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);          $self->{ct}->{tag_name}
807                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
808            # start tag or end tag            # start tag or end tag
809          ## Stay in this state          ## Stay in this state
810          !!!next-input-character;          !!!next-input-character;
# Line 684  sub _get_next_token ($) { Line 827  sub _get_next_token ($) {
827            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
828          }          }
829          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
830            $self->{s_kwd} = '';
831          # reconsume          # reconsume
832    
833          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 703  sub _get_next_token ($) { Line 847  sub _get_next_token ($) {
847          redo A;          redo A;
848        }        }
849      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
850          ## XML5: "Tag attribute name before state".
851    
852        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
853          !!!cp (45);          !!!cp (45);
854          ## Stay in the state          ## Stay in the state
# Line 724  sub _get_next_token ($) { Line 870  sub _get_next_token ($) {
870            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
871          }          }
872          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
873            $self->{s_kwd} = '';
874          !!!next-input-character;          !!!next-input-character;
875    
876          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 733  sub _get_next_token ($) { Line 880  sub _get_next_token ($) {
880                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
881          !!!cp (49);          !!!cp (49);
882          $self->{ca}          $self->{ca}
883              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
884                 value => '',                 value => '',
885                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
886          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 761  sub _get_next_token ($) { Line 908  sub _get_next_token ($) {
908            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
909          }          }
910          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
911            $self->{s_kwd} = '';
912          # reconsume          # reconsume
913    
914          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 773  sub _get_next_token ($) { Line 921  sub _get_next_token ($) {
921               0x003D => 1, # =               0x003D => 1, # =
922              }->{$self->{nc}}) {              }->{$self->{nc}}) {
923            !!!cp (55);            !!!cp (55);
924              ## XML5: Not a parse error.
925            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
926          } else {          } else {
927            !!!cp (56);            !!!cp (56);
928              ## XML5: ":" raises a parse error and is ignored.
929          }          }
930          $self->{ca}          $self->{ca}
931              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 786  sub _get_next_token ($) { Line 936  sub _get_next_token ($) {
936          redo A;          redo A;
937        }        }
938      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
939          ## XML5: "Tag attribute name state".
940    
941        my $before_leave = sub {        my $before_leave = sub {
942          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
943              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 796  sub _get_next_token ($) { Line 948  sub _get_next_token ($) {
948            !!!cp (58);            !!!cp (58);
949            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
950              = $self->{ca};              = $self->{ca};
951              $self->{ca}->{index} = ++$self->{ct}->{last_index};
952          }          }
953        }; # $before_leave        }; # $before_leave
954    
# Line 812  sub _get_next_token ($) { Line 965  sub _get_next_token ($) {
965          !!!next-input-character;          !!!next-input-character;
966          redo A;          redo A;
967        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
968            if ($self->{is_xml}) {
969              !!!cp (60.1);
970              ## XML5: Not a parse error.
971              !!!parse-error (type => 'no attr value'); ## TODO: type
972            } else {
973              !!!cp (60.2);
974            }
975    
976          $before_leave->();          $before_leave->();
977          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
978            !!!cp (61);            !!!cp (61);
# Line 826  sub _get_next_token ($) { Line 987  sub _get_next_token ($) {
987            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
988          }          }
989          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
990            $self->{s_kwd} = '';
991          !!!next-input-character;          !!!next-input-character;
992    
993          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 834  sub _get_next_token ($) { Line 996  sub _get_next_token ($) {
996        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
997                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
998          !!!cp (63);          !!!cp (63);
999          $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);          $self->{ca}->{name}
1000                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1001          ## Stay in the state          ## Stay in the state
1002          !!!next-input-character;          !!!next-input-character;
1003          redo A;          redo A;
1004        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1005          !!!cp (64);          if ($self->{is_xml}) {
1006              !!!cp (64);
1007              ## XML5: Not a parse error.
1008              !!!parse-error (type => 'no attr value'); ## TODO: type
1009            } else {
1010              !!!cp (64.1);
1011            }
1012            
1013          $before_leave->();          $before_leave->();
1014          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1015          !!!next-input-character;          !!!next-input-character;
# Line 863  sub _get_next_token ($) { Line 1033  sub _get_next_token ($) {
1033            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1034          }          }
1035          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1036            $self->{s_kwd} = '';
1037          # reconsume          # reconsume
1038    
1039          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 872  sub _get_next_token ($) { Line 1043  sub _get_next_token ($) {
1043          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1044              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1045            !!!cp (69);            !!!cp (69);
1046              ## XML5: Not a parse error.
1047            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
1048          } else {          } else {
1049            !!!cp (70);            !!!cp (70);
# Line 882  sub _get_next_token ($) { Line 1054  sub _get_next_token ($) {
1054          redo A;          redo A;
1055        }        }
1056      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1057          ## XML5: "Tag attribute name after state".
1058          
1059        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1060          !!!cp (71);          !!!cp (71);
1061          ## Stay in the state          ## Stay in the state
# Line 893  sub _get_next_token ($) { Line 1067  sub _get_next_token ($) {
1067          !!!next-input-character;          !!!next-input-character;
1068          redo A;          redo A;
1069        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1070            if ($self->{is_xml}) {
1071              !!!cp (72.1);
1072              ## XML5: Not a parse error.
1073              !!!parse-error (type => 'no attr value'); ## TODO: type
1074            } else {
1075              !!!cp (72.2);
1076            }
1077    
1078          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1079            !!!cp (73);            !!!cp (73);
1080            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 909  sub _get_next_token ($) { Line 1091  sub _get_next_token ($) {
1091            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1092          }          }
1093          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1094            $self->{s_kwd} = '';
1095          !!!next-input-character;          !!!next-input-character;
1096    
1097          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 918  sub _get_next_token ($) { Line 1101  sub _get_next_token ($) {
1101                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1102          !!!cp (76);          !!!cp (76);
1103          $self->{ca}          $self->{ca}
1104              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1105                 value => '',                 value => '',
1106                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
1107          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
1108          !!!next-input-character;          !!!next-input-character;
1109          redo A;          redo A;
1110        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1111          !!!cp (77);          if ($self->{is_xml}) {
1112              !!!cp (77);
1113              ## XML5: Not a parse error.
1114              !!!parse-error (type => 'no attr value'); ## TODO: type
1115            } else {
1116              !!!cp (77.1);
1117            }
1118            
1119          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1120          !!!next-input-character;          !!!next-input-character;
1121          redo A;          redo A;
# Line 946  sub _get_next_token ($) { Line 1136  sub _get_next_token ($) {
1136          } else {          } else {
1137            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1138          }          }
1139            $self->{s_kwd} = '';
1140          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1141          # reconsume          # reconsume
1142    
# Line 953  sub _get_next_token ($) { Line 1144  sub _get_next_token ($) {
1144    
1145          redo A;          redo A;
1146        } else {        } else {
1147            if ($self->{is_xml}) {
1148              !!!cp (78.1);
1149              ## XML5: Not a parse error.
1150              !!!parse-error (type => 'no attr value'); ## TODO: type
1151            } else {
1152              !!!cp (78.2);
1153            }
1154    
1155          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1156              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1157            !!!cp (78);            !!!cp (78);
1158              ## XML5: Not a parse error.
1159            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
1160          } else {          } else {
1161            !!!cp (82);            !!!cp (82);
# Line 969  sub _get_next_token ($) { Line 1169  sub _get_next_token ($) {
1169          redo A;                  redo A;        
1170        }        }
1171      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1172          ## XML5: "Tag attribute value before state".
1173    
1174        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1175          !!!cp (83);          !!!cp (83);
1176          ## Stay in the state          ## Stay in the state
# Line 1007  sub _get_next_token ($) { Line 1209  sub _get_next_token ($) {
1209            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1210          }          }
1211          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1212            $self->{s_kwd} = '';
1213          !!!next-input-character;          !!!next-input-character;
1214    
1215          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1030  sub _get_next_token ($) { Line 1233  sub _get_next_token ($) {
1233            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1234          }          }
1235          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1236            $self->{s_kwd} = '';
1237          ## reconsume          ## reconsume
1238    
1239          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1038  sub _get_next_token ($) { Line 1242  sub _get_next_token ($) {
1242        } else {        } else {
1243          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D) { # =
1244            !!!cp (93);            !!!cp (93);
1245              ## XML5: Not a parse error.
1246            !!!parse-error (type => 'bad attribute value');            !!!parse-error (type => 'bad attribute value');
1247            } elsif ($self->{is_xml}) {
1248              !!!cp (93.1);
1249              ## XML5: No parse error.
1250              !!!parse-error (type => 'unquoted attr value'); ## TODO
1251          } else {          } else {
1252            !!!cp (94);            !!!cp (94);
1253          }          }
# Line 1048  sub _get_next_token ($) { Line 1257  sub _get_next_token ($) {
1257          redo A;          redo A;
1258        }        }
1259      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1260          ## XML5: "Tag attribute value double quoted state".
1261          
1262        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1263          !!!cp (95);          !!!cp (95);
1264            ## XML5: "Tag attribute name before state".
1265          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1266          !!!next-input-character;          !!!next-input-character;
1267          redo A;          redo A;
1268        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1269          !!!cp (96);          !!!cp (96);
1270            ## XML5: Not defined yet.
1271    
1272          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1273          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1274          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1082  sub _get_next_token ($) { Line 1296  sub _get_next_token ($) {
1296            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1297          }          }
1298          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1299            $self->{s_kwd} = '';
1300          ## reconsume          ## reconsume
1301    
1302          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1303    
1304          redo A;          redo A;
1305        } else {        } else {
1306          !!!cp (100);          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1307              !!!cp (100);
1308              ## XML5: Not a parse error.
1309              !!!parse-error (type => 'lt in attr value'); ## TODO: type
1310            } else {
1311              !!!cp (100.1);
1312            }
1313          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1314          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1315                                q["&],                                q["&<],
1316                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1317    
1318          ## Stay in the state          ## Stay in the state
# Line 1099  sub _get_next_token ($) { Line 1320  sub _get_next_token ($) {
1320          redo A;          redo A;
1321        }        }
1322      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1323          ## XML5: "Tag attribute value single quoted state".
1324    
1325        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1326          !!!cp (101);          !!!cp (101);
1327            ## XML5: "Before attribute name state" (sic).
1328          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1329          !!!next-input-character;          !!!next-input-character;
1330          redo A;          redo A;
1331        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1332          !!!cp (102);          !!!cp (102);
1333            ## XML5: Not defined yet.
1334    
1335          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1336          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1337          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1133  sub _get_next_token ($) { Line 1359  sub _get_next_token ($) {
1359            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1360          }          }
1361          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1362            $self->{s_kwd} = '';
1363          ## reconsume          ## reconsume
1364    
1365          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1366    
1367          redo A;          redo A;
1368        } else {        } else {
1369          !!!cp (106);          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1370              !!!cp (106);
1371              ## XML5: Not a parse error.
1372              !!!parse-error (type => 'lt in attr value'); ## TODO: type
1373            } else {
1374              !!!cp (106.1);
1375            }
1376          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1377          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1378                                q['&],                                q['&<],
1379                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1380    
1381          ## Stay in the state          ## Stay in the state
# Line 1150  sub _get_next_token ($) { Line 1383  sub _get_next_token ($) {
1383          redo A;          redo A;
1384        }        }
1385      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1386          ## XML5: "Tag attribute value unquoted state".
1387    
1388        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1389          !!!cp (107);          !!!cp (107);
1390            ## XML5: "Tag attribute name before state".
1391          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1392          !!!next-input-character;          !!!next-input-character;
1393          redo A;          redo A;
1394        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1395          !!!cp (108);          !!!cp (108);
1396    
1397            ## XML5: Not defined yet.
1398    
1399          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1400          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1401          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1183  sub _get_next_token ($) { Line 1422  sub _get_next_token ($) {
1422            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1423          }          }
1424          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1425            $self->{s_kwd} = '';
1426          !!!next-input-character;          !!!next-input-character;
1427    
1428          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1206  sub _get_next_token ($) { Line 1446  sub _get_next_token ($) {
1446            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1447          }          }
1448          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1449            $self->{s_kwd} = '';
1450          ## reconsume          ## reconsume
1451    
1452          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1218  sub _get_next_token ($) { Line 1459  sub _get_next_token ($) {
1459               0x003D => 1, # =               0x003D => 1, # =
1460              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1461            !!!cp (115);            !!!cp (115);
1462              ## XML5: Not a parse error.
1463            !!!parse-error (type => 'bad attribute value');            !!!parse-error (type => 'bad attribute value');
1464          } else {          } else {
1465            !!!cp (116);            !!!cp (116);
# Line 1254  sub _get_next_token ($) { Line 1496  sub _get_next_token ($) {
1496            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1497          }          }
1498          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1499            $self->{s_kwd} = '';
1500          !!!next-input-character;          !!!next-input-character;
1501    
1502          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1281  sub _get_next_token ($) { Line 1524  sub _get_next_token ($) {
1524            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1525          }          }
1526          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1527            $self->{s_kwd} = '';
1528          ## Reconsume.          ## Reconsume.
1529          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1530          redo A;          redo A;
# Line 1292  sub _get_next_token ($) { Line 1536  sub _get_next_token ($) {
1536          redo A;          redo A;
1537        }        }
1538      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1539          ## XML5: "Empty tag state".
1540    
1541        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1542          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
1543            !!!cp ('124.2');            !!!cp ('124.2');
# Line 1311  sub _get_next_token ($) { Line 1557  sub _get_next_token ($) {
1557          }          }
1558    
1559          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1560            $self->{s_kwd} = '';
1561          !!!next-input-character;          !!!next-input-character;
1562    
1563          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1332  sub _get_next_token ($) { Line 1579  sub _get_next_token ($) {
1579          } else {          } else {
1580            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1581          }          }
1582            ## XML5: "Tag attribute name before state".
1583          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1584            $self->{s_kwd} = '';
1585          ## Reconsume.          ## Reconsume.
1586          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1587          redo A;          redo A;
# Line 1345  sub _get_next_token ($) { Line 1594  sub _get_next_token ($) {
1594          redo A;          redo A;
1595        }        }
1596      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1597        ## (only happen if PCDATA state)        ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1598    
1599        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
1600        ## consumes characters one-by-one basis.        ## consumes characters one-by-one basis.
1601                
1602        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1603          !!!cp (124);          if ($self->{in_subset}) {
1604          $self->{state} = DATA_STATE;            !!!cp (123);
1605              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1606            } else {
1607              !!!cp (124);
1608              $self->{state} = DATA_STATE;
1609              $self->{s_kwd} = '';
1610            }
1611          !!!next-input-character;          !!!next-input-character;
1612    
1613          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1614          redo A;          redo A;
1615        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1616          !!!cp (125);          if ($self->{in_subset}) {
1617          $self->{state} = DATA_STATE;            !!!cp (125.1);
1618              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1619            } else {
1620              !!!cp (125);
1621              $self->{state} = DATA_STATE;
1622              $self->{s_kwd} = '';
1623            }
1624          ## reconsume          ## reconsume
1625    
1626          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1376  sub _get_next_token ($) { Line 1637  sub _get_next_token ($) {
1637          redo A;          redo A;
1638        }        }
1639      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1640        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state".
1641                
1642        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1643          !!!cp (133);          !!!cp (133);
# Line 1388  sub _get_next_token ($) { Line 1649  sub _get_next_token ($) {
1649          ## ASCII case-insensitive.          ## ASCII case-insensitive.
1650          !!!cp (130);          !!!cp (130);
1651          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
1652          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
1653          !!!next-input-character;          !!!next-input-character;
1654          redo A;          redo A;
1655        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
# Line 1397  sub _get_next_token ($) { Line 1658  sub _get_next_token ($) {
1658                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1659          !!!cp (135.4);                          !!!cp (135.4);                
1660          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
1661          $self->{s_kwd} = '[';          $self->{kwd} = '[';
1662          !!!next-input-character;          !!!next-input-character;
1663          redo A;          redo A;
1664        } else {        } else {
# Line 1421  sub _get_next_token ($) { Line 1682  sub _get_next_token ($) {
1682                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1683                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
1684                                   };                                   };
1685          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1686          !!!next-input-character;          !!!next-input-character;
1687          redo A;          redo A;
1688        } else {        } else {
# Line 1447  sub _get_next_token ($) { Line 1708  sub _get_next_token ($) {
1708              0x0054, # T              0x0054, # T
1709              0x0059, # Y              0x0059, # Y
1710              0x0050, # P              0x0050, # P
1711            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
1712            $self->{nc} == [            $self->{nc} == [
1713              undef,              undef,
1714              0x006F, # o              0x006F, # o
# Line 1455  sub _get_next_token ($) { Line 1716  sub _get_next_token ($) {
1716              0x0074, # t              0x0074, # t
1717              0x0079, # y              0x0079, # y
1718              0x0070, # p              0x0070, # p
1719            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
1720          !!!cp (131);          !!!cp (131);
1721          ## Stay in the state.          ## Stay in the state.
1722          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
1723          !!!next-input-character;          !!!next-input-character;
1724          redo A;          redo A;
1725        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
1726                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
1727                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
1728          !!!cp (129);          if ($self->{is_xml} and
1729                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1730              !!!cp (129);
1731              ## XML5: case-sensitive.
1732              !!!parse-error (type => 'lowercase keyword', ## TODO
1733                              text => 'DOCTYPE',
1734                              line => $self->{line_prev},
1735                              column => $self->{column_prev} - 5);
1736            } else {
1737              !!!cp (129.1);
1738            }
1739          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
1740          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
1741                                    quirks => 1,                                    quirks => 1,
# Line 1477  sub _get_next_token ($) { Line 1748  sub _get_next_token ($) {
1748          !!!cp (132);                  !!!cp (132);        
1749          !!!parse-error (type => 'bogus comment',          !!!parse-error (type => 'bogus comment',
1750                          line => $self->{line_prev},                          line => $self->{line_prev},
1751                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
1752          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
1753          ## Reconsume.          ## Reconsume.
1754          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
1755                                    data => $self->{s_kwd},                                    data => $self->{kwd},
1756                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1757                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
1758                                   };                                   };
1759          redo A;          redo A;
1760        }        }
# Line 1494  sub _get_next_token ($) { Line 1765  sub _get_next_token ($) {
1765              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
1766              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
1767              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
1768            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
1769          !!!cp (135.1);          !!!cp (135.1);
1770          ## Stay in the state.          ## Stay in the state.
1771          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
1772          !!!next-input-character;          !!!next-input-character;
1773          redo A;          redo A;
1774        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
1775                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1776          !!!cp (135.2);          if ($self->{is_xml} and
1777                not $self->{tainted} and
1778                @{$self->{open_elements} or []} == 0) {
1779              !!!cp (135.2);
1780              !!!parse-error (type => 'cdata outside of root element',
1781                              line => $self->{line_prev},
1782                              column => $self->{column_prev} - 7);
1783              $self->{tainted} = 1;
1784            } else {
1785              !!!cp (135.21);
1786            }
1787    
1788          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
1789                                    data => '',                                    data => '',
1790                                    line => $self->{line_prev},                                    line => $self->{line_prev},
# Line 1514  sub _get_next_token ($) { Line 1796  sub _get_next_token ($) {
1796          !!!cp (135.3);          !!!cp (135.3);
1797          !!!parse-error (type => 'bogus comment',          !!!parse-error (type => 'bogus comment',
1798                          line => $self->{line_prev},                          line => $self->{line_prev},
1799                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
1800          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
1801          ## Reconsume.          ## Reconsume.
1802          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
1803                                    data => $self->{s_kwd},                                    data => $self->{kwd},
1804                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1805                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
1806                                   };                                   };
1807          redo A;          redo A;
1808        }        }
# Line 1531  sub _get_next_token ($) { Line 1813  sub _get_next_token ($) {
1813          !!!next-input-character;          !!!next-input-character;
1814          redo A;          redo A;
1815        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (138);  
1816          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1817          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1818              !!!cp (138.1);
1819              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1820            } else {
1821              !!!cp (138);
1822              $self->{state} = DATA_STATE;
1823              $self->{s_kwd} = '';
1824            }
1825          !!!next-input-character;          !!!next-input-character;
1826    
1827          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1828    
1829          redo A;          redo A;
1830        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (139);  
1831          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1832          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1833              !!!cp (139.1);
1834              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1835            } else {
1836              !!!cp (139);
1837              $self->{state} = DATA_STATE;
1838              $self->{s_kwd} = '';
1839            }
1840          ## reconsume          ## reconsume
1841    
1842          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1563  sub _get_next_token ($) { Line 1857  sub _get_next_token ($) {
1857          !!!next-input-character;          !!!next-input-character;
1858          redo A;          redo A;
1859        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (142);  
1860          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1861          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1862              !!!cp (142.1);
1863              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1864            } else {
1865              !!!cp (142);
1866              $self->{state} = DATA_STATE;
1867              $self->{s_kwd} = '';
1868            }
1869          !!!next-input-character;          !!!next-input-character;
1870    
1871          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1872    
1873          redo A;          redo A;
1874        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (143);  
1875          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1876          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1877              !!!cp (143.1);
1878              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1879            } else {
1880              !!!cp (143);
1881              $self->{state} = DATA_STATE;
1882              $self->{s_kwd} = '';
1883            }
1884          ## reconsume          ## reconsume
1885    
1886          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1589  sub _get_next_token ($) { Line 1895  sub _get_next_token ($) {
1895          redo A;          redo A;
1896        }        }
1897      } elsif ($self->{state} == COMMENT_STATE) {      } elsif ($self->{state} == COMMENT_STATE) {
1898          ## XML5: "Comment state" and "DOCTYPE comment state".
1899    
1900        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1901          !!!cp (145);          !!!cp (145);
1902          $self->{state} = COMMENT_END_DASH_STATE;          $self->{state} = COMMENT_END_DASH_STATE;
1903          !!!next-input-character;          !!!next-input-character;
1904          redo A;          redo A;
1905        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (146);  
1906          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1907          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1908              !!!cp (146.1);
1909              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1910            } else {
1911              !!!cp (146);
1912              $self->{state} = DATA_STATE;
1913              $self->{s_kwd} = '';
1914            }
1915          ## reconsume          ## reconsume
1916    
1917          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1615  sub _get_next_token ($) { Line 1929  sub _get_next_token ($) {
1929          redo A;          redo A;
1930        }        }
1931      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1932          ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
1933    
1934        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1935          !!!cp (148);          !!!cp (148);
1936          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
1937          !!!next-input-character;          !!!next-input-character;
1938          redo A;          redo A;
1939        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (149);  
1940          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1941          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1942              !!!cp (149.1);
1943              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1944            } else {
1945              !!!cp (149);
1946              $self->{state} = DATA_STATE;
1947              $self->{s_kwd} = '';
1948            }
1949          ## reconsume          ## reconsume
1950    
1951          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1637  sub _get_next_token ($) { Line 1959  sub _get_next_token ($) {
1959          redo A;          redo A;
1960        }        }
1961      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE) {
1962          ## XML5: "Comment end state" and "DOCTYPE comment end state".
1963    
1964        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1965          !!!cp (151);          if ($self->{in_subset}) {
1966          $self->{state} = DATA_STATE;            !!!cp (151.1);
1967              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1968            } else {
1969              !!!cp (151);
1970              $self->{state} = DATA_STATE;
1971              $self->{s_kwd} = '';
1972            }
1973          !!!next-input-character;          !!!next-input-character;
1974    
1975          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1647  sub _get_next_token ($) { Line 1977  sub _get_next_token ($) {
1977          redo A;          redo A;
1978        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
1979          !!!cp (152);          !!!cp (152);
1980            ## XML5: Not a parse error.
1981          !!!parse-error (type => 'dash in comment',          !!!parse-error (type => 'dash in comment',
1982                          line => $self->{line_prev},                          line => $self->{line_prev},
1983                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 1655  sub _get_next_token ($) { Line 1986  sub _get_next_token ($) {
1986          !!!next-input-character;          !!!next-input-character;
1987          redo A;          redo A;
1988        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (153);  
1989          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1990          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1991              !!!cp (153.1);
1992              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1993            } else {
1994              !!!cp (153);
1995              $self->{state} = DATA_STATE;
1996              $self->{s_kwd} = '';
1997            }
1998          ## reconsume          ## reconsume
1999    
2000          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1665  sub _get_next_token ($) { Line 2002  sub _get_next_token ($) {
2002          redo A;          redo A;
2003        } else {        } else {
2004          !!!cp (154);          !!!cp (154);
2005            ## XML5: Not a parse error.
2006          !!!parse-error (type => 'dash in comment',          !!!parse-error (type => 'dash in comment',
2007                          line => $self->{line_prev},                          line => $self->{line_prev},
2008                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 1681  sub _get_next_token ($) { Line 2019  sub _get_next_token ($) {
2019          redo A;          redo A;
2020        } else {        } else {
2021          !!!cp (156);          !!!cp (156);
2022            ## XML5: Unless EOF, switch to the bogus comment state.
2023          !!!parse-error (type => 'no space before DOCTYPE name');          !!!parse-error (type => 'no space before DOCTYPE name');
2024          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2025          ## reconsume          ## reconsume
2026          redo A;          redo A;
2027        }        }
2028      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2029          ## XML5: "DOCTYPE root name before state".
2030    
2031        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2032          !!!cp (157);          !!!cp (157);
2033          ## Stay in the state          ## Stay in the state
# Line 1694  sub _get_next_token ($) { Line 2035  sub _get_next_token ($) {
2035          redo A;          redo A;
2036        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2037          !!!cp (158);          !!!cp (158);
2038            ## XML5: No parse error.
2039          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
2040          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2041            $self->{s_kwd} = '';
2042          !!!next-input-character;          !!!next-input-character;
2043    
2044          !!!emit ($self->{ct}); # DOCTYPE (quirks)          !!!emit ($self->{ct}); # DOCTYPE (quirks)
# Line 1705  sub _get_next_token ($) { Line 2048  sub _get_next_token ($) {
2048          !!!cp (159);          !!!cp (159);
2049          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
2050          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2051            $self->{s_kwd} = '';
2052          ## reconsume          ## reconsume
2053    
2054          !!!emit ($self->{ct}); # DOCTYPE (quirks)          !!!emit ($self->{ct}); # DOCTYPE (quirks)
2055    
2056          redo A;          redo A;
2057          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2058            !!!cp (159.1);
2059            !!!parse-error (type => 'no DOCTYPE name');
2060            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2061            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2062            $self->{in_subset} = 1;
2063            !!!next-input-character;
2064            !!!emit ($self->{ct}); # DOCTYPE
2065            redo A;
2066        } else {        } else {
2067          !!!cp (160);          !!!cp (160);
2068          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 1719  sub _get_next_token ($) { Line 2072  sub _get_next_token ($) {
2072          redo A;          redo A;
2073        }        }
2074      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2075  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
2076    
2077          ## ISSUE: Redundant "First," in the spec.
2078    
2079        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2080          !!!cp (161);          !!!cp (161);
2081          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 1728  sub _get_next_token ($) { Line 2084  sub _get_next_token ($) {
2084        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2085          !!!cp (162);          !!!cp (162);
2086          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2087            $self->{s_kwd} = '';
2088          !!!next-input-character;          !!!next-input-character;
2089    
2090          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 1737  sub _get_next_token ($) { Line 2094  sub _get_next_token ($) {
2094          !!!cp (163);          !!!cp (163);
2095          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2096          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2097            $self->{s_kwd} = '';
2098          ## reconsume          ## reconsume
2099    
2100          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2101          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2102    
2103          redo A;          redo A;
2104          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2105            !!!cp (163.1);
2106            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2107            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2108            $self->{in_subset} = 1;
2109            !!!next-input-character;
2110            !!!emit ($self->{ct}); # DOCTYPE
2111            redo A;
2112        } else {        } else {
2113          !!!cp (164);          !!!cp (164);
2114          $self->{ct}->{name}          $self->{ct}->{name}
# Line 1752  sub _get_next_token ($) { Line 2118  sub _get_next_token ($) {
2118          redo A;          redo A;
2119        }        }
2120      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2121          ## XML5: Corresponding to XML5's "DOCTYPE root name after
2122          ## state", but implemented differently.
2123    
2124        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2125          !!!cp (165);          !!!cp (165);
2126          ## Stay in the state          ## Stay in the state
# Line 1760  sub _get_next_token ($) { Line 2129  sub _get_next_token ($) {
2129        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2130          !!!cp (166);          !!!cp (166);
2131          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2132            $self->{s_kwd} = '';
2133          !!!next-input-character;          !!!next-input-character;
2134    
2135          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 1769  sub _get_next_token ($) { Line 2139  sub _get_next_token ($) {
2139          !!!cp (167);          !!!cp (167);
2140          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2141          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2142            $self->{s_kwd} = '';
2143          ## reconsume          ## reconsume
2144    
2145          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1777  sub _get_next_token ($) { Line 2148  sub _get_next_token ($) {
2148          redo A;          redo A;
2149        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
2150                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
2151            !!!cp (167.1);
2152          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
2153          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2154          !!!next-input-character;          !!!next-input-character;
2155          redo A;          redo A;
2156        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
2157                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
2158            !!!cp (167.2);
2159          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
2160          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2161            !!!next-input-character;
2162            redo A;
2163          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2164            !!!cp (167.3);
2165            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2166            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2167            $self->{in_subset} = 1;
2168          !!!next-input-character;          !!!next-input-character;
2169            !!!emit ($self->{ct}); # DOCTYPE
2170          redo A;          redo A;
2171        } else {        } else {
2172          !!!cp (180);          !!!cp (180);
# Line 1804  sub _get_next_token ($) { Line 2185  sub _get_next_token ($) {
2185              0x0042, # B              0x0042, # B
2186              0x004C, # L              0x004C, # L
2187              0x0049, # I              0x0049, # I
2188            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2189            $self->{nc} == [            $self->{nc} == [
2190              undef,              undef,
2191              0x0075, # u              0x0075, # u
2192              0x0062, # b              0x0062, # b
2193              0x006C, # l              0x006C, # l
2194              0x0069, # i              0x0069, # i
2195            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2196          !!!cp (175);          !!!cp (175);
2197          ## Stay in the state.          ## Stay in the state.
2198          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2199          !!!next-input-character;          !!!next-input-character;
2200          redo A;          redo A;
2201        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
2202                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
2203                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
2204          !!!cp (168);          if ($self->{is_xml} and
2205                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2206              !!!cp (168.1);
2207              !!!parse-error (type => 'lowercase keyword', ## TODO: type
2208                              text => 'PUBLIC',
2209                              line => $self->{line_prev},
2210                              column => $self->{column_prev} - 4);
2211            } else {
2212              !!!cp (168);
2213            }
2214          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2215          !!!next-input-character;          !!!next-input-character;
2216          redo A;          redo A;
# Line 1828  sub _get_next_token ($) { Line 2218  sub _get_next_token ($) {
2218          !!!cp (169);          !!!cp (169);
2219          !!!parse-error (type => 'string after DOCTYPE name',          !!!parse-error (type => 'string after DOCTYPE name',
2220                          line => $self->{line_prev},                          line => $self->{line_prev},
2221                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2222          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2223    
2224          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
# Line 1843  sub _get_next_token ($) { Line 2233  sub _get_next_token ($) {
2233              0x0053, # S              0x0053, # S
2234              0x0054, # T              0x0054, # T
2235              0x0045, # E              0x0045, # E
2236            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2237            $self->{nc} == [            $self->{nc} == [
2238              undef,              undef,
2239              0x0079, # y              0x0079, # y
2240              0x0073, # s              0x0073, # s
2241              0x0074, # t              0x0074, # t
2242              0x0065, # e              0x0065, # e
2243            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2244          !!!cp (170);          !!!cp (170);
2245          ## Stay in the state.          ## Stay in the state.
2246          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2247          !!!next-input-character;          !!!next-input-character;
2248          redo A;          redo A;
2249        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
2250                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
2251                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
2252          !!!cp (171);          if ($self->{is_xml} and
2253                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2254              !!!cp (171.1);
2255              !!!parse-error (type => 'lowercase keyword', ## TODO: type
2256                              text => 'SYSTEM',
2257                              line => $self->{line_prev},
2258                              column => $self->{column_prev} - 4);
2259            } else {
2260              !!!cp (171);
2261            }
2262          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2263          !!!next-input-character;          !!!next-input-character;
2264          redo A;          redo A;
# Line 1867  sub _get_next_token ($) { Line 2266  sub _get_next_token ($) {
2266          !!!cp (172);          !!!cp (172);
2267          !!!parse-error (type => 'string after DOCTYPE name',          !!!parse-error (type => 'string after DOCTYPE name',
2268                          line => $self->{line_prev},                          line => $self->{line_prev},
2269                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2270          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2271    
2272          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
# Line 1897  sub _get_next_token ($) { Line 2296  sub _get_next_token ($) {
2296          !!!parse-error (type => 'no PUBLIC literal');          !!!parse-error (type => 'no PUBLIC literal');
2297    
2298          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2299            $self->{s_kwd} = '';
2300          !!!next-input-character;          !!!next-input-character;
2301    
2302          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1908  sub _get_next_token ($) { Line 2308  sub _get_next_token ($) {
2308          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2309    
2310          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2311            $self->{s_kwd} = '';
2312          ## reconsume          ## reconsume
2313    
2314          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2315          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2316    
2317          redo A;          redo A;
2318          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2319            !!!cp (186.1);
2320            !!!parse-error (type => 'no PUBLIC literal');
2321            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2322            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2323            $self->{in_subset} = 1;
2324            !!!next-input-character;
2325            !!!emit ($self->{ct}); # DOCTYPE
2326            redo A;
2327        } else {        } else {
2328          !!!cp (186);          !!!cp (186);
2329          !!!parse-error (type => 'string after PUBLIC');          !!!parse-error (type => 'string after PUBLIC');
# Line 1934  sub _get_next_token ($) { Line 2344  sub _get_next_token ($) {
2344          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2345    
2346          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2347            $self->{s_kwd} = '';
2348          !!!next-input-character;          !!!next-input-character;
2349    
2350          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1945  sub _get_next_token ($) { Line 2356  sub _get_next_token ($) {
2356          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2357    
2358          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2359            $self->{s_kwd} = '';
2360          ## reconsume          ## reconsume
2361    
2362          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1973  sub _get_next_token ($) { Line 2385  sub _get_next_token ($) {
2385          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2386    
2387          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2388            $self->{s_kwd} = '';
2389          !!!next-input-character;          !!!next-input-character;
2390    
2391          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1984  sub _get_next_token ($) { Line 2397  sub _get_next_token ($) {
2397          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2398    
2399          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2400            $self->{s_kwd} = '';
2401          ## reconsume          ## reconsume
2402    
2403          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2020  sub _get_next_token ($) { Line 2434  sub _get_next_token ($) {
2434          !!!next-input-character;          !!!next-input-character;
2435          redo A;          redo A;
2436        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2437          !!!cp (198);          if ($self->{is_xml}) {
2438              !!!cp (198.1);
2439              !!!parse-error (type => 'no SYSTEM literal');
2440            } else {
2441              !!!cp (198);
2442            }
2443          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2444            $self->{s_kwd} = '';
2445          !!!next-input-character;          !!!next-input-character;
2446    
2447          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 2032  sub _get_next_token ($) { Line 2452  sub _get_next_token ($) {
2452          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2453    
2454          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2455            $self->{s_kwd} = '';
2456          ## reconsume          ## reconsume
2457    
2458          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2459          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2460    
2461          redo A;          redo A;
2462          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2463            !!!cp (200.1);
2464            !!!parse-error (type => 'no SYSTEM literal');
2465            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2466            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2467            $self->{in_subset} = 1;
2468            !!!next-input-character;
2469            !!!emit ($self->{ct}); # DOCTYPE
2470            redo A;
2471        } else {        } else {
2472          !!!cp (200);          !!!cp (200);
2473          !!!parse-error (type => 'string after PUBLIC literal');          !!!parse-error (type => 'string after PUBLIC literal');
# Line 2069  sub _get_next_token ($) { Line 2499  sub _get_next_token ($) {
2499          !!!cp (204);          !!!cp (204);
2500          !!!parse-error (type => 'no SYSTEM literal');          !!!parse-error (type => 'no SYSTEM literal');
2501          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2502            $self->{s_kwd} = '';
2503          !!!next-input-character;          !!!next-input-character;
2504    
2505          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2080  sub _get_next_token ($) { Line 2511  sub _get_next_token ($) {
2511          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2512    
2513          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2514            $self->{s_kwd} = '';
2515          ## reconsume          ## reconsume
2516    
2517          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2518          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2519    
2520          redo A;          redo A;
2521          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2522            !!!cp (206.1);
2523            !!!parse-error (type => 'no SYSTEM literal');
2524    
2525            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2526            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2527            $self->{in_subset} = 1;
2528            !!!next-input-character;
2529            !!!emit ($self->{ct}); # DOCTYPE
2530            redo A;
2531        } else {        } else {
2532          !!!cp (206);          !!!cp (206);
2533          !!!parse-error (type => 'string after SYSTEM');          !!!parse-error (type => 'string after SYSTEM');
# Line 2101  sub _get_next_token ($) { Line 2543  sub _get_next_token ($) {
2543          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2544          !!!next-input-character;          !!!next-input-character;
2545          redo A;          redo A;
2546        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2547          !!!cp (208);          !!!cp (208);
2548          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2549    
2550          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2551            $self->{s_kwd} = '';
2552          !!!next-input-character;          !!!next-input-character;
2553    
2554          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2117  sub _get_next_token ($) { Line 2560  sub _get_next_token ($) {
2560          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2561    
2562          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2563            $self->{s_kwd} = '';
2564          ## reconsume          ## reconsume
2565    
2566          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2140  sub _get_next_token ($) { Line 2584  sub _get_next_token ($) {
2584          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2585          !!!next-input-character;          !!!next-input-character;
2586          redo A;          redo A;
2587        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2588          !!!cp (212);          !!!cp (212);
2589          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2590    
2591          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2592            $self->{s_kwd} = '';
2593          !!!next-input-character;          !!!next-input-character;
2594    
2595          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2156  sub _get_next_token ($) { Line 2601  sub _get_next_token ($) {
2601          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2602    
2603          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2604            $self->{s_kwd} = '';
2605          ## reconsume          ## reconsume
2606    
2607          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2182  sub _get_next_token ($) { Line 2628  sub _get_next_token ($) {
2628        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2629          !!!cp (216);          !!!cp (216);
2630          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2631            $self->{s_kwd} = '';
2632          !!!next-input-character;          !!!next-input-character;
2633    
2634          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 2191  sub _get_next_token ($) { Line 2638  sub _get_next_token ($) {
2638          !!!cp (217);          !!!cp (217);
2639          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2640          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2641            $self->{s_kwd} = '';
2642          ## reconsume          ## reconsume
2643    
2644          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2645          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2646    
2647          redo A;          redo A;
2648          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2649            !!!cp (218.1);
2650            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2651            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2652            $self->{in_subset} = 1;
2653            !!!next-input-character;
2654            !!!emit ($self->{ct}); # DOCTYPE
2655            redo A;
2656        } else {        } else {
2657          !!!cp (218);          !!!cp (218);
2658          !!!parse-error (type => 'string after SYSTEM literal');          !!!parse-error (type => 'string after SYSTEM literal');
# Line 2210  sub _get_next_token ($) { Line 2666  sub _get_next_token ($) {
2666        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2667          !!!cp (219);          !!!cp (219);
2668          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2669            $self->{s_kwd} = '';
2670          !!!next-input-character;          !!!next-input-character;
2671    
2672          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2673    
2674          redo A;          redo A;
2675          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2676            !!!cp (220.1);
2677            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2678            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2679            $self->{in_subset} = 1;
2680            !!!next-input-character;
2681            !!!emit ($self->{ct}); # DOCTYPE
2682            redo A;
2683        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2684          !!!cp (220);          !!!cp (220);
2685          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2686            $self->{s_kwd} = '';
2687          ## reconsume          ## reconsume
2688    
2689          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 2226  sub _get_next_token ($) { Line 2692  sub _get_next_token ($) {
2692        } else {        } else {
2693          !!!cp (221);          !!!cp (221);
2694          my $s = '';          my $s = '';
2695          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
2696    
2697          ## Stay in the state          ## Stay in the state
2698          !!!next-input-character;          !!!next-input-character;
# Line 2236  sub _get_next_token ($) { Line 2702  sub _get_next_token ($) {
2702        ## NOTE: "CDATA section state" in the state is jointly implemented        ## NOTE: "CDATA section state" in the state is jointly implemented
2703        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2704        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
2705    
2706          ## XML5: "CDATA state".
2707                
2708        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
2709          !!!cp (221.1);          !!!cp (221.1);
# Line 2243  sub _get_next_token ($) { Line 2711  sub _get_next_token ($) {
2711          !!!next-input-character;          !!!next-input-character;
2712          redo A;          redo A;
2713        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2714            if ($self->{is_xml}) {
2715              !!!cp (221.11);
2716              !!!parse-error (type => 'no mse'); ## TODO: type
2717            } else {
2718              !!!cp (221.12);
2719            }
2720    
2721          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2722          !!!next-input-character;          $self->{s_kwd} = '';
2723            ## Reconsume.
2724          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
2725            !!!cp (221.2);            !!!cp (221.2);
2726            !!!emit ($self->{ct}); # character            !!!emit ($self->{ct}); # character
# Line 2267  sub _get_next_token ($) { Line 2743  sub _get_next_token ($) {
2743    
2744        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
2745      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2746          ## XML5: "CDATA bracket state".
2747    
2748        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
2749          !!!cp (221.5);          !!!cp (221.5);
2750          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 2274  sub _get_next_token ($) { Line 2752  sub _get_next_token ($) {
2752          redo A;          redo A;
2753        } else {        } else {
2754          !!!cp (221.6);          !!!cp (221.6);
2755            ## XML5: If EOF, "]" is not appended and changed to the data state.
2756          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
2757          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
2758          ## Reconsume.          ## Reconsume.
2759          redo A;          redo A;
2760        }        }
2761      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2762          ## XML5: "CDATA end state".
2763    
2764        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2765          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2766            $self->{s_kwd} = '';
2767          !!!next-input-character;          !!!next-input-character;
2768          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
2769            !!!cp (221.7);            !!!cp (221.7);
# Line 2301  sub _get_next_token ($) { Line 2783  sub _get_next_token ($) {
2783          !!!cp (221.11);          !!!cp (221.11);
2784          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
2785          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
2786          ## Reconsume.          ## Reconsume. ## XML5: Emit.
2787          redo A;          redo A;
2788        }        }
2789      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {
# Line 2318  sub _get_next_token ($) { Line 2800  sub _get_next_token ($) {
2800        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
2801          !!!cp (999);          !!!cp (999);
2802          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
2803          $self->{s_kwd} = '#';          $self->{kwd} = '#';
2804          !!!next-input-character;          !!!next-input-character;
2805          redo A;          redo A;
2806        } elsif ((0x0041 <= $self->{nc} and        } elsif ((0x0041 <= $self->{nc} and
# Line 2328  sub _get_next_token ($) { Line 2810  sub _get_next_token ($) {
2810          !!!cp (998);          !!!cp (998);
2811          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
2812          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
2813          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2814          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
2815          $self->{entity__match} = 0;          $self->{entity__match} = 0;
2816          !!!next-input-character;          !!!next-input-character;
2817          redo A;          redo A;
# Line 2349  sub _get_next_token ($) { Line 2831  sub _get_next_token ($) {
2831        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
2832          !!!cp (997);          !!!cp (997);
2833          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2834            $self->{s_kwd} = '';
2835          ## Reconsume.          ## Reconsume.
2836          !!!emit ({type => CHARACTER_TOKEN, data => '&',          !!!emit ({type => CHARACTER_TOKEN, data => '&',
2837                    line => $self->{line_prev},                    line => $self->{line_prev},
# Line 2359  sub _get_next_token ($) { Line 2842  sub _get_next_token ($) {
2842          !!!cp (996);          !!!cp (996);
2843          $self->{ca}->{value} .= '&';          $self->{ca}->{value} .= '&';
2844          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2845            $self->{s_kwd} = '';
2846          ## Reconsume.          ## Reconsume.
2847          redo A;          redo A;
2848        }        }
# Line 2367  sub _get_next_token ($) { Line 2851  sub _get_next_token ($) {
2851            $self->{nc} == 0x0058) { # X            $self->{nc} == 0x0058) { # X
2852          !!!cp (995);          !!!cp (995);
2853          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
2854          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2855          !!!next-input-character;          !!!next-input-character;
2856          redo A;          redo A;
2857        } elsif (0x0030 <= $self->{nc} and        } elsif (0x0030 <= $self->{nc} and
2858                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
2859          !!!cp (994);          !!!cp (994);
2860          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
2861          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
2862          !!!next-input-character;          !!!next-input-character;
2863          redo A;          redo A;
2864        } else {        } else {
# Line 2389  sub _get_next_token ($) { Line 2873  sub _get_next_token ($) {
2873          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
2874            !!!cp (1019);            !!!cp (1019);
2875            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2876              $self->{s_kwd} = '';
2877            ## Reconsume.            ## Reconsume.
2878            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
2879                      data => '&#',                      data => '&#',
# Line 2400  sub _get_next_token ($) { Line 2885  sub _get_next_token ($) {
2885            !!!cp (993);            !!!cp (993);
2886            $self->{ca}->{value} .= '&#';            $self->{ca}->{value} .= '&#';
2887            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2888              $self->{s_kwd} = '';
2889            ## Reconsume.            ## Reconsume.
2890            redo A;            redo A;
2891          }          }
# Line 2408  sub _get_next_token ($) { Line 2894  sub _get_next_token ($) {
2894        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
2895            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
2896          !!!cp (1012);          !!!cp (1012);
2897          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
2898          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
2899                    
2900          ## Stay in the state.          ## Stay in the state.
2901          !!!next-input-character;          !!!next-input-character;
# Line 2425  sub _get_next_token ($) { Line 2911  sub _get_next_token ($) {
2911          #          #
2912        }        }
2913    
2914        my $code = $self->{s_kwd};        my $code = $self->{kwd};
2915        my $l = $self->{line_prev};        my $l = $self->{line_prev};
2916        my $c = $self->{column_prev};        my $c = $self->{column_prev};
2917        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 2445  sub _get_next_token ($) { Line 2931  sub _get_next_token ($) {
2931        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
2932          !!!cp (992);          !!!cp (992);
2933          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2934            $self->{s_kwd} = '';
2935          ## Reconsume.          ## Reconsume.
2936          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2937                      has_reference => 1,
2938                    line => $l, column => $c,                    line => $l, column => $c,
2939                   });                   });
2940          redo A;          redo A;
# Line 2455  sub _get_next_token ($) { Line 2943  sub _get_next_token ($) {
2943          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
2944          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
2945          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2946            $self->{s_kwd} = '';
2947          ## Reconsume.          ## Reconsume.
2948          redo A;          redo A;
2949        }        }
# Line 2465  sub _get_next_token ($) { Line 2954  sub _get_next_token ($) {
2954          # 0..9, A..F, a..f          # 0..9, A..F, a..f
2955          !!!cp (990);          !!!cp (990);
2956          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
2957          $self->{s_kwd} = 0;          $self->{kwd} = 0;
2958          ## Reconsume.          ## Reconsume.
2959          redo A;          redo A;
2960        } else {        } else {
# Line 2480  sub _get_next_token ($) { Line 2969  sub _get_next_token ($) {
2969          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
2970            !!!cp (1005);            !!!cp (1005);
2971            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2972              $self->{s_kwd} = '';
2973            ## Reconsume.            ## Reconsume.
2974            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
2975                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
2976                      line => $self->{line_prev},                      line => $self->{line_prev},
2977                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
2978                     });                     });
2979            redo A;            redo A;
2980          } else {          } else {
2981            !!!cp (989);            !!!cp (989);
2982            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
2983            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2984              $self->{s_kwd} = '';
2985            ## Reconsume.            ## Reconsume.
2986            redo A;            redo A;
2987          }          }
# Line 2499  sub _get_next_token ($) { Line 2990  sub _get_next_token ($) {
2990        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2991          # 0..9          # 0..9
2992          !!!cp (1002);          !!!cp (1002);
2993          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
2994          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
2995          ## Stay in the state.          ## Stay in the state.
2996          !!!next-input-character;          !!!next-input-character;
2997          redo A;          redo A;
2998        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
2999                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
3000          !!!cp (1003);          !!!cp (1003);
3001          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
3002          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
3003          ## Stay in the state.          ## Stay in the state.
3004          !!!next-input-character;          !!!next-input-character;
3005          redo A;          redo A;
3006        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
3007                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
3008          !!!cp (1004);          !!!cp (1004);
3009          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
3010          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
3011          ## Stay in the state.          ## Stay in the state.
3012          !!!next-input-character;          !!!next-input-character;
3013          redo A;          redo A;
# Line 2533  sub _get_next_token ($) { Line 3024  sub _get_next_token ($) {
3024          #          #
3025        }        }
3026    
3027        my $code = $self->{s_kwd};        my $code = $self->{kwd};
3028        my $l = $self->{line_prev};        my $l = $self->{line_prev};
3029        my $c = $self->{column_prev};        my $c = $self->{column_prev};
3030        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 2553  sub _get_next_token ($) { Line 3044  sub _get_next_token ($) {
3044        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
3045          !!!cp (988);          !!!cp (988);
3046          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
3047            $self->{s_kwd} = '';
3048          ## Reconsume.          ## Reconsume.
3049          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3050                      has_reference => 1,
3051                    line => $l, column => $c,                    line => $l, column => $c,
3052                   });                   });
3053          redo A;          redo A;
# Line 2563  sub _get_next_token ($) { Line 3056  sub _get_next_token ($) {
3056          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
3057          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
3058          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
3059            $self->{s_kwd} = '';
3060          ## Reconsume.          ## Reconsume.
3061          redo A;          redo A;
3062        }        }
3063      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
3064        if (length $self->{s_kwd} < 30 and        if (length $self->{kwd} < 30 and
3065            ## NOTE: Some number greater than the maximum length of entity name            ## NOTE: Some number greater than the maximum length of entity name
3066            ((0x0041 <= $self->{nc} and # a            ((0x0041 <= $self->{nc} and # a
3067              $self->{nc} <= 0x005A) or # x              $self->{nc} <= 0x005A) or # x
# Line 2577  sub _get_next_token ($) { Line 3071  sub _get_next_token ($) {
3071              $self->{nc} <= 0x0039) or # 9              $self->{nc} <= 0x0039) or # 9
3072             $self->{nc} == 0x003B)) { # ;             $self->{nc} == 0x003B)) { # ;
3073          our $EntityChar;          our $EntityChar;
3074          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3075          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}}) {
3076            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
3077              !!!cp (1020);              !!!cp (1020);
3078              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
3079              $self->{entity__match} = 1;              $self->{entity__match} = 1;
3080              !!!next-input-character;              !!!next-input-character;
3081              #              #
3082            } else {            } else {
3083              !!!cp (1021);              !!!cp (1021);
3084              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
3085              $self->{entity__match} = -1;              $self->{entity__match} = -1;
3086              ## Stay in the state.              ## Stay in the state.
3087              !!!next-input-character;              !!!next-input-character;
# Line 2615  sub _get_next_token ($) { Line 3109  sub _get_next_token ($) {
3109          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
3110              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
3111            !!!cp (1024);            !!!cp (1024);
3112            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
3113            #            #
3114          } else {          } else {
3115            !!!cp (1025);            !!!cp (1025);
# Line 2627  sub _get_next_token ($) { Line 3121  sub _get_next_token ($) {
3121          !!!cp (1026);          !!!cp (1026);
3122          !!!parse-error (type => 'bare ero',          !!!parse-error (type => 'bare ero',
3123                          line => $self->{line_prev},                          line => $self->{line_prev},
3124                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
3125          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
3126          #          #
3127        }        }
3128        
# Line 2645  sub _get_next_token ($) { Line 3139  sub _get_next_token ($) {
3139        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
3140          !!!cp (986);          !!!cp (986);
3141          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
3142            $self->{s_kwd} = '';
3143          ## Reconsume.          ## Reconsume.
3144          !!!emit ({type => CHARACTER_TOKEN,          !!!emit ({type => CHARACTER_TOKEN,
3145                    data => $data,                    data => $data,
3146                      has_reference => $has_ref,
3147                    line => $self->{line_prev},                    line => $self->{line_prev},
3148                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
3149                   });                   });
3150          redo A;          redo A;
3151        } else {        } else {
# Line 2657  sub _get_next_token ($) { Line 3153  sub _get_next_token ($) {
3153          $self->{ca}->{value} .= $data;          $self->{ca}->{value} .= $data;
3154          $self->{ca}->{has_reference} = 1 if $has_ref;          $self->{ca}->{has_reference} = 1 if $has_ref;
3155          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
3156            $self->{s_kwd} = '';
3157            ## Reconsume.
3158            redo A;
3159          }
3160    
3161        ## XML-only states
3162    
3163        } elsif ($self->{state} == PI_STATE) {
3164          ## XML5: "Pi state" and "DOCTYPE pi state".
3165    
3166          if ($is_space->{$self->{nc}} or
3167              $self->{nc} == 0x003F or # ?
3168              $self->{nc} == -1) {
3169            ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3170            ## pi state": Switch to the "DOCTYPE pi after state".  EOF:
3171            ## "DOCTYPE pi state": Parse error, switch to the "data
3172            ## state".
3173            !!!parse-error (type => 'bare pio', ## TODO: type
3174                            line => $self->{line_prev},
3175                            column => $self->{column_prev}
3176                                - 1 * ($self->{nc} != -1));
3177            $self->{state} = BOGUS_COMMENT_STATE;
3178          ## Reconsume.          ## Reconsume.
3179            $self->{ct} = {type => COMMENT_TOKEN,
3180                           data => '?',
3181                           line => $self->{line_prev},
3182                           column => $self->{column_prev}
3183                               - 1 * ($self->{nc} != -1),
3184                          };
3185            redo A;
3186          } else {
3187            ## XML5: "DOCTYPE pi state": Stay in the state.
3188            $self->{ct} = {type => PI_TOKEN,
3189                           target => chr $self->{nc},
3190                           data => '',
3191                           line => $self->{line_prev},
3192                           column => $self->{column_prev} - 1,
3193                          };
3194            $self->{state} = PI_TARGET_STATE;
3195            !!!next-input-character;
3196          redo A;          redo A;
3197        }        }
3198        } elsif ($self->{state} == PI_TARGET_STATE) {
3199          if ($is_space->{$self->{nc}}) {
3200            $self->{state} = PI_TARGET_AFTER_STATE;
3201            !!!next-input-character;
3202            redo A;
3203          } elsif ($self->{nc} == -1) {
3204            !!!parse-error (type => 'no pic'); ## TODO: type
3205            if ($self->{in_subset}) {
3206              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3207            } else {
3208              $self->{state} = DATA_STATE;
3209              $self->{s_kwd} = '';
3210            }
3211            ## Reconsume.
3212            !!!emit ($self->{ct}); # pi
3213            redo A;
3214          } elsif ($self->{nc} == 0x003F) { # ?
3215            $self->{state} = PI_AFTER_STATE;
3216            !!!next-input-character;
3217            redo A;
3218          } else {
3219            ## XML5: typo ("tag name" -> "target")
3220            $self->{ct}->{target} .= chr $self->{nc}; # pi
3221            !!!next-input-character;
3222            redo A;
3223          }
3224        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3225          if ($is_space->{$self->{nc}}) {
3226            ## Stay in the state.
3227            !!!next-input-character;
3228            redo A;
3229          } else {
3230            $self->{state} = PI_DATA_STATE;
3231            ## Reprocess.
3232            redo A;
3233          }
3234        } elsif ($self->{state} == PI_DATA_STATE) {
3235          if ($self->{nc} == 0x003F) { # ?
3236            $self->{state} = PI_DATA_AFTER_STATE;
3237            !!!next-input-character;
3238            redo A;
3239          } elsif ($self->{nc} == -1) {
3240            !!!parse-error (type => 'no pic'); ## TODO: type
3241            if ($self->{in_subset}) {
3242              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3243            } else {
3244              $self->{state} = DATA_STATE;
3245              $self->{s_kwd} = '';
3246            }
3247            ## Reprocess.
3248            !!!emit ($self->{ct}); # pi
3249            redo A;
3250          } else {
3251            $self->{ct}->{data} .= chr $self->{nc}; # pi
3252            $self->{read_until}->($self->{ct}->{data}, q[?],
3253                                  length $self->{ct}->{data});
3254            ## Stay in the state.
3255            !!!next-input-character;
3256            ## Reprocess.
3257            redo A;
3258          }
3259        } elsif ($self->{state} == PI_AFTER_STATE) {
3260          ## XML5: Part of "Pi after state".
3261    
3262          if ($self->{nc} == 0x003E) { # >
3263            if ($self->{in_subset}) {
3264              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3265            } else {
3266              $self->{state} = DATA_STATE;
3267              $self->{s_kwd} = '';
3268            }
3269            !!!next-input-character;
3270            !!!emit ($self->{ct}); # pi
3271            redo A;
3272          } elsif ($self->{nc} == 0x003F) { # ?
3273            !!!parse-error (type => 'no s after target', ## TODO: type
3274                            line => $self->{line_prev},
3275                            column => $self->{column_prev}); ## XML5: no error
3276            $self->{ct}->{data} .= '?';
3277            $self->{state} = PI_DATA_AFTER_STATE;
3278            !!!next-input-character;
3279            redo A;
3280          } else {
3281            !!!parse-error (type => 'no s after target', ## TODO: type
3282                            line => $self->{line_prev},
3283                            column => $self->{column_prev}
3284                                + 1 * ($self->{nc} == -1)); ## XML5: no error
3285            $self->{ct}->{data} .= '?'; ## XML5: not appended
3286            $self->{state} = PI_DATA_STATE;
3287            ## Reprocess.
3288            redo A;
3289          }
3290        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3291          ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3292    
3293          if ($self->{nc} == 0x003E) { # >
3294            if ($self->{in_subset}) {
3295              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3296            } else {
3297              $self->{state} = DATA_STATE;
3298              $self->{s_kwd} = '';
3299            }
3300            !!!next-input-character;
3301            !!!emit ($self->{ct}); # pi
3302            redo A;
3303          } elsif ($self->{nc} == 0x003F) { # ?
3304            $self->{ct}->{data} .= '?';
3305            ## Stay in the state.
3306            !!!next-input-character;
3307            redo A;
3308          } else {
3309            $self->{ct}->{data} .= '?'; ## XML5: not appended
3310            $self->{state} = PI_DATA_STATE;
3311            ## Reprocess.
3312            redo A;
3313          }
3314    
3315        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3316          if ($self->{nc} == 0x003C) { # <
3317            $self->{state} = DOCTYPE_TAG_STATE;
3318            !!!next-input-character;
3319            redo A;
3320          } elsif ($self->{nc} == 0x0025) { # %
3321            ## XML5: Not defined yet.
3322    
3323            ## TODO:
3324            !!!next-input-character;
3325            redo A;
3326          } elsif ($self->{nc} == 0x005D) { # ]
3327            delete $self->{in_subset};
3328            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3329            !!!next-input-character;
3330            redo A;
3331          } elsif ($is_space->{$self->{nc}}) {
3332            ## Stay in the state.
3333            !!!next-input-character;
3334            redo A;
3335          } elsif ($self->{nc} == -1) {
3336            !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3337            delete $self->{in_subset};
3338            $self->{state} = DATA_STATE;
3339            $self->{s_kwd} = '';
3340            ## Reconsume.
3341            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3342            redo A;
3343          } else {
3344            unless ($self->{internal_subset_tainted}) {
3345              ## XML5: No parse error.
3346              !!!parse-error (type => 'string in internal subset');
3347              $self->{internal_subset_tainted} = 1;
3348            }
3349            ## Stay in the state.
3350            !!!next-input-character;
3351            redo A;
3352          }
3353        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3354          if ($self->{nc} == 0x003E) { # >
3355            $self->{state} = DATA_STATE;
3356            $self->{s_kwd} = '';
3357            !!!next-input-character;
3358            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3359            redo A;
3360          } elsif ($self->{nc} == -1) {
3361            !!!parse-error (type => 'unclosed DOCTYPE');
3362            $self->{state} = DATA_STATE;
3363            $self->{s_kwd} = '';
3364            ## Reconsume.
3365            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3366            redo A;
3367          } else {
3368            ## XML5: No parse error and stay in the state.
3369            !!!parse-error (type => 'string after internal subset'); ## TODO: type
3370    
3371            $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3372            !!!next-input-character;
3373            redo A;
3374          }
3375        } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3376          if ($self->{nc} == 0x003E) { # >
3377            $self->{state} = DATA_STATE;
3378            $self->{s_kwd} = '';
3379            !!!next-input-character;
3380            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3381            redo A;
3382          } elsif ($self->{nc} == -1) {
3383            $self->{state} = DATA_STATE;
3384            $self->{s_kwd} = '';
3385            ## Reconsume.
3386            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3387            redo A;
3388          } else {
3389            ## Stay in the state.
3390            !!!next-input-character;
3391            redo A;
3392          }
3393        } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3394          if ($self->{nc} == 0x0021) { # !
3395            $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3396            !!!next-input-character;
3397            redo A;
3398          } elsif ($self->{nc} == 0x003F) { # ?
3399            $self->{state} = PI_STATE;
3400            !!!next-input-character;
3401            redo A;
3402          } elsif ($self->{nc} == -1) {
3403            !!!parse-error (type => 'bare stago');
3404            $self->{state} = DATA_STATE;
3405            $self->{s_kwd} = '';
3406            ## Reconsume.
3407            redo A;
3408          } else {
3409            !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3410                            line => $self->{line_prev},
3411                            column => $self->{column_prev});
3412            $self->{state} = BOGUS_COMMENT_STATE;
3413            $self->{ct} = {type => COMMENT_TOKEN,
3414                           data => '',
3415                          }; ## NOTE: Will be discarded.
3416            !!!next-input-character;
3417            redo A;
3418          }
3419        } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3420          ## XML5: "DOCTYPE markup declaration state".
3421          
3422          if ($self->{nc} == 0x002D) { # -
3423            $self->{state} = MD_HYPHEN_STATE;
3424            !!!next-input-character;
3425            redo A;
3426          } elsif ($self->{nc} == 0x0045) { # E
3427            $self->{state} = MD_E_STATE;
3428            $self->{kwd} = chr $self->{nc};
3429            !!!next-input-character;
3430            redo A;
3431          } elsif ($self->{nc} == 0x0041) { # A
3432            $self->{state} = MD_ATTLIST_STATE;
3433            $self->{kwd} = chr $self->{nc};
3434            !!!next-input-character;
3435            redo A;
3436          } elsif ($self->{nc} == 0x004E) { # N
3437            $self->{state} = MD_NOTATION_STATE;
3438            $self->{kwd} = chr $self->{nc};
3439            !!!next-input-character;
3440            redo A;
3441          } else {
3442            #
3443          }
3444          
3445          ## XML5: No parse error.
3446          !!!parse-error (type => 'bogus comment',
3447                          line => $self->{line_prev},
3448                          column => $self->{column_prev} - 1);
3449          ## Reconsume.
3450          $self->{state} = BOGUS_COMMENT_STATE;
3451          $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3452          redo A;
3453        } elsif ($self->{state} == MD_E_STATE) {
3454          if ($self->{nc} == 0x004E) { # N
3455            $self->{state} = MD_ENTITY_STATE;
3456            $self->{kwd} .= chr $self->{nc};
3457            !!!next-input-character;
3458            redo A;
3459          } elsif ($self->{nc} == 0x004C) { # L
3460            ## XML5: <!ELEMENT> not supported.
3461            $self->{state} = MD_ELEMENT_STATE;
3462            $self->{kwd} .= chr $self->{nc};
3463            !!!next-input-character;
3464            redo A;
3465          } else {
3466            ## XML5: No parse error.
3467            !!!parse-error (type => 'bogus comment',
3468                            line => $self->{line_prev},
3469                            column => $self->{column_prev} - 2
3470                                + 1 * ($self->{nc} == -1));
3471            ## Reconsume.
3472            $self->{state} = BOGUS_COMMENT_STATE;
3473            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3474            redo A;
3475          }
3476        } elsif ($self->{state} == MD_ENTITY_STATE) {
3477          if ($self->{nc} == {
3478                'EN' => 0x0054, # T
3479                'ENT' => 0x0049, # I
3480                'ENTI' => 0x0054, # T
3481              }->{$self->{kwd}}) {
3482            ## Stay in the state.
3483            $self->{kwd} .= chr $self->{nc};
3484            !!!next-input-character;
3485            redo A;
3486          } elsif ($self->{kwd} eq 'ENTIT' and
3487                   $self->{nc} == 0x0059) { # Y
3488            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '', text => '',
3489                           line => $self->{line_prev},
3490                           column => $self->{column_prev} - 6};
3491            $self->{state} = DOCTYPE_MD_STATE;
3492            !!!next-input-character;
3493            redo A;
3494          } else {
3495            !!!parse-error (type => 'bogus comment',
3496                            line => $self->{line_prev},
3497                            column => $self->{column_prev} - 1
3498                                - (length $self->{kwd})
3499                                + 1 * ($self->{nc} == -1));
3500            $self->{state} = BOGUS_COMMENT_STATE;
3501            ## Reconsume.
3502            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3503            redo A;
3504          }
3505        } elsif ($self->{state} == MD_ELEMENT_STATE) {
3506          if ($self->{nc} == {
3507                'EL' => 0x0045, # E
3508                'ELE' => 0x004D, # M
3509                'ELEM' => 0x0045, # E
3510                'ELEME' => 0x004E, # N
3511              }->{$self->{kwd}}) {
3512            ## Stay in the state.
3513            $self->{kwd} .= chr $self->{nc};
3514            !!!next-input-character;
3515            redo A;
3516          } elsif ($self->{kwd} eq 'ELEMEN' and
3517                   $self->{nc} == 0x0054) { # T
3518            $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3519                           line => $self->{line_prev},
3520                           column => $self->{column_prev} - 6};
3521            $self->{state} = DOCTYPE_MD_STATE;
3522            !!!next-input-character;
3523            redo A;
3524          } else {
3525            !!!parse-error (type => 'bogus comment',
3526                            line => $self->{line_prev},
3527                            column => $self->{column_prev} - 1
3528                                - (length $self->{kwd})
3529                                + 1 * ($self->{nc} == -1));
3530            $self->{state} = BOGUS_COMMENT_STATE;
3531            ## Reconsume.
3532            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3533            redo A;
3534          }
3535        } elsif ($self->{state} == MD_ATTLIST_STATE) {
3536          if ($self->{nc} == {
3537                'A' => 0x0054, # T
3538                'AT' => 0x0054, # T
3539                'ATT' => 0x004C, # L
3540                'ATTL' => 0x0049, # I
3541                'ATTLI' => 0x0053, # S
3542              }->{$self->{kwd}}) {
3543            ## Stay in the state.
3544            $self->{kwd} .= chr $self->{nc};
3545            !!!next-input-character;
3546            redo A;
3547          } elsif ($self->{kwd} eq 'ATTLIS' and
3548                   $self->{nc} == 0x0054) { # T
3549            $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3550                           line => $self->{line_prev},
3551                           column => $self->{column_prev} - 6};
3552            $self->{state} = DOCTYPE_MD_STATE;
3553            !!!next-input-character;
3554            redo A;
3555          } else {
3556            !!!parse-error (type => 'bogus comment',
3557                            line => $self->{line_prev},
3558                            column => $self->{column_prev} - 1
3559                                 - (length $self->{kwd})
3560                                 + 1 * ($self->{nc} == -1));
3561            $self->{state} = BOGUS_COMMENT_STATE;
3562            ## Reconsume.
3563            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3564            redo A;
3565          }
3566        } elsif ($self->{state} == MD_NOTATION_STATE) {
3567          if ($self->{nc} == {
3568                'N' => 0x004F, # O
3569                'NO' => 0x0054, # T
3570                'NOT' => 0x0041, # A
3571                'NOTA' => 0x0054, # T
3572                'NOTAT' => 0x0049, # I
3573                'NOTATI' => 0x004F, # O
3574              }->{$self->{kwd}}) {
3575            ## Stay in the state.
3576            $self->{kwd} .= chr $self->{nc};
3577            !!!next-input-character;
3578            redo A;
3579          } elsif ($self->{kwd} eq 'NOTATIO' and
3580                   $self->{nc} == 0x004E) { # N
3581            $self->{ct} = {type => NOTATION_TOKEN, name => '',
3582                           line => $self->{line_prev},
3583                           column => $self->{column_prev} - 6};
3584            $self->{state} = DOCTYPE_MD_STATE;
3585            !!!next-input-character;
3586            redo A;
3587          } else {
3588            !!!parse-error (type => 'bogus comment',
3589                            line => $self->{line_prev},
3590                            column => $self->{column_prev} - 1
3591                                - (length $self->{kwd})
3592                                + 1 * ($self->{nc} == -1));
3593            $self->{state} = BOGUS_COMMENT_STATE;
3594            ## Reconsume.
3595            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3596            redo A;
3597          }
3598        } elsif ($self->{state} == DOCTYPE_MD_STATE) {
3599          ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
3600          ## "DOCTYPE NOTATION state".
3601    
3602          if ($is_space->{$self->{nc}}) {
3603            ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
3604            $self->{state} = BEFORE_MD_NAME_STATE;
3605            !!!next-input-character;
3606            redo A;
3607          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3608                   $self->{nc} == 0x0025) { # %
3609            ## XML5: Switch to the "DOCTYPE bogus comment state".
3610            !!!parse-error (type => 'no space before md name'); ## TODO: type
3611            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3612            !!!next-input-character;
3613            redo A;
3614          } elsif ($self->{nc} == -1) {
3615            !!!parse-error (type => 'unclosed md'); ## TODO: type
3616            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3617            ## Reconsume.
3618            redo A;
3619          } elsif ($self->{nc} == 0x003E) { # >
3620            ## XML5: Switch to the "DOCTYPE bogus comment state".
3621            !!!parse-error (type => 'no md name'); ## TODO: type
3622            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3623            !!!next-input-character;
3624            redo A;
3625          } else {
3626            ## XML5: Switch to the "DOCTYPE bogus comment state".
3627            !!!parse-error (type => 'no space before md name'); ## TODO: type
3628            $self->{state} = BEFORE_MD_NAME_STATE;
3629            redo A;
3630          }
3631        } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
3632          ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
3633          ## before state", "DOCTYPE ATTLIST name before state".
3634    
3635          if ($is_space->{$self->{nc}}) {
3636            ## Stay in the state.
3637            !!!next-input-character;
3638            redo A;
3639          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3640                   $self->{nc} == 0x0025) { # %
3641            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3642            !!!next-input-character;
3643            redo A;
3644          } elsif ($self->{nc} == 0x003E) { # >
3645            ## XML5: Same as "Anything else".
3646            !!!parse-error (type => 'no md name'); ## TODO: type
3647            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3648            !!!next-input-character;
3649            redo A;
3650          } elsif ($self->{nc} == -1) {
3651            !!!parse-error (type => 'unclosed md'); ## TODO: type
3652            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3653            ## Reconsume.
3654            redo A;
3655          } else {
3656            ## XML5: [ATTLIST] Not defined yet.
3657            $self->{ct}->{name} .= chr $self->{nc};
3658            $self->{state} = MD_NAME_STATE;
3659            !!!next-input-character;
3660            redo A;
3661          }
3662        } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
3663          if ($is_space->{$self->{nc}}) {
3664            ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
3665            $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
3666            $self->{state} = BEFORE_MD_NAME_STATE;
3667            !!!next-input-character;
3668            redo A;
3669          } elsif ($self->{nc} == 0x003E) { # >
3670            ## XML5: Same as "Anything else".
3671            !!!parse-error (type => 'no md name'); ## TODO: type
3672            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3673            !!!next-input-character;
3674            redo A;
3675          } elsif ($self->{nc} == -1) {
3676            !!!parse-error (type => 'unclosed md');
3677            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3678            ## Reconsume.
3679            redo A;
3680          } else {
3681            ## XML5: No parse error.
3682            !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
3683            $self->{state} = BOGUS_COMMENT_STATE;
3684            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3685            ## Reconsume.
3686            redo A;
3687          }
3688        } elsif ($self->{state} == MD_NAME_STATE) {
3689          ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
3690          
3691          if ($is_space->{$self->{nc}}) {
3692            ## TODO:
3693            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
3694            !!!next-input-character;
3695            redo A;
3696          } elsif ($self->{nc} == 0x003E) { # >
3697            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
3698              #
3699            } else {
3700              !!!parse-error (type => 'no md body'); ## TODO: type
3701            }
3702            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3703            !!!next-input-character;
3704            !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
3705            redo A;
3706          } elsif ($self->{nc} == -1) {
3707            ## XML5: [ATTLIST] No parse error.
3708            !!!parse-error (type => 'unclosed md');
3709            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3710            ## Reconsume.
3711            !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
3712            redo A;
3713          } else {
3714            ## XML5: [ATTLIST] Not defined yet.
3715            $self->{ct}->{name} .= chr $self->{nc};
3716            ## Stay in the state.
3717            !!!next-input-character;
3718            redo A;
3719          }
3720        } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
3721          if ($is_space->{$self->{nc}}) {
3722            ## Stay in the state.
3723            !!!next-input-character;
3724            redo A;
3725          } elsif ($self->{nc} == 0x003E) { # >
3726            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3727            !!!next-input-character;
3728            !!!emit ($self->{ct}); # ATTLIST
3729            redo A;
3730          } elsif ($self->{nc} == -1) {
3731            ## XML5: No parse error.
3732            !!!parse-error (type => 'unclosed md'); ## TODO: type
3733            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3734            redo A;
3735          } else {
3736            ## XML5: Not defined yet.
3737    
3738            ## TODO: ...
3739    
3740            $self->{state} = BOGUS_COMMENT_STATE;
3741            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3742            ## Reconsume.
3743            redo A;
3744          }
3745    
3746      } else {      } else {
3747        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
3748      }      }

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.14

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24