/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.6 by wakaba, Tue Oct 14 14:57:52 2008 UTC revision 1.14 by wakaba, Fri Oct 17 07:14:29 2008 UTC
# Line 15  BEGIN { Line 15  BEGIN {
15      CHARACTER_TOKEN      CHARACTER_TOKEN
16      PI_TOKEN      PI_TOKEN
17      ABORT_TOKEN      ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19        ATTLIST_TOKEN
20        ELEMENT_TOKEN
21        GENERAL_ENTITY_TOKEN
22        PARAMETER_ENTITY_TOKEN
23        NOTATION_TOKEN
24    );    );
25        
26    our %EXPORT_TAGS = (    our %EXPORT_TAGS = (
# Line 27  BEGIN { Line 33  BEGIN {
33        CHARACTER_TOKEN        CHARACTER_TOKEN
34        PI_TOKEN        PI_TOKEN
35        ABORT_TOKEN        ABORT_TOKEN
36          END_OF_DOCTYPE_TOKEN
37          ATTLIST_TOKEN
38          ELEMENT_TOKEN
39          GENERAL_ENTITY_TOKEN
40          PARAMETER_ENTITY_TOKEN
41          NOTATION_TOKEN
42      )],      )],
43    );    );
44  }  }
45    
46    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48  ## Token types  ## Token types
49    
50  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
52  sub START_TAG_TOKEN () { 3 }  sub START_TAG_TOKEN () { 3 }
53  sub END_TAG_TOKEN () { 4 }  sub END_TAG_TOKEN () { 4 }
54  sub END_OF_FILE_TOKEN () { 5 }  sub END_OF_FILE_TOKEN () { 5 }
55  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
56  sub PI_TOKEN () { 7 } # XML5  sub PI_TOKEN () { 7 } ## NOTE: XML only.
57  sub ABORT_TOKEN () { 8 } # Not a token actually  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59    sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60    sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61    sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62    sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63    sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64    
65    ## XML5: XML5 has "empty tag token".  In this implementation, it is
66    ## represented as a start tag token with $self->{self_closing} flag
67    ## set to true.
68    
69    ## XML5: XML5 has "short end tag token".  In this implementation, it
70    ## is represented as an end tag token with $token->{tag_name} set
71    ## to an empty string.
72    
73  package Whatpm::HTML;  package Whatpm::HTML;
74    
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 142  sub HEXREF_HEX_STATE () { 48 }
142  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
143  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145    ## XML-only states
146    sub PI_STATE () { 51 }
147    sub PI_TARGET_STATE () { 52 }
148    sub PI_TARGET_AFTER_STATE () { 53 }
149    sub PI_DATA_STATE () { 54 }
150    sub PI_AFTER_STATE () { 55 }
151    sub PI_DATA_AFTER_STATE () { 56 }
152    sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155    sub DOCTYPE_TAG_STATE () { 60 }
156    sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157    sub MD_ATTLIST_STATE () { 62 }
158    sub MD_E_STATE () { 63 }
159    sub MD_ELEMENT_STATE () { 64 }
160    sub MD_ENTITY_STATE () { 65 }
161    sub MD_NOTATION_STATE () { 66 }
162    sub DOCTYPE_MD_STATE () { 67 }
163    sub BEFORE_MD_NAME_STATE () { 68 }
164    sub MD_NAME_STATE () { 69 }
165    sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166    sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167    
168  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
169  ## list and descriptions)  ## list and descriptions)
170    
# Line 178  sub _initialize_tokenizer ($) { Line 229  sub _initialize_tokenizer ($) {
229    #$self->{is_xml} (if XML)    #$self->{is_xml} (if XML)
230    
231    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
232    $self->{s_kwd} = ''; # state keyword    $self->{s_kwd} = ''; # Data state keyword
233      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
234    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
235    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
236    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 198  sub _initialize_tokenizer ($) { Line 250  sub _initialize_tokenizer ($) {
250    
251  ## A token has:  ## A token has:
252  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
253  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
254  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
255  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
256    ##   ->{target} (PI_TOKEN)
257  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
258  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
259  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 208  sub _initialize_tokenizer ($) { Line 261  sub _initialize_tokenizer ($) {
261  ##        ->{name}  ##        ->{name}
262  ##        ->{value}  ##        ->{value}
263  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
264  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
265    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
266    ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
267    ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
268    ##   ->{has_internal_subset} == 1 or 0 (DOCTYPE_TOKEN)
269    
270  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
271  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
272  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 228  my $is_space = { Line 286  my $is_space = {
286    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
287    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
288    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
289    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
290    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
291    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
292  };  };
# Line 428  sub _get_next_token ($) { Line 486  sub _get_next_token ($) {
486        !!!emit ($token);        !!!emit ($token);
487        redo A;        redo A;
488      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
489          ## XML5: "tag state".
490    
491        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
492          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
493            !!!cp (15);            !!!cp (15);
# Line 436  sub _get_next_token ($) { Line 496  sub _get_next_token ($) {
496            redo A;            redo A;
497          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
498            !!!cp (15.1);            !!!cp (15.1);
499            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
500            #            #
501          } else {          } else {
502            !!!cp (16);            !!!cp (16);
503              $self->{s_kwd} = '';
504            #            #
505          }          }
506    
507          ## reconsume          ## reconsume
508          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
         $self->{s_kwd} = '';  
509          !!!emit ({type => CHARACTER_TOKEN, data => '<',          !!!emit ({type => CHARACTER_TOKEN, data => '<',
510                    line => $self->{line_prev},                    line => $self->{line_prev},
511                    column => $self->{column_prev},                    column => $self->{column_prev},
# Line 499  sub _get_next_token ($) { Line 559  sub _get_next_token ($) {
559    
560            redo A;            redo A;
561          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
562            !!!cp (22);            if ($self->{is_xml}) {
563            !!!parse-error (type => 'pio',              !!!cp (22.1);
564                            line => $self->{line_prev},              $self->{state} = PI_STATE;
565                            column => $self->{column_prev});              !!!next-input-character;
566            $self->{state} = BOGUS_COMMENT_STATE;              redo A;
567            $self->{ct} = {type => COMMENT_TOKEN, data => '',            } else {
568                                      line => $self->{line_prev},              !!!cp (22);
569                                      column => $self->{column_prev},              !!!parse-error (type => 'pio',
570                                     };                              line => $self->{line_prev},
571            ## $self->{nc} is intentionally left as is                              column => $self->{column_prev});
572            redo A;              $self->{state} = BOGUS_COMMENT_STATE;
573          } else {              $self->{ct} = {type => COMMENT_TOKEN, data => '',
574                               line => $self->{line_prev},
575                               column => $self->{column_prev},
576                              };
577                ## $self->{nc} is intentionally left as is
578                redo A;
579              }
580            } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
581            !!!cp (23);            !!!cp (23);
582            !!!parse-error (type => 'bare stago',            !!!parse-error (type => 'bare stago',
583                            line => $self->{line_prev},                            line => $self->{line_prev},
# Line 525  sub _get_next_token ($) { Line 592  sub _get_next_token ($) {
592                     });                     });
593    
594            redo A;            redo A;
595            } else {
596              ## XML5: "<:" is a parse error.
597              !!!cp (23.1);
598              $self->{ct} = {type => START_TAG_TOKEN,
599                                        tag_name => chr ($self->{nc}),
600                                        line => $self->{line_prev},
601                                        column => $self->{column_prev}};
602              $self->{state} = TAG_NAME_STATE;
603              !!!next-input-character;
604              redo A;
605          }          }
606        } else {        } else {
607          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
# Line 533  sub _get_next_token ($) { Line 610  sub _get_next_token ($) {
610        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
611        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
612    
613          ## XML5: "end tag state".
614    
615        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
616        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
617          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
618            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
619            $self->{s_kwd} = '';            $self->{kwd} = '';
620            ## Reconsume.            ## Reconsume.
621            redo A;            redo A;
622          } else {          } else {
# Line 574  sub _get_next_token ($) { Line 653  sub _get_next_token ($) {
653          !!!next-input-character;          !!!next-input-character;
654          redo A;          redo A;
655        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (31);  
656          !!!parse-error (type => 'empty end tag',          !!!parse-error (type => 'empty end tag',
657                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
658                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
659          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
660          $self->{s_kwd} = '';          $self->{s_kwd} = '';
661          !!!next-input-character;          if ($self->{is_xml}) {
662              !!!cp (31);
663              ## XML5: No parse error.
664              
665              ## NOTE: This parser raises a parse error, since it supports
666              ## XML1, not XML5.
667    
668              ## NOTE: A short end tag token.
669              my $ct = {type => END_TAG_TOKEN,
670                        tag_name => '',
671                        line => $self->{line_prev},
672                        column => $self->{column_prev} - 1,
673                       };
674              !!!next-input-character;
675              !!!emit ($ct);
676            } else {
677              !!!cp (31.1);
678              !!!next-input-character;
679            }
680          redo A;          redo A;
681        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
682          !!!cp (32);          !!!cp (32);
# Line 594  sub _get_next_token ($) { Line 690  sub _get_next_token ($) {
690                   });                   });
691    
692          redo A;          redo A;
693        } else {        } elsif (not $self->{is_xml} or
694                   $is_space->{$self->{nc}}) {
695          !!!cp (33);          !!!cp (33);
696          !!!parse-error (type => 'bogus end tag');          !!!parse-error (type => 'bogus end tag',
697                            line => $self->{line_prev}, # "<" of "</"
698                            column => $self->{column_prev} - 1);
699          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
700          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
701                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 609  sub _get_next_token ($) { Line 708  sub _get_next_token ($) {
708          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
709          ## "bogus comment state" entry.          ## "bogus comment state" entry.
710          redo A;          redo A;
711          } else {
712            ## XML5: "</:" is a parse error.
713            !!!cp (30.1);
714            $self->{ct} = {type => END_TAG_TOKEN,
715                           tag_name => chr ($self->{nc}),
716                           line => $l, column => $c};
717            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
718            !!!next-input-character;
719            redo A;
720        }        }
721      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
722        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
723        if (length $ch) {        if (length $ch) {
724          my $CH = $ch;          my $CH = $ch;
725          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 619  sub _get_next_token ($) { Line 727  sub _get_next_token ($) {
727          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
728            !!!cp (24);            !!!cp (24);
729            ## Stay in the state.            ## Stay in the state.
730            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
731            !!!next-input-character;            !!!next-input-character;
732            redo A;            redo A;
733          } else {          } else {
# Line 628  sub _get_next_token ($) { Line 736  sub _get_next_token ($) {
736            $self->{s_kwd} = '';            $self->{s_kwd} = '';
737            ## Reconsume.            ## Reconsume.
738            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
739                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
740                      line => $self->{line_prev},                      line => $self->{line_prev},
741                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
742                     });                     });
743            redo A;            redo A;
744          }          }
# Line 646  sub _get_next_token ($) { Line 754  sub _get_next_token ($) {
754            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
755            $self->{s_kwd} = '';            $self->{s_kwd} = '';
756            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
757                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
758                      line => $self->{line_prev},                      line => $self->{line_prev},
759                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
760                     });                     });
761            redo A;            redo A;
762          } else {          } else {
# Line 657  sub _get_next_token ($) { Line 765  sub _get_next_token ($) {
765                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
766                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
767                   line => $self->{line_prev},                   line => $self->{line_prev},
768                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
769            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
770            ## Reconsume.            ## Reconsume.
771            redo A;            redo A;
# Line 739  sub _get_next_token ($) { Line 847  sub _get_next_token ($) {
847          redo A;          redo A;
848        }        }
849      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
850          ## XML5: "Tag attribute name before state".
851    
852        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
853          !!!cp (45);          !!!cp (45);
854          ## Stay in the state          ## Stay in the state
# Line 811  sub _get_next_token ($) { Line 921  sub _get_next_token ($) {
921               0x003D => 1, # =               0x003D => 1, # =
922              }->{$self->{nc}}) {              }->{$self->{nc}}) {
923            !!!cp (55);            !!!cp (55);
924              ## XML5: Not a parse error.
925            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
926          } else {          } else {
927            !!!cp (56);            !!!cp (56);
928              ## XML5: ":" raises a parse error and is ignored.
929          }          }
930          $self->{ca}          $self->{ca}
931              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 824  sub _get_next_token ($) { Line 936  sub _get_next_token ($) {
936          redo A;          redo A;
937        }        }
938      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
939          ## XML5: "Tag attribute name state".
940    
941        my $before_leave = sub {        my $before_leave = sub {
942          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
943              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 834  sub _get_next_token ($) { Line 948  sub _get_next_token ($) {
948            !!!cp (58);            !!!cp (58);
949            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
950              = $self->{ca};              = $self->{ca};
951              $self->{ca}->{index} = ++$self->{ct}->{last_index};
952          }          }
953        }; # $before_leave        }; # $before_leave
954    
# Line 850  sub _get_next_token ($) { Line 965  sub _get_next_token ($) {
965          !!!next-input-character;          !!!next-input-character;
966          redo A;          redo A;
967        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
968            if ($self->{is_xml}) {
969              !!!cp (60.1);
970              ## XML5: Not a parse error.
971              !!!parse-error (type => 'no attr value'); ## TODO: type
972            } else {
973              !!!cp (60.2);
974            }
975    
976          $before_leave->();          $before_leave->();
977          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
978            !!!cp (61);            !!!cp (61);
# Line 879  sub _get_next_token ($) { Line 1002  sub _get_next_token ($) {
1002          !!!next-input-character;          !!!next-input-character;
1003          redo A;          redo A;
1004        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1005          !!!cp (64);          if ($self->{is_xml}) {
1006              !!!cp (64);
1007              ## XML5: Not a parse error.
1008              !!!parse-error (type => 'no attr value'); ## TODO: type
1009            } else {
1010              !!!cp (64.1);
1011            }
1012            
1013          $before_leave->();          $before_leave->();
1014          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1015          !!!next-input-character;          !!!next-input-character;
# Line 913  sub _get_next_token ($) { Line 1043  sub _get_next_token ($) {
1043          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1044              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1045            !!!cp (69);            !!!cp (69);
1046              ## XML5: Not a parse error.
1047            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
1048          } else {          } else {
1049            !!!cp (70);            !!!cp (70);
# Line 923  sub _get_next_token ($) { Line 1054  sub _get_next_token ($) {
1054          redo A;          redo A;
1055        }        }
1056      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1057          ## XML5: "Tag attribute name after state".
1058          
1059        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1060          !!!cp (71);          !!!cp (71);
1061          ## Stay in the state          ## Stay in the state
# Line 934  sub _get_next_token ($) { Line 1067  sub _get_next_token ($) {
1067          !!!next-input-character;          !!!next-input-character;
1068          redo A;          redo A;
1069        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1070            if ($self->{is_xml}) {
1071              !!!cp (72.1);
1072              ## XML5: Not a parse error.
1073              !!!parse-error (type => 'no attr value'); ## TODO: type
1074            } else {
1075              !!!cp (72.2);
1076            }
1077    
1078          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1079            !!!cp (73);            !!!cp (73);
1080            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 967  sub _get_next_token ($) { Line 1108  sub _get_next_token ($) {
1108          !!!next-input-character;          !!!next-input-character;
1109          redo A;          redo A;
1110        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1111          !!!cp (77);          if ($self->{is_xml}) {
1112              !!!cp (77);
1113              ## XML5: Not a parse error.
1114              !!!parse-error (type => 'no attr value'); ## TODO: type
1115            } else {
1116              !!!cp (77.1);
1117            }
1118            
1119          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1120          !!!next-input-character;          !!!next-input-character;
1121          redo A;          redo A;
# Line 996  sub _get_next_token ($) { Line 1144  sub _get_next_token ($) {
1144    
1145          redo A;          redo A;
1146        } else {        } else {
1147            if ($self->{is_xml}) {
1148              !!!cp (78.1);
1149              ## XML5: Not a parse error.
1150              !!!parse-error (type => 'no attr value'); ## TODO: type
1151            } else {
1152              !!!cp (78.2);
1153            }
1154    
1155          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1156              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1157            !!!cp (78);            !!!cp (78);
1158              ## XML5: Not a parse error.
1159            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
1160          } else {          } else {
1161            !!!cp (82);            !!!cp (82);
# Line 1012  sub _get_next_token ($) { Line 1169  sub _get_next_token ($) {
1169          redo A;                  redo A;        
1170        }        }
1171      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1172          ## XML5: "Tag attribute value before state".
1173    
1174        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1175          !!!cp (83);          !!!cp (83);
1176          ## Stay in the state          ## Stay in the state
# Line 1083  sub _get_next_token ($) { Line 1242  sub _get_next_token ($) {
1242        } else {        } else {
1243          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D) { # =
1244            !!!cp (93);            !!!cp (93);
1245              ## XML5: Not a parse error.
1246            !!!parse-error (type => 'bad attribute value');            !!!parse-error (type => 'bad attribute value');
1247            } elsif ($self->{is_xml}) {
1248              !!!cp (93.1);
1249              ## XML5: No parse error.
1250              !!!parse-error (type => 'unquoted attr value'); ## TODO
1251          } else {          } else {
1252            !!!cp (94);            !!!cp (94);
1253          }          }
# Line 1093  sub _get_next_token ($) { Line 1257  sub _get_next_token ($) {
1257          redo A;          redo A;
1258        }        }
1259      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1260          ## XML5: "Tag attribute value double quoted state".
1261          
1262        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1263          !!!cp (95);          !!!cp (95);
1264            ## XML5: "Tag attribute name before state".
1265          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1266          !!!next-input-character;          !!!next-input-character;
1267          redo A;          redo A;
1268        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1269          !!!cp (96);          !!!cp (96);
1270            ## XML5: Not defined yet.
1271    
1272          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1273          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1274          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1134  sub _get_next_token ($) { Line 1303  sub _get_next_token ($) {
1303    
1304          redo A;          redo A;
1305        } else {        } else {
1306          !!!cp (100);          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1307              !!!cp (100);
1308              ## XML5: Not a parse error.
1309              !!!parse-error (type => 'lt in attr value'); ## TODO: type
1310            } else {
1311              !!!cp (100.1);
1312            }
1313          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1314          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1315                                q["&],                                q["&<],
1316                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1317    
1318          ## Stay in the state          ## Stay in the state
# Line 1145  sub _get_next_token ($) { Line 1320  sub _get_next_token ($) {
1320          redo A;          redo A;
1321        }        }
1322      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1323          ## XML5: "Tag attribute value single quoted state".
1324    
1325        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1326          !!!cp (101);          !!!cp (101);
1327            ## XML5: "Before attribute name state" (sic).
1328          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1329          !!!next-input-character;          !!!next-input-character;
1330          redo A;          redo A;
1331        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1332          !!!cp (102);          !!!cp (102);
1333            ## XML5: Not defined yet.
1334    
1335          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1336          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1337          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1186  sub _get_next_token ($) { Line 1366  sub _get_next_token ($) {
1366    
1367          redo A;          redo A;
1368        } else {        } else {
1369          !!!cp (106);          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1370              !!!cp (106);
1371              ## XML5: Not a parse error.
1372              !!!parse-error (type => 'lt in attr value'); ## TODO: type
1373            } else {
1374              !!!cp (106.1);
1375            }
1376          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1377          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1378                                q['&],                                q['&<],
1379                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1380    
1381          ## Stay in the state          ## Stay in the state
# Line 1197  sub _get_next_token ($) { Line 1383  sub _get_next_token ($) {
1383          redo A;          redo A;
1384        }        }
1385      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1386          ## XML5: "Tag attribute value unquoted state".
1387    
1388        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1389          !!!cp (107);          !!!cp (107);
1390            ## XML5: "Tag attribute name before state".
1391          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1392          !!!next-input-character;          !!!next-input-character;
1393          redo A;          redo A;
1394        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1395          !!!cp (108);          !!!cp (108);
1396    
1397            ## XML5: Not defined yet.
1398    
1399          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1400          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1401          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1267  sub _get_next_token ($) { Line 1459  sub _get_next_token ($) {
1459               0x003D => 1, # =               0x003D => 1, # =
1460              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1461            !!!cp (115);            !!!cp (115);
1462              ## XML5: Not a parse error.
1463            !!!parse-error (type => 'bad attribute value');            !!!parse-error (type => 'bad attribute value');
1464          } else {          } else {
1465            !!!cp (116);            !!!cp (116);
# Line 1343  sub _get_next_token ($) { Line 1536  sub _get_next_token ($) {
1536          redo A;          redo A;
1537        }        }
1538      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1539          ## XML5: "Empty tag state".
1540    
1541        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1542          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
1543            !!!cp ('124.2');            !!!cp ('124.2');
# Line 1384  sub _get_next_token ($) { Line 1579  sub _get_next_token ($) {
1579          } else {          } else {
1580            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1581          }          }
1582            ## XML5: "Tag attribute name before state".
1583          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1584          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1585          ## Reconsume.          ## Reconsume.
# Line 1398  sub _get_next_token ($) { Line 1594  sub _get_next_token ($) {
1594          redo A;          redo A;
1595        }        }
1596      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1597        ## (only happen if PCDATA state)        ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
1598    
1599        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
1600        ## consumes characters on a one-by-one basis.        ## consumes characters on a one-by-one basis.
1601                
1602        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1603          !!!cp (124);          if ($self->{in_subset}) {
1604          $self->{state} = DATA_STATE;            !!!cp (123);
1605          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1606            } else {
1607              !!!cp (124);
1608              $self->{state} = DATA_STATE;
1609              $self->{s_kwd} = '';
1610            }
1611          !!!next-input-character;          !!!next-input-character;
1612    
1613          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1614          redo A;          redo A;
1615        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1616          !!!cp (125);          if ($self->{in_subset}) {
1617          $self->{state} = DATA_STATE;            !!!cp (125.1);
1618          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1619            } else {
1620              !!!cp (125);
1621              $self->{state} = DATA_STATE;
1622              $self->{s_kwd} = '';
1623            }
1624          ## reconsume          ## reconsume
1625    
1626          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1431  sub _get_next_token ($) { Line 1637  sub _get_next_token ($) {
1637          redo A;          redo A;
1638        }        }
1639      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1640        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state".
1641                
1642        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1643          !!!cp (133);          !!!cp (133);
# Line 1443  sub _get_next_token ($) { Line 1649  sub _get_next_token ($) {
1649          ## ASCII case-insensitive.          ## ASCII case-insensitive.
1650          !!!cp (130);          !!!cp (130);
1651          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
1652          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
1653          !!!next-input-character;          !!!next-input-character;
1654          redo A;          redo A;
1655        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
# Line 1452  sub _get_next_token ($) { Line 1658  sub _get_next_token ($) {
1658                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1659          !!!cp (135.4);                          !!!cp (135.4);                
1660          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
1661          $self->{s_kwd} = '[';          $self->{kwd} = '[';
1662          !!!next-input-character;          !!!next-input-character;
1663          redo A;          redo A;
1664        } else {        } else {
# Line 1476  sub _get_next_token ($) { Line 1682  sub _get_next_token ($) {
1682                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1683                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
1684                                   };                                   };
1685          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1686          !!!next-input-character;          !!!next-input-character;
1687          redo A;          redo A;
1688        } else {        } else {
# Line 1502  sub _get_next_token ($) { Line 1708  sub _get_next_token ($) {
1708              0x0054, # T              0x0054, # T
1709              0x0059, # Y              0x0059, # Y
1710              0x0050, # P              0x0050, # P
1711            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
1712            $self->{nc} == [            $self->{nc} == [
1713              undef,              undef,
1714              0x006F, # o              0x006F, # o
# Line 1510  sub _get_next_token ($) { Line 1716  sub _get_next_token ($) {
1716              0x0074, # t              0x0074, # t
1717              0x0079, # y              0x0079, # y
1718              0x0070, # p              0x0070, # p
1719            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
1720          !!!cp (131);          !!!cp (131);
1721          ## Stay in the state.          ## Stay in the state.
1722          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
1723          !!!next-input-character;          !!!next-input-character;
1724          redo A;          redo A;
1725        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
1726                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
1727                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
1728          !!!cp (129);          if ($self->{is_xml} and
1729                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1730              !!!cp (129);
1731              ## XML5: case-sensitive.
1732              !!!parse-error (type => 'lowercase keyword', ## TODO
1733                              text => 'DOCTYPE',
1734                              line => $self->{line_prev},
1735                              column => $self->{column_prev} - 5);
1736            } else {
1737              !!!cp (129.1);
1738            }
1739          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
1740          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
1741                                    quirks => 1,                                    quirks => 1,
# Line 1532  sub _get_next_token ($) { Line 1748  sub _get_next_token ($) {
1748          !!!cp (132);                  !!!cp (132);        
1749          !!!parse-error (type => 'bogus comment',          !!!parse-error (type => 'bogus comment',
1750                          line => $self->{line_prev},                          line => $self->{line_prev},
1751                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
1752          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
1753          ## Reconsume.          ## Reconsume.
1754          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
1755                                    data => $self->{s_kwd},                                    data => $self->{kwd},
1756                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1757                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
1758                                   };                                   };
1759          redo A;          redo A;
1760        }        }
# Line 1549  sub _get_next_token ($) { Line 1765  sub _get_next_token ($) {
1765              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
1766              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
1767              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
1768            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
1769          !!!cp (135.1);          !!!cp (135.1);
1770          ## Stay in the state.          ## Stay in the state.
1771          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
1772          !!!next-input-character;          !!!next-input-character;
1773          redo A;          redo A;
1774        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
1775                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
         !!!cp (135.2);  
   
1776          if ($self->{is_xml} and          if ($self->{is_xml} and
1777              not $self->{tainted} and              not $self->{tainted} and
1778              @{$self->{open_elements} or []} == 0) {              @{$self->{open_elements} or []} == 0) {
1779              !!!cp (135.2);
1780            !!!parse-error (type => 'cdata outside of root element',            !!!parse-error (type => 'cdata outside of root element',
1781                            line => $self->{line_prev},                            line => $self->{line_prev},
1782                            column => $self->{column_prev} - 7);                            column => $self->{column_prev} - 7);
1783            $self->{tainted} = 1;            $self->{tainted} = 1;
1784            } else {
1785              !!!cp (135.21);
1786          }          }
1787    
1788          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
# Line 1579  sub _get_next_token ($) { Line 1796  sub _get_next_token ($) {
1796          !!!cp (135.3);          !!!cp (135.3);
1797          !!!parse-error (type => 'bogus comment',          !!!parse-error (type => 'bogus comment',
1798                          line => $self->{line_prev},                          line => $self->{line_prev},
1799                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
1800          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
1801          ## Reconsume.          ## Reconsume.
1802          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
1803                                    data => $self->{s_kwd},                                    data => $self->{kwd},
1804                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1805                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
1806                                   };                                   };
1807          redo A;          redo A;
1808        }        }
# Line 1596  sub _get_next_token ($) { Line 1813  sub _get_next_token ($) {
1813          !!!next-input-character;          !!!next-input-character;
1814          redo A;          redo A;
1815        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (138);  
1816          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1817          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1818          $self->{s_kwd} = '';            !!!cp (138.1);
1819              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1820            } else {
1821              !!!cp (138);
1822              $self->{state} = DATA_STATE;
1823              $self->{s_kwd} = '';
1824            }
1825          !!!next-input-character;          !!!next-input-character;
1826    
1827          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1828    
1829          redo A;          redo A;
1830        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (139);  
1831          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1832          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1833          $self->{s_kwd} = '';            !!!cp (139.1);
1834              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1835            } else {
1836              !!!cp (139);
1837              $self->{state} = DATA_STATE;
1838              $self->{s_kwd} = '';
1839            }
1840          ## reconsume          ## reconsume
1841    
1842          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1630  sub _get_next_token ($) { Line 1857  sub _get_next_token ($) {
1857          !!!next-input-character;          !!!next-input-character;
1858          redo A;          redo A;
1859        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (142);  
1860          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1861          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1862          $self->{s_kwd} = '';            !!!cp (142.1);
1863              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1864            } else {
1865              !!!cp (142);
1866              $self->{state} = DATA_STATE;
1867              $self->{s_kwd} = '';
1868            }
1869          !!!next-input-character;          !!!next-input-character;
1870    
1871          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
1872    
1873          redo A;          redo A;
1874        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (143);  
1875          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1876          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1877          $self->{s_kwd} = '';            !!!cp (143.1);
1878              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1879            } else {
1880              !!!cp (143);
1881              $self->{state} = DATA_STATE;
1882              $self->{s_kwd} = '';
1883            }
1884          ## reconsume          ## reconsume
1885    
1886          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1658  sub _get_next_token ($) { Line 1895  sub _get_next_token ($) {
1895          redo A;          redo A;
1896        }        }
1897      } elsif ($self->{state} == COMMENT_STATE) {      } elsif ($self->{state} == COMMENT_STATE) {
1898          ## XML5: "Comment state" and "DOCTYPE comment state".
1899    
1900        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1901          !!!cp (145);          !!!cp (145);
1902          $self->{state} = COMMENT_END_DASH_STATE;          $self->{state} = COMMENT_END_DASH_STATE;
1903          !!!next-input-character;          !!!next-input-character;
1904          redo A;          redo A;
1905        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (146);  
1906          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1907          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1908          $self->{s_kwd} = '';            !!!cp (146.1);
1909              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1910            } else {
1911              !!!cp (146);
1912              $self->{state} = DATA_STATE;
1913              $self->{s_kwd} = '';
1914            }
1915          ## reconsume          ## reconsume
1916    
1917          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1685  sub _get_next_token ($) { Line 1929  sub _get_next_token ($) {
1929          redo A;          redo A;
1930        }        }
1931      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1932          ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
1933    
1934        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1935          !!!cp (148);          !!!cp (148);
1936          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
1937          !!!next-input-character;          !!!next-input-character;
1938          redo A;          redo A;
1939        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (149);  
1940          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1941          $self->{s_kwd} = '';          if ($self->{in_subset}) {
1942          $self->{state} = DATA_STATE;            !!!cp (149.1);
1943          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1944            } else {
1945              !!!cp (149);
1946              $self->{state} = DATA_STATE;
1947              $self->{s_kwd} = '';
1948            }
1949          ## reconsume          ## reconsume
1950    
1951          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1709  sub _get_next_token ($) { Line 1959  sub _get_next_token ($) {
1959          redo A;          redo A;
1960        }        }
1961      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE) {
1962          ## XML5: "Comment end state" and "DOCTYPE comment end state".
1963    
1964        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1965          !!!cp (151);          if ($self->{in_subset}) {
1966          $self->{state} = DATA_STATE;            !!!cp (151.1);
1967          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1968            } else {
1969              !!!cp (151);
1970              $self->{state} = DATA_STATE;
1971              $self->{s_kwd} = '';
1972            }
1973          !!!next-input-character;          !!!next-input-character;
1974    
1975          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1720  sub _get_next_token ($) { Line 1977  sub _get_next_token ($) {
1977          redo A;          redo A;
1978        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
1979          !!!cp (152);          !!!cp (152);
1980            ## XML5: Not a parse error.
1981          !!!parse-error (type => 'dash in comment',          !!!parse-error (type => 'dash in comment',
1982                          line => $self->{line_prev},                          line => $self->{line_prev},
1983                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 1728  sub _get_next_token ($) { Line 1986  sub _get_next_token ($) {
1986          !!!next-input-character;          !!!next-input-character;
1987          redo A;          redo A;
1988        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         !!!cp (153);  
1989          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1990          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
1991          $self->{s_kwd} = '';            !!!cp (153.1);
1992              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1993            } else {
1994              !!!cp (153);
1995              $self->{state} = DATA_STATE;
1996              $self->{s_kwd} = '';
1997            }
1998          ## reconsume          ## reconsume
1999    
2000          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1739  sub _get_next_token ($) { Line 2002  sub _get_next_token ($) {
2002          redo A;          redo A;
2003        } else {        } else {
2004          !!!cp (154);          !!!cp (154);
2005            ## XML5: Not a parse error.
2006          !!!parse-error (type => 'dash in comment',          !!!parse-error (type => 'dash in comment',
2007                          line => $self->{line_prev},                          line => $self->{line_prev},
2008                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 1755  sub _get_next_token ($) { Line 2019  sub _get_next_token ($) {
2019          redo A;          redo A;
2020        } else {        } else {
2021          !!!cp (156);          !!!cp (156);
2022            ## XML5: Unless EOF, switch to the bogus comment state.
2023          !!!parse-error (type => 'no space before DOCTYPE name');          !!!parse-error (type => 'no space before DOCTYPE name');
2024          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2025          ## reconsume          ## reconsume
2026          redo A;          redo A;
2027        }        }
2028      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2029          ## XML5: "DOCTYPE root name before state".
2030    
2031        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2032          !!!cp (157);          !!!cp (157);
2033          ## Stay in the state          ## Stay in the state
# Line 1768  sub _get_next_token ($) { Line 2035  sub _get_next_token ($) {
2035          redo A;          redo A;
2036        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2037          !!!cp (158);          !!!cp (158);
2038            ## XML5: No parse error.
2039          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
2040          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2041          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 1786  sub _get_next_token ($) { Line 2054  sub _get_next_token ($) {
2054          !!!emit ($self->{ct}); # DOCTYPE (quirks)          !!!emit ($self->{ct}); # DOCTYPE (quirks)
2055    
2056          redo A;          redo A;
2057          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2058            !!!cp (159.1);
2059            !!!parse-error (type => 'no DOCTYPE name');
2060            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2061            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2062            $self->{in_subset} = 1;
2063            !!!next-input-character;
2064            !!!emit ($self->{ct}); # DOCTYPE
2065            redo A;
2066        } else {        } else {
2067          !!!cp (160);          !!!cp (160);
2068          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 1795  sub _get_next_token ($) { Line 2072  sub _get_next_token ($) {
2072          redo A;          redo A;
2073        }        }
2074      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2075  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
2076    
2077          ## ISSUE: Redundant "First," in the spec.
2078    
2079        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2080          !!!cp (161);          !!!cp (161);
2081          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 1821  sub _get_next_token ($) { Line 2101  sub _get_next_token ($) {
2101          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2102    
2103          redo A;          redo A;
2104          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2105            !!!cp (163.1);
2106            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2107            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2108            $self->{in_subset} = 1;
2109            !!!next-input-character;
2110            !!!emit ($self->{ct}); # DOCTYPE
2111            redo A;
2112        } else {        } else {
2113          !!!cp (164);          !!!cp (164);
2114          $self->{ct}->{name}          $self->{ct}->{name}
# Line 1830  sub _get_next_token ($) { Line 2118  sub _get_next_token ($) {
2118          redo A;          redo A;
2119        }        }
2120      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2121          ## XML5: Corresponding to XML5's "DOCTYPE root name after
2122          ## state", but implemented differently.
2123    
2124        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2125          !!!cp (165);          !!!cp (165);
2126          ## Stay in the state          ## Stay in the state
# Line 1857  sub _get_next_token ($) { Line 2148  sub _get_next_token ($) {
2148          redo A;          redo A;
2149        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
2150                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
2151            !!!cp (167.1);
2152          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
2153          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2154          !!!next-input-character;          !!!next-input-character;
2155          redo A;          redo A;
2156        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
2157                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
2158            !!!cp (167.2);
2159          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
2160          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2161          !!!next-input-character;          !!!next-input-character;
2162          redo A;          redo A;
2163          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2164            !!!cp (167.3);
2165            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2166            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2167            $self->{in_subset} = 1;
2168            !!!next-input-character;
2169            !!!emit ($self->{ct}); # DOCTYPE
2170            redo A;
2171        } else {        } else {
2172          !!!cp (180);          !!!cp (180);
2173          !!!parse-error (type => 'string after DOCTYPE name');          !!!parse-error (type => 'string after DOCTYPE name');
# Line 1884  sub _get_next_token ($) { Line 2185  sub _get_next_token ($) {
2185              0x0042, # B              0x0042, # B
2186              0x004C, # L              0x004C, # L
2187              0x0049, # I              0x0049, # I
2188            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2189            $self->{nc} == [            $self->{nc} == [
2190              undef,              undef,
2191              0x0075, # u              0x0075, # u
2192              0x0062, # b              0x0062, # b
2193              0x006C, # l              0x006C, # l
2194              0x0069, # i              0x0069, # i
2195            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2196          !!!cp (175);          !!!cp (175);
2197          ## Stay in the state.          ## Stay in the state.
2198          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2199          !!!next-input-character;          !!!next-input-character;
2200          redo A;          redo A;
2201        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
2202                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
2203                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
2204          !!!cp (168);          if ($self->{is_xml} and
2205                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2206              !!!cp (168.1);
2207              !!!parse-error (type => 'lowercase keyword', ## TODO: type
2208                              text => 'PUBLIC',
2209                              line => $self->{line_prev},
2210                              column => $self->{column_prev} - 4);
2211            } else {
2212              !!!cp (168);
2213            }
2214          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2215          !!!next-input-character;          !!!next-input-character;
2216          redo A;          redo A;
# Line 1908  sub _get_next_token ($) { Line 2218  sub _get_next_token ($) {
2218          !!!cp (169);          !!!cp (169);
2219          !!!parse-error (type => 'string after DOCTYPE name',          !!!parse-error (type => 'string after DOCTYPE name',
2220                          line => $self->{line_prev},                          line => $self->{line_prev},
2221                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2222          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2223    
2224          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
# Line 1923  sub _get_next_token ($) { Line 2233  sub _get_next_token ($) {
2233              0x0053, # S              0x0053, # S
2234              0x0054, # T              0x0054, # T
2235              0x0045, # E              0x0045, # E
2236            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2237            $self->{nc} == [            $self->{nc} == [
2238              undef,              undef,
2239              0x0079, # y              0x0079, # y
2240              0x0073, # s              0x0073, # s
2241              0x0074, # t              0x0074, # t
2242              0x0065, # e              0x0065, # e
2243            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2244          !!!cp (170);          !!!cp (170);
2245          ## Stay in the state.          ## Stay in the state.
2246          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2247          !!!next-input-character;          !!!next-input-character;
2248          redo A;          redo A;
2249        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
2250                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
2251                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
2252          !!!cp (171);          if ($self->{is_xml} and
2253                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2254              !!!cp (171.1);
2255              !!!parse-error (type => 'lowercase keyword', ## TODO: type
2256                              text => 'SYSTEM',
2257                              line => $self->{line_prev},
2258                              column => $self->{column_prev} - 4);
2259            } else {
2260              !!!cp (171);
2261            }
2262          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2263          !!!next-input-character;          !!!next-input-character;
2264          redo A;          redo A;
# Line 1947  sub _get_next_token ($) { Line 2266  sub _get_next_token ($) {
2266          !!!cp (172);          !!!cp (172);
2267          !!!parse-error (type => 'string after DOCTYPE name',          !!!parse-error (type => 'string after DOCTYPE name',
2268                          line => $self->{line_prev},                          line => $self->{line_prev},
2269                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2270          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2271    
2272          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
# Line 1996  sub _get_next_token ($) { Line 2315  sub _get_next_token ($) {
2315          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2316    
2317          redo A;          redo A;
2318          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2319            !!!cp (186.1);
2320            !!!parse-error (type => 'no PUBLIC literal');
2321            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2322            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2323            $self->{in_subset} = 1;
2324            !!!next-input-character;
2325            !!!emit ($self->{ct}); # DOCTYPE
2326            redo A;
2327        } else {        } else {
2328          !!!cp (186);          !!!cp (186);
2329          !!!parse-error (type => 'string after PUBLIC');          !!!parse-error (type => 'string after PUBLIC');
# Line 2106  sub _get_next_token ($) { Line 2434  sub _get_next_token ($) {
2434          !!!next-input-character;          !!!next-input-character;
2435          redo A;          redo A;
2436        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2437          !!!cp (198);          if ($self->{is_xml}) {
2438              !!!cp (198.1);
2439              !!!parse-error (type => 'no SYSTEM literal');
2440            } else {
2441              !!!cp (198);
2442            }
2443          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2444          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2445          !!!next-input-character;          !!!next-input-character;
# Line 2126  sub _get_next_token ($) { Line 2459  sub _get_next_token ($) {
2459          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2460    
2461          redo A;          redo A;
2462          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2463            !!!cp (200.1);
2464            !!!parse-error (type => 'no SYSTEM literal');
2465            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2466            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2467            $self->{in_subset} = 1;
2468            !!!next-input-character;
2469            !!!emit ($self->{ct}); # DOCTYPE
2470            redo A;
2471        } else {        } else {
2472          !!!cp (200);          !!!cp (200);
2473          !!!parse-error (type => 'string after PUBLIC literal');          !!!parse-error (type => 'string after PUBLIC literal');
# Line 2176  sub _get_next_token ($) { Line 2518  sub _get_next_token ($) {
2518          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2519    
2520          redo A;          redo A;
2521          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2522            !!!cp (206.1);
2523            !!!parse-error (type => 'no SYSTEM literal');
2524    
2525            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2526            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2527            $self->{in_subset} = 1;
2528            !!!next-input-character;
2529            !!!emit ($self->{ct}); # DOCTYPE
2530            redo A;
2531        } else {        } else {
2532          !!!cp (206);          !!!cp (206);
2533          !!!parse-error (type => 'string after SYSTEM');          !!!parse-error (type => 'string after SYSTEM');
# Line 2191  sub _get_next_token ($) { Line 2543  sub _get_next_token ($) {
2543          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2544          !!!next-input-character;          !!!next-input-character;
2545          redo A;          redo A;
2546        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2547          !!!cp (208);          !!!cp (208);
2548          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2549    
# Line 2232  sub _get_next_token ($) { Line 2584  sub _get_next_token ($) {
2584          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2585          !!!next-input-character;          !!!next-input-character;
2586          redo A;          redo A;
2587        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2588          !!!cp (212);          !!!cp (212);
2589          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2590    
# Line 2293  sub _get_next_token ($) { Line 2645  sub _get_next_token ($) {
2645          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2646    
2647          redo A;          redo A;
2648          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2649            !!!cp (218.1);
2650            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2651            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2652            $self->{in_subset} = 1;
2653            !!!next-input-character;
2654            !!!emit ($self->{ct}); # DOCTYPE
2655            redo A;
2656        } else {        } else {
2657          !!!cp (218);          !!!cp (218);
2658          !!!parse-error (type => 'string after SYSTEM literal');          !!!parse-error (type => 'string after SYSTEM literal');
# Line 2312  sub _get_next_token ($) { Line 2672  sub _get_next_token ($) {
2672          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2673    
2674          redo A;          redo A;
2675          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2676            !!!cp (220.1);
2677            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2678            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2679            $self->{in_subset} = 1;
2680            !!!next-input-character;
2681            !!!emit ($self->{ct}); # DOCTYPE
2682            redo A;
2683        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2684          !!!cp (220);          !!!cp (220);
2685          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
# Line 2324  sub _get_next_token ($) { Line 2692  sub _get_next_token ($) {
2692        } else {        } else {
2693          !!!cp (221);          !!!cp (221);
2694          my $s = '';          my $s = '';
2695          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
2696    
2697          ## Stay in the state          ## Stay in the state
2698          !!!next-input-character;          !!!next-input-character;
# Line 2334  sub _get_next_token ($) { Line 2702  sub _get_next_token ($) {
2702        ## NOTE: "CDATA section state" in the state is jointly implemented        ## NOTE: "CDATA section state" in the state is jointly implemented
2703        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2704        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
2705    
2706          ## XML5: "CDATA state".
2707                
2708        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
2709          !!!cp (221.1);          !!!cp (221.1);
# Line 2342  sub _get_next_token ($) { Line 2712  sub _get_next_token ($) {
2712          redo A;          redo A;
2713        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2714          if ($self->{is_xml}) {          if ($self->{is_xml}) {
2715              !!!cp (221.11);
2716            !!!parse-error (type => 'no mse'); ## TODO: type            !!!parse-error (type => 'no mse'); ## TODO: type
2717            } else {
2718              !!!cp (221.12);
2719          }          }
2720    
2721          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2722          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2723          !!!next-input-character;          ## Reconsume.
2724          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
2725            !!!cp (221.2);            !!!cp (221.2);
2726            !!!emit ($self->{ct}); # character            !!!emit ($self->{ct}); # character
# Line 2370  sub _get_next_token ($) { Line 2743  sub _get_next_token ($) {
2743    
2744        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
2745      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2746          ## XML5: "CDATA bracket state".
2747    
2748        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
2749          !!!cp (221.5);          !!!cp (221.5);
2750          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 2377  sub _get_next_token ($) { Line 2752  sub _get_next_token ($) {
2752          redo A;          redo A;
2753        } else {        } else {
2754          !!!cp (221.6);          !!!cp (221.6);
2755            ## XML5: If EOF, "]" is not appended and changed to the data state.
2756          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
2757          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
2758          ## Reconsume.          ## Reconsume.
2759          redo A;          redo A;
2760        }        }
2761      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2762          ## XML5: "CDATA end state".
2763    
2764        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2765          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2766          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 2405  sub _get_next_token ($) { Line 2783  sub _get_next_token ($) {
2783          !!!cp (221.11);          !!!cp (221.11);
2784          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
2785          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
2786          ## Reconsume.          ## Reconsume. ## XML5: Emit.
2787          redo A;          redo A;
2788        }        }
2789      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {
# Line 2422  sub _get_next_token ($) { Line 2800  sub _get_next_token ($) {
2800        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
2801          !!!cp (999);          !!!cp (999);
2802          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
2803          $self->{s_kwd} = '#';          $self->{kwd} = '#';
2804          !!!next-input-character;          !!!next-input-character;
2805          redo A;          redo A;
2806        } elsif ((0x0041 <= $self->{nc} and        } elsif ((0x0041 <= $self->{nc} and
# Line 2432  sub _get_next_token ($) { Line 2810  sub _get_next_token ($) {
2810          !!!cp (998);          !!!cp (998);
2811          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
2812          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
2813          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2814          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
2815          $self->{entity__match} = 0;          $self->{entity__match} = 0;
2816          !!!next-input-character;          !!!next-input-character;
2817          redo A;          redo A;
# Line 2473  sub _get_next_token ($) { Line 2851  sub _get_next_token ($) {
2851            $self->{nc} == 0x0058) { # X            $self->{nc} == 0x0058) { # X
2852          !!!cp (995);          !!!cp (995);
2853          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
2854          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2855          !!!next-input-character;          !!!next-input-character;
2856          redo A;          redo A;
2857        } elsif (0x0030 <= $self->{nc} and        } elsif (0x0030 <= $self->{nc} and
2858                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
2859          !!!cp (994);          !!!cp (994);
2860          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
2861          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
2862          !!!next-input-character;          !!!next-input-character;
2863          redo A;          redo A;
2864        } else {        } else {
# Line 2516  sub _get_next_token ($) { Line 2894  sub _get_next_token ($) {
2894        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
2895            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
2896          !!!cp (1012);          !!!cp (1012);
2897          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
2898          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
2899                    
2900          ## Stay in the state.          ## Stay in the state.
2901          !!!next-input-character;          !!!next-input-character;
# Line 2533  sub _get_next_token ($) { Line 2911  sub _get_next_token ($) {
2911          #          #
2912        }        }
2913    
2914        my $code = $self->{s_kwd};        my $code = $self->{kwd};
2915        my $l = $self->{line_prev};        my $l = $self->{line_prev};
2916        my $c = $self->{column_prev};        my $c = $self->{column_prev};
2917        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 2556  sub _get_next_token ($) { Line 2934  sub _get_next_token ($) {
2934          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2935          ## Reconsume.          ## Reconsume.
2936          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2937                      has_reference => 1,
2938                    line => $l, column => $c,                    line => $l, column => $c,
2939                   });                   });
2940          redo A;          redo A;
# Line 2575  sub _get_next_token ($) { Line 2954  sub _get_next_token ($) {
2954          # 0..9, A..F, a..f          # 0..9, A..F, a..f
2955          !!!cp (990);          !!!cp (990);
2956          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
2957          $self->{s_kwd} = 0;          $self->{kwd} = 0;
2958          ## Reconsume.          ## Reconsume.
2959          redo A;          redo A;
2960        } else {        } else {
# Line 2593  sub _get_next_token ($) { Line 2972  sub _get_next_token ($) {
2972            $self->{s_kwd} = '';            $self->{s_kwd} = '';
2973            ## Reconsume.            ## Reconsume.
2974            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
2975                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
2976                      line => $self->{line_prev},                      line => $self->{line_prev},
2977                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
2978                     });                     });
2979            redo A;            redo A;
2980          } else {          } else {
2981            !!!cp (989);            !!!cp (989);
2982            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
2983            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2984            $self->{s_kwd} = '';            $self->{s_kwd} = '';
2985            ## Reconsume.            ## Reconsume.
# Line 2611  sub _get_next_token ($) { Line 2990  sub _get_next_token ($) {
2990        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2991          # 0..9          # 0..9
2992          !!!cp (1002);          !!!cp (1002);
2993          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
2994          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
2995          ## Stay in the state.          ## Stay in the state.
2996          !!!next-input-character;          !!!next-input-character;
2997          redo A;          redo A;
2998        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
2999                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
3000          !!!cp (1003);          !!!cp (1003);
3001          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
3002          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
3003          ## Stay in the state.          ## Stay in the state.
3004          !!!next-input-character;          !!!next-input-character;
3005          redo A;          redo A;
3006        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
3007                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
3008          !!!cp (1004);          !!!cp (1004);
3009          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
3010          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
3011          ## Stay in the state.          ## Stay in the state.
3012          !!!next-input-character;          !!!next-input-character;
3013          redo A;          redo A;
# Line 2645  sub _get_next_token ($) { Line 3024  sub _get_next_token ($) {
3024          #          #
3025        }        }
3026    
3027        my $code = $self->{s_kwd};        my $code = $self->{kwd};
3028        my $l = $self->{line_prev};        my $l = $self->{line_prev};
3029        my $c = $self->{column_prev};        my $c = $self->{column_prev};
3030        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 2668  sub _get_next_token ($) { Line 3047  sub _get_next_token ($) {
3047          $self->{s_kwd} = '';          $self->{s_kwd} = '';
3048          ## Reconsume.          ## Reconsume.
3049          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3050                      has_reference => 1,
3051                    line => $l, column => $c,                    line => $l, column => $c,
3052                   });                   });
3053          redo A;          redo A;
# Line 2681  sub _get_next_token ($) { Line 3061  sub _get_next_token ($) {
3061          redo A;          redo A;
3062        }        }
3063      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
3064        if (length $self->{s_kwd} < 30 and        if (length $self->{kwd} < 30 and
3065            ## NOTE: Some number greater than the maximum length of entity name            ## NOTE: Some number greater than the maximum length of entity name
3066            ((0x0041 <= $self->{nc} and # a            ((0x0041 <= $self->{nc} and # a
3067              $self->{nc} <= 0x005A) or # x              $self->{nc} <= 0x005A) or # x
# Line 2691  sub _get_next_token ($) { Line 3071  sub _get_next_token ($) {
3071              $self->{nc} <= 0x0039) or # 9              $self->{nc} <= 0x0039) or # 9
3072             $self->{nc} == 0x003B)) { # ;             $self->{nc} == 0x003B)) { # ;
3073          our $EntityChar;          our $EntityChar;
3074          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3075          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}}) {
3076            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
3077              !!!cp (1020);              !!!cp (1020);
3078              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
3079              $self->{entity__match} = 1;              $self->{entity__match} = 1;
3080              !!!next-input-character;              !!!next-input-character;
3081              #              #
3082            } else {            } else {
3083              !!!cp (1021);              !!!cp (1021);
3084              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
3085              $self->{entity__match} = -1;              $self->{entity__match} = -1;
3086              ## Stay in the state.              ## Stay in the state.
3087              !!!next-input-character;              !!!next-input-character;
# Line 2729  sub _get_next_token ($) { Line 3109  sub _get_next_token ($) {
3109          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
3110              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
3111            !!!cp (1024);            !!!cp (1024);
3112            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
3113            #            #
3114          } else {          } else {
3115            !!!cp (1025);            !!!cp (1025);
# Line 2741  sub _get_next_token ($) { Line 3121  sub _get_next_token ($) {
3121          !!!cp (1026);          !!!cp (1026);
3122          !!!parse-error (type => 'bare ero',          !!!parse-error (type => 'bare ero',
3123                          line => $self->{line_prev},                          line => $self->{line_prev},
3124                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
3125          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
3126          #          #
3127        }        }
3128        
# Line 2763  sub _get_next_token ($) { Line 3143  sub _get_next_token ($) {
3143          ## Reconsume.          ## Reconsume.
3144          !!!emit ({type => CHARACTER_TOKEN,          !!!emit ({type => CHARACTER_TOKEN,
3145                    data => $data,                    data => $data,
3146                      has_reference => $has_ref,
3147                    line => $self->{line_prev},                    line => $self->{line_prev},
3148                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
3149                   });                   });
3150          redo A;          redo A;
3151        } else {        } else {
# Line 2776  sub _get_next_token ($) { Line 3157  sub _get_next_token ($) {
3157          ## Reconsume.          ## Reconsume.
3158          redo A;          redo A;
3159        }        }
3160    
3161        ## XML-only states
3162    
3163        } elsif ($self->{state} == PI_STATE) {
3164          ## XML5: "Pi state" and "DOCTYPE pi state".
3165    
3166          if ($is_space->{$self->{nc}} or
3167              $self->{nc} == 0x003F or # ?
3168              $self->{nc} == -1) {
3169            ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
3170            ## pi state": Switch to the "DOCTYPE pi after state".  EOF:
3171            ## "DOCTYPE pi state": Parse error, switch to the "data
3172            ## state".
3173            !!!parse-error (type => 'bare pio', ## TODO: type
3174                            line => $self->{line_prev},
3175                            column => $self->{column_prev}
3176                                - 1 * ($self->{nc} != -1));
3177            $self->{state} = BOGUS_COMMENT_STATE;
3178            ## Reconsume.
3179            $self->{ct} = {type => COMMENT_TOKEN,
3180                           data => '?',
3181                           line => $self->{line_prev},
3182                           column => $self->{column_prev}
3183                               - 1 * ($self->{nc} != -1),
3184                          };
3185            redo A;
3186          } else {
3187            ## XML5: "DOCTYPE pi state": Stay in the state.
3188            $self->{ct} = {type => PI_TOKEN,
3189                           target => chr $self->{nc},
3190                           data => '',
3191                           line => $self->{line_prev},
3192                           column => $self->{column_prev} - 1,
3193                          };
3194            $self->{state} = PI_TARGET_STATE;
3195            !!!next-input-character;
3196            redo A;
3197          }
3198        } elsif ($self->{state} == PI_TARGET_STATE) {
3199          if ($is_space->{$self->{nc}}) {
3200            $self->{state} = PI_TARGET_AFTER_STATE;
3201            !!!next-input-character;
3202            redo A;
3203          } elsif ($self->{nc} == -1) {
3204            !!!parse-error (type => 'no pic'); ## TODO: type
3205            if ($self->{in_subset}) {
3206              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3207            } else {
3208              $self->{state} = DATA_STATE;
3209              $self->{s_kwd} = '';
3210            }
3211            ## Reconsume.
3212            !!!emit ($self->{ct}); # pi
3213            redo A;
3214          } elsif ($self->{nc} == 0x003F) { # ?
3215            $self->{state} = PI_AFTER_STATE;
3216            !!!next-input-character;
3217            redo A;
3218          } else {
3219            ## XML5: typo ("tag name" -> "target")
3220            $self->{ct}->{target} .= chr $self->{nc}; # pi
3221            !!!next-input-character;
3222            redo A;
3223          }
3224        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3225          if ($is_space->{$self->{nc}}) {
3226            ## Stay in the state.
3227            !!!next-input-character;
3228            redo A;
3229          } else {
3230            $self->{state} = PI_DATA_STATE;
3231            ## Reprocess.
3232            redo A;
3233          }
3234        } elsif ($self->{state} == PI_DATA_STATE) {
3235          if ($self->{nc} == 0x003F) { # ?
3236            $self->{state} = PI_DATA_AFTER_STATE;
3237            !!!next-input-character;
3238            redo A;
3239          } elsif ($self->{nc} == -1) {
3240            !!!parse-error (type => 'no pic'); ## TODO: type
3241            if ($self->{in_subset}) {
3242              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
3243            } else {
3244              $self->{state} = DATA_STATE;
3245              $self->{s_kwd} = '';
3246            }
3247            ## Reprocess.
3248            !!!emit ($self->{ct}); # pi
3249            redo A;
3250          } else {
3251            $self->{ct}->{data} .= chr $self->{nc}; # pi
3252            $self->{read_until}->($self->{ct}->{data}, q[?],
3253                                  length $self->{ct}->{data});
3254            ## Stay in the state.
3255            !!!next-input-character;
3256            ## NOTE: The next character has been consumed; continue with it.
3257            redo A;
3258          }
3259        } elsif ($self->{state} == PI_AFTER_STATE) {
3260          ## XML5: Part of "Pi after state".
3261    
3262          if ($self->{nc} == 0x003E) { # >
3263            if ($self->{in_subset}) {
3264              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3265            } else {
3266              $self->{state} = DATA_STATE;
3267              $self->{s_kwd} = '';
3268            }
3269            !!!next-input-character;
3270            !!!emit ($self->{ct}); # pi
3271            redo A;
3272          } elsif ($self->{nc} == 0x003F) { # ?
3273            !!!parse-error (type => 'no s after target', ## TODO: type
3274                            line => $self->{line_prev},
3275                            column => $self->{column_prev}); ## XML5: no error
3276            $self->{ct}->{data} .= '?';
3277            $self->{state} = PI_DATA_AFTER_STATE;
3278            !!!next-input-character;
3279            redo A;
3280          } else {
3281            !!!parse-error (type => 'no s after target', ## TODO: type
3282                            line => $self->{line_prev},
3283                            column => $self->{column_prev}
3284                                + 1 * ($self->{nc} == -1)); ## XML5: no error
3285            $self->{ct}->{data} .= '?'; ## XML5: not appended
3286            $self->{state} = PI_DATA_STATE;
3287            ## Reprocess.
3288            redo A;
3289          }
3290        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3291          ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
3292    
3293          if ($self->{nc} == 0x003E) { # >
3294            if ($self->{in_subset}) {
3295              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3296            } else {
3297              $self->{state} = DATA_STATE;
3298              $self->{s_kwd} = '';
3299            }
3300            !!!next-input-character;
3301            !!!emit ($self->{ct}); # pi
3302            redo A;
3303          } elsif ($self->{nc} == 0x003F) { # ?
3304            $self->{ct}->{data} .= '?';
3305            ## Stay in the state.
3306            !!!next-input-character;
3307            redo A;
3308          } else {
3309            $self->{ct}->{data} .= '?'; ## XML5: not appended
3310            $self->{state} = PI_DATA_STATE;
3311            ## Reprocess.
3312            redo A;
3313          }
3314    
3315        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3316          if ($self->{nc} == 0x003C) { # <
3317            $self->{state} = DOCTYPE_TAG_STATE;
3318            !!!next-input-character;
3319            redo A;
3320          } elsif ($self->{nc} == 0x0025) { # %
3321            ## XML5: Not defined yet.
3322    
3323            ## TODO:
3324            !!!next-input-character;
3325            redo A;
3326          } elsif ($self->{nc} == 0x005D) { # ]
3327            delete $self->{in_subset};
3328            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3329            !!!next-input-character;
3330            redo A;
3331          } elsif ($is_space->{$self->{nc}}) {
3332            ## Stay in the state.
3333            !!!next-input-character;
3334            redo A;
3335          } elsif ($self->{nc} == -1) {
3336            !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3337            delete $self->{in_subset};
3338            $self->{state} = DATA_STATE;
3339            $self->{s_kwd} = '';
3340            ## Reconsume.
3341            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3342            redo A;
3343          } else {
3344            unless ($self->{internal_subset_tainted}) {
3345              ## XML5: No parse error.
3346              !!!parse-error (type => 'string in internal subset');
3347              $self->{internal_subset_tainted} = 1;
3348            }
3349            ## Stay in the state.
3350            !!!next-input-character;
3351            redo A;
3352          }
3353        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3354          if ($self->{nc} == 0x003E) { # >
3355            $self->{state} = DATA_STATE;
3356            $self->{s_kwd} = '';
3357            !!!next-input-character;
3358            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3359            redo A;
3360          } elsif ($self->{nc} == -1) {
3361            !!!parse-error (type => 'unclosed DOCTYPE');
3362            $self->{state} = DATA_STATE;
3363            $self->{s_kwd} = '';
3364            ## Reconsume.
3365            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3366            redo A;
3367          } else {
3368            ## XML5: No parse error and stay in the state.
3369            !!!parse-error (type => 'string after internal subset'); ## TODO: type
3370    
3371            $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3372            !!!next-input-character;
3373            redo A;
3374          }
3375        } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3376          if ($self->{nc} == 0x003E) { # >
3377            $self->{state} = DATA_STATE;
3378            $self->{s_kwd} = '';
3379            !!!next-input-character;
3380            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3381            redo A;
3382          } elsif ($self->{nc} == -1) {
3383            $self->{state} = DATA_STATE;
3384            $self->{s_kwd} = '';
3385            ## Reconsume.
3386            !!!emit ({type => END_OF_DOCTYPE_TOKEN});
3387            redo A;
3388          } else {
3389            ## Stay in the state.
3390            !!!next-input-character;
3391            redo A;
3392          }
3393        } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
3394          if ($self->{nc} == 0x0021) { # !
3395            $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
3396            !!!next-input-character;
3397            redo A;
3398          } elsif ($self->{nc} == 0x003F) { # ?
3399            $self->{state} = PI_STATE;
3400            !!!next-input-character;
3401            redo A;
3402          } elsif ($self->{nc} == -1) {
3403            !!!parse-error (type => 'bare stago');
3404            $self->{state} = DATA_STATE;
3405            $self->{s_kwd} = '';
3406            ## Reconsume.
3407            redo A;
3408          } else {
3409            !!!parse-error (type => 'bare stago', ## XML5: Not a parse error.
3410                            line => $self->{line_prev},
3411                            column => $self->{column_prev});
3412            $self->{state} = BOGUS_COMMENT_STATE;
3413            $self->{ct} = {type => COMMENT_TOKEN,
3414                           data => '',
3415                          }; ## NOTE: Will be discarded.
3416            !!!next-input-character;
3417            redo A;
3418          }
3419        } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
3420          ## XML5: "DOCTYPE markup declaration state".
3421          
3422          if ($self->{nc} == 0x002D) { # -
3423            $self->{state} = MD_HYPHEN_STATE;
3424            !!!next-input-character;
3425            redo A;
3426          } elsif ($self->{nc} == 0x0045) { # E
3427            $self->{state} = MD_E_STATE;
3428            $self->{kwd} = chr $self->{nc};
3429            !!!next-input-character;
3430            redo A;
3431          } elsif ($self->{nc} == 0x0041) { # A
3432            $self->{state} = MD_ATTLIST_STATE;
3433            $self->{kwd} = chr $self->{nc};
3434            !!!next-input-character;
3435            redo A;
3436          } elsif ($self->{nc} == 0x004E) { # N
3437            $self->{state} = MD_NOTATION_STATE;
3438            $self->{kwd} = chr $self->{nc};
3439            !!!next-input-character;
3440            redo A;
3441          } else {
3442            #
3443          }
3444          
3445          ## XML5: No parse error.
3446          !!!parse-error (type => 'bogus comment',
3447                          line => $self->{line_prev},
3448                          column => $self->{column_prev} - 1);
3449          ## Reconsume.
3450          $self->{state} = BOGUS_COMMENT_STATE;
3451          $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
3452          redo A;
3453        } elsif ($self->{state} == MD_E_STATE) {
3454          if ($self->{nc} == 0x004E) { # N
3455            $self->{state} = MD_ENTITY_STATE;
3456            $self->{kwd} .= chr $self->{nc};
3457            !!!next-input-character;
3458            redo A;
3459          } elsif ($self->{nc} == 0x004C) { # L
3460            ## XML5: <!ELEMENT> not supported.
3461            $self->{state} = MD_ELEMENT_STATE;
3462            $self->{kwd} .= chr $self->{nc};
3463            !!!next-input-character;
3464            redo A;
3465          } else {
3466            ## XML5: No parse error.
3467            !!!parse-error (type => 'bogus comment',
3468                            line => $self->{line_prev},
3469                            column => $self->{column_prev} - 2
3470                                + 1 * ($self->{nc} == -1));
3471            ## Reconsume.
3472            $self->{state} = BOGUS_COMMENT_STATE;
3473            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3474            redo A;
3475          }
3476        } elsif ($self->{state} == MD_ENTITY_STATE) {
3477          if ($self->{nc} == {
3478                'EN' => 0x0054, # T
3479                'ENT' => 0x0049, # I
3480                'ENTI' => 0x0054, # T
3481              }->{$self->{kwd}}) {
3482            ## Stay in the state.
3483            $self->{kwd} .= chr $self->{nc};
3484            !!!next-input-character;
3485            redo A;
3486          } elsif ($self->{kwd} eq 'ENTIT' and
3487                   $self->{nc} == 0x0059) { # Y
3488            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '', text => '',
3489                           line => $self->{line_prev},
3490                           column => $self->{column_prev} - 6};
3491            $self->{state} = DOCTYPE_MD_STATE;
3492            !!!next-input-character;
3493            redo A;
3494          } else {
3495            !!!parse-error (type => 'bogus comment',
3496                            line => $self->{line_prev},
3497                            column => $self->{column_prev} - 1
3498                                - (length $self->{kwd})
3499                                + 1 * ($self->{nc} == -1));
3500            $self->{state} = BOGUS_COMMENT_STATE;
3501            ## Reconsume.
3502            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3503            redo A;
3504          }
3505        } elsif ($self->{state} == MD_ELEMENT_STATE) {
3506          if ($self->{nc} == {
3507                'EL' => 0x0045, # E
3508                'ELE' => 0x004D, # M
3509                'ELEM' => 0x0045, # E
3510                'ELEME' => 0x004E, # N
3511              }->{$self->{kwd}}) {
3512            ## Stay in the state.
3513            $self->{kwd} .= chr $self->{nc};
3514            !!!next-input-character;
3515            redo A;
3516          } elsif ($self->{kwd} eq 'ELEMEN' and
3517                   $self->{nc} == 0x0054) { # T
3518            $self->{ct} = {type => ELEMENT_TOKEN, name => '',
3519                           line => $self->{line_prev},
3520                           column => $self->{column_prev} - 6};
3521            $self->{state} = DOCTYPE_MD_STATE;
3522            !!!next-input-character;
3523            redo A;
3524          } else {
3525            !!!parse-error (type => 'bogus comment',
3526                            line => $self->{line_prev},
3527                            column => $self->{column_prev} - 1
3528                                - (length $self->{kwd})
3529                                + 1 * ($self->{nc} == -1));
3530            $self->{state} = BOGUS_COMMENT_STATE;
3531            ## Reconsume.
3532            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3533            redo A;
3534          }
3535        } elsif ($self->{state} == MD_ATTLIST_STATE) {
3536          if ($self->{nc} == {
3537                'A' => 0x0054, # T
3538                'AT' => 0x0054, # T
3539                'ATT' => 0x004C, # L
3540                'ATTL' => 0x0049, # I
3541                'ATTLI' => 0x0053, # S
3542              }->{$self->{kwd}}) {
3543            ## Stay in the state.
3544            $self->{kwd} .= chr $self->{nc};
3545            !!!next-input-character;
3546            redo A;
3547          } elsif ($self->{kwd} eq 'ATTLIS' and
3548                   $self->{nc} == 0x0054) { # T
3549            $self->{ct} = {type => ATTLIST_TOKEN, name => '',
3550                           line => $self->{line_prev},
3551                           column => $self->{column_prev} - 6};
3552            $self->{state} = DOCTYPE_MD_STATE;
3553            !!!next-input-character;
3554            redo A;
3555          } else {
3556            !!!parse-error (type => 'bogus comment',
3557                            line => $self->{line_prev},
3558                            column => $self->{column_prev} - 1
3559                                 - (length $self->{kwd})
3560                                 + 1 * ($self->{nc} == -1));
3561            $self->{state} = BOGUS_COMMENT_STATE;
3562            ## Reconsume.
3563            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3564            redo A;
3565          }
3566        } elsif ($self->{state} == MD_NOTATION_STATE) {
3567          if ($self->{nc} == {
3568                'N' => 0x004F, # O
3569                'NO' => 0x0054, # T
3570                'NOT' => 0x0041, # A
3571                'NOTA' => 0x0054, # T
3572                'NOTAT' => 0x0049, # I
3573                'NOTATI' => 0x004F, # O
3574              }->{$self->{kwd}}) {
3575            ## Stay in the state.
3576            $self->{kwd} .= chr $self->{nc};
3577            !!!next-input-character;
3578            redo A;
3579          } elsif ($self->{kwd} eq 'NOTATIO' and
3580                   $self->{nc} == 0x004E) { # N
3581            $self->{ct} = {type => NOTATION_TOKEN, name => '',
3582                           line => $self->{line_prev},
3583                           column => $self->{column_prev} - 6};
3584            $self->{state} = DOCTYPE_MD_STATE;
3585            !!!next-input-character;
3586            redo A;
3587          } else {
3588            !!!parse-error (type => 'bogus comment',
3589                            line => $self->{line_prev},
3590                            column => $self->{column_prev} - 1
3591                                - (length $self->{kwd})
3592                                + 1 * ($self->{nc} == -1));
3593            $self->{state} = BOGUS_COMMENT_STATE;
3594            ## Reconsume.
3595            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3596            redo A;
3597          }
3598        } elsif ($self->{state} == DOCTYPE_MD_STATE) {
3599          ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
3600          ## "DOCTYPE NOTATION state".
3601    
3602          if ($is_space->{$self->{nc}}) {
3603            ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
3604            $self->{state} = BEFORE_MD_NAME_STATE;
3605            !!!next-input-character;
3606            redo A;
3607          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3608                   $self->{nc} == 0x0025) { # %
3609            ## XML5: Switch to the "DOCTYPE bogus comment state".
3610            !!!parse-error (type => 'no space before md name'); ## TODO: type
3611            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3612            !!!next-input-character;
3613            redo A;
3614          } elsif ($self->{nc} == -1) {
3615            !!!parse-error (type => 'unclosed md'); ## TODO: type
3616            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3617            ## Reconsume.
3618            redo A;
3619          } elsif ($self->{nc} == 0x003E) { # >
3620            ## XML5: Switch to the "DOCTYPE bogus comment state".
3621            !!!parse-error (type => 'no md name'); ## TODO: type
3622            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3623            !!!next-input-character;
3624            redo A;
3625          } else {
3626            ## XML5: Switch to the "DOCTYPE bogus comment state".
3627            !!!parse-error (type => 'no space before md name'); ## TODO: type
3628            $self->{state} = BEFORE_MD_NAME_STATE;
3629            redo A;
3630          }
3631        } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
3632          ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
3633          ## before state", "DOCTYPE ATTLIST name before state".
3634    
3635          if ($is_space->{$self->{nc}}) {
3636            ## Stay in the state.
3637            !!!next-input-character;
3638            redo A;
3639          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
3640                   $self->{nc} == 0x0025) { # %
3641            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
3642            !!!next-input-character;
3643            redo A;
3644          } elsif ($self->{nc} == 0x003E) { # >
3645            ## XML5: Same as "Anything else".
3646            !!!parse-error (type => 'no md name'); ## TODO: type
3647            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3648            !!!next-input-character;
3649            redo A;
3650          } elsif ($self->{nc} == -1) {
3651            !!!parse-error (type => 'unclosed md'); ## TODO: type
3652            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3653            ## Reconsume.
3654            redo A;
3655          } else {
3656            ## XML5: [ATTLIST] Not defined yet.
3657            $self->{ct}->{name} .= chr $self->{nc};
3658            $self->{state} = MD_NAME_STATE;
3659            !!!next-input-character;
3660            redo A;
3661          }
3662        } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
3663          if ($is_space->{$self->{nc}}) {
3664            ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
3665            $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
3666            $self->{state} = BEFORE_MD_NAME_STATE;
3667            !!!next-input-character;
3668            redo A;
3669          } elsif ($self->{nc} == 0x003E) { # >
3670            ## XML5: Same as "Anything else".
3671            !!!parse-error (type => 'no md name'); ## TODO: type
3672            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3673            !!!next-input-character;
3674            redo A;
3675          } elsif ($self->{nc} == -1) {
3676            !!!parse-error (type => 'unclosed md');
3677            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3678            ## Reconsume.
3679            redo A;
3680          } else {
3681            ## XML5: No parse error.
3682            !!!parse-error (type => 'no space after ENTITY percent'); ## TODO: type
3683            $self->{state} = BOGUS_COMMENT_STATE;
3684            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3685            ## Reconsume.
3686            redo A;
3687          }
3688        } elsif ($self->{state} == MD_NAME_STATE) {
3689          ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
3690          
3691          if ($is_space->{$self->{nc}}) {
3692            ## TODO:
3693            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
3694            !!!next-input-character;
3695            redo A;
3696          } elsif ($self->{nc} == 0x003E) { # >
3697            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
3698              #
3699            } else {
3700              !!!parse-error (type => 'no md body'); ## TODO: type
3701            }
3702            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3703            !!!next-input-character;
3704            !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
3705            redo A;
3706          } elsif ($self->{nc} == -1) {
3707            ## XML5: [ATTLIST] No parse error.
3708            !!!parse-error (type => 'unclosed md');
3709            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3710            ## Reconsume.
3711            !!!emit ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
3712            redo A;
3713          } else {
3714            ## XML5: [ATTLIST] Not defined yet.
3715            $self->{ct}->{name} .= chr $self->{nc};
3716            ## Stay in the state.
3717            !!!next-input-character;
3718            redo A;
3719          }
3720        } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
3721          if ($is_space->{$self->{nc}}) {
3722            ## Stay in the state.
3723            !!!next-input-character;
3724            redo A;
3725          } elsif ($self->{nc} == 0x003E) { # >
3726            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3727            !!!next-input-character;
3728            !!!emit ($self->{ct}); # ATTLIST
3729            redo A;
3730          } elsif ($self->{nc} == -1) {
3731            ## XML5: No parse error.
3732            !!!parse-error (type => 'unclosed md'); ## TODO: type
3733            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
3734            redo A;
3735          } else {
3736            ## XML5: Not defined yet.
3737    
3738            ## TODO: ...
3739    
3740            $self->{state} = BOGUS_COMMENT_STATE;
3741            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
3742            ## Reconsume.
3743            redo A;
3744          }
3745    
3746      } else {      } else {
3747        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
3748      }      }

Legend:
Removed from v.1.6  
changed lines
  Added in v.1.14

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24