/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log | View Patch

revision 1.11 by wakaba, Wed Oct 15 10:50:38 2008 UTC revision 1.12 by wakaba, Wed Oct 15 12:49:49 2008 UTC
# Line 31  BEGIN { Line 31  BEGIN {
31    );    );
32  }  }
33    
34    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
35    
36  ## Token types  ## Token types
37    
38  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
39  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
40  sub START_TAG_TOKEN () { 3 }  sub START_TAG_TOKEN () { 3 }
41  sub END_TAG_TOKEN () { 4 }  sub END_TAG_TOKEN () { 4 }
42  sub END_OF_FILE_TOKEN () { 5 }  sub END_OF_FILE_TOKEN () { 5 }
43  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
44  sub PI_TOKEN () { 7 } # XML5  sub PI_TOKEN () { 7 } ## NOTE: XML only.
45  sub ABORT_TOKEN () { 8 } # Not a token actually  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
46    
47    ## XML5: XML5 has "empty tag token".  In this implementation, it is
48    ## represented as a start tag token with $self->{self_closing} flag
49    ## set to true.
50    
51    ## XML5: XML5 has "short end tag token".  In this implementation, it
52    ## is represented as an end tag token with $token->{tag_name} flag set
53    ## to an empty string.
54    
55  package Whatpm::HTML;  package Whatpm::HTML;
56    
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 124  sub HEXREF_HEX_STATE () { 48 }
124  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
125  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
126    
127  ## XML states  ## XML-only states
128  sub PI_STATE () { 51 }  sub PI_STATE () { 51 }
129  sub PI_TARGET_STATE () { 52 }  sub PI_TARGET_STATE () { 52 }
130  sub PI_TARGET_AFTER_STATE () { 53 }  sub PI_TARGET_AFTER_STATE () { 53 }
131  sub PI_DATA_STATE () { 54 }  sub PI_DATA_STATE () { 54 }
132  sub PI_AFTER_STATE () { 55 }  sub PI_AFTER_STATE () { 55 }
133  sub PI_DATA_AFTER_STATE () { 56 }  sub PI_DATA_AFTER_STATE () { 56 }
134    sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
135    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
136    
137  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
138  ## list and descriptions)  ## list and descriptions)
# Line 186  sub _initialize_tokenizer ($) { Line 198  sub _initialize_tokenizer ($) {
198    #$self->{is_xml} (if XML)    #$self->{is_xml} (if XML)
199    
200    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
201    $self->{s_kwd} = ''; # state keyword    $self->{s_kwd} = ''; # Data state keyword
202      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
203    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
204    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
205    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 221  sub _initialize_tokenizer ($) { Line 234  sub _initialize_tokenizer ($) {
234  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
235  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
236  ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.  ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
237    ##   ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
238    
239  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
240  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
241  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 240  my $is_space = { Line 255  my $is_space = {
255    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
256    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
257    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
258    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
259    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
260    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
261  };  };
# Line 450  sub _get_next_token ($) { Line 465  sub _get_next_token ($) {
465            redo A;            redo A;
466          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
467            !!!cp (15.1);            !!!cp (15.1);
468            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
469            #            #
470          } else {          } else {
471            !!!cp (16);            !!!cp (16);
472              $self->{s_kwd} = '';
473            #            #
474          }          }
475    
476          ## reconsume          ## reconsume
477          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
         $self->{s_kwd} = '';  
478          !!!emit ({type => CHARACTER_TOKEN, data => '<',          !!!emit ({type => CHARACTER_TOKEN, data => '<',
479                    line => $self->{line_prev},                    line => $self->{line_prev},
480                    column => $self->{column_prev},                    column => $self->{column_prev},
# Line 570  sub _get_next_token ($) { Line 585  sub _get_next_token ($) {
585        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
586          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
587            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
588            $self->{s_kwd} = '';            $self->{kwd} = '';
589            ## Reconsume.            ## Reconsume.
590            redo A;            redo A;
591          } else {          } else {
# Line 673  sub _get_next_token ($) { Line 688  sub _get_next_token ($) {
688          redo A;          redo A;
689        }        }
690      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
691        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
692        if (length $ch) {        if (length $ch) {
693          my $CH = $ch;          my $CH = $ch;
694          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 681  sub _get_next_token ($) { Line 696  sub _get_next_token ($) {
696          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
697            !!!cp (24);            !!!cp (24);
698            ## Stay in the state.            ## Stay in the state.
699            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
700            !!!next-input-character;            !!!next-input-character;
701            redo A;            redo A;
702          } else {          } else {
# Line 690  sub _get_next_token ($) { Line 705  sub _get_next_token ($) {
705            $self->{s_kwd} = '';            $self->{s_kwd} = '';
706            ## Reconsume.            ## Reconsume.
707            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
708                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
709                      line => $self->{line_prev},                      line => $self->{line_prev},
710                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
711                     });                     });
712            redo A;            redo A;
713          }          }
# Line 708  sub _get_next_token ($) { Line 723  sub _get_next_token ($) {
723            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
724            $self->{s_kwd} = '';            $self->{s_kwd} = '';
725            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
726                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
727                      line => $self->{line_prev},                      line => $self->{line_prev},
728                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
729                     });                     });
730            redo A;            redo A;
731          } else {          } else {
# Line 719  sub _get_next_token ($) { Line 734  sub _get_next_token ($) {
734                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
735                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
736                   line => $self->{line_prev},                   line => $self->{line_prev},
737                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
738            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
739            ## Reconsume.            ## Reconsume.
740            redo A;            redo A;
# Line 1593  sub _get_next_token ($) { Line 1608  sub _get_next_token ($) {
1608          ## ASCII case-insensitive.          ## ASCII case-insensitive.
1609          !!!cp (130);          !!!cp (130);
1610          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
1611          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
1612          !!!next-input-character;          !!!next-input-character;
1613          redo A;          redo A;
1614        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
# Line 1602  sub _get_next_token ($) { Line 1617  sub _get_next_token ($) {
1617                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1618          !!!cp (135.4);                          !!!cp (135.4);                
1619          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
1620          $self->{s_kwd} = '[';          $self->{kwd} = '[';
1621          !!!next-input-character;          !!!next-input-character;
1622          redo A;          redo A;
1623        } else {        } else {
# Line 1652  sub _get_next_token ($) { Line 1667  sub _get_next_token ($) {
1667              0x0054, # T              0x0054, # T
1668              0x0059, # Y              0x0059, # Y
1669              0x0050, # P              0x0050, # P
1670            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
1671            $self->{nc} == [            $self->{nc} == [
1672              undef,              undef,
1673              0x006F, # o              0x006F, # o
# Line 1660  sub _get_next_token ($) { Line 1675  sub _get_next_token ($) {
1675              0x0074, # t              0x0074, # t
1676              0x0079, # y              0x0079, # y
1677              0x0070, # p              0x0070, # p
1678            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
1679          !!!cp (131);          !!!cp (131);
1680          ## Stay in the state.          ## Stay in the state.
1681          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
1682          !!!next-input-character;          !!!next-input-character;
1683          redo A;          redo A;
1684        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
1685                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
1686                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
1687          if ($self->{s_kwd} ne 'DOCTYP') {          if ($self->{is_xml} and
1688                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1689            !!!cp (129);            !!!cp (129);
1690            ## XML5: case-sensitive.            ## XML5: case-sensitive.
1691            !!!parse-error (type => 'lowercase keyword', ## TODO            !!!parse-error (type => 'lowercase keyword', ## TODO
# Line 1691  sub _get_next_token ($) { Line 1707  sub _get_next_token ($) {
1707          !!!cp (132);                  !!!cp (132);        
1708          !!!parse-error (type => 'bogus comment',          !!!parse-error (type => 'bogus comment',
1709                          line => $self->{line_prev},                          line => $self->{line_prev},
1710                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
1711          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
1712          ## Reconsume.          ## Reconsume.
1713          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
1714                                    data => $self->{s_kwd},                                    data => $self->{kwd},
1715                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1716                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
1717                                   };                                   };
1718          redo A;          redo A;
1719        }        }
# Line 1708  sub _get_next_token ($) { Line 1724  sub _get_next_token ($) {
1724              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
1725              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
1726              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
1727            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
1728          !!!cp (135.1);          !!!cp (135.1);
1729          ## Stay in the state.          ## Stay in the state.
1730          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
1731          !!!next-input-character;          !!!next-input-character;
1732          redo A;          redo A;
1733        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
1734                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1735          if ($self->{is_xml} and          if ($self->{is_xml} and
1736              not $self->{tainted} and              not $self->{tainted} and
# Line 1739  sub _get_next_token ($) { Line 1755  sub _get_next_token ($) {
1755          !!!cp (135.3);          !!!cp (135.3);
1756          !!!parse-error (type => 'bogus comment',          !!!parse-error (type => 'bogus comment',
1757                          line => $self->{line_prev},                          line => $self->{line_prev},
1758                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
1759          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
1760          ## Reconsume.          ## Reconsume.
1761          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
1762                                    data => $self->{s_kwd},                                    data => $self->{kwd},
1763                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1764                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
1765                                   };                                   };
1766          redo A;          redo A;
1767        }        }
# Line 1855  sub _get_next_token ($) { Line 1871  sub _get_next_token ($) {
1871        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1872          !!!cp (149);          !!!cp (149);
1873          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
         $self->{s_kwd} = '';  
1874          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1875          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1876          ## reconsume          ## reconsume
# Line 1919  sub _get_next_token ($) { Line 1934  sub _get_next_token ($) {
1934          redo A;          redo A;
1935        } else {        } else {
1936          !!!cp (156);          !!!cp (156);
1937            ## XML5: Unless EOF, switch to the bogus comment state.
1938          !!!parse-error (type => 'no space before DOCTYPE name');          !!!parse-error (type => 'no space before DOCTYPE name');
1939          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1940          ## reconsume          ## reconsume
1941          redo A;          redo A;
1942        }        }
1943      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1944          ## XML5: "DOCTYPE root name before state".
1945    
1946        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1947          !!!cp (157);          !!!cp (157);
1948          ## Stay in the state          ## Stay in the state
# Line 1932  sub _get_next_token ($) { Line 1950  sub _get_next_token ($) {
1950          redo A;          redo A;
1951        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1952          !!!cp (158);          !!!cp (158);
1953            ## XML5: No parse error.
1954          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
1955          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1956          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 1950  sub _get_next_token ($) { Line 1969  sub _get_next_token ($) {
1969          !!!emit ($self->{ct}); # DOCTYPE (quirks)          !!!emit ($self->{ct}); # DOCTYPE (quirks)
1970    
1971          redo A;          redo A;
1972          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
1973            !!!cp (159.1);
1974            !!!parse-error (type => 'no DOCTYPE name');
1975            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1976            !!!next-input-character;
1977            redo A;
1978        } else {        } else {
1979          !!!cp (160);          !!!cp (160);
1980          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 1959  sub _get_next_token ($) { Line 1984  sub _get_next_token ($) {
1984          redo A;          redo A;
1985        }        }
1986      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1987  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
1988    
1989          ## ISSUE: Redundant "First," in the spec.
1990    
1991        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1992          !!!cp (161);          !!!cp (161);
1993          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 1985  sub _get_next_token ($) { Line 2013  sub _get_next_token ($) {
2013          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2014    
2015          redo A;          redo A;
2016          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2017            !!!cp (163.1);
2018            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2019            !!!next-input-character;
2020            redo A;
2021        } else {        } else {
2022          !!!cp (164);          !!!cp (164);
2023          $self->{ct}->{name}          $self->{ct}->{name}
# Line 1994  sub _get_next_token ($) { Line 2027  sub _get_next_token ($) {
2027          redo A;          redo A;
2028        }        }
2029      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2030          ## XML5: Corresponding to XML5's "DOCTYPE root name after
2031          ## state", but implemented differently.
2032    
2033        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2034          !!!cp (165);          !!!cp (165);
2035          ## Stay in the state          ## Stay in the state
# Line 2021  sub _get_next_token ($) { Line 2057  sub _get_next_token ($) {
2057          redo A;          redo A;
2058        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
2059                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
2060            !!!cp (167.1);
2061          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
2062          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2063          !!!next-input-character;          !!!next-input-character;
2064          redo A;          redo A;
2065        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
2066                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
2067            !!!cp (167.2);
2068          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
2069          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2070            !!!next-input-character;
2071            redo A;
2072          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2073            !!!cp (167.3);
2074            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2075            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2076          !!!next-input-character;          !!!next-input-character;
2077          redo A;          redo A;
2078        } else {        } else {
# Line 2048  sub _get_next_token ($) { Line 2092  sub _get_next_token ($) {
2092              0x0042, # B              0x0042, # B
2093              0x004C, # L              0x004C, # L
2094              0x0049, # I              0x0049, # I
2095            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2096            $self->{nc} == [            $self->{nc} == [
2097              undef,              undef,
2098              0x0075, # u              0x0075, # u
2099              0x0062, # b              0x0062, # b
2100              0x006C, # l              0x006C, # l
2101              0x0069, # i              0x0069, # i
2102            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2103          !!!cp (175);          !!!cp (175);
2104          ## Stay in the state.          ## Stay in the state.
2105          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2106          !!!next-input-character;          !!!next-input-character;
2107          redo A;          redo A;
2108        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
2109                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
2110                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
2111          !!!cp (168);          if ($self->{is_xml} and
2112                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2113              !!!cp (168.1);
2114              !!!parse-error (type => 'lowercase keyword', ## TODO: type
2115                              text => 'PUBLIC',
2116                              line => $self->{line_prev},
2117                              column => $self->{column_prev} - 4);
2118            } else {
2119              !!!cp (168);
2120            }
2121          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2122          !!!next-input-character;          !!!next-input-character;
2123          redo A;          redo A;
# Line 2072  sub _get_next_token ($) { Line 2125  sub _get_next_token ($) {
2125          !!!cp (169);          !!!cp (169);
2126          !!!parse-error (type => 'string after DOCTYPE name',          !!!parse-error (type => 'string after DOCTYPE name',
2127                          line => $self->{line_prev},                          line => $self->{line_prev},
2128                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2129          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2130    
2131          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
# Line 2087  sub _get_next_token ($) { Line 2140  sub _get_next_token ($) {
2140              0x0053, # S              0x0053, # S
2141              0x0054, # T              0x0054, # T
2142              0x0045, # E              0x0045, # E
2143            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2144            $self->{nc} == [            $self->{nc} == [
2145              undef,              undef,
2146              0x0079, # y              0x0079, # y
2147              0x0073, # s              0x0073, # s
2148              0x0074, # t              0x0074, # t
2149              0x0065, # e              0x0065, # e
2150            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2151          !!!cp (170);          !!!cp (170);
2152          ## Stay in the state.          ## Stay in the state.
2153          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2154          !!!next-input-character;          !!!next-input-character;
2155          redo A;          redo A;
2156        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
2157                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
2158                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
2159          !!!cp (171);          if ($self->{is_xml} and
2160                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2161              !!!cp (171.1);
2162              !!!parse-error (type => 'lowercase keyword', ## TODO: type
2163                              text => 'SYSTEM',
2164                              line => $self->{line_prev},
2165                              column => $self->{column_prev} - 4);
2166            } else {
2167              !!!cp (171);
2168            }
2169          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2170          !!!next-input-character;          !!!next-input-character;
2171          redo A;          redo A;
# Line 2111  sub _get_next_token ($) { Line 2173  sub _get_next_token ($) {
2173          !!!cp (172);          !!!cp (172);
2174          !!!parse-error (type => 'string after DOCTYPE name',          !!!parse-error (type => 'string after DOCTYPE name',
2175                          line => $self->{line_prev},                          line => $self->{line_prev},
2176                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
2177          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2178    
2179          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
# Line 2160  sub _get_next_token ($) { Line 2222  sub _get_next_token ($) {
2222          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2223    
2224          redo A;          redo A;
2225          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2226            !!!cp (186.1);
2227            !!!parse-error (type => 'no PUBLIC literal');
2228            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2229            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2230            !!!next-input-character;
2231            redo A;
2232        } else {        } else {
2233          !!!cp (186);          !!!cp (186);
2234          !!!parse-error (type => 'string after PUBLIC');          !!!parse-error (type => 'string after PUBLIC');
# Line 2270  sub _get_next_token ($) { Line 2339  sub _get_next_token ($) {
2339          !!!next-input-character;          !!!next-input-character;
2340          redo A;          redo A;
2341        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2342          !!!cp (198);          if ($self->{is_xml}) {
2343              !!!cp (198.1);
2344              !!!parse-error (type => 'no SYSTEM literal');
2345            } else {
2346              !!!cp (198);
2347            }
2348          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2349          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2350          !!!next-input-character;          !!!next-input-character;
# Line 2290  sub _get_next_token ($) { Line 2364  sub _get_next_token ($) {
2364          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2365    
2366          redo A;          redo A;
2367          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2368            !!!cp (200.1);
2369            !!!parse-error (type => 'no SYSTEM literal');
2370            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2371            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2372            !!!next-input-character;
2373            redo A;
2374        } else {        } else {
2375          !!!cp (200);          !!!cp (200);
2376          !!!parse-error (type => 'string after PUBLIC literal');          !!!parse-error (type => 'string after PUBLIC literal');
# Line 2340  sub _get_next_token ($) { Line 2421  sub _get_next_token ($) {
2421          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2422    
2423          redo A;          redo A;
2424          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2425            !!!cp (206.1);
2426            !!!parse-error (type => 'no SYSTEM literal');
2427    
2428            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2429            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2430            !!!next-input-character;
2431            redo A;
2432        } else {        } else {
2433          !!!cp (206);          !!!cp (206);
2434          !!!parse-error (type => 'string after SYSTEM');          !!!parse-error (type => 'string after SYSTEM');
# Line 2355  sub _get_next_token ($) { Line 2444  sub _get_next_token ($) {
2444          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2445          !!!next-input-character;          !!!next-input-character;
2446          redo A;          redo A;
2447        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2448          !!!cp (208);          !!!cp (208);
2449          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2450    
# Line 2396  sub _get_next_token ($) { Line 2485  sub _get_next_token ($) {
2485          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2486          !!!next-input-character;          !!!next-input-character;
2487          redo A;          redo A;
2488        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2489          !!!cp (212);          !!!cp (212);
2490          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2491    
# Line 2457  sub _get_next_token ($) { Line 2546  sub _get_next_token ($) {
2546          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2547    
2548          redo A;          redo A;
2549          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2550            !!!cp (218.1);
2551            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2552            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2553            !!!next-input-character;
2554            redo A;
2555        } else {        } else {
2556          !!!cp (218);          !!!cp (218);
2557          !!!parse-error (type => 'string after SYSTEM literal');          !!!parse-error (type => 'string after SYSTEM literal');
# Line 2476  sub _get_next_token ($) { Line 2571  sub _get_next_token ($) {
2571          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2572    
2573          redo A;          redo A;
2574          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2575            if ($self->{ct}->{has_internal_subset}) { # DOCTYPE
2576              !!!cp (220.2);
2577              ## Stay in the state.
2578              !!!next-input-character;
2579              redo A;
2580            } else {
2581              !!!cp (220.1);
2582              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2583              $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2584              !!!next-input-character;
2585              redo A;
2586            }
2587        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2588          !!!cp (220);          !!!cp (220);
2589          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
# Line 2488  sub _get_next_token ($) { Line 2596  sub _get_next_token ($) {
2596        } else {        } else {
2597          !!!cp (221);          !!!cp (221);
2598          my $s = '';          my $s = '';
2599          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
2600    
2601          ## Stay in the state          ## Stay in the state
2602          !!!next-input-character;          !!!next-input-character;
# Line 2596  sub _get_next_token ($) { Line 2704  sub _get_next_token ($) {
2704        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
2705          !!!cp (999);          !!!cp (999);
2706          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
2707          $self->{s_kwd} = '#';          $self->{kwd} = '#';
2708          !!!next-input-character;          !!!next-input-character;
2709          redo A;          redo A;
2710        } elsif ((0x0041 <= $self->{nc} and        } elsif ((0x0041 <= $self->{nc} and
# Line 2606  sub _get_next_token ($) { Line 2714  sub _get_next_token ($) {
2714          !!!cp (998);          !!!cp (998);
2715          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
2716          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
2717          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2718          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
2719          $self->{entity__match} = 0;          $self->{entity__match} = 0;
2720          !!!next-input-character;          !!!next-input-character;
2721          redo A;          redo A;
# Line 2647  sub _get_next_token ($) { Line 2755  sub _get_next_token ($) {
2755            $self->{nc} == 0x0058) { # X            $self->{nc} == 0x0058) { # X
2756          !!!cp (995);          !!!cp (995);
2757          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
2758          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2759          !!!next-input-character;          !!!next-input-character;
2760          redo A;          redo A;
2761        } elsif (0x0030 <= $self->{nc} and        } elsif (0x0030 <= $self->{nc} and
2762                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
2763          !!!cp (994);          !!!cp (994);
2764          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
2765          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
2766          !!!next-input-character;          !!!next-input-character;
2767          redo A;          redo A;
2768        } else {        } else {
# Line 2690  sub _get_next_token ($) { Line 2798  sub _get_next_token ($) {
2798        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
2799            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
2800          !!!cp (1012);          !!!cp (1012);
2801          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
2802          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
2803                    
2804          ## Stay in the state.          ## Stay in the state.
2805          !!!next-input-character;          !!!next-input-character;
# Line 2707  sub _get_next_token ($) { Line 2815  sub _get_next_token ($) {
2815          #          #
2816        }        }
2817    
2818        my $code = $self->{s_kwd};        my $code = $self->{kwd};
2819        my $l = $self->{line_prev};        my $l = $self->{line_prev};
2820        my $c = $self->{column_prev};        my $c = $self->{column_prev};
2821        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 2750  sub _get_next_token ($) { Line 2858  sub _get_next_token ($) {
2858          # 0..9, A..F, a..f          # 0..9, A..F, a..f
2859          !!!cp (990);          !!!cp (990);
2860          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
2861          $self->{s_kwd} = 0;          $self->{kwd} = 0;
2862          ## Reconsume.          ## Reconsume.
2863          redo A;          redo A;
2864        } else {        } else {
# Line 2768  sub _get_next_token ($) { Line 2876  sub _get_next_token ($) {
2876            $self->{s_kwd} = '';            $self->{s_kwd} = '';
2877            ## Reconsume.            ## Reconsume.
2878            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
2879                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
2880                      line => $self->{line_prev},                      line => $self->{line_prev},
2881                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
2882                     });                     });
2883            redo A;            redo A;
2884          } else {          } else {
2885            !!!cp (989);            !!!cp (989);
2886            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
2887            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2888            $self->{s_kwd} = '';            $self->{s_kwd} = '';
2889            ## Reconsume.            ## Reconsume.
# Line 2786  sub _get_next_token ($) { Line 2894  sub _get_next_token ($) {
2894        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2895          # 0..9          # 0..9
2896          !!!cp (1002);          !!!cp (1002);
2897          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
2898          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
2899          ## Stay in the state.          ## Stay in the state.
2900          !!!next-input-character;          !!!next-input-character;
2901          redo A;          redo A;
2902        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
2903                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
2904          !!!cp (1003);          !!!cp (1003);
2905          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
2906          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
2907          ## Stay in the state.          ## Stay in the state.
2908          !!!next-input-character;          !!!next-input-character;
2909          redo A;          redo A;
2910        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
2911                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
2912          !!!cp (1004);          !!!cp (1004);
2913          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
2914          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
2915          ## Stay in the state.          ## Stay in the state.
2916          !!!next-input-character;          !!!next-input-character;
2917          redo A;          redo A;
# Line 2820  sub _get_next_token ($) { Line 2928  sub _get_next_token ($) {
2928          #          #
2929        }        }
2930    
2931        my $code = $self->{s_kwd};        my $code = $self->{kwd};
2932        my $l = $self->{line_prev};        my $l = $self->{line_prev};
2933        my $c = $self->{column_prev};        my $c = $self->{column_prev};
2934        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 2857  sub _get_next_token ($) { Line 2965  sub _get_next_token ($) {
2965          redo A;          redo A;
2966        }        }
2967      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
2968        if (length $self->{s_kwd} < 30 and        if (length $self->{kwd} < 30 and
2969            ## NOTE: Some number greater than the maximum length of entity name            ## NOTE: Some number greater than the maximum length of entity name
2970            ((0x0041 <= $self->{nc} and # a            ((0x0041 <= $self->{nc} and # a
2971              $self->{nc} <= 0x005A) or # x              $self->{nc} <= 0x005A) or # x
# Line 2867  sub _get_next_token ($) { Line 2975  sub _get_next_token ($) {
2975              $self->{nc} <= 0x0039) or # 9              $self->{nc} <= 0x0039) or # 9
2976             $self->{nc} == 0x003B)) { # ;             $self->{nc} == 0x003B)) { # ;
2977          our $EntityChar;          our $EntityChar;
2978          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2979          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}}) {
2980            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
2981              !!!cp (1020);              !!!cp (1020);
2982              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
2983              $self->{entity__match} = 1;              $self->{entity__match} = 1;
2984              !!!next-input-character;              !!!next-input-character;
2985              #              #
2986            } else {            } else {
2987              !!!cp (1021);              !!!cp (1021);
2988              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
2989              $self->{entity__match} = -1;              $self->{entity__match} = -1;
2990              ## Stay in the state.              ## Stay in the state.
2991              !!!next-input-character;              !!!next-input-character;
# Line 2905  sub _get_next_token ($) { Line 3013  sub _get_next_token ($) {
3013          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
3014              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
3015            !!!cp (1024);            !!!cp (1024);
3016            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
3017            #            #
3018          } else {          } else {
3019            !!!cp (1025);            !!!cp (1025);
# Line 2917  sub _get_next_token ($) { Line 3025  sub _get_next_token ($) {
3025          !!!cp (1026);          !!!cp (1026);
3026          !!!parse-error (type => 'bare ero',          !!!parse-error (type => 'bare ero',
3027                          line => $self->{line_prev},                          line => $self->{line_prev},
3028                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
3029          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
3030          #          #
3031        }        }
3032        
# Line 2941  sub _get_next_token ($) { Line 3049  sub _get_next_token ($) {
3049                    data => $data,                    data => $data,
3050                    has_reference => $has_ref,                    has_reference => $has_ref,
3051                    line => $self->{line_prev},                    line => $self->{line_prev},
3052                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
3053                   });                   });
3054          redo A;          redo A;
3055        } else {        } else {
# Line 3081  sub _get_next_token ($) { Line 3189  sub _get_next_token ($) {
3189          ## Reprocess.          ## Reprocess.
3190          redo A;          redo A;
3191        }        }
3192    
3193        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3194          if ($self->{nc} == 0x003C) { # <
3195            ## TODO:
3196            !!!next-input-character;
3197            redo A;
3198          } elsif ($self->{nc} == 0x0025) { # %
3199            ## XML5: Not defined yet.
3200    
3201            ## TODO:
3202            !!!next-input-character;
3203            redo A;
3204          } elsif ($self->{nc} == 0x005D) { # ]
3205            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3206            !!!next-input-character;
3207            redo A;
3208          } elsif ($is_space->{$self->{nc}}) {
3209            ## Stay in the state.
3210            !!!next-input-character;
3211            redo A;
3212          } elsif ($self->{nc} == -1) {
3213            !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3214            $self->{state} = DATA_STATE;
3215            $self->{s_kwd} = '';
3216            ## Reconsume.
3217            !!!emit ($self->{ct}); # DOCTYPE
3218            redo A;
3219          } else {
3220            unless ($self->{internal_subset_tainted}) {
3221              ## XML5: No parse error.
3222              !!!parse-error (type => 'string in internal subset');
3223              $self->{internal_subset_tainted} = 1;
3224            }
3225            ## Stay in the state.
3226            !!!next-input-character;
3227            redo A;
3228          }
3229        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3230          if ($self->{nc} == 0x003E) { # >
3231            $self->{state} = DATA_STATE;
3232            $self->{s_kwd} = '';
3233            !!!next-input-character;
3234            !!!emit ($self->{ct}); # DOCTYPE
3235            redo A;
3236          } elsif ($self->{nc} == -1) {
3237            !!!parse-error (type => 'unclosed DOCTYPE');
3238            $self->{state} = DATA_STATE;
3239            $self->{s_kwd} = '';
3240            ## Reconsume.
3241            !!!emit ($self->{ct}); # DOCTYPE
3242            redo A;
3243          } else {
3244            ## XML5: No parse error and stay in the state.
3245            !!!parse-error (type => 'string after internal subset'); ## TODO: type
3246    
3247            $self->{state} = BOGUS_DOCTYPE_STATE;
3248            !!!next-input-character;
3249            redo A;
3250          }
3251                    
3252      } else {      } else {
3253        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";

Legend:
Removed from v.1.11  
changed lines
  Added in v.1.12

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24