| 31 |
); |
); |
| 32 |
} |
} |
| 33 |
|
|
| 34 |
|
## NOTE: Differences from the XML5 draft are marked as "XML5:". |
| 35 |
|
|
| 36 |
## Token types |
## Token types |
| 37 |
|
|
| 38 |
sub DOCTYPE_TOKEN () { 1 } |
sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token. |
| 39 |
sub COMMENT_TOKEN () { 2 } |
sub COMMENT_TOKEN () { 2 } |
| 40 |
sub START_TAG_TOKEN () { 3 } |
sub START_TAG_TOKEN () { 3 } |
| 41 |
sub END_TAG_TOKEN () { 4 } |
sub END_TAG_TOKEN () { 4 } |
| 42 |
sub END_OF_FILE_TOKEN () { 5 } |
sub END_OF_FILE_TOKEN () { 5 } |
| 43 |
sub CHARACTER_TOKEN () { 6 } |
sub CHARACTER_TOKEN () { 6 } |
| 44 |
sub PI_TOKEN () { 7 } # XML5 |
sub PI_TOKEN () { 7 } ## NOTE: XML only. |
| 45 |
sub ABORT_TOKEN () { 8 } # Not a token actually |
sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing. |
| 46 |
|
|
| 47 |
|
## XML5: XML5 has "empty tag token". In this implementation, it is |
| 48 |
|
## represented as a start tag token with $self->{self_closing} flag |
| 49 |
|
## set to true. |
| 50 |
|
|
| 51 |
|
## XML5: XML5 has "short end tag token". In this implementation, it |
| 52 |
|
## is represented as an end tag token with $token->{tag_name} flag set |
| 53 |
|
## to an empty string. |
| 54 |
|
|
| 55 |
package Whatpm::HTML; |
package Whatpm::HTML; |
| 56 |
|
|
| 124 |
sub ENTITY_NAME_STATE () { 49 } |
sub ENTITY_NAME_STATE () { 49 } |
| 125 |
sub PCDATA_STATE () { 50 } # "data state" in the spec |
sub PCDATA_STATE () { 50 } # "data state" in the spec |
| 126 |
|
|
| 127 |
## XML states |
## XML-only states |
| 128 |
sub PI_STATE () { 51 } |
sub PI_STATE () { 51 } |
| 129 |
sub PI_TARGET_STATE () { 52 } |
sub PI_TARGET_STATE () { 52 } |
| 130 |
sub PI_TARGET_AFTER_STATE () { 53 } |
sub PI_TARGET_AFTER_STATE () { 53 } |
| 131 |
sub PI_DATA_STATE () { 54 } |
sub PI_DATA_STATE () { 54 } |
| 132 |
sub PI_AFTER_STATE () { 55 } |
sub PI_AFTER_STATE () { 55 } |
| 133 |
sub PI_DATA_AFTER_STATE () { 56 } |
sub PI_DATA_AFTER_STATE () { 56 } |
| 134 |
|
sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 } |
| 135 |
|
sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 } |
| 136 |
|
|
| 137 |
## Tree constructor state constants (see Whatpm::HTML for the full |
## Tree constructor state constants (see Whatpm::HTML for the full |
| 138 |
## list and descriptions) |
## list and descriptions) |
| 198 |
#$self->{is_xml} (if XML) |
#$self->{is_xml} (if XML) |
| 199 |
|
|
| 200 |
$self->{state} = DATA_STATE; # MUST |
$self->{state} = DATA_STATE; # MUST |
| 201 |
$self->{s_kwd} = ''; # state keyword |
$self->{s_kwd} = ''; # Data state keyword |
| 202 |
|
#$self->{kwd} = ''; # State-dependent keyword; initialized when used |
| 203 |
#$self->{entity__value}; # initialized when used |
#$self->{entity__value}; # initialized when used |
| 204 |
#$self->{entity__match}; # initialized when used |
#$self->{entity__match}; # initialized when used |
| 205 |
$self->{content_model} = PCDATA_CONTENT_MODEL; # be |
$self->{content_model} = PCDATA_CONTENT_MODEL; # be |
| 234 |
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN) |
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN) |
| 235 |
## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN) |
## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN) |
| 236 |
## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1. |
## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1. |
| 237 |
|
## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN) |
| 238 |
|
|
| 239 |
## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|. |
## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|. |
| 240 |
## |->{self_closing}| is used to save the value of |$self->{self_closing}| |
## |->{self_closing}| is used to save the value of |$self->{self_closing}| |
| 241 |
## while the token is pushed back to the stack. |
## while the token is pushed back to the stack. |
| 255 |
0x0009 => 1, # CHARACTER TABULATION (HT) |
0x0009 => 1, # CHARACTER TABULATION (HT) |
| 256 |
0x000A => 1, # LINE FEED (LF) |
0x000A => 1, # LINE FEED (LF) |
| 257 |
#0x000B => 0, # LINE TABULATION (VT) |
#0x000B => 0, # LINE TABULATION (VT) |
| 258 |
0x000C => 1, # FORM FEED (FF) |
0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character. |
| 259 |
#0x000D => 1, # CARRIAGE RETURN (CR) |
#0x000D => 1, # CARRIAGE RETURN (CR) |
| 260 |
0x0020 => 1, # SPACE (SP) |
0x0020 => 1, # SPACE (SP) |
| 261 |
}; |
}; |
| 465 |
redo A; |
redo A; |
| 466 |
} elsif ($self->{nc} == 0x0021) { # ! |
} elsif ($self->{nc} == 0x0021) { # ! |
| 467 |
!!!cp (15.1); |
!!!cp (15.1); |
| 468 |
$self->{s_kwd} = '<' unless $self->{escape}; |
$self->{s_kwd} = $self->{escaped} ? '' : '<'; |
| 469 |
# |
# |
| 470 |
} else { |
} else { |
| 471 |
!!!cp (16); |
!!!cp (16); |
| 472 |
|
$self->{s_kwd} = ''; |
| 473 |
# |
# |
| 474 |
} |
} |
| 475 |
|
|
| 476 |
## reconsume |
## reconsume |
| 477 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
|
$self->{s_kwd} = ''; |
|
| 478 |
!!!emit ({type => CHARACTER_TOKEN, data => '<', |
!!!emit ({type => CHARACTER_TOKEN, data => '<', |
| 479 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 480 |
column => $self->{column_prev}, |
column => $self->{column_prev}, |
| 585 |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
| 586 |
if (defined $self->{last_stag_name}) { |
if (defined $self->{last_stag_name}) { |
| 587 |
$self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE; |
$self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE; |
| 588 |
$self->{s_kwd} = ''; |
$self->{kwd} = ''; |
| 589 |
## Reconsume. |
## Reconsume. |
| 590 |
redo A; |
redo A; |
| 591 |
} else { |
} else { |
| 688 |
redo A; |
redo A; |
| 689 |
} |
} |
| 690 |
} elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) { |
} elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) { |
| 691 |
my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1; |
my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1; |
| 692 |
if (length $ch) { |
if (length $ch) { |
| 693 |
my $CH = $ch; |
my $CH = $ch; |
| 694 |
$ch =~ tr/a-z/A-Z/; |
$ch =~ tr/a-z/A-Z/; |
| 696 |
if ($nch eq $ch or $nch eq $CH) { |
if ($nch eq $ch or $nch eq $CH) { |
| 697 |
!!!cp (24); |
!!!cp (24); |
| 698 |
## Stay in the state. |
## Stay in the state. |
| 699 |
$self->{s_kwd} .= $nch; |
$self->{kwd} .= $nch; |
| 700 |
!!!next-input-character; |
!!!next-input-character; |
| 701 |
redo A; |
redo A; |
| 702 |
} else { |
} else { |
| 705 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 706 |
## Reconsume. |
## Reconsume. |
| 707 |
!!!emit ({type => CHARACTER_TOKEN, |
!!!emit ({type => CHARACTER_TOKEN, |
| 708 |
data => '</' . $self->{s_kwd}, |
data => '</' . $self->{kwd}, |
| 709 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 710 |
column => $self->{column_prev} - 1 - length $self->{s_kwd}, |
column => $self->{column_prev} - 1 - length $self->{kwd}, |
| 711 |
}); |
}); |
| 712 |
redo A; |
redo A; |
| 713 |
} |
} |
| 723 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 724 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 725 |
!!!emit ({type => CHARACTER_TOKEN, |
!!!emit ({type => CHARACTER_TOKEN, |
| 726 |
data => '</' . $self->{s_kwd}, |
data => '</' . $self->{kwd}, |
| 727 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 728 |
column => $self->{column_prev} - 1 - length $self->{s_kwd}, |
column => $self->{column_prev} - 1 - length $self->{kwd}, |
| 729 |
}); |
}); |
| 730 |
redo A; |
redo A; |
| 731 |
} else { |
} else { |
| 734 |
= {type => END_TAG_TOKEN, |
= {type => END_TAG_TOKEN, |
| 735 |
tag_name => $self->{last_stag_name}, |
tag_name => $self->{last_stag_name}, |
| 736 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 737 |
column => $self->{column_prev} - 1 - length $self->{s_kwd}}; |
column => $self->{column_prev} - 1 - length $self->{kwd}}; |
| 738 |
$self->{state} = TAG_NAME_STATE; |
$self->{state} = TAG_NAME_STATE; |
| 739 |
## Reconsume. |
## Reconsume. |
| 740 |
redo A; |
redo A; |
| 1608 |
## ASCII case-insensitive. |
## ASCII case-insensitive. |
| 1609 |
!!!cp (130); |
!!!cp (130); |
| 1610 |
$self->{state} = MD_DOCTYPE_STATE; |
$self->{state} = MD_DOCTYPE_STATE; |
| 1611 |
$self->{s_kwd} = chr $self->{nc}; |
$self->{kwd} = chr $self->{nc}; |
| 1612 |
!!!next-input-character; |
!!!next-input-character; |
| 1613 |
redo A; |
redo A; |
| 1614 |
} elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and |
} elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and |
| 1617 |
$self->{nc} == 0x005B) { # [ |
$self->{nc} == 0x005B) { # [ |
| 1618 |
!!!cp (135.4); |
!!!cp (135.4); |
| 1619 |
$self->{state} = MD_CDATA_STATE; |
$self->{state} = MD_CDATA_STATE; |
| 1620 |
$self->{s_kwd} = '['; |
$self->{kwd} = '['; |
| 1621 |
!!!next-input-character; |
!!!next-input-character; |
| 1622 |
redo A; |
redo A; |
| 1623 |
} else { |
} else { |
| 1667 |
0x0054, # T |
0x0054, # T |
| 1668 |
0x0059, # Y |
0x0059, # Y |
| 1669 |
0x0050, # P |
0x0050, # P |
| 1670 |
]->[length $self->{s_kwd}] or |
]->[length $self->{kwd}] or |
| 1671 |
$self->{nc} == [ |
$self->{nc} == [ |
| 1672 |
undef, |
undef, |
| 1673 |
0x006F, # o |
0x006F, # o |
| 1675 |
0x0074, # t |
0x0074, # t |
| 1676 |
0x0079, # y |
0x0079, # y |
| 1677 |
0x0070, # p |
0x0070, # p |
| 1678 |
]->[length $self->{s_kwd}]) { |
]->[length $self->{kwd}]) { |
| 1679 |
!!!cp (131); |
!!!cp (131); |
| 1680 |
## Stay in the state. |
## Stay in the state. |
| 1681 |
$self->{s_kwd} .= chr $self->{nc}; |
$self->{kwd} .= chr $self->{nc}; |
| 1682 |
!!!next-input-character; |
!!!next-input-character; |
| 1683 |
redo A; |
redo A; |
| 1684 |
} elsif ((length $self->{s_kwd}) == 6 and |
} elsif ((length $self->{kwd}) == 6 and |
| 1685 |
($self->{nc} == 0x0045 or # E |
($self->{nc} == 0x0045 or # E |
| 1686 |
$self->{nc} == 0x0065)) { # e |
$self->{nc} == 0x0065)) { # e |
| 1687 |
if ($self->{s_kwd} ne 'DOCTYP') { |
if ($self->{is_xml} and |
| 1688 |
|
($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) { |
| 1689 |
!!!cp (129); |
!!!cp (129); |
| 1690 |
## XML5: case-sensitive. |
## XML5: case-sensitive. |
| 1691 |
!!!parse-error (type => 'lowercase keyword', ## TODO |
!!!parse-error (type => 'lowercase keyword', ## TODO |
| 1707 |
!!!cp (132); |
!!!cp (132); |
| 1708 |
!!!parse-error (type => 'bogus comment', |
!!!parse-error (type => 'bogus comment', |
| 1709 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 1710 |
column => $self->{column_prev} - 1 - length $self->{s_kwd}); |
column => $self->{column_prev} - 1 - length $self->{kwd}); |
| 1711 |
$self->{state} = BOGUS_COMMENT_STATE; |
$self->{state} = BOGUS_COMMENT_STATE; |
| 1712 |
## Reconsume. |
## Reconsume. |
| 1713 |
$self->{ct} = {type => COMMENT_TOKEN, |
$self->{ct} = {type => COMMENT_TOKEN, |
| 1714 |
data => $self->{s_kwd}, |
data => $self->{kwd}, |
| 1715 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 1716 |
column => $self->{column_prev} - 1 - length $self->{s_kwd}, |
column => $self->{column_prev} - 1 - length $self->{kwd}, |
| 1717 |
}; |
}; |
| 1718 |
redo A; |
redo A; |
| 1719 |
} |
} |
| 1724 |
'[CD' => 0x0041, # A |
'[CD' => 0x0041, # A |
| 1725 |
'[CDA' => 0x0054, # T |
'[CDA' => 0x0054, # T |
| 1726 |
'[CDAT' => 0x0041, # A |
'[CDAT' => 0x0041, # A |
| 1727 |
}->{$self->{s_kwd}}) { |
}->{$self->{kwd}}) { |
| 1728 |
!!!cp (135.1); |
!!!cp (135.1); |
| 1729 |
## Stay in the state. |
## Stay in the state. |
| 1730 |
$self->{s_kwd} .= chr $self->{nc}; |
$self->{kwd} .= chr $self->{nc}; |
| 1731 |
!!!next-input-character; |
!!!next-input-character; |
| 1732 |
redo A; |
redo A; |
| 1733 |
} elsif ($self->{s_kwd} eq '[CDATA' and |
} elsif ($self->{kwd} eq '[CDATA' and |
| 1734 |
$self->{nc} == 0x005B) { # [ |
$self->{nc} == 0x005B) { # [ |
| 1735 |
if ($self->{is_xml} and |
if ($self->{is_xml} and |
| 1736 |
not $self->{tainted} and |
not $self->{tainted} and |
| 1755 |
!!!cp (135.3); |
!!!cp (135.3); |
| 1756 |
!!!parse-error (type => 'bogus comment', |
!!!parse-error (type => 'bogus comment', |
| 1757 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 1758 |
column => $self->{column_prev} - 1 - length $self->{s_kwd}); |
column => $self->{column_prev} - 1 - length $self->{kwd}); |
| 1759 |
$self->{state} = BOGUS_COMMENT_STATE; |
$self->{state} = BOGUS_COMMENT_STATE; |
| 1760 |
## Reconsume. |
## Reconsume. |
| 1761 |
$self->{ct} = {type => COMMENT_TOKEN, |
$self->{ct} = {type => COMMENT_TOKEN, |
| 1762 |
data => $self->{s_kwd}, |
data => $self->{kwd}, |
| 1763 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 1764 |
column => $self->{column_prev} - 1 - length $self->{s_kwd}, |
column => $self->{column_prev} - 1 - length $self->{kwd}, |
| 1765 |
}; |
}; |
| 1766 |
redo A; |
redo A; |
| 1767 |
} |
} |
| 1871 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
| 1872 |
!!!cp (149); |
!!!cp (149); |
| 1873 |
!!!parse-error (type => 'unclosed comment'); |
!!!parse-error (type => 'unclosed comment'); |
|
$self->{s_kwd} = ''; |
|
| 1874 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1875 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 1876 |
## reconsume |
## reconsume |
| 1934 |
redo A; |
redo A; |
| 1935 |
} else { |
} else { |
| 1936 |
!!!cp (156); |
!!!cp (156); |
| 1937 |
|
## XML5: Unless EOF, swith to the bogus comment state. |
| 1938 |
!!!parse-error (type => 'no space before DOCTYPE name'); |
!!!parse-error (type => 'no space before DOCTYPE name'); |
| 1939 |
$self->{state} = BEFORE_DOCTYPE_NAME_STATE; |
$self->{state} = BEFORE_DOCTYPE_NAME_STATE; |
| 1940 |
## reconsume |
## reconsume |
| 1941 |
redo A; |
redo A; |
| 1942 |
} |
} |
| 1943 |
} elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) { |
} elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) { |
| 1944 |
|
## XML5: "DOCTYPE root name before state". |
| 1945 |
|
|
| 1946 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 1947 |
!!!cp (157); |
!!!cp (157); |
| 1948 |
## Stay in the state |
## Stay in the state |
| 1950 |
redo A; |
redo A; |
| 1951 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
| 1952 |
!!!cp (158); |
!!!cp (158); |
| 1953 |
|
## XML5: No parse error. |
| 1954 |
!!!parse-error (type => 'no DOCTYPE name'); |
!!!parse-error (type => 'no DOCTYPE name'); |
| 1955 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1956 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 1969 |
!!!emit ($self->{ct}); # DOCTYPE (quirks) |
!!!emit ($self->{ct}); # DOCTYPE (quirks) |
| 1970 |
|
|
| 1971 |
redo A; |
redo A; |
| 1972 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 1973 |
|
!!!cp (159.1); |
| 1974 |
|
!!!parse-error (type => 'no DOCTYPE name'); |
| 1975 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 1976 |
|
!!!next-input-character; |
| 1977 |
|
redo A; |
| 1978 |
} else { |
} else { |
| 1979 |
!!!cp (160); |
!!!cp (160); |
| 1980 |
$self->{ct}->{name} = chr $self->{nc}; |
$self->{ct}->{name} = chr $self->{nc}; |
| 1984 |
redo A; |
redo A; |
| 1985 |
} |
} |
| 1986 |
} elsif ($self->{state} == DOCTYPE_NAME_STATE) { |
} elsif ($self->{state} == DOCTYPE_NAME_STATE) { |
| 1987 |
## ISSUE: Redundant "First," in the spec. |
## XML5: "DOCTYPE root name state". |
| 1988 |
|
|
| 1989 |
|
## ISSUE: Redundant "First," in the spec. |
| 1990 |
|
|
| 1991 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 1992 |
!!!cp (161); |
!!!cp (161); |
| 1993 |
$self->{state} = AFTER_DOCTYPE_NAME_STATE; |
$self->{state} = AFTER_DOCTYPE_NAME_STATE; |
| 2013 |
!!!emit ($self->{ct}); # DOCTYPE |
!!!emit ($self->{ct}); # DOCTYPE |
| 2014 |
|
|
| 2015 |
redo A; |
redo A; |
| 2016 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 2017 |
|
!!!cp (163.1); |
| 2018 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2019 |
|
!!!next-input-character; |
| 2020 |
|
redo A; |
| 2021 |
} else { |
} else { |
| 2022 |
!!!cp (164); |
!!!cp (164); |
| 2023 |
$self->{ct}->{name} |
$self->{ct}->{name} |
| 2027 |
redo A; |
redo A; |
| 2028 |
} |
} |
| 2029 |
} elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) { |
} elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) { |
| 2030 |
|
## XML5: Corresponding to XML5's "DOCTYPE root name after |
| 2031 |
|
## state", but implemented differently. |
| 2032 |
|
|
| 2033 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 2034 |
!!!cp (165); |
!!!cp (165); |
| 2035 |
## Stay in the state |
## Stay in the state |
| 2057 |
redo A; |
redo A; |
| 2058 |
} elsif ($self->{nc} == 0x0050 or # P |
} elsif ($self->{nc} == 0x0050 or # P |
| 2059 |
$self->{nc} == 0x0070) { # p |
$self->{nc} == 0x0070) { # p |
| 2060 |
|
!!!cp (167.1); |
| 2061 |
$self->{state} = PUBLIC_STATE; |
$self->{state} = PUBLIC_STATE; |
| 2062 |
$self->{s_kwd} = chr $self->{nc}; |
$self->{kwd} = chr $self->{nc}; |
| 2063 |
!!!next-input-character; |
!!!next-input-character; |
| 2064 |
redo A; |
redo A; |
| 2065 |
} elsif ($self->{nc} == 0x0053 or # S |
} elsif ($self->{nc} == 0x0053 or # S |
| 2066 |
$self->{nc} == 0x0073) { # s |
$self->{nc} == 0x0073) { # s |
| 2067 |
|
!!!cp (167.2); |
| 2068 |
$self->{state} = SYSTEM_STATE; |
$self->{state} = SYSTEM_STATE; |
| 2069 |
$self->{s_kwd} = chr $self->{nc}; |
$self->{kwd} = chr $self->{nc}; |
| 2070 |
|
!!!next-input-character; |
| 2071 |
|
redo A; |
| 2072 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 2073 |
|
!!!cp (167.3); |
| 2074 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2075 |
|
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE |
| 2076 |
!!!next-input-character; |
!!!next-input-character; |
| 2077 |
redo A; |
redo A; |
| 2078 |
} else { |
} else { |
| 2092 |
0x0042, # B |
0x0042, # B |
| 2093 |
0x004C, # L |
0x004C, # L |
| 2094 |
0x0049, # I |
0x0049, # I |
| 2095 |
]->[length $self->{s_kwd}] or |
]->[length $self->{kwd}] or |
| 2096 |
$self->{nc} == [ |
$self->{nc} == [ |
| 2097 |
undef, |
undef, |
| 2098 |
0x0075, # u |
0x0075, # u |
| 2099 |
0x0062, # b |
0x0062, # b |
| 2100 |
0x006C, # l |
0x006C, # l |
| 2101 |
0x0069, # i |
0x0069, # i |
| 2102 |
]->[length $self->{s_kwd}]) { |
]->[length $self->{kwd}]) { |
| 2103 |
!!!cp (175); |
!!!cp (175); |
| 2104 |
## Stay in the state. |
## Stay in the state. |
| 2105 |
$self->{s_kwd} .= chr $self->{nc}; |
$self->{kwd} .= chr $self->{nc}; |
| 2106 |
!!!next-input-character; |
!!!next-input-character; |
| 2107 |
redo A; |
redo A; |
| 2108 |
} elsif ((length $self->{s_kwd}) == 5 and |
} elsif ((length $self->{kwd}) == 5 and |
| 2109 |
($self->{nc} == 0x0043 or # C |
($self->{nc} == 0x0043 or # C |
| 2110 |
$self->{nc} == 0x0063)) { # c |
$self->{nc} == 0x0063)) { # c |
| 2111 |
!!!cp (168); |
if ($self->{is_xml} and |
| 2112 |
|
($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c |
| 2113 |
|
!!!cp (168.1); |
| 2114 |
|
!!!parse-error (type => 'lowercase keyword', ## TODO: type |
| 2115 |
|
text => 'PUBLIC', |
| 2116 |
|
line => $self->{line_prev}, |
| 2117 |
|
column => $self->{column_prev} - 4); |
| 2118 |
|
} else { |
| 2119 |
|
!!!cp (168); |
| 2120 |
|
} |
| 2121 |
$self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE; |
$self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE; |
| 2122 |
!!!next-input-character; |
!!!next-input-character; |
| 2123 |
redo A; |
redo A; |
| 2125 |
!!!cp (169); |
!!!cp (169); |
| 2126 |
!!!parse-error (type => 'string after DOCTYPE name', |
!!!parse-error (type => 'string after DOCTYPE name', |
| 2127 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2128 |
column => $self->{column_prev} + 1 - length $self->{s_kwd}); |
column => $self->{column_prev} + 1 - length $self->{kwd}); |
| 2129 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
| 2130 |
|
|
| 2131 |
$self->{state} = BOGUS_DOCTYPE_STATE; |
$self->{state} = BOGUS_DOCTYPE_STATE; |
| 2140 |
0x0053, # S |
0x0053, # S |
| 2141 |
0x0054, # T |
0x0054, # T |
| 2142 |
0x0045, # E |
0x0045, # E |
| 2143 |
]->[length $self->{s_kwd}] or |
]->[length $self->{kwd}] or |
| 2144 |
$self->{nc} == [ |
$self->{nc} == [ |
| 2145 |
undef, |
undef, |
| 2146 |
0x0079, # y |
0x0079, # y |
| 2147 |
0x0073, # s |
0x0073, # s |
| 2148 |
0x0074, # t |
0x0074, # t |
| 2149 |
0x0065, # e |
0x0065, # e |
| 2150 |
]->[length $self->{s_kwd}]) { |
]->[length $self->{kwd}]) { |
| 2151 |
!!!cp (170); |
!!!cp (170); |
| 2152 |
## Stay in the state. |
## Stay in the state. |
| 2153 |
$self->{s_kwd} .= chr $self->{nc}; |
$self->{kwd} .= chr $self->{nc}; |
| 2154 |
!!!next-input-character; |
!!!next-input-character; |
| 2155 |
redo A; |
redo A; |
| 2156 |
} elsif ((length $self->{s_kwd}) == 5 and |
} elsif ((length $self->{kwd}) == 5 and |
| 2157 |
($self->{nc} == 0x004D or # M |
($self->{nc} == 0x004D or # M |
| 2158 |
$self->{nc} == 0x006D)) { # m |
$self->{nc} == 0x006D)) { # m |
| 2159 |
!!!cp (171); |
if ($self->{is_xml} and |
| 2160 |
|
($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m |
| 2161 |
|
!!!cp (171.1); |
| 2162 |
|
!!!parse-error (type => 'lowercase keyword', ## TODO: type |
| 2163 |
|
text => 'SYSTEM', |
| 2164 |
|
line => $self->{line_prev}, |
| 2165 |
|
column => $self->{column_prev} - 4); |
| 2166 |
|
} else { |
| 2167 |
|
!!!cp (171); |
| 2168 |
|
} |
| 2169 |
$self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE; |
$self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE; |
| 2170 |
!!!next-input-character; |
!!!next-input-character; |
| 2171 |
redo A; |
redo A; |
| 2173 |
!!!cp (172); |
!!!cp (172); |
| 2174 |
!!!parse-error (type => 'string after DOCTYPE name', |
!!!parse-error (type => 'string after DOCTYPE name', |
| 2175 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2176 |
column => $self->{column_prev} + 1 - length $self->{s_kwd}); |
column => $self->{column_prev} + 1 - length $self->{kwd}); |
| 2177 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
| 2178 |
|
|
| 2179 |
$self->{state} = BOGUS_DOCTYPE_STATE; |
$self->{state} = BOGUS_DOCTYPE_STATE; |
| 2222 |
!!!emit ($self->{ct}); # DOCTYPE |
!!!emit ($self->{ct}); # DOCTYPE |
| 2223 |
|
|
| 2224 |
redo A; |
redo A; |
| 2225 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 2226 |
|
!!!cp (186.1); |
| 2227 |
|
!!!parse-error (type => 'no PUBLIC literal'); |
| 2228 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2229 |
|
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE |
| 2230 |
|
!!!next-input-character; |
| 2231 |
|
redo A; |
| 2232 |
} else { |
} else { |
| 2233 |
!!!cp (186); |
!!!cp (186); |
| 2234 |
!!!parse-error (type => 'string after PUBLIC'); |
!!!parse-error (type => 'string after PUBLIC'); |
| 2339 |
!!!next-input-character; |
!!!next-input-character; |
| 2340 |
redo A; |
redo A; |
| 2341 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
| 2342 |
!!!cp (198); |
if ($self->{is_xml}) { |
| 2343 |
|
!!!cp (198.1); |
| 2344 |
|
!!!parse-error (type => 'no SYSTEM literal'); |
| 2345 |
|
} else { |
| 2346 |
|
!!!cp (198); |
| 2347 |
|
} |
| 2348 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2349 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 2350 |
!!!next-input-character; |
!!!next-input-character; |
| 2364 |
!!!emit ($self->{ct}); # DOCTYPE |
!!!emit ($self->{ct}); # DOCTYPE |
| 2365 |
|
|
| 2366 |
redo A; |
redo A; |
| 2367 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 2368 |
|
!!!cp (200.1); |
| 2369 |
|
!!!parse-error (type => 'no SYSTEM literal'); |
| 2370 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2371 |
|
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE |
| 2372 |
|
!!!next-input-character; |
| 2373 |
|
redo A; |
| 2374 |
} else { |
} else { |
| 2375 |
!!!cp (200); |
!!!cp (200); |
| 2376 |
!!!parse-error (type => 'string after PUBLIC literal'); |
!!!parse-error (type => 'string after PUBLIC literal'); |
| 2421 |
!!!emit ($self->{ct}); # DOCTYPE |
!!!emit ($self->{ct}); # DOCTYPE |
| 2422 |
|
|
| 2423 |
redo A; |
redo A; |
| 2424 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 2425 |
|
!!!cp (206.1); |
| 2426 |
|
!!!parse-error (type => 'no SYSTEM literal'); |
| 2427 |
|
|
| 2428 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2429 |
|
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE |
| 2430 |
|
!!!next-input-character; |
| 2431 |
|
redo A; |
| 2432 |
} else { |
} else { |
| 2433 |
!!!cp (206); |
!!!cp (206); |
| 2434 |
!!!parse-error (type => 'string after SYSTEM'); |
!!!parse-error (type => 'string after SYSTEM'); |
| 2444 |
$self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; |
$self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; |
| 2445 |
!!!next-input-character; |
!!!next-input-character; |
| 2446 |
redo A; |
redo A; |
| 2447 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # > |
| 2448 |
!!!cp (208); |
!!!cp (208); |
| 2449 |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
| 2450 |
|
|
| 2485 |
$self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; |
$self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; |
| 2486 |
!!!next-input-character; |
!!!next-input-character; |
| 2487 |
redo A; |
redo A; |
| 2488 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # > |
| 2489 |
!!!cp (212); |
!!!cp (212); |
| 2490 |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
| 2491 |
|
|
| 2546 |
!!!emit ($self->{ct}); # DOCTYPE |
!!!emit ($self->{ct}); # DOCTYPE |
| 2547 |
|
|
| 2548 |
redo A; |
redo A; |
| 2549 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 2550 |
|
!!!cp (218.1); |
| 2551 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2552 |
|
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE |
| 2553 |
|
!!!next-input-character; |
| 2554 |
|
redo A; |
| 2555 |
} else { |
} else { |
| 2556 |
!!!cp (218); |
!!!cp (218); |
| 2557 |
!!!parse-error (type => 'string after SYSTEM literal'); |
!!!parse-error (type => 'string after SYSTEM literal'); |
| 2571 |
!!!emit ($self->{ct}); # DOCTYPE |
!!!emit ($self->{ct}); # DOCTYPE |
| 2572 |
|
|
| 2573 |
redo A; |
redo A; |
| 2574 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 2575 |
|
if ($self->{ct}->{has_internal_subset}) { # DOCTYPE |
| 2576 |
|
!!!cp (220.2); |
| 2577 |
|
## Stay in the state. |
| 2578 |
|
!!!next-input-character; |
| 2579 |
|
redo A; |
| 2580 |
|
} else { |
| 2581 |
|
!!!cp (220.1); |
| 2582 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2583 |
|
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE |
| 2584 |
|
!!!next-input-character; |
| 2585 |
|
redo A; |
| 2586 |
|
} |
| 2587 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
| 2588 |
!!!cp (220); |
!!!cp (220); |
| 2589 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2596 |
} else { |
} else { |
| 2597 |
!!!cp (221); |
!!!cp (221); |
| 2598 |
my $s = ''; |
my $s = ''; |
| 2599 |
$self->{read_until}->($s, q[>], 0); |
$self->{read_until}->($s, q{>[}, 0); |
| 2600 |
|
|
| 2601 |
## Stay in the state |
## Stay in the state |
| 2602 |
!!!next-input-character; |
!!!next-input-character; |
| 2704 |
} elsif ($self->{nc} == 0x0023) { # # |
} elsif ($self->{nc} == 0x0023) { # # |
| 2705 |
!!!cp (999); |
!!!cp (999); |
| 2706 |
$self->{state} = ENTITY_HASH_STATE; |
$self->{state} = ENTITY_HASH_STATE; |
| 2707 |
$self->{s_kwd} = '#'; |
$self->{kwd} = '#'; |
| 2708 |
!!!next-input-character; |
!!!next-input-character; |
| 2709 |
redo A; |
redo A; |
| 2710 |
} elsif ((0x0041 <= $self->{nc} and |
} elsif ((0x0041 <= $self->{nc} and |
| 2714 |
!!!cp (998); |
!!!cp (998); |
| 2715 |
require Whatpm::_NamedEntityList; |
require Whatpm::_NamedEntityList; |
| 2716 |
$self->{state} = ENTITY_NAME_STATE; |
$self->{state} = ENTITY_NAME_STATE; |
| 2717 |
$self->{s_kwd} = chr $self->{nc}; |
$self->{kwd} = chr $self->{nc}; |
| 2718 |
$self->{entity__value} = $self->{s_kwd}; |
$self->{entity__value} = $self->{kwd}; |
| 2719 |
$self->{entity__match} = 0; |
$self->{entity__match} = 0; |
| 2720 |
!!!next-input-character; |
!!!next-input-character; |
| 2721 |
redo A; |
redo A; |
| 2755 |
$self->{nc} == 0x0058) { # X |
$self->{nc} == 0x0058) { # X |
| 2756 |
!!!cp (995); |
!!!cp (995); |
| 2757 |
$self->{state} = HEXREF_X_STATE; |
$self->{state} = HEXREF_X_STATE; |
| 2758 |
$self->{s_kwd} .= chr $self->{nc}; |
$self->{kwd} .= chr $self->{nc}; |
| 2759 |
!!!next-input-character; |
!!!next-input-character; |
| 2760 |
redo A; |
redo A; |
| 2761 |
} elsif (0x0030 <= $self->{nc} and |
} elsif (0x0030 <= $self->{nc} and |
| 2762 |
$self->{nc} <= 0x0039) { # 0..9 |
$self->{nc} <= 0x0039) { # 0..9 |
| 2763 |
!!!cp (994); |
!!!cp (994); |
| 2764 |
$self->{state} = NCR_NUM_STATE; |
$self->{state} = NCR_NUM_STATE; |
| 2765 |
$self->{s_kwd} = $self->{nc} - 0x0030; |
$self->{kwd} = $self->{nc} - 0x0030; |
| 2766 |
!!!next-input-character; |
!!!next-input-character; |
| 2767 |
redo A; |
redo A; |
| 2768 |
} else { |
} else { |
| 2798 |
if (0x0030 <= $self->{nc} and |
if (0x0030 <= $self->{nc} and |
| 2799 |
$self->{nc} <= 0x0039) { # 0..9 |
$self->{nc} <= 0x0039) { # 0..9 |
| 2800 |
!!!cp (1012); |
!!!cp (1012); |
| 2801 |
$self->{s_kwd} *= 10; |
$self->{kwd} *= 10; |
| 2802 |
$self->{s_kwd} += $self->{nc} - 0x0030; |
$self->{kwd} += $self->{nc} - 0x0030; |
| 2803 |
|
|
| 2804 |
## Stay in the state. |
## Stay in the state. |
| 2805 |
!!!next-input-character; |
!!!next-input-character; |
| 2815 |
# |
# |
| 2816 |
} |
} |
| 2817 |
|
|
| 2818 |
my $code = $self->{s_kwd}; |
my $code = $self->{kwd}; |
| 2819 |
my $l = $self->{line_prev}; |
my $l = $self->{line_prev}; |
| 2820 |
my $c = $self->{column_prev}; |
my $c = $self->{column_prev}; |
| 2821 |
if ($charref_map->{$code}) { |
if ($charref_map->{$code}) { |
| 2858 |
# 0..9, A..F, a..f |
# 0..9, A..F, a..f |
| 2859 |
!!!cp (990); |
!!!cp (990); |
| 2860 |
$self->{state} = HEXREF_HEX_STATE; |
$self->{state} = HEXREF_HEX_STATE; |
| 2861 |
$self->{s_kwd} = 0; |
$self->{kwd} = 0; |
| 2862 |
## Reconsume. |
## Reconsume. |
| 2863 |
redo A; |
redo A; |
| 2864 |
} else { |
} else { |
| 2876 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 2877 |
## Reconsume. |
## Reconsume. |
| 2878 |
!!!emit ({type => CHARACTER_TOKEN, |
!!!emit ({type => CHARACTER_TOKEN, |
| 2879 |
data => '&' . $self->{s_kwd}, |
data => '&' . $self->{kwd}, |
| 2880 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2881 |
column => $self->{column_prev} - length $self->{s_kwd}, |
column => $self->{column_prev} - length $self->{kwd}, |
| 2882 |
}); |
}); |
| 2883 |
redo A; |
redo A; |
| 2884 |
} else { |
} else { |
| 2885 |
!!!cp (989); |
!!!cp (989); |
| 2886 |
$self->{ca}->{value} .= '&' . $self->{s_kwd}; |
$self->{ca}->{value} .= '&' . $self->{kwd}; |
| 2887 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
| 2888 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 2889 |
## Reconsume. |
## Reconsume. |
| 2894 |
if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) { |
if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) { |
| 2895 |
# 0..9 |
# 0..9 |
| 2896 |
!!!cp (1002); |
!!!cp (1002); |
| 2897 |
$self->{s_kwd} *= 0x10; |
$self->{kwd} *= 0x10; |
| 2898 |
$self->{s_kwd} += $self->{nc} - 0x0030; |
$self->{kwd} += $self->{nc} - 0x0030; |
| 2899 |
## Stay in the state. |
## Stay in the state. |
| 2900 |
!!!next-input-character; |
!!!next-input-character; |
| 2901 |
redo A; |
redo A; |
| 2902 |
} elsif (0x0061 <= $self->{nc} and |
} elsif (0x0061 <= $self->{nc} and |
| 2903 |
$self->{nc} <= 0x0066) { # a..f |
$self->{nc} <= 0x0066) { # a..f |
| 2904 |
!!!cp (1003); |
!!!cp (1003); |
| 2905 |
$self->{s_kwd} *= 0x10; |
$self->{kwd} *= 0x10; |
| 2906 |
$self->{s_kwd} += $self->{nc} - 0x0060 + 9; |
$self->{kwd} += $self->{nc} - 0x0060 + 9; |
| 2907 |
## Stay in the state. |
## Stay in the state. |
| 2908 |
!!!next-input-character; |
!!!next-input-character; |
| 2909 |
redo A; |
redo A; |
| 2910 |
} elsif (0x0041 <= $self->{nc} and |
} elsif (0x0041 <= $self->{nc} and |
| 2911 |
$self->{nc} <= 0x0046) { # A..F |
$self->{nc} <= 0x0046) { # A..F |
| 2912 |
!!!cp (1004); |
!!!cp (1004); |
| 2913 |
$self->{s_kwd} *= 0x10; |
$self->{kwd} *= 0x10; |
| 2914 |
$self->{s_kwd} += $self->{nc} - 0x0040 + 9; |
$self->{kwd} += $self->{nc} - 0x0040 + 9; |
| 2915 |
## Stay in the state. |
## Stay in the state. |
| 2916 |
!!!next-input-character; |
!!!next-input-character; |
| 2917 |
redo A; |
redo A; |
| 2928 |
# |
# |
| 2929 |
} |
} |
| 2930 |
|
|
| 2931 |
my $code = $self->{s_kwd}; |
my $code = $self->{kwd}; |
| 2932 |
my $l = $self->{line_prev}; |
my $l = $self->{line_prev}; |
| 2933 |
my $c = $self->{column_prev}; |
my $c = $self->{column_prev}; |
| 2934 |
if ($charref_map->{$code}) { |
if ($charref_map->{$code}) { |
| 2965 |
redo A; |
redo A; |
| 2966 |
} |
} |
| 2967 |
} elsif ($self->{state} == ENTITY_NAME_STATE) { |
} elsif ($self->{state} == ENTITY_NAME_STATE) { |
| 2968 |
if (length $self->{s_kwd} < 30 and |
if (length $self->{kwd} < 30 and |
| 2969 |
## NOTE: Some number greater than the maximum length of entity name |
## NOTE: Some number greater than the maximum length of entity name |
| 2970 |
((0x0041 <= $self->{nc} and # a |
((0x0041 <= $self->{nc} and # a |
| 2971 |
$self->{nc} <= 0x005A) or # x |
$self->{nc} <= 0x005A) or # x |
| 2975 |
$self->{nc} <= 0x0039) or # 9 |
$self->{nc} <= 0x0039) or # 9 |
| 2976 |
$self->{nc} == 0x003B)) { # ; |
$self->{nc} == 0x003B)) { # ; |
| 2977 |
our $EntityChar; |
our $EntityChar; |
| 2978 |
$self->{s_kwd} .= chr $self->{nc}; |
$self->{kwd} .= chr $self->{nc}; |
| 2979 |
if (defined $EntityChar->{$self->{s_kwd}}) { |
if (defined $EntityChar->{$self->{kwd}}) { |
| 2980 |
if ($self->{nc} == 0x003B) { # ; |
if ($self->{nc} == 0x003B) { # ; |
| 2981 |
!!!cp (1020); |
!!!cp (1020); |
| 2982 |
$self->{entity__value} = $EntityChar->{$self->{s_kwd}}; |
$self->{entity__value} = $EntityChar->{$self->{kwd}}; |
| 2983 |
$self->{entity__match} = 1; |
$self->{entity__match} = 1; |
| 2984 |
!!!next-input-character; |
!!!next-input-character; |
| 2985 |
# |
# |
| 2986 |
} else { |
} else { |
| 2987 |
!!!cp (1021); |
!!!cp (1021); |
| 2988 |
$self->{entity__value} = $EntityChar->{$self->{s_kwd}}; |
$self->{entity__value} = $EntityChar->{$self->{kwd}}; |
| 2989 |
$self->{entity__match} = -1; |
$self->{entity__match} = -1; |
| 2990 |
## Stay in the state. |
## Stay in the state. |
| 2991 |
!!!next-input-character; |
!!!next-input-character; |
| 3013 |
if ($self->{prev_state} != DATA_STATE and # in attribute |
if ($self->{prev_state} != DATA_STATE and # in attribute |
| 3014 |
$self->{entity__match} < -1) { |
$self->{entity__match} < -1) { |
| 3015 |
!!!cp (1024); |
!!!cp (1024); |
| 3016 |
$data = '&' . $self->{s_kwd}; |
$data = '&' . $self->{kwd}; |
| 3017 |
# |
# |
| 3018 |
} else { |
} else { |
| 3019 |
!!!cp (1025); |
!!!cp (1025); |
| 3025 |
!!!cp (1026); |
!!!cp (1026); |
| 3026 |
!!!parse-error (type => 'bare ero', |
!!!parse-error (type => 'bare ero', |
| 3027 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 3028 |
column => $self->{column_prev} - length $self->{s_kwd}); |
column => $self->{column_prev} - length $self->{kwd}); |
| 3029 |
$data = '&' . $self->{s_kwd}; |
$data = '&' . $self->{kwd}; |
| 3030 |
# |
# |
| 3031 |
} |
} |
| 3032 |
|
|
| 3049 |
data => $data, |
data => $data, |
| 3050 |
has_reference => $has_ref, |
has_reference => $has_ref, |
| 3051 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 3052 |
column => $self->{column_prev} + 1 - length $self->{s_kwd}, |
column => $self->{column_prev} + 1 - length $self->{kwd}, |
| 3053 |
}); |
}); |
| 3054 |
redo A; |
redo A; |
| 3055 |
} else { |
} else { |
| 3189 |
## Reprocess. |
## Reprocess. |
| 3190 |
redo A; |
redo A; |
| 3191 |
} |
} |
| 3192 |
|
|
| 3193 |
|
} elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) { |
| 3194 |
|
if ($self->{nc} == 0x003C) { # < |
| 3195 |
|
## TODO: |
| 3196 |
|
!!!next-input-character; |
| 3197 |
|
redo A; |
| 3198 |
|
} elsif ($self->{nc} == 0x0025) { # % |
| 3199 |
|
## XML5: Not defined yet. |
| 3200 |
|
|
| 3201 |
|
## TODO: |
| 3202 |
|
!!!next-input-character; |
| 3203 |
|
redo A; |
| 3204 |
|
} elsif ($self->{nc} == 0x005D) { # ] |
| 3205 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE; |
| 3206 |
|
!!!next-input-character; |
| 3207 |
|
redo A; |
| 3208 |
|
} elsif ($is_space->{$self->{nc}}) { |
| 3209 |
|
## Stay in the state. |
| 3210 |
|
!!!next-input-character; |
| 3211 |
|
redo A; |
| 3212 |
|
} elsif ($self->{nc} == -1) { |
| 3213 |
|
!!!parse-error (type => 'unclosed internal subset'); ## TODO: type |
| 3214 |
|
$self->{state} = DATA_STATE; |
| 3215 |
|
$self->{s_kwd} = ''; |
| 3216 |
|
## Reconsume. |
| 3217 |
|
!!!emit ($self->{ct}); # DOCTYPE |
| 3218 |
|
redo A; |
| 3219 |
|
} else { |
| 3220 |
|
unless ($self->{internal_subset_tainted}) { |
| 3221 |
|
## XML5: No parse error. |
| 3222 |
|
!!!parse-error (type => 'string in internal subset'); |
| 3223 |
|
$self->{internal_subset_tainted} = 1; |
| 3224 |
|
} |
| 3225 |
|
## Stay in the state. |
| 3226 |
|
!!!next-input-character; |
| 3227 |
|
redo A; |
| 3228 |
|
} |
| 3229 |
|
} elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) { |
| 3230 |
|
if ($self->{nc} == 0x003E) { # > |
| 3231 |
|
$self->{state} = DATA_STATE; |
| 3232 |
|
$self->{s_kwd} = ''; |
| 3233 |
|
!!!next-input-character; |
| 3234 |
|
!!!emit ($self->{ct}); # DOCTYPE |
| 3235 |
|
redo A; |
| 3236 |
|
} elsif ($self->{nc} == -1) { |
| 3237 |
|
!!!parse-error (type => 'unclosed DOCTYPE'); |
| 3238 |
|
$self->{state} = DATA_STATE; |
| 3239 |
|
$self->{s_kwd} = ''; |
| 3240 |
|
## Reconsume. |
| 3241 |
|
!!!emit ($self->{ct}); # DOCTYPE |
| 3242 |
|
redo A; |
| 3243 |
|
} else { |
| 3244 |
|
## XML5: No parse error and stay in the state. |
| 3245 |
|
!!!parse-error (type => 'string after internal subset'); ## TODO: type |
| 3246 |
|
|
| 3247 |
|
$self->{state} = BOGUS_DOCTYPE_STATE; |
| 3248 |
|
!!!next-input-character; |
| 3249 |
|
redo A; |
| 3250 |
|
} |
| 3251 |
|
|
| 3252 |
} else { |
} else { |
| 3253 |
die "$0: $self->{state}: Unknown state"; |
die "$0: $self->{state}: Unknown state"; |