| 8 |
## doc.write (''); |
## doc.write (''); |
| 9 |
## alert (doc.compatMode); |
## alert (doc.compatMode); |
| 10 |
|
|
| 11 |
## ISSUE: HTML5 revision 967 says that the encoding layer MUST NOT |
## TODO: Control characters and noncharacters are not allowed (HTML5 revision 1263) |
| 12 |
## strip BOM and the HTML layer MUST ignore it. Whether we can do it |
## TODO: 1252 parse error (revision 1264) |
| 13 |
## is not yet clear. |
## TODO: 8859-11 = 874 (revision 1271) |
|
## "{U+FEFF}..." in UTF-16BE/UTF-16LE is three or four characters? |
|
|
## "{U+FEFF}..." in GB18030? |
|
| 14 |
|
|
| 15 |
my $permitted_slash_tag_name = { |
my $permitted_slash_tag_name = { |
| 16 |
base => 1, |
base => 1, |
| 18 |
meta => 1, |
meta => 1, |
| 19 |
hr => 1, |
hr => 1, |
| 20 |
br => 1, |
br => 1, |
| 21 |
img=> 1, |
img => 1, |
| 22 |
embed => 1, |
embed => 1, |
| 23 |
param => 1, |
param => 1, |
| 24 |
area => 1, |
area => 1, |
| 95 |
$self->{input_encoding} = lc $charset; ## TODO: normalize name |
$self->{input_encoding} = lc $charset; ## TODO: normalize name |
| 96 |
$self->{confident} = 1; |
$self->{confident} = 1; |
| 97 |
} else { |
} else { |
| 98 |
$charset = 'windows-1252'; ## TODO: for now. |
## TODO: Implement HTML5 detection algorithm |
| 99 |
|
require Whatpm::Charset::UniversalCharDet; |
| 100 |
|
$charset = Whatpm::Charset::UniversalCharDet->detect_byte_string |
| 101 |
|
(substr ($$bytes_s, 0, 1024)); |
| 102 |
|
$charset ||= 'windows-1252'; |
| 103 |
$s = \ (Encode::decode ($charset, $$bytes_s)); |
$s = \ (Encode::decode ($charset, $$bytes_s)); |
| 104 |
$self->{input_encoding} = $charset; |
$self->{input_encoding} = $charset; |
| 105 |
$self->{confident} = 0; |
$self->{confident} = 0; |
| 153 |
return $return; |
return $return; |
| 154 |
} # parse_byte_string |
} # parse_byte_string |
| 155 |
|
|
| 156 |
|
## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM |
| 157 |
|
## and the HTML layer MUST ignore it. However, we do strip the BOM in |
| 158 |
|
## the encoding layer and the HTML layer does not ignore any U+FEFF, |
| 159 |
|
## because the core part of our HTML parser expects a string of characters, |
| 160 |
|
## not a string of bytes or code units or anything which might contain a BOM. |
| 161 |
|
## Therefore, any parser interface that accepts a string of bytes, |
| 162 |
|
## such as |parse_byte_string| in this module, must ensure that it does |
| 163 |
|
## strip the BOM and never strip any ZWNBSP. |
| 164 |
|
|
| 165 |
*parse_char_string = \&parse_string; |
*parse_char_string = \&parse_string; |
| 166 |
|
|
| 167 |
sub parse_string ($$$;$) { |
sub parse_string ($$$;$) { |
| 286 |
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 } |
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 } |
| 287 |
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 } |
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 } |
| 288 |
sub BOGUS_DOCTYPE_STATE () { 32 } |
sub BOGUS_DOCTYPE_STATE () { 32 } |
| 289 |
|
sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 } |
| 290 |
|
|
| 291 |
sub DOCTYPE_TOKEN () { 1 } |
sub DOCTYPE_TOKEN () { 1 } |
| 292 |
sub COMMENT_TOKEN () { 2 } |
sub COMMENT_TOKEN () { 2 } |
| 348 |
## ->{system_identifier} (DOCTYPE_TOKEN) |
## ->{system_identifier} (DOCTYPE_TOKEN) |
| 349 |
## ->{correct} == 1 or 0 (DOCTYPE_TOKEN) |
## ->{correct} == 1 or 0 (DOCTYPE_TOKEN) |
| 350 |
## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN) |
## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN) |
| 351 |
|
## ->{name} |
| 352 |
|
## ->{value} |
| 353 |
|
## ->{has_reference} == 1 or 0 |
| 354 |
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN) |
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN) |
| 355 |
|
|
| 356 |
## Emitted token MUST immediately be handled by the tree construction state. |
## Emitted token MUST immediately be handled by the tree construction state. |
| 385 |
A: { |
A: { |
| 386 |
if ($self->{state} == DATA_STATE) { |
if ($self->{state} == DATA_STATE) { |
| 387 |
if ($self->{next_input_character} == 0x0026) { # & |
if ($self->{next_input_character} == 0x0026) { # & |
| 388 |
if ($self->{content_model} & CM_ENTITY) { # PCDATA | RCDATA |
if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA |
| 389 |
|
not $self->{escape}) { |
| 390 |
$self->{state} = ENTITY_DATA_STATE; |
$self->{state} = ENTITY_DATA_STATE; |
| 391 |
!!!next-input-character; |
!!!next-input-character; |
| 392 |
redo A; |
redo A; |
| 441 |
} elsif ($self->{state} == ENTITY_DATA_STATE) { |
} elsif ($self->{state} == ENTITY_DATA_STATE) { |
| 442 |
## (cannot happen in CDATA state) |
## (cannot happen in CDATA state) |
| 443 |
|
|
| 444 |
my $token = $self->_tokenize_attempt_to_consume_an_entity (0); |
my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1); |
| 445 |
|
|
| 446 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 447 |
# next-input-character is already done |
# next-input-character is already done |
| 744 |
|
|
| 745 |
redo A; |
redo A; |
| 746 |
} else { |
} else { |
| 747 |
|
if ({ |
| 748 |
|
0x0022 => 1, # " |
| 749 |
|
0x0027 => 1, # ' |
| 750 |
|
0x003D => 1, # = |
| 751 |
|
}->{$self->{next_input_character}}) { |
| 752 |
|
!!!parse-error (type => 'bad attribute name'); |
| 753 |
|
} |
| 754 |
$self->{current_attribute} = {name => chr ($self->{next_input_character}), |
$self->{current_attribute} = {name => chr ($self->{next_input_character}), |
| 755 |
value => ''}; |
value => ''}; |
| 756 |
$self->{state} = ATTRIBUTE_NAME_STATE; |
$self->{state} = ATTRIBUTE_NAME_STATE; |
| 845 |
|
|
| 846 |
redo A; |
redo A; |
| 847 |
} else { |
} else { |
| 848 |
|
if ($self->{next_input_character} == 0x0022 or # " |
| 849 |
|
$self->{next_input_character} == 0x0027) { # ' |
| 850 |
|
!!!parse-error (type => 'bad attribute name'); |
| 851 |
|
} |
| 852 |
$self->{current_attribute}->{name} .= chr ($self->{next_input_character}); |
$self->{current_attribute}->{name} .= chr ($self->{next_input_character}); |
| 853 |
## Stay in the state |
## Stay in the state |
| 854 |
!!!next-input-character; |
!!!next-input-character; |
| 995 |
|
|
| 996 |
redo A; |
redo A; |
| 997 |
} else { |
} else { |
| 998 |
|
if ($self->{next_input_character} == 0x003D) { # = |
| 999 |
|
!!!parse-error (type => 'bad attribute value'); |
| 1000 |
|
} |
| 1001 |
$self->{current_attribute}->{value} .= chr ($self->{next_input_character}); |
$self->{current_attribute}->{value} .= chr ($self->{next_input_character}); |
| 1002 |
$self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE; |
$self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE; |
| 1003 |
!!!next-input-character; |
!!!next-input-character; |
| 1005 |
} |
} |
| 1006 |
} elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) { |
} elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) { |
| 1007 |
if ($self->{next_input_character} == 0x0022) { # " |
if ($self->{next_input_character} == 0x0022) { # " |
| 1008 |
$self->{state} = BEFORE_ATTRIBUTE_NAME_STATE; |
$self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; |
| 1009 |
!!!next-input-character; |
!!!next-input-character; |
| 1010 |
redo A; |
redo A; |
| 1011 |
} elsif ($self->{next_input_character} == 0x0026) { # & |
} elsif ($self->{next_input_character} == 0x0026) { # & |
| 1041 |
} |
} |
| 1042 |
} elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) { |
} elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) { |
| 1043 |
if ($self->{next_input_character} == 0x0027) { # ' |
if ($self->{next_input_character} == 0x0027) { # ' |
| 1044 |
$self->{state} = BEFORE_ATTRIBUTE_NAME_STATE; |
$self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; |
| 1045 |
!!!next-input-character; |
!!!next-input-character; |
| 1046 |
redo A; |
redo A; |
| 1047 |
} elsif ($self->{next_input_character} == 0x0026) { # & |
} elsif ($self->{next_input_character} == 0x0026) { # & |
| 1129 |
|
|
| 1130 |
redo A; |
redo A; |
| 1131 |
} else { |
} else { |
| 1132 |
|
if ({ |
| 1133 |
|
0x0022 => 1, # " |
| 1134 |
|
0x0027 => 1, # ' |
| 1135 |
|
0x003D => 1, # = |
| 1136 |
|
}->{$self->{next_input_character}}) { |
| 1137 |
|
!!!parse-error (type => 'bad attribute value'); |
| 1138 |
|
} |
| 1139 |
$self->{current_attribute}->{value} .= chr ($self->{next_input_character}); |
$self->{current_attribute}->{value} .= chr ($self->{next_input_character}); |
| 1140 |
## Stay in the state |
## Stay in the state |
| 1141 |
!!!next-input-character; |
!!!next-input-character; |
| 1142 |
redo A; |
redo A; |
| 1143 |
} |
} |
| 1144 |
} elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) { |
} elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) { |
| 1145 |
my $token = $self->_tokenize_attempt_to_consume_an_entity (1); |
my $token = $self->_tokenize_attempt_to_consume_an_entity |
| 1146 |
|
(1, |
| 1147 |
|
$self->{last_attribute_value_state} |
| 1148 |
|
== ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # " |
| 1149 |
|
$self->{last_attribute_value_state} |
| 1150 |
|
== ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # ' |
| 1151 |
|
-1); |
| 1152 |
|
|
| 1153 |
unless (defined $token) { |
unless (defined $token) { |
| 1154 |
$self->{current_attribute}->{value} .= '&'; |
$self->{current_attribute}->{value} .= '&'; |
| 1155 |
} else { |
} else { |
| 1156 |
$self->{current_attribute}->{value} .= $token->{data}; |
$self->{current_attribute}->{value} .= $token->{data}; |
| 1157 |
|
$self->{current_attribute}->{has_reference} = $token->{has_reference}; |
| 1158 |
## ISSUE: spec says "append the returned character token to the current attribute's value" |
## ISSUE: spec says "append the returned character token to the current attribute's value" |
| 1159 |
} |
} |
| 1160 |
|
|
| 1161 |
$self->{state} = $self->{last_attribute_value_state}; |
$self->{state} = $self->{last_attribute_value_state}; |
| 1162 |
# next-input-character is already done |
# next-input-character is already done |
| 1163 |
redo A; |
redo A; |
| 1164 |
|
} elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) { |
| 1165 |
|
if ($self->{next_input_character} == 0x0009 or # HT |
| 1166 |
|
$self->{next_input_character} == 0x000A or # LF |
| 1167 |
|
$self->{next_input_character} == 0x000B or # VT |
| 1168 |
|
$self->{next_input_character} == 0x000C or # FF |
| 1169 |
|
$self->{next_input_character} == 0x0020) { # SP |
| 1170 |
|
$self->{state} = BEFORE_ATTRIBUTE_NAME_STATE; |
| 1171 |
|
!!!next-input-character; |
| 1172 |
|
redo A; |
| 1173 |
|
} elsif ($self->{next_input_character} == 0x003E) { # > |
| 1174 |
|
if ($self->{current_token}->{type} == START_TAG_TOKEN) { |
| 1175 |
|
$self->{current_token}->{first_start_tag} |
| 1176 |
|
= not defined $self->{last_emitted_start_tag_name}; |
| 1177 |
|
$self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name}; |
| 1178 |
|
} elsif ($self->{current_token}->{type} == END_TAG_TOKEN) { |
| 1179 |
|
$self->{content_model} = PCDATA_CONTENT_MODEL; # MUST |
| 1180 |
|
if ($self->{current_token}->{attributes}) { |
| 1181 |
|
!!!parse-error (type => 'end tag attribute'); |
| 1182 |
|
} |
| 1183 |
|
} else { |
| 1184 |
|
die "$0: $self->{current_token}->{type}: Unknown token type"; |
| 1185 |
|
} |
| 1186 |
|
$self->{state} = DATA_STATE; |
| 1187 |
|
!!!next-input-character; |
| 1188 |
|
|
| 1189 |
|
!!!emit ($self->{current_token}); # start tag or end tag |
| 1190 |
|
|
| 1191 |
|
redo A; |
| 1192 |
|
} elsif ($self->{next_input_character} == 0x002F) { # / |
| 1193 |
|
!!!next-input-character; |
| 1194 |
|
if ($self->{next_input_character} == 0x003E and # > |
| 1195 |
|
$self->{current_token}->{type} == START_TAG_TOKEN and |
| 1196 |
|
$permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) { |
| 1197 |
|
# permitted slash |
| 1198 |
|
# |
| 1199 |
|
} else { |
| 1200 |
|
!!!parse-error (type => 'nestc'); |
| 1201 |
|
} |
| 1202 |
|
$self->{state} = BEFORE_ATTRIBUTE_NAME_STATE; |
| 1203 |
|
# next-input-character is already done |
| 1204 |
|
redo A; |
| 1205 |
|
} else { |
| 1206 |
|
!!!parse-error (type => 'no space between attributes'); |
| 1207 |
|
$self->{state} = BEFORE_ATTRIBUTE_NAME_STATE; |
| 1208 |
|
## reconsume |
| 1209 |
|
redo A; |
| 1210 |
|
} |
| 1211 |
} elsif ($self->{state} == BOGUS_COMMENT_STATE) { |
} elsif ($self->{state} == BOGUS_COMMENT_STATE) { |
| 1212 |
## (only happen if PCDATA state) |
## (only happen if PCDATA state) |
| 1213 |
|
|
| 1586 |
} |
} |
| 1587 |
|
|
| 1588 |
!!!parse-error (type => 'string after DOCTYPE name'); |
!!!parse-error (type => 'string after DOCTYPE name'); |
| 1589 |
|
delete $self->{current_token}->{correct}; |
| 1590 |
|
|
| 1591 |
$self->{state} = BOGUS_DOCTYPE_STATE; |
$self->{state} = BOGUS_DOCTYPE_STATE; |
| 1592 |
# next-input-character is already done |
# next-input-character is already done |
| 1593 |
redo A; |
redo A; |
| 1631 |
redo A; |
redo A; |
| 1632 |
} else { |
} else { |
| 1633 |
!!!parse-error (type => 'string after PUBLIC'); |
!!!parse-error (type => 'string after PUBLIC'); |
| 1634 |
|
delete $self->{current_token}->{correct}; |
| 1635 |
|
|
| 1636 |
$self->{state} = BOGUS_DOCTYPE_STATE; |
$self->{state} = BOGUS_DOCTYPE_STATE; |
| 1637 |
!!!next-input-character; |
!!!next-input-character; |
| 1638 |
redo A; |
redo A; |
| 1642 |
$self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE; |
$self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE; |
| 1643 |
!!!next-input-character; |
!!!next-input-character; |
| 1644 |
redo A; |
redo A; |
| 1645 |
|
} elsif ($self->{next_input_character} == 0x003E) { # > |
| 1646 |
|
!!!parse-error (type => 'unclosed PUBLIC literal'); |
| 1647 |
|
|
| 1648 |
|
$self->{state} = DATA_STATE; |
| 1649 |
|
!!!next-input-character; |
| 1650 |
|
|
| 1651 |
|
delete $self->{current_token}->{correct}; |
| 1652 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
| 1653 |
|
|
| 1654 |
|
redo A; |
| 1655 |
} elsif ($self->{next_input_character} == -1) { |
} elsif ($self->{next_input_character} == -1) { |
| 1656 |
!!!parse-error (type => 'unclosed PUBLIC literal'); |
!!!parse-error (type => 'unclosed PUBLIC literal'); |
| 1657 |
|
|
| 1674 |
$self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE; |
$self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE; |
| 1675 |
!!!next-input-character; |
!!!next-input-character; |
| 1676 |
redo A; |
redo A; |
| 1677 |
|
} elsif ($self->{next_input_character} == 0x003E) { # > |
| 1678 |
|
!!!parse-error (type => 'unclosed PUBLIC literal'); |
| 1679 |
|
|
| 1680 |
|
$self->{state} = DATA_STATE; |
| 1681 |
|
!!!next-input-character; |
| 1682 |
|
|
| 1683 |
|
delete $self->{current_token}->{correct}; |
| 1684 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
| 1685 |
|
|
| 1686 |
|
redo A; |
| 1687 |
} elsif ($self->{next_input_character} == -1) { |
} elsif ($self->{next_input_character} == -1) { |
| 1688 |
!!!parse-error (type => 'unclosed PUBLIC literal'); |
!!!parse-error (type => 'unclosed PUBLIC literal'); |
| 1689 |
|
|
| 1738 |
redo A; |
redo A; |
| 1739 |
} else { |
} else { |
| 1740 |
!!!parse-error (type => 'string after PUBLIC literal'); |
!!!parse-error (type => 'string after PUBLIC literal'); |
| 1741 |
|
delete $self->{current_token}->{correct}; |
| 1742 |
|
|
| 1743 |
$self->{state} = BOGUS_DOCTYPE_STATE; |
$self->{state} = BOGUS_DOCTYPE_STATE; |
| 1744 |
!!!next-input-character; |
!!!next-input-character; |
| 1745 |
redo A; |
redo A; |
| 1783 |
redo A; |
redo A; |
| 1784 |
} else { |
} else { |
| 1785 |
!!!parse-error (type => 'string after SYSTEM'); |
!!!parse-error (type => 'string after SYSTEM'); |
| 1786 |
|
delete $self->{current_token}->{correct}; |
| 1787 |
|
|
| 1788 |
$self->{state} = BOGUS_DOCTYPE_STATE; |
$self->{state} = BOGUS_DOCTYPE_STATE; |
| 1789 |
!!!next-input-character; |
!!!next-input-character; |
| 1790 |
redo A; |
redo A; |
| 1794 |
$self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; |
$self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; |
| 1795 |
!!!next-input-character; |
!!!next-input-character; |
| 1796 |
redo A; |
redo A; |
| 1797 |
|
} elsif ($self->{next_input_character} == 0x003E) { # > |
| 1798 |
|
!!!parse-error (type => 'unclosed PUBLIC literal'); |
| 1799 |
|
|
| 1800 |
|
$self->{state} = DATA_STATE; |
| 1801 |
|
!!!next-input-character; |
| 1802 |
|
|
| 1803 |
|
delete $self->{current_token}->{correct}; |
| 1804 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
| 1805 |
|
|
| 1806 |
|
redo A; |
| 1807 |
} elsif ($self->{next_input_character} == -1) { |
} elsif ($self->{next_input_character} == -1) { |
| 1808 |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
| 1809 |
|
|
| 1826 |
$self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; |
$self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; |
| 1827 |
!!!next-input-character; |
!!!next-input-character; |
| 1828 |
redo A; |
redo A; |
| 1829 |
|
} elsif ($self->{next_input_character} == 0x003E) { # > |
| 1830 |
|
!!!parse-error (type => 'unclosed PUBLIC literal'); |
| 1831 |
|
|
| 1832 |
|
$self->{state} = DATA_STATE; |
| 1833 |
|
!!!next-input-character; |
| 1834 |
|
|
| 1835 |
|
delete $self->{current_token}->{correct}; |
| 1836 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
| 1837 |
|
|
| 1838 |
|
redo A; |
| 1839 |
} elsif ($self->{next_input_character} == -1) { |
} elsif ($self->{next_input_character} == -1) { |
| 1840 |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
| 1841 |
|
|
| 1880 |
redo A; |
redo A; |
| 1881 |
} else { |
} else { |
| 1882 |
!!!parse-error (type => 'string after SYSTEM literal'); |
!!!parse-error (type => 'string after SYSTEM literal'); |
| 1883 |
|
delete $self->{current_token}->{correct}; |
| 1884 |
|
|
| 1885 |
$self->{state} = BOGUS_DOCTYPE_STATE; |
$self->{state} = BOGUS_DOCTYPE_STATE; |
| 1886 |
!!!next-input-character; |
!!!next-input-character; |
| 1887 |
redo A; |
redo A; |
| 1891 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1892 |
!!!next-input-character; |
!!!next-input-character; |
| 1893 |
|
|
|
delete $self->{current_token}->{correct}; |
|
| 1894 |
!!!emit ($self->{current_token}); # DOCTYPE |
!!!emit ($self->{current_token}); # DOCTYPE |
| 1895 |
|
|
| 1896 |
redo A; |
redo A; |
| 1899 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1900 |
## reconsume |
## reconsume |
| 1901 |
|
|
|
delete $self->{current_token}->{correct}; |
|
| 1902 |
!!!emit ($self->{current_token}); # DOCTYPE |
!!!emit ($self->{current_token}); # DOCTYPE |
| 1903 |
|
|
| 1904 |
redo A; |
redo A; |
| 1915 |
die "$0: _get_next_token: unexpected case"; |
die "$0: _get_next_token: unexpected case"; |
| 1916 |
} # _get_next_token |
} # _get_next_token |
| 1917 |
|
|
| 1918 |
sub _tokenize_attempt_to_consume_an_entity ($$) { |
sub _tokenize_attempt_to_consume_an_entity ($$$) { |
| 1919 |
my ($self, $in_attr) = @_; |
my ($self, $in_attr, $additional) = @_; |
| 1920 |
|
|
| 1921 |
if ({ |
if ({ |
| 1922 |
0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF, |
0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF, |
| 1923 |
0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR |
0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR |
| 1924 |
|
$additional => 1, |
| 1925 |
}->{$self->{next_input_character}}) { |
}->{$self->{next_input_character}}) { |
| 1926 |
## Don't consume |
## Don't consume |
| 1927 |
## No error |
## No error |
| 1977 |
$code = $c1_entity_char->{$code}; |
$code = $c1_entity_char->{$code}; |
| 1978 |
} |
} |
| 1979 |
|
|
| 1980 |
return {type => CHARACTER_TOKEN, data => chr $code}; |
return {type => CHARACTER_TOKEN, data => chr $code, |
| 1981 |
|
has_reference => 1}; |
| 1982 |
} # X |
} # X |
| 1983 |
} elsif (0x0030 <= $self->{next_input_character} and |
} elsif (0x0030 <= $self->{next_input_character} and |
| 1984 |
$self->{next_input_character} <= 0x0039) { # 0..9 |
$self->{next_input_character} <= 0x0039) { # 0..9 |
| 2013 |
$code = $c1_entity_char->{$code}; |
$code = $c1_entity_char->{$code}; |
| 2014 |
} |
} |
| 2015 |
|
|
| 2016 |
return {type => CHARACTER_TOKEN, data => chr $code}; |
return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1}; |
| 2017 |
} else { |
} else { |
| 2018 |
!!!parse-error (type => 'bare nero'); |
!!!parse-error (type => 'bare nero'); |
| 2019 |
!!!back-next-input-character ($self->{next_input_character}); |
!!!back-next-input-character ($self->{next_input_character}); |
| 2061 |
} |
} |
| 2062 |
|
|
| 2063 |
if ($match > 0) { |
if ($match > 0) { |
| 2064 |
return {type => CHARACTER_TOKEN, data => $value}; |
return {type => CHARACTER_TOKEN, data => $value, has_reference => 1}; |
| 2065 |
} elsif ($match < 0) { |
} elsif ($match < 0) { |
| 2066 |
!!!parse-error (type => 'no refc'); |
!!!parse-error (type => 'no refc'); |
| 2067 |
if ($in_attr and $match < -1) { |
if ($in_attr and $match < -1) { |
| 2068 |
return {type => CHARACTER_TOKEN, data => '&'.$entity_name}; |
return {type => CHARACTER_TOKEN, data => '&'.$entity_name}; |
| 2069 |
} else { |
} else { |
| 2070 |
return {type => CHARACTER_TOKEN, data => $value}; |
return {type => CHARACTER_TOKEN, data => $value, has_reference => 1}; |
| 2071 |
} |
} |
| 2072 |
} else { |
} else { |
| 2073 |
!!!parse-error (type => 'bare ero'); |
!!!parse-error (type => 'bare ero'); |
| 2074 |
## NOTE: No characters are consumed in the spec. |
## NOTE: "No characters are consumed" in the spec. |
| 2075 |
return {type => CHARACTER_TOKEN, data => '&'.$value}; |
return {type => CHARACTER_TOKEN, data => '&'.$value}; |
| 2076 |
} |
} |
| 2077 |
} else { |
} else { |
| 2208 |
"-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1, |
"-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1, |
| 2209 |
"-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1, |
"-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1, |
| 2210 |
"-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1, |
"-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1, |
| 2211 |
|
"-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//EN" => 1, |
| 2212 |
|
"-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//EN" => 1, |
| 2213 |
|
"-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//EN" => 1, |
| 2214 |
"-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1, |
"-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1, |
| 2215 |
"-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1, |
"-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1, |
| 2216 |
"-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1, |
"-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1, |
| 2328 |
# |
# |
| 2329 |
} elsif ($token->{type} == START_TAG_TOKEN) { |
} elsif ($token->{type} == START_TAG_TOKEN) { |
| 2330 |
if ($token->{tag_name} eq 'html' and |
if ($token->{tag_name} eq 'html' and |
| 2331 |
$token->{attributes}->{manifest}) { ## ISSUE: Spec spells as "application" |
$token->{attributes}->{manifest}) { |
| 2332 |
$self->{application_cache_selection} |
$self->{application_cache_selection} |
| 2333 |
->($token->{attributes}->{manifest}->{value}); |
->($token->{attributes}->{manifest}->{value}); |
| 2334 |
## ISSUE: No relative reference resolution? |
## ISSUE: No relative reference resolution? |
| 3006 |
push @{$self->{open_elements}}, [$self->{head_element}, 'head']; |
push @{$self->{open_elements}}, [$self->{head_element}, 'head']; |
| 3007 |
} |
} |
| 3008 |
!!!insert-element ($token->{tag_name}, $token->{attributes}); |
!!!insert-element ($token->{tag_name}, $token->{attributes}); |
| 3009 |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
| 3010 |
|
|
| 3011 |
unless ($self->{confident}) { |
unless ($self->{confident}) { |
| 3012 |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
| 3013 |
$self->{change_encoding} |
$self->{change_encoding} |
| 3014 |
->($self, $token->{attributes}->{charset}->{value}); |
->($self, $token->{attributes}->{charset}->{value}); |
| 3015 |
|
|
| 3016 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'charset') |
| 3017 |
|
->set_user_data (manakai_has_reference => |
| 3018 |
|
$token->{attributes}->{charset} |
| 3019 |
|
->{has_reference}); |
| 3020 |
} elsif ($token->{attributes}->{content}) { |
} elsif ($token->{attributes}->{content}) { |
| 3021 |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
| 3022 |
if ($token->{attributes}->{content}->{value} |
if ($token->{attributes}->{content}->{value} |
| 3023 |
=~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*= |
=~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt] |
| 3024 |
|
[\x09-\x0D\x20]*= |
| 3025 |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
| 3026 |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
| 3027 |
$self->{change_encoding} |
$self->{change_encoding} |
| 3028 |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3); |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3); |
| 3029 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'content') |
| 3030 |
|
->set_user_data (manakai_has_reference => |
| 3031 |
|
$token->{attributes}->{content} |
| 3032 |
|
->{has_reference}); |
| 3033 |
} |
} |
| 3034 |
} |
} |
| 3035 |
|
} else { |
| 3036 |
|
if ($token->{attributes}->{charset}) { |
| 3037 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'charset') |
| 3038 |
|
->set_user_data (manakai_has_reference => |
| 3039 |
|
$token->{attributes}->{charset} |
| 3040 |
|
->{has_reference}); |
| 3041 |
|
} |
| 3042 |
|
if ($token->{attributes}->{content}) { |
| 3043 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'content') |
| 3044 |
|
->set_user_data (manakai_has_reference => |
| 3045 |
|
$token->{attributes}->{content} |
| 3046 |
|
->{has_reference}); |
| 3047 |
|
} |
| 3048 |
} |
} |
| 3049 |
|
|
| 3050 |
pop @{$self->{open_elements}} |
pop @{$self->{open_elements}} |
| 4617 |
} elsif ($token->{tag_name} eq 'meta') { |
} elsif ($token->{tag_name} eq 'meta') { |
| 4618 |
## NOTE: This is an "as if in head" code clone, only "-t" differs |
## NOTE: This is an "as if in head" code clone, only "-t" differs |
| 4619 |
!!!insert-element-t ($token->{tag_name}, $token->{attributes}); |
!!!insert-element-t ($token->{tag_name}, $token->{attributes}); |
| 4620 |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
| 4621 |
|
|
| 4622 |
unless ($self->{confident}) { |
unless ($self->{confident}) { |
| 4623 |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
| 4624 |
$self->{change_encoding} |
$self->{change_encoding} |
| 4625 |
->($self, $token->{attributes}->{charset}->{value}); |
->($self, $token->{attributes}->{charset}->{value}); |
| 4626 |
|
|
| 4627 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'charset') |
| 4628 |
|
->set_user_data (manakai_has_reference => |
| 4629 |
|
$token->{attributes}->{charset} |
| 4630 |
|
->{has_reference}); |
| 4631 |
} elsif ($token->{attributes}->{content}) { |
} elsif ($token->{attributes}->{content}) { |
| 4632 |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
| 4633 |
if ($token->{attributes}->{content}->{value} |
if ($token->{attributes}->{content}->{value} |
| 4634 |
=~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*= |
=~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt] |
| 4635 |
|
[\x09-\x0D\x20]*= |
| 4636 |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
| 4637 |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
| 4638 |
$self->{change_encoding} |
$self->{change_encoding} |
| 4639 |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3); |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3); |
| 4640 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'content') |
| 4641 |
|
->set_user_data (manakai_has_reference => |
| 4642 |
|
$token->{attributes}->{content} |
| 4643 |
|
->{has_reference}); |
| 4644 |
} |
} |
| 4645 |
} |
} |
| 4646 |
|
} else { |
| 4647 |
|
if ($token->{attributes}->{charset}) { |
| 4648 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'charset') |
| 4649 |
|
->set_user_data (manakai_has_reference => |
| 4650 |
|
$token->{attributes}->{charset} |
| 4651 |
|
->{has_reference}); |
| 4652 |
|
} |
| 4653 |
|
if ($token->{attributes}->{content}) { |
| 4654 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'content') |
| 4655 |
|
->set_user_data (manakai_has_reference => |
| 4656 |
|
$token->{attributes}->{content} |
| 4657 |
|
->{has_reference}); |
| 4658 |
|
} |
| 4659 |
} |
} |
| 4660 |
|
|
| 4661 |
!!!next-token; |
!!!next-token; |
| 5555 |
$p->_initialize_tree_constructor; |
$p->_initialize_tree_constructor; |
| 5556 |
|
|
| 5557 |
## Step 2 |
## Step 2 |
| 5558 |
my $node_ln = $node->local_name; |
my $node_ln = $node->manakai_local_name; |
| 5559 |
$p->{content_model} = { |
$p->{content_model} = { |
| 5560 |
title => RCDATA_CONTENT_MODEL, |
title => RCDATA_CONTENT_MODEL, |
| 5561 |
textarea => RCDATA_CONTENT_MODEL, |
textarea => RCDATA_CONTENT_MODEL, |
| 5595 |
if ($anode->node_type == 1) { |
if ($anode->node_type == 1) { |
| 5596 |
my $nsuri = $anode->namespace_uri; |
my $nsuri = $anode->namespace_uri; |
| 5597 |
if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') { |
if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') { |
| 5598 |
if ($anode->local_name eq 'form') { ## TODO: case? |
if ($anode->manakai_local_name eq 'form') { |
| 5599 |
$p->{form_element} = $anode; |
$p->{form_element} = $anode; |
| 5600 |
last AN; |
last AN; |
| 5601 |
} |
} |