| 1 |
package Whatpm::HTML; |
package Whatpm::HTML; |
| 2 |
use strict; |
use strict; |
| 3 |
our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
| 4 |
|
use Error qw(:try); |
| 5 |
|
|
| 6 |
## ISSUE: |
## ISSUE: |
| 7 |
## var doc = implementation.createDocument (null, null, null); |
## var doc = implementation.createDocument (null, null, null); |
| 8 |
## doc.write (''); |
## doc.write (''); |
| 9 |
## alert (doc.compatMode); |
## alert (doc.compatMode); |
| 10 |
|
|
| 11 |
## ISSUE: HTML5 revision 967 says that the encoding layer MUST NOT |
## TODO: Control charcters and noncharacters are not allowed (HTML5 revision 1263) |
| 12 |
## strip BOM and the HTML layer MUST ignore it. Whether we can do it |
## TODO: 1252 parse error (revision 1264) |
| 13 |
## is not yet clear. |
## TODO: 8859-11 = 874 (revision 1271) |
|
## "{U+FEFF}..." in UTF-16BE/UTF-16LE is three or four characters? |
|
|
## "{U+FEFF}..." in GB18030? |
|
| 14 |
|
|
| 15 |
my $permitted_slash_tag_name = { |
my $permitted_slash_tag_name = { |
| 16 |
base => 1, |
base => 1, |
| 18 |
meta => 1, |
meta => 1, |
| 19 |
hr => 1, |
hr => 1, |
| 20 |
br => 1, |
br => 1, |
| 21 |
img=> 1, |
img => 1, |
| 22 |
embed => 1, |
embed => 1, |
| 23 |
param => 1, |
param => 1, |
| 24 |
area => 1, |
area => 1, |
| 83 |
}; |
}; |
| 84 |
# $phrasing_category: all other elements |
# $phrasing_category: all other elements |
| 85 |
|
|
| 86 |
|
sub parse_byte_string ($$$$;$) { |
| 87 |
|
my $self = ref $_[0] ? shift : shift->new; |
| 88 |
|
my $charset = shift; |
| 89 |
|
my $bytes_s = ref $_[0] ? $_[0] : \($_[0]); |
| 90 |
|
my $s; |
| 91 |
|
|
| 92 |
|
if (defined $charset) { |
| 93 |
|
require Encode; ## TODO: decode(utf8) don't delete BOM |
| 94 |
|
$s = \ (Encode::decode ($charset, $$bytes_s)); |
| 95 |
|
$self->{input_encoding} = lc $charset; ## TODO: normalize name |
| 96 |
|
$self->{confident} = 1; |
| 97 |
|
} else { |
| 98 |
|
## TODO: Implement HTML5 detection algorithm |
| 99 |
|
require Whatpm::Charset::UniversalCharDet; |
| 100 |
|
$charset = Whatpm::Charset::UniversalCharDet->detect_byte_string |
| 101 |
|
(substr ($$bytes_s, 0, 1024)); |
| 102 |
|
$charset ||= 'windows-1252'; |
| 103 |
|
$s = \ (Encode::decode ($charset, $$bytes_s)); |
| 104 |
|
$self->{input_encoding} = $charset; |
| 105 |
|
$self->{confident} = 0; |
| 106 |
|
} |
| 107 |
|
|
| 108 |
|
$self->{change_encoding} = sub { |
| 109 |
|
my $self = shift; |
| 110 |
|
my $charset = lc shift; |
| 111 |
|
## TODO: if $charset is supported |
| 112 |
|
## TODO: normalize charset name |
| 113 |
|
|
| 114 |
|
## "Change the encoding" algorithm: |
| 115 |
|
|
| 116 |
|
## Step 1 |
| 117 |
|
if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8? |
| 118 |
|
$charset = 'utf-8'; |
| 119 |
|
} |
| 120 |
|
|
| 121 |
|
## Step 2 |
| 122 |
|
if (defined $self->{input_encoding} and |
| 123 |
|
$self->{input_encoding} eq $charset) { |
| 124 |
|
$self->{confident} = 1; |
| 125 |
|
return; |
| 126 |
|
} |
| 127 |
|
|
| 128 |
|
!!!parse-error (type => 'charset label detected:'.$self->{input_encoding}. |
| 129 |
|
':'.$charset, level => 'w'); |
| 130 |
|
|
| 131 |
|
## Step 3 |
| 132 |
|
# if (can) { |
| 133 |
|
## change the encoding on the fly. |
| 134 |
|
#$self->{confident} = 1; |
| 135 |
|
#return; |
| 136 |
|
# } |
| 137 |
|
|
| 138 |
|
## Step 4 |
| 139 |
|
throw Whatpm::HTML::RestartParser (charset => $charset); |
| 140 |
|
}; # $self->{change_encoding} |
| 141 |
|
|
| 142 |
|
my @args = @_; shift @args; # $s |
| 143 |
|
my $return; |
| 144 |
|
try { |
| 145 |
|
$return = $self->parse_char_string ($s, @args); |
| 146 |
|
} catch Whatpm::HTML::RestartParser with { |
| 147 |
|
my $charset = shift->{charset}; |
| 148 |
|
$s = \ (Encode::decode ($charset, $$bytes_s)); |
| 149 |
|
$self->{input_encoding} = $charset; ## TODO: normalize |
| 150 |
|
$self->{confident} = 1; |
| 151 |
|
$return = $self->parse_char_string ($s, @args); |
| 152 |
|
}; |
| 153 |
|
return $return; |
| 154 |
|
} # parse_byte_string |
| 155 |
|
|
| 156 |
|
## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM |
| 157 |
|
## and the HTML layer MUST ignore it. However, we does strip BOM in |
| 158 |
|
## the encoding layer and the HTML layer does not ignore any U+FEFF, |
| 159 |
|
## because the core part of our HTML parser expects a string of character, |
| 160 |
|
## not a string of bytes or code units or anything which might contain a BOM. |
| 161 |
|
## Therefore, any parser interface that accepts a string of bytes, |
| 162 |
|
## such as |parse_byte_string| in this module, must ensure that it does |
| 163 |
|
## strip the BOM and never strip any ZWNBSP. |
| 164 |
|
|
| 165 |
|
*parse_char_string = \&parse_string; |
| 166 |
|
|
| 167 |
sub parse_string ($$$;$) { |
sub parse_string ($$$;$) { |
| 168 |
my $self = shift->new; |
my $self = ref $_[0] ? shift : shift->new; |
| 169 |
my $s = \$_[0]; |
my $s = ref $_[0] ? $_[0] : \($_[0]); |
| 170 |
$self->{document} = $_[1]; |
$self->{document} = $_[1]; |
| 171 |
|
@{$self->{document}->child_nodes} = (); |
| 172 |
|
|
| 173 |
## NOTE: |set_inner_html| copies most of this method's code |
## NOTE: |set_inner_html| copies most of this method's code |
| 174 |
|
|
| 175 |
|
$self->{confident} = 1 unless exists $self->{confident}; |
| 176 |
|
$self->{document}->input_encoding ($self->{input_encoding}) |
| 177 |
|
if defined $self->{input_encoding}; |
| 178 |
|
|
| 179 |
my $i = 0; |
my $i = 0; |
| 180 |
my $line = 1; |
my $line = 1; |
| 181 |
my $column = 0; |
my $column = 0; |
| 232 |
$self->{parse_error} = sub { |
$self->{parse_error} = sub { |
| 233 |
# |
# |
| 234 |
}; |
}; |
| 235 |
|
$self->{change_encoding} = sub { |
| 236 |
|
# if ($_[0] is a supported encoding) { |
| 237 |
|
# run "change the encoding" algorithm; |
| 238 |
|
# throw Whatpm::HTML::RestartParser (charset => $new_encoding); |
| 239 |
|
# } |
| 240 |
|
}; |
| 241 |
$self->{application_cache_selection} = sub { |
$self->{application_cache_selection} = sub { |
| 242 |
# |
# |
| 243 |
}; |
}; |
| 286 |
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 } |
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 } |
| 287 |
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 } |
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 } |
| 288 |
sub BOGUS_DOCTYPE_STATE () { 32 } |
sub BOGUS_DOCTYPE_STATE () { 32 } |
| 289 |
|
sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 } |
| 290 |
|
|
| 291 |
sub DOCTYPE_TOKEN () { 1 } |
sub DOCTYPE_TOKEN () { 1 } |
| 292 |
sub COMMENT_TOKEN () { 2 } |
sub COMMENT_TOKEN () { 2 } |
| 348 |
## ->{system_identifier} (DOCTYPE_TOKEN) |
## ->{system_identifier} (DOCTYPE_TOKEN) |
| 349 |
## ->{correct} == 1 or 0 (DOCTYPE_TOKEN) |
## ->{correct} == 1 or 0 (DOCTYPE_TOKEN) |
| 350 |
## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN) |
## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN) |
| 351 |
|
## ->{name} |
| 352 |
|
## ->{value} |
| 353 |
|
## ->{has_reference} == 1 or 0 |
| 354 |
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN) |
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN) |
| 355 |
|
|
| 356 |
## Emitted token MUST immediately be handled by the tree construction state. |
## Emitted token MUST immediately be handled by the tree construction state. |
| 385 |
A: { |
A: { |
| 386 |
if ($self->{state} == DATA_STATE) { |
if ($self->{state} == DATA_STATE) { |
| 387 |
if ($self->{next_input_character} == 0x0026) { # & |
if ($self->{next_input_character} == 0x0026) { # & |
| 388 |
if ($self->{content_model} & CM_ENTITY) { # PCDATA | RCDATA |
if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA |
| 389 |
|
not $self->{escape}) { |
| 390 |
$self->{state} = ENTITY_DATA_STATE; |
$self->{state} = ENTITY_DATA_STATE; |
| 391 |
!!!next-input-character; |
!!!next-input-character; |
| 392 |
redo A; |
redo A; |
| 441 |
} elsif ($self->{state} == ENTITY_DATA_STATE) { |
} elsif ($self->{state} == ENTITY_DATA_STATE) { |
| 442 |
## (cannot happen in CDATA state) |
## (cannot happen in CDATA state) |
| 443 |
|
|
| 444 |
my $token = $self->_tokenize_attempt_to_consume_an_entity (0); |
my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1); |
| 445 |
|
|
| 446 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 447 |
# next-input-character is already done |
# next-input-character is already done |
| 744 |
|
|
| 745 |
redo A; |
redo A; |
| 746 |
} else { |
} else { |
| 747 |
|
if ({ |
| 748 |
|
0x0022 => 1, # " |
| 749 |
|
0x0027 => 1, # ' |
| 750 |
|
0x003D => 1, # = |
| 751 |
|
}->{$self->{next_input_character}}) { |
| 752 |
|
!!!parse-error (type => 'bad attribute name'); |
| 753 |
|
} |
| 754 |
$self->{current_attribute} = {name => chr ($self->{next_input_character}), |
$self->{current_attribute} = {name => chr ($self->{next_input_character}), |
| 755 |
value => ''}; |
value => ''}; |
| 756 |
$self->{state} = ATTRIBUTE_NAME_STATE; |
$self->{state} = ATTRIBUTE_NAME_STATE; |
| 845 |
|
|
| 846 |
redo A; |
redo A; |
| 847 |
} else { |
} else { |
| 848 |
|
if ($self->{next_input_character} == 0x0022 or # " |
| 849 |
|
$self->{next_input_character} == 0x0027) { # ' |
| 850 |
|
!!!parse-error (type => 'bad attribute name'); |
| 851 |
|
} |
| 852 |
$self->{current_attribute}->{name} .= chr ($self->{next_input_character}); |
$self->{current_attribute}->{name} .= chr ($self->{next_input_character}); |
| 853 |
## Stay in the state |
## Stay in the state |
| 854 |
!!!next-input-character; |
!!!next-input-character; |
| 995 |
|
|
| 996 |
redo A; |
redo A; |
| 997 |
} else { |
} else { |
| 998 |
|
if ($self->{next_input_character} == 0x003D) { # = |
| 999 |
|
!!!parse-error (type => 'bad attribute value'); |
| 1000 |
|
} |
| 1001 |
$self->{current_attribute}->{value} .= chr ($self->{next_input_character}); |
$self->{current_attribute}->{value} .= chr ($self->{next_input_character}); |
| 1002 |
$self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE; |
$self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE; |
| 1003 |
!!!next-input-character; |
!!!next-input-character; |
| 1005 |
} |
} |
| 1006 |
} elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) { |
} elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) { |
| 1007 |
if ($self->{next_input_character} == 0x0022) { # " |
if ($self->{next_input_character} == 0x0022) { # " |
| 1008 |
$self->{state} = BEFORE_ATTRIBUTE_NAME_STATE; |
$self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; |
| 1009 |
!!!next-input-character; |
!!!next-input-character; |
| 1010 |
redo A; |
redo A; |
| 1011 |
} elsif ($self->{next_input_character} == 0x0026) { # & |
} elsif ($self->{next_input_character} == 0x0026) { # & |
| 1041 |
} |
} |
| 1042 |
} elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) { |
} elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) { |
| 1043 |
if ($self->{next_input_character} == 0x0027) { # ' |
if ($self->{next_input_character} == 0x0027) { # ' |
| 1044 |
$self->{state} = BEFORE_ATTRIBUTE_NAME_STATE; |
$self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; |
| 1045 |
!!!next-input-character; |
!!!next-input-character; |
| 1046 |
redo A; |
redo A; |
| 1047 |
} elsif ($self->{next_input_character} == 0x0026) { # & |
} elsif ($self->{next_input_character} == 0x0026) { # & |
| 1129 |
|
|
| 1130 |
redo A; |
redo A; |
| 1131 |
} else { |
} else { |
| 1132 |
|
if ({ |
| 1133 |
|
0x0022 => 1, # " |
| 1134 |
|
0x0027 => 1, # ' |
| 1135 |
|
0x003D => 1, # = |
| 1136 |
|
}->{$self->{next_input_character}}) { |
| 1137 |
|
!!!parse-error (type => 'bad attribute value'); |
| 1138 |
|
} |
| 1139 |
$self->{current_attribute}->{value} .= chr ($self->{next_input_character}); |
$self->{current_attribute}->{value} .= chr ($self->{next_input_character}); |
| 1140 |
## Stay in the state |
## Stay in the state |
| 1141 |
!!!next-input-character; |
!!!next-input-character; |
| 1142 |
redo A; |
redo A; |
| 1143 |
} |
} |
| 1144 |
} elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) { |
} elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) { |
| 1145 |
my $token = $self->_tokenize_attempt_to_consume_an_entity (1); |
my $token = $self->_tokenize_attempt_to_consume_an_entity |
| 1146 |
|
(1, |
| 1147 |
|
$self->{last_attribute_value_state} |
| 1148 |
|
== ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # " |
| 1149 |
|
$self->{last_attribute_value_state} |
| 1150 |
|
== ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # ' |
| 1151 |
|
-1); |
| 1152 |
|
|
| 1153 |
unless (defined $token) { |
unless (defined $token) { |
| 1154 |
$self->{current_attribute}->{value} .= '&'; |
$self->{current_attribute}->{value} .= '&'; |
| 1155 |
} else { |
} else { |
| 1156 |
$self->{current_attribute}->{value} .= $token->{data}; |
$self->{current_attribute}->{value} .= $token->{data}; |
| 1157 |
|
$self->{current_attribute}->{has_reference} = $token->{has_reference}; |
| 1158 |
## ISSUE: spec says "append the returned character token to the current attribute's value" |
## ISSUE: spec says "append the returned character token to the current attribute's value" |
| 1159 |
} |
} |
| 1160 |
|
|
| 1161 |
$self->{state} = $self->{last_attribute_value_state}; |
$self->{state} = $self->{last_attribute_value_state}; |
| 1162 |
# next-input-character is already done |
# next-input-character is already done |
| 1163 |
redo A; |
redo A; |
| 1164 |
|
} elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) { |
| 1165 |
|
if ($self->{next_input_character} == 0x0009 or # HT |
| 1166 |
|
$self->{next_input_character} == 0x000A or # LF |
| 1167 |
|
$self->{next_input_character} == 0x000B or # VT |
| 1168 |
|
$self->{next_input_character} == 0x000C or # FF |
| 1169 |
|
$self->{next_input_character} == 0x0020) { # SP |
| 1170 |
|
$self->{state} = BEFORE_ATTRIBUTE_NAME_STATE; |
| 1171 |
|
!!!next-input-character; |
| 1172 |
|
redo A; |
| 1173 |
|
} elsif ($self->{next_input_character} == 0x003E) { # > |
| 1174 |
|
if ($self->{current_token}->{type} == START_TAG_TOKEN) { |
| 1175 |
|
$self->{current_token}->{first_start_tag} |
| 1176 |
|
= not defined $self->{last_emitted_start_tag_name}; |
| 1177 |
|
$self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name}; |
| 1178 |
|
} elsif ($self->{current_token}->{type} == END_TAG_TOKEN) { |
| 1179 |
|
$self->{content_model} = PCDATA_CONTENT_MODEL; # MUST |
| 1180 |
|
if ($self->{current_token}->{attributes}) { |
| 1181 |
|
!!!parse-error (type => 'end tag attribute'); |
| 1182 |
|
} |
| 1183 |
|
} else { |
| 1184 |
|
die "$0: $self->{current_token}->{type}: Unknown token type"; |
| 1185 |
|
} |
| 1186 |
|
$self->{state} = DATA_STATE; |
| 1187 |
|
!!!next-input-character; |
| 1188 |
|
|
| 1189 |
|
!!!emit ($self->{current_token}); # start tag or end tag |
| 1190 |
|
|
| 1191 |
|
redo A; |
| 1192 |
|
} elsif ($self->{next_input_character} == 0x002F) { # / |
| 1193 |
|
!!!next-input-character; |
| 1194 |
|
if ($self->{next_input_character} == 0x003E and # > |
| 1195 |
|
$self->{current_token}->{type} == START_TAG_TOKEN and |
| 1196 |
|
$permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) { |
| 1197 |
|
# permitted slash |
| 1198 |
|
# |
| 1199 |
|
} else { |
| 1200 |
|
!!!parse-error (type => 'nestc'); |
| 1201 |
|
} |
| 1202 |
|
$self->{state} = BEFORE_ATTRIBUTE_NAME_STATE; |
| 1203 |
|
# next-input-character is already done |
| 1204 |
|
redo A; |
| 1205 |
|
} else { |
| 1206 |
|
!!!parse-error (type => 'no space between attributes'); |
| 1207 |
|
$self->{state} = BEFORE_ATTRIBUTE_NAME_STATE; |
| 1208 |
|
## reconsume |
| 1209 |
|
redo A; |
| 1210 |
|
} |
| 1211 |
} elsif ($self->{state} == BOGUS_COMMENT_STATE) { |
} elsif ($self->{state} == BOGUS_COMMENT_STATE) { |
| 1212 |
## (only happen if PCDATA state) |
## (only happen if PCDATA state) |
| 1213 |
|
|
| 1638 |
$self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE; |
$self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE; |
| 1639 |
!!!next-input-character; |
!!!next-input-character; |
| 1640 |
redo A; |
redo A; |
| 1641 |
|
} elsif ($self->{next_input_character} == 0x003E) { # > |
| 1642 |
|
!!!parse-error (type => 'unclosed PUBLIC literal'); |
| 1643 |
|
|
| 1644 |
|
$self->{state} = DATA_STATE; |
| 1645 |
|
!!!next-input-character; |
| 1646 |
|
|
| 1647 |
|
delete $self->{current_token}->{correct}; |
| 1648 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
| 1649 |
|
|
| 1650 |
|
redo A; |
| 1651 |
} elsif ($self->{next_input_character} == -1) { |
} elsif ($self->{next_input_character} == -1) { |
| 1652 |
!!!parse-error (type => 'unclosed PUBLIC literal'); |
!!!parse-error (type => 'unclosed PUBLIC literal'); |
| 1653 |
|
|
| 1670 |
$self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE; |
$self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE; |
| 1671 |
!!!next-input-character; |
!!!next-input-character; |
| 1672 |
redo A; |
redo A; |
| 1673 |
|
} elsif ($self->{next_input_character} == 0x003E) { # > |
| 1674 |
|
!!!parse-error (type => 'unclosed PUBLIC literal'); |
| 1675 |
|
|
| 1676 |
|
$self->{state} = DATA_STATE; |
| 1677 |
|
!!!next-input-character; |
| 1678 |
|
|
| 1679 |
|
delete $self->{current_token}->{correct}; |
| 1680 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
| 1681 |
|
|
| 1682 |
|
redo A; |
| 1683 |
} elsif ($self->{next_input_character} == -1) { |
} elsif ($self->{next_input_character} == -1) { |
| 1684 |
!!!parse-error (type => 'unclosed PUBLIC literal'); |
!!!parse-error (type => 'unclosed PUBLIC literal'); |
| 1685 |
|
|
| 1786 |
$self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; |
$self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; |
| 1787 |
!!!next-input-character; |
!!!next-input-character; |
| 1788 |
redo A; |
redo A; |
| 1789 |
|
} elsif ($self->{next_input_character} == 0x003E) { # > |
| 1790 |
|
!!!parse-error (type => 'unclosed PUBLIC literal'); |
| 1791 |
|
|
| 1792 |
|
$self->{state} = DATA_STATE; |
| 1793 |
|
!!!next-input-character; |
| 1794 |
|
|
| 1795 |
|
delete $self->{current_token}->{correct}; |
| 1796 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
| 1797 |
|
|
| 1798 |
|
redo A; |
| 1799 |
} elsif ($self->{next_input_character} == -1) { |
} elsif ($self->{next_input_character} == -1) { |
| 1800 |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
| 1801 |
|
|
| 1818 |
$self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; |
$self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; |
| 1819 |
!!!next-input-character; |
!!!next-input-character; |
| 1820 |
redo A; |
redo A; |
| 1821 |
|
} elsif ($self->{next_input_character} == 0x003E) { # > |
| 1822 |
|
!!!parse-error (type => 'unclosed PUBLIC literal'); |
| 1823 |
|
|
| 1824 |
|
$self->{state} = DATA_STATE; |
| 1825 |
|
!!!next-input-character; |
| 1826 |
|
|
| 1827 |
|
delete $self->{current_token}->{correct}; |
| 1828 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
| 1829 |
|
|
| 1830 |
|
redo A; |
| 1831 |
} elsif ($self->{next_input_character} == -1) { |
} elsif ($self->{next_input_character} == -1) { |
| 1832 |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
| 1833 |
|
|
| 1907 |
die "$0: _get_next_token: unexpected case"; |
die "$0: _get_next_token: unexpected case"; |
| 1908 |
} # _get_next_token |
} # _get_next_token |
| 1909 |
|
|
| 1910 |
sub _tokenize_attempt_to_consume_an_entity ($$) { |
sub _tokenize_attempt_to_consume_an_entity ($$$) { |
| 1911 |
my ($self, $in_attr) = @_; |
my ($self, $in_attr, $additional) = @_; |
| 1912 |
|
|
| 1913 |
if ({ |
if ({ |
| 1914 |
0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF, |
0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF, |
| 1915 |
0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR |
0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR |
| 1916 |
|
$additional => 1, |
| 1917 |
}->{$self->{next_input_character}}) { |
}->{$self->{next_input_character}}) { |
| 1918 |
## Don't consume |
## Don't consume |
| 1919 |
## No error |
## No error |
| 1969 |
$code = $c1_entity_char->{$code}; |
$code = $c1_entity_char->{$code}; |
| 1970 |
} |
} |
| 1971 |
|
|
| 1972 |
return {type => CHARACTER_TOKEN, data => chr $code}; |
return {type => CHARACTER_TOKEN, data => chr $code, |
| 1973 |
|
has_reference => 1}; |
| 1974 |
} # X |
} # X |
| 1975 |
} elsif (0x0030 <= $self->{next_input_character} and |
} elsif (0x0030 <= $self->{next_input_character} and |
| 1976 |
$self->{next_input_character} <= 0x0039) { # 0..9 |
$self->{next_input_character} <= 0x0039) { # 0..9 |
| 2005 |
$code = $c1_entity_char->{$code}; |
$code = $c1_entity_char->{$code}; |
| 2006 |
} |
} |
| 2007 |
|
|
| 2008 |
return {type => CHARACTER_TOKEN, data => chr $code}; |
return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1}; |
| 2009 |
} else { |
} else { |
| 2010 |
!!!parse-error (type => 'bare nero'); |
!!!parse-error (type => 'bare nero'); |
| 2011 |
!!!back-next-input-character ($self->{next_input_character}); |
!!!back-next-input-character ($self->{next_input_character}); |
| 2053 |
} |
} |
| 2054 |
|
|
| 2055 |
if ($match > 0) { |
if ($match > 0) { |
| 2056 |
return {type => CHARACTER_TOKEN, data => $value}; |
return {type => CHARACTER_TOKEN, data => $value, has_reference => 1}; |
| 2057 |
} elsif ($match < 0) { |
} elsif ($match < 0) { |
| 2058 |
!!!parse-error (type => 'no refc'); |
!!!parse-error (type => 'no refc'); |
| 2059 |
if ($in_attr and $match < -1) { |
if ($in_attr and $match < -1) { |
| 2060 |
return {type => CHARACTER_TOKEN, data => '&'.$entity_name}; |
return {type => CHARACTER_TOKEN, data => '&'.$entity_name}; |
| 2061 |
} else { |
} else { |
| 2062 |
return {type => CHARACTER_TOKEN, data => $value}; |
return {type => CHARACTER_TOKEN, data => $value, has_reference => 1}; |
| 2063 |
} |
} |
| 2064 |
} else { |
} else { |
| 2065 |
!!!parse-error (type => 'bare ero'); |
!!!parse-error (type => 'bare ero'); |
| 2066 |
## NOTE: No characters are consumed in the spec. |
## NOTE: "No characters are consumed" in the spec. |
| 2067 |
return {type => CHARACTER_TOKEN, data => '&'.$value}; |
return {type => CHARACTER_TOKEN, data => '&'.$value}; |
| 2068 |
} |
} |
| 2069 |
} else { |
} else { |
| 2200 |
"-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1, |
"-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1, |
| 2201 |
"-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1, |
"-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1, |
| 2202 |
"-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1, |
"-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1, |
| 2203 |
|
"-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//EN" => 1, |
| 2204 |
|
"-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//EN" => 1, |
| 2205 |
|
"-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//EN" => 1, |
| 2206 |
"-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1, |
"-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1, |
| 2207 |
"-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1, |
"-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1, |
| 2208 |
"-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1, |
"-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1, |
| 2320 |
# |
# |
| 2321 |
} elsif ($token->{type} == START_TAG_TOKEN) { |
} elsif ($token->{type} == START_TAG_TOKEN) { |
| 2322 |
if ($token->{tag_name} eq 'html' and |
if ($token->{tag_name} eq 'html' and |
| 2323 |
$token->{attributes}->{manifest}) { ## ISSUE: Spec spells as "application" |
$token->{attributes}->{manifest}) { |
| 2324 |
$self->{application_cache_selection} |
$self->{application_cache_selection} |
| 2325 |
->($token->{attributes}->{manifest}->{value}); |
->($token->{attributes}->{manifest}->{value}); |
| 2326 |
## ISSUE: No relative reference resolution? |
## ISSUE: No relative reference resolution? |
| 2998 |
push @{$self->{open_elements}}, [$self->{head_element}, 'head']; |
push @{$self->{open_elements}}, [$self->{head_element}, 'head']; |
| 2999 |
} |
} |
| 3000 |
!!!insert-element ($token->{tag_name}, $token->{attributes}); |
!!!insert-element ($token->{tag_name}, $token->{attributes}); |
| 3001 |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
| 3002 |
|
|
| 3003 |
unless ($self->{confident}) { |
unless ($self->{confident}) { |
|
my $charset; |
|
| 3004 |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
| 3005 |
$charset = $token->{attributes}->{charset}->{value}; |
$self->{change_encoding} |
| 3006 |
} |
->($self, $token->{attributes}->{charset}->{value}); |
| 3007 |
if ($token->{attributes}->{'http-equiv'}) { |
|
| 3008 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'charset') |
| 3009 |
|
->set_user_data (manakai_has_reference => |
| 3010 |
|
$token->{attributes}->{charset} |
| 3011 |
|
->{has_reference}); |
| 3012 |
|
} elsif ($token->{attributes}->{content}) { |
| 3013 |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
| 3014 |
if ($token->{attributes}->{'http-equiv'}->{value} |
if ($token->{attributes}->{content}->{value} |
| 3015 |
=~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*= |
=~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt] |
| 3016 |
|
[\x09-\x0D\x20]*= |
| 3017 |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
| 3018 |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
| 3019 |
$charset = defined $1 ? $1 : defined $2 ? $2 : $3; |
$self->{change_encoding} |
| 3020 |
} ## TODO: And if supported |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3); |
| 3021 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'content') |
| 3022 |
|
->set_user_data (manakai_has_reference => |
| 3023 |
|
$token->{attributes}->{content} |
| 3024 |
|
->{has_reference}); |
| 3025 |
|
} |
| 3026 |
|
} |
| 3027 |
|
} else { |
| 3028 |
|
if ($token->{attributes}->{charset}) { |
| 3029 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'charset') |
| 3030 |
|
->set_user_data (manakai_has_reference => |
| 3031 |
|
$token->{attributes}->{charset} |
| 3032 |
|
->{has_reference}); |
| 3033 |
|
} |
| 3034 |
|
if ($token->{attributes}->{content}) { |
| 3035 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'content') |
| 3036 |
|
->set_user_data (manakai_has_reference => |
| 3037 |
|
$token->{attributes}->{content} |
| 3038 |
|
->{has_reference}); |
| 3039 |
} |
} |
|
## TODO: Change the encoding |
|
| 3040 |
} |
} |
| 3041 |
|
|
|
## TODO: Extracting |charset| from |meta|. |
|
| 3042 |
pop @{$self->{open_elements}} |
pop @{$self->{open_elements}} |
| 3043 |
if $self->{insertion_mode} == AFTER_HEAD_IM; |
if $self->{insertion_mode} == AFTER_HEAD_IM; |
| 3044 |
!!!next-token; |
!!!next-token; |
| 4609 |
} elsif ($token->{tag_name} eq 'meta') { |
} elsif ($token->{tag_name} eq 'meta') { |
| 4610 |
## NOTE: This is an "as if in head" code clone, only "-t" differs |
## NOTE: This is an "as if in head" code clone, only "-t" differs |
| 4611 |
!!!insert-element-t ($token->{tag_name}, $token->{attributes}); |
!!!insert-element-t ($token->{tag_name}, $token->{attributes}); |
| 4612 |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
| 4613 |
|
|
| 4614 |
unless ($self->{confident}) { |
unless ($self->{confident}) { |
|
my $charset; |
|
| 4615 |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
| 4616 |
$charset = $token->{attributes}->{charset}->{value}; |
$self->{change_encoding} |
| 4617 |
} |
->($self, $token->{attributes}->{charset}->{value}); |
| 4618 |
if ($token->{attributes}->{'http-equiv'}) { |
|
| 4619 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'charset') |
| 4620 |
|
->set_user_data (manakai_has_reference => |
| 4621 |
|
$token->{attributes}->{charset} |
| 4622 |
|
->{has_reference}); |
| 4623 |
|
} elsif ($token->{attributes}->{content}) { |
| 4624 |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
| 4625 |
if ($token->{attributes}->{'http-equiv'}->{value} |
if ($token->{attributes}->{content}->{value} |
| 4626 |
=~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*= |
=~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt] |
| 4627 |
|
[\x09-\x0D\x20]*= |
| 4628 |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
| 4629 |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
| 4630 |
$charset = defined $1 ? $1 : defined $2 ? $2 : $3; |
$self->{change_encoding} |
| 4631 |
} ## TODO: And if supported |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3); |
| 4632 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'content') |
| 4633 |
|
->set_user_data (manakai_has_reference => |
| 4634 |
|
$token->{attributes}->{content} |
| 4635 |
|
->{has_reference}); |
| 4636 |
|
} |
| 4637 |
|
} |
| 4638 |
|
} else { |
| 4639 |
|
if ($token->{attributes}->{charset}) { |
| 4640 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'charset') |
| 4641 |
|
->set_user_data (manakai_has_reference => |
| 4642 |
|
$token->{attributes}->{charset} |
| 4643 |
|
->{has_reference}); |
| 4644 |
|
} |
| 4645 |
|
if ($token->{attributes}->{content}) { |
| 4646 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'content') |
| 4647 |
|
->set_user_data (manakai_has_reference => |
| 4648 |
|
$token->{attributes}->{content} |
| 4649 |
|
->{has_reference}); |
| 4650 |
} |
} |
|
## TODO: Change the encoding |
|
| 4651 |
} |
} |
| 4652 |
|
|
| 4653 |
!!!next-token; |
!!!next-token; |
| 5473 |
my $s = \$_[0]; |
my $s = \$_[0]; |
| 5474 |
my $onerror = $_[1]; |
my $onerror = $_[1]; |
| 5475 |
|
|
| 5476 |
|
## ISSUE: Should {confident} be true? |
| 5477 |
|
|
| 5478 |
my $nt = $node->node_type; |
my $nt = $node->node_type; |
| 5479 |
if ($nt == 9) { |
if ($nt == 9) { |
| 5480 |
# MUST |
# MUST |
| 5547 |
$p->_initialize_tree_constructor; |
$p->_initialize_tree_constructor; |
| 5548 |
|
|
| 5549 |
## Step 2 |
## Step 2 |
| 5550 |
my $node_ln = $node->local_name; |
my $node_ln = $node->manakai_local_name; |
| 5551 |
$p->{content_model} = { |
$p->{content_model} = { |
| 5552 |
title => RCDATA_CONTENT_MODEL, |
title => RCDATA_CONTENT_MODEL, |
| 5553 |
textarea => RCDATA_CONTENT_MODEL, |
textarea => RCDATA_CONTENT_MODEL, |
| 5587 |
if ($anode->node_type == 1) { |
if ($anode->node_type == 1) { |
| 5588 |
my $nsuri = $anode->namespace_uri; |
my $nsuri = $anode->namespace_uri; |
| 5589 |
if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') { |
if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') { |
| 5590 |
if ($anode->local_name eq 'form') { ## TODO: case? |
if ($anode->manakai_local_name eq 'form') { |
| 5591 |
$p->{form_element} = $anode; |
$p->{form_element} = $anode; |
| 5592 |
last AN; |
last AN; |
| 5593 |
} |
} |
| 5627 |
|
|
| 5628 |
} # tree construction stage |
} # tree construction stage |
| 5629 |
|
|
| 5630 |
sub get_inner_html ($$$) { |
package Whatpm::HTML::RestartParser; |
| 5631 |
my (undef, $node, $on_error) = @_; |
push our @ISA, 'Error'; |
|
|
|
|
## Step 1 |
|
|
my $s = ''; |
|
|
|
|
|
my $in_cdata; |
|
|
my $parent = $node; |
|
|
while (defined $parent) { |
|
|
if ($parent->node_type == 1 and |
|
|
$parent->namespace_uri eq 'http://www.w3.org/1999/xhtml' and |
|
|
{ |
|
|
style => 1, script => 1, xmp => 1, iframe => 1, |
|
|
noembed => 1, noframes => 1, noscript => 1, |
|
|
}->{$parent->local_name}) { ## TODO: case thingy |
|
|
$in_cdata = 1; |
|
|
} |
|
|
$parent = $parent->parent_node; |
|
|
} |
|
|
|
|
|
## Step 2 |
|
|
my @node = @{$node->child_nodes}; |
|
|
C: while (@node) { |
|
|
my $child = shift @node; |
|
|
unless (ref $child) { |
|
|
if ($child eq 'cdata-out') { |
|
|
$in_cdata = 0; |
|
|
} else { |
|
|
$s .= $child; # end tag |
|
|
} |
|
|
next C; |
|
|
} |
|
|
|
|
|
my $nt = $child->node_type; |
|
|
if ($nt == 1) { # Element |
|
|
my $tag_name = $child->tag_name; ## TODO: manakai_tag_name |
|
|
$s .= '<' . $tag_name; |
|
|
## NOTE: Non-HTML case: |
|
|
## <http://permalink.gmane.org/gmane.org.w3c.whatwg.discuss/11191> |
|
|
|
|
|
my @attrs = @{$child->attributes}; # sort order MUST be stable |
|
|
for my $attr (@attrs) { # order is implementation dependent |
|
|
my $attr_name = $attr->name; ## TODO: manakai_name |
|
|
$s .= ' ' . $attr_name . '="'; |
|
|
my $attr_value = $attr->value; |
|
|
## escape |
|
|
$attr_value =~ s/&/&/g; |
|
|
$attr_value =~ s/</</g; |
|
|
$attr_value =~ s/>/>/g; |
|
|
$attr_value =~ s/"/"/g; |
|
|
$s .= $attr_value . '"'; |
|
|
} |
|
|
$s .= '>'; |
|
|
|
|
|
next C if { |
|
|
area => 1, base => 1, basefont => 1, bgsound => 1, |
|
|
br => 1, col => 1, embed => 1, frame => 1, hr => 1, |
|
|
img => 1, input => 1, link => 1, meta => 1, param => 1, |
|
|
spacer => 1, wbr => 1, |
|
|
}->{$tag_name}; |
|
|
|
|
|
$s .= "\x0A" if $tag_name eq 'pre' or $tag_name eq 'textarea'; |
|
|
|
|
|
if (not $in_cdata and { |
|
|
style => 1, script => 1, xmp => 1, iframe => 1, |
|
|
noembed => 1, noframes => 1, noscript => 1, |
|
|
plaintext => 1, |
|
|
}->{$tag_name}) { |
|
|
unshift @node, 'cdata-out'; |
|
|
$in_cdata = 1; |
|
|
} |
|
|
|
|
|
unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>'; |
|
|
} elsif ($nt == 3 or $nt == 4) { |
|
|
if ($in_cdata) { |
|
|
$s .= $child->data; |
|
|
} else { |
|
|
my $value = $child->data; |
|
|
$value =~ s/&/&/g; |
|
|
$value =~ s/</</g; |
|
|
$value =~ s/>/>/g; |
|
|
$value =~ s/"/"/g; |
|
|
$s .= $value; |
|
|
} |
|
|
} elsif ($nt == 8) { |
|
|
$s .= '<!--' . $child->data . '-->'; |
|
|
} elsif ($nt == 10) { |
|
|
$s .= '<!DOCTYPE ' . $child->name . '>'; |
|
|
} elsif ($nt == 5) { # entrefs |
|
|
push @node, @{$child->child_nodes}; |
|
|
} else { |
|
|
$on_error->($child) if defined $on_error; |
|
|
} |
|
|
## ISSUE: This code does not support PIs. |
|
|
} # C |
|
|
|
|
|
## Step 3 |
|
|
return \$s; |
|
|
} # get_inner_html |
|
| 5632 |
|
|
| 5633 |
1; |
1; |
| 5634 |
# $Date$ |
# $Date$ |