| 1 |
package Whatpm::HTML; |
package Whatpm::HTML; |
| 2 |
use strict; |
use strict; |
| 3 |
our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
| 4 |
|
use Error qw(:try); |
| 5 |
|
|
| 6 |
## ISSUE: |
## ISSUE: |
| 7 |
## var doc = implementation.createDocument (null, null, null); |
## var doc = implementation.createDocument (null, null, null); |
| 85 |
}; |
}; |
| 86 |
# $phrasing_category: all other elements |
# $phrasing_category: all other elements |
| 87 |
|
|
| 88 |
|
sub parse_byte_string ($$$$;$) { |
| 89 |
|
my $self = ref $_[0] ? shift : shift->new; |
| 90 |
|
my $charset = shift; |
| 91 |
|
my $bytes_s = ref $_[0] ? $_[0] : \($_[0]); |
| 92 |
|
my $s; |
| 93 |
|
|
| 94 |
|
if (defined $charset) { |
| 95 |
|
require Encode; ## TODO: decode(utf8) don't delete BOM |
| 96 |
|
$s = \ (Encode::decode ($charset, $$bytes_s)); |
| 97 |
|
$self->{input_encoding} = lc $charset; ## TODO: normalize name |
| 98 |
|
$self->{confident} = 1; |
| 99 |
|
} else { |
| 100 |
|
## TODO: Implement HTML5 detection algorithm |
| 101 |
|
require Whatpm::Charset::UniversalCharDet; |
| 102 |
|
$charset = Whatpm::Charset::UniversalCharDet->detect_byte_string |
| 103 |
|
(substr ($$bytes_s, 0, 1024)); |
| 104 |
|
$charset ||= 'windows-1252'; |
| 105 |
|
$s = \ (Encode::decode ($charset, $$bytes_s)); |
| 106 |
|
$self->{input_encoding} = $charset; |
| 107 |
|
$self->{confident} = 0; |
| 108 |
|
} |
| 109 |
|
|
| 110 |
|
$self->{change_encoding} = sub { |
| 111 |
|
my $self = shift; |
| 112 |
|
my $charset = lc shift; |
| 113 |
|
## TODO: if $charset is supported |
| 114 |
|
## TODO: normalize charset name |
| 115 |
|
|
| 116 |
|
## "Change the encoding" algorithm: |
| 117 |
|
|
| 118 |
|
## Step 1 |
| 119 |
|
if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8? |
| 120 |
|
$charset = 'utf-8'; |
| 121 |
|
} |
| 122 |
|
|
| 123 |
|
## Step 2 |
| 124 |
|
if (defined $self->{input_encoding} and |
| 125 |
|
$self->{input_encoding} eq $charset) { |
| 126 |
|
$self->{confident} = 1; |
| 127 |
|
return; |
| 128 |
|
} |
| 129 |
|
|
| 130 |
|
!!!parse-error (type => 'charset label detected:'.$self->{input_encoding}. |
| 131 |
|
':'.$charset, level => 'w'); |
| 132 |
|
|
| 133 |
|
## Step 3 |
| 134 |
|
# if (can) { |
| 135 |
|
## change the encoding on the fly. |
| 136 |
|
#$self->{confident} = 1; |
| 137 |
|
#return; |
| 138 |
|
# } |
| 139 |
|
|
| 140 |
|
## Step 4 |
| 141 |
|
throw Whatpm::HTML::RestartParser (charset => $charset); |
| 142 |
|
}; # $self->{change_encoding} |
| 143 |
|
|
| 144 |
|
my @args = @_; shift @args; # $s |
| 145 |
|
my $return; |
| 146 |
|
try { |
| 147 |
|
$return = $self->parse_char_string ($s, @args); |
| 148 |
|
} catch Whatpm::HTML::RestartParser with { |
| 149 |
|
my $charset = shift->{charset}; |
| 150 |
|
$s = \ (Encode::decode ($charset, $$bytes_s)); |
| 151 |
|
$self->{input_encoding} = $charset; ## TODO: normalize |
| 152 |
|
$self->{confident} = 1; |
| 153 |
|
$return = $self->parse_char_string ($s, @args); |
| 154 |
|
}; |
| 155 |
|
return $return; |
| 156 |
|
} # parse_byte_string |
| 157 |
|
|
| 158 |
|
*parse_char_string = \&parse_string; |
| 159 |
|
|
| 160 |
sub parse_string ($$$;$) { |
sub parse_string ($$$;$) { |
| 161 |
my $self = shift->new; |
my $self = ref $_[0] ? shift : shift->new; |
| 162 |
my $s = \$_[0]; |
my $s = ref $_[0] ? $_[0] : \($_[0]); |
| 163 |
$self->{document} = $_[1]; |
$self->{document} = $_[1]; |
| 164 |
|
@{$self->{document}->child_nodes} = (); |
| 165 |
|
|
| 166 |
## NOTE: |set_inner_html| copies most of this method's code |
## NOTE: |set_inner_html| copies most of this method's code |
| 167 |
|
|
| 168 |
|
$self->{confident} = 1 unless exists $self->{confident}; |
| 169 |
|
$self->{document}->input_encoding ($self->{input_encoding}) |
| 170 |
|
if defined $self->{input_encoding}; |
| 171 |
|
|
| 172 |
my $i = 0; |
my $i = 0; |
| 173 |
my $line = 1; |
my $line = 1; |
| 174 |
my $column = 0; |
my $column = 0; |
| 225 |
$self->{parse_error} = sub { |
$self->{parse_error} = sub { |
| 226 |
# |
# |
| 227 |
}; |
}; |
| 228 |
|
$self->{change_encoding} = sub { |
| 229 |
|
# if ($_[0] is a supported encoding) { |
| 230 |
|
# run "change the encoding" algorithm; |
| 231 |
|
# throw Whatpm::HTML::RestartParser (charset => $new_encoding); |
| 232 |
|
# } |
| 233 |
|
}; |
| 234 |
$self->{application_cache_selection} = sub { |
$self->{application_cache_selection} = sub { |
| 235 |
# |
# |
| 236 |
}; |
}; |
| 340 |
## ->{system_identifier} (DOCTYPE_TOKEN) |
## ->{system_identifier} (DOCTYPE_TOKEN) |
| 341 |
## ->{correct} == 1 or 0 (DOCTYPE_TOKEN) |
## ->{correct} == 1 or 0 (DOCTYPE_TOKEN) |
| 342 |
## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN) |
## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN) |
| 343 |
|
## ->{name} |
| 344 |
|
## ->{value} |
| 345 |
|
## ->{has_reference} == 1 or 0 |
| 346 |
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN) |
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN) |
| 347 |
|
|
| 348 |
## Emitted token MUST immediately be handled by the tree construction state. |
## Emitted token MUST immediately be handled by the tree construction state. |
| 1118 |
$self->{current_attribute}->{value} .= '&'; |
$self->{current_attribute}->{value} .= '&'; |
| 1119 |
} else { |
} else { |
| 1120 |
$self->{current_attribute}->{value} .= $token->{data}; |
$self->{current_attribute}->{value} .= $token->{data}; |
| 1121 |
|
$self->{current_attribute}->{has_reference} = $token->{has_reference}; |
| 1122 |
## ISSUE: spec says "append the returned character token to the current attribute's value" |
## ISSUE: spec says "append the returned character token to the current attribute's value" |
| 1123 |
} |
} |
| 1124 |
|
|
| 1845 |
$code = $c1_entity_char->{$code}; |
$code = $c1_entity_char->{$code}; |
| 1846 |
} |
} |
| 1847 |
|
|
| 1848 |
return {type => CHARACTER_TOKEN, data => chr $code}; |
return {type => CHARACTER_TOKEN, data => chr $code, |
| 1849 |
|
has_reference => 1}; |
| 1850 |
} # X |
} # X |
| 1851 |
} elsif (0x0030 <= $self->{next_input_character} and |
} elsif (0x0030 <= $self->{next_input_character} and |
| 1852 |
$self->{next_input_character} <= 0x0039) { # 0..9 |
$self->{next_input_character} <= 0x0039) { # 0..9 |
| 1881 |
$code = $c1_entity_char->{$code}; |
$code = $c1_entity_char->{$code}; |
| 1882 |
} |
} |
| 1883 |
|
|
| 1884 |
return {type => CHARACTER_TOKEN, data => chr $code}; |
return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1}; |
| 1885 |
} else { |
} else { |
| 1886 |
!!!parse-error (type => 'bare nero'); |
!!!parse-error (type => 'bare nero'); |
| 1887 |
!!!back-next-input-character ($self->{next_input_character}); |
!!!back-next-input-character ($self->{next_input_character}); |
| 1929 |
} |
} |
| 1930 |
|
|
| 1931 |
if ($match > 0) { |
if ($match > 0) { |
| 1932 |
return {type => CHARACTER_TOKEN, data => $value}; |
return {type => CHARACTER_TOKEN, data => $value, has_reference => 1}; |
| 1933 |
} elsif ($match < 0) { |
} elsif ($match < 0) { |
| 1934 |
!!!parse-error (type => 'no refc'); |
!!!parse-error (type => 'no refc'); |
| 1935 |
if ($in_attr and $match < -1) { |
if ($in_attr and $match < -1) { |
| 1936 |
return {type => CHARACTER_TOKEN, data => '&'.$entity_name}; |
return {type => CHARACTER_TOKEN, data => '&'.$entity_name}; |
| 1937 |
} else { |
} else { |
| 1938 |
return {type => CHARACTER_TOKEN, data => $value}; |
return {type => CHARACTER_TOKEN, data => $value, has_reference => 1}; |
| 1939 |
} |
} |
| 1940 |
} else { |
} else { |
| 1941 |
!!!parse-error (type => 'bare ero'); |
!!!parse-error (type => 'bare ero'); |
| 1942 |
## NOTE: No characters are consumed in the spec. |
## NOTE: "No characters are consumed" in the spec. |
| 1943 |
return {type => CHARACTER_TOKEN, data => '&'.$value}; |
return {type => CHARACTER_TOKEN, data => '&'.$value}; |
| 1944 |
} |
} |
| 1945 |
} else { |
} else { |
| 2193 |
# |
# |
| 2194 |
} elsif ($token->{type} == START_TAG_TOKEN) { |
} elsif ($token->{type} == START_TAG_TOKEN) { |
| 2195 |
if ($token->{tag_name} eq 'html' and |
if ($token->{tag_name} eq 'html' and |
| 2196 |
$token->{attributes}->{manifest}) { ## ISSUE: Spec spells as "application" |
$token->{attributes}->{manifest}) { |
| 2197 |
$self->{application_cache_selection} |
$self->{application_cache_selection} |
| 2198 |
->($token->{attributes}->{manifest}->{value}); |
->($token->{attributes}->{manifest}->{value}); |
| 2199 |
## ISSUE: No relative reference resolution? |
## ISSUE: No relative reference resolution? |
| 2871 |
push @{$self->{open_elements}}, [$self->{head_element}, 'head']; |
push @{$self->{open_elements}}, [$self->{head_element}, 'head']; |
| 2872 |
} |
} |
| 2873 |
!!!insert-element ($token->{tag_name}, $token->{attributes}); |
!!!insert-element ($token->{tag_name}, $token->{attributes}); |
| 2874 |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
| 2875 |
|
|
| 2876 |
unless ($self->{confident}) { |
unless ($self->{confident}) { |
|
my $charset; |
|
| 2877 |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
| 2878 |
$charset = $token->{attributes}->{charset}->{value}; |
$self->{change_encoding} |
| 2879 |
} |
->($self, $token->{attributes}->{charset}->{value}); |
| 2880 |
if ($token->{attributes}->{'http-equiv'}) { |
|
| 2881 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'charset') |
| 2882 |
|
->set_user_data (manakai_has_reference => |
| 2883 |
|
$token->{attributes}->{charset} |
| 2884 |
|
->{has_reference}); |
| 2885 |
|
} elsif ($token->{attributes}->{content}) { |
| 2886 |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
| 2887 |
if ($token->{attributes}->{'http-equiv'}->{value} |
if ($token->{attributes}->{content}->{value} |
| 2888 |
=~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*= |
=~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*= |
| 2889 |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
| 2890 |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
| 2891 |
$charset = defined $1 ? $1 : defined $2 ? $2 : $3; |
$self->{change_encoding} |
| 2892 |
} ## TODO: And if supported |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3); |
| 2893 |
|
} |
| 2894 |
|
} |
| 2895 |
|
} else { |
| 2896 |
|
if ($token->{attributes}->{charset}) { |
| 2897 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'charset') |
| 2898 |
|
->set_user_data (manakai_has_reference => |
| 2899 |
|
$token->{attributes}->{charset} |
| 2900 |
|
->{has_reference}); |
| 2901 |
} |
} |
|
## TODO: Change the encoding |
|
| 2902 |
} |
} |
| 2903 |
|
|
|
## TODO: Extracting |charset| from |meta|. |
|
| 2904 |
pop @{$self->{open_elements}} |
pop @{$self->{open_elements}} |
| 2905 |
if $self->{insertion_mode} == AFTER_HEAD_IM; |
if $self->{insertion_mode} == AFTER_HEAD_IM; |
| 2906 |
!!!next-token; |
!!!next-token; |
| 4471 |
} elsif ($token->{tag_name} eq 'meta') { |
} elsif ($token->{tag_name} eq 'meta') { |
| 4472 |
## NOTE: This is an "as if in head" code clone, only "-t" differs |
## NOTE: This is an "as if in head" code clone, only "-t" differs |
| 4473 |
!!!insert-element-t ($token->{tag_name}, $token->{attributes}); |
!!!insert-element-t ($token->{tag_name}, $token->{attributes}); |
| 4474 |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
| 4475 |
|
|
| 4476 |
unless ($self->{confident}) { |
unless ($self->{confident}) { |
|
my $charset; |
|
| 4477 |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
| 4478 |
$charset = $token->{attributes}->{charset}->{value}; |
$self->{change_encoding} |
| 4479 |
} |
->($self, $token->{attributes}->{charset}->{value}); |
| 4480 |
if ($token->{attributes}->{'http-equiv'}) { |
|
| 4481 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'charset') |
| 4482 |
|
->set_user_data (manakai_has_reference => |
| 4483 |
|
$token->{attributes}->{charset} |
| 4484 |
|
->{has_reference}); |
| 4485 |
|
} elsif ($token->{attributes}->{content}) { |
| 4486 |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
| 4487 |
if ($token->{attributes}->{'http-equiv'}->{value} |
if ($token->{attributes}->{content}->{value} |
| 4488 |
=~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*= |
=~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*= |
| 4489 |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
| 4490 |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
| 4491 |
$charset = defined $1 ? $1 : defined $2 ? $2 : $3; |
$self->{change_encoding} |
| 4492 |
} ## TODO: And if supported |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3); |
| 4493 |
|
} |
| 4494 |
|
} |
| 4495 |
|
} else { |
| 4496 |
|
if ($token->{attributes}->{charset}) { |
| 4497 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'charset') |
| 4498 |
|
->set_user_data (manakai_has_reference => |
| 4499 |
|
$token->{attributes}->{charset} |
| 4500 |
|
->{has_reference}); |
| 4501 |
} |
} |
|
## TODO: Change the encoding |
|
| 4502 |
} |
} |
| 4503 |
|
|
| 4504 |
!!!next-token; |
!!!next-token; |
| 5324 |
my $s = \$_[0]; |
my $s = \$_[0]; |
| 5325 |
my $onerror = $_[1]; |
my $onerror = $_[1]; |
| 5326 |
|
|
| 5327 |
|
## ISSUE: Should {confident} be true? |
| 5328 |
|
|
| 5329 |
my $nt = $node->node_type; |
my $nt = $node->node_type; |
| 5330 |
if ($nt == 9) { |
if ($nt == 9) { |
| 5331 |
# MUST |
# MUST |
| 5478 |
|
|
| 5479 |
} # tree construction stage |
} # tree construction stage |
| 5480 |
|
|
| 5481 |
sub get_inner_html ($$$) { |
package Whatpm::HTML::RestartParser; |
| 5482 |
my (undef, $node, $on_error) = @_; |
push our @ISA, 'Error'; |
|
|
|
|
## Step 1 |
|
|
my $s = ''; |
|
|
|
|
|
my $in_cdata; |
|
|
my $parent = $node; |
|
|
while (defined $parent) { |
|
|
if ($parent->node_type == 1 and |
|
|
$parent->namespace_uri eq 'http://www.w3.org/1999/xhtml' and |
|
|
{ |
|
|
style => 1, script => 1, xmp => 1, iframe => 1, |
|
|
noembed => 1, noframes => 1, noscript => 1, |
|
|
}->{$parent->local_name}) { ## TODO: case thingy |
|
|
$in_cdata = 1; |
|
|
} |
|
|
$parent = $parent->parent_node; |
|
|
} |
|
|
|
|
|
## Step 2 |
|
|
my @node = @{$node->child_nodes}; |
|
|
C: while (@node) { |
|
|
my $child = shift @node; |
|
|
unless (ref $child) { |
|
|
if ($child eq 'cdata-out') { |
|
|
$in_cdata = 0; |
|
|
} else { |
|
|
$s .= $child; # end tag |
|
|
} |
|
|
next C; |
|
|
} |
|
|
|
|
|
my $nt = $child->node_type; |
|
|
if ($nt == 1) { # Element |
|
|
my $tag_name = $child->tag_name; ## TODO: manakai_tag_name |
|
|
$s .= '<' . $tag_name; |
|
|
## NOTE: Non-HTML case: |
|
|
## <http://permalink.gmane.org/gmane.org.w3c.whatwg.discuss/11191> |
|
|
|
|
|
my @attrs = @{$child->attributes}; # sort order MUST be stable |
|
|
for my $attr (@attrs) { # order is implementation dependent |
|
|
my $attr_name = $attr->name; ## TODO: manakai_name |
|
|
$s .= ' ' . $attr_name . '="'; |
|
|
my $attr_value = $attr->value; |
|
|
## escape |
|
|
$attr_value =~ s/&/&/g; |
|
|
$attr_value =~ s/</</g; |
|
|
$attr_value =~ s/>/>/g; |
|
|
$attr_value =~ s/"/"/g; |
|
|
$s .= $attr_value . '"'; |
|
|
} |
|
|
$s .= '>'; |
|
|
|
|
|
next C if { |
|
|
area => 1, base => 1, basefont => 1, bgsound => 1, |
|
|
br => 1, col => 1, embed => 1, frame => 1, hr => 1, |
|
|
img => 1, input => 1, link => 1, meta => 1, param => 1, |
|
|
spacer => 1, wbr => 1, |
|
|
}->{$tag_name}; |
|
|
|
|
|
$s .= "\x0A" if $tag_name eq 'pre' or $tag_name eq 'textarea'; |
|
|
|
|
|
if (not $in_cdata and { |
|
|
style => 1, script => 1, xmp => 1, iframe => 1, |
|
|
noembed => 1, noframes => 1, noscript => 1, |
|
|
plaintext => 1, |
|
|
}->{$tag_name}) { |
|
|
unshift @node, 'cdata-out'; |
|
|
$in_cdata = 1; |
|
|
} |
|
|
|
|
|
unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>'; |
|
|
} elsif ($nt == 3 or $nt == 4) { |
|
|
if ($in_cdata) { |
|
|
$s .= $child->data; |
|
|
} else { |
|
|
my $value = $child->data; |
|
|
$value =~ s/&/&/g; |
|
|
$value =~ s/</</g; |
|
|
$value =~ s/>/>/g; |
|
|
$value =~ s/"/"/g; |
|
|
$s .= $value; |
|
|
} |
|
|
} elsif ($nt == 8) { |
|
|
$s .= '<!--' . $child->data . '-->'; |
|
|
} elsif ($nt == 10) { |
|
|
$s .= '<!DOCTYPE ' . $child->name . '>'; |
|
|
} elsif ($nt == 5) { # entrefs |
|
|
push @node, @{$child->child_nodes}; |
|
|
} else { |
|
|
$on_error->($child) if defined $on_error; |
|
|
} |
|
|
## ISSUE: This code does not support PIs. |
|
|
} # C |
|
|
|
|
|
## Step 3 |
|
|
return \$s; |
|
|
} # get_inner_html |
|
| 5483 |
|
|
| 5484 |
1; |
1; |
| 5485 |
# $Date$ |
# $Date$ |