| 1 |
package Whatpm::HTML; |
package Whatpm::HTML; |
| 2 |
use strict; |
use strict; |
| 3 |
our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
| 4 |
|
use Error qw(:try); |
| 5 |
|
|
| 6 |
## ISSUE: |
## ISSUE: |
| 7 |
## var doc = implementation.createDocument (null, null, null); |
## var doc = implementation.createDocument (null, null, null); |
| 85 |
}; |
}; |
| 86 |
# $phrasing_category: all other elements |
# $phrasing_category: all other elements |
| 87 |
|
|
| 88 |
|
sub parse_byte_string ($$$$;$) { |
| 89 |
|
my $self = ref $_[0] ? shift : shift->new; |
| 90 |
|
my $charset = shift; |
| 91 |
|
my $bytes_s = ref $_[0] ? $_[0] : \($_[0]); |
| 92 |
|
my $s; |
| 93 |
|
|
| 94 |
|
if (defined $charset) { |
| 95 |
|
require Encode; ## TODO: decode(utf8) don't delete BOM |
| 96 |
|
$s = \ (Encode::decode ($charset, $$bytes_s)); |
| 97 |
|
$self->{input_encoding} = lc $charset; ## TODO: normalize name |
| 98 |
|
$self->{confident} = 1; |
| 99 |
|
} else { |
| 100 |
|
## TODO: Implement HTML5 detection algorithm |
| 101 |
|
require Whatpm::Charset::UniversalCharDet; |
| 102 |
|
$charset = Whatpm::Charset::UniversalCharDet->detect_byte_string |
| 103 |
|
(substr ($$bytes_s, 0, 1024)); |
| 104 |
|
$charset ||= 'windows-1252'; |
| 105 |
|
$s = \ (Encode::decode ($charset, $$bytes_s)); |
| 106 |
|
$self->{input_encoding} = $charset; |
| 107 |
|
$self->{confident} = 0; |
| 108 |
|
} |
| 109 |
|
|
| 110 |
|
$self->{change_encoding} = sub { |
| 111 |
|
my $self = shift; |
| 112 |
|
my $charset = lc shift; |
| 113 |
|
## TODO: if $charset is supported |
| 114 |
|
## TODO: normalize charset name |
| 115 |
|
|
| 116 |
|
## "Change the encoding" algorithm: |
| 117 |
|
|
| 118 |
|
## Step 1 |
| 119 |
|
if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8? |
| 120 |
|
$charset = 'utf-8'; |
| 121 |
|
} |
| 122 |
|
|
| 123 |
|
## Step 2 |
| 124 |
|
if (defined $self->{input_encoding} and |
| 125 |
|
$self->{input_encoding} eq $charset) { |
| 126 |
|
$self->{confident} = 1; |
| 127 |
|
return; |
| 128 |
|
} |
| 129 |
|
|
| 130 |
|
!!!parse-error (type => 'charset label detected:'.$self->{input_encoding}. |
| 131 |
|
':'.$charset, level => 'w'); |
| 132 |
|
|
| 133 |
|
## Step 3 |
| 134 |
|
# if (can) { |
| 135 |
|
## change the encoding on the fly. |
| 136 |
|
#$self->{confident} = 1; |
| 137 |
|
#return; |
| 138 |
|
# } |
| 139 |
|
|
| 140 |
|
## Step 4 |
| 141 |
|
throw Whatpm::HTML::RestartParser (charset => $charset); |
| 142 |
|
}; # $self->{change_encoding} |
| 143 |
|
|
| 144 |
|
my @args = @_; shift @args; # $s |
| 145 |
|
my $return; |
| 146 |
|
try { |
| 147 |
|
$return = $self->parse_char_string ($s, @args); |
| 148 |
|
} catch Whatpm::HTML::RestartParser with { |
| 149 |
|
my $charset = shift->{charset}; |
| 150 |
|
$s = \ (Encode::decode ($charset, $$bytes_s)); |
| 151 |
|
$self->{input_encoding} = $charset; ## TODO: normalize |
| 152 |
|
$self->{confident} = 1; |
| 153 |
|
$return = $self->parse_char_string ($s, @args); |
| 154 |
|
}; |
| 155 |
|
return $return; |
| 156 |
|
} # parse_byte_string |
| 157 |
|
|
| 158 |
|
*parse_char_string = \&parse_string; |
| 159 |
|
|
| 160 |
sub parse_string ($$$;$) { |
sub parse_string ($$$;$) { |
| 161 |
my $self = shift->new; |
my $self = ref $_[0] ? shift : shift->new; |
| 162 |
my $s = \$_[0]; |
my $s = ref $_[0] ? $_[0] : \($_[0]); |
| 163 |
$self->{document} = $_[1]; |
$self->{document} = $_[1]; |
| 164 |
|
@{$self->{document}->child_nodes} = (); |
| 165 |
|
|
| 166 |
## NOTE: |set_inner_html| copies most of this method's code |
## NOTE: |set_inner_html| copies most of this method's code |
| 167 |
|
|
| 168 |
|
$self->{confident} = 1 unless exists $self->{confident}; |
| 169 |
|
$self->{document}->input_encoding ($self->{input_encoding}) |
| 170 |
|
if defined $self->{input_encoding}; |
| 171 |
|
|
| 172 |
my $i = 0; |
my $i = 0; |
| 173 |
my $line = 1; |
my $line = 1; |
| 174 |
my $column = 0; |
my $column = 0; |
| 225 |
$self->{parse_error} = sub { |
$self->{parse_error} = sub { |
| 226 |
# |
# |
| 227 |
}; |
}; |
| 228 |
|
$self->{change_encoding} = sub { |
| 229 |
|
# if ($_[0] is a supported encoding) { |
| 230 |
|
# run "change the encoding" algorithm; |
| 231 |
|
# throw Whatpm::HTML::RestartParser (charset => $new_encoding); |
| 232 |
|
# } |
| 233 |
|
}; |
| 234 |
$self->{application_cache_selection} = sub { |
$self->{application_cache_selection} = sub { |
| 235 |
# |
# |
| 236 |
}; |
}; |
| 2869 |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
| 2870 |
|
|
| 2871 |
unless ($self->{confident}) { |
unless ($self->{confident}) { |
|
my $charset; |
|
| 2872 |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
| 2873 |
$charset = $token->{attributes}->{charset}->{value}; |
$self->{change_encoding} |
| 2874 |
} |
->($self, $token->{attributes}->{charset}->{value}); |
| 2875 |
if ($token->{attributes}->{'http-equiv'}) { |
} elsif ($token->{attributes}->{content}) { |
| 2876 |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
| 2877 |
if ($token->{attributes}->{'http-equiv'}->{value} |
if ($token->{attributes}->{content}->{value} |
| 2878 |
=~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*= |
=~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*= |
| 2879 |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
| 2880 |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
| 2881 |
$charset = defined $1 ? $1 : defined $2 ? $2 : $3; |
$self->{change_encoding} |
| 2882 |
} ## TODO: And if supported |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3); |
| 2883 |
|
} |
| 2884 |
} |
} |
|
## TODO: Change the encoding |
|
| 2885 |
} |
} |
| 2886 |
|
|
|
## TODO: Extracting |charset| from |meta|. |
|
| 2887 |
pop @{$self->{open_elements}} |
pop @{$self->{open_elements}} |
| 2888 |
if $self->{insertion_mode} == AFTER_HEAD_IM; |
if $self->{insertion_mode} == AFTER_HEAD_IM; |
| 2889 |
!!!next-token; |
!!!next-token; |
| 4457 |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
| 4458 |
|
|
| 4459 |
unless ($self->{confident}) { |
unless ($self->{confident}) { |
|
my $charset; |
|
| 4460 |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
| 4461 |
$charset = $token->{attributes}->{charset}->{value}; |
$self->{change_encoding} |
| 4462 |
} |
->($self, $token->{attributes}->{charset}->{value}); |
| 4463 |
if ($token->{attributes}->{'http-equiv'}) { |
} elsif ($token->{attributes}->{content}) { |
| 4464 |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
| 4465 |
if ($token->{attributes}->{'http-equiv'}->{value} |
if ($token->{attributes}->{content}->{value} |
| 4466 |
=~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*= |
=~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*= |
| 4467 |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
| 4468 |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
| 4469 |
$charset = defined $1 ? $1 : defined $2 ? $2 : $3; |
$self->{change_encoding} |
| 4470 |
} ## TODO: And if supported |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3); |
| 4471 |
|
} |
| 4472 |
} |
} |
|
## TODO: Change the encoding |
|
| 4473 |
} |
} |
| 4474 |
|
|
| 4475 |
!!!next-token; |
!!!next-token; |
| 5295 |
my $s = \$_[0]; |
my $s = \$_[0]; |
| 5296 |
my $onerror = $_[1]; |
my $onerror = $_[1]; |
| 5297 |
|
|
| 5298 |
|
## ISSUE: Should {confident} be true? |
| 5299 |
|
|
| 5300 |
my $nt = $node->node_type; |
my $nt = $node->node_type; |
| 5301 |
if ($nt == 9) { |
if ($nt == 9) { |
| 5302 |
# MUST |
# MUST |
| 5449 |
|
|
| 5450 |
} # tree construction stage |
} # tree construction stage |
| 5451 |
|
|
| 5452 |
sub get_inner_html ($$$) { |
package Whatpm::HTML::RestartParser; |
| 5453 |
my (undef, $node, $on_error) = @_; |
push our @ISA, 'Error'; |
|
|
|
|
## Step 1 |
|
|
my $s = ''; |
|
|
|
|
|
my $in_cdata; |
|
|
my $parent = $node; |
|
|
while (defined $parent) { |
|
|
if ($parent->node_type == 1 and |
|
|
$parent->namespace_uri eq 'http://www.w3.org/1999/xhtml' and |
|
|
{ |
|
|
style => 1, script => 1, xmp => 1, iframe => 1, |
|
|
noembed => 1, noframes => 1, noscript => 1, |
|
|
}->{$parent->local_name}) { ## TODO: case thingy |
|
|
$in_cdata = 1; |
|
|
} |
|
|
$parent = $parent->parent_node; |
|
|
} |
|
|
|
|
|
## Step 2 |
|
|
my @node = @{$node->child_nodes}; |
|
|
C: while (@node) { |
|
|
my $child = shift @node; |
|
|
unless (ref $child) { |
|
|
if ($child eq 'cdata-out') { |
|
|
$in_cdata = 0; |
|
|
} else { |
|
|
$s .= $child; # end tag |
|
|
} |
|
|
next C; |
|
|
} |
|
|
|
|
|
my $nt = $child->node_type; |
|
|
if ($nt == 1) { # Element |
|
|
my $tag_name = $child->tag_name; ## TODO: manakai_tag_name |
|
|
$s .= '<' . $tag_name; |
|
|
## NOTE: Non-HTML case: |
|
|
## <http://permalink.gmane.org/gmane.org.w3c.whatwg.discuss/11191> |
|
|
|
|
|
my @attrs = @{$child->attributes}; # sort order MUST be stable |
|
|
for my $attr (@attrs) { # order is implementation dependent |
|
|
my $attr_name = $attr->name; ## TODO: manakai_name |
|
|
$s .= ' ' . $attr_name . '="'; |
|
|
my $attr_value = $attr->value; |
|
|
## escape |
|
|
$attr_value =~ s/&/&/g; |
|
|
$attr_value =~ s/</</g; |
|
|
$attr_value =~ s/>/>/g; |
|
|
$attr_value =~ s/"/"/g; |
|
|
$s .= $attr_value . '"'; |
|
|
} |
|
|
$s .= '>'; |
|
|
|
|
|
next C if { |
|
|
area => 1, base => 1, basefont => 1, bgsound => 1, |
|
|
br => 1, col => 1, embed => 1, frame => 1, hr => 1, |
|
|
img => 1, input => 1, link => 1, meta => 1, param => 1, |
|
|
spacer => 1, wbr => 1, |
|
|
}->{$tag_name}; |
|
|
|
|
|
$s .= "\x0A" if $tag_name eq 'pre' or $tag_name eq 'textarea'; |
|
|
|
|
|
if (not $in_cdata and { |
|
|
style => 1, script => 1, xmp => 1, iframe => 1, |
|
|
noembed => 1, noframes => 1, noscript => 1, |
|
|
plaintext => 1, |
|
|
}->{$tag_name}) { |
|
|
unshift @node, 'cdata-out'; |
|
|
$in_cdata = 1; |
|
|
} |
|
|
|
|
|
unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>'; |
|
|
} elsif ($nt == 3 or $nt == 4) { |
|
|
if ($in_cdata) { |
|
|
$s .= $child->data; |
|
|
} else { |
|
|
my $value = $child->data; |
|
|
$value =~ s/&/&/g; |
|
|
$value =~ s/</</g; |
|
|
$value =~ s/>/>/g; |
|
|
$value =~ s/"/"/g; |
|
|
$s .= $value; |
|
|
} |
|
|
} elsif ($nt == 8) { |
|
|
$s .= '<!--' . $child->data . '-->'; |
|
|
} elsif ($nt == 10) { |
|
|
$s .= '<!DOCTYPE ' . $child->name . '>'; |
|
|
} elsif ($nt == 5) { # entrefs |
|
|
push @node, @{$child->child_nodes}; |
|
|
} else { |
|
|
$on_error->($child) if defined $on_error; |
|
|
} |
|
|
## ISSUE: This code does not support PIs. |
|
|
} # C |
|
|
|
|
|
## Step 3 |
|
|
return \$s; |
|
|
} # get_inner_html |
|
| 5454 |
|
|
| 5455 |
1; |
1; |
| 5456 |
# $Date$ |
# $Date$ |