| 1 |
package Whatpm::HTML; |
package Whatpm::HTML; |
| 2 |
use strict; |
use strict; |
| 3 |
our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
| 4 |
|
use Error qw(:try); |
| 5 |
|
|
| 6 |
## ISSUE: |
## ISSUE: |
| 7 |
## var doc = implementation.createDocument (null, null, null); |
## var doc = implementation.createDocument (null, null, null); |
| 85 |
}; |
}; |
| 86 |
# $phrasing_category: all other elements |
# $phrasing_category: all other elements |
| 87 |
|
|
| 88 |
|
sub parse_byte_string ($$$$;$) { |
| 89 |
|
my $self = ref $_[0] ? shift : shift->new; |
| 90 |
|
my $charset = shift; |
| 91 |
|
my $bytes_s = ref $_[0] ? $_[0] : \($_[0]); |
| 92 |
|
my $s; |
| 93 |
|
|
| 94 |
|
if (defined $charset) { |
| 95 |
|
require Encode; |
| 96 |
|
$s = \ (Encode::decode ($charset, $$bytes_s)); |
| 97 |
|
$self->{input_encoding} = lc $charset; ## TODO: normalize name ## TODO: set $doc->input_encoding |
| 98 |
|
$self->{confident} = 1; |
| 99 |
|
} else { |
| 100 |
|
$s = ref $_[0] ? $_[0] : \($_[0]); |
| 101 |
|
$self->{confident} = 0; |
| 102 |
|
} |
| 103 |
|
|
| 104 |
|
$self->{change_encoding} = sub { |
| 105 |
|
my $self = shift; |
| 106 |
|
my $charset = lc shift; |
| 107 |
|
## TODO: if $charset is supported |
| 108 |
|
## TODO: normalize charset name |
| 109 |
|
|
| 110 |
|
## "Change the encoding" algorithm: |
| 111 |
|
|
| 112 |
|
## Step 1 |
| 113 |
|
if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8? |
| 114 |
|
$charset = 'utf-8'; |
| 115 |
|
} |
| 116 |
|
|
| 117 |
|
## Step 2 |
| 118 |
|
if (defined $self->{input_encoding} and |
| 119 |
|
$self->{input_encoding} eq $charset) { |
| 120 |
|
$self->{confident} = 1; |
| 121 |
|
return; |
| 122 |
|
} |
| 123 |
|
|
| 124 |
|
!!!parse-error (type => 'charset label detected', level => 'w'); |
| 125 |
|
|
| 126 |
|
## Step 3 |
| 127 |
|
# if (can) { |
| 128 |
|
## change the encoding on the fly. |
| 129 |
|
#$self->{confident} = 1; |
| 130 |
|
#return; |
| 131 |
|
# } |
| 132 |
|
|
| 133 |
|
## Step 4 |
| 134 |
|
throw Whatpm::HTML::RestartParser (charset => $charset); |
| 135 |
|
}; # $self->{change_encoding} |
| 136 |
|
|
| 137 |
|
my @args = @_; shift @args; # $s |
| 138 |
|
my $return; |
| 139 |
|
try { |
| 140 |
|
$return = $self->parse_char_string ($s, @args); |
| 141 |
|
} catch Whatpm::HTML::RestartParser with { |
| 142 |
|
my $charset = shift->{charset}; |
| 143 |
|
$s = \ (Encode::decode ($charset, $$bytes_s)); |
| 144 |
|
$self->{input_encoding} = $charset; ## TODO: $doc->input_encoding; |
| 145 |
|
$self->{confident} = 1; |
| 146 |
|
$return = $self->parse_char_string ($s, @args); |
| 147 |
|
}; |
| 148 |
|
return $return; |
| 149 |
|
} # parse_byte_string |
| 150 |
|
|
| 151 |
|
*parse_char_string = \&parse_string; |
| 152 |
|
|
| 153 |
sub parse_string ($$$;$) { |
sub parse_string ($$$;$) { |
| 154 |
my $self = shift->new; |
my $self = ref $_[0] ? shift : shift->new; |
| 155 |
my $s = \$_[0]; |
my $s = ref $_[0] ? $_[0] : \($_[0]); |
| 156 |
$self->{document} = $_[1]; |
$self->{document} = $_[1]; |
| 157 |
|
@{$self->{document}->child_nodes} = (); |
| 158 |
|
|
| 159 |
## NOTE: |set_inner_html| copies most of this method's code |
## NOTE: |set_inner_html| copies most of this method's code |
| 160 |
|
|
| 161 |
|
$self->{confident} = 1 unless exists $self->{confident}; |
| 162 |
|
|
| 163 |
my $i = 0; |
my $i = 0; |
| 164 |
my $line = 1; |
my $line = 1; |
| 165 |
my $column = 0; |
my $column = 0; |
| 216 |
$self->{parse_error} = sub { |
$self->{parse_error} = sub { |
| 217 |
# |
# |
| 218 |
}; |
}; |
| 219 |
|
$self->{change_encoding} = sub { |
| 220 |
|
# if ($_[0] is a supported encoding) { |
| 221 |
|
# run "change the encoding" algorithm; |
| 222 |
|
# throw Whatpm::HTML::RestartParser (charset => $new_encoding); |
| 223 |
|
# } |
| 224 |
|
}; |
| 225 |
$self->{application_cache_selection} = sub { |
$self->{application_cache_selection} = sub { |
| 226 |
# |
# |
| 227 |
}; |
}; |
| 2860 |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
| 2861 |
|
|
| 2862 |
unless ($self->{confident}) { |
unless ($self->{confident}) { |
|
my $charset; |
|
| 2863 |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
| 2864 |
$charset = $token->{attributes}->{charset}->{value}; |
$self->{change_encoding} |
| 2865 |
} |
->($self, $token->{attributes}->{charset}->{value}); |
| 2866 |
if ($token->{attributes}->{'http-equiv'}) { |
} elsif ($token->{attributes}->{content}) { |
| 2867 |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
| 2868 |
if ($token->{attributes}->{'http-equiv'}->{value} |
if ($token->{attributes}->{content}->{value} |
| 2869 |
=~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*= |
=~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*= |
| 2870 |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
| 2871 |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
| 2872 |
$charset = defined $1 ? $1 : defined $2 ? $2 : $3; |
$self->{change_encoding} |
| 2873 |
} ## TODO: And if supported |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3); |
| 2874 |
|
} |
| 2875 |
} |
} |
|
## TODO: Change the encoding |
|
| 2876 |
} |
} |
| 2877 |
|
|
|
## TODO: Extracting |charset| from |meta|. |
|
| 2878 |
pop @{$self->{open_elements}} |
pop @{$self->{open_elements}} |
| 2879 |
if $self->{insertion_mode} == AFTER_HEAD_IM; |
if $self->{insertion_mode} == AFTER_HEAD_IM; |
| 2880 |
!!!next-token; |
!!!next-token; |
| 4448 |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
| 4449 |
|
|
| 4450 |
unless ($self->{confident}) { |
unless ($self->{confident}) { |
|
my $charset; |
|
| 4451 |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
| 4452 |
$charset = $token->{attributes}->{charset}->{value}; |
$self->{change_encoding} |
| 4453 |
} |
->($self, $token->{attributes}->{charset}->{value}); |
| 4454 |
if ($token->{attributes}->{'http-equiv'}) { |
} elsif ($token->{attributes}->{content}) { |
| 4455 |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
| 4456 |
if ($token->{attributes}->{'http-equiv'}->{value} |
if ($token->{attributes}->{content}->{value} |
| 4457 |
=~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*= |
=~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*= |
| 4458 |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
| 4459 |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
| 4460 |
$charset = defined $1 ? $1 : defined $2 ? $2 : $3; |
$self->{change_encoding} |
| 4461 |
} ## TODO: And if supported |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3); |
| 4462 |
|
} |
| 4463 |
} |
} |
|
## TODO: Change the encoding |
|
| 4464 |
} |
} |
| 4465 |
|
|
| 4466 |
!!!next-token; |
!!!next-token; |
| 5286 |
my $s = \$_[0]; |
my $s = \$_[0]; |
| 5287 |
my $onerror = $_[1]; |
my $onerror = $_[1]; |
| 5288 |
|
|
| 5289 |
|
## ISSUE: Should {confident} be true? |
| 5290 |
|
|
| 5291 |
my $nt = $node->node_type; |
my $nt = $node->node_type; |
| 5292 |
if ($nt == 9) { |
if ($nt == 9) { |
| 5293 |
# MUST |
# MUST |
| 5440 |
|
|
| 5441 |
} # tree construction stage |
} # tree construction stage |
| 5442 |
|
|
| 5443 |
|
package Whatpm::HTML::RestartParser; |
| 5444 |
|
push our @ISA, 'Error'; |
| 5445 |
|
|
| 5446 |
1; |
1; |
| 5447 |
# $Date$ |
# $Date$ |