1 |
package Whatpm::HTML; |
package Whatpm::HTML; |
2 |
use strict; |
use strict; |
3 |
our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
4 |
|
use Error qw(:try); |
5 |
|
|
6 |
## ISSUE: |
## ISSUE: |
7 |
## var doc = implementation.createDocument (null, null, null); |
## var doc = implementation.createDocument (null, null, null); |
85 |
}; |
}; |
86 |
# $phrasing_category: all other elements |
# $phrasing_category: all other elements |
87 |
|
|
88 |
|
sub parse_byte_string ($$$$;$) { |
89 |
|
my $self = ref $_[0] ? shift : shift->new; |
90 |
|
my $charset = shift; |
91 |
|
my $bytes_s = ref $_[0] ? $_[0] : \($_[0]); |
92 |
|
my $s; |
93 |
|
|
94 |
|
if (defined $charset) { |
95 |
|
require Encode; |
96 |
|
$s = \ (Encode::decode ($charset, $$bytes_s)); |
97 |
|
$self->{input_encoding} = lc $charset; ## TODO: normalize name ## TODO: set $doc->input_encoding |
98 |
|
$self->{confident} = 1; |
99 |
|
} else { |
100 |
|
$s = ref $_[0] ? $_[0] : \($_[0]); |
101 |
|
$self->{confident} = 0; |
102 |
|
} |
103 |
|
|
104 |
|
$self->{change_encoding} = sub { |
105 |
|
my $self = shift; |
106 |
|
my $charset = lc shift; |
107 |
|
## TODO: if $charset is supported |
108 |
|
## TODO: normalize charset name |
109 |
|
|
110 |
|
## "Change the encoding" algorithm: |
111 |
|
|
112 |
|
## Step 1 |
113 |
|
if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8? |
114 |
|
$charset = 'utf-8'; |
115 |
|
} |
116 |
|
|
117 |
|
## Step 2 |
118 |
|
if (defined $self->{input_encoding} and |
119 |
|
$self->{input_encoding} eq $charset) { |
120 |
|
$self->{confident} = 1; |
121 |
|
return; |
122 |
|
} |
123 |
|
|
124 |
|
!!!parse-error (type => 'charset label detected', level => 'w'); |
125 |
|
|
126 |
|
## Step 3 |
127 |
|
# if (can) { |
128 |
|
## change the encoding on the fly. |
129 |
|
#$self->{confident} = 1; |
130 |
|
#return; |
131 |
|
# } |
132 |
|
|
133 |
|
## Step 4 |
134 |
|
throw Whatpm::HTML::RestartParser (charset => $charset); |
135 |
|
}; # $self->{change_encoding} |
136 |
|
|
137 |
|
my @args = @_; shift @args; # $s |
138 |
|
my $return; |
139 |
|
try { |
140 |
|
$return = $self->parse_char_string ($s, @args); |
141 |
|
} catch Whatpm::HTML::RestartParser with { |
142 |
|
my $charset = shift->{charset}; |
143 |
|
$s = \ (Encode::decode ($charset, $$bytes_s)); |
144 |
|
$self->{input_encoding} = $charset; ## TODO: $doc->input_encoding; |
145 |
|
$self->{confident} = 1; |
146 |
|
$return = $self->parse_char_string ($s, @args); |
147 |
|
}; |
148 |
|
return $return; |
149 |
|
} # parse_byte_string |
150 |
|
|
151 |
|
*parse_char_string = \&parse_string; |
152 |
|
|
153 |
sub parse_string ($$$;$) { |
sub parse_string ($$$;$) { |
154 |
my $self = shift->new; |
my $self = ref $_[0] ? shift : shift->new; |
155 |
my $s = \$_[0]; |
my $s = ref $_[0] ? $_[0] : \($_[0]); |
156 |
$self->{document} = $_[1]; |
$self->{document} = $_[1]; |
157 |
|
@{$self->{document}->child_nodes} = (); |
158 |
|
|
159 |
## NOTE: |set_inner_html| copies most of this method's code |
## NOTE: |set_inner_html| copies most of this method's code |
160 |
|
|
161 |
|
$self->{confident} = 1 unless exists $self->{confident}; |
162 |
|
|
163 |
my $i = 0; |
my $i = 0; |
164 |
my $line = 1; |
my $line = 1; |
165 |
my $column = 0; |
my $column = 0; |
216 |
$self->{parse_error} = sub { |
$self->{parse_error} = sub { |
217 |
# |
# |
218 |
}; |
}; |
219 |
|
$self->{change_encoding} = sub { |
220 |
|
# if ($_[0] is a supported encoding) { |
221 |
|
# run "change the encoding" algorithm; |
222 |
|
# throw Whatpm::HTML::RestartParser (charset => $new_encoding); |
223 |
|
# } |
224 |
|
}; |
225 |
$self->{application_cache_selection} = sub { |
$self->{application_cache_selection} = sub { |
226 |
# |
# |
227 |
}; |
}; |
2860 |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
2861 |
|
|
2862 |
unless ($self->{confident}) { |
unless ($self->{confident}) { |
|
my $charset; |
|
2863 |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
2864 |
$charset = $token->{attributes}->{charset}->{value}; |
$self->{change_encoding} |
2865 |
} |
->($self, $token->{attributes}->{charset}->{value}); |
2866 |
if ($token->{attributes}->{'http-equiv'}) { |
} elsif ($token->{attributes}->{content}) { |
2867 |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
2868 |
if ($token->{attributes}->{'http-equiv'}->{value} |
if ($token->{attributes}->{content}->{value} |
2869 |
=~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*= |
=~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*= |
2870 |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
2871 |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
2872 |
$charset = defined $1 ? $1 : defined $2 ? $2 : $3; |
$self->{change_encoding} |
2873 |
} ## TODO: And if supported |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3); |
2874 |
|
} |
2875 |
} |
} |
|
## TODO: Change the encoding |
|
2876 |
} |
} |
2877 |
|
|
|
## TODO: Extracting |charset| from |meta|. |
|
2878 |
pop @{$self->{open_elements}} |
pop @{$self->{open_elements}} |
2879 |
if $self->{insertion_mode} == AFTER_HEAD_IM; |
if $self->{insertion_mode} == AFTER_HEAD_IM; |
2880 |
!!!next-token; |
!!!next-token; |
4448 |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
4449 |
|
|
4450 |
unless ($self->{confident}) { |
unless ($self->{confident}) { |
|
my $charset; |
|
4451 |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
4452 |
$charset = $token->{attributes}->{charset}->{value}; |
$self->{change_encoding} |
4453 |
} |
->($self, $token->{attributes}->{charset}->{value}); |
4454 |
if ($token->{attributes}->{'http-equiv'}) { |
} elsif ($token->{attributes}->{content}) { |
4455 |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
4456 |
if ($token->{attributes}->{'http-equiv'}->{value} |
if ($token->{attributes}->{content}->{value} |
4457 |
=~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*= |
=~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*= |
4458 |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
4459 |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
4460 |
$charset = defined $1 ? $1 : defined $2 ? $2 : $3; |
$self->{change_encoding} |
4461 |
} ## TODO: And if supported |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3); |
4462 |
|
} |
4463 |
} |
} |
|
## TODO: Change the encoding |
|
4464 |
} |
} |
4465 |
|
|
4466 |
!!!next-token; |
!!!next-token; |
5286 |
my $s = \$_[0]; |
my $s = \$_[0]; |
5287 |
my $onerror = $_[1]; |
my $onerror = $_[1]; |
5288 |
|
|
5289 |
|
## ISSUE: Should {confident} be true? |
5290 |
|
|
5291 |
my $nt = $node->node_type; |
my $nt = $node->node_type; |
5292 |
if ($nt == 9) { |
if ($nt == 9) { |
5293 |
# MUST |
# MUST |
5440 |
|
|
5441 |
} # tree construction stage |
} # tree construction stage |
5442 |
|
|
5443 |
|
package Whatpm::HTML::RestartParser; |
5444 |
|
push our @ISA, 'Error'; |
5445 |
|
|
5446 |
1; |
1; |
5447 |
# $Date$ |
# $Date$ |