| 2 |
use strict; |
use strict; |
| 3 |
our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
| 4 |
|
|
| 5 |
## This is a very, very early version of an HTML parser. |
## This is an early version of an HTML parser. |
| 6 |
|
|
| 7 |
my $permitted_slash_tag_name = { |
my $permitted_slash_tag_name = { |
| 8 |
base => 1, |
base => 1, |
| 302 |
}; |
}; |
| 303 |
# $phrasing_category: all other elements |
# $phrasing_category: all other elements |
| 304 |
|
|
| 305 |
|
sub parse_string ($$$;$) { |
| 306 |
|
my $self = shift->new; |
| 307 |
|
my $s = \$_[0]; |
| 308 |
|
$self->{document} = $_[1]; |
| 309 |
|
|
| 310 |
|
my $i; |
| 311 |
|
my $i = 0; |
| 312 |
|
$self->{set_next_input_character} = sub { |
| 313 |
|
my $self = shift; |
| 314 |
|
$self->{next_input_character} = -1 and return if $i >= length $$s; |
| 315 |
|
$self->{next_input_character} = ord substr $$s, $i++, 1; |
| 316 |
|
|
| 317 |
|
if ($self->{next_input_character} == 0x000D) { # CR |
| 318 |
|
if ($i >= length $$s) { |
| 319 |
|
# |
| 320 |
|
} else { |
| 321 |
|
my $next_char = ord substr $$s, $i++, 1; |
| 322 |
|
if ($next_char == 0x000A) { # LF |
| 323 |
|
# |
| 324 |
|
} else { |
| 325 |
|
push @{$self->{char}}, $next_char; |
| 326 |
|
} |
| 327 |
|
} |
| 328 |
|
$self->{next_input_character} = 0x000A; # LF # MUST |
| 329 |
|
} elsif ($self->{next_input_character} > 0x10FFFF) { |
| 330 |
|
$self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
| 331 |
|
} elsif ($self->{next_input_character} == 0x0000) { # NULL |
| 332 |
|
$self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
| 333 |
|
} |
| 334 |
|
}; |
| 335 |
|
|
| 336 |
|
$self->{parse_error} = $_[2] || sub { |
| 337 |
|
warn "Parse error at character $i\n"; ## TODO: Report (line, column) pair |
| 338 |
|
}; |
| 339 |
|
|
| 340 |
|
$self->_initialize_tokenizer; |
| 341 |
|
$self->_initialize_tree_constructor; |
| 342 |
|
$self->_construct_tree; |
| 343 |
|
$self->_terminate_tree_constructor; |
| 344 |
|
|
| 345 |
|
return $self->{document}; |
| 346 |
|
} # parse_string |
| 347 |
|
|
| 348 |
sub new ($) { |
sub new ($) { |
| 349 |
my $class = shift; |
my $class = shift; |
| 350 |
my $self = bless {}, $class; |
my $self = bless {}, $class; |
| 2124 |
|
|
| 2125 |
sub _initialize_tree_constructor ($) { |
sub _initialize_tree_constructor ($) { |
| 2126 |
my $self = shift; |
my $self = shift; |
| 2127 |
require What::NanoDOM; |
## NOTE: $self->{document} MUST be specified before this method is called |
|
$self->{document} = What::NanoDOM::Document->new; |
|
| 2128 |
$self->{document}->strict_error_checking (0); |
$self->{document}->strict_error_checking (0); |
| 2129 |
## TODO: Turn mutation events off # MUST |
## TODO: Turn mutation events off # MUST |
| 2130 |
## TODO: Turn loose Document option (manakai extension) on |
## TODO: Turn loose Document option (manakai extension) on |
| 2131 |
|
## TODO: Mark the Document as an HTML document # MUST |
| 2132 |
} # _initialize_tree_constructor |
} # _initialize_tree_constructor |
| 2133 |
|
|
| 2134 |
sub _terminate_tree_constructor ($) { |
sub _terminate_tree_constructor ($) { |
| 2608 |
} elsif ({ |
} elsif ({ |
| 2609 |
base => 1, link => 1, meta => 1, |
base => 1, link => 1, meta => 1, |
| 2610 |
}->{$token->{tag_name}}) { |
}->{$token->{tag_name}}) { |
| 2611 |
$self->{parse_error}->(); |
$self->{parse_error}-> ($token->{tag_name}.' in body'); |
| 2612 |
## NOTE: This is an "as if in head" code clone |
## NOTE: This is an "as if in head" code clone |
| 2613 |
my $el; |
my $el; |
| 2614 |
|
|
| 2626 |
$insert->($el); |
$insert->($el); |
| 2627 |
} |
} |
| 2628 |
|
|
|
## ISSUE: Issue on magical <base> in the spec |
|
|
|
|
| 2629 |
$token = $self->_get_next_token; |
$token = $self->_get_next_token; |
| 2630 |
return; |
return; |
| 2631 |
} elsif ($token->{tag_name} eq 'title') { |
} elsif ($token->{tag_name} eq 'title') { |
| 2632 |
|
$self->{parse_error}-> ('title in body'); |
| 2633 |
## NOTE: There is an "as if in head" code clone |
## NOTE: There is an "as if in head" code clone |
| 2634 |
my $title_el; |
my $title_el; |
| 2635 |
|
|
| 3875 |
|
|
| 3876 |
(defined $head_element ? $head_element : $open_elements->[-1]->[0]) |
(defined $head_element ? $head_element : $open_elements->[-1]->[0]) |
| 3877 |
->append_child ($el); |
->append_child ($el); |
|
|
|
|
## ISSUE: Issue on magical <base> in the spec |
|
| 3878 |
|
|
| 3879 |
$token = $self->_get_next_token; |
$token = $self->_get_next_token; |
| 3880 |
redo B; |
redo B; |
| 5542 |
# |
# |
| 5543 |
} |
} |
| 5544 |
|
|
| 5545 |
$self->{parse_error}->(); |
$self->{parse_error}-> ('data after body'); |
| 5546 |
$insertion_mode = 'in body'; |
$insertion_mode = 'in body'; |
| 5547 |
## reprocess |
## reprocess |
| 5548 |
redo B; |
redo B; |
| 5738 |
## TODO: script stuffs |
## TODO: script stuffs |
| 5739 |
} # _construct_tree |
} # _construct_tree |
| 5740 |
|
|
| 5741 |
sub inner_html ($$$) { |
sub get_inner_html ($$$) { |
| 5742 |
my ($class, $node, $on_error) = @_; |
my ($class, $node, $on_error) = @_; |
| 5743 |
|
|
| 5744 |
## Step 1 |
## Step 1 |
| 5826 |
} elsif ($nt == 5) { # entrefs |
} elsif ($nt == 5) { # entrefs |
| 5827 |
push @node, @{$child->child_nodes}; |
push @node, @{$child->child_nodes}; |
| 5828 |
} else { |
} else { |
| 5829 |
$on_error->($child); |
$on_error->($child) if defined $on_error; |
| 5830 |
} |
} |
| 5831 |
|
## ISSUE: This code does not support PIs. |
| 5832 |
} # C |
} # C |
| 5833 |
|
|
| 5834 |
## Step 3 |
## Step 3 |
| 5835 |
return \$s; |
return \$s; |
| 5836 |
} # inner_html |
} # get_inner_html |
| 5837 |
|
|
| 5838 |
1; |
1; |
| 5839 |
# $Date$ |
# $Date$ |