| 11 |
## TODO: 1252 parse error (revision 1264) |
## TODO: 1252 parse error (revision 1264) |
| 12 |
## TODO: 8859-11 = 874 (revision 1271) |
## TODO: 8859-11 = 874 (revision 1271) |
| 13 |
|
|
| 14 |
|
require IO::Handle; |
| 15 |
|
|
| 16 |
my $HTML_NS = q<http://www.w3.org/1999/xhtml>; |
my $HTML_NS = q<http://www.w3.org/1999/xhtml>; |
| 17 |
my $MML_NS = q<http://www.w3.org/1998/Math/MathML>; |
my $MML_NS = q<http://www.w3.org/1998/Math/MathML>; |
| 18 |
my $SVG_NS = q<http://www.w3.org/2000/svg>; |
my $SVG_NS = q<http://www.w3.org/2000/svg>; |
| 334 |
}; # $c1_entity_char |
}; # $c1_entity_char |
| 335 |
|
|
| 336 |
sub parse_byte_string ($$$$;$) { |
## parse_byte_string: convenience wrapper around parse_byte_stream.
## Shifts the invocant and the charset name off the argument list, opens
## a read-only in-memory filehandle on the next argument (used as-is when
## it is already a reference, otherwise a reference to it is taken), and
## returns whatever parse_byte_stream returns when given the charset
## name, that handle, and all remaining arguments unchanged.
## NOTE(review): the return value of open is not checked here.
sub parse_byte_string ($$$$;$) { |
| 337 |

my $self = shift; |
| 338 |

my $charset_name = shift; |
| 339 |

open my $input, '<', ref $_[0] ? $_[0] : \($_[0]); |
| 340 |

return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]); |
| 341 |

} # parse_byte_string |
| 342 |
|
|
| 343 |
|
sub parse_byte_stream ($$$$;$) { |
| 344 |
my $self = ref $_[0] ? shift : shift->new; |
my $self = ref $_[0] ? shift : shift->new; |
| 345 |
my $charset_name = shift; |
my $charset_name = shift; |
| 346 |
open my $byte_stream, '<', ref $_[0] ? $_[0] : \($_[0]); |
my $byte_stream = $_[0]; |
| 347 |
|
|
| 348 |
my $onerror = $_[2] || sub { |
my $onerror = $_[2] || sub { |
| 349 |
my (%opt) = @_; |
my (%opt) = @_; |
| 522 |
|
|
| 523 |
my $char_onerror = sub { |
my $char_onerror = sub { |
| 524 |
my (undef, $type, %opt) = @_; |
my (undef, $type, %opt) = @_; |
| 525 |
!!!parse-error (%opt, type => $type); |
!!!parse-error (%opt, type => $type, |
| 526 |
|
line => $self->{line}, column => $self->{column} + 1); |
| 527 |
if ($opt{octets}) { |
if ($opt{octets}) { |
| 528 |
${$opt{octets}} = "\x{FFFD}"; # replacement character |
${$opt{octets}} = "\x{FFFD}"; # replacement character |
| 529 |
} |
} |
| 555 |
$return = $self->parse_char_stream ($char_stream, @args); |
$return = $self->parse_char_stream ($char_stream, @args); |
| 556 |
}; |
}; |
| 557 |
return $return; |
return $return; |
| 558 |
} # parse_byte_string |
} # parse_byte_stream |
| 559 |
|
|
| 560 |
## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM |
## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM |
| 561 |
## and the HTML layer MUST ignore it. However, we do strip BOM in |
## and the HTML layer MUST ignore it. However, we do strip BOM in |
| 568 |
|
|
| 569 |
## parse_char_string: convenience wrapper around parse_char_stream.
## Shifts the invocant, wraps the next argument (a string, or a reference
## to one) in a read-only in-memory filehandle, and returns the result of
## parse_char_stream called with that handle and the remaining arguments.
## This listing interleaves two revisions of the body: one opens the
## handle with the ':utf8' layer unconditionally; the other loads utf8
## and adds ':utf8' only when utf8::is_utf8 is true for the string.
## NOTE(review): the return value of open is not checked in either form.
sub parse_char_string ($$$;$) { |
sub parse_char_string ($$$;$) { |
| 570 |
my $self = shift; |
my $self = shift; |
| 571 |
open my $input, '<:utf8', ref $_[0] ? $_[0] : \($_[0]); |
require utf8; |
| 572 |

my $s = ref $_[0] ? $_[0] : \($_[0]); |
| 573 |

open my $input, '<' . (utf8::is_utf8 ($$s) ? ':utf8' : ''), $s; |
| 574 |
return $self->parse_char_stream ($input, @_[1..$#_]); |
return $self->parse_char_stream ($input, @_[1..$#_]); |
| 575 |
} # parse_char_string |
} # parse_char_string |
| 576 |
*parse_string = \&parse_char_string; |
*parse_string = \&parse_char_string; |
| 596 |
pop @{$self->{prev_char}}; |
pop @{$self->{prev_char}}; |
| 597 |
unshift @{$self->{prev_char}}, $self->{next_char}; |
unshift @{$self->{prev_char}}, $self->{next_char}; |
| 598 |
|
|
| 599 |
my $char = $input->getc; |
my $char; |
| 600 |
|
if (defined $self->{next_next_char}) { |
| 601 |
|
$char = $self->{next_next_char}; |
| 602 |
|
delete $self->{next_next_char}; |
| 603 |
|
} else { |
| 604 |
|
$char = $input->getc; |
| 605 |
|
} |
| 606 |
$self->{next_char} = -1 and return unless defined $char; |
$self->{next_char} = -1 and return unless defined $char; |
| 607 |
$self->{next_char} = ord $char; |
$self->{next_char} = ord $char; |
| 608 |
|
|
| 617 |
} elsif ($self->{next_char} == 0x000D) { # CR |
} elsif ($self->{next_char} == 0x000D) { # CR |
| 618 |
!!!cp ('j2'); |
!!!cp ('j2'); |
| 619 |
my $next = $input->getc; |
my $next = $input->getc; |
| 620 |
if ($next ne "\x0A") { |
if (defined $next and $next ne "\x0A") { |
| 621 |
$input->ungetc ($next); |
$self->{next_next_char} = $next; |
| 622 |
} |
} |
| 623 |
$self->{next_char} = 0x000A; # LF # MUST |
$self->{next_char} = 0x000A; # LF # MUST |
| 624 |
$self->{line}++; |
$self->{line}++; |
| 4154 |
!!!next-token; |
!!!next-token; |
| 4155 |
next B; |
next B; |
| 4156 |
} elsif ($self->{insertion_mode} == AFTER_HEAD_IM) { |
} elsif ($self->{insertion_mode} == AFTER_HEAD_IM) { |
| 4157 |
!!!cp ('t94'); |
!!!cp ('t93.2'); |
| 4158 |
# |
!!!parse-error (type => 'after head:head', token => $token); ## TODO: error type |
| 4159 |
|
## Ignore the token |
| 4160 |
|
!!!nack ('t93.3'); |
| 4161 |
|
!!!next-token; |
| 4162 |
|
next B; |
| 4163 |
} else { |
} else { |
| 4164 |
!!!cp ('t95'); |
!!!cp ('t95'); |
| 4165 |
!!!parse-error (type => 'in head:head', token => $token); # or in head noscript |
!!!parse-error (type => 'in head:head', token => $token); # or in head noscript |
| 4481 |
$self->{insertion_mode} = AFTER_HEAD_IM; |
$self->{insertion_mode} = AFTER_HEAD_IM; |
| 4482 |
!!!next-token; |
!!!next-token; |
| 4483 |
next B; |
next B; |
| 4484 |
|
} elsif ($self->{insertion_mode} == AFTER_HEAD_IM) { |
| 4485 |
|
!!!cp ('t134.1'); |
| 4486 |
|
!!!parse-error (type => 'unmatched end tag:head', token => $token); |
| 4487 |
|
## Ignore the token |
| 4488 |
|
!!!next-token; |
| 4489 |
|
next B; |
| 4490 |
} else { |
} else { |
| 4491 |
!!!cp ('t135'); |
die "$0: $self->{insertion_mode}: Unknown insertion mode"; |
|
# |
|
| 4492 |
} |
} |
| 4493 |
} elsif ($token->{tag_name} eq 'noscript') { |
} elsif ($token->{tag_name} eq 'noscript') { |
| 4494 |
if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { |
if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { |
| 4497 |
$self->{insertion_mode} = IN_HEAD_IM; |
$self->{insertion_mode} = IN_HEAD_IM; |
| 4498 |
!!!next-token; |
!!!next-token; |
| 4499 |
next B; |
next B; |
| 4500 |
} elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) { |
} elsif ($self->{insertion_mode} == BEFORE_HEAD_IM or |
| 4501 |
|
$self->{insertion_mode} == AFTER_HEAD_IM) { |
| 4502 |
!!!cp ('t137'); |
!!!cp ('t137'); |
| 4503 |
!!!parse-error (type => 'unmatched end tag:noscript', token => $token); |
!!!parse-error (type => 'unmatched end tag:noscript', token => $token); |
| 4504 |
## Ignore the token ## ISSUE: An issue in the spec. |
## Ignore the token ## ISSUE: An issue in the spec. |
| 4511 |
} elsif ({ |
} elsif ({ |
| 4512 |
body => 1, html => 1, |
body => 1, html => 1, |
| 4513 |
}->{$token->{tag_name}}) { |
}->{$token->{tag_name}}) { |
| 4514 |
if ($self->{insertion_mode} == BEFORE_HEAD_IM) { |
if ($self->{insertion_mode} == BEFORE_HEAD_IM or |
| 4515 |
!!!cp ('t139'); |
$self->{insertion_mode} == IN_HEAD_IM or |
| 4516 |
## As if <head> |
$self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { |
|
!!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token); |
|
|
$self->{open_elements}->[-1]->[0]->append_child ($self->{head_element}); |
|
|
push @{$self->{open_elements}}, |
|
|
[$self->{head_element}, $el_category->{head}]; |
|
|
|
|
|
$self->{insertion_mode} = IN_HEAD_IM; |
|
|
## Reprocess in the "in head" insertion mode... |
|
|
} elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { |
|
| 4517 |
!!!cp ('t140'); |
!!!cp ('t140'); |
| 4518 |
!!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token); |
!!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token); |
| 4519 |
## Ignore the token |
## Ignore the token |
| 4520 |
!!!next-token; |
!!!next-token; |
| 4521 |
next B; |
next B; |
| 4522 |
|
} elsif ($self->{insertion_mode} == AFTER_HEAD_IM) { |
| 4523 |
|
!!!cp ('t140.1'); |
| 4524 |
|
!!!parse-error (type => 'unmatched end tag:' . $token->{tag_name}, token => $token); |
| 4525 |
|
## Ignore the token |
| 4526 |
|
!!!next-token; |
| 4527 |
|
next B; |
| 4528 |
} else { |
} else { |
| 4529 |
!!!cp ('t141'); |
die "$0: $self->{insertion_mode}: Unknown insertion mode"; |
| 4530 |
} |
} |
| 4531 |
|
} elsif ($token->{tag_name} eq 'p') { |
| 4532 |
# |
!!!cp ('t142'); |
| 4533 |
} elsif ({ |
!!!parse-error (type => 'unmatched end tag:p', token => $token); |
| 4534 |
p => 1, br => 1, |
## Ignore the token |
| 4535 |
}->{$token->{tag_name}}) { |
!!!next-token; |
| 4536 |
|
next B; |
| 4537 |
|
} elsif ($token->{tag_name} eq 'br') { |
| 4538 |
if ($self->{insertion_mode} == BEFORE_HEAD_IM) { |
if ($self->{insertion_mode} == BEFORE_HEAD_IM) { |
| 4539 |
!!!cp ('t142'); |
!!!cp ('t142.2'); |
| 4540 |
## As if <head> |
## (before head) as if <head>, (in head) as if </head> |
| 4541 |
!!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token); |
!!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token); |
| 4542 |
$self->{open_elements}->[-1]->[0]->append_child ($self->{head_element}); |
$self->{open_elements}->[-1]->[0]->append_child ($self->{head_element}); |
| 4543 |
push @{$self->{open_elements}}, |
$self->{insertion_mode} = AFTER_HEAD_IM; |
| 4544 |
[$self->{head_element}, $el_category->{head}]; |
|
| 4545 |
|
## Reprocess in the "after head" insertion mode... |
| 4546 |
|
} elsif ($self->{insertion_mode} == IN_HEAD_IM) { |
| 4547 |
|
!!!cp ('t143.2'); |
| 4548 |
|
## As if </head> |
| 4549 |
|
pop @{$self->{open_elements}}; |
| 4550 |
|
$self->{insertion_mode} = AFTER_HEAD_IM; |
| 4551 |
|
|
| 4552 |
|
## Reprocess in the "after head" insertion mode... |
| 4553 |
|
} elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { |
| 4554 |
|
!!!cp ('t143.3'); |
| 4555 |
|
## ISSUE: Two parse errors for <head><noscript></br> |
| 4556 |
|
!!!parse-error (type => 'unmatched end tag:br', token => $token); |
| 4557 |
|
## As if </noscript> |
| 4558 |
|
pop @{$self->{open_elements}}; |
| 4559 |
$self->{insertion_mode} = IN_HEAD_IM; |
$self->{insertion_mode} = IN_HEAD_IM; |
| 4560 |
|
|
| 4561 |
## Reprocess in the "in head" insertion mode... |
## Reprocess in the "in head" insertion mode... |
| 4562 |
} else { |
## As if </head> |
| 4563 |
!!!cp ('t143'); |
pop @{$self->{open_elements}}; |
| 4564 |
} |
$self->{insertion_mode} = AFTER_HEAD_IM; |
| 4565 |
|
|
| 4566 |
# |
## Reprocess in the "after head" insertion mode... |
| 4567 |
} else { |
} elsif ($self->{insertion_mode} == AFTER_HEAD_IM) { |
| 4568 |
if ($self->{insertion_mode} == AFTER_HEAD_IM) { |
!!!cp ('t143.4'); |
|
!!!cp ('t144'); |
|
| 4569 |
# |
# |
| 4570 |
} else { |
} else { |
| 4571 |
!!!cp ('t145'); |
die "$0: $self->{insertion_mode}: Unknown insertion mode"; |
|
!!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token); |
|
|
## Ignore the token |
|
|
!!!next-token; |
|
|
next B; |
|
| 4572 |
} |
} |
| 4573 |
|
|
| 4574 |
|
## ISSUE: does not agree with IE7 - it doesn't ignore </br>. |
| 4575 |
|
!!!parse-error (type => 'unmatched end tag:br', token => $token); |
| 4576 |
|
## Ignore the token |
| 4577 |
|
!!!next-token; |
| 4578 |
|
next B; |
| 4579 |
|
} else { |
| 4580 |
|
!!!cp ('t145'); |
| 4581 |
|
!!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token); |
| 4582 |
|
## Ignore the token |
| 4583 |
|
!!!next-token; |
| 4584 |
|
next B; |
| 4585 |
} |
} |
| 4586 |
|
|
| 4587 |
if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { |
if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { |