| 8 |
## doc.write (''); |
## doc.write (''); |
| 9 |
## alert (doc.compatMode); |
## alert (doc.compatMode); |
| 10 |
|
|
| 11 |
## TODO: 1252 parse error (revision 1264) |
require IO::Handle; |
|
## TODO: 8859-11 = 874 (revision 1271) |
|
| 12 |
|
|
| 13 |
my $HTML_NS = q<http://www.w3.org/1999/xhtml>; |
my $HTML_NS = q<http://www.w3.org/1999/xhtml>; |
| 14 |
my $MML_NS = q<http://www.w3.org/1998/Math/MathML>; |
my $MML_NS = q<http://www.w3.org/1998/Math/MathML>; |
| 331 |
}; # $c1_entity_char |
}; # $c1_entity_char |
| 332 |
|
|
| 333 |
sub parse_byte_string ($$$$;$) { |
sub parse_byte_string ($$$$;$) { |
| 334 |
|
my $self = shift; |
| 335 |
|
my $charset_name = shift; |
| 336 |
|
open my $input, '<', ref $_[0] ? $_[0] : \($_[0]); |
| 337 |
|
return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]); |
| 338 |
|
} # parse_byte_string |
| 339 |
|
|
| 340 |
|
sub parse_byte_stream ($$$$;$) { |
| 341 |
my $self = ref $_[0] ? shift : shift->new; |
my $self = ref $_[0] ? shift : shift->new; |
| 342 |
my $charset_name = shift; |
my $charset_name = shift; |
| 343 |
my $bytes_s = ref $_[0] ? $_[0] : \($_[0]); |
my $byte_stream = $_[0]; |
| 344 |
my $s; |
|
| 345 |
|
my $onerror = $_[2] || sub { |
| 346 |
|
my (%opt) = @_; |
| 347 |
|
warn "Parse error ($opt{type})\n"; |
| 348 |
|
}; |
| 349 |
|
$self->{parse_error} = $onerror; # updated later by parse_char_string |
| 350 |
|
|
| 351 |
## HTML5 encoding sniffing algorithm |
## HTML5 encoding sniffing algorithm |
| 352 |
require Message::Charset::Info; |
require Message::Charset::Info; |
| 353 |
my $charset; |
my $charset; |
| 354 |
my ($e, $e_status); |
my $buffer; |
| 355 |
|
my ($char_stream, $e_status); |
| 356 |
|
|
| 357 |
SNIFFING: { |
SNIFFING: { |
| 358 |
|
|
| 361 |
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
| 362 |
|
|
| 363 |
## ISSUE: Unsupported encoding is not ignored according to the spec. |
## ISSUE: Unsupported encoding is not ignored according to the spec. |
| 364 |
($e, $e_status) = $charset->get_perl_encoding |
($char_stream, $e_status) = $charset->get_decode_handle |
| 365 |
(allow_error_reporting => 1, |
($byte_stream, allow_error_reporting => 1, |
| 366 |
allow_fallback => 1); |
allow_fallback => 1); |
| 367 |
if ($e) { |
if ($char_stream) { |
| 368 |
$self->{confident} = 1; |
$self->{confident} = 1; |
| 369 |
last SNIFFING; |
last SNIFFING; |
| 370 |
|
} else { |
| 371 |
|
## TODO: unsupported error |
| 372 |
} |
} |
| 373 |
} |
} |
| 374 |
|
|
| 375 |
## Step 2 |
## Step 2 |
| 376 |
# wait |
my $byte_buffer = ''; |
| 377 |
|
for (1..1024) { |
| 378 |
|
my $char = $byte_stream->getc; |
| 379 |
|
last unless defined $char; |
| 380 |
|
$byte_buffer .= $char; |
| 381 |
|
} ## TODO: timeout |
| 382 |
|
|
| 383 |
## Step 3 |
## Step 3 |
| 384 |
my $head = substr ($$bytes_s, 0, 3); |
if ($byte_buffer =~ /^\xFE\xFF/) { |
|
if ($head =~ /^\xFE\xFF/) { |
|
| 385 |
$charset = Message::Charset::Info->get_by_iana_name ('utf-16be'); |
$charset = Message::Charset::Info->get_by_iana_name ('utf-16be'); |
| 386 |
($e, $e_status) = $charset->get_perl_encoding |
($char_stream, $e_status) = $charset->get_decode_handle |
| 387 |
(allow_error_reporting => 1, |
($byte_stream, allow_error_reporting => 1, |
| 388 |
allow_fallback => 1); |
allow_fallback => 1, byte_buffer => \$byte_buffer); |
| 389 |
$self->{confident} = 1; |
$self->{confident} = 1; |
| 390 |
last SNIFFING; |
last SNIFFING; |
| 391 |
} elsif ($head =~ /^\xFF\xFE/) { |
} elsif ($byte_buffer =~ /^\xFF\xFE/) { |
| 392 |
$charset = Message::Charset::Info->get_by_iana_name ('utf-16le'); |
$charset = Message::Charset::Info->get_by_iana_name ('utf-16le'); |
| 393 |
($e, $e_status) = $charset->get_perl_encoding |
($char_stream, $e_status) = $charset->get_decode_handle |
| 394 |
(allow_error_reporting => 1, |
($byte_stream, allow_error_reporting => 1, |
| 395 |
allow_fallback => 1); |
allow_fallback => 1, byte_buffer => \$byte_buffer); |
| 396 |
$self->{confident} = 1; |
$self->{confident} = 1; |
| 397 |
last SNIFFING; |
last SNIFFING; |
| 398 |
} elsif ($head eq "\xEF\xBB\xBF") { |
} elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) { |
| 399 |
$charset = Message::Charset::Info->get_by_iana_name ('utf-8'); |
$charset = Message::Charset::Info->get_by_iana_name ('utf-8'); |
| 400 |
($e, $e_status) = $charset->get_perl_encoding |
($char_stream, $e_status) = $charset->get_decode_handle |
| 401 |
(allow_error_reporting => 1, |
($byte_stream, allow_error_reporting => 1, |
| 402 |
allow_fallback => 1); |
allow_fallback => 1, byte_buffer => \$byte_buffer); |
| 403 |
$self->{confident} = 1; |
$self->{confident} = 1; |
| 404 |
last SNIFFING; |
last SNIFFING; |
| 405 |
} |
} |
| 413 |
## Step 6 |
## Step 6 |
| 414 |
require Whatpm::Charset::UniversalCharDet; |
require Whatpm::Charset::UniversalCharDet; |
| 415 |
$charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string |
$charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string |
| 416 |
(substr ($$bytes_s, 0, 1024)); |
($byte_buffer); |
| 417 |
if (defined $charset_name) { |
if (defined $charset_name) { |
| 418 |
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
| 419 |
|
|
| 420 |
## ISSUE: Unsupported encoding is not ignored according to the spec. |
## ISSUE: Unsupported encoding is not ignored according to the spec. |
| 421 |
($e, $e_status) = $charset->get_perl_encoding |
require Whatpm::Charset::DecodeHandle; |
| 422 |
(allow_error_reporting => 1, |
$buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new |
| 423 |
allow_fallback => 1); |
($byte_stream); |
| 424 |
if ($e) { |
($char_stream, $e_status) = $charset->get_decode_handle |
| 425 |
|
($buffer, allow_error_reporting => 1, |
| 426 |
|
allow_fallback => 1, byte_buffer => \$byte_buffer); |
| 427 |
|
if ($char_stream) { |
| 428 |
|
$buffer->{buffer} = $byte_buffer; |
| 429 |
|
!!!parse-error (type => 'sniffing:chardet', ## TODO: type name |
| 430 |
|
value => $charset_name, |
| 431 |
|
level => $self->{info_level}, |
| 432 |
|
line => 1, column => 1); |
| 433 |
$self->{confident} = 0; |
$self->{confident} = 0; |
| 434 |
last SNIFFING; |
last SNIFFING; |
| 435 |
} |
} |
| 440 |
$charset = Message::Charset::Info->get_by_iana_name ('windows-1252'); |
$charset = Message::Charset::Info->get_by_iana_name ('windows-1252'); |
| 441 |
## NOTE: We choose |windows-1252| here, since |utf-8| should be |
## NOTE: We choose |windows-1252| here, since |utf-8| should be |
| 442 |
## detectable in the step 6. |
## detectable in the step 6. |
| 443 |
($e, $e_status) = $charset->get_perl_encoding (allow_error_reporting => 1, |
require Whatpm::Charset::DecodeHandle; |
| 444 |
allow_fallback => 1); |
$buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new |
| 445 |
|
($byte_stream); |
| 446 |
|
($char_stream, $e_status) |
| 447 |
|
= $charset->get_decode_handle ($buffer, |
| 448 |
|
allow_error_reporting => 1, |
| 449 |
|
allow_fallback => 1, |
| 450 |
|
byte_buffer => \$byte_buffer); |
| 451 |
|
$buffer->{buffer} = $byte_buffer; |
| 452 |
|
!!!parse-error (type => 'sniffing:default', ## TODO: type name |
| 453 |
|
value => 'windows-1252', |
| 454 |
|
level => $self->{info_level}, |
| 455 |
|
line => 1, column => 1); |
| 456 |
$self->{confident} = 0; |
$self->{confident} = 0; |
| 457 |
} # SNIFFING |
} # SNIFFING |
| 458 |
|
|
| 459 |
|
$self->{input_encoding} = $charset->get_iana_name; |
| 460 |
if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { |
if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { |
| 461 |
|
!!!parse-error (type => 'chardecode:fallback', ## TODO: type name |
| 462 |
|
value => $self->{input_encoding}, |
| 463 |
|
level => $self->{unsupported_level}, |
| 464 |
|
line => 1, column => 1); |
| 465 |
} elsif (not ($e_status & |
} elsif (not ($e_status & |
| 466 |
Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) { |
Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) { |
| 467 |
|
!!!parse-error (type => 'chardecode:no error', ## TODO: type name |
| 468 |
|
value => $self->{input_encoding}, |
| 469 |
|
level => $self->{unsupported_level}, |
| 470 |
|
line => 1, column => 1); |
| 471 |
} |
} |
|
$s = \ $e->decode ($$bytes_s); |
|
|
$self->{input_encoding} = $charset->get_iana_name; |
|
| 472 |
|
|
| 473 |
$self->{change_encoding} = sub { |
$self->{change_encoding} = sub { |
| 474 |
my $self = shift; |
my $self = shift; |
| 475 |
my $charset_name = lc shift; |
$charset_name = shift; |
| 476 |
my $token = shift; |
my $token = shift; |
|
## TODO: if $charset_name is supported |
|
|
## TODO: normalize charset name |
|
| 477 |
|
|
| 478 |
## "Change the encoding" algorithm: |
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
| 479 |
|
($char_stream, $e_status) = $charset->get_decode_handle |
| 480 |
|
($byte_stream, allow_error_reporting => 1, allow_fallback => 1, |
| 481 |
|
byte_buffer => \ $buffer->{buffer}); |
| 482 |
|
|
| 483 |
|
if ($char_stream) { # if supported |
| 484 |
|
## "Change the encoding" algorithm: |
| 485 |
|
|
| 486 |
## Step 1 |
## Step 1 |
| 487 |
if ($charset_name eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8? |
if ($charset->{iana_names}->{'utf-16'}) { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8? |
| 488 |
$charset_name = 'utf-8'; |
$charset = Message::Charset::Info->get_by_iana_name ('utf-8'); |
| 489 |
} |
($char_stream, $e_status) = $charset->get_decode_handle |
| 490 |
|
($byte_stream, |
| 491 |
|
byte_buffer => \ $buffer->{buffer}); |
| 492 |
|
} |
| 493 |
|
$charset_name = $charset->get_iana_name; |
| 494 |
|
|
| 495 |
|
## Step 2 |
| 496 |
|
if (defined $self->{input_encoding} and |
| 497 |
|
$self->{input_encoding} eq $charset_name) { |
| 498 |
|
!!!parse-error (type => 'charset label:matching', ## TODO: type |
| 499 |
|
value => $charset_name, |
| 500 |
|
level => $self->{info_level}); |
| 501 |
|
$self->{confident} = 1; |
| 502 |
|
return; |
| 503 |
|
} |
| 504 |
|
|
| 505 |
## Step 2 |
!!!parse-error (type => 'charset label detected:'.$self->{input_encoding}. |
| 506 |
if (defined $self->{input_encoding} and |
':'.$charset_name, level => 'w', token => $token); |
| 507 |
$self->{input_encoding} eq $charset_name) { |
|
| 508 |
$self->{confident} = 1; |
## Step 3 |
| 509 |
return; |
# if (can) { |
| 510 |
|
## change the encoding on the fly. |
| 511 |
|
#$self->{confident} = 1; |
| 512 |
|
#return; |
| 513 |
|
# } |
| 514 |
|
|
| 515 |
|
## Step 4 |
| 516 |
|
throw Whatpm::HTML::RestartParser (); |
| 517 |
} |
} |
|
|
|
|
!!!parse-error (type => 'charset label detected:'.$self->{input_encoding}. |
|
|
':'.$charset_name, level => 'w', token => $token); |
|
|
|
|
|
## Step 3 |
|
|
# if (can) { |
|
|
## change the encoding on the fly. |
|
|
#$self->{confident} = 1; |
|
|
#return; |
|
|
# } |
|
|
|
|
|
## Step 4 |
|
|
throw Whatpm::HTML::RestartParser (charset => $charset_name); |
|
| 518 |
}; # $self->{change_encoding} |
}; # $self->{change_encoding} |
| 519 |
|
|
| 520 |
|
my $char_onerror = sub { |
| 521 |
|
my (undef, $type, %opt) = @_; |
| 522 |
|
!!!parse-error (%opt, type => $type, |
| 523 |
|
line => $self->{line}, column => $self->{column} + 1); |
| 524 |
|
if ($opt{octets}) { |
| 525 |
|
${$opt{octets}} = "\x{FFFD}"; # replacement character |
| 526 |
|
} |
| 527 |
|
}; |
| 528 |
|
$char_stream->onerror ($char_onerror); |
| 529 |
|
|
| 530 |
my @args = @_; shift @args; # $s |
my @args = @_; shift @args; # $s |
| 531 |
my $return; |
my $return; |
| 532 |
try { |
try { |
| 533 |
$return = $self->parse_char_string ($s, @args); |
$return = $self->parse_char_stream ($char_stream, @args); |
| 534 |
} catch Whatpm::HTML::RestartParser with { |
} catch Whatpm::HTML::RestartParser with { |
| 535 |
my $charset_name = shift->{charset}; |
## NOTE: Invoked after {change_encoding}. |
| 536 |
$s = \ (Encode::decode ($charset_name, $$bytes_s)); |
|
| 537 |
$self->{input_encoding} = $charset_name; ## TODO: normalize |
$self->{input_encoding} = $charset->get_iana_name; |
| 538 |
|
if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { |
| 539 |
|
!!!parse-error (type => 'chardecode:fallback', ## TODO: type name |
| 540 |
|
value => $self->{input_encoding}, |
| 541 |
|
level => $self->{unsupported_level}, |
| 542 |
|
line => 1, column => 1); |
| 543 |
|
} elsif (not ($e_status & |
| 544 |
|
Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) { |
| 545 |
|
!!!parse-error (type => 'chardecode:no error', ## TODO: type name |
| 546 |
|
value => $self->{input_encoding}, |
| 547 |
|
level => $self->{unsupported_level}, |
| 548 |
|
line => 1, column => 1); |
| 549 |
|
} |
| 550 |
$self->{confident} = 1; |
$self->{confident} = 1; |
| 551 |
$return = $self->parse_char_string ($s, @args); |
$char_stream->onerror ($char_onerror); |
| 552 |
|
$return = $self->parse_char_stream ($char_stream, @args); |
| 553 |
}; |
}; |
| 554 |
return $return; |
return $return; |
| 555 |
} # parse_byte_string |
} # parse_byte_stream |
| 556 |
|
|
| 557 |
## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM |
## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM |
| 558 |
## and the HTML layer MUST ignore it. However, we do strip the BOM in |
## and the HTML layer MUST ignore it. However, we do strip the BOM in |
| 563 |
## such as |parse_byte_string| in this module, must ensure that it does |
## such as |parse_byte_string| in this module, must ensure that it does |
| 564 |
## strip the BOM and never strip any ZWNBSP. |
## strip the BOM and never strip any ZWNBSP. |
| 565 |
|
|
| 566 |
*parse_char_string = \&parse_string; |
sub parse_char_string ($$$;$) { |
| 567 |
|
my $self = shift; |
| 568 |
|
require utf8; |
| 569 |
|
my $s = ref $_[0] ? $_[0] : \($_[0]); |
| 570 |
|
open my $input, '<' . (utf8::is_utf8 ($$s) ? ':utf8' : ''), $s; |
| 571 |
|
return $self->parse_char_stream ($input, @_[1..$#_]); |
| 572 |
|
} # parse_char_string |
| 573 |
|
*parse_string = \&parse_char_string; |
| 574 |
|
|
| 575 |
sub parse_string ($$$;$) { |
sub parse_char_stream ($$$;$) { |
| 576 |
my $self = ref $_[0] ? shift : shift->new; |
my $self = ref $_[0] ? shift : shift->new; |
| 577 |
my $s = ref $_[0] ? $_[0] : \($_[0]); |
my $input = $_[0]; |
| 578 |
$self->{document} = $_[1]; |
$self->{document} = $_[1]; |
| 579 |
@{$self->{document}->child_nodes} = (); |
@{$self->{document}->child_nodes} = (); |
| 580 |
|
|
| 593 |
pop @{$self->{prev_char}}; |
pop @{$self->{prev_char}}; |
| 594 |
unshift @{$self->{prev_char}}, $self->{next_char}; |
unshift @{$self->{prev_char}}, $self->{next_char}; |
| 595 |
|
|
| 596 |
$self->{next_char} = -1 and return if $i >= length $$s; |
my $char; |
| 597 |
$self->{next_char} = ord substr $$s, $i++, 1; |
if (defined $self->{next_next_char}) { |
| 598 |
|
$char = $self->{next_next_char}; |
| 599 |
|
delete $self->{next_next_char}; |
| 600 |
|
} else { |
| 601 |
|
$char = $input->getc; |
| 602 |
|
} |
| 603 |
|
$self->{next_char} = -1 and return unless defined $char; |
| 604 |
|
$self->{next_char} = ord $char; |
| 605 |
|
|
| 606 |
($self->{line_prev}, $self->{column_prev}) |
($self->{line_prev}, $self->{column_prev}) |
| 607 |
= ($self->{line}, $self->{column}); |
= ($self->{line}, $self->{column}); |
| 613 |
$self->{column} = 0; |
$self->{column} = 0; |
| 614 |
} elsif ($self->{next_char} == 0x000D) { # CR |
} elsif ($self->{next_char} == 0x000D) { # CR |
| 615 |
!!!cp ('j2'); |
!!!cp ('j2'); |
| 616 |
$i++ if substr ($$s, $i, 1) eq "\x0A"; |
my $next = $input->getc; |
| 617 |
|
if (defined $next and $next ne "\x0A") { |
| 618 |
|
$self->{next_next_char} = $next; |
| 619 |
|
} |
| 620 |
$self->{next_char} = 0x000A; # LF # MUST |
$self->{next_char} = 0x000A; # LF # MUST |
| 621 |
$self->{line}++; |
$self->{line}++; |
| 622 |
$self->{column} = 0; |
$self->{column} = 0; |
| 669 |
delete $self->{parse_error}; # remove loop |
delete $self->{parse_error}; # remove loop |
| 670 |
|
|
| 671 |
return $self->{document}; |
return $self->{document}; |
| 672 |
} # parse_string |
} # parse_char_stream |
| 673 |
|
|
| 674 |
sub new ($) { |
sub new ($) { |
| 675 |
my $class = shift; |
my $class = shift; |
| 676 |
my $self = bless {}, $class; |
my $self = bless { |
| 677 |
|
must_level => 'm', |
| 678 |
|
should_level => 's', |
| 679 |
|
good_level => 'w', |
| 680 |
|
warn_level => 'w', |
| 681 |
|
info_level => 'i', |
| 682 |
|
unsupported_level => 'u', |
| 683 |
|
}, $class; |
| 684 |
$self->{set_next_char} = sub { |
$self->{set_next_char} = sub { |
| 685 |
$self->{next_char} = -1; |
$self->{next_char} = -1; |
| 686 |
}; |
}; |
| 1045 |
redo A; |
redo A; |
| 1046 |
} else { |
} else { |
| 1047 |
!!!cp (23); |
!!!cp (23); |
| 1048 |
!!!parse-error (type => 'bare stago'); |
!!!parse-error (type => 'bare stago', |
| 1049 |
|
line => $self->{line_prev}, |
| 1050 |
|
column => $self->{column_prev}); |
| 1051 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1052 |
## reconsume |
## reconsume |
| 1053 |
|
|
| 1824 |
$self->{state} = SELF_CLOSING_START_TAG_STATE; |
$self->{state} = SELF_CLOSING_START_TAG_STATE; |
| 1825 |
!!!next-input-character; |
!!!next-input-character; |
| 1826 |
redo A; |
redo A; |
| 1827 |
|
} elsif ($self->{next_char} == -1) { |
| 1828 |
|
!!!parse-error (type => 'unclosed tag'); |
| 1829 |
|
if ($self->{current_token}->{type} == START_TAG_TOKEN) { |
| 1830 |
|
!!!cp (122.3); |
| 1831 |
|
$self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name}; |
| 1832 |
|
} elsif ($self->{current_token}->{type} == END_TAG_TOKEN) { |
| 1833 |
|
if ($self->{current_token}->{attributes}) { |
| 1834 |
|
!!!cp (122.1); |
| 1835 |
|
!!!parse-error (type => 'end tag attribute'); |
| 1836 |
|
} else { |
| 1837 |
|
## NOTE: This state should never be reached. |
| 1838 |
|
!!!cp (122.2); |
| 1839 |
|
} |
| 1840 |
|
} else { |
| 1841 |
|
die "$0: $self->{current_token}->{type}: Unknown token type"; |
| 1842 |
|
} |
| 1843 |
|
$self->{state} = DATA_STATE; |
| 1844 |
|
## Reconsume. |
| 1845 |
|
!!!emit ($self->{current_token}); # start tag or end tag |
| 1846 |
|
redo A; |
| 1847 |
} else { |
} else { |
| 1848 |
!!!cp ('124.1'); |
!!!cp ('124.1'); |
| 1849 |
!!!parse-error (type => 'no space between attributes'); |
!!!parse-error (type => 'no space between attributes'); |
| 1876 |
!!!emit ($self->{current_token}); # start tag or end tag |
!!!emit ($self->{current_token}); # start tag or end tag |
| 1877 |
|
|
| 1878 |
redo A; |
redo A; |
| 1879 |
|
} elsif ($self->{next_char} == -1) { |
| 1880 |
|
!!!parse-error (type => 'unclosed tag'); |
| 1881 |
|
if ($self->{current_token}->{type} == START_TAG_TOKEN) { |
| 1882 |
|
!!!cp (124.7); |
| 1883 |
|
$self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name}; |
| 1884 |
|
} elsif ($self->{current_token}->{type} == END_TAG_TOKEN) { |
| 1885 |
|
if ($self->{current_token}->{attributes}) { |
| 1886 |
|
!!!cp (124.5); |
| 1887 |
|
!!!parse-error (type => 'end tag attribute'); |
| 1888 |
|
} else { |
| 1889 |
|
## NOTE: This state should never be reached. |
| 1890 |
|
!!!cp (124.6); |
| 1891 |
|
} |
| 1892 |
|
} else { |
| 1893 |
|
die "$0: $self->{current_token}->{type}: Unknown token type"; |
| 1894 |
|
} |
| 1895 |
|
$self->{state} = DATA_STATE; |
| 1896 |
|
## Reconsume. |
| 1897 |
|
!!!emit ($self->{current_token}); # start tag or end tag |
| 1898 |
|
redo A; |
| 1899 |
} else { |
} else { |
| 1900 |
!!!cp ('124.4'); |
!!!cp ('124.4'); |
| 1901 |
!!!parse-error (type => 'nestc'); |
!!!parse-error (type => 'nestc'); |
| 2736 |
redo A; |
redo A; |
| 2737 |
} elsif ($self->{next_char} == -1) { |
} elsif ($self->{next_char} == -1) { |
| 2738 |
!!!cp (217); |
!!!cp (217); |
|
!!!parse-error (type => 'unclosed DOCTYPE'); |
|
| 2739 |
|
|
| 2740 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2741 |
## reconsume |
## reconsume |
| 3139 |
} elsif (defined $token->{public_identifier}) { |
} elsif (defined $token->{public_identifier}) { |
| 3140 |
my $pubid = $token->{public_identifier}; |
my $pubid = $token->{public_identifier}; |
| 3141 |
$pubid =~ tr/a-z/A-z/; |
$pubid =~ tr/a-z/A-z/; |
| 3142 |
if ({ |
my $prefix = [ |
| 3143 |
"+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1, |
"+//SILMARIL//DTD HTML PRO V0R11 19970101//", |
| 3144 |
"-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1, |
"-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//", |
| 3145 |
"-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1, |
"-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//", |
| 3146 |
"-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1, |
"-//IETF//DTD HTML 2.0 LEVEL 1//", |
| 3147 |
"-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1, |
"-//IETF//DTD HTML 2.0 LEVEL 2//", |
| 3148 |
"-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1, |
"-//IETF//DTD HTML 2.0 STRICT LEVEL 1//", |
| 3149 |
"-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1, |
"-//IETF//DTD HTML 2.0 STRICT LEVEL 2//", |
| 3150 |
"-//IETF//DTD HTML 2.0 STRICT//EN" => 1, |
"-//IETF//DTD HTML 2.0 STRICT//", |
| 3151 |
"-//IETF//DTD HTML 2.0//EN" => 1, |
"-//IETF//DTD HTML 2.0//", |
| 3152 |
"-//IETF//DTD HTML 2.1E//EN" => 1, |
"-//IETF//DTD HTML 2.1E//", |
| 3153 |
"-//IETF//DTD HTML 3.0//EN" => 1, |
"-//IETF//DTD HTML 3.0//", |
| 3154 |
"-//IETF//DTD HTML 3.0//EN//" => 1, |
"-//IETF//DTD HTML 3.2 FINAL//", |
| 3155 |
"-//IETF//DTD HTML 3.2 FINAL//EN" => 1, |
"-//IETF//DTD HTML 3.2//", |
| 3156 |
"-//IETF//DTD HTML 3.2//EN" => 1, |
"-//IETF//DTD HTML 3//", |
| 3157 |
"-//IETF//DTD HTML 3//EN" => 1, |
"-//IETF//DTD HTML LEVEL 0//", |
| 3158 |
"-//IETF//DTD HTML LEVEL 0//EN" => 1, |
"-//IETF//DTD HTML LEVEL 1//", |
| 3159 |
"-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1, |
"-//IETF//DTD HTML LEVEL 2//", |
| 3160 |
"-//IETF//DTD HTML LEVEL 1//EN" => 1, |
"-//IETF//DTD HTML LEVEL 3//", |
| 3161 |
"-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1, |
"-//IETF//DTD HTML STRICT LEVEL 0//", |
| 3162 |
"-//IETF//DTD HTML LEVEL 2//EN" => 1, |
"-//IETF//DTD HTML STRICT LEVEL 1//", |
| 3163 |
"-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1, |
"-//IETF//DTD HTML STRICT LEVEL 2//", |
| 3164 |
"-//IETF//DTD HTML LEVEL 3//EN" => 1, |
"-//IETF//DTD HTML STRICT LEVEL 3//", |
| 3165 |
"-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1, |
"-//IETF//DTD HTML STRICT//", |
| 3166 |
"-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1, |
"-//IETF//DTD HTML//", |
| 3167 |
"-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1, |
"-//METRIUS//DTD METRIUS PRESENTATIONAL//", |
| 3168 |
"-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1, |
"-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//", |
| 3169 |
"-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1, |
"-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//", |
| 3170 |
"-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1, |
"-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//", |
| 3171 |
"-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1, |
"-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//", |
| 3172 |
"-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1, |
"-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//", |
| 3173 |
"-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1, |
"-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//", |
| 3174 |
"-//IETF//DTD HTML STRICT//EN" => 1, |
"-//NETSCAPE COMM. CORP.//DTD HTML//", |
| 3175 |
"-//IETF//DTD HTML STRICT//EN//2.0" => 1, |
"-//NETSCAPE COMM. CORP.//DTD STRICT HTML//", |
| 3176 |
"-//IETF//DTD HTML STRICT//EN//3.0" => 1, |
"-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//", |
| 3177 |
"-//IETF//DTD HTML//EN" => 1, |
"-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//", |
| 3178 |
"-//IETF//DTD HTML//EN//2.0" => 1, |
"-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//", |
| 3179 |
"-//IETF//DTD HTML//EN//3.0" => 1, |
"-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//", |
| 3180 |
"-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1, |
"-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//", |
| 3181 |
"-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1, |
"-//SPYGLASS//DTD HTML 2.0 EXTENDED//", |
| 3182 |
"-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1, |
"-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//", |
| 3183 |
"-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1, |
"-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//", |
| 3184 |
"-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1, |
"-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//", |
| 3185 |
"-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1, |
"-//W3C//DTD HTML 3 1995-03-24//", |
| 3186 |
"-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1, |
"-//W3C//DTD HTML 3.2 DRAFT//", |
| 3187 |
"-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1, |
"-//W3C//DTD HTML 3.2 FINAL//", |
| 3188 |
"-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1, |
"-//W3C//DTD HTML 3.2//", |
| 3189 |
"-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1, |
"-//W3C//DTD HTML 3.2S DRAFT//", |
| 3190 |
"-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1, |
"-//W3C//DTD HTML 4.0 FRAMESET//", |
| 3191 |
"-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//EN" => 1, |
"-//W3C//DTD HTML 4.0 TRANSITIONAL//", |
| 3192 |
"-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//EN" => 1, |
"-//W3C//DTD HTML EXPERIMETNAL 19960712//", |
| 3193 |
"-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//EN" => 1, |
"-//W3C//DTD HTML EXPERIMENTAL 970421//", |
| 3194 |
"-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1, |
"-//W3C//DTD W3 HTML//", |
| 3195 |
"-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1, |
"-//W3O//DTD W3 HTML 3.0//", |
| 3196 |
"-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1, |
"-//WEBTECHS//DTD MOZILLA HTML 2.0//", |
| 3197 |
"-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1, |
"-//WEBTECHS//DTD MOZILLA HTML//", |
| 3198 |
"-//W3C//DTD HTML 3 1995-03-24//EN" => 1, |
]; # $prefix |
| 3199 |
"-//W3C//DTD HTML 3.2 DRAFT//EN" => 1, |
my $match; |
| 3200 |
"-//W3C//DTD HTML 3.2 FINAL//EN" => 1, |
for (@$prefix) { |
| 3201 |
"-//W3C//DTD HTML 3.2//EN" => 1, |
if (substr ($prefix, 0, length $_) eq $_) { |
| 3202 |
"-//W3C//DTD HTML 3.2S DRAFT//EN" => 1, |
$match = 1; |
| 3203 |
"-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1, |
last; |
| 3204 |
"-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1, |
} |
| 3205 |
"-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1, |
} |
| 3206 |
"-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1, |
if ($match or |
| 3207 |
"-//W3C//DTD W3 HTML//EN" => 1, |
$pubid eq "-//W3O//DTD W3 HTML STRICT 3.0//EN//" or |
| 3208 |
"-//W3O//DTD W3 HTML 3.0//EN" => 1, |
$pubid eq "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" or |
| 3209 |
"-//W3O//DTD W3 HTML 3.0//EN//" => 1, |
$pubid eq "HTML") { |
|
"-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1, |
|
|
"-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1, |
|
|
"-//WEBTECHS//DTD MOZILLA HTML//EN" => 1, |
|
|
"-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1, |
|
|
"HTML" => 1, |
|
|
}->{$pubid}) { |
|
| 3210 |
!!!cp ('t5'); |
!!!cp ('t5'); |
| 3211 |
$self->{document}->manakai_compat_mode ('quirks'); |
$self->{document}->manakai_compat_mode ('quirks'); |
| 3212 |
} elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or |
} elsif ($pubid =~ m[^-//W3C//DTD HTML 4.01 FRAMESET//] or |
| 3213 |
$pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") { |
$pubid =~ m[^-//W3C//DTD HTML 4.01 TRANSITIONAL//]) { |
| 3214 |
if (defined $token->{system_identifier}) { |
if (defined $token->{system_identifier}) { |
| 3215 |
!!!cp ('t6'); |
!!!cp ('t6'); |
| 3216 |
$self->{document}->manakai_compat_mode ('quirks'); |
$self->{document}->manakai_compat_mode ('quirks'); |
| 3218 |
!!!cp ('t7'); |
!!!cp ('t7'); |
| 3219 |
$self->{document}->manakai_compat_mode ('limited quirks'); |
$self->{document}->manakai_compat_mode ('limited quirks'); |
| 3220 |
} |
} |
| 3221 |
} elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or |
} elsif ($pubid =~ m[^-//W3C//DTD XHTML 1.0 FRAMESET//] or |
| 3222 |
$pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") { |
$pubid =~ m[^-//W3C//DTD XHTML 1.0 TRANSITIONAL//]) { |
| 3223 |
!!!cp ('t8'); |
!!!cp ('t8'); |
| 3224 |
$self->{document}->manakai_compat_mode ('limited quirks'); |
$self->{document}->manakai_compat_mode ('limited quirks'); |
| 3225 |
} else { |
} else { |
| 3232 |
my $sysid = $token->{system_identifier}; |
my $sysid = $token->{system_identifier}; |
| 3233 |
$sysid =~ tr/A-Z/a-z/; |
$sysid =~ tr/A-Z/a-z/; |
| 3234 |
if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") { |
if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") { |
| 3235 |
## TODO: Check the spec: PUBLIC "(limited quirks)" "(quirks)" |
## NOTE: Ensure that |PUBLIC "(limited quirks)" "(quirks)"| is |
| 3236 |
|
## marked as quirks. |
| 3237 |
$self->{document}->manakai_compat_mode ('quirks'); |
$self->{document}->manakai_compat_mode ('quirks'); |
| 3238 |
!!!cp ('t11'); |
!!!cp ('t11'); |
| 3239 |
} else { |
} else { |
| 3405 |
if ($self->{open_elements}->[0]->[0] eq $node->[0]) { |
if ($self->{open_elements}->[0]->[0] eq $node->[0]) { |
| 3406 |
$last = 1; |
$last = 1; |
| 3407 |
if (defined $self->{inner_html_node}) { |
if (defined $self->{inner_html_node}) { |
| 3408 |
if ($self->{inner_html_node}->[1] & TABLE_CELL_EL) { |
!!!cp ('t28'); |
| 3409 |
!!!cp ('t27'); |
$node = $self->{inner_html_node}; |
| 3410 |
# |
} else { |
| 3411 |
} else { |
die "_reset_insertion_mode: t27"; |
|
!!!cp ('t28'); |
|
|
$node = $self->{inner_html_node}; |
|
|
} |
|
| 3412 |
} |
} |
| 3413 |
} |
} |
| 3414 |
|
|
| 3415 |
## Step 4..14 |
## Step 4..14 |
| 3416 |
my $new_mode; |
my $new_mode; |
| 3417 |
if ($node->[1] & FOREIGN_EL) { |
if ($node->[1] & FOREIGN_EL) { |
| 3418 |
## NOTE: Strictly speaking, the line below only applies to MathML and |
!!!cp ('t28.1'); |
| 3419 |
## SVG elements. Currently the HTML syntax supports only MathML and |
## NOTE: Strictly speaking, the line below only applies to MathML and |
| 3420 |
## SVG elements as foreigners. |
## SVG elements. Currently the HTML syntax supports only MathML and |
| 3421 |
$new_mode = $self->{insertion_mode} | IN_FOREIGN_CONTENT_IM; |
## SVG elements as foreigners. |
| 3422 |
## ISSUE: What is set as the secondary insertion mode? |
$new_mode = $self->{insertion_mode} | IN_FOREIGN_CONTENT_IM; |
| 3423 |
} else { |
## ISSUE: What is set as the secondary insertion mode? |
| 3424 |
$new_mode = { |
} elsif ($node->[1] & TABLE_CELL_EL) { |
| 3425 |
|
if ($last) { |
| 3426 |
|
!!!cp ('t28.2'); |
| 3427 |
|
# |
| 3428 |
|
} else { |
| 3429 |
|
!!!cp ('t28.3'); |
| 3430 |
|
$new_mode = IN_CELL_IM; |
| 3431 |
|
} |
| 3432 |
|
} else { |
| 3433 |
|
!!!cp ('t28.4'); |
| 3434 |
|
$new_mode = { |
| 3435 |
select => IN_SELECT_IM, |
select => IN_SELECT_IM, |
| 3436 |
## NOTE: |option| and |optgroup| do not set |
## NOTE: |option| and |optgroup| do not set |
| 3437 |
## insertion mode to "in select" by themselves. |
## insertion mode to "in select" by themselves. |
|
td => IN_CELL_IM, |
|
|
th => IN_CELL_IM, |
|
| 3438 |
tr => IN_ROW_IM, |
tr => IN_ROW_IM, |
| 3439 |
tbody => IN_TABLE_BODY_IM, |
tbody => IN_TABLE_BODY_IM, |
| 3440 |
thead => IN_TABLE_BODY_IM, |
thead => IN_TABLE_BODY_IM, |
| 3446 |
body => IN_BODY_IM, |
body => IN_BODY_IM, |
| 3447 |
frameset => IN_FRAMESET_IM, |
frameset => IN_FRAMESET_IM, |
| 3448 |
}->{$node->[0]->manakai_local_name}; |
}->{$node->[0]->manakai_local_name}; |
| 3449 |
} |
} |
| 3450 |
$self->{insertion_mode} = $new_mode and return if defined $new_mode; |
$self->{insertion_mode} = $new_mode and return if defined $new_mode; |
| 3451 |
|
|
| 3452 |
## Step 15 |
## Step 15 |
| 3453 |
if ($node->[1] & HTML_EL) { |
if ($node->[1] & HTML_EL) { |
| 4190 |
!!!next-token; |
!!!next-token; |
| 4191 |
next B; |
next B; |
| 4192 |
} elsif ($self->{insertion_mode} == AFTER_HEAD_IM) { |
} elsif ($self->{insertion_mode} == AFTER_HEAD_IM) { |
| 4193 |
!!!cp ('t94'); |
!!!cp ('t93.2'); |
| 4194 |
# |
!!!parse-error (type => 'after head:head', token => $token); ## TODO: error type |
| 4195 |
|
## Ignore the token |
| 4196 |
|
!!!nack ('t93.3'); |
| 4197 |
|
!!!next-token; |
| 4198 |
|
next B; |
| 4199 |
} else { |
} else { |
| 4200 |
!!!cp ('t95'); |
!!!cp ('t95'); |
| 4201 |
!!!parse-error (type => 'in head:head', token => $token); # or in head noscript |
!!!parse-error (type => 'in head:head', token => $token); # or in head noscript |
| 4278 |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
| 4279 |
|
|
| 4280 |
unless ($self->{confident}) { |
unless ($self->{confident}) { |
| 4281 |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
if ($token->{attributes}->{charset}) { |
| 4282 |
!!!cp ('t106'); |
!!!cp ('t106'); |
| 4283 |
|
## NOTE: Whether the encoding is supported or not is handled |
| 4284 |
|
## in the {change_encoding} callback. |
| 4285 |
$self->{change_encoding} |
$self->{change_encoding} |
| 4286 |
->($self, $token->{attributes}->{charset}->{value}, |
->($self, $token->{attributes}->{charset}->{value}, |
| 4287 |
$token); |
$token); |
| 4291 |
$token->{attributes}->{charset} |
$token->{attributes}->{charset} |
| 4292 |
->{has_reference}); |
->{has_reference}); |
| 4293 |
} elsif ($token->{attributes}->{content}) { |
} elsif ($token->{attributes}->{content}) { |
|
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
|
| 4294 |
if ($token->{attributes}->{content}->{value} |
if ($token->{attributes}->{content}->{value} |
| 4295 |
=~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt] |
=~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt] |
| 4296 |
[\x09-\x0D\x20]*= |
[\x09-\x0D\x20]*= |
| 4297 |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
| 4298 |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) { |
| 4299 |
!!!cp ('t107'); |
!!!cp ('t107'); |
| 4300 |
|
## NOTE: Whether the encoding is supported or not is handled |
| 4301 |
|
## in the {change_encoding} callback. |
| 4302 |
$self->{change_encoding} |
$self->{change_encoding} |
| 4303 |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3, |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3, |
| 4304 |
$token); |
$token); |
| 4517 |
$self->{insertion_mode} = AFTER_HEAD_IM; |
$self->{insertion_mode} = AFTER_HEAD_IM; |
| 4518 |
!!!next-token; |
!!!next-token; |
| 4519 |
next B; |
next B; |
| 4520 |
|
} elsif ($self->{insertion_mode} == AFTER_HEAD_IM) { |
| 4521 |
|
!!!cp ('t134.1'); |
| 4522 |
|
!!!parse-error (type => 'unmatched end tag:head', token => $token); |
| 4523 |
|
## Ignore the token |
| 4524 |
|
!!!next-token; |
| 4525 |
|
next B; |
| 4526 |
} else { |
} else { |
| 4527 |
!!!cp ('t135'); |
die "$0: $self->{insertion_mode}: Unknown insertion mode"; |
|
# |
|
| 4528 |
} |
} |
| 4529 |
} elsif ($token->{tag_name} eq 'noscript') { |
} elsif ($token->{tag_name} eq 'noscript') { |
| 4530 |
if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { |
if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { |
| 4533 |
$self->{insertion_mode} = IN_HEAD_IM; |
$self->{insertion_mode} = IN_HEAD_IM; |
| 4534 |
!!!next-token; |
!!!next-token; |
| 4535 |
next B; |
next B; |
| 4536 |
} elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) { |
} elsif ($self->{insertion_mode} == BEFORE_HEAD_IM or |
| 4537 |
|
$self->{insertion_mode} == AFTER_HEAD_IM) { |
| 4538 |
!!!cp ('t137'); |
!!!cp ('t137'); |
| 4539 |
!!!parse-error (type => 'unmatched end tag:noscript', token => $token); |
!!!parse-error (type => 'unmatched end tag:noscript', token => $token); |
| 4540 |
## Ignore the token ## ISSUE: An issue in the spec. |
## Ignore the token ## ISSUE: An issue in the spec. |
| 4547 |
} elsif ({ |
} elsif ({ |
| 4548 |
body => 1, html => 1, |
body => 1, html => 1, |
| 4549 |
}->{$token->{tag_name}}) { |
}->{$token->{tag_name}}) { |
| 4550 |
if ($self->{insertion_mode} == BEFORE_HEAD_IM) { |
if ($self->{insertion_mode} == BEFORE_HEAD_IM or |
| 4551 |
!!!cp ('t139'); |
$self->{insertion_mode} == IN_HEAD_IM or |
| 4552 |
## As if <head> |
$self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { |
|
!!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token); |
|
|
$self->{open_elements}->[-1]->[0]->append_child ($self->{head_element}); |
|
|
push @{$self->{open_elements}}, |
|
|
[$self->{head_element}, $el_category->{head}]; |
|
|
|
|
|
$self->{insertion_mode} = IN_HEAD_IM; |
|
|
## Reprocess in the "in head" insertion mode... |
|
|
} elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { |
|
| 4553 |
!!!cp ('t140'); |
!!!cp ('t140'); |
| 4554 |
!!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token); |
!!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token); |
| 4555 |
## Ignore the token |
## Ignore the token |
| 4556 |
!!!next-token; |
!!!next-token; |
| 4557 |
next B; |
next B; |
| 4558 |
|
} elsif ($self->{insertion_mode} == AFTER_HEAD_IM) { |
| 4559 |
|
!!!cp ('t140.1'); |
| 4560 |
|
!!!parse-error (type => 'unmatched end tag:' . $token->{tag_name}, token => $token); |
| 4561 |
|
## Ignore the token |
| 4562 |
|
!!!next-token; |
| 4563 |
|
next B; |
| 4564 |
} else { |
} else { |
| 4565 |
!!!cp ('t141'); |
die "$0: $self->{insertion_mode}: Unknown insertion mode"; |
| 4566 |
} |
} |
| 4567 |
|
} elsif ($token->{tag_name} eq 'p') { |
| 4568 |
# |
!!!cp ('t142'); |
| 4569 |
} elsif ({ |
!!!parse-error (type => 'unmatched end tag:p', token => $token); |
| 4570 |
p => 1, br => 1, |
## Ignore the token |
| 4571 |
}->{$token->{tag_name}}) { |
!!!next-token; |
| 4572 |
|
next B; |
| 4573 |
|
} elsif ($token->{tag_name} eq 'br') { |
| 4574 |
if ($self->{insertion_mode} == BEFORE_HEAD_IM) { |
if ($self->{insertion_mode} == BEFORE_HEAD_IM) { |
| 4575 |
!!!cp ('t142'); |
!!!cp ('t142.2'); |
| 4576 |
## As if <head> |
## (before head) as if <head>, (in head) as if </head> |
| 4577 |
!!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token); |
!!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token); |
| 4578 |
$self->{open_elements}->[-1]->[0]->append_child ($self->{head_element}); |
$self->{open_elements}->[-1]->[0]->append_child ($self->{head_element}); |
| 4579 |
push @{$self->{open_elements}}, |
$self->{insertion_mode} = AFTER_HEAD_IM; |
| 4580 |
[$self->{head_element}, $el_category->{head}]; |
|
| 4581 |
|
## Reprocess in the "after head" insertion mode... |
| 4582 |
|
} elsif ($self->{insertion_mode} == IN_HEAD_IM) { |
| 4583 |
|
!!!cp ('t143.2'); |
| 4584 |
|
## As if </head> |
| 4585 |
|
pop @{$self->{open_elements}}; |
| 4586 |
|
$self->{insertion_mode} = AFTER_HEAD_IM; |
| 4587 |
|
|
| 4588 |
|
## Reprocess in the "after head" insertion mode... |
| 4589 |
|
} elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { |
| 4590 |
|
!!!cp ('t143.3'); |
| 4591 |
|
## ISSUE: Two parse errors for <head><noscript></br> |
| 4592 |
|
!!!parse-error (type => 'unmatched end tag:br', token => $token); |
| 4593 |
|
## As if </noscript> |
| 4594 |
|
pop @{$self->{open_elements}}; |
| 4595 |
$self->{insertion_mode} = IN_HEAD_IM; |
$self->{insertion_mode} = IN_HEAD_IM; |
| 4596 |
|
|
| 4597 |
## Reprocess in the "in head" insertion mode... |
## Reprocess in the "in head" insertion mode... |
| 4598 |
} else { |
## As if </head> |
| 4599 |
!!!cp ('t143'); |
pop @{$self->{open_elements}}; |
| 4600 |
} |
$self->{insertion_mode} = AFTER_HEAD_IM; |
| 4601 |
|
|
| 4602 |
# |
## Reprocess in the "after head" insertion mode... |
| 4603 |
} else { |
} elsif ($self->{insertion_mode} == AFTER_HEAD_IM) { |
| 4604 |
if ($self->{insertion_mode} == AFTER_HEAD_IM) { |
!!!cp ('t143.4'); |
|
!!!cp ('t144'); |
|
| 4605 |
# |
# |
| 4606 |
} else { |
} else { |
| 4607 |
!!!cp ('t145'); |
die "$0: $self->{insertion_mode}: Unknown insertion mode"; |
|
!!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token); |
|
|
## Ignore the token |
|
|
!!!next-token; |
|
|
next B; |
|
| 4608 |
} |
} |
| 4609 |
|
|
| 4610 |
|
## ISSUE: does not agree with IE7 - it doesn't ignore </br>. |
| 4611 |
|
!!!parse-error (type => 'unmatched end tag:br', token => $token); |
| 4612 |
|
## Ignore the token |
| 4613 |
|
!!!next-token; |
| 4614 |
|
next B; |
| 4615 |
|
} else { |
| 4616 |
|
!!!cp ('t145'); |
| 4617 |
|
!!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token); |
| 4618 |
|
## Ignore the token |
| 4619 |
|
!!!next-token; |
| 4620 |
|
next B; |
| 4621 |
} |
} |
| 4622 |
|
|
| 4623 |
if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { |
if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { |
| 6342 |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
| 6343 |
|
|
| 6344 |
unless ($self->{confident}) { |
unless ($self->{confident}) { |
| 6345 |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
if ($token->{attributes}->{charset}) { |
| 6346 |
!!!cp ('t335'); |
!!!cp ('t335'); |
| 6347 |
|
## NOTE: Whether the encoding is supported or not is handled |
| 6348 |
|
## in the {change_encoding} callback. |
| 6349 |
$self->{change_encoding} |
$self->{change_encoding} |
| 6350 |
->($self, $token->{attributes}->{charset}->{value}, $token); |
->($self, $token->{attributes}->{charset}->{value}, $token); |
| 6351 |
|
|
| 6354 |
$token->{attributes}->{charset} |
$token->{attributes}->{charset} |
| 6355 |
->{has_reference}); |
->{has_reference}); |
| 6356 |
} elsif ($token->{attributes}->{content}) { |
} elsif ($token->{attributes}->{content}) { |
|
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
|
| 6357 |
if ($token->{attributes}->{content}->{value} |
if ($token->{attributes}->{content}->{value} |
| 6358 |
=~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt] |
=~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt] |
| 6359 |
[\x09-\x0D\x20]*= |
[\x09-\x0D\x20]*= |
| 6360 |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
| 6361 |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) { |
| 6362 |
!!!cp ('t336'); |
!!!cp ('t336'); |
| 6363 |
|
## NOTE: Whether the encoding is supported or not is handled |
| 6364 |
|
## in the {change_encoding} callback. |
| 6365 |
$self->{change_encoding} |
$self->{change_encoding} |
| 6366 |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token); |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token); |
| 6367 |
$meta_el->[0]->get_attribute_node_ns (undef, 'content') |
$meta_el->[0]->get_attribute_node_ns (undef, 'content') |