| 337 |
my $bytes_s = ref $_[0] ? $_[0] : \($_[0]); |
my $bytes_s = ref $_[0] ? $_[0] : \($_[0]); |
| 338 |
my $s; |
my $s; |
| 339 |
|
|
| 340 |
|
my $onerror = $_[2] || sub { |
| 341 |
|
my (%opt) = @_; |
| 342 |
|
warn "Parse error ($opt{type})\n"; |
| 343 |
|
}; |
| 344 |
|
$self->{parse_error} = $onerror; # updated later by parse_char_string |
| 345 |
|
|
| 346 |
## HTML5 encoding sniffing algorithm |
## HTML5 encoding sniffing algorithm |
| 347 |
require Message::Charset::Info; |
require Message::Charset::Info; |
| 348 |
my $charset; |
my $charset; |
| 410 |
(allow_error_reporting => 1, |
(allow_error_reporting => 1, |
| 411 |
allow_fallback => 1); |
allow_fallback => 1); |
| 412 |
if ($e) { |
if ($e) { |
| 413 |
|
!!!parse-error (type => 'sniffing:chardet', ## TODO: type name |
| 414 |
|
value => $charset_name, |
| 415 |
|
level => $self->{info_level}, |
| 416 |
|
line => 1, column => 1); |
| 417 |
$self->{confident} = 0; |
$self->{confident} = 0; |
| 418 |
last SNIFFING; |
last SNIFFING; |
| 419 |
} |
} |
| 426 |
## detectable in the step 6. |
## detectable in the step 6. |
| 427 |
($e, $e_status) = $charset->get_perl_encoding (allow_error_reporting => 1, |
($e, $e_status) = $charset->get_perl_encoding (allow_error_reporting => 1, |
| 428 |
allow_fallback => 1); |
allow_fallback => 1); |
| 429 |
|
!!!parse-error (type => 'sniffing:default', ## TODO: type name |
| 430 |
|
value => 'windows-1252', |
| 431 |
|
level => $self->{info_level}, |
| 432 |
|
line => 1, column => 1); |
| 433 |
$self->{confident} = 0; |
$self->{confident} = 0; |
| 434 |
} # SNIFFING |
} # SNIFFING |
| 435 |
|
|
| 436 |
|
$self->{input_encoding} = $charset->get_iana_name; |
| 437 |
if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { |
if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { |
| 438 |
|
!!!parse-error (type => 'chardecode:fallback', ## TODO: type name |
| 439 |
|
value => $e->name, |
| 440 |
|
level => $self->{unsupported_level}, |
| 441 |
|
line => 1, column => 1); |
| 442 |
} elsif (not ($e_status & |
} elsif (not ($e_status & |
| 443 |
Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) { |
Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) { |
| 444 |
|
!!!parse-error (type => 'chardecode:no error', ## TODO: type name |
| 445 |
|
value => $self->{input_encoding}, |
| 446 |
|
level => $self->{unsupported_level}, |
| 447 |
|
line => 1, column => 1); |
| 448 |
} |
} |
| 449 |
$s = \ $e->decode ($$bytes_s); |
$s = \ $e->decode ($$bytes_s); |
|
$self->{input_encoding} = $charset->get_iana_name; |
|
| 450 |
|
|
| 451 |
$self->{change_encoding} = sub { |
$self->{change_encoding} = sub { |
| 452 |
my $self = shift; |
my $self = shift; |
| 453 |
my $charset_name = lc shift; |
$charset_name = shift; |
| 454 |
my $token = shift; |
my $token = shift; |
|
## TODO: if $charset_name is supported |
|
|
## TODO: normalize charset name |
|
| 455 |
|
|
| 456 |
## "Change the encoding" algorithm: |
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
| 457 |
|
($e, $e_status) = $charset->get_perl_encoding |
| 458 |
|
(allow_error_reporting => 1, allow_fallback => 1); |
| 459 |
|
|
| 460 |
|
if ($e) { # if supported |
| 461 |
|
## "Change the encoding" algorithm: |
| 462 |
|
|
| 463 |
## Step 1 |
## Step 1 |
| 464 |
if ($charset_name eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8? |
if ($charset->{iana_names}->{'utf-16'}) { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8? |
| 465 |
$charset_name = 'utf-8'; |
$charset = Message::Charset::Info->get_by_iana_name ('utf-8'); |
| 466 |
} |
($e, $e_status) = $charset->get_perl_encoding; |
| 467 |
|
} |
| 468 |
|
$charset_name = $charset->get_iana_name; |
| 469 |
|
|
| 470 |
|
## Step 2 |
| 471 |
|
if (defined $self->{input_encoding} and |
| 472 |
|
$self->{input_encoding} eq $charset_name) { |
| 473 |
|
!!!parse-error (type => 'charset label:matching', ## TODO: type |
| 474 |
|
value => $charset_name, |
| 475 |
|
level => $self->{info_level}); |
| 476 |
|
$self->{confident} = 1; |
| 477 |
|
return; |
| 478 |
|
} |
| 479 |
|
|
| 480 |
## Step 2 |
!!!parse-error (type => 'charset label detected:'.$self->{input_encoding}. |
| 481 |
if (defined $self->{input_encoding} and |
':'.$charset_name, level => 'w', token => $token); |
| 482 |
$self->{input_encoding} eq $charset_name) { |
|
| 483 |
$self->{confident} = 1; |
## Step 3 |
| 484 |
return; |
# if (can) { |
| 485 |
|
## change the encoding on the fly. |
| 486 |
|
#$self->{confident} = 1; |
| 487 |
|
#return; |
| 488 |
|
# } |
| 489 |
|
|
| 490 |
|
## Step 4 |
| 491 |
|
throw Whatpm::HTML::RestartParser (); |
| 492 |
} |
} |
|
|
|
|
!!!parse-error (type => 'charset label detected:'.$self->{input_encoding}. |
|
|
':'.$charset_name, level => 'w', token => $token); |
|
|
|
|
|
## Step 3 |
|
|
# if (can) { |
|
|
## change the encoding on the fly. |
|
|
#$self->{confident} = 1; |
|
|
#return; |
|
|
# } |
|
|
|
|
|
## Step 4 |
|
|
throw Whatpm::HTML::RestartParser (charset => $charset_name); |
|
| 493 |
}; # $self->{change_encoding} |
}; # $self->{change_encoding} |
| 494 |
|
|
| 495 |
my @args = @_; shift @args; # $s |
my @args = @_; shift @args; # $s |
| 497 |
try { |
try { |
| 498 |
$return = $self->parse_char_string ($s, @args); |
$return = $self->parse_char_string ($s, @args); |
| 499 |
} catch Whatpm::HTML::RestartParser with { |
} catch Whatpm::HTML::RestartParser with { |
| 500 |
my $charset_name = shift->{charset}; |
## NOTE: Invoked after {change_encoding}. |
| 501 |
$s = \ (Encode::decode ($charset_name, $$bytes_s)); |
|
| 502 |
$self->{input_encoding} = $charset_name; ## TODO: normalize |
$self->{input_encoding} = $charset->get_iana_name; |
| 503 |
|
if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { |
| 504 |
|
!!!parse-error (type => 'chardecode:fallback', ## TODO: type name |
| 505 |
|
value => $e->name, |
| 506 |
|
level => $self->{unsupported_level}, |
| 507 |
|
line => 1, column => 1); |
| 508 |
|
} elsif (not ($e_status & |
| 509 |
|
Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) { |
| 510 |
|
!!!parse-error (type => 'chardecode:no error', ## TODO: type name |
| 511 |
|
value => $self->{input_encoding}, |
| 512 |
|
level => $self->{unsupported_level}, |
| 513 |
|
line => 1, column => 1); |
| 514 |
|
} |
| 515 |
|
$s = \ $e->decode ($$bytes_s); |
| 516 |
$self->{confident} = 1; |
$self->{confident} = 1; |
| 517 |
$return = $self->parse_char_string ($s, @args); |
$return = $self->parse_char_string ($s, @args); |
| 518 |
}; |
}; |
| 621 |
|
|
| 622 |
sub new ($) { |
sub new ($) { |
| 623 |
my $class = shift; |
my $class = shift; |
| 624 |
my $self = bless {}, $class; |
my $self = bless { |
| 625 |
|
must_level => 'm', |
| 626 |
|
should_level => 's', |
| 627 |
|
good_level => 'w', |
| 628 |
|
warn_level => 'w', |
| 629 |
|
info_level => 'i', |
| 630 |
|
unsupported_level => 'u', |
| 631 |
|
}, $class; |
| 632 |
$self->{set_next_char} = sub { |
$self->{set_next_char} = sub { |
| 633 |
$self->{next_char} = -1; |
$self->{next_char} = -1; |
| 634 |
}; |
}; |
| 4181 |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
| 4182 |
|
|
| 4183 |
unless ($self->{confident}) { |
unless ($self->{confident}) { |
| 4184 |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
if ($token->{attributes}->{charset}) { |
| 4185 |
!!!cp ('t106'); |
!!!cp ('t106'); |
| 4186 |
|
## NOTE: Whether the encoding is supported or not is handled |
| 4187 |
|
## in the {change_encoding} callback. |
| 4188 |
$self->{change_encoding} |
$self->{change_encoding} |
| 4189 |
->($self, $token->{attributes}->{charset}->{value}, |
->($self, $token->{attributes}->{charset}->{value}, |
| 4190 |
$token); |
$token); |
| 4194 |
$token->{attributes}->{charset} |
$token->{attributes}->{charset} |
| 4195 |
->{has_reference}); |
->{has_reference}); |
| 4196 |
} elsif ($token->{attributes}->{content}) { |
} elsif ($token->{attributes}->{content}) { |
|
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
|
| 4197 |
if ($token->{attributes}->{content}->{value} |
if ($token->{attributes}->{content}->{value} |
| 4198 |
=~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt] |
=~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt] |
| 4199 |
[\x09-\x0D\x20]*= |
[\x09-\x0D\x20]*= |
| 4200 |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
| 4201 |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
| 4202 |
!!!cp ('t107'); |
!!!cp ('t107'); |
| 4203 |
|
## NOTE: Whether the encoding is supported or not is handled |
| 4204 |
|
## in the {change_encoding} callback. |
| 4205 |
$self->{change_encoding} |
$self->{change_encoding} |
| 4206 |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3, |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3, |
| 4207 |
$token); |
$token); |
| 6218 |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
| 6219 |
|
|
| 6220 |
unless ($self->{confident}) { |
unless ($self->{confident}) { |
| 6221 |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
if ($token->{attributes}->{charset}) { |
| 6222 |
!!!cp ('t335'); |
!!!cp ('t335'); |
| 6223 |
|
## NOTE: Whether the encoding is supported or not is handled |
| 6224 |
|
## in the {change_encoding} callback. |
| 6225 |
$self->{change_encoding} |
$self->{change_encoding} |
| 6226 |
->($self, $token->{attributes}->{charset}->{value}, $token); |
->($self, $token->{attributes}->{charset}->{value}, $token); |
| 6227 |
|
|
| 6230 |
$token->{attributes}->{charset} |
$token->{attributes}->{charset} |
| 6231 |
->{has_reference}); |
->{has_reference}); |
| 6232 |
} elsif ($token->{attributes}->{content}) { |
} elsif ($token->{attributes}->{content}) { |
|
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
|
| 6233 |
if ($token->{attributes}->{content}->{value} |
if ($token->{attributes}->{content}->{value} |
| 6234 |
=~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt] |
=~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt] |
| 6235 |
[\x09-\x0D\x20]*= |
[\x09-\x0D\x20]*= |
| 6236 |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
| 6237 |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
| 6238 |
!!!cp ('t336'); |
!!!cp ('t336'); |
| 6239 |
|
## NOTE: Whether the encoding is supported or not is handled |
| 6240 |
|
## in the {change_encoding} callback. |
| 6241 |
$self->{change_encoding} |
$self->{change_encoding} |
| 6242 |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token); |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token); |
| 6243 |
$meta_el->[0]->get_attribute_node_ns (undef, 'content') |
$meta_el->[0]->get_attribute_node_ns (undef, 'content') |