337 |
my $bytes_s = ref $_[0] ? $_[0] : \($_[0]); |
my $bytes_s = ref $_[0] ? $_[0] : \($_[0]); |
338 |
my $s; |
my $s; |
339 |
|
|
340 |
|
my $onerror = $_[2] || sub { |
341 |
|
my (%opt) = @_; |
342 |
|
warn "Parse error ($opt{type})\n"; |
343 |
|
}; |
344 |
|
$self->{parse_error} = $onerror; # updated later by parse_char_string |
345 |
|
|
346 |
## HTML5 encoding sniffing algorithm |
## HTML5 encoding sniffing algorithm |
347 |
require Message::Charset::Info; |
require Message::Charset::Info; |
348 |
my $charset; |
my $charset; |
410 |
(allow_error_reporting => 1, |
(allow_error_reporting => 1, |
411 |
allow_fallback => 1); |
allow_fallback => 1); |
412 |
if ($e) { |
if ($e) { |
413 |
|
!!!parse-error (type => 'sniffing:chardet', ## TODO: type name |
414 |
|
value => $charset_name, |
415 |
|
level => $self->{info_level}, |
416 |
|
line => 1, column => 1); |
417 |
$self->{confident} = 0; |
$self->{confident} = 0; |
418 |
last SNIFFING; |
last SNIFFING; |
419 |
} |
} |
426 |
## detectable in the step 6. |
## detectable in the step 6. |
427 |
($e, $e_status) = $charset->get_perl_encoding (allow_error_reporting => 1, |
($e, $e_status) = $charset->get_perl_encoding (allow_error_reporting => 1, |
428 |
allow_fallback => 1); |
allow_fallback => 1); |
429 |
|
!!!parse-error (type => 'sniffing:default', ## TODO: type name |
430 |
|
value => 'windows-1252', |
431 |
|
level => $self->{info_level}, |
432 |
|
line => 1, column => 1); |
433 |
$self->{confident} = 0; |
$self->{confident} = 0; |
434 |
} # SNIFFING |
} # SNIFFING |
435 |
|
|
436 |
|
$self->{input_encoding} = $charset->get_iana_name; |
437 |
if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { |
if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { |
438 |
|
!!!parse-error (type => 'chardecode:fallback', ## TODO: type name |
439 |
|
value => $e->name, |
440 |
|
level => $self->{unsupported_level}, |
441 |
|
line => 1, column => 1); |
442 |
} elsif (not ($e_status & |
} elsif (not ($e_status & |
443 |
Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) { |
Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) { |
444 |
|
!!!parse-error (type => 'chardecode:no error', ## TODO: type name |
445 |
|
value => $self->{input_encoding}, |
446 |
|
level => $self->{unsupported_level}, |
447 |
|
line => 1, column => 1); |
448 |
} |
} |
449 |
$s = \ $e->decode ($$bytes_s); |
$s = \ $e->decode ($$bytes_s); |
|
$self->{input_encoding} = $charset->get_iana_name; |
|
450 |
|
|
451 |
$self->{change_encoding} = sub { |
$self->{change_encoding} = sub { |
452 |
my $self = shift; |
my $self = shift; |
453 |
my $charset_name = lc shift; |
$charset_name = shift; |
454 |
my $token = shift; |
my $token = shift; |
|
## TODO: if $charset_name is supported |
|
|
## TODO: normalize charset name |
|
455 |
|
|
456 |
## "Change the encoding" algorithm: |
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
457 |
|
($e, $e_status) = $charset->get_perl_encoding |
458 |
|
(allow_error_reporting => 1, allow_fallback => 1); |
459 |
|
|
460 |
|
if ($e) { # if supported |
461 |
|
## "Change the encoding" algorithm: |
462 |
|
|
463 |
## Step 1 |
## Step 1 |
464 |
if ($charset_name eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8? |
if ($charset->{iana_names}->{'utf-16'}) { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8? |
465 |
$charset_name = 'utf-8'; |
$charset = Message::Charset::Info->get_by_iana_name ('utf-8'); |
466 |
} |
($e, $e_status) = $charset->get_perl_encoding; |
467 |
|
} |
468 |
|
$charset_name = $charset->get_iana_name; |
469 |
|
|
470 |
|
## Step 2 |
471 |
|
if (defined $self->{input_encoding} and |
472 |
|
$self->{input_encoding} eq $charset_name) { |
473 |
|
!!!parse-error (type => 'charset label:matching', ## TODO: type |
474 |
|
value => $charset_name, |
475 |
|
level => $self->{info_level}); |
476 |
|
$self->{confident} = 1; |
477 |
|
return; |
478 |
|
} |
479 |
|
|
480 |
## Step 2 |
!!!parse-error (type => 'charset label detected:'.$self->{input_encoding}. |
481 |
if (defined $self->{input_encoding} and |
':'.$charset_name, level => 'w', token => $token); |
482 |
$self->{input_encoding} eq $charset_name) { |
|
483 |
$self->{confident} = 1; |
## Step 3 |
484 |
return; |
# if (can) { |
485 |
|
## change the encoding on the fly. |
486 |
|
#$self->{confident} = 1; |
487 |
|
#return; |
488 |
|
# } |
489 |
|
|
490 |
|
## Step 4 |
491 |
|
throw Whatpm::HTML::RestartParser (); |
492 |
} |
} |
|
|
|
|
!!!parse-error (type => 'charset label detected:'.$self->{input_encoding}. |
|
|
':'.$charset_name, level => 'w', token => $token); |
|
|
|
|
|
## Step 3 |
|
|
# if (can) { |
|
|
## change the encoding on the fly. |
|
|
#$self->{confident} = 1; |
|
|
#return; |
|
|
# } |
|
|
|
|
|
## Step 4 |
|
|
throw Whatpm::HTML::RestartParser (charset => $charset_name); |
|
493 |
}; # $self->{change_encoding} |
}; # $self->{change_encoding} |
494 |
|
|
495 |
my @args = @_; shift @args; # $s |
my @args = @_; shift @args; # $s |
497 |
try { |
try { |
498 |
$return = $self->parse_char_string ($s, @args); |
$return = $self->parse_char_string ($s, @args); |
499 |
} catch Whatpm::HTML::RestartParser with { |
} catch Whatpm::HTML::RestartParser with { |
500 |
my $charset_name = shift->{charset}; |
## NOTE: Invoked after {change_encoding}. |
501 |
$s = \ (Encode::decode ($charset_name, $$bytes_s)); |
|
502 |
$self->{input_encoding} = $charset_name; ## TODO: normalize |
$self->{input_encoding} = $charset->get_iana_name; |
503 |
|
if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { |
504 |
|
!!!parse-error (type => 'chardecode:fallback', ## TODO: type name |
505 |
|
value => $e->name, |
506 |
|
level => $self->{unsupported_level}, |
507 |
|
line => 1, column => 1); |
508 |
|
} elsif (not ($e_status & |
509 |
|
Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) { |
510 |
|
!!!parse-error (type => 'chardecode:no error', ## TODO: type name |
511 |
|
value => $self->{input_encoding}, |
512 |
|
level => $self->{unsupported_level}, |
513 |
|
line => 1, column => 1); |
514 |
|
} |
515 |
|
$s = \ $e->decode ($$bytes_s); |
516 |
$self->{confident} = 1; |
$self->{confident} = 1; |
517 |
$return = $self->parse_char_string ($s, @args); |
$return = $self->parse_char_string ($s, @args); |
518 |
}; |
}; |
621 |
|
|
622 |
sub new ($) { |
sub new ($) { |
623 |
my $class = shift; |
my $class = shift; |
624 |
my $self = bless {}, $class; |
my $self = bless { |
625 |
|
must_level => 'm', |
626 |
|
should_level => 's', |
627 |
|
good_level => 'w', |
628 |
|
warn_level => 'w', |
629 |
|
info_level => 'i', |
630 |
|
unsupported_level => 'u', |
631 |
|
}, $class; |
632 |
$self->{set_next_char} = sub { |
$self->{set_next_char} = sub { |
633 |
$self->{next_char} = -1; |
$self->{next_char} = -1; |
634 |
}; |
}; |
4181 |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
4182 |
|
|
4183 |
unless ($self->{confident}) { |
unless ($self->{confident}) { |
4184 |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
if ($token->{attributes}->{charset}) { |
4185 |
!!!cp ('t106'); |
!!!cp ('t106'); |
4186 |
|
## NOTE: Whether the encoding is supported or not is handled |
4187 |
|
## in the {change_encoding} callback. |
4188 |
$self->{change_encoding} |
$self->{change_encoding} |
4189 |
->($self, $token->{attributes}->{charset}->{value}, |
->($self, $token->{attributes}->{charset}->{value}, |
4190 |
$token); |
$token); |
4194 |
$token->{attributes}->{charset} |
$token->{attributes}->{charset} |
4195 |
->{has_reference}); |
->{has_reference}); |
4196 |
} elsif ($token->{attributes}->{content}) { |
} elsif ($token->{attributes}->{content}) { |
|
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
|
4197 |
if ($token->{attributes}->{content}->{value} |
if ($token->{attributes}->{content}->{value} |
4198 |
=~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt] |
=~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt] |
4199 |
[\x09-\x0D\x20]*= |
[\x09-\x0D\x20]*= |
4200 |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
4201 |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
4202 |
!!!cp ('t107'); |
!!!cp ('t107'); |
4203 |
|
## NOTE: Whether the encoding is supported or not is handled |
4204 |
|
## in the {change_encoding} callback. |
4205 |
$self->{change_encoding} |
$self->{change_encoding} |
4206 |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3, |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3, |
4207 |
$token); |
$token); |
6218 |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
6219 |
|
|
6220 |
unless ($self->{confident}) { |
unless ($self->{confident}) { |
6221 |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
if ($token->{attributes}->{charset}) { |
6222 |
!!!cp ('t335'); |
!!!cp ('t335'); |
6223 |
|
## NOTE: Whether the encoding is supported or not is handled |
6224 |
|
## in the {change_encoding} callback. |
6225 |
$self->{change_encoding} |
$self->{change_encoding} |
6226 |
->($self, $token->{attributes}->{charset}->{value}, $token); |
->($self, $token->{attributes}->{charset}->{value}, $token); |
6227 |
|
|
6230 |
$token->{attributes}->{charset} |
$token->{attributes}->{charset} |
6231 |
->{has_reference}); |
->{has_reference}); |
6232 |
} elsif ($token->{attributes}->{content}) { |
} elsif ($token->{attributes}->{content}) { |
|
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
|
6233 |
if ($token->{attributes}->{content}->{value} |
if ($token->{attributes}->{content}->{value} |
6234 |
=~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt] |
=~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt] |
6235 |
[\x09-\x0D\x20]*= |
[\x09-\x0D\x20]*= |
6236 |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
6237 |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
6238 |
!!!cp ('t336'); |
!!!cp ('t336'); |
6239 |
|
## NOTE: Whether the encoding is supported or not is handled |
6240 |
|
## in the {change_encoding} callback. |
6241 |
$self->{change_encoding} |
$self->{change_encoding} |
6242 |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token); |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token); |
6243 |
$meta_el->[0]->get_attribute_node_ns (undef, 'content') |
$meta_el->[0]->get_attribute_node_ns (undef, 'content') |