| 334 |
sub parse_byte_string ($$$$;$) { |
sub parse_byte_string ($$$$;$) { |
| 335 |
my $self = ref $_[0] ? shift : shift->new; |
my $self = ref $_[0] ? shift : shift->new; |
| 336 |
my $charset_name = shift; |
my $charset_name = shift; |
| 337 |
my $bytes_s = ref $_[0] ? $_[0] : \($_[0]); |
open my $byte_stream, '<', ref $_[0] ? $_[0] : \($_[0]); |
|
my $s; |
|
| 338 |
|
|
| 339 |
my $onerror = $_[2] || sub { |
my $onerror = $_[2] || sub { |
| 340 |
my (%opt) = @_; |
my (%opt) = @_; |
| 345 |
## HTML5 encoding sniffing algorithm |
## HTML5 encoding sniffing algorithm |
| 346 |
require Message::Charset::Info; |
require Message::Charset::Info; |
| 347 |
my $charset; |
my $charset; |
| 348 |
my ($e, $e_status); |
my $buffer; |
| 349 |
|
my ($char_stream, $e_status); |
| 350 |
|
|
| 351 |
SNIFFING: { |
SNIFFING: { |
| 352 |
|
|
| 355 |
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
| 356 |
|
|
| 357 |
## ISSUE: Unsupported encoding is not ignored according to the spec. |
## ISSUE: Unsupported encoding is not ignored according to the spec. |
| 358 |
($e, $e_status) = $charset->get_perl_encoding |
($char_stream, $e_status) = $charset->get_decode_handle |
| 359 |
(allow_error_reporting => 1, |
($byte_stream, allow_error_reporting => 1, |
| 360 |
allow_fallback => 1); |
allow_fallback => 1); |
| 361 |
if ($e) { |
if ($char_stream) { |
| 362 |
$self->{confident} = 1; |
$self->{confident} = 1; |
| 363 |
last SNIFFING; |
last SNIFFING; |
| 364 |
|
} else { |
| 365 |
|
## TODO: unsupported error |
| 366 |
} |
} |
| 367 |
} |
} |
| 368 |
|
|
| 369 |
## Step 2 |
## Step 2 |
| 370 |
# wait |
my $byte_buffer = ''; |
| 371 |
|
for (1..1024) { |
| 372 |
|
my $char = $byte_stream->getc; |
| 373 |
|
last unless defined $char; |
| 374 |
|
$byte_buffer .= $char; |
| 375 |
|
} ## TODO: timeout |
| 376 |
|
|
| 377 |
## Step 3 |
## Step 3 |
| 378 |
my $head = substr ($$bytes_s, 0, 3); |
if ($byte_buffer =~ /^\xFE\xFF/) { |
|
if ($head =~ /^\xFE\xFF/) { |
|
| 379 |
$charset = Message::Charset::Info->get_by_iana_name ('utf-16be'); |
$charset = Message::Charset::Info->get_by_iana_name ('utf-16be'); |
| 380 |
($e, $e_status) = $charset->get_perl_encoding |
($char_stream, $e_status) = $charset->get_decode_handle |
| 381 |
(allow_error_reporting => 1, |
($byte_stream, allow_error_reporting => 1, |
| 382 |
allow_fallback => 1); |
allow_fallback => 1, byte_buffer => \$byte_buffer); |
| 383 |
$self->{confident} = 1; |
$self->{confident} = 1; |
| 384 |
last SNIFFING; |
last SNIFFING; |
| 385 |
} elsif ($head =~ /^\xFF\xFE/) { |
} elsif ($byte_buffer =~ /^\xFF\xFE/) { |
| 386 |
$charset = Message::Charset::Info->get_by_iana_name ('utf-16le'); |
$charset = Message::Charset::Info->get_by_iana_name ('utf-16le'); |
| 387 |
($e, $e_status) = $charset->get_perl_encoding |
($char_stream, $e_status) = $charset->get_decode_handle |
| 388 |
(allow_error_reporting => 1, |
($byte_stream, allow_error_reporting => 1, |
| 389 |
allow_fallback => 1); |
allow_fallback => 1, byte_buffer => \$byte_buffer); |
| 390 |
$self->{confident} = 1; |
$self->{confident} = 1; |
| 391 |
last SNIFFING; |
last SNIFFING; |
| 392 |
} elsif ($head eq "\xEF\xBB\xBF") { |
} elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) { |
| 393 |
$charset = Message::Charset::Info->get_by_iana_name ('utf-8'); |
$charset = Message::Charset::Info->get_by_iana_name ('utf-8'); |
| 394 |
($e, $e_status) = $charset->get_perl_encoding |
($char_stream, $e_status) = $charset->get_decode_handle |
| 395 |
(allow_error_reporting => 1, |
($byte_stream, allow_error_reporting => 1, |
| 396 |
allow_fallback => 1); |
allow_fallback => 1, byte_buffer => \$byte_buffer); |
| 397 |
$self->{confident} = 1; |
$self->{confident} = 1; |
| 398 |
last SNIFFING; |
last SNIFFING; |
| 399 |
} |
} |
| 407 |
## Step 6 |
## Step 6 |
| 408 |
require Whatpm::Charset::UniversalCharDet; |
require Whatpm::Charset::UniversalCharDet; |
| 409 |
$charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string |
$charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string |
| 410 |
(substr ($$bytes_s, 0, 1024)); |
($byte_buffer); |
| 411 |
if (defined $charset_name) { |
if (defined $charset_name) { |
| 412 |
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
| 413 |
|
|
| 414 |
## ISSUE: Unsupported encoding is not ignored according to the spec. |
## ISSUE: Unsupported encoding is not ignored according to the spec. |
| 415 |
($e, $e_status) = $charset->get_perl_encoding |
require Whatpm::Charset::DecodeHandle; |
| 416 |
(allow_error_reporting => 1, |
$buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new |
| 417 |
allow_fallback => 1); |
($byte_stream); |
| 418 |
if ($e) { |
($char_stream, $e_status) = $charset->get_decode_handle |
| 419 |
|
($buffer, allow_error_reporting => 1, |
| 420 |
|
allow_fallback => 1, byte_buffer => \$byte_buffer); |
| 421 |
|
if ($char_stream) { |
| 422 |
|
$buffer->{buffer} = $byte_buffer; |
| 423 |
!!!parse-error (type => 'sniffing:chardet', ## TODO: type name |
!!!parse-error (type => 'sniffing:chardet', ## TODO: type name |
| 424 |
value => $charset_name, |
value => $charset_name, |
| 425 |
level => $self->{info_level}, |
level => $self->{info_level}, |
| 434 |
$charset = Message::Charset::Info->get_by_iana_name ('windows-1252'); |
$charset = Message::Charset::Info->get_by_iana_name ('windows-1252'); |
| 435 |
## NOTE: We choose |windows-1252| here, since |utf-8| should be |
## NOTE: We choose |windows-1252| here, since |utf-8| should be |
| 436 |
## detectable in the step 6. |
## detectable in the step 6. |
| 437 |
($e, $e_status) = $charset->get_perl_encoding (allow_error_reporting => 1, |
require Whatpm::Charset::DecodeHandle; |
| 438 |
allow_fallback => 1); |
$buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new |
| 439 |
|
($byte_stream); |
| 440 |
|
($char_stream, $e_status) |
| 441 |
|
= $charset->get_decode_handle ($buffer, |
| 442 |
|
allow_error_reporting => 1, |
| 443 |
|
allow_fallback => 1, |
| 444 |
|
byte_buffer => \$byte_buffer); |
| 445 |
|
$buffer->{buffer} = $byte_buffer; |
| 446 |
!!!parse-error (type => 'sniffing:default', ## TODO: type name |
!!!parse-error (type => 'sniffing:default', ## TODO: type name |
| 447 |
value => 'windows-1252', |
value => 'windows-1252', |
| 448 |
level => $self->{info_level}, |
level => $self->{info_level}, |
| 453 |
$self->{input_encoding} = $charset->get_iana_name; |
$self->{input_encoding} = $charset->get_iana_name; |
| 454 |
if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { |
if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { |
| 455 |
!!!parse-error (type => 'chardecode:fallback', ## TODO: type name |
!!!parse-error (type => 'chardecode:fallback', ## TODO: type name |
| 456 |
value => $e->name, |
value => $self->{input_encoding}, |
| 457 |
level => $self->{unsupported_level}, |
level => $self->{unsupported_level}, |
| 458 |
line => 1, column => 1); |
line => 1, column => 1); |
| 459 |
} elsif (not ($e_status & |
} elsif (not ($e_status & |
| 463 |
level => $self->{unsupported_level}, |
level => $self->{unsupported_level}, |
| 464 |
line => 1, column => 1); |
line => 1, column => 1); |
| 465 |
} |
} |
|
$s = \ $e->decode ($$bytes_s); |
|
| 466 |
|
|
| 467 |
$self->{change_encoding} = sub { |
$self->{change_encoding} = sub { |
| 468 |
my $self = shift; |
my $self = shift; |
| 470 |
my $token = shift; |
my $token = shift; |
| 471 |
|
|
| 472 |
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
| 473 |
($e, $e_status) = $charset->get_perl_encoding |
($char_stream, $e_status) = $charset->get_decode_handle |
| 474 |
(allow_error_reporting => 1, allow_fallback => 1); |
($byte_stream, allow_error_reporting => 1, allow_fallback => 1, |
| 475 |
|
byte_buffer => \ $buffer->{buffer}); |
| 476 |
|
|
| 477 |
if ($e) { # if supported |
if ($char_stream) { # if supported |
| 478 |
## "Change the encoding" algorithm: |
## "Change the encoding" algorithm: |
| 479 |
|
|
| 480 |
## Step 1 |
## Step 1 |
| 481 |
if ($charset->{iana_names}->{'utf-16'}) { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8? |
if ($charset->{iana_names}->{'utf-16'}) { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8? |
| 482 |
$charset = Message::Charset::Info->get_by_iana_name ('utf-8'); |
$charset = Message::Charset::Info->get_by_iana_name ('utf-8'); |
| 483 |
($e, $e_status) = $charset->get_perl_encoding; |
($char_stream, $e_status) = $charset->get_decode_handle |
| 484 |
|
($byte_stream, |
| 485 |
|
byte_buffer => \ $buffer->{buffer}); |
| 486 |
} |
} |
| 487 |
$charset_name = $charset->get_iana_name; |
$charset_name = $charset->get_iana_name; |
| 488 |
|
|
| 511 |
} |
} |
| 512 |
}; # $self->{change_encoding} |
}; # $self->{change_encoding} |
| 513 |
|
|
| 514 |
|
my $char_onerror = sub { |
| 515 |
|
my (undef, $type, %opt) = @_; |
| 516 |
|
!!!parse-error (%opt, type => $type); |
| 517 |
|
if ($opt{octets}) { |
| 518 |
|
${$opt{octets}} = "\x{FFFD}"; # relacement character |
| 519 |
|
} |
| 520 |
|
}; |
| 521 |
|
$char_stream->onerror ($char_onerror); |
| 522 |
|
|
| 523 |
my @args = @_; shift @args; # $s |
my @args = @_; shift @args; # $s |
| 524 |
my $return; |
my $return; |
| 525 |
try { |
try { |
| 526 |
$return = $self->parse_char_string ($s, @args); |
$return = $self->parse_char_stream ($char_stream, @args); |
| 527 |
} catch Whatpm::HTML::RestartParser with { |
} catch Whatpm::HTML::RestartParser with { |
| 528 |
## NOTE: Invoked after {change_encoding}. |
## NOTE: Invoked after {change_encoding}. |
| 529 |
|
|
| 530 |
$self->{input_encoding} = $charset->get_iana_name; |
$self->{input_encoding} = $charset->get_iana_name; |
| 531 |
if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { |
if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { |
| 532 |
!!!parse-error (type => 'chardecode:fallback', ## TODO: type name |
!!!parse-error (type => 'chardecode:fallback', ## TODO: type name |
| 533 |
value => $e->name, |
value => $self->{input_encoding}, |
| 534 |
level => $self->{unsupported_level}, |
level => $self->{unsupported_level}, |
| 535 |
line => 1, column => 1); |
line => 1, column => 1); |
| 536 |
} elsif (not ($e_status & |
} elsif (not ($e_status & |
| 540 |
level => $self->{unsupported_level}, |
level => $self->{unsupported_level}, |
| 541 |
line => 1, column => 1); |
line => 1, column => 1); |
| 542 |
} |
} |
|
$s = \ $e->decode ($$bytes_s); |
|
| 543 |
$self->{confident} = 1; |
$self->{confident} = 1; |
| 544 |
$return = $self->parse_char_string ($s, @args); |
$char_stream->onerror ($char_onerror); |
| 545 |
|
$return = $self->parse_char_stream ($char_stream, @args); |
| 546 |
}; |
}; |
| 547 |
return $return; |
return $return; |
| 548 |
} # parse_byte_string |
} # parse_byte_string |
| 1030 |
redo A; |
redo A; |
| 1031 |
} else { |
} else { |
| 1032 |
!!!cp (23); |
!!!cp (23); |
| 1033 |
!!!parse-error (type => 'bare stago'); |
!!!parse-error (type => 'bare stago', |
| 1034 |
|
line => $self->{line_prev}, |
| 1035 |
|
column => $self->{column_prev}); |
| 1036 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1037 |
## reconsume |
## reconsume |
| 1038 |
|
|