334 |
sub parse_byte_string ($$$$;$) { |
sub parse_byte_string ($$$$;$) { |
335 |
my $self = ref $_[0] ? shift : shift->new; |
my $self = ref $_[0] ? shift : shift->new; |
336 |
my $charset_name = shift; |
my $charset_name = shift; |
337 |
my $bytes_s = ref $_[0] ? $_[0] : \($_[0]); |
open my $byte_stream, '<', ref $_[0] ? $_[0] : \($_[0]); |
|
my $s; |
|
338 |
|
|
339 |
my $onerror = $_[2] || sub { |
my $onerror = $_[2] || sub { |
340 |
my (%opt) = @_; |
my (%opt) = @_; |
345 |
## HTML5 encoding sniffing algorithm |
## HTML5 encoding sniffing algorithm |
346 |
require Message::Charset::Info; |
require Message::Charset::Info; |
347 |
my $charset; |
my $charset; |
348 |
my ($e, $e_status); |
my $buffer; |
349 |
|
my ($char_stream, $e_status); |
350 |
|
|
351 |
SNIFFING: { |
SNIFFING: { |
352 |
|
|
355 |
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
356 |
|
|
357 |
## ISSUE: Unsupported encoding is not ignored according to the spec. |
## ISSUE: Unsupported encoding is not ignored according to the spec. |
358 |
($e, $e_status) = $charset->get_perl_encoding |
($char_stream, $e_status) = $charset->get_decode_handle |
359 |
(allow_error_reporting => 1, |
($byte_stream, allow_error_reporting => 1, |
360 |
allow_fallback => 1); |
allow_fallback => 1); |
361 |
if ($e) { |
if ($char_stream) { |
362 |
$self->{confident} = 1; |
$self->{confident} = 1; |
363 |
last SNIFFING; |
last SNIFFING; |
364 |
|
} else { |
365 |
|
## TODO: unsupported error |
366 |
} |
} |
367 |
} |
} |
368 |
|
|
369 |
## Step 2 |
## Step 2 |
370 |
# wait |
my $byte_buffer = ''; |
371 |
|
for (1..1024) { |
372 |
|
my $char = $byte_stream->getc; |
373 |
|
last unless defined $char; |
374 |
|
$byte_buffer .= $char; |
375 |
|
} ## TODO: timeout |
376 |
|
|
377 |
## Step 3 |
## Step 3 |
378 |
my $head = substr ($$bytes_s, 0, 3); |
if ($byte_buffer =~ /^\xFE\xFF/) { |
|
if ($head =~ /^\xFE\xFF/) { |
|
379 |
$charset = Message::Charset::Info->get_by_iana_name ('utf-16be'); |
$charset = Message::Charset::Info->get_by_iana_name ('utf-16be'); |
380 |
($e, $e_status) = $charset->get_perl_encoding |
($char_stream, $e_status) = $charset->get_decode_handle |
381 |
(allow_error_reporting => 1, |
($byte_stream, allow_error_reporting => 1, |
382 |
allow_fallback => 1); |
allow_fallback => 1, byte_buffer => \$byte_buffer); |
383 |
$self->{confident} = 1; |
$self->{confident} = 1; |
384 |
last SNIFFING; |
last SNIFFING; |
385 |
} elsif ($head =~ /^\xFF\xFE/) { |
} elsif ($byte_buffer =~ /^\xFF\xFE/) { |
386 |
$charset = Message::Charset::Info->get_by_iana_name ('utf-16le'); |
$charset = Message::Charset::Info->get_by_iana_name ('utf-16le'); |
387 |
($e, $e_status) = $charset->get_perl_encoding |
($char_stream, $e_status) = $charset->get_decode_handle |
388 |
(allow_error_reporting => 1, |
($byte_stream, allow_error_reporting => 1, |
389 |
allow_fallback => 1); |
allow_fallback => 1, byte_buffer => \$byte_buffer); |
390 |
$self->{confident} = 1; |
$self->{confident} = 1; |
391 |
last SNIFFING; |
last SNIFFING; |
392 |
} elsif ($head eq "\xEF\xBB\xBF") { |
} elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) { |
393 |
$charset = Message::Charset::Info->get_by_iana_name ('utf-8'); |
$charset = Message::Charset::Info->get_by_iana_name ('utf-8'); |
394 |
($e, $e_status) = $charset->get_perl_encoding |
($char_stream, $e_status) = $charset->get_decode_handle |
395 |
(allow_error_reporting => 1, |
($byte_stream, allow_error_reporting => 1, |
396 |
allow_fallback => 1); |
allow_fallback => 1, byte_buffer => \$byte_buffer); |
397 |
$self->{confident} = 1; |
$self->{confident} = 1; |
398 |
last SNIFFING; |
last SNIFFING; |
399 |
} |
} |
407 |
## Step 6 |
## Step 6 |
408 |
require Whatpm::Charset::UniversalCharDet; |
require Whatpm::Charset::UniversalCharDet; |
409 |
$charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string |
$charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string |
410 |
(substr ($$bytes_s, 0, 1024)); |
($byte_buffer); |
411 |
if (defined $charset_name) { |
if (defined $charset_name) { |
412 |
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
413 |
|
|
414 |
## ISSUE: Unsupported encoding is not ignored according to the spec. |
## ISSUE: Unsupported encoding is not ignored according to the spec. |
415 |
($e, $e_status) = $charset->get_perl_encoding |
require Whatpm::Charset::DecodeHandle; |
416 |
(allow_error_reporting => 1, |
$buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new |
417 |
allow_fallback => 1); |
($byte_stream); |
418 |
if ($e) { |
($char_stream, $e_status) = $charset->get_decode_handle |
419 |
|
($buffer, allow_error_reporting => 1, |
420 |
|
allow_fallback => 1, byte_buffer => \$byte_buffer); |
421 |
|
if ($char_stream) { |
422 |
|
$buffer->{buffer} = $byte_buffer; |
423 |
!!!parse-error (type => 'sniffing:chardet', ## TODO: type name |
!!!parse-error (type => 'sniffing:chardet', ## TODO: type name |
424 |
value => $charset_name, |
value => $charset_name, |
425 |
level => $self->{info_level}, |
level => $self->{info_level}, |
434 |
$charset = Message::Charset::Info->get_by_iana_name ('windows-1252'); |
$charset = Message::Charset::Info->get_by_iana_name ('windows-1252'); |
435 |
## NOTE: We choose |windows-1252| here, since |utf-8| should be |
## NOTE: We choose |windows-1252| here, since |utf-8| should be |
436 |
## detectable in the step 6. |
## detectable in the step 6. |
437 |
($e, $e_status) = $charset->get_perl_encoding (allow_error_reporting => 1, |
require Whatpm::Charset::DecodeHandle; |
438 |
allow_fallback => 1); |
$buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new |
439 |
|
($byte_stream); |
440 |
|
($char_stream, $e_status) |
441 |
|
= $charset->get_decode_handle ($buffer, |
442 |
|
allow_error_reporting => 1, |
443 |
|
allow_fallback => 1, |
444 |
|
byte_buffer => \$byte_buffer); |
445 |
|
$buffer->{buffer} = $byte_buffer; |
446 |
!!!parse-error (type => 'sniffing:default', ## TODO: type name |
!!!parse-error (type => 'sniffing:default', ## TODO: type name |
447 |
value => 'windows-1252', |
value => 'windows-1252', |
448 |
level => $self->{info_level}, |
level => $self->{info_level}, |
453 |
$self->{input_encoding} = $charset->get_iana_name; |
$self->{input_encoding} = $charset->get_iana_name; |
454 |
if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { |
if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { |
455 |
!!!parse-error (type => 'chardecode:fallback', ## TODO: type name |
!!!parse-error (type => 'chardecode:fallback', ## TODO: type name |
456 |
value => $e->name, |
value => $self->{input_encoding}, |
457 |
level => $self->{unsupported_level}, |
level => $self->{unsupported_level}, |
458 |
line => 1, column => 1); |
line => 1, column => 1); |
459 |
} elsif (not ($e_status & |
} elsif (not ($e_status & |
463 |
level => $self->{unsupported_level}, |
level => $self->{unsupported_level}, |
464 |
line => 1, column => 1); |
line => 1, column => 1); |
465 |
} |
} |
|
$s = \ $e->decode ($$bytes_s); |
|
466 |
|
|
467 |
$self->{change_encoding} = sub { |
$self->{change_encoding} = sub { |
468 |
my $self = shift; |
my $self = shift; |
470 |
my $token = shift; |
my $token = shift; |
471 |
|
|
472 |
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
473 |
($e, $e_status) = $charset->get_perl_encoding |
($char_stream, $e_status) = $charset->get_decode_handle |
474 |
(allow_error_reporting => 1, allow_fallback => 1); |
($byte_stream, allow_error_reporting => 1, allow_fallback => 1, |
475 |
|
byte_buffer => \ $buffer->{buffer}); |
476 |
|
|
477 |
if ($e) { # if supported |
if ($char_stream) { # if supported |
478 |
## "Change the encoding" algorithm: |
## "Change the encoding" algorithm: |
479 |
|
|
480 |
## Step 1 |
## Step 1 |
481 |
if ($charset->{iana_names}->{'utf-16'}) { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8? |
if ($charset->{iana_names}->{'utf-16'}) { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8? |
482 |
$charset = Message::Charset::Info->get_by_iana_name ('utf-8'); |
$charset = Message::Charset::Info->get_by_iana_name ('utf-8'); |
483 |
($e, $e_status) = $charset->get_perl_encoding; |
($char_stream, $e_status) = $charset->get_decode_handle |
484 |
|
($byte_stream, |
485 |
|
byte_buffer => \ $buffer->{buffer}); |
486 |
} |
} |
487 |
$charset_name = $charset->get_iana_name; |
$charset_name = $charset->get_iana_name; |
488 |
|
|
511 |
} |
} |
512 |
}; # $self->{change_encoding} |
}; # $self->{change_encoding} |
513 |
|
|
514 |
|
my $char_onerror = sub { |
515 |
|
my (undef, $type, %opt) = @_; |
516 |
|
!!!parse-error (%opt, type => $type); |
517 |
|
if ($opt{octets}) { |
518 |
|
${$opt{octets}} = "\x{FFFD}"; # relacement character |
519 |
|
} |
520 |
|
}; |
521 |
|
$char_stream->onerror ($char_onerror); |
522 |
|
|
523 |
my @args = @_; shift @args; # $s |
my @args = @_; shift @args; # $s |
524 |
my $return; |
my $return; |
525 |
try { |
try { |
526 |
$return = $self->parse_char_string ($s, @args); |
$return = $self->parse_char_stream ($char_stream, @args); |
527 |
} catch Whatpm::HTML::RestartParser with { |
} catch Whatpm::HTML::RestartParser with { |
528 |
## NOTE: Invoked after {change_encoding}. |
## NOTE: Invoked after {change_encoding}. |
529 |
|
|
530 |
$self->{input_encoding} = $charset->get_iana_name; |
$self->{input_encoding} = $charset->get_iana_name; |
531 |
if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { |
if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { |
532 |
!!!parse-error (type => 'chardecode:fallback', ## TODO: type name |
!!!parse-error (type => 'chardecode:fallback', ## TODO: type name |
533 |
value => $e->name, |
value => $self->{input_encoding}, |
534 |
level => $self->{unsupported_level}, |
level => $self->{unsupported_level}, |
535 |
line => 1, column => 1); |
line => 1, column => 1); |
536 |
} elsif (not ($e_status & |
} elsif (not ($e_status & |
540 |
level => $self->{unsupported_level}, |
level => $self->{unsupported_level}, |
541 |
line => 1, column => 1); |
line => 1, column => 1); |
542 |
} |
} |
|
$s = \ $e->decode ($$bytes_s); |
|
543 |
$self->{confident} = 1; |
$self->{confident} = 1; |
544 |
$return = $self->parse_char_string ($s, @args); |
$char_stream->onerror ($char_onerror); |
545 |
|
$return = $self->parse_char_stream ($char_stream, @args); |
546 |
}; |
}; |
547 |
return $return; |
return $return; |
548 |
} # parse_byte_string |
} # parse_byte_string |
1030 |
redo A; |
redo A; |
1031 |
} else { |
} else { |
1032 |
!!!cp (23); |
!!!cp (23); |
1033 |
!!!parse-error (type => 'bare stago'); |
!!!parse-error (type => 'bare stago', |
1034 |
|
line => $self->{line_prev}, |
1035 |
|
column => $self->{column_prev}); |
1036 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1037 |
## reconsume |
## reconsume |
1038 |
|
|