333 |
|
|
334 |
sub parse_byte_string ($$$$;$) { |
sub parse_byte_string ($$$$;$) { |
335 |
my $self = ref $_[0] ? shift : shift->new; |
my $self = ref $_[0] ? shift : shift->new; |
336 |
my $charset = shift; |
my $charset_name = shift; |
337 |
my $bytes_s = ref $_[0] ? $_[0] : \($_[0]); |
my $bytes_s = ref $_[0] ? $_[0] : \($_[0]); |
338 |
my $s; |
my $s; |
339 |
|
|
340 |
if (defined $charset) { |
## HTML5 encoding sniffing algorithm |
341 |
require Encode; ## TODO: decode(utf8) don't delete BOM |
require Message::Charset::Info; |
342 |
$s = \ (Encode::decode ($charset, $$bytes_s)); |
my $charset; |
343 |
$self->{input_encoding} = lc $charset; ## TODO: normalize name |
my ($e, $e_status); |
344 |
$self->{confident} = 1; |
|
345 |
} else { |
SNIFFING: { |
346 |
## TODO: Implement HTML5 detection algorithm |
|
347 |
|
## Step 1 |
348 |
|
if (defined $charset_name) { |
349 |
|
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
350 |
|
|
351 |
|
## ISSUE: Unsupported encoding is not ignored according to the spec. |
352 |
|
($e, $e_status) = $charset->get_perl_encoding |
353 |
|
(allow_error_reporting => 1, |
354 |
|
allow_fallback => 1); |
355 |
|
if ($e) { |
356 |
|
$self->{confident} = 1; |
357 |
|
last SNIFFING; |
358 |
|
} |
359 |
|
} |
360 |
|
|
361 |
|
## Step 2 |
362 |
|
# wait |
363 |
|
|
364 |
|
## Step 3 |
365 |
|
my $head = substr ($$bytes_s, 0, 3); |
366 |
|
if ($head =~ /^\xFE\xFF/) { |
367 |
|
$charset = Message::Charset::Info->get_by_iana_name ('utf-16be'); |
368 |
|
($e, $e_status) = $charset->get_perl_encoding |
369 |
|
(allow_error_reporting => 1, |
370 |
|
allow_fallback => 1); |
371 |
|
$self->{confident} = 1; |
372 |
|
last SNIFFING; |
373 |
|
} elsif ($head =~ /^\xFF\xFE/) { |
374 |
|
$charset = Message::Charset::Info->get_by_iana_name ('utf-16le'); |
375 |
|
($e, $e_status) = $charset->get_perl_encoding |
376 |
|
(allow_error_reporting => 1, |
377 |
|
allow_fallback => 1); |
378 |
|
$self->{confident} = 1; |
379 |
|
last SNIFFING; |
380 |
|
} elsif ($head eq "\xEF\xBB\xBF") { |
381 |
|
$charset = Message::Charset::Info->get_by_iana_name ('utf-8'); |
382 |
|
($e, $e_status) = $charset->get_perl_encoding |
383 |
|
(allow_error_reporting => 1, |
384 |
|
allow_fallback => 1); |
385 |
|
$self->{confident} = 1; |
386 |
|
last SNIFFING; |
387 |
|
} |
388 |
|
|
389 |
|
## Step 4 |
390 |
|
## TODO: <meta charset> |
391 |
|
|
392 |
|
## Step 5 |
393 |
|
## TODO: from history |
394 |
|
|
395 |
|
## Step 6 |
396 |
require Whatpm::Charset::UniversalCharDet; |
require Whatpm::Charset::UniversalCharDet; |
397 |
$charset = Whatpm::Charset::UniversalCharDet->detect_byte_string |
$charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string |
398 |
(substr ($$bytes_s, 0, 1024)); |
(substr ($$bytes_s, 0, 1024)); |
399 |
$charset ||= 'windows-1252'; |
if (defined $charset_name) { |
400 |
$s = \ (Encode::decode ($charset, $$bytes_s)); |
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
401 |
$self->{input_encoding} = $charset; |
|
402 |
|
## ISSUE: Unsupported encoding is not ignored according to the spec. |
403 |
|
($e, $e_status) = $charset->get_perl_encoding |
404 |
|
(allow_error_reporting => 1, |
405 |
|
allow_fallback => 1); |
406 |
|
if ($e) { |
407 |
|
$self->{confident} = 0; |
408 |
|
last SNIFFING; |
409 |
|
} |
410 |
|
} |
411 |
|
|
412 |
|
## Step 7: default |
413 |
|
## TODO: Make this configurable. |
414 |
|
$charset = Message::Charset::Info->get_by_iana_name ('windows-1252'); |
415 |
|
## NOTE: We choose |windows-1252| here, since |utf-8| should be |
416 |
|
## detectable in the step 6. |
417 |
|
($e, $e_status) = $charset->get_perl_encoding (allow_error_reporting => 1, |
418 |
|
allow_fallback => 1); |
419 |
$self->{confident} = 0; |
$self->{confident} = 0; |
420 |
|
} # SNIFFING |
421 |
|
|
422 |
|
if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { |
423 |
|
|
424 |
|
} elsif (not ($e_status & |
425 |
|
Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) { |
426 |
|
|
427 |
} |
} |
428 |
|
$s = \ $e->decode ($$bytes_s); |
429 |
|
$self->{input_encoding} = $charset->get_iana_name; |
430 |
|
|
431 |
$self->{change_encoding} = sub { |
$self->{change_encoding} = sub { |
432 |
my $self = shift; |
my $self = shift; |
433 |
my $charset = lc shift; |
my $charset_name = lc shift; |
434 |
my $token = shift; |
my $token = shift; |
435 |
## TODO: if $charset is supported |
## TODO: if $charset_name is supported |
436 |
## TODO: normalize charset name |
## TODO: normalize charset name |
437 |
|
|
438 |
## "Change the encoding" algorithm: |
## "Change the encoding" algorithm: |
439 |
|
|
440 |
## Step 1 |
## Step 1 |
441 |
if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8? |
if ($charset_name eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8? |
442 |
$charset = 'utf-8'; |
$charset_name = 'utf-8'; |
443 |
} |
} |
444 |
|
|
445 |
## Step 2 |
## Step 2 |
446 |
if (defined $self->{input_encoding} and |
if (defined $self->{input_encoding} and |
447 |
$self->{input_encoding} eq $charset) { |
$self->{input_encoding} eq $charset_name) { |
448 |
$self->{confident} = 1; |
$self->{confident} = 1; |
449 |
return; |
return; |
450 |
} |
} |
451 |
|
|
452 |
!!!parse-error (type => 'charset label detected:'.$self->{input_encoding}. |
!!!parse-error (type => 'charset label detected:'.$self->{input_encoding}. |
453 |
':'.$charset, level => 'w', token => $token); |
':'.$charset_name, level => 'w', token => $token); |
454 |
|
|
455 |
## Step 3 |
## Step 3 |
456 |
# if (can) { |
# if (can) { |
460 |
# } |
# } |
461 |
|
|
462 |
## Step 4 |
## Step 4 |
463 |
throw Whatpm::HTML::RestartParser (charset => $charset); |
throw Whatpm::HTML::RestartParser (charset => $charset_name); |
464 |
}; # $self->{change_encoding} |
}; # $self->{change_encoding} |
465 |
|
|
466 |
my @args = @_; shift @args; # $s |
my @args = @_; shift @args; # $s |
468 |
try { |
try { |
469 |
$return = $self->parse_char_string ($s, @args); |
$return = $self->parse_char_string ($s, @args); |
470 |
} catch Whatpm::HTML::RestartParser with { |
} catch Whatpm::HTML::RestartParser with { |
471 |
my $charset = shift->{charset}; |
my $charset_name = shift->{charset}; |
472 |
$s = \ (Encode::decode ($charset, $$bytes_s)); |
$s = \ (Encode::decode ($charset_name, $$bytes_s)); |
473 |
$self->{input_encoding} = $charset; ## TODO: normalize |
$self->{input_encoding} = $charset_name; ## TODO: normalize |
474 |
$self->{confident} = 1; |
$self->{confident} = 1; |
475 |
$return = $self->parse_char_string ($s, @args); |
$return = $self->parse_char_string ($s, @args); |
476 |
}; |
}; |