/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.135 by wakaba, Sat May 17 07:31:49 2008 UTC revision 1.136 by wakaba, Sat May 17 12:29:24 2008 UTC
# Line 334  my $c1_entity_char = { Line 334  my $c1_entity_char = {
334  sub parse_byte_string ($$$$;$) {  sub parse_byte_string ($$$$;$) {
335    my $self = ref $_[0] ? shift : shift->new;    my $self = ref $_[0] ? shift : shift->new;
336    my $charset_name = shift;    my $charset_name = shift;
337    my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);    open my $byte_stream, '<', ref $_[0] ? $_[0] : \($_[0]);
   my $s;  
338    
339    my $onerror = $_[2] || sub {    my $onerror = $_[2] || sub {
340      my (%opt) = @_;      my (%opt) = @_;
# Line 346  sub parse_byte_string ($$$$;$) { Line 345  sub parse_byte_string ($$$$;$) {
345    ## HTML5 encoding sniffing algorithm    ## HTML5 encoding sniffing algorithm
346    require Message::Charset::Info;    require Message::Charset::Info;
347    my $charset;    my $charset;
348    my ($e, $e_status);    my $buffer;
349      my ($char_stream, $e_status);
350    
351    SNIFFING: {    SNIFFING: {
352    
# Line 355  sub parse_byte_string ($$$$;$) { Line 355  sub parse_byte_string ($$$$;$) {
355        $charset = Message::Charset::Info->get_by_iana_name ($charset_name);        $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
356    
357        ## ISSUE: Unsupported encoding is not ignored according to the spec.        ## ISSUE: Unsupported encoding is not ignored according to the spec.
358        ($e, $e_status) = $charset->get_perl_encoding        ($char_stream, $e_status) = $charset->get_decode_handle
359            (allow_error_reporting => 1,            ($byte_stream, allow_error_reporting => 1,
360             allow_fallback => 1);             allow_fallback => 1);
361        if ($e) {        if ($char_stream) {
362          $self->{confident} = 1;          $self->{confident} = 1;
363          last SNIFFING;          last SNIFFING;
364          } else {
365            ## TODO: unsupported error
366        }        }
367      }      }
368    
369      ## Step 2      ## Step 2
370      # wait      my $byte_buffer = '';
371        for (1..1024) {
372          my $char = $byte_stream->getc;
373          last unless defined $char;
374          $byte_buffer .= $char;
375        } ## TODO: timeout
376    
377      ## Step 3      ## Step 3
378      my $head = substr ($$bytes_s, 0, 3);      if ($byte_buffer =~ /^\xFE\xFF/) {
     if ($head =~ /^\xFE\xFF/) {  
379        $charset = Message::Charset::Info->get_by_iana_name ('utf-16be');        $charset = Message::Charset::Info->get_by_iana_name ('utf-16be');
380        ($e, $e_status) = $charset->get_perl_encoding        ($char_stream, $e_status) = $charset->get_decode_handle
381            (allow_error_reporting => 1,            ($byte_stream, allow_error_reporting => 1,
382             allow_fallback => 1);             allow_fallback => 1, byte_buffer => \$byte_buffer);
383        $self->{confident} = 1;        $self->{confident} = 1;
384        last SNIFFING;        last SNIFFING;
385      } elsif ($head =~ /^\xFF\xFE/) {      } elsif ($byte_buffer =~ /^\xFF\xFE/) {
386        $charset = Message::Charset::Info->get_by_iana_name ('utf-16le');        $charset = Message::Charset::Info->get_by_iana_name ('utf-16le');
387        ($e, $e_status) = $charset->get_perl_encoding        ($char_stream, $e_status) = $charset->get_decode_handle
388            (allow_error_reporting => 1,            ($byte_stream, allow_error_reporting => 1,
389             allow_fallback => 1);             allow_fallback => 1, byte_buffer => \$byte_buffer);
390        $self->{confident} = 1;        $self->{confident} = 1;
391        last SNIFFING;        last SNIFFING;
392      } elsif ($head eq "\xEF\xBB\xBF") {      } elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) {
393        $charset = Message::Charset::Info->get_by_iana_name ('utf-8');        $charset = Message::Charset::Info->get_by_iana_name ('utf-8');
394        ($e, $e_status) = $charset->get_perl_encoding        ($char_stream, $e_status) = $charset->get_decode_handle
395            (allow_error_reporting => 1,            ($byte_stream, allow_error_reporting => 1,
396             allow_fallback => 1);             allow_fallback => 1, byte_buffer => \$byte_buffer);
397        $self->{confident} = 1;        $self->{confident} = 1;
398        last SNIFFING;        last SNIFFING;
399      }      }
# Line 401  sub parse_byte_string ($$$$;$) { Line 407  sub parse_byte_string ($$$$;$) {
407      ## Step 6      ## Step 6
408      require Whatpm::Charset::UniversalCharDet;      require Whatpm::Charset::UniversalCharDet;
409      $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string      $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string
410          (substr ($$bytes_s, 0, 1024));          ($byte_buffer);
411      if (defined $charset_name) {      if (defined $charset_name) {
412        $charset = Message::Charset::Info->get_by_iana_name ($charset_name);        $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
413    
414        ## ISSUE: Unsupported encoding is not ignored according to the spec.        ## ISSUE: Unsupported encoding is not ignored according to the spec.
415        ($e, $e_status) = $charset->get_perl_encoding        require Whatpm::Charset::DecodeHandle;
416            (allow_error_reporting => 1,        $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
417             allow_fallback => 1);            ($byte_stream);
418        if ($e) {        ($char_stream, $e_status) = $charset->get_decode_handle
419              ($buffer, allow_error_reporting => 1,
420               allow_fallback => 1, byte_buffer => \$byte_buffer);
421          if ($char_stream) {
422            $buffer->{buffer} = $byte_buffer;
423          !!!parse-error (type => 'sniffing:chardet', ## TODO: type name          !!!parse-error (type => 'sniffing:chardet', ## TODO: type name
424                          value => $charset_name,                          value => $charset_name,
425                          level => $self->{info_level},                          level => $self->{info_level},
# Line 424  sub parse_byte_string ($$$$;$) { Line 434  sub parse_byte_string ($$$$;$) {
434      $charset = Message::Charset::Info->get_by_iana_name ('windows-1252');      $charset = Message::Charset::Info->get_by_iana_name ('windows-1252');
435          ## NOTE: We choose |windows-1252| here, since |utf-8| should be          ## NOTE: We choose |windows-1252| here, since |utf-8| should be
436          ## detectable in the step 6.          ## detectable in the step 6.
437      ($e, $e_status) = $charset->get_perl_encoding (allow_error_reporting => 1,      require Whatpm::Charset::DecodeHandle;
438                                                     allow_fallback => 1);      $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
439            ($byte_stream);
440        ($char_stream, $e_status)
441            = $charset->get_decode_handle ($buffer,
442                                           allow_error_reporting => 1,
443                                           allow_fallback => 1,
444                                           byte_buffer => \$byte_buffer);
445        $buffer->{buffer} = $byte_buffer;
446      !!!parse-error (type => 'sniffing:default', ## TODO: type name      !!!parse-error (type => 'sniffing:default', ## TODO: type name
447                      value => 'windows-1252',                      value => 'windows-1252',
448                      level => $self->{info_level},                      level => $self->{info_level},
# Line 436  sub parse_byte_string ($$$$;$) { Line 453  sub parse_byte_string ($$$$;$) {
453    $self->{input_encoding} = $charset->get_iana_name;    $self->{input_encoding} = $charset->get_iana_name;
454    if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {    if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
455      !!!parse-error (type => 'chardecode:fallback', ## TODO: type name      !!!parse-error (type => 'chardecode:fallback', ## TODO: type name
456                      value => $e->name,                      value => $self->{input_encoding},
457                      level => $self->{unsupported_level},                      level => $self->{unsupported_level},
458                      line => 1, column => 1);                      line => 1, column => 1);
459    } elsif (not ($e_status &    } elsif (not ($e_status &
# Line 446  sub parse_byte_string ($$$$;$) { Line 463  sub parse_byte_string ($$$$;$) {
463                      level => $self->{unsupported_level},                      level => $self->{unsupported_level},
464                      line => 1, column => 1);                      line => 1, column => 1);
465    }    }
   $s = \ $e->decode ($$bytes_s);  
466    
467    $self->{change_encoding} = sub {    $self->{change_encoding} = sub {
468      my $self = shift;      my $self = shift;
# Line 454  sub parse_byte_string ($$$$;$) { Line 470  sub parse_byte_string ($$$$;$) {
470      my $token = shift;      my $token = shift;
471    
472      $charset = Message::Charset::Info->get_by_iana_name ($charset_name);      $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
473      ($e, $e_status) = $charset->get_perl_encoding      ($char_stream, $e_status) = $charset->get_decode_handle
474          (allow_error_reporting => 1, allow_fallback => 1);          ($byte_stream, allow_error_reporting => 1, allow_fallback => 1,
475             byte_buffer => \ $buffer->{buffer});
476            
477      if ($e) { # if supported      if ($char_stream) { # if supported
478        ## "Change the encoding" algorithm:        ## "Change the encoding" algorithm:
479    
480        ## Step 1            ## Step 1    
481        if ($charset->{iana_names}->{'utf-16'}) { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?        if ($charset->{iana_names}->{'utf-16'}) { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
482          $charset = Message::Charset::Info->get_by_iana_name ('utf-8');          $charset = Message::Charset::Info->get_by_iana_name ('utf-8');
483          ($e, $e_status) = $charset->get_perl_encoding;          ($char_stream, $e_status) = $charset->get_decode_handle
484                ($byte_stream,
485                 byte_buffer => \ $buffer->{buffer});
486        }        }
487        $charset_name = $charset->get_iana_name;        $charset_name = $charset->get_iana_name;
488                
# Line 492  sub parse_byte_string ($$$$;$) { Line 511  sub parse_byte_string ($$$$;$) {
511      }      }
512    }; # $self->{change_encoding}    }; # $self->{change_encoding}
513    
514      my $char_onerror = sub {
515        my (undef, $type, %opt) = @_;
516        !!!parse-error (%opt, type => $type);
517        if ($opt{octets}) {
518          ${$opt{octets}} = "\x{FFFD}"; # relacement character
519        }
520      };
521      $char_stream->onerror ($char_onerror);
522    
523    my @args = @_; shift @args; # $s    my @args = @_; shift @args; # $s
524    my $return;    my $return;
525    try {    try {
526      $return = $self->parse_char_string ($s, @args);        $return = $self->parse_char_stream ($char_stream, @args);  
527    } catch Whatpm::HTML::RestartParser with {    } catch Whatpm::HTML::RestartParser with {
528      ## NOTE: Invoked after {change_encoding}.      ## NOTE: Invoked after {change_encoding}.
529    
530      $self->{input_encoding} = $charset->get_iana_name;      $self->{input_encoding} = $charset->get_iana_name;
531      if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {      if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
532        !!!parse-error (type => 'chardecode:fallback', ## TODO: type name        !!!parse-error (type => 'chardecode:fallback', ## TODO: type name
533                        value => $e->name,                        value => $self->{input_encoding},
534                        level => $self->{unsupported_level},                        level => $self->{unsupported_level},
535                        line => 1, column => 1);                        line => 1, column => 1);
536      } elsif (not ($e_status &      } elsif (not ($e_status &
# Line 512  sub parse_byte_string ($$$$;$) { Line 540  sub parse_byte_string ($$$$;$) {
540                        level => $self->{unsupported_level},                        level => $self->{unsupported_level},
541                        line => 1, column => 1);                        line => 1, column => 1);
542      }      }
     $s = \ $e->decode ($$bytes_s);  
543      $self->{confident} = 1;      $self->{confident} = 1;
544      $return = $self->parse_char_string ($s, @args);      $char_stream->onerror ($char_onerror);
545        $return = $self->parse_char_stream ($char_stream, @args);
546    };    };
547    return $return;    return $return;
548  } # parse_byte_string  } # parse_byte_string
# Line 1002  sub _get_next_token ($) { Line 1030  sub _get_next_token ($) {
1030            redo A;            redo A;
1031          } else {          } else {
1032            !!!cp (23);            !!!cp (23);
1033            !!!parse-error (type => 'bare stago');            !!!parse-error (type => 'bare stago',
1034                              line => $self->{line_prev},
1035                              column => $self->{column_prev});
1036            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1037            ## reconsume            ## reconsume
1038    

Legend:
Removed from v.1.135  
changed lines
  Added in v.1.136

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24