/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.132 by wakaba, Sun Apr 13 10:36:40 2008 UTC revision 1.133 by wakaba, Sat May 17 04:54:11 2008 UTC
# Line 333  my $c1_entity_char = { Line 333  my $c1_entity_char = {
333    
334  sub parse_byte_string ($$$$;$) {  sub parse_byte_string ($$$$;$) {
335    my $self = ref $_[0] ? shift : shift->new;    my $self = ref $_[0] ? shift : shift->new;
336    my $charset = shift;    my $charset_name = shift;
337    my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);    my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);
338    my $s;    my $s;
339      
340    if (defined $charset) {    ## HTML5 encoding sniffing algorithm
341      require Encode; ## TODO: decode(utf8) don't delete BOM    require Message::Charset::Info;
342      $s = \ (Encode::decode ($charset, $$bytes_s));    my $charset;
343      $self->{input_encoding} = lc $charset; ## TODO: normalize name    my ($e, $e_status);
344      $self->{confident} = 1;  
345    } else {    SNIFFING: {
346      ## TODO: Implement HTML5 detection algorithm  
347        ## Step 1
348        if (defined $charset_name) {
349          $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
350    
351          ## ISSUE: Unsupported encoding is not ignored according to the spec.
352          ($e, $e_status) = $charset->get_perl_encoding
353              (allow_error_reporting => 1,
354               allow_fallback => 1);
355          if ($e) {
356            $self->{confident} = 1;
357            last SNIFFING;
358          }
359        }
360    
361        ## Step 2
362        # wait
363    
364        ## Step 3
365        my $head = substr ($$bytes_s, 0, 3);
366        if ($head =~ /^\xFE\xFF/) {
367          $charset = Message::Charset::Info->get_by_iana_name ('utf-16be');
368          ($e, $e_status) = $charset->get_perl_encoding
369              (allow_error_reporting => 1,
370               allow_fallback => 1);
371          $self->{confident} = 1;
372          last SNIFFING;
373        } elsif ($head =~ /^\xFF\xFE/) {
374          $charset = Message::Charset::Info->get_by_iana_name ('utf-16le');
375          ($e, $e_status) = $charset->get_perl_encoding
376              (allow_error_reporting => 1,
377               allow_fallback => 1);
378          $self->{confident} = 1;
379          last SNIFFING;
380        } elsif ($head eq "\xEF\xBB\xBF") {
381          $charset = Message::Charset::Info->get_by_iana_name ('utf-8');
382          ($e, $e_status) = $charset->get_perl_encoding
383              (allow_error_reporting => 1,
384               allow_fallback => 1);
385          $self->{confident} = 1;
386          last SNIFFING;
387        }
388    
389        ## Step 4
390        ## TODO: <meta charset>
391    
392        ## Step 5
393        ## TODO: from history
394    
395        ## Step 6
396      require Whatpm::Charset::UniversalCharDet;      require Whatpm::Charset::UniversalCharDet;
397      $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string      $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string
398          (substr ($$bytes_s, 0, 1024));          (substr ($$bytes_s, 0, 1024));
399      $charset ||= 'windows-1252';      if (defined $charset_name) {
400      $s = \ (Encode::decode ($charset, $$bytes_s));        $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
401      $self->{input_encoding} = $charset;  
402          ## ISSUE: Unsupported encoding is not ignored according to the spec.
403          ($e, $e_status) = $charset->get_perl_encoding
404              (allow_error_reporting => 1,
405               allow_fallback => 1);
406          if ($e) {
407            $self->{confident} = 0;
408            last SNIFFING;
409          }
410        }
411    
412        ## Step 7: default
413        ## TODO: Make this configurable.
414        $charset = Message::Charset::Info->get_by_iana_name ('windows-1252');
415            ## NOTE: We choose |windows-1252| here, since |utf-8| should be
416            ## detectable in the step 6.
417        ($e, $e_status) = $charset->get_perl_encoding (allow_error_reporting => 1,
418                                                       allow_fallback => 1);
419      $self->{confident} = 0;      $self->{confident} = 0;
420      } # SNIFFING
421    
422      if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
423        
424      } elsif (not ($e_status &
425                    Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
426        
427    }    }
428      $s = \ $e->decode ($$bytes_s);
429      $self->{input_encoding} = $charset->get_iana_name;
430    
431    $self->{change_encoding} = sub {    $self->{change_encoding} = sub {
432      my $self = shift;      my $self = shift;
433      my $charset = lc shift;      my $charset_name = lc shift;
434      my $token = shift;      my $token = shift;
435      ## TODO: if $charset is supported      ## TODO: if $charset_name is supported
436      ## TODO: normalize charset name      ## TODO: normalize charset name
437    
438      ## "Change the encoding" algorithm:      ## "Change the encoding" algorithm:
439    
440      ## Step 1          ## Step 1    
441      if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?      if ($charset_name eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
442        $charset = 'utf-8';        $charset_name = 'utf-8';
443      }      }
444    
445      ## Step 2      ## Step 2
446      if (defined $self->{input_encoding} and      if (defined $self->{input_encoding} and
447          $self->{input_encoding} eq $charset) {          $self->{input_encoding} eq $charset_name) {
448        $self->{confident} = 1;        $self->{confident} = 1;
449        return;        return;
450      }      }
451    
452      !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.      !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
453          ':'.$charset, level => 'w', token => $token);          ':'.$charset_name, level => 'w', token => $token);
454    
455      ## Step 3      ## Step 3
456      # if (can) {      # if (can) {
# Line 385  sub parse_byte_string ($$$$;$) { Line 460  sub parse_byte_string ($$$$;$) {
460      # }      # }
461    
462      ## Step 4      ## Step 4
463      throw Whatpm::HTML::RestartParser (charset => $charset);      throw Whatpm::HTML::RestartParser (charset => $charset_name);
464    }; # $self->{change_encoding}    }; # $self->{change_encoding}
465    
466    my @args = @_; shift @args; # $s    my @args = @_; shift @args; # $s
# Line 393  sub parse_byte_string ($$$$;$) { Line 468  sub parse_byte_string ($$$$;$) {
468    try {    try {
469      $return = $self->parse_char_string ($s, @args);        $return = $self->parse_char_string ($s, @args);  
470    } catch Whatpm::HTML::RestartParser with {    } catch Whatpm::HTML::RestartParser with {
471      my $charset = shift->{charset};      my $charset_name = shift->{charset};
472      $s = \ (Encode::decode ($charset, $$bytes_s));          $s = \ (Encode::decode ($charset_name, $$bytes_s));    
473      $self->{input_encoding} = $charset; ## TODO: normalize      $self->{input_encoding} = $charset_name; ## TODO: normalize
474      $self->{confident} = 1;      $self->{confident} = 1;
475      $return = $self->parse_char_string ($s, @args);      $return = $self->parse_char_string ($s, @args);
476    };    };

Legend:
Removed from v.1.132  
changed lines
  Added in v.1.133

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24