/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch

revision 1.132 by wakaba, Sun Apr 13 10:36:40 2008 UTC revision 1.143 by wakaba, Sat May 24 10:48:57 2008 UTC
# Line 11  use Error qw(:try); Line 11  use Error qw(:try);
11  ## TODO: 1252 parse error (revision 1264)  ## TODO: 1252 parse error (revision 1264)
12  ## TODO: 8859-11 = 874 (revision 1271)  ## TODO: 8859-11 = 874 (revision 1271)
13    
14    require IO::Handle;
15    
16  my $HTML_NS = q<http://www.w3.org/1999/xhtml>;  my $HTML_NS = q<http://www.w3.org/1999/xhtml>;
17  my $MML_NS = q<http://www.w3.org/1998/Math/MathML>;  my $MML_NS = q<http://www.w3.org/1998/Math/MathML>;
18  my $SVG_NS = q<http://www.w3.org/2000/svg>;  my $SVG_NS = q<http://www.w3.org/2000/svg>;
# Line 332  my $c1_entity_char = { Line 334  my $c1_entity_char = {
334  }; # $c1_entity_char  }; # $c1_entity_char
335    
336  sub parse_byte_string ($$$$;$) {  sub parse_byte_string ($$$$;$) {
337      my $self = shift;
338      my $charset_name = shift;
339      open my $input, '<', ref $_[0] ? $_[0] : \($_[0]);
340      return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]);
341    } # parse_byte_string
342    
343    sub parse_byte_stream ($$$$;$) {
344    my $self = ref $_[0] ? shift : shift->new;    my $self = ref $_[0] ? shift : shift->new;
345    my $charset = shift;    my $charset_name = shift;
346    my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);    my $byte_stream = $_[0];
   my $s;  
     
   if (defined $charset) {  
     require Encode; ## TODO: decode(utf8) don't delete BOM  
     $s = \ (Encode::decode ($charset, $$bytes_s));  
     $self->{input_encoding} = lc $charset; ## TODO: normalize name  
     $self->{confident} = 1;  
   } else {  
     ## TODO: Implement HTML5 detection algorithm  
     require Whatpm::Charset::UniversalCharDet;  
     $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string  
         (substr ($$bytes_s, 0, 1024));  
     $charset ||= 'windows-1252';  
     $s = \ (Encode::decode ($charset, $$bytes_s));  
     $self->{input_encoding} = $charset;  
     $self->{confident} = 0;  
   }  
347    
348    $self->{change_encoding} = sub {    my $onerror = $_[2] || sub {
349      my $self = shift;      my (%opt) = @_;
350      my $charset = lc shift;      warn "Parse error ($opt{type})\n";
351      my $token = shift;    };
352      ## TODO: if $charset is supported    $self->{parse_error} = $onerror; # updated later by parse_char_string
353      ## TODO: normalize charset name  
354      ## HTML5 encoding sniffing algorithm
355      require Message::Charset::Info;
356      my $charset;
357      my $buffer;
358      my ($char_stream, $e_status);
359    
360      SNIFFING: {
361    
362      ## "Change the encoding" algorithm:      ## Step 1
363        if (defined $charset_name) {
364          $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
365    
366      ## Step 1            ## ISSUE: Unsupported encoding is not ignored according to the spec.
367      if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?        ($char_stream, $e_status) = $charset->get_decode_handle
368        $charset = 'utf-8';            ($byte_stream, allow_error_reporting => 1,
369               allow_fallback => 1);
370          if ($char_stream) {
371            $self->{confident} = 1;
372            last SNIFFING;
373          } else {
374            ## TODO: unsupported error
375          }
376      }      }
377    
378      ## Step 2      ## Step 2
379      if (defined $self->{input_encoding} and      my $byte_buffer = '';
380          $self->{input_encoding} eq $charset) {      for (1..1024) {
381          my $char = $byte_stream->getc;
382          last unless defined $char;
383          $byte_buffer .= $char;
384        } ## TODO: timeout
385    
386        ## Step 3
387        if ($byte_buffer =~ /^\xFE\xFF/) {
388          $charset = Message::Charset::Info->get_by_iana_name ('utf-16be');
389          ($char_stream, $e_status) = $charset->get_decode_handle
390              ($byte_stream, allow_error_reporting => 1,
391               allow_fallback => 1, byte_buffer => \$byte_buffer);
392        $self->{confident} = 1;        $self->{confident} = 1;
393        return;        last SNIFFING;
394        } elsif ($byte_buffer =~ /^\xFF\xFE/) {
395          $charset = Message::Charset::Info->get_by_iana_name ('utf-16le');
396          ($char_stream, $e_status) = $charset->get_decode_handle
397              ($byte_stream, allow_error_reporting => 1,
398               allow_fallback => 1, byte_buffer => \$byte_buffer);
399          $self->{confident} = 1;
400          last SNIFFING;
401        } elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) {
402          $charset = Message::Charset::Info->get_by_iana_name ('utf-8');
403          ($char_stream, $e_status) = $charset->get_decode_handle
404              ($byte_stream, allow_error_reporting => 1,
405               allow_fallback => 1, byte_buffer => \$byte_buffer);
406          $self->{confident} = 1;
407          last SNIFFING;
408      }      }
409    
410      !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.      ## Step 4
411          ':'.$charset, level => 'w', token => $token);      ## TODO: <meta charset>
412    
413      ## Step 3      ## Step 5
414      # if (can) {      ## TODO: from history
       ## change the encoding on the fly.  
       #$self->{confident} = 1;  
       #return;  
     # }  
415    
416      ## Step 4      ## Step 6
417      throw Whatpm::HTML::RestartParser (charset => $charset);      require Whatpm::Charset::UniversalCharDet;
418        $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string
419            ($byte_buffer);
420        if (defined $charset_name) {
421          $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
422    
423          ## ISSUE: Unsupported encoding is not ignored according to the spec.
424          require Whatpm::Charset::DecodeHandle;
425          $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
426              ($byte_stream);
427          ($char_stream, $e_status) = $charset->get_decode_handle
428              ($buffer, allow_error_reporting => 1,
429               allow_fallback => 1, byte_buffer => \$byte_buffer);
430          if ($char_stream) {
431            $buffer->{buffer} = $byte_buffer;
432            !!!parse-error (type => 'sniffing:chardet', ## TODO: type name
433                            value => $charset_name,
434                            level => $self->{info_level},
435                            line => 1, column => 1);
436            $self->{confident} = 0;
437            last SNIFFING;
438          }
439        }
440    
441        ## Step 7: default
442        ## TODO: Make this configurable.
443        $charset = Message::Charset::Info->get_by_iana_name ('windows-1252');
444            ## NOTE: We choose |windows-1252| here, since |utf-8| should be
445            ## detectable in the step 6.
446        require Whatpm::Charset::DecodeHandle;
447        $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
448            ($byte_stream);
449        ($char_stream, $e_status)
450            = $charset->get_decode_handle ($buffer,
451                                           allow_error_reporting => 1,
452                                           allow_fallback => 1,
453                                           byte_buffer => \$byte_buffer);
454        $buffer->{buffer} = $byte_buffer;
455        !!!parse-error (type => 'sniffing:default', ## TODO: type name
456                        value => 'windows-1252',
457                        level => $self->{info_level},
458                        line => 1, column => 1);
459        $self->{confident} = 0;
460      } # SNIFFING
461    
462      $self->{input_encoding} = $charset->get_iana_name;
463      if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
464        !!!parse-error (type => 'chardecode:fallback', ## TODO: type name
465                        value => $self->{input_encoding},
466                        level => $self->{unsupported_level},
467                        line => 1, column => 1);
468      } elsif (not ($e_status &
469                    Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
470        !!!parse-error (type => 'chardecode:no error', ## TODO: type name
471                        value => $self->{input_encoding},
472                        level => $self->{unsupported_level},
473                        line => 1, column => 1);
474      }
475    
476      $self->{change_encoding} = sub {
477        my $self = shift;
478        $charset_name = shift;
479        my $token = shift;
480    
481        $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
482        ($char_stream, $e_status) = $charset->get_decode_handle
483            ($byte_stream, allow_error_reporting => 1, allow_fallback => 1,
484             byte_buffer => \ $buffer->{buffer});
485        
486        if ($char_stream) { # if supported
487          ## "Change the encoding" algorithm:
488    
489          ## Step 1    
490          if ($charset->{iana_names}->{'utf-16'}) { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
491            $charset = Message::Charset::Info->get_by_iana_name ('utf-8');
492            ($char_stream, $e_status) = $charset->get_decode_handle
493                ($byte_stream,
494                 byte_buffer => \ $buffer->{buffer});
495          }
496          $charset_name = $charset->get_iana_name;
497          
498          ## Step 2
499          if (defined $self->{input_encoding} and
500              $self->{input_encoding} eq $charset_name) {
501            !!!parse-error (type => 'charset label:matching', ## TODO: type
502                            value => $charset_name,
503                            level => $self->{info_level});
504            $self->{confident} = 1;
505            return;
506          }
507    
508          !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
509              ':'.$charset_name, level => 'w', token => $token);
510          
511          ## Step 3
512          # if (can) {
513            ## change the encoding on the fly.
514            #$self->{confident} = 1;
515            #return;
516          # }
517          
518          ## Step 4
519          throw Whatpm::HTML::RestartParser ();
520        }
521    }; # $self->{change_encoding}    }; # $self->{change_encoding}
522    
523      my $char_onerror = sub {
524        my (undef, $type, %opt) = @_;
525        !!!parse-error (%opt, type => $type,
526                        line => $self->{line}, column => $self->{column} + 1);
527        if ($opt{octets}) {
528          ${$opt{octets}} = "\x{FFFD}"; # replacement character
529        }
530      };
531      $char_stream->onerror ($char_onerror);
532    
533    my @args = @_; shift @args; # $s    my @args = @_; shift @args; # $s
534    my $return;    my $return;
535    try {    try {
536      $return = $self->parse_char_string ($s, @args);        $return = $self->parse_char_stream ($char_stream, @args);  
537    } catch Whatpm::HTML::RestartParser with {    } catch Whatpm::HTML::RestartParser with {
538      my $charset = shift->{charset};      ## NOTE: Invoked after {change_encoding}.
539      $s = \ (Encode::decode ($charset, $$bytes_s));      
540      $self->{input_encoding} = $charset; ## TODO: normalize      $self->{input_encoding} = $charset->get_iana_name;
541        if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
542          !!!parse-error (type => 'chardecode:fallback', ## TODO: type name
543                          value => $self->{input_encoding},
544                          level => $self->{unsupported_level},
545                          line => 1, column => 1);
546        } elsif (not ($e_status &
547                      Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
548          !!!parse-error (type => 'chardecode:no error', ## TODO: type name
549                          value => $self->{input_encoding},
550                          level => $self->{unsupported_level},
551                          line => 1, column => 1);
552        }
553      $self->{confident} = 1;      $self->{confident} = 1;
554      $return = $self->parse_char_string ($s, @args);      $char_stream->onerror ($char_onerror);
555        $return = $self->parse_char_stream ($char_stream, @args);
556    };    };
557    return $return;    return $return;
558  } # parse_byte_string  } # parse_byte_stream
559    
560  ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM  ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
561  ## and the HTML layer MUST ignore it.  However, we do strip BOM in  ## and the HTML layer MUST ignore it.  However, we do strip BOM in
# Line 411  sub parse_byte_string ($$$$;$) { Line 566  sub parse_byte_string ($$$$;$) {
566  ## such as |parse_byte_string| in this module, must ensure that it does  ## such as |parse_byte_string| in this module, must ensure that it does
567  ## strip the BOM and never strip any ZWNBSP.  ## strip the BOM and never strip any ZWNBSP.
568    
569  *parse_char_string = \&parse_string;  sub parse_char_string ($$$;$) {
570      my $self = shift;
571      require utf8;
572      my $s = ref $_[0] ? $_[0] : \($_[0]);
573      open my $input, '<' . (utf8::is_utf8 ($$s) ? ':utf8' : ''), $s;
574      return $self->parse_char_stream ($input, @_[1..$#_]);
575    } # parse_char_string
576    *parse_string = \&parse_char_string;
577    
578  sub parse_string ($$$;$) {  sub parse_char_stream ($$$;$) {
579    my $self = ref $_[0] ? shift : shift->new;    my $self = ref $_[0] ? shift : shift->new;
580    my $s = ref $_[0] ? $_[0] : \($_[0]);    my $input = $_[0];
581    $self->{document} = $_[1];    $self->{document} = $_[1];
582    @{$self->{document}->child_nodes} = ();    @{$self->{document}->child_nodes} = ();
583    
# Line 434  sub parse_string ($$$;$) { Line 596  sub parse_string ($$$;$) {
596      pop @{$self->{prev_char}};      pop @{$self->{prev_char}};
597      unshift @{$self->{prev_char}}, $self->{next_char};      unshift @{$self->{prev_char}}, $self->{next_char};
598    
599      $self->{next_char} = -1 and return if $i >= length $$s;      my $char;
600      $self->{next_char} = ord substr $$s, $i++, 1;      if (defined $self->{next_next_char}) {
601          $char = $self->{next_next_char};
602          delete $self->{next_next_char};
603        } else {
604          $char = $input->getc;
605        }
606        $self->{next_char} = -1 and return unless defined $char;
607        $self->{next_char} = ord $char;
608    
609      ($self->{line_prev}, $self->{column_prev})      ($self->{line_prev}, $self->{column_prev})
610          = ($self->{line}, $self->{column});          = ($self->{line}, $self->{column});
# Line 447  sub parse_string ($$$;$) { Line 616  sub parse_string ($$$;$) {
616        $self->{column} = 0;        $self->{column} = 0;
617      } elsif ($self->{next_char} == 0x000D) { # CR      } elsif ($self->{next_char} == 0x000D) { # CR
618        !!!cp ('j2');        !!!cp ('j2');
619        $i++ if substr ($$s, $i, 1) eq "\x0A";        my $next = $input->getc;
620          if (defined $next and $next ne "\x0A") {
621            $self->{next_next_char} = $next;
622          }
623        $self->{next_char} = 0x000A; # LF # MUST        $self->{next_char} = 0x000A; # LF # MUST
624        $self->{line}++;        $self->{line}++;
625        $self->{column} = 0;        $self->{column} = 0;
# Line 500  sub parse_string ($$$;$) { Line 672  sub parse_string ($$$;$) {
672    delete $self->{parse_error}; # remove loop    delete $self->{parse_error}; # remove loop
673    
674    return $self->{document};    return $self->{document};
675  } # parse_string  } # parse_char_stream
676    
677  sub new ($) {  sub new ($) {
678    my $class = shift;    my $class = shift;
679    my $self = bless {}, $class;    my $self = bless {
680        must_level => 'm',
681        should_level => 's',
682        good_level => 'w',
683        warn_level => 'w',
684        info_level => 'i',
685        unsupported_level => 'u',
686      }, $class;
687    $self->{set_next_char} = sub {    $self->{set_next_char} = sub {
688      $self->{next_char} = -1;      $self->{next_char} = -1;
689    };    };
# Line 869  sub _get_next_token ($) { Line 1048  sub _get_next_token ($) {
1048            redo A;            redo A;
1049          } else {          } else {
1050            !!!cp (23);            !!!cp (23);
1051            !!!parse-error (type => 'bare stago');            !!!parse-error (type => 'bare stago',
1052                              line => $self->{line_prev},
1053                              column => $self->{column_prev});
1054            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1055            ## reconsume            ## reconsume
1056    
# Line 1646  sub _get_next_token ($) { Line 1827  sub _get_next_token ($) {
1827          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1828          !!!next-input-character;          !!!next-input-character;
1829          redo A;          redo A;
1830          } elsif ($self->{next_char} == -1) {
1831            !!!parse-error (type => 'unclosed tag');
1832            if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1833              !!!cp (122.3);
1834              $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1835            } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1836              if ($self->{current_token}->{attributes}) {
1837                !!!cp (122.1);
1838                !!!parse-error (type => 'end tag attribute');
1839              } else {
1840                ## NOTE: This state should never be reached.
1841                !!!cp (122.2);
1842              }
1843            } else {
1844              die "$0: $self->{current_token}->{type}: Unknown token type";
1845            }
1846            $self->{state} = DATA_STATE;
1847            ## Reconsume.
1848            !!!emit ($self->{current_token}); # start tag or end tag
1849            redo A;
1850        } else {        } else {
1851          !!!cp ('124.1');          !!!cp ('124.1');
1852          !!!parse-error (type => 'no space between attributes');          !!!parse-error (type => 'no space between attributes');
# Line 1678  sub _get_next_token ($) { Line 1879  sub _get_next_token ($) {
1879          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{current_token}); # start tag or end tag
1880    
1881          redo A;          redo A;
1882          } elsif ($self->{next_char} == -1) {
1883            !!!parse-error (type => 'unclosed tag');
1884            if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1885              !!!cp (124.7);
1886              $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1887            } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1888              if ($self->{current_token}->{attributes}) {
1889                !!!cp (124.5);
1890                !!!parse-error (type => 'end tag attribute');
1891              } else {
1892                ## NOTE: This state should never be reached.
1893                !!!cp (124.6);
1894              }
1895            } else {
1896              die "$0: $self->{current_token}->{type}: Unknown token type";
1897            }
1898            $self->{state} = DATA_STATE;
1899            ## Reconsume.
1900            !!!emit ($self->{current_token}); # start tag or end tag
1901            redo A;
1902        } else {        } else {
1903          !!!cp ('124.4');          !!!cp ('124.4');
1904          !!!parse-error (type => 'nestc');          !!!parse-error (type => 'nestc');
# Line 2518  sub _get_next_token ($) { Line 2739  sub _get_next_token ($) {
2739          redo A;          redo A;
2740        } elsif ($self->{next_char} == -1) {        } elsif ($self->{next_char} == -1) {
2741          !!!cp (217);          !!!cp (217);
         !!!parse-error (type => 'unclosed DOCTYPE');  
2742    
2743          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2744          ## reconsume          ## reconsume
# Line 2922  sub _tree_construction_initial ($) { Line 3142  sub _tree_construction_initial ($) {
3142        } elsif (defined $token->{public_identifier}) {        } elsif (defined $token->{public_identifier}) {
3143          my $pubid = $token->{public_identifier};          my $pubid = $token->{public_identifier};
3144          $pubid =~ tr/a-z/A-z/;          $pubid =~ tr/a-z/A-z/;
3145          if ({          my $prefix = [
3146            "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,            "+//SILMARIL//DTD HTML PRO V0R11 19970101//",
3147            "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,            "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
3148            "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,            "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
3149            "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,            "-//IETF//DTD HTML 2.0 LEVEL 1//",
3150            "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,            "-//IETF//DTD HTML 2.0 LEVEL 2//",
3151            "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,            "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//",
3152            "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,            "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//",
3153            "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,            "-//IETF//DTD HTML 2.0 STRICT//",
3154            "-//IETF//DTD HTML 2.0//EN" => 1,            "-//IETF//DTD HTML 2.0//",
3155            "-//IETF//DTD HTML 2.1E//EN" => 1,            "-//IETF//DTD HTML 2.1E//",
3156            "-//IETF//DTD HTML 3.0//EN" => 1,            "-//IETF//DTD HTML 3.0//",
3157            "-//IETF//DTD HTML 3.0//EN//" => 1,            "-//IETF//DTD HTML 3.2 FINAL//",
3158            "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,            "-//IETF//DTD HTML 3.2//",
3159            "-//IETF//DTD HTML 3.2//EN" => 1,            "-//IETF//DTD HTML 3//",
3160            "-//IETF//DTD HTML 3//EN" => 1,            "-//IETF//DTD HTML LEVEL 0//",
3161            "-//IETF//DTD HTML LEVEL 0//EN" => 1,            "-//IETF//DTD HTML LEVEL 1//",
3162            "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,            "-//IETF//DTD HTML LEVEL 2//",
3163            "-//IETF//DTD HTML LEVEL 1//EN" => 1,            "-//IETF//DTD HTML LEVEL 3//",
3164            "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,            "-//IETF//DTD HTML STRICT LEVEL 0//",
3165            "-//IETF//DTD HTML LEVEL 2//EN" => 1,            "-//IETF//DTD HTML STRICT LEVEL 1//",
3166            "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,            "-//IETF//DTD HTML STRICT LEVEL 2//",
3167            "-//IETF//DTD HTML LEVEL 3//EN" => 1,            "-//IETF//DTD HTML STRICT LEVEL 3//",
3168            "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,            "-//IETF//DTD HTML STRICT//",
3169            "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,            "-//IETF//DTD HTML//",
3170            "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,            "-//METRIUS//DTD METRIUS PRESENTATIONAL//",
3171            "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,            "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//",
3172            "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,            "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//",
3173            "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,            "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//",
3174            "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,            "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//",
3175            "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,            "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//",
3176            "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,            "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//",
3177            "-//IETF//DTD HTML STRICT//EN" => 1,            "-//NETSCAPE COMM. CORP.//DTD HTML//",
3178            "-//IETF//DTD HTML STRICT//EN//2.0" => 1,            "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//",
3179            "-//IETF//DTD HTML STRICT//EN//3.0" => 1,            "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//",
3180            "-//IETF//DTD HTML//EN" => 1,            "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//",
3181            "-//IETF//DTD HTML//EN//2.0" => 1,            "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//",
3182            "-//IETF//DTD HTML//EN//3.0" => 1,            "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//",
3183            "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,            "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//",
3184            "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,            "-//SPYGLASS//DTD HTML 2.0 EXTENDED//",
3185            "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,            "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//",
3186            "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,            "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//",
3187            "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,            "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//",
3188            "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,            "-//W3C//DTD HTML 3 1995-03-24//",
3189            "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,            "-//W3C//DTD HTML 3.2 DRAFT//",
3190            "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,            "-//W3C//DTD HTML 3.2 FINAL//",
3191            "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,            "-//W3C//DTD HTML 3.2//",
3192            "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,            "-//W3C//DTD HTML 3.2S DRAFT//",
3193            "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,            "-//W3C//DTD HTML 4.0 FRAMESET//",
3194            "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//EN" => 1,            "-//W3C//DTD HTML 4.0 TRANSITIONAL//",
3195            "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//EN" => 1,            "-//W3C//DTD HTML EXPERIMETNAL 19960712//",
3196            "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//EN" => 1,            "-//W3C//DTD HTML EXPERIMENTAL 970421//",
3197            "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,            "-//W3C//DTD W3 HTML//",
3198            "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,            "-//W3O//DTD W3 HTML 3.0//",
3199            "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,            "-//WEBTECHS//DTD MOZILLA HTML 2.0//",
3200            "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,            "-//WEBTECHS//DTD MOZILLA HTML//",
3201            "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,          ]; # $prefix
3202            "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,          my $match;
3203            "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,          for (@$prefix) {
3204            "-//W3C//DTD HTML 3.2//EN" => 1,            if (substr ($prefix, 0, length $_) eq $_) {
3205            "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,              $match = 1;
3206            "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,              last;
3207            "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,            }
3208            "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,          }
3209            "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,          if ($match or
3210            "-//W3C//DTD W3 HTML//EN" => 1,              $pubid eq "-//W3O//DTD W3 HTML STRICT 3.0//EN//" or
3211            "-//W3O//DTD W3 HTML 3.0//EN" => 1,              $pubid eq "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" or
3212            "-//W3O//DTD W3 HTML 3.0//EN//" => 1,              $pubid eq "HTML") {
           "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,  
           "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,  
           "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,  
           "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,  
           "HTML" => 1,  
         }->{$pubid}) {  
3213            !!!cp ('t5');            !!!cp ('t5');
3214            $self->{document}->manakai_compat_mode ('quirks');            $self->{document}->manakai_compat_mode ('quirks');
3215          } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or          } elsif ($pubid =~ m[^-//W3C//DTD HTML 4.01 FRAMESET//] or
3216                   $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {                   $pubid =~ m[^-//W3C//DTD HTML 4.01 TRANSITIONAL//]) {
3217            if (defined $token->{system_identifier}) {            if (defined $token->{system_identifier}) {
3218              !!!cp ('t6');              !!!cp ('t6');
3219              $self->{document}->manakai_compat_mode ('quirks');              $self->{document}->manakai_compat_mode ('quirks');
# Line 3007  sub _tree_construction_initial ($) { Line 3221  sub _tree_construction_initial ($) {
3221              !!!cp ('t7');              !!!cp ('t7');
3222              $self->{document}->manakai_compat_mode ('limited quirks');              $self->{document}->manakai_compat_mode ('limited quirks');
3223            }            }
3224          } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or          } elsif ($pubid =~ m[^-//W3C//DTD XHTML 1.0 FRAMESET//] or
3225                   $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {                   $pubid =~ m[^-//W3C//DTD XHTML 1.0 TRANSITIONAL//]) {
3226            !!!cp ('t8');            !!!cp ('t8');
3227            $self->{document}->manakai_compat_mode ('limited quirks');            $self->{document}->manakai_compat_mode ('limited quirks');
3228          } else {          } else {
# Line 3021  sub _tree_construction_initial ($) { Line 3235  sub _tree_construction_initial ($) {
3235          my $sysid = $token->{system_identifier};          my $sysid = $token->{system_identifier};
3236          $sysid =~ tr/A-Z/a-z/;          $sysid =~ tr/A-Z/a-z/;
3237          if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {          if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
3238            ## TODO: Check the spec: PUBLIC "(limited quirks)" "(quirks)"            ## NOTE: Ensure that |PUBLIC "(limited quirks)" "(quirks)"| is
3239              ## marked as quirks.
3240            $self->{document}->manakai_compat_mode ('quirks');            $self->{document}->manakai_compat_mode ('quirks');
3241            !!!cp ('t11');            !!!cp ('t11');
3242          } else {          } else {
# Line 3193  sub _reset_insertion_mode ($) { Line 3408  sub _reset_insertion_mode ($) {
3408        if ($self->{open_elements}->[0]->[0] eq $node->[0]) {        if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
3409          $last = 1;          $last = 1;
3410          if (defined $self->{inner_html_node}) {          if (defined $self->{inner_html_node}) {
3411            if ($self->{inner_html_node}->[1] & TABLE_CELL_EL) {            !!!cp ('t28');
3412              !!!cp ('t27');            $node = $self->{inner_html_node};
3413              #          } else {
3414            } else {            die "_reset_insertion_mode: t27";
             !!!cp ('t28');  
             $node = $self->{inner_html_node};  
           }  
3415          }          }
3416        }        }
3417              
3418      ## Step 4..14        ## Step 4..14
3419      my $new_mode;        my $new_mode;
3420      if ($node->[1] & FOREIGN_EL) {        if ($node->[1] & FOREIGN_EL) {
3421        ## NOTE: Strictly speaking, the line below only applies to MathML and          !!!cp ('t28.1');
3422        ## SVG elements.  Currently the HTML syntax supports only MathML and          ## NOTE: Strictly speaking, the line below only applies to MathML and
3423        ## SVG elements as foreigners.          ## SVG elements.  Currently the HTML syntax supports only MathML and
3424        $new_mode = $self->{insertion_mode} | IN_FOREIGN_CONTENT_IM;          ## SVG elements as foreigners.
3425        ## ISSUE: What is set as the secondary insertion mode?          $new_mode = $self->{insertion_mode} | IN_FOREIGN_CONTENT_IM;
3426      } else {          ## ISSUE: What is set as the secondary insertion mode?
3427        $new_mode = {        } elsif ($node->[1] & TABLE_CELL_EL) {
3428            if ($last) {
3429              !!!cp ('t28.2');
3430              #
3431            } else {
3432              !!!cp ('t28.3');
3433              $new_mode = IN_CELL_IM;
3434            }
3435          } else {
3436            !!!cp ('t28.4');
3437            $new_mode = {
3438                        select => IN_SELECT_IM,                        select => IN_SELECT_IM,
3439                        ## NOTE: |option| and |optgroup| do not set                        ## NOTE: |option| and |optgroup| do not set
3440                        ## insertion mode to "in select" by themselves.                        ## insertion mode to "in select" by themselves.
                       td => IN_CELL_IM,  
                       th => IN_CELL_IM,  
3441                        tr => IN_ROW_IM,                        tr => IN_ROW_IM,
3442                        tbody => IN_TABLE_BODY_IM,                        tbody => IN_TABLE_BODY_IM,
3443                        thead => IN_TABLE_BODY_IM,                        thead => IN_TABLE_BODY_IM,
# Line 3229  sub _reset_insertion_mode ($) { Line 3449  sub _reset_insertion_mode ($) {
3449                        body => IN_BODY_IM,                        body => IN_BODY_IM,
3450                        frameset => IN_FRAMESET_IM,                        frameset => IN_FRAMESET_IM,
3451                       }->{$node->[0]->manakai_local_name};                       }->{$node->[0]->manakai_local_name};
3452      }        }
3453      $self->{insertion_mode} = $new_mode and return if defined $new_mode;        $self->{insertion_mode} = $new_mode and return if defined $new_mode;
3454                
3455        ## Step 15        ## Step 15
3456        if ($node->[1] & HTML_EL) {        if ($node->[1] & HTML_EL) {
# Line 3973  sub _tree_construction_main ($) { Line 4193  sub _tree_construction_main ($) {
4193              !!!next-token;              !!!next-token;
4194              next B;              next B;
4195            } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {            } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4196              !!!cp ('t94');              !!!cp ('t93.2');
4197              #              !!!parse-error (type => 'after head:head', token => $token); ## TODO: error type
4198                ## Ignore the token
4199                !!!nack ('t93.3');
4200                !!!next-token;
4201                next B;
4202            } else {            } else {
4203              !!!cp ('t95');              !!!cp ('t95');
4204              !!!parse-error (type => 'in head:head', token => $token); # or in head noscript              !!!parse-error (type => 'in head:head', token => $token); # or in head noscript
# Line 4057  sub _tree_construction_main ($) { Line 4281  sub _tree_construction_main ($) {
4281                my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.                my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4282    
4283                unless ($self->{confident}) {                unless ($self->{confident}) {
4284                  if ($token->{attributes}->{charset}) { ## TODO: And if supported                  if ($token->{attributes}->{charset}) {
4285                    !!!cp ('t106');                    !!!cp ('t106');
4286                      ## NOTE: Whether the encoding is supported or not is handled
4287                      ## in the {change_encoding} callback.
4288                    $self->{change_encoding}                    $self->{change_encoding}
4289                        ->($self, $token->{attributes}->{charset}->{value},                        ->($self, $token->{attributes}->{charset}->{value},
4290                           $token);                           $token);
# Line 4068  sub _tree_construction_main ($) { Line 4294  sub _tree_construction_main ($) {
4294                                             $token->{attributes}->{charset}                                             $token->{attributes}->{charset}
4295                                                 ->{has_reference});                                                 ->{has_reference});
4296                  } elsif ($token->{attributes}->{content}) {                  } elsif ($token->{attributes}->{content}) {
                   ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.  
4297                    if ($token->{attributes}->{content}->{value}                    if ($token->{attributes}->{content}->{value}
4298                        =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]                        =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4299                            [\x09-\x0D\x20]*=                            [\x09-\x0D\x20]*=
4300                            [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|                            [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4301                            ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {                            ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
4302                      !!!cp ('t107');                      !!!cp ('t107');
4303                        ## NOTE: Whether the encoding is supported or not is handled
4304                        ## in the {change_encoding} callback.
4305                      $self->{change_encoding}                      $self->{change_encoding}
4306                          ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,                          ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
4307                             $token);                             $token);
# Line 4293  sub _tree_construction_main ($) { Line 4520  sub _tree_construction_main ($) {
4520                  $self->{insertion_mode} = AFTER_HEAD_IM;                  $self->{insertion_mode} = AFTER_HEAD_IM;
4521                  !!!next-token;                  !!!next-token;
4522                  next B;                  next B;
4523                  } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4524                    !!!cp ('t134.1');
4525                    !!!parse-error (type => 'unmatched end tag:head', token => $token);
4526                    ## Ignore the token
4527                    !!!next-token;
4528                    next B;
4529                } else {                } else {
4530                  !!!cp ('t135');                  die "$0: $self->{insertion_mode}: Unknown insertion mode";
                 #  
4531                }                }
4532              } elsif ($token->{tag_name} eq 'noscript') {              } elsif ($token->{tag_name} eq 'noscript') {
4533                if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {                if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
# Line 4304  sub _tree_construction_main ($) { Line 4536  sub _tree_construction_main ($) {
4536                  $self->{insertion_mode} = IN_HEAD_IM;                  $self->{insertion_mode} = IN_HEAD_IM;
4537                  !!!next-token;                  !!!next-token;
4538                  next B;                  next B;
4539                } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {                } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM or
4540                           $self->{insertion_mode} == AFTER_HEAD_IM) {
4541                  !!!cp ('t137');                  !!!cp ('t137');
4542                  !!!parse-error (type => 'unmatched end tag:noscript', token => $token);                  !!!parse-error (type => 'unmatched end tag:noscript', token => $token);
4543                  ## Ignore the token ## ISSUE: An issue in the spec.                  ## Ignore the token ## ISSUE: An issue in the spec.
# Line 4317  sub _tree_construction_main ($) { Line 4550  sub _tree_construction_main ($) {
4550              } elsif ({              } elsif ({
4551                        body => 1, html => 1,                        body => 1, html => 1,
4552                       }->{$token->{tag_name}}) {                       }->{$token->{tag_name}}) {
4553                if ($self->{insertion_mode} == BEFORE_HEAD_IM) {                if ($self->{insertion_mode} == BEFORE_HEAD_IM or
4554                  !!!cp ('t139');                    $self->{insertion_mode} == IN_HEAD_IM or
4555                  ## As if <head>                    $self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
                 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);  
                 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});  
                 push @{$self->{open_elements}},  
                     [$self->{head_element}, $el_category->{head}];  
   
                 $self->{insertion_mode} = IN_HEAD_IM;  
                 ## Reprocess in the "in head" insertion mode...  
               } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {  
4556                  !!!cp ('t140');                  !!!cp ('t140');
4557                  !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);                  !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4558                  ## Ignore the token                  ## Ignore the token
4559                  !!!next-token;                  !!!next-token;
4560                  next B;                  next B;
4561                  } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4562                    !!!cp ('t140.1');
4563                    !!!parse-error (type => 'unmatched end tag:' . $token->{tag_name}, token => $token);
4564                    ## Ignore the token
4565                    !!!next-token;
4566                    next B;
4567                } else {                } else {
4568                  !!!cp ('t141');                  die "$0: $self->{insertion_mode}: Unknown insertion mode";
4569                }                }
4570                              } elsif ($token->{tag_name} eq 'p') {
4571                #                !!!cp ('t142');
4572              } elsif ({                !!!parse-error (type => 'unmatched end tag:p', token => $token);
4573                        p => 1, br => 1,                ## Ignore the token
4574                       }->{$token->{tag_name}}) {                !!!next-token;
4575                  next B;
4576                } elsif ($token->{tag_name} eq 'br') {
4577                if ($self->{insertion_mode} == BEFORE_HEAD_IM) {                if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4578                  !!!cp ('t142');                  !!!cp ('t142.2');
4579                  ## As if <head>                  ## (before head) as if <head>, (in head) as if </head>
4580                  !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);                  !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4581                  $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});                  $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4582                  push @{$self->{open_elements}},                  $self->{insertion_mode} = AFTER_HEAD_IM;
4583                      [$self->{head_element}, $el_category->{head}];    
4584                    ## Reprocess in the "after head" insertion mode...
4585                  } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4586                    !!!cp ('t143.2');
4587                    ## As if </head>
4588                    pop @{$self->{open_elements}};
4589                    $self->{insertion_mode} = AFTER_HEAD_IM;
4590      
4591                    ## Reprocess in the "after head" insertion mode...
4592                  } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4593                    !!!cp ('t143.3');
4594                    ## ISSUE: Two parse errors for <head><noscript></br>
4595                    !!!parse-error (type => 'unmatched end tag:br', token => $token);
4596                    ## As if </noscript>
4597                    pop @{$self->{open_elements}};
4598                  $self->{insertion_mode} = IN_HEAD_IM;                  $self->{insertion_mode} = IN_HEAD_IM;
4599    
4600                  ## Reprocess in the "in head" insertion mode...                  ## Reprocess in the "in head" insertion mode...
4601                } else {                  ## As if </head>
4602                  !!!cp ('t143');                  pop @{$self->{open_elements}};
4603                }                  $self->{insertion_mode} = AFTER_HEAD_IM;
4604    
4605                #                  ## Reprocess in the "after head" insertion mode...
4606              } else {                } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4607                if ($self->{insertion_mode} == AFTER_HEAD_IM) {                  !!!cp ('t143.4');
                 !!!cp ('t144');  
4608                  #                  #
4609                } else {                } else {
4610                  !!!cp ('t145');                  die "$0: $self->{insertion_mode}: Unknown insertion mode";
                 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);  
                 ## Ignore the token  
                 !!!next-token;  
                 next B;  
4611                }                }
4612    
4613                  ## ISSUE: does not agree with IE7 - it doesn't ignore </br>.
4614                  !!!parse-error (type => 'unmatched end tag:br', token => $token);
4615                  ## Ignore the token
4616                  !!!next-token;
4617                  next B;
4618                } else {
4619                  !!!cp ('t145');
4620                  !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4621                  ## Ignore the token
4622                  !!!next-token;
4623                  next B;
4624              }              }
4625    
4626              if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {              if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
# Line 6091  sub _tree_construction_main ($) { Line 6345  sub _tree_construction_main ($) {
6345          my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.          my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6346    
6347          unless ($self->{confident}) {          unless ($self->{confident}) {
6348            if ($token->{attributes}->{charset}) { ## TODO: And if supported            if ($token->{attributes}->{charset}) {
6349              !!!cp ('t335');              !!!cp ('t335');
6350                ## NOTE: Whether the encoding is supported or not is handled
6351                ## in the {change_encoding} callback.
6352              $self->{change_encoding}              $self->{change_encoding}
6353                  ->($self, $token->{attributes}->{charset}->{value}, $token);                  ->($self, $token->{attributes}->{charset}->{value}, $token);
6354                            
# Line 6101  sub _tree_construction_main ($) { Line 6357  sub _tree_construction_main ($) {
6357                                       $token->{attributes}->{charset}                                       $token->{attributes}->{charset}
6358                                           ->{has_reference});                                           ->{has_reference});
6359            } elsif ($token->{attributes}->{content}) {            } elsif ($token->{attributes}->{content}) {
             ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.  
6360              if ($token->{attributes}->{content}->{value}              if ($token->{attributes}->{content}->{value}
6361                  =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]                  =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
6362                      [\x09-\x0D\x20]*=                      [\x09-\x0D\x20]*=
6363                      [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|                      [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
6364                      ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {                      ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
6365                !!!cp ('t336');                !!!cp ('t336');
6366                  ## NOTE: Whether the encoding is supported or not is handled
6367                  ## in the {change_encoding} callback.
6368                $self->{change_encoding}                $self->{change_encoding}
6369                    ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);                    ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
6370                $meta_el->[0]->get_attribute_node_ns (undef, 'content')                $meta_el->[0]->get_attribute_node_ns (undef, 'content')

Legend:
Removed from v.1.132  
changed lines
  Added in v.1.143

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24