/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch

revision 1.133 by wakaba, Sat May 17 04:54:11 2008 UTC revision 1.145 by wakaba, Sat May 24 11:57:47 2008 UTC
# Line 8  use Error qw(:try); Line 8  use Error qw(:try);
8  ## doc.write ('');  ## doc.write ('');
9  ## alert (doc.compatMode);  ## alert (doc.compatMode);
10    
11  ## TODO: 1252 parse error (revision 1264)  require IO::Handle;
 ## TODO: 8859-11 = 874 (revision 1271)  
12    
13  my $HTML_NS = q<http://www.w3.org/1999/xhtml>;  my $HTML_NS = q<http://www.w3.org/1999/xhtml>;
14  my $MML_NS = q<http://www.w3.org/1998/Math/MathML>;  my $MML_NS = q<http://www.w3.org/1998/Math/MathML>;
# Line 332  my $c1_entity_char = { Line 331  my $c1_entity_char = {
331  }; # $c1_entity_char  }; # $c1_entity_char
332    
333  sub parse_byte_string ($$$$;$) {  sub parse_byte_string ($$$$;$) {
334      my $self = shift;
335      my $charset_name = shift;
336      open my $input, '<', ref $_[0] ? $_[0] : \($_[0]);
337      return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]);
338    } # parse_byte_string
339    
340    sub parse_byte_stream ($$$$;$) {
341    my $self = ref $_[0] ? shift : shift->new;    my $self = ref $_[0] ? shift : shift->new;
342    my $charset_name = shift;    my $charset_name = shift;
343    my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);    my $byte_stream = $_[0];
344    my $s;  
345      my $onerror = $_[2] || sub {
346        my (%opt) = @_;
347        warn "Parse error ($opt{type})\n";
348      };
349      $self->{parse_error} = $onerror; # updated later by parse_char_string
350    
351    ## HTML5 encoding sniffing algorithm    ## HTML5 encoding sniffing algorithm
352    require Message::Charset::Info;    require Message::Charset::Info;
353    my $charset;    my $charset;
354    my ($e, $e_status);    my $buffer;
355      my ($char_stream, $e_status);
356    
357    SNIFFING: {    SNIFFING: {
358    
# Line 349  sub parse_byte_string ($$$$;$) { Line 361  sub parse_byte_string ($$$$;$) {
361        $charset = Message::Charset::Info->get_by_iana_name ($charset_name);        $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
362    
363        ## ISSUE: Unsupported encoding is not ignored according to the spec.        ## ISSUE: Unsupported encoding is not ignored according to the spec.
364        ($e, $e_status) = $charset->get_perl_encoding        ($char_stream, $e_status) = $charset->get_decode_handle
365            (allow_error_reporting => 1,            ($byte_stream, allow_error_reporting => 1,
366             allow_fallback => 1);             allow_fallback => 1);
367        if ($e) {        if ($char_stream) {
368          $self->{confident} = 1;          $self->{confident} = 1;
369          last SNIFFING;          last SNIFFING;
370          } else {
371            ## TODO: unsupported error
372        }        }
373      }      }
374    
375      ## Step 2      ## Step 2
376      # wait      my $byte_buffer = '';
377        for (1..1024) {
378          my $char = $byte_stream->getc;
379          last unless defined $char;
380          $byte_buffer .= $char;
381        } ## TODO: timeout
382    
383      ## Step 3      ## Step 3
384      my $head = substr ($$bytes_s, 0, 3);      if ($byte_buffer =~ /^\xFE\xFF/) {
     if ($head =~ /^\xFE\xFF/) {  
385        $charset = Message::Charset::Info->get_by_iana_name ('utf-16be');        $charset = Message::Charset::Info->get_by_iana_name ('utf-16be');
386        ($e, $e_status) = $charset->get_perl_encoding        ($char_stream, $e_status) = $charset->get_decode_handle
387            (allow_error_reporting => 1,            ($byte_stream, allow_error_reporting => 1,
388             allow_fallback => 1);             allow_fallback => 1, byte_buffer => \$byte_buffer);
389        $self->{confident} = 1;        $self->{confident} = 1;
390        last SNIFFING;        last SNIFFING;
391      } elsif ($head =~ /^\xFF\xFE/) {      } elsif ($byte_buffer =~ /^\xFF\xFE/) {
392        $charset = Message::Charset::Info->get_by_iana_name ('utf-16le');        $charset = Message::Charset::Info->get_by_iana_name ('utf-16le');
393        ($e, $e_status) = $charset->get_perl_encoding        ($char_stream, $e_status) = $charset->get_decode_handle
394            (allow_error_reporting => 1,            ($byte_stream, allow_error_reporting => 1,
395             allow_fallback => 1);             allow_fallback => 1, byte_buffer => \$byte_buffer);
396        $self->{confident} = 1;        $self->{confident} = 1;
397        last SNIFFING;        last SNIFFING;
398      } elsif ($head eq "\xEF\xBB\xBF") {      } elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) {
399        $charset = Message::Charset::Info->get_by_iana_name ('utf-8');        $charset = Message::Charset::Info->get_by_iana_name ('utf-8');
400        ($e, $e_status) = $charset->get_perl_encoding        ($char_stream, $e_status) = $charset->get_decode_handle
401            (allow_error_reporting => 1,            ($byte_stream, allow_error_reporting => 1,
402             allow_fallback => 1);             allow_fallback => 1, byte_buffer => \$byte_buffer);
403        $self->{confident} = 1;        $self->{confident} = 1;
404        last SNIFFING;        last SNIFFING;
405      }      }
# Line 395  sub parse_byte_string ($$$$;$) { Line 413  sub parse_byte_string ($$$$;$) {
413      ## Step 6      ## Step 6
414      require Whatpm::Charset::UniversalCharDet;      require Whatpm::Charset::UniversalCharDet;
415      $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string      $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string
416          (substr ($$bytes_s, 0, 1024));          ($byte_buffer);
417      if (defined $charset_name) {      if (defined $charset_name) {
418        $charset = Message::Charset::Info->get_by_iana_name ($charset_name);        $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
419    
420        ## ISSUE: Unsupported encoding is not ignored according to the spec.        ## ISSUE: Unsupported encoding is not ignored according to the spec.
421        ($e, $e_status) = $charset->get_perl_encoding        require Whatpm::Charset::DecodeHandle;
422            (allow_error_reporting => 1,        $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
423             allow_fallback => 1);            ($byte_stream);
424        if ($e) {        ($char_stream, $e_status) = $charset->get_decode_handle
425              ($buffer, allow_error_reporting => 1,
426               allow_fallback => 1, byte_buffer => \$byte_buffer);
427          if ($char_stream) {
428            $buffer->{buffer} = $byte_buffer;
429            !!!parse-error (type => 'sniffing:chardet', ## TODO: type name
430                            value => $charset_name,
431                            level => $self->{info_level},
432                            line => 1, column => 1);
433          $self->{confident} = 0;          $self->{confident} = 0;
434          last SNIFFING;          last SNIFFING;
435        }        }
# Line 414  sub parse_byte_string ($$$$;$) { Line 440  sub parse_byte_string ($$$$;$) {
440      $charset = Message::Charset::Info->get_by_iana_name ('windows-1252');      $charset = Message::Charset::Info->get_by_iana_name ('windows-1252');
441          ## NOTE: We choose |windows-1252| here, since |utf-8| should be          ## NOTE: We choose |windows-1252| here, since |utf-8| should be
442          ## detectable in the step 6.          ## detectable in the step 6.
443      ($e, $e_status) = $charset->get_perl_encoding (allow_error_reporting => 1,      require Whatpm::Charset::DecodeHandle;
444                                                     allow_fallback => 1);      $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
445            ($byte_stream);
446        ($char_stream, $e_status)
447            = $charset->get_decode_handle ($buffer,
448                                           allow_error_reporting => 1,
449                                           allow_fallback => 1,
450                                           byte_buffer => \$byte_buffer);
451        $buffer->{buffer} = $byte_buffer;
452        !!!parse-error (type => 'sniffing:default', ## TODO: type name
453                        value => 'windows-1252',
454                        level => $self->{info_level},
455                        line => 1, column => 1);
456      $self->{confident} = 0;      $self->{confident} = 0;
457    } # SNIFFING    } # SNIFFING
458    
459      $self->{input_encoding} = $charset->get_iana_name;
460    if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {    if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
461            !!!parse-error (type => 'chardecode:fallback', ## TODO: type name
462                        value => $self->{input_encoding},
463                        level => $self->{unsupported_level},
464                        line => 1, column => 1);
465    } elsif (not ($e_status &    } elsif (not ($e_status &
466                  Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {                  Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
467            !!!parse-error (type => 'chardecode:no error', ## TODO: type name
468                        value => $self->{input_encoding},
469                        level => $self->{unsupported_level},
470                        line => 1, column => 1);
471    }    }
   $s = \ $e->decode ($$bytes_s);  
   $self->{input_encoding} = $charset->get_iana_name;  
472    
473    $self->{change_encoding} = sub {    $self->{change_encoding} = sub {
474      my $self = shift;      my $self = shift;
475      my $charset_name = lc shift;      $charset_name = shift;
476      my $token = shift;      my $token = shift;
     ## TODO: if $charset_name is supported  
     ## TODO: normalize charset name  
477    
478      ## "Change the encoding" algorithm:      $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
479        ($char_stream, $e_status) = $charset->get_decode_handle
480            ($byte_stream, allow_error_reporting => 1, allow_fallback => 1,
481             byte_buffer => \ $buffer->{buffer});
482        
483        if ($char_stream) { # if supported
484          ## "Change the encoding" algorithm:
485    
486      ## Step 1            ## Step 1    
487      if ($charset_name eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?        if ($charset->{iana_names}->{'utf-16'}) { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
488        $charset_name = 'utf-8';          $charset = Message::Charset::Info->get_by_iana_name ('utf-8');
489      }          ($char_stream, $e_status) = $charset->get_decode_handle
490                ($byte_stream,
491                 byte_buffer => \ $buffer->{buffer});
492          }
493          $charset_name = $charset->get_iana_name;
494          
495          ## Step 2
496          if (defined $self->{input_encoding} and
497              $self->{input_encoding} eq $charset_name) {
498            !!!parse-error (type => 'charset label:matching', ## TODO: type
499                            value => $charset_name,
500                            level => $self->{info_level});
501            $self->{confident} = 1;
502            return;
503          }
504    
505      ## Step 2        !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
506      if (defined $self->{input_encoding} and            ':'.$charset_name, level => 'w', token => $token);
507          $self->{input_encoding} eq $charset_name) {        
508        $self->{confident} = 1;        ## Step 3
509        return;        # if (can) {
510            ## change the encoding on the fly.
511            #$self->{confident} = 1;
512            #return;
513          # }
514          
515          ## Step 4
516          throw Whatpm::HTML::RestartParser ();
517      }      }
   
     !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.  
         ':'.$charset_name, level => 'w', token => $token);  
   
     ## Step 3  
     # if (can) {  
       ## change the encoding on the fly.  
       #$self->{confident} = 1;  
       #return;  
     # }  
   
     ## Step 4  
     throw Whatpm::HTML::RestartParser (charset => $charset_name);  
518    }; # $self->{change_encoding}    }; # $self->{change_encoding}
519    
520      my $char_onerror = sub {
521        my (undef, $type, %opt) = @_;
522        !!!parse-error (%opt, type => $type,
523                        line => $self->{line}, column => $self->{column} + 1);
524        if ($opt{octets}) {
525          ${$opt{octets}} = "\x{FFFD}"; # replacement character
526        }
527      };
528      $char_stream->onerror ($char_onerror);
529    
530    my @args = @_; shift @args; # $s    my @args = @_; shift @args; # $s
531    my $return;    my $return;
532    try {    try {
533      $return = $self->parse_char_string ($s, @args);        $return = $self->parse_char_stream ($char_stream, @args);  
534    } catch Whatpm::HTML::RestartParser with {    } catch Whatpm::HTML::RestartParser with {
535      my $charset_name = shift->{charset};      ## NOTE: Invoked after {change_encoding}.
536      $s = \ (Encode::decode ($charset_name, $$bytes_s));      
537      $self->{input_encoding} = $charset_name; ## TODO: normalize      $self->{input_encoding} = $charset->get_iana_name;
538        if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
539          !!!parse-error (type => 'chardecode:fallback', ## TODO: type name
540                          value => $self->{input_encoding},
541                          level => $self->{unsupported_level},
542                          line => 1, column => 1);
543        } elsif (not ($e_status &
544                      Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
545          !!!parse-error (type => 'chardecode:no error', ## TODO: type name
546                          value => $self->{input_encoding},
547                          level => $self->{unsupported_level},
548                          line => 1, column => 1);
549        }
550      $self->{confident} = 1;      $self->{confident} = 1;
551      $return = $self->parse_char_string ($s, @args);      $char_stream->onerror ($char_onerror);
552        $return = $self->parse_char_stream ($char_stream, @args);
553    };    };
554    return $return;    return $return;
555  } # parse_byte_string  } # parse_byte_stream
556    
557  ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM  ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
558  ## and the HTML layer MUST ignore it.  However, we do strip BOM in  ## and the HTML layer MUST ignore it.  However, we do strip BOM in
# Line 486  sub parse_byte_string ($$$$;$) { Line 563  sub parse_byte_string ($$$$;$) {
563  ## such as |parse_byte_string| in this module, must ensure that it does  ## such as |parse_byte_string| in this module, must ensure that it does
564  ## strip the BOM and never strip any ZWNBSP.  ## strip the BOM and never strip any ZWNBSP.
565    
566  *parse_char_string = \&parse_string;  sub parse_char_string ($$$;$) {
567      my $self = shift;
568      require utf8;
569      my $s = ref $_[0] ? $_[0] : \($_[0]);
570      open my $input, '<' . (utf8::is_utf8 ($$s) ? ':utf8' : ''), $s;
571      return $self->parse_char_stream ($input, @_[1..$#_]);
572    } # parse_char_string
573    *parse_string = \&parse_char_string;
574    
575  sub parse_string ($$$;$) {  sub parse_char_stream ($$$;$) {
576    my $self = ref $_[0] ? shift : shift->new;    my $self = ref $_[0] ? shift : shift->new;
577    my $s = ref $_[0] ? $_[0] : \($_[0]);    my $input = $_[0];
578    $self->{document} = $_[1];    $self->{document} = $_[1];
579    @{$self->{document}->child_nodes} = ();    @{$self->{document}->child_nodes} = ();
580    
# Line 509  sub parse_string ($$$;$) { Line 593  sub parse_string ($$$;$) {
593      pop @{$self->{prev_char}};      pop @{$self->{prev_char}};
594      unshift @{$self->{prev_char}}, $self->{next_char};      unshift @{$self->{prev_char}}, $self->{next_char};
595    
596      $self->{next_char} = -1 and return if $i >= length $$s;      my $char;
597      $self->{next_char} = ord substr $$s, $i++, 1;      if (defined $self->{next_next_char}) {
598          $char = $self->{next_next_char};
599          delete $self->{next_next_char};
600        } else {
601          $char = $input->getc;
602        }
603        $self->{next_char} = -1 and return unless defined $char;
604        $self->{next_char} = ord $char;
605    
606      ($self->{line_prev}, $self->{column_prev})      ($self->{line_prev}, $self->{column_prev})
607          = ($self->{line}, $self->{column});          = ($self->{line}, $self->{column});
# Line 522  sub parse_string ($$$;$) { Line 613  sub parse_string ($$$;$) {
613        $self->{column} = 0;        $self->{column} = 0;
614      } elsif ($self->{next_char} == 0x000D) { # CR      } elsif ($self->{next_char} == 0x000D) { # CR
615        !!!cp ('j2');        !!!cp ('j2');
616        $i++ if substr ($$s, $i, 1) eq "\x0A";        my $next = $input->getc;
617          if (defined $next and $next ne "\x0A") {
618            $self->{next_next_char} = $next;
619          }
620        $self->{next_char} = 0x000A; # LF # MUST        $self->{next_char} = 0x000A; # LF # MUST
621        $self->{line}++;        $self->{line}++;
622        $self->{column} = 0;        $self->{column} = 0;
# Line 575  sub parse_string ($$$;$) { Line 669  sub parse_string ($$$;$) {
669    delete $self->{parse_error}; # remove loop    delete $self->{parse_error}; # remove loop
670    
671    return $self->{document};    return $self->{document};
672  } # parse_string  } # parse_char_stream
673    
674  sub new ($) {  sub new ($) {
675    my $class = shift;    my $class = shift;
676    my $self = bless {}, $class;    my $self = bless {
677        must_level => 'm',
678        should_level => 's',
679        good_level => 'w',
680        warn_level => 'w',
681        info_level => 'i',
682        unsupported_level => 'u',
683      }, $class;
684    $self->{set_next_char} = sub {    $self->{set_next_char} = sub {
685      $self->{next_char} = -1;      $self->{next_char} = -1;
686    };    };
# Line 944  sub _get_next_token ($) { Line 1045  sub _get_next_token ($) {
1045            redo A;            redo A;
1046          } else {          } else {
1047            !!!cp (23);            !!!cp (23);
1048            !!!parse-error (type => 'bare stago');            !!!parse-error (type => 'bare stago',
1049                              line => $self->{line_prev},
1050                              column => $self->{column_prev});
1051            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1052            ## reconsume            ## reconsume
1053    
# Line 1721  sub _get_next_token ($) { Line 1824  sub _get_next_token ($) {
1824          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1825          !!!next-input-character;          !!!next-input-character;
1826          redo A;          redo A;
1827          } elsif ($self->{next_char} == -1) {
1828            !!!parse-error (type => 'unclosed tag');
1829            if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1830              !!!cp (122.3);
1831              $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1832            } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1833              if ($self->{current_token}->{attributes}) {
1834                !!!cp (122.1);
1835                !!!parse-error (type => 'end tag attribute');
1836              } else {
1837                ## NOTE: This state should never be reached.
1838                !!!cp (122.2);
1839              }
1840            } else {
1841              die "$0: $self->{current_token}->{type}: Unknown token type";
1842            }
1843            $self->{state} = DATA_STATE;
1844            ## Reconsume.
1845            !!!emit ($self->{current_token}); # start tag or end tag
1846            redo A;
1847        } else {        } else {
1848          !!!cp ('124.1');          !!!cp ('124.1');
1849          !!!parse-error (type => 'no space between attributes');          !!!parse-error (type => 'no space between attributes');
# Line 1753  sub _get_next_token ($) { Line 1876  sub _get_next_token ($) {
1876          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{current_token}); # start tag or end tag
1877    
1878          redo A;          redo A;
1879          } elsif ($self->{next_char} == -1) {
1880            !!!parse-error (type => 'unclosed tag');
1881            if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1882              !!!cp (124.7);
1883              $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1884            } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1885              if ($self->{current_token}->{attributes}) {
1886                !!!cp (124.5);
1887                !!!parse-error (type => 'end tag attribute');
1888              } else {
1889                ## NOTE: This state should never be reached.
1890                !!!cp (124.6);
1891              }
1892            } else {
1893              die "$0: $self->{current_token}->{type}: Unknown token type";
1894            }
1895            $self->{state} = DATA_STATE;
1896            ## Reconsume.
1897            !!!emit ($self->{current_token}); # start tag or end tag
1898            redo A;
1899        } else {        } else {
1900          !!!cp ('124.4');          !!!cp ('124.4');
1901          !!!parse-error (type => 'nestc');          !!!parse-error (type => 'nestc');
# Line 2593  sub _get_next_token ($) { Line 2736  sub _get_next_token ($) {
2736          redo A;          redo A;
2737        } elsif ($self->{next_char} == -1) {        } elsif ($self->{next_char} == -1) {
2738          !!!cp (217);          !!!cp (217);
         !!!parse-error (type => 'unclosed DOCTYPE');  
2739    
2740          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2741          ## reconsume          ## reconsume
# Line 2997  sub _tree_construction_initial ($) { Line 3139  sub _tree_construction_initial ($) {
3139        } elsif (defined $token->{public_identifier}) {        } elsif (defined $token->{public_identifier}) {
3140          my $pubid = $token->{public_identifier};          my $pubid = $token->{public_identifier};
3141          $pubid =~ tr/a-z/A-z/;          $pubid =~ tr/a-z/A-z/;
3142          if ({          my $prefix = [
3143            "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,            "+//SILMARIL//DTD HTML PRO V0R11 19970101//",
3144            "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,            "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
3145            "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,            "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
3146            "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,            "-//IETF//DTD HTML 2.0 LEVEL 1//",
3147            "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,            "-//IETF//DTD HTML 2.0 LEVEL 2//",
3148            "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,            "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//",
3149            "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,            "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//",
3150            "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,            "-//IETF//DTD HTML 2.0 STRICT//",
3151            "-//IETF//DTD HTML 2.0//EN" => 1,            "-//IETF//DTD HTML 2.0//",
3152            "-//IETF//DTD HTML 2.1E//EN" => 1,            "-//IETF//DTD HTML 2.1E//",
3153            "-//IETF//DTD HTML 3.0//EN" => 1,            "-//IETF//DTD HTML 3.0//",
3154            "-//IETF//DTD HTML 3.0//EN//" => 1,            "-//IETF//DTD HTML 3.2 FINAL//",
3155            "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,            "-//IETF//DTD HTML 3.2//",
3156            "-//IETF//DTD HTML 3.2//EN" => 1,            "-//IETF//DTD HTML 3//",
3157            "-//IETF//DTD HTML 3//EN" => 1,            "-//IETF//DTD HTML LEVEL 0//",
3158            "-//IETF//DTD HTML LEVEL 0//EN" => 1,            "-//IETF//DTD HTML LEVEL 1//",
3159            "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,            "-//IETF//DTD HTML LEVEL 2//",
3160            "-//IETF//DTD HTML LEVEL 1//EN" => 1,            "-//IETF//DTD HTML LEVEL 3//",
3161            "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,            "-//IETF//DTD HTML STRICT LEVEL 0//",
3162            "-//IETF//DTD HTML LEVEL 2//EN" => 1,            "-//IETF//DTD HTML STRICT LEVEL 1//",
3163            "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,            "-//IETF//DTD HTML STRICT LEVEL 2//",
3164            "-//IETF//DTD HTML LEVEL 3//EN" => 1,            "-//IETF//DTD HTML STRICT LEVEL 3//",
3165            "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,            "-//IETF//DTD HTML STRICT//",
3166            "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,            "-//IETF//DTD HTML//",
3167            "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,            "-//METRIUS//DTD METRIUS PRESENTATIONAL//",
3168            "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,            "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//",
3169            "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,            "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//",
3170            "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,            "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//",
3171            "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,            "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//",
3172            "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,            "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//",
3173            "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,            "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//",
3174            "-//IETF//DTD HTML STRICT//EN" => 1,            "-//NETSCAPE COMM. CORP.//DTD HTML//",
3175            "-//IETF//DTD HTML STRICT//EN//2.0" => 1,            "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//",
3176            "-//IETF//DTD HTML STRICT//EN//3.0" => 1,            "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//",
3177            "-//IETF//DTD HTML//EN" => 1,            "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//",
3178            "-//IETF//DTD HTML//EN//2.0" => 1,            "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//",
3179            "-//IETF//DTD HTML//EN//3.0" => 1,            "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//",
3180            "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,            "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//",
3181            "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,            "-//SPYGLASS//DTD HTML 2.0 EXTENDED//",
3182            "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,            "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//",
3183            "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,            "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//",
3184            "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,            "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//",
3185            "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,            "-//W3C//DTD HTML 3 1995-03-24//",
3186            "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,            "-//W3C//DTD HTML 3.2 DRAFT//",
3187            "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,            "-//W3C//DTD HTML 3.2 FINAL//",
3188            "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,            "-//W3C//DTD HTML 3.2//",
3189            "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,            "-//W3C//DTD HTML 3.2S DRAFT//",
3190            "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,            "-//W3C//DTD HTML 4.0 FRAMESET//",
3191            "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//EN" => 1,            "-//W3C//DTD HTML 4.0 TRANSITIONAL//",
3192            "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//EN" => 1,            "-//W3C//DTD HTML EXPERIMETNAL 19960712//",
3193            "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//EN" => 1,            "-//W3C//DTD HTML EXPERIMENTAL 970421//",
3194            "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,            "-//W3C//DTD W3 HTML//",
3195            "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,            "-//W3O//DTD W3 HTML 3.0//",
3196            "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,            "-//WEBTECHS//DTD MOZILLA HTML 2.0//",
3197            "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,            "-//WEBTECHS//DTD MOZILLA HTML//",
3198            "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,          ]; # $prefix
3199            "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,          my $match;
3200            "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,          for (@$prefix) {
3201            "-//W3C//DTD HTML 3.2//EN" => 1,            if (substr ($prefix, 0, length $_) eq $_) {
3202            "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,              $match = 1;
3203            "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,              last;
3204            "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,            }
3205            "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,          }
3206            "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,          if ($match or
3207            "-//W3C//DTD W3 HTML//EN" => 1,              $pubid eq "-//W3O//DTD W3 HTML STRICT 3.0//EN//" or
3208            "-//W3O//DTD W3 HTML 3.0//EN" => 1,              $pubid eq "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" or
3209            "-//W3O//DTD W3 HTML 3.0//EN//" => 1,              $pubid eq "HTML") {
           "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,  
           "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,  
           "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,  
           "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,  
           "HTML" => 1,  
         }->{$pubid}) {  
3210            !!!cp ('t5');            !!!cp ('t5');
3211            $self->{document}->manakai_compat_mode ('quirks');            $self->{document}->manakai_compat_mode ('quirks');
3212          } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or          } elsif ($pubid =~ m[^-//W3C//DTD HTML 4.01 FRAMESET//] or
3213                   $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {                   $pubid =~ m[^-//W3C//DTD HTML 4.01 TRANSITIONAL//]) {
3214            if (defined $token->{system_identifier}) {            if (defined $token->{system_identifier}) {
3215              !!!cp ('t6');              !!!cp ('t6');
3216              $self->{document}->manakai_compat_mode ('quirks');              $self->{document}->manakai_compat_mode ('quirks');
# Line 3082  sub _tree_construction_initial ($) { Line 3218  sub _tree_construction_initial ($) {
3218              !!!cp ('t7');              !!!cp ('t7');
3219              $self->{document}->manakai_compat_mode ('limited quirks');              $self->{document}->manakai_compat_mode ('limited quirks');
3220            }            }
3221          } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or          } elsif ($pubid =~ m[^-//W3C//DTD XHTML 1.0 FRAMESET//] or
3222                   $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {                   $pubid =~ m[^-//W3C//DTD XHTML 1.0 TRANSITIONAL//]) {
3223            !!!cp ('t8');            !!!cp ('t8');
3224            $self->{document}->manakai_compat_mode ('limited quirks');            $self->{document}->manakai_compat_mode ('limited quirks');
3225          } else {          } else {
# Line 3096  sub _tree_construction_initial ($) { Line 3232  sub _tree_construction_initial ($) {
3232          my $sysid = $token->{system_identifier};          my $sysid = $token->{system_identifier};
3233          $sysid =~ tr/A-Z/a-z/;          $sysid =~ tr/A-Z/a-z/;
3234          if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {          if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
3235            ## TODO: Check the spec: PUBLIC "(limited quirks)" "(quirks)"            ## NOTE: Ensure that |PUBLIC "(limited quirks)" "(quirks)"| is
3236              ## marked as quirks.
3237            $self->{document}->manakai_compat_mode ('quirks');            $self->{document}->manakai_compat_mode ('quirks');
3238            !!!cp ('t11');            !!!cp ('t11');
3239          } else {          } else {
# Line 3268  sub _reset_insertion_mode ($) { Line 3405  sub _reset_insertion_mode ($) {
3405        if ($self->{open_elements}->[0]->[0] eq $node->[0]) {        if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
3406          $last = 1;          $last = 1;
3407          if (defined $self->{inner_html_node}) {          if (defined $self->{inner_html_node}) {
3408            if ($self->{inner_html_node}->[1] & TABLE_CELL_EL) {            !!!cp ('t28');
3409              !!!cp ('t27');            $node = $self->{inner_html_node};
3410              #          } else {
3411            } else {            die "_reset_insertion_mode: t27";
             !!!cp ('t28');  
             $node = $self->{inner_html_node};  
           }  
3412          }          }
3413        }        }
3414              
3415      ## Step 4..14        ## Step 4..14
3416      my $new_mode;        my $new_mode;
3417      if ($node->[1] & FOREIGN_EL) {        if ($node->[1] & FOREIGN_EL) {
3418        ## NOTE: Strictly speaking, the line below only applies to MathML and          !!!cp ('t28.1');
3419        ## SVG elements.  Currently the HTML syntax supports only MathML and          ## NOTE: Strictly speaking, the line below only applies to MathML and
3420        ## SVG elements as foreigners.          ## SVG elements.  Currently the HTML syntax supports only MathML and
3421        $new_mode = $self->{insertion_mode} | IN_FOREIGN_CONTENT_IM;          ## SVG elements as foreigners.
3422        ## ISSUE: What is set as the secondary insertion mode?          $new_mode = $self->{insertion_mode} | IN_FOREIGN_CONTENT_IM;
3423      } else {          ## ISSUE: What is set as the secondary insertion mode?
3424        $new_mode = {        } elsif ($node->[1] & TABLE_CELL_EL) {
3425            if ($last) {
3426              !!!cp ('t28.2');
3427              #
3428            } else {
3429              !!!cp ('t28.3');
3430              $new_mode = IN_CELL_IM;
3431            }
3432          } else {
3433            !!!cp ('t28.4');
3434            $new_mode = {
3435                        select => IN_SELECT_IM,                        select => IN_SELECT_IM,
3436                        ## NOTE: |option| and |optgroup| do not set                        ## NOTE: |option| and |optgroup| do not set
3437                        ## insertion mode to "in select" by themselves.                        ## insertion mode to "in select" by themselves.
                       td => IN_CELL_IM,  
                       th => IN_CELL_IM,  
3438                        tr => IN_ROW_IM,                        tr => IN_ROW_IM,
3439                        tbody => IN_TABLE_BODY_IM,                        tbody => IN_TABLE_BODY_IM,
3440                        thead => IN_TABLE_BODY_IM,                        thead => IN_TABLE_BODY_IM,
# Line 3304  sub _reset_insertion_mode ($) { Line 3446  sub _reset_insertion_mode ($) {
3446                        body => IN_BODY_IM,                        body => IN_BODY_IM,
3447                        frameset => IN_FRAMESET_IM,                        frameset => IN_FRAMESET_IM,
3448                       }->{$node->[0]->manakai_local_name};                       }->{$node->[0]->manakai_local_name};
3449      }        }
3450      $self->{insertion_mode} = $new_mode and return if defined $new_mode;        $self->{insertion_mode} = $new_mode and return if defined $new_mode;
3451                
3452        ## Step 15        ## Step 15
3453        if ($node->[1] & HTML_EL) {        if ($node->[1] & HTML_EL) {
# Line 4048  sub _tree_construction_main ($) { Line 4190  sub _tree_construction_main ($) {
4190              !!!next-token;              !!!next-token;
4191              next B;              next B;
4192            } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {            } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4193              !!!cp ('t94');              !!!cp ('t93.2');
4194              #              !!!parse-error (type => 'after head:head', token => $token); ## TODO: error type
4195                ## Ignore the token
4196                !!!nack ('t93.3');
4197                !!!next-token;
4198                next B;
4199            } else {            } else {
4200              !!!cp ('t95');              !!!cp ('t95');
4201              !!!parse-error (type => 'in head:head', token => $token); # or in head noscript              !!!parse-error (type => 'in head:head', token => $token); # or in head noscript
# Line 4132  sub _tree_construction_main ($) { Line 4278  sub _tree_construction_main ($) {
4278                my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.                my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4279    
4280                unless ($self->{confident}) {                unless ($self->{confident}) {
4281                  if ($token->{attributes}->{charset}) { ## TODO: And if supported                  if ($token->{attributes}->{charset}) {
4282                    !!!cp ('t106');                    !!!cp ('t106');
4283                      ## NOTE: Whether the encoding is supported or not is handled
4284                      ## in the {change_encoding} callback.
4285                    $self->{change_encoding}                    $self->{change_encoding}
4286                        ->($self, $token->{attributes}->{charset}->{value},                        ->($self, $token->{attributes}->{charset}->{value},
4287                           $token);                           $token);
# Line 4143  sub _tree_construction_main ($) { Line 4291  sub _tree_construction_main ($) {
4291                                             $token->{attributes}->{charset}                                             $token->{attributes}->{charset}
4292                                                 ->{has_reference});                                                 ->{has_reference});
4293                  } elsif ($token->{attributes}->{content}) {                  } elsif ($token->{attributes}->{content}) {
                   ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.  
4294                    if ($token->{attributes}->{content}->{value}                    if ($token->{attributes}->{content}->{value}
4295                        =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]                        =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4296                            [\x09-\x0D\x20]*=                            [\x09-\x0D\x20]*=
4297                            [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|                            [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4298                            ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {                            ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
4299                      !!!cp ('t107');                      !!!cp ('t107');
4300                        ## NOTE: Whether the encoding is supported or not is handled
4301                        ## in the {change_encoding} callback.
4302                      $self->{change_encoding}                      $self->{change_encoding}
4303                          ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,                          ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
4304                             $token);                             $token);
# Line 4368  sub _tree_construction_main ($) { Line 4517  sub _tree_construction_main ($) {
4517                  $self->{insertion_mode} = AFTER_HEAD_IM;                  $self->{insertion_mode} = AFTER_HEAD_IM;
4518                  !!!next-token;                  !!!next-token;
4519                  next B;                  next B;
4520                  } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4521                    !!!cp ('t134.1');
4522                    !!!parse-error (type => 'unmatched end tag:head', token => $token);
4523                    ## Ignore the token
4524                    !!!next-token;
4525                    next B;
4526                } else {                } else {
4527                  !!!cp ('t135');                  die "$0: $self->{insertion_mode}: Unknown insertion mode";
                 #  
4528                }                }
4529              } elsif ($token->{tag_name} eq 'noscript') {              } elsif ($token->{tag_name} eq 'noscript') {
4530                if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {                if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
# Line 4379  sub _tree_construction_main ($) { Line 4533  sub _tree_construction_main ($) {
4533                  $self->{insertion_mode} = IN_HEAD_IM;                  $self->{insertion_mode} = IN_HEAD_IM;
4534                  !!!next-token;                  !!!next-token;
4535                  next B;                  next B;
4536                } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {                } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM or
4537                           $self->{insertion_mode} == AFTER_HEAD_IM) {
4538                  !!!cp ('t137');                  !!!cp ('t137');
4539                  !!!parse-error (type => 'unmatched end tag:noscript', token => $token);                  !!!parse-error (type => 'unmatched end tag:noscript', token => $token);
4540                  ## Ignore the token ## ISSUE: An issue in the spec.                  ## Ignore the token ## ISSUE: An issue in the spec.
# Line 4392  sub _tree_construction_main ($) { Line 4547  sub _tree_construction_main ($) {
4547              } elsif ({              } elsif ({
4548                        body => 1, html => 1,                        body => 1, html => 1,
4549                       }->{$token->{tag_name}}) {                       }->{$token->{tag_name}}) {
4550                if ($self->{insertion_mode} == BEFORE_HEAD_IM) {                if ($self->{insertion_mode} == BEFORE_HEAD_IM or
4551                  !!!cp ('t139');                    $self->{insertion_mode} == IN_HEAD_IM or
4552                  ## As if <head>                    $self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
                 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);  
                 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});  
                 push @{$self->{open_elements}},  
                     [$self->{head_element}, $el_category->{head}];  
   
                 $self->{insertion_mode} = IN_HEAD_IM;  
                 ## Reprocess in the "in head" insertion mode...  
               } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {  
4553                  !!!cp ('t140');                  !!!cp ('t140');
4554                  !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);                  !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4555                  ## Ignore the token                  ## Ignore the token
4556                  !!!next-token;                  !!!next-token;
4557                  next B;                  next B;
4558                  } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4559                    !!!cp ('t140.1');
4560                    !!!parse-error (type => 'unmatched end tag:' . $token->{tag_name}, token => $token);
4561                    ## Ignore the token
4562                    !!!next-token;
4563                    next B;
4564                } else {                } else {
4565                  !!!cp ('t141');                  die "$0: $self->{insertion_mode}: Unknown insertion mode";
4566                }                }
4567                              } elsif ($token->{tag_name} eq 'p') {
4568                #                !!!cp ('t142');
4569              } elsif ({                !!!parse-error (type => 'unmatched end tag:p', token => $token);
4570                        p => 1, br => 1,                ## Ignore the token
4571                       }->{$token->{tag_name}}) {                !!!next-token;
4572                  next B;
4573                } elsif ($token->{tag_name} eq 'br') {
4574                if ($self->{insertion_mode} == BEFORE_HEAD_IM) {                if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4575                  !!!cp ('t142');                  !!!cp ('t142.2');
4576                  ## As if <head>                  ## (before head) as if <head>, (in head) as if </head>
4577                  !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);                  !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4578                  $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});                  $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4579                  push @{$self->{open_elements}},                  $self->{insertion_mode} = AFTER_HEAD_IM;
4580                      [$self->{head_element}, $el_category->{head}];    
4581                    ## Reprocess in the "after head" insertion mode...
4582                  } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4583                    !!!cp ('t143.2');
4584                    ## As if </head>
4585                    pop @{$self->{open_elements}};
4586                    $self->{insertion_mode} = AFTER_HEAD_IM;
4587      
4588                    ## Reprocess in the "after head" insertion mode...
4589                  } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4590                    !!!cp ('t143.3');
4591                    ## ISSUE: Two parse errors for <head><noscript></br>
4592                    !!!parse-error (type => 'unmatched end tag:br', token => $token);
4593                    ## As if </noscript>
4594                    pop @{$self->{open_elements}};
4595                  $self->{insertion_mode} = IN_HEAD_IM;                  $self->{insertion_mode} = IN_HEAD_IM;
4596    
4597                  ## Reprocess in the "in head" insertion mode...                  ## Reprocess in the "in head" insertion mode...
4598                } else {                  ## As if </head>
4599                  !!!cp ('t143');                  pop @{$self->{open_elements}};
4600                }                  $self->{insertion_mode} = AFTER_HEAD_IM;
4601    
4602                #                  ## Reprocess in the "after head" insertion mode...
4603              } else {                } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4604                if ($self->{insertion_mode} == AFTER_HEAD_IM) {                  !!!cp ('t143.4');
                 !!!cp ('t144');  
4605                  #                  #
4606                } else {                } else {
4607                  !!!cp ('t145');                  die "$0: $self->{insertion_mode}: Unknown insertion mode";
                 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);  
                 ## Ignore the token  
                 !!!next-token;  
                 next B;  
4608                }                }
4609    
4610                  ## ISSUE: does not agree with IE7 - it doesn't ignore </br>.
4611                  !!!parse-error (type => 'unmatched end tag:br', token => $token);
4612                  ## Ignore the token
4613                  !!!next-token;
4614                  next B;
4615                } else {
4616                  !!!cp ('t145');
4617                  !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4618                  ## Ignore the token
4619                  !!!next-token;
4620                  next B;
4621              }              }
4622    
4623              if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {              if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
# Line 6166  sub _tree_construction_main ($) { Line 6342  sub _tree_construction_main ($) {
6342          my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.          my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6343    
6344          unless ($self->{confident}) {          unless ($self->{confident}) {
6345            if ($token->{attributes}->{charset}) { ## TODO: And if supported            if ($token->{attributes}->{charset}) {
6346              !!!cp ('t335');              !!!cp ('t335');
6347                ## NOTE: Whether the encoding is supported or not is handled
6348                ## in the {change_encoding} callback.
6349              $self->{change_encoding}              $self->{change_encoding}
6350                  ->($self, $token->{attributes}->{charset}->{value}, $token);                  ->($self, $token->{attributes}->{charset}->{value}, $token);
6351                            
# Line 6176  sub _tree_construction_main ($) { Line 6354  sub _tree_construction_main ($) {
6354                                       $token->{attributes}->{charset}                                       $token->{attributes}->{charset}
6355                                           ->{has_reference});                                           ->{has_reference});
6356            } elsif ($token->{attributes}->{content}) {            } elsif ($token->{attributes}->{content}) {
             ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.  
6357              if ($token->{attributes}->{content}->{value}              if ($token->{attributes}->{content}->{value}
6358                  =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]                  =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
6359                      [\x09-\x0D\x20]*=                      [\x09-\x0D\x20]*=
6360                      [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|                      [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
6361                      ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {                      ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
6362                !!!cp ('t336');                !!!cp ('t336');
6363                  ## NOTE: Whether the encoding is supported or not is handled
6364                  ## in the {change_encoding} callback.
6365                $self->{change_encoding}                $self->{change_encoding}
6366                    ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);                    ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
6367                $meta_el->[0]->get_attribute_node_ns (undef, 'content')                $meta_el->[0]->get_attribute_node_ns (undef, 'content')

Legend:
Removed from v.1.133  
changed lines
  Added in v.1.145

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24