/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch Patch

-revision 1.126 by wakaba,
Sat Apr 12 14:54:33 2008 UTC
+revision 1.139 by wakaba,
Sat May 24 04:26:27 2008 UTC
 Line 8 
 use Error qw(:try);
  ## doc.write ('');
  ## alert (doc.compatMode);
- ## TODO: Control charcters and noncharacters are not allowed (HTML5 revision 1263)
  ## TODO: 1252 parse error (revision 1264)
  ## TODO: 8859-11 = 874 (revision 1271)
+ require IO::Handle;
  my $HTML_NS = q<http://www.w3.org/1999/xhtml>;
  my $MML_NS = q<http://www.w3.org/1998/Math/MathML>;
  my $SVG_NS = q<http://www.w3.org/2000/svg>;
-Line 205 
 my $el_category_f = {
+Line 206 
 my $el_category_f = {
      mtext => FOREIGN_FLOW_CONTENT_EL,
    },
    $SVG_NS => {
-     foreignobject => FOREIGN_FLOW_CONTENT_EL, ## TODO: case
+     foreignObject => FOREIGN_FLOW_CONTENT_EL,
      desc => FOREIGN_FLOW_CONTENT_EL,
      title => FOREIGN_FLOW_CONTENT_EL,
    },
    ## NOTE: In addition, FOREIGN_EL is set to non-HTML elements.
  };
+ my $svg_attr_name = {
+   attributetype => 'attributeType',
+   basefrequency => 'baseFrequency',
+   baseprofile => 'baseProfile',
+   calcmode => 'calcMode',
+   clippathunits => 'clipPathUnits',
+   contentscripttype => 'contentScriptType',
+   contentstyletype => 'contentStyleType',
+   diffuseconstant => 'diffuseConstant',
+   edgemode => 'edgeMode',
+   externalresourcesrequired => 'externalResourcesRequired',
+   fecolormatrix => 'feColorMatrix',
+   fecomposite => 'feComposite',
+   fegaussianblur => 'feGaussianBlur',
+   femorphology => 'feMorphology',
+   fetile => 'feTile',
+   filterres => 'filterRes',
+   filterunits => 'filterUnits',
+   glyphref => 'glyphRef',
+   gradienttransform => 'gradientTransform',
+   gradientunits => 'gradientUnits',
+   kernelmatrix => 'kernelMatrix',
+   kernelunitlength => 'kernelUnitLength',
+   keypoints => 'keyPoints',
+   keysplines => 'keySplines',
+   keytimes => 'keyTimes',
+   lengthadjust => 'lengthAdjust',
+   limitingconeangle => 'limitingConeAngle',
+   markerheight => 'markerHeight',
+   markerunits => 'markerUnits',
+   markerwidth => 'markerWidth',
+   maskcontentunits => 'maskContentUnits',
+   maskunits => 'maskUnits',
+   numoctaves => 'numOctaves',
+   pathlength => 'pathLength',
+   patterncontentunits => 'patternContentUnits',
+   patterntransform => 'patternTransform',
+   patternunits => 'patternUnits',
+   pointsatx => 'pointsAtX',
+   pointsaty => 'pointsAtY',
+   pointsatz => 'pointsAtZ',
+   preservealpha => 'preserveAlpha',
+   preserveaspectratio => 'preserveAspectRatio',
+   primitiveunits => 'primitiveUnits',
+   refx => 'refX',
+   refy => 'refY',
+   repeatcount => 'repeatCount',
+   repeatdur => 'repeatDur',
+   requiredextensions => 'requiredExtensions',
+   specularconstant => 'specularConstant',
+   specularexponent => 'specularExponent',
+   spreadmethod => 'spreadMethod',
+   startoffset => 'startOffset',
+   stddeviation => 'stdDeviation',
+   stitchtiles => 'stitchTiles',
+   surfacescale => 'surfaceScale',
+   systemlanguage => 'systemLanguage',
+   tablevalues => 'tableValues',
+   targetx => 'targetX',
+   targety => 'targetY',
+   textlength => 'textLength',
+   viewbox => 'viewBox',
+   viewtarget => 'viewTarget',
+   xchannelselector => 'xChannelSelector',
+   ychannelselector => 'yChannelSelector',
+   zoomandpan => 'zoomAndPan',
+ };
+ my $foreign_attr_xname = {
+   'xlink:actuate' => [$XLINK_NS, ['xlink', 'actuate']],
+   'xlink:arcrole' => [$XLINK_NS, ['xlink', 'arcrole']],
+   'xlink:href' => [$XLINK_NS, ['xlink', 'href']],
+   'xlink:role' => [$XLINK_NS, ['xlink', 'role']],
+   'xlink:show' => [$XLINK_NS, ['xlink', 'show']],
+   'xlink:title' => [$XLINK_NS, ['xlink', 'title']],
+   'xlink:type' => [$XLINK_NS, ['xlink', 'type']],
+   'xml:base' => [$XML_NS, ['xml', 'base']],
+   'xml:lang' => [$XML_NS, ['xml', 'lang']],
+   'xml:space' => [$XML_NS, ['xml', 'space']],
+   'xmlns' => [$XMLNS_NS, [undef, 'xmlns']],
+   'xmlns:xlink' => [$XMLNS_NS, ['xmlns', 'xlink']],
+ };
+ ## ISSUE: xmlns:xlink="non-xlink-ns" is not an error.
  my $c1_entity_char = {
     0x80 => 0x20AC,
     0x81 => 0xFFFD,
-Line 248 
 my $c1_entity_char = {
+Line 334 
 my $c1_entity_char = {
  }; # $c1_entity_char
  sub parse_byte_string ($$$$;$) {
+   my $self = shift;
+   my $charset_name = shift;
+   open my $input, '<', ref $_[0] ? $_[0] : \($_[0]);
+   return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]);
+ } # parse_byte_string
+ sub parse_byte_stream ($$$$;$) {
    my $self = ref $_[0] ? shift : shift->new;
-   my $charset = shift;
+   my $charset_name = shift;
-   my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);
+   my $byte_stream = $_[0];
-   my $s;
-   if (defined $charset) {
-     require Encode; ## TODO: decode(utf8) don't delete BOM
-     $s = \ (Encode::decode ($charset, $$bytes_s));
-     $self->{input_encoding} = lc $charset; ## TODO: normalize name
-     $self->{confident} = 1;
-   } else {
-     ## TODO: Implement HTML5 detection algorithm
-     require Whatpm::Charset::UniversalCharDet;
-     $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string
-         (substr ($$bytes_s, 0, 1024));
-     $charset ||= 'windows-1252';
-     $s = \ (Encode::decode ($charset, $$bytes_s));
-     $self->{input_encoding} = $charset;
-     $self->{confident} = 0;
-   }
-   $self->{change_encoding} = sub {
+   my $onerror = $_[2] || sub {
-     my $self = shift;
+     my (%opt) = @_;
-     my $charset = lc shift;
+     warn "Parse error ($opt{type})\n";
-     my $token = shift;
+   };
-     ## TODO: if $charset is supported
+   $self->{parse_error} = $onerror; # updated later by parse_char_string
-     ## TODO: normalize charset name
+   ## HTML5 encoding sniffing algorithm
+   require Message::Charset::Info;
+   my $charset;
+   my $buffer;
+   my ($char_stream, $e_status);
+   SNIFFING: {
-     ## "Change the encoding" algorithm:
+     ## Step 1
+     if (defined $charset_name) {
+       $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
-     ## Step 1
+       ## ISSUE: Unsupported encoding is not ignored according to the spec.
-     if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
+       ($char_stream, $e_status) = $charset->get_decode_handle
-       $charset = 'utf-8';
+           ($byte_stream, allow_error_reporting => 1,
+            allow_fallback => 1);
+       if ($char_stream) {
+         $self->{confident} = 1;
+         last SNIFFING;
+       } else {
+         ## TODO: unsupported error
+       }
      }
      ## Step 2
-     if (defined $self->{input_encoding} and
+     my $byte_buffer = '';
-         $self->{input_encoding} eq $charset) {
+     for (1..1024) {
+       my $char = $byte_stream->getc;
+       last unless defined $char;
+       $byte_buffer .= $char;
+     } ## TODO: timeout
+     ## Step 3
+     if ($byte_buffer =~ /^\xFE\xFF/) {
+       $charset = Message::Charset::Info->get_by_iana_name ('utf-16be');
+       ($char_stream, $e_status) = $charset->get_decode_handle
+           ($byte_stream, allow_error_reporting => 1,
+            allow_fallback => 1, byte_buffer => \$byte_buffer);
        $self->{confident} = 1;
-       return;
+       last SNIFFING;
+     } elsif ($byte_buffer =~ /^\xFF\xFE/) {
+       $charset = Message::Charset::Info->get_by_iana_name ('utf-16le');
+       ($char_stream, $e_status) = $charset->get_decode_handle
+           ($byte_stream, allow_error_reporting => 1,
+            allow_fallback => 1, byte_buffer => \$byte_buffer);
+       $self->{confident} = 1;
+       last SNIFFING;
+     } elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) {
+       $charset = Message::Charset::Info->get_by_iana_name ('utf-8');
+       ($char_stream, $e_status) = $charset->get_decode_handle
+           ($byte_stream, allow_error_reporting => 1,
+            allow_fallback => 1, byte_buffer => \$byte_buffer);
+       $self->{confident} = 1;
+       last SNIFFING;
      }
-     !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
+     ## Step 4
-         ':'.$charset, level => 'w', token => $token);
+     ## TODO: <meta charset>
-     ## Step 3
+     ## Step 5
-     # if (can) {
+     ## TODO: from history
-       ## change the encoding on the fly.
-       #$self->{confident} = 1;
-       #return;
-     # }
-     ## Step 4
+     ## Step 6
-     throw Whatpm::HTML::RestartParser (charset => $charset);
+     require Whatpm::Charset::UniversalCharDet;
+     $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string
+         ($byte_buffer);
+     if (defined $charset_name) {
+       $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
+       ## ISSUE: Unsupported encoding is not ignored according to the spec.
+       require Whatpm::Charset::DecodeHandle;
+       $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
+           ($byte_stream);
+       ($char_stream, $e_status) = $charset->get_decode_handle
+           ($buffer, allow_error_reporting => 1,
+            allow_fallback => 1, byte_buffer => \$byte_buffer);
+       if ($char_stream) {
+         $buffer->{buffer} = $byte_buffer;
+         !!!parse-error (type => 'sniffing:chardet', ## TODO: type name
+                         value => $charset_name,
+                         level => $self->{info_level},
+                         line => 1, column => 1);
+         $self->{confident} = 0;
+         last SNIFFING;
+       }
+     }
+     ## Step 7: default
+     ## TODO: Make this configurable.
+     $charset = Message::Charset::Info->get_by_iana_name ('windows-1252');
+         ## NOTE: We choose |windows-1252| here, since |utf-8| should be
+         ## detectable in step 6.
+     require Whatpm::Charset::DecodeHandle;
+     $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
+         ($byte_stream);
+     ($char_stream, $e_status)
+         = $charset->get_decode_handle ($buffer,
+                                        allow_error_reporting => 1,
+                                        allow_fallback => 1,
+                                        byte_buffer => \$byte_buffer);
+     $buffer->{buffer} = $byte_buffer;
+     !!!parse-error (type => 'sniffing:default', ## TODO: type name
+                     value => 'windows-1252',
+                     level => $self->{info_level},
+                     line => 1, column => 1);
+     $self->{confident} = 0;
+   } # SNIFFING
+   $self->{input_encoding} = $charset->get_iana_name;
+   if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
+     !!!parse-error (type => 'chardecode:fallback', ## TODO: type name
+                     value => $self->{input_encoding},
+                     level => $self->{unsupported_level},
+                     line => 1, column => 1);
+   } elsif (not ($e_status &
+                 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
+     !!!parse-error (type => 'chardecode:no error', ## TODO: type name
+                     value => $self->{input_encoding},
+                     level => $self->{unsupported_level},
+                     line => 1, column => 1);
+   }
+   $self->{change_encoding} = sub {
+     my $self = shift;
+     $charset_name = shift;
+     my $token = shift;
+     $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
+     ($char_stream, $e_status) = $charset->get_decode_handle
+         ($byte_stream, allow_error_reporting => 1, allow_fallback => 1,
+          byte_buffer => \ $buffer->{buffer});
+     if ($char_stream) { # if supported
+       ## "Change the encoding" algorithm:
+       ## Step 1
+       if ($charset->{iana_names}->{'utf-16'}) { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
+         $charset = Message::Charset::Info->get_by_iana_name ('utf-8');
+         ($char_stream, $e_status) = $charset->get_decode_handle
+             ($byte_stream,
+              byte_buffer => \ $buffer->{buffer});
+       }
+       $charset_name = $charset->get_iana_name;
+       ## Step 2
+       if (defined $self->{input_encoding} and
+           $self->{input_encoding} eq $charset_name) {
+         !!!parse-error (type => 'charset label:matching', ## TODO: type
+                         value => $charset_name,
+                         level => $self->{info_level});
+         $self->{confident} = 1;
+         return;
+       }
+       !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
+           ':'.$charset_name, level => 'w', token => $token);
+       ## Step 3
+       # if (can) {
+         ## change the encoding on the fly.
+         #$self->{confident} = 1;
+         #return;
+       # }
+       ## Step 4
+       throw Whatpm::HTML::RestartParser ();
+     }
    }; # $self->{change_encoding}
+   my $char_onerror = sub {
+     my (undef, $type, %opt) = @_;
+     !!!parse-error (%opt, type => $type,
+                     line => $self->{line}, column => $self->{column} + 1);
+     if ($opt{octets}) {
+       ${$opt{octets}} = "\x{FFFD}"; # replacement character
+     }
+   };
+   $char_stream->onerror ($char_onerror);
    my @args = @_; shift @args; # $s
    my $return;
    try {
-     $return = $self->parse_char_string ($s, @args);
+     $return = $self->parse_char_stream ($char_stream, @args);
    } catch Whatpm::HTML::RestartParser with {
-     my $charset = shift->{charset};
+     ## NOTE: Invoked after {change_encoding}.
-     $s = \ (Encode::decode ($charset, $$bytes_s));
-     $self->{input_encoding} = $charset; ## TODO: normalize
+     $self->{input_encoding} = $charset->get_iana_name;
+     if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
+       !!!parse-error (type => 'chardecode:fallback', ## TODO: type name
+                       value => $self->{input_encoding},
+                       level => $self->{unsupported_level},
+                       line => 1, column => 1);
+     } elsif (not ($e_status &
+                   Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
+       !!!parse-error (type => 'chardecode:no error', ## TODO: type name
+                       value => $self->{input_encoding},
+                       level => $self->{unsupported_level},
+                       line => 1, column => 1);
+     }
      $self->{confident} = 1;
-     $return = $self->parse_char_string ($s, @args);
+     $char_stream->onerror ($char_onerror);
+     $return = $self->parse_char_stream ($char_stream, @args);
    };
    return $return;
- } # parse_byte_string
+ } # parse_byte_stream
  ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
  ## and the HTML layer MUST ignore it.  However, we do strip the BOM in
-Line 327 
 sub parse_byte_string ($$$$;$) {
+Line 566 
 sub parse_byte_string ($$$$;$) {
  ## such as |parse_byte_string| in this module, must ensure that it does
  ## strip the BOM and never strip any ZWNBSP.
- *parse_char_string = \&parse_string;
+ sub parse_char_string ($$$;$) {
+   my $self = shift;
+   require utf8;
+   my $s = ref $_[0] ? $_[0] : \($_[0]);
+   open my $input, '<' . (utf8::is_utf8 ($$s) ? ':utf8' : ''), $s;
+   return $self->parse_char_stream ($input, @_[1..$#_]);
+ } # parse_char_string
+ *parse_string = \&parse_char_string;
- sub parse_string ($$$;$) {
+ sub parse_char_stream ($$$;$) {
    my $self = ref $_[0] ? shift : shift->new;
-   my $s = ref $_[0] ? $_[0] : \($_[0]);
+   my $input = $_[0];
    $self->{document} = $_[1];
    @{$self->{document}->child_nodes} = ();
-Line 350 
 sub parse_string ($$$;$) {
+Line 596 
 sub parse_string ($$$;$) {
      pop @{$self->{prev_char}};
      unshift @{$self->{prev_char}}, $self->{next_char};
-     $self->{next_char} = -1 and return if $i >= length $$s;
+     my $char;
-     $self->{next_char} = ord substr $$s, $i++, 1;
+     if (defined $self->{next_next_char}) {
+       $char = $self->{next_next_char};
+       delete $self->{next_next_char};
+     } else {
+       $char = $input->getc;
+     }
+     $self->{next_char} = -1 and return unless defined $char;
+     $self->{next_char} = ord $char;
      ($self->{line_prev}, $self->{column_prev})
          = ($self->{line}, $self->{column});
      $self->{column}++;
      if ($self->{next_char} == 0x000A) { # LF
+       !!!cp ('j1');
        $self->{line}++;
        $self->{column} = 0;
      } elsif ($self->{next_char} == 0x000D) { # CR
-       $i++ if substr ($$s, $i, 1) eq "\x0A";
+       !!!cp ('j2');
+       my $next = $input->getc;
+       if (defined $next and $next ne "\x0A") {
+         $self->{next_next_char} = $next;
+       }
        $self->{next_char} = 0x000A; # LF # MUST
        $self->{line}++;
        $self->{column} = 0;
      } elsif ($self->{next_char} > 0x10FFFF) {
+       !!!cp ('j3');
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
      } elsif ($self->{next_char} == 0x0000) { # NULL
+       !!!cp ('j4');
        !!!parse-error (type => 'NULL');
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
+     } elsif ($self->{next_char} <= 0x0008 or
+              (0x000E <= $self->{next_char} and $self->{next_char} <= 0x001F) or
+              (0x007F <= $self->{next_char} and $self->{next_char} <= 0x009F) or
+              (0xD800 <= $self->{next_char} and $self->{next_char} <= 0xDFFF) or
+              (0xFDD0 <= $self->{next_char} and $self->{next_char} <= 0xFDDF) or
+              {
+               0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
+               0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
+               0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
+               0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
+               0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
+               0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
+               0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
+               0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
+               0x10FFFE => 1, 0x10FFFF => 1,
+              }->{$self->{next_char}}) {
+       !!!cp ('j5');
+       !!!parse-error (type => 'control char', level => $self->{must_level});
+ ## TODO: error type documentation
      }
    };
    $self->{prev_char} = [-1, -1, -1];
-Line 393 
 sub parse_string ($$$;$) {
+Line 672 
 sub parse_string ($$$;$) {
    delete $self->{parse_error}; # remove loop
    return $self->{document};
- } # parse_string
+ } # parse_char_stream
  sub new ($) {
    my $class = shift;
-   my $self = bless {}, $class;
+   my $self = bless {
+     must_level => 'm',
+     should_level => 's',
+     good_level => 'w',
+     warn_level => 'w',
+     info_level => 'i',
+     unsupported_level => 'u',
+   }, $class;
    $self->{set_next_char} = sub {
      $self->{next_char} = -1;
    };
-Line 460 
 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STAT
+Line 746 
 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STAT
  sub BOGUS_DOCTYPE_STATE () { 32 }
  sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
  sub SELF_CLOSING_START_TAG_STATE () { 34 }
+ sub CDATA_BLOCK_STATE () { 35 }
  sub DOCTYPE_TOKEN () { 1 }
  sub COMMENT_TOKEN () { 2 }
-Line 761 
 sub _get_next_token ($) {
+Line 1048 
 sub _get_next_token ($) {
            redo A;
          } else {
            !!!cp (23);
-           !!!parse-error (type => 'bare stago');
+           !!!parse-error (type => 'bare stago',
+                           line => $self->{line_prev},
+                           column => $self->{column_prev});
            $self->{state} = DATA_STATE;
            ## reconsume
-Line 1685 
 sub _get_next_token ($) {
+Line 1974 
 sub _get_next_token ($) {
          } else {
            !!!cp (135);
          }
+       } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
+                $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
+                $self->{next_char} == 0x005B) { # [
+         !!!next-input-character;
+         push @next_char, $self->{next_char};
+         if ($self->{next_char} == 0x0043) { # C
+           !!!next-input-character;
+           push @next_char, $self->{next_char};
+           if ($self->{next_char} == 0x0044) { # D
+             !!!next-input-character;
+             push @next_char, $self->{next_char};
+             if ($self->{next_char} == 0x0041) { # A
+               !!!next-input-character;
+               push @next_char, $self->{next_char};
+               if ($self->{next_char} == 0x0054) { # T
+                 !!!next-input-character;
+                 push @next_char, $self->{next_char};
+                 if ($self->{next_char} == 0x0041) { # A
+                   !!!next-input-character;
+                   push @next_char, $self->{next_char};
+                   if ($self->{next_char} == 0x005B) { # [
+                     !!!cp (135.1);
+                     $self->{state} = CDATA_BLOCK_STATE;
+                     !!!next-input-character;
+                     redo A;
+                   } else {
+                     !!!cp (135.2);
+                   }
+                 } else {
+                   !!!cp (135.3);
+                 }
+               } else {
+                 !!!cp (135.4);
+               }
+             } else {
+               !!!cp (135.5);
+             }
+           } else {
+             !!!cp (135.6);
+           }
+         } else {
+           !!!cp (135.7);
+         }
        } else {
          !!!cp (136);
        }
-Line 2409 
 sub _get_next_token ($) {
+Line 2741 
 sub _get_next_token ($) {
          !!!next-input-character;
          redo A;
        }
+     } elsif ($self->{state} == CDATA_BLOCK_STATE) {
+       my $s = '';
+       my ($l, $c) = ($self->{line}, $self->{column});
+       CS: while ($self->{next_char} != -1) {
+         if ($self->{next_char} == 0x005D) { # ]
+           !!!next-input-character;
+           if ($self->{next_char} == 0x005D) { # ]
+             !!!next-input-character;
+             MDC: {
+               if ($self->{next_char} == 0x003E) { # >
+                 !!!cp (221.1);
+                 !!!next-input-character;
+                 last CS;
+               } elsif ($self->{next_char} == 0x005D) { # ]
+                 !!!cp (221.2);
+                 $s .= ']';
+                 !!!next-input-character;
+                 redo MDC;
+               } else {
+                 !!!cp (221.3);
+                 $s .= ']]';
+                 #
+               }
+             } # MDC
+           } else {
+             !!!cp (221.4);
+             $s .= ']';
+             #
+           }
+         } else {
+           !!!cp (221.5);
+           #
+         }
+         $s .= chr $self->{next_char};
+         !!!next-input-character;
+       } # CS
+       $self->{state} = DATA_STATE;
+       ## next-input-character done or EOF, which is reconsumed.
+       if (length $s) {
+         !!!cp (221.6);
+         !!!emit ({type => CHARACTER_TOKEN, data => $s,
+                   line => $l, column => $c});
+       } else {
+         !!!cp (221.7);
+       }
+       redo A;
+       ## ISSUE: "text tokens" in spec.
+       ## TODO: Streaming support
      } else {
        die "$0: $self->{state}: Unknown state";
      }
-Line 2559 
 sub _tokenize_attempt_to_consume_an_enti
+Line 2945 
 sub _tokenize_attempt_to_consume_an_enti
      require Whatpm::_NamedEntityList;
      our $EntityChar;
-     while (length $entity_name < 10 and
+     while (length $entity_name < 30 and
             ## NOTE: Some number greater than the maximum length of entity name
             ((0x0041 <= $self->{next_char} and # A
              $self->{next_char} <= 0x005A) or # Z
-Line 3592 
 sub _tree_construction_main ($) {
+Line 3978 
 sub _tree_construction_main ($) {
          !!!next-token;
          next B;
        } elsif ($token->{type} == START_TAG_TOKEN) {
-         if ($self->{open_elements}->[-1]->[1] & FOREIGN_FLOW_CONTENT_EL or
+         if ((not {mglyph => 1, malignmark => 1}->{$token->{tag_name}} and
+              $self->{open_elements}->[-1]->[1] & FOREIGN_FLOW_CONTENT_EL) or
              not ($self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
              ($token->{tag_name} eq 'svg' and
               $self->{open_elements}->[-1]->[1] & MML_AXML_EL)) {
-Line 3600 
 sub _tree_construction_main ($) {
+Line 3987 
 sub _tree_construction_main ($) {
            !!!cp ('t87.2');
            #
          } elsif ({
-   ## TODO:
+                   b => 1, big => 1, blockquote => 1, body => 1, br => 1,
+                   center => 1, code => 1, dd => 1, div => 1, dl => 1, em => 1,
+                   embed => 1, font => 1, h1 => 1, h2 => 1, h3 => 1, ## No h4!
+                   h5 => 1, h6 => 1, head => 1, hr => 1, i => 1, img => 1,
+                   li => 1, menu => 1, meta => 1, nobr => 1, p => 1, pre => 1,
+                   ruby => 1, s => 1, small => 1, span => 1, strong => 1,
+                   sub => 1, sup => 1, table => 1, tt => 1, u => 1, ul => 1,
+                   var => 1,
                   }->{$token->{tag_name}}) {
            !!!cp ('t87.2');
            !!!parse-error (type => 'not closed',
-Line 3611 
 sub _tree_construction_main ($) {
+Line 4005 
 sub _tree_construction_main ($) {
            pop @{$self->{open_elements}}
                while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
-           $self->{insertion_mode} &= ~ $self->{insertion_mode};
+           $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
            ## Reprocess.
            next B;
          } else {
-           ## TODO: case fixup
+           my $nsuri = $self->{open_elements}->[-1]->[0]->namespace_uri;
+           my $tag_name = $token->{tag_name};
+           if ($nsuri eq $SVG_NS) {
+             $tag_name = {
+                altglyph => 'altGlyph',
+                altglyphdef => 'altGlyphDef',
+                altglyphitem => 'altGlyphItem',
+                animatecolor => 'animateColor',
+                animatemotion => 'animateMotion',
+                animatetransform => 'animateTransform',
+                clippath => 'clipPath',
+                feblend => 'feBlend',
+                fecolormatrix => 'feColorMatrix',
+                fecomponenttransfer => 'feComponentTransfer',
+                fecomposite => 'feComposite',
+                feconvolvematrix => 'feConvolveMatrix',
+                fediffuselighting => 'feDiffuseLighting',
+                fedisplacementmap => 'feDisplacementMap',
+                fedistantlight => 'feDistantLight',
+                feflood => 'feFlood',
+                fefunca => 'feFuncA',
+                fefuncb => 'feFuncB',
+                fefuncg => 'feFuncG',
+                fefuncr => 'feFuncR',
+                fegaussianblur => 'feGaussianBlur',
+                feimage => 'feImage',
+                femerge => 'feMerge',
+                femergenode => 'feMergeNode',
+                femorphology => 'feMorphology',
+                feoffset => 'feOffset',
+                fepointlight => 'fePointLight',
+                fespecularlighting => 'feSpecularLighting',
+                fespotlight => 'feSpotLight',
+                fetile => 'feTile',
+                feturbulence => 'feTurbulence',
+                foreignobject => 'foreignObject',
+                glyphref => 'glyphRef',
+                lineargradient => 'linearGradient',
+                radialgradient => 'radialGradient',
+                #solidcolor => 'solidColor', ## NOTE: Commented in spec (SVG1.2)
+                textpath => 'textPath',
+             }->{$tag_name} || $tag_name;
+           }
-           !!!insert-element-f ($self->{open_elements}->[-1]->[0]->namespace_uri, $token);
+           ## "adjust SVG attributes" (SVG only) - done in insert-element-f
+           ## "adjust foreign attributes" - done in insert-element-f
+           !!!insert-element-f ($nsuri, $tag_name, $token->{attributes}, $token);
            if ($self->{self_closing}) {
              pop @{$self->{open_elements}};
-Line 3714 
 sub _tree_construction_main ($) {
+Line 4154 
 sub _tree_construction_main ($) {
              !!!next-token;
              next B;
            } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
-             !!!cp ('t94');
+             !!!cp ('t93.2');
-             #
+             !!!parse-error (type => 'after head:head', token => $token); ## TODO: error type
+             ## Ignore the token
+             !!!nack ('t93.3');
+             !!!next-token;
+             next B;
            } else {
              !!!cp ('t95');
              !!!parse-error (type => 'in head:head', token => $token); # or in head noscript
-Line 3798 
 sub _tree_construction_main ($) {
+Line 4242 
 sub _tree_construction_main ($) {
                my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
                unless ($self->{confident}) {
-                 if ($token->{attributes}->{charset}) { ## TODO: And if supported
+                 if ($token->{attributes}->{charset}) {
                    !!!cp ('t106');
+                   ## NOTE: Whether the encoding is supported or not is handled
+                   ## in the {change_encoding} callback.
                    $self->{change_encoding}
                        ->($self, $token->{attributes}->{charset}->{value},
                           $token);
-Line 3809 
 sub _tree_construction_main ($) {
+Line 4255 
 sub _tree_construction_main ($) {
                                             $token->{attributes}->{charset}
                                                 ->{has_reference});
                  } elsif ($token->{attributes}->{content}) {
-                   ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
                    if ($token->{attributes}->{content}->{value}
                        =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
                            [\x09-\x0D\x20]*=
                            [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
                            ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
                      !!!cp ('t107');
+                     ## NOTE: Whether the encoding is supported or not is handled
+                     ## in the {change_encoding} callback.
                      $self->{change_encoding}
                          ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
                             $token);
-Line 4034 
 sub _tree_construction_main ($) {
+Line 4481 
 sub _tree_construction_main ($) {
                  $self->{insertion_mode} = AFTER_HEAD_IM;
                  !!!next-token;
                  next B;
+               } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
+                 !!!cp ('t134.1');
+                 !!!parse-error (type => 'unmatched end tag:head', token => $token);
+                 ## Ignore the token
+                 !!!next-token;
+                 next B;
                } else {
-                 !!!cp ('t135');
+                 die "$0: $self->{insertion_mode}: Unknown insertion mode";
-                 #
                }
              } elsif ($token->{tag_name} eq 'noscript') {
                if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
-Line 4045 
 sub _tree_construction_main ($) {
+Line 4497 
 sub _tree_construction_main ($) {
                  $self->{insertion_mode} = IN_HEAD_IM;
                  !!!next-token;
                  next B;
-               } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
+               } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM or
+                        $self->{insertion_mode} == AFTER_HEAD_IM) {
                  !!!cp ('t137');
                  !!!parse-error (type => 'unmatched end tag:noscript', token => $token);
                  ## Ignore the token ## ISSUE: An issue in the spec.
-Line 4058 
 sub _tree_construction_main ($) {
+Line 4511 
 sub _tree_construction_main ($) {
              } elsif ({
                        body => 1, html => 1,
                       }->{$token->{tag_name}}) {
-               if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
+               if ($self->{insertion_mode} == BEFORE_HEAD_IM or
-                 !!!cp ('t139');
+                   $self->{insertion_mode} == IN_HEAD_IM or
-                 ## As if <head>
+                   $self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
-                 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
-                 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
-                 push @{$self->{open_elements}},
-                     [$self->{head_element}, $el_category->{head}];
-                 $self->{insertion_mode} = IN_HEAD_IM;
-                 ## Reprocess in the "in head" insertion mode...
-               } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
                  !!!cp ('t140');
                  !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
                  ## Ignore the token
                  !!!next-token;
                  next B;
+               } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
+                 !!!cp ('t140.1');
+                 !!!parse-error (type => 'unmatched end tag:' . $token->{tag_name}, token => $token);
+                 ## Ignore the token
+                 !!!next-token;
+                 next B;
                } else {
-                 !!!cp ('t141');
+                 die "$0: $self->{insertion_mode}: Unknown insertion mode";
                }
+             } elsif ($token->{tag_name} eq 'p') {
-               #
+               !!!cp ('t142');
-             } elsif ({
+               !!!parse-error (type => 'unmatched end tag:p', token => $token);
-                       p => 1, br => 1,
+               ## Ignore the token
-                      }->{$token->{tag_name}}) {
+               !!!next-token;
+               next B;
+             } elsif ($token->{tag_name} eq 'br') {
                if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
-                 !!!cp ('t142');
+                 !!!cp ('t142.2');
-                 ## As if <head>
+                 ## (before head) as if <head>, (in head) as if </head>
                  !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
                  $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
-                 push @{$self->{open_elements}},
+                 $self->{insertion_mode} = AFTER_HEAD_IM;
-                     [$self->{head_element}, $el_category->{head}];
+                 ## Reprocess in the "after head" insertion mode...
+               } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
+                 !!!cp ('t143.2');
+                 ## As if </head>
+                 pop @{$self->{open_elements}};
+                 $self->{insertion_mode} = AFTER_HEAD_IM;
+                 ## Reprocess in the "after head" insertion mode...
+               } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
+                 !!!cp ('t143.3');
+                 ## ISSUE: Two parse errors for <head><noscript></br>
+                 !!!parse-error (type => 'unmatched end tag:br', token => $token);
+                 ## As if </noscript>
+                 pop @{$self->{open_elements}};
                  $self->{insertion_mode} = IN_HEAD_IM;
                  ## Reprocess in the "in head" insertion mode...
-               } else {
+                 ## As if </head>
-                 !!!cp ('t143');
+                 pop @{$self->{open_elements}};
-               }
+                 $self->{insertion_mode} = AFTER_HEAD_IM;
-               #
+                 ## Reprocess in the "after head" insertion mode...
-             } else {
+               } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
-               if ($self->{insertion_mode} == AFTER_HEAD_IM) {
+                 !!!cp ('t143.4');
-                 !!!cp ('t144');
                  #
                } else {
-                 !!!cp ('t145');
+                 die "$0: $self->{insertion_mode}: Unknown insertion mode";
-                 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
-                 ## Ignore the token
-                 !!!next-token;
-                 next B;
                }
+               ## ISSUE: does not agree with IE7 - it doesn't ignore </br>.
+               !!!parse-error (type => 'unmatched end tag:br', token => $token);
+               ## Ignore the token
+               !!!next-token;
+               next B;
+             } else {
+               !!!cp ('t145');
+               !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
+               ## Ignore the token
+               !!!next-token;
+               next B;
              }
              if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
-Line 5832 
 sub _tree_construction_main ($) {
+Line 6306 
 sub _tree_construction_main ($) {
          my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
          unless ($self->{confident}) {
-           if ($token->{attributes}->{charset}) { ## TODO: And if supported
+           if ($token->{attributes}->{charset}) {
              !!!cp ('t335');
+             ## NOTE: Whether the encoding is supported or not is handled
+             ## in the {change_encoding} callback.
              $self->{change_encoding}
                  ->($self, $token->{attributes}->{charset}->{value}, $token);
-Line 5842 
 sub _tree_construction_main ($) {
+Line 6318 
 sub _tree_construction_main ($) {
                                       $token->{attributes}->{charset}
                                           ->{has_reference});
            } elsif ($token->{attributes}->{content}) {
-             ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
              if ($token->{attributes}->{content}->{value}
                  =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
                      [\x09-\x0D\x20]*=
                      [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
                      ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
                !!!cp ('t336');
+               ## NOTE: Whether the encoding is supported or not is handled
+               ## in the {change_encoding} callback.
                $self->{change_encoding}
                    ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
                $meta_el->[0]->get_attribute_node_ns (undef, 'content')
-Line 6277 
 sub _tree_construction_main ($) {
+Line 6754 
 sub _tree_construction_main ($) {
        } elsif ($token->{tag_name} eq 'math' or
                 $token->{tag_name} eq 'svg') {
          $reconstruct_active_formatting_elements->($insert_to_current);
+         ## "adjust SVG attributes" ('svg' only) - done in insert-element-f
+         ## "adjust foreign attributes" - done in insert-element-f
-         !!!insert-element-f ($token->{tag_name} eq 'math' ? $MML_NS : $SVG_NS, $token);
+         !!!insert-element-f ($token->{tag_name} eq 'math' ? $MML_NS : $SVG_NS, $token->{tag_name}, $token->{attributes}, $token);
          if ($self->{self_closing}) {
            pop @{$self->{open_elements}};
-Line 6818 
 sub set_inner_html ($$$) {
+Line 7299 
 sub set_inner_html ($$$) {
          !!!cp ('i4');
          !!!parse-error (type => 'NULL');
          $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
+       } elsif ($self->{next_char} <= 0x0008 or
+                (0x000E <= $self->{next_char} and
+                 $self->{next_char} <= 0x001F) or
+                (0x007F <= $self->{next_char} and
+                 $self->{next_char} <= 0x009F) or
+                (0xD800 <= $self->{next_char} and
+                 $self->{next_char} <= 0xDFFF) or
+                (0xFDD0 <= $self->{next_char} and
+                 $self->{next_char} <= 0xFDDF) or
+                {
+ 0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
+ 0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
+ 0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
+ 0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
+ 0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
+ 0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
+ 0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
+ 0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
+ 0x10FFFE => 1, 0x10FFFF => 1,
+                }->{$self->{next_char}}) {
+         !!!cp ('i4.1');
+         !!!parse-error (type => 'control char', level => $self->{must_level});
+ ## TODO: error type documentation
        }
      };
      $p->{prev_char} = [-1, -1, -1];

 Legend:



Removed from v.1.126
 


changed lines


 
Added in v.1.139
 Legend:



Removed from v.1.126
 


changed lines


 
Added in v.1.139
-Removed from v.1.126
+Added in v.1.139

admin@suikawiki.org	ViewVC Help
Powered by ViewVC 1.1.24