/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch Patch

-revision 1.70 by wakaba,
Sat Mar  1 00:42:52 2008 UTC
+revision 1.75 by wakaba,
Mon Mar  3 00:13:22 2008 UTC
 Line 8 
 use Error qw(:try);
  ## doc.write ('');
  ## alert (doc.compatMode);
- ## ISSUE: HTML5 revision 967 says that the encoding layer MUST NOT
- ## strip BOM and the HTML layer MUST ignore it.  Whether we can do it
- ## is not yet clear.
- ## "{U+FEFF}..." in UTF-16BE/UTF-16LE is three or four characters?
- ## "{U+FEFF}..." in GB18030?
 ## TODO: Control characters and noncharacters are not allowed (HTML5 revision 1263)
  ## TODO: 1252 parse error (revision 1264)
  ## TODO: 8859-11 = 874 (revision 1271)
-Line 24 
 my $permitted_slash_tag_name = {
+Line 18 
 my $permitted_slash_tag_name = {
    meta => 1,
    hr => 1,
    br => 1,
-   img=> 1,
+   img => 1,
    embed => 1,
    param => 1,
    area => 1,
-Line 159 
 sub parse_byte_string ($$$$;$) {
+Line 153 
 sub parse_byte_string ($$$$;$) {
    return $return;
  } # parse_byte_string
+ ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
+ ## and the HTML layer MUST ignore it.  However, we do strip BOM in
+ ## the encoding layer and the HTML layer does not ignore any U+FEFF,
+ ## because the core part of our HTML parser expects a string of characters,
+ ## not a string of bytes or code units or anything which might contain a BOM.
+ ## Therefore, any parser interface that accepts a string of bytes,
+ ## such as |parse_byte_string| in this module, must ensure that it does
+ ## strip the BOM and never strip any ZWNBSP.
  *parse_char_string = \&parse_string;
  sub parse_string ($$$;$) {
-Line 283 
 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUO
+Line 286 
 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUO
  sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
  sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
  sub BOGUS_DOCTYPE_STATE () { 32 }
+ sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
  sub DOCTYPE_TOKEN () { 1 }
  sub COMMENT_TOKEN () { 2 }
-Line 342 
 sub _initialize_tokenizer ($) {
+Line 346 
 sub _initialize_tokenizer ($) {
  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
  ##   ->{public_identifier} (DOCTYPE_TOKEN)
  ##   ->{system_identifier} (DOCTYPE_TOKEN)
- ##   ->{correct} == 1 or 0 (DOCTYPE_TOKEN)
+ ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
  ##   ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
  ##        ->{name}
  ##        ->{value}
-Line 381 
 sub _get_next_token ($) {
+Line 385 
 sub _get_next_token ($) {
    A: {
      if ($self->{state} == DATA_STATE) {
        if ($self->{next_input_character} == 0x0026) { # &
-         if ($self->{content_model} & CM_ENTITY) { # PCDATA | RCDATA
+         if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
+             not $self->{escape}) {
            $self->{state} = ENTITY_DATA_STATE;
            !!!next-input-character;
            redo A;
-Line 436 
 sub _get_next_token ($) {
+Line 441 
 sub _get_next_token ($) {
      } elsif ($self->{state} == ENTITY_DATA_STATE) {
        ## (cannot happen in CDATA state)
-       my $token = $self->_tokenize_attempt_to_consume_an_entity (0);
+       my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
        $self->{state} = DATA_STATE;
        # next-input-character is already done
-Line 739 
 sub _get_next_token ($) {
+Line 744 
 sub _get_next_token ($) {
          redo A;
        } else {
+         if ({
+0x0022 => 1, # "
+0x0027 => 1, # '
+0x003D => 1, # =
+             }->{$self->{next_input_character}}) {
+           !!!parse-error (type => 'bad attribute name');
+         }
          $self->{current_attribute} = {name => chr ($self->{next_input_character}),
                                value => ''};
          $self->{state} = ATTRIBUTE_NAME_STATE;
-Line 833 
 sub _get_next_token ($) {
+Line 845 
 sub _get_next_token ($) {
          redo A;
        } else {
+         if ($self->{next_input_character} == 0x0022 or # "
+             $self->{next_input_character} == 0x0027) { # '
+           !!!parse-error (type => 'bad attribute name');
+         }
          $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
          ## Stay in the state
          !!!next-input-character;
-Line 979 
 sub _get_next_token ($) {
+Line 995 
 sub _get_next_token ($) {
          redo A;
        } else {
+         if ($self->{next_input_character} == 0x003D) { # =
+           !!!parse-error (type => 'bad attribute value');
+         }
          $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
          $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
          !!!next-input-character;
-Line 986 
 sub _get_next_token ($) {
+Line 1005 
 sub _get_next_token ($) {
        }
      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
        if ($self->{next_input_character} == 0x0022) { # "
-         $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
+         $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x0026) { # &
-Line 1022 
 sub _get_next_token ($) {
+Line 1041 
 sub _get_next_token ($) {
        }
      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
        if ($self->{next_input_character} == 0x0027) { # '
-         $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
+         $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x0026) { # &
-Line 1110 
 sub _get_next_token ($) {
+Line 1129 
 sub _get_next_token ($) {
          redo A;
        } else {
+         if ({
+0x0022 => 1, # "
+0x0027 => 1, # '
+0x003D => 1, # =
+             }->{$self->{next_input_character}}) {
+           !!!parse-error (type => 'bad attribute value');
+         }
          $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
          ## Stay in the state
          !!!next-input-character;
          redo A;
        }
      } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
-       my $token = $self->_tokenize_attempt_to_consume_an_entity (1);
+       my $token = $self->_tokenize_attempt_to_consume_an_entity
+           (1,
+            $self->{last_attribute_value_state}
+              == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
+            $self->{last_attribute_value_state}
+              == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
+            -1);
        unless (defined $token) {
          $self->{current_attribute}->{value} .= '&';
-Line 1129 
 sub _get_next_token ($) {
+Line 1161 
 sub _get_next_token ($) {
        $self->{state} = $self->{last_attribute_value_state};
        # next-input-character is already done
        redo A;
+     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
+       if ($self->{next_input_character} == 0x0009 or # HT
+           $self->{next_input_character} == 0x000A or # LF
+           $self->{next_input_character} == 0x000B or # VT
+           $self->{next_input_character} == 0x000C or # FF
+           $self->{next_input_character} == 0x0020) { # SP
+         $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         if ($self->{current_token}->{type} == START_TAG_TOKEN) {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
+           $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
+         } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
+           $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
+           if ($self->{current_token}->{attributes}) {
+             !!!parse-error (type => 'end tag attribute');
+           }
+         } else {
+           die "$0: $self->{current_token}->{type}: Unknown token type";
+         }
+         $self->{state} = DATA_STATE;
+         !!!next-input-character;
+         !!!emit ($self->{current_token}); # start tag or end tag
+         redo A;
+       } elsif ($self->{next_input_character} == 0x002F) { # /
+         !!!next-input-character;
+         if ($self->{next_input_character} == 0x003E and # >
+             $self->{current_token}->{type} == START_TAG_TOKEN and
+             $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
+           # permitted slash
+           #
+         } else {
+           !!!parse-error (type => 'nestc');
+         }
+         $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
+         # next-input-character is already done
+         redo A;
+       } else {
+         !!!parse-error (type => 'no space between attributes');
+         $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
+         ## reconsume
+         redo A;
+       }
      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
        ## (only happen if PCDATA state)
-Line 1368 
 sub _get_next_token ($) {
+Line 1447 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          !!!next-input-character;
-         !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
+         !!!emit ({type => DOCTYPE_TOKEN, quirks => 1});
          redo A;
        } elsif ($self->{next_input_character} == -1) {
-Line 1376 
 sub _get_next_token ($) {
+Line 1455 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          ## reconsume
-         !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
+         !!!emit ({type => DOCTYPE_TOKEN, quirks => 1});
          redo A;
        } else {
          $self->{current_token}
              = {type => DOCTYPE_TOKEN,
                 name => chr ($self->{next_input_character}),
-                correct => 1};
+                #quirks => 0,
+               };
  ## ISSUE: "Set the token's name name to the" in the spec
          $self->{state} = DOCTYPE_NAME_STATE;
          !!!next-input-character;
-Line 1411 
 sub _get_next_token ($) {
+Line 1491 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-Line 1443 
 sub _get_next_token ($) {
+Line 1523 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-Line 1507 
 sub _get_next_token ($) {
+Line 1587 
 sub _get_next_token ($) {
        }
        !!!parse-error (type => 'string after DOCTYPE name');
+       $self->{current_token}->{quirks} = 1;
        $self->{state} = BOGUS_DOCTYPE_STATE;
        # next-input-character is already done
        redo A;
-Line 1534 
 sub _get_next_token ($) {
+Line 1616 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          !!!next-input-character;
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-Line 1544 
 sub _get_next_token ($) {
+Line 1626 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } else {
          !!!parse-error (type => 'string after PUBLIC');
+         $self->{current_token}->{quirks} = 1;
          $self->{state} = BOGUS_DOCTYPE_STATE;
          !!!next-input-character;
          redo A;
-Line 1565 
 sub _get_next_token ($) {
+Line 1649 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          !!!next-input-character;
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-Line 1575 
 sub _get_next_token ($) {
+Line 1659 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-Line 1597 
 sub _get_next_token ($) {
+Line 1681 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          !!!next-input-character;
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-Line 1607 
 sub _get_next_token ($) {
+Line 1691 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-Line 1649 
 sub _get_next_token ($) {
+Line 1733 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } else {
          !!!parse-error (type => 'string after PUBLIC literal');
+         $self->{current_token}->{quirks} = 1;
          $self->{state} = BOGUS_DOCTYPE_STATE;
          !!!next-input-character;
          redo A;
-Line 1682 
 sub _get_next_token ($) {
+Line 1768 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          !!!next-input-character;
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-Line 1692 
 sub _get_next_token ($) {
+Line 1778 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } else {
          !!!parse-error (type => 'string after SYSTEM');
+         $self->{current_token}->{quirks} = 1;
          $self->{state} = BOGUS_DOCTYPE_STATE;
          !!!next-input-character;
          redo A;
-Line 1713 
 sub _get_next_token ($) {
+Line 1801 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          !!!next-input-character;
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-Line 1723 
 sub _get_next_token ($) {
+Line 1811 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-Line 1745 
 sub _get_next_token ($) {
+Line 1833 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          !!!next-input-character;
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-Line 1755 
 sub _get_next_token ($) {
+Line 1843 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-Line 1787 
 sub _get_next_token ($) {
+Line 1875 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } else {
          !!!parse-error (type => 'string after SYSTEM literal');
+         #$self->{current_token}->{quirks} = 1;
          $self->{state} = BOGUS_DOCTYPE_STATE;
          !!!next-input-character;
          redo A;
-Line 1802 
 sub _get_next_token ($) {
+Line 1892 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          !!!next-input-character;
-         delete $self->{current_token}->{correct};
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-Line 1811 
 sub _get_next_token ($) {
+Line 1900 
 sub _get_next_token ($) {
          $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-Line 1828 
 sub _get_next_token ($) {
+Line 1916 
 sub _get_next_token ($) {
    die "$0: _get_next_token: unexpected case";
  } # _get_next_token
- sub _tokenize_attempt_to_consume_an_entity ($$) {
+ sub _tokenize_attempt_to_consume_an_entity ($$$) {
-   my ($self, $in_attr) = @_;
+   my ($self, $in_attr, $additional) = @_;
    if ({
 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
 0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
+        $additional => 1,
        }->{$self->{next_input_character}}) {
      ## Don't consume
      ## No error
-Line 2066 
 sub _tree_construction_initial ($) {
+Line 2155 
 sub _tree_construction_initial ($) {
        ## ISSUE: internalSubset = null??
        $self->{document}->append_child ($doctype);
-       if (not $token->{correct} or $doctype_name ne 'HTML') {
+       if ($token->{quirks} or $doctype_name ne 'HTML') {
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif (defined $token->{public_identifier}) {
          my $pubid = $token->{public_identifier};
-Line 2120 
 sub _tree_construction_initial ($) {
+Line 2209 
 sub _tree_construction_initial ($) {
            "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
            "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
            "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
+           "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//EN" => 1,
+           "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//EN" => 1,
+           "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//EN" => 1,
            "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
            "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
            "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
-Line 5464 
 sub set_inner_html ($$$) {
+Line 5556 
 sub set_inner_html ($$$) {
      $p->_initialize_tree_constructor;
      ## Step 2
-     my $node_ln = $node->local_name;
+     my $node_ln = $node->manakai_local_name;
      $p->{content_model} = {
        title => RCDATA_CONTENT_MODEL,
        textarea => RCDATA_CONTENT_MODEL,
-Line 5504 
 sub set_inner_html ($$$) {
+Line 5596 
 sub set_inner_html ($$$) {
        if ($anode->node_type == 1) {
          my $nsuri = $anode->namespace_uri;
          if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
-           if ($anode->local_name eq 'form') { ## TODO: case?
+           if ($anode->manakai_local_name eq 'form') {
              $p->{form_element} = $anode;
              last AN;
            }

 Legend:



Removed from v.1.70
 


changed lines


 
Added in v.1.75
 Legend:



Removed from v.1.70
 


changed lines


 
Added in v.1.75
-Removed from v.1.70
+Added in v.1.75

admin@suikawiki.org	ViewVC Help
Powered by ViewVC 1.1.24