/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch

Left: revision 1.25 by wakaba, Sun Jun 24 05:12:11 2007 UTC | Right: revision 1.26 by wakaba, Sun Jun 24 06:20:37 2007 UTC
# Line 247  sub _get_next_token ($) { Line 247  sub _get_next_token ($) {
247      } elsif ($self->{state} eq 'entity data') {      } elsif ($self->{state} eq 'entity data') {
248        ## (cannot happen in CDATA state)        ## (cannot happen in CDATA state)
249                
250        my $token = $self->_tokenize_attempt_to_consume_an_entity;        my $token = $self->_tokenize_attempt_to_consume_an_entity (0);
251    
252        $self->{state} = 'data';        $self->{state} = 'data';
253        # next-input-character is already done        # next-input-character is already done
# Line 899  sub _get_next_token ($) { Line 899  sub _get_next_token ($) {
899          redo A;          redo A;
900        }        }
901      } elsif ($self->{state} eq 'entity in attribute value') {      } elsif ($self->{state} eq 'entity in attribute value') {
902        my $token = $self->_tokenize_attempt_to_consume_an_entity;        my $token = $self->_tokenize_attempt_to_consume_an_entity (1);
903    
904        unless (defined $token) {        unless (defined $token) {
905          $self->{current_attribute}->{value} .= '&';          $self->{current_attribute}->{value} .= '&';
# Line 1409  sub _get_next_token ($) { Line 1409  sub _get_next_token ($) {
1409          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
1410    
1411          $self->{state} = 'data';          $self->{state} = 'data';
1412          ## recomsume          ## reconsume
1413    
1414          delete $self->{current_token}->{correct};          delete $self->{current_token}->{correct};
1415          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{current_token}); # DOCTYPE
# Line 1452  sub _get_next_token ($) { Line 1452  sub _get_next_token ($) {
1452          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
1453    
1454          $self->{state} = 'data';          $self->{state} = 'data';
1455          ## recomsume          ## reconsume
1456    
1457          delete $self->{current_token}->{correct};          delete $self->{current_token}->{correct};
1458          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{current_token}); # DOCTYPE
# Line 1527  sub _get_next_token ($) { Line 1527  sub _get_next_token ($) {
1527          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
1528    
1529          $self->{state} = 'data';          $self->{state} = 'data';
1530          ## recomsume          ## reconsume
1531    
1532          delete $self->{current_token}->{correct};          delete $self->{current_token}->{correct};
1533          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{current_token}); # DOCTYPE
# Line 1570  sub _get_next_token ($) { Line 1570  sub _get_next_token ($) {
1570    die "$0: _get_next_token: unexpected case";    die "$0: _get_next_token: unexpected case";
1571  } # _get_next_token  } # _get_next_token
1572    
1573  sub _tokenize_attempt_to_consume_an_entity ($) {  sub _tokenize_attempt_to_consume_an_entity ($$) {
1574    my $self = shift;    my ($self, $in_attr) = @_;
1575    
1576    if ({    if ({
1577         0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,         0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
# Line 1584  sub _tokenize_attempt_to_consume_an_enti Line 1584  sub _tokenize_attempt_to_consume_an_enti
1584      !!!next-input-character;      !!!next-input-character;
1585      if ($self->{next_input_character} == 0x0078 or # x      if ($self->{next_input_character} == 0x0078 or # x
1586          $self->{next_input_character} == 0x0058) { # X          $self->{next_input_character} == 0x0058) { # X
1587        my $num;        my $code;
1588        X: {        X: {
1589          my $x_char = $self->{next_input_character};          my $x_char = $self->{next_input_character};
1590          !!!next-input-character;          !!!next-input-character;
1591          if (0x0030 <= $self->{next_input_character} and          if (0x0030 <= $self->{next_input_character} and
1592              $self->{next_input_character} <= 0x0039) { # 0..9              $self->{next_input_character} <= 0x0039) { # 0..9
1593            $num ||= 0;            $code ||= 0;
1594            $num *= 0x10;            $code *= 0x10;
1595            $num += $self->{next_input_character} - 0x0030;            $code += $self->{next_input_character} - 0x0030;
1596            redo X;            redo X;
1597          } elsif (0x0061 <= $self->{next_input_character} and          } elsif (0x0061 <= $self->{next_input_character} and
1598                   $self->{next_input_character} <= 0x0066) { # a..f                   $self->{next_input_character} <= 0x0066) { # a..f
1599            ## ISSUE: the spec says U+0078, which is apparently incorrect            $code ||= 0;
1600            $num ||= 0;            $code *= 0x10;
1601            $num *= 0x10;            $code += $self->{next_input_character} - 0x0060 + 9;
           $num += $self->{next_input_character} - 0x0060 + 9;  
1602            redo X;            redo X;
1603          } elsif (0x0041 <= $self->{next_input_character} and          } elsif (0x0041 <= $self->{next_input_character} and
1604                   $self->{next_input_character} <= 0x0046) { # A..F                   $self->{next_input_character} <= 0x0046) { # A..F
1605            ## ISSUE: the spec says U+0058, which is apparently incorrect            $code ||= 0;
1606            $num ||= 0;            $code *= 0x10;
1607            $num *= 0x10;            $code += $self->{next_input_character} - 0x0040 + 9;
           $num += $self->{next_input_character} - 0x0040 + 9;  
1608            redo X;            redo X;
1609          } elsif (not defined $num) { # no hexadecimal digit          } elsif (not defined $code) { # no hexadecimal digit
1610            !!!parse-error (type => 'bare hcro');            !!!parse-error (type => 'bare hcro');
1611            $self->{next_input_character} = 0x0023; # #            $self->{next_input_character} = 0x0023; # #
1612            !!!back-next-input-character ($x_char);            !!!back-next-input-character ($x_char);
# Line 1619  sub _tokenize_attempt_to_consume_an_enti Line 1617  sub _tokenize_attempt_to_consume_an_enti
1617            !!!parse-error (type => 'no refc');            !!!parse-error (type => 'no refc');
1618          }          }
1619    
1620          ## TODO: check the definition for |a valid Unicode character|.          if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
1621          ## <http://lists.whatwg.org/pipermail/whatwg-whatwg.org/2006-December/thread.html#8189>            !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
1622          if ($num > 1114111 or $num == 0) {            $code = 0xFFFD;
1623            $num = 0xFFFD; # REPLACEMENT CHARACTER          } elsif ($code > 0x10FFFF) {
1624            ## ISSUE: Why this is not an error?            !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
1625          } elsif (0x80 <= $num and $num <= 0x9F) {            $code = 0xFFFD;
1626            !!!parse-error (type => sprintf 'c1 entity:U+%04X', $num);          } elsif ($code == 0x000D) {
1627            $num = $c1_entity_char->{$num};            !!!parse-error (type => 'CR character reference');
1628              $code = 0x000A;
1629            } elsif (0x80 <= $code and $code <= 0x9F) {
1630              !!!parse-error (type => sprintf 'c1 entity:U+%04X', $code);
1631              $code = $c1_entity_char->{$code};
1632          }          }
1633    
1634          return {type => 'character', data => chr $num};          return {type => 'character', data => chr $code};
1635        } # X        } # X
1636      } elsif (0x0030 <= $self->{next_input_character} and      } elsif (0x0030 <= $self->{next_input_character} and
1637               $self->{next_input_character} <= 0x0039) { # 0..9               $self->{next_input_character} <= 0x0039) { # 0..9
# Line 1650  sub _tokenize_attempt_to_consume_an_enti Line 1652  sub _tokenize_attempt_to_consume_an_enti
1652          !!!parse-error (type => 'no refc');          !!!parse-error (type => 'no refc');
1653        }        }
1654    
1655        ## TODO: check the definition for |a valid Unicode character|.        if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
1656        if ($code > 1114111 or $code == 0) {          !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
1657          $code = 0xFFFD; # REPLACEMENT CHARACTER          $code = 0xFFFD;
1658          ## ISSUE: Why this is not an error?        } elsif ($code > 0x10FFFF) {
1659            !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
1660            $code = 0xFFFD;
1661          } elsif ($code == 0x000D) {
1662            !!!parse-error (type => 'CR character reference');
1663            $code = 0x000A;
1664        } elsif (0x80 <= $code and $code <= 0x9F) {        } elsif (0x80 <= $code and $code <= 0x9F) {
1665          !!!parse-error (type => sprintf 'c1 entity:U+%04X', $code);          !!!parse-error (type => sprintf 'c1 entity:U+%04X', $code);
1666          $code = $c1_entity_char->{$code};          $code = $c1_entity_char->{$code};
# Line 1689  sub _tokenize_attempt_to_consume_an_enti Line 1696  sub _tokenize_attempt_to_consume_an_enti
1696              $self->{next_input_character} == 0x003B)) { # ;              $self->{next_input_character} == 0x003B)) { # ;
1697        $entity_name .= chr $self->{next_input_character};        $entity_name .= chr $self->{next_input_character};
1698        if (defined $EntityChar->{$entity_name}) {        if (defined $EntityChar->{$entity_name}) {
         $value = $EntityChar->{$entity_name};  
1699          if ($self->{next_input_character} == 0x003B) { # ;          if ($self->{next_input_character} == 0x003B) { # ;
1700              $value = $EntityChar->{$entity_name};
1701            $match = 1;            $match = 1;
1702            !!!next-input-character;            !!!next-input-character;
1703            last;            last;
1704          } else {          } elsif (not $in_attr) {
1705              $value = $EntityChar->{$entity_name};
1706            $match = -1;            $match = -1;
1707            } else {
1708              $value .= chr $self->{next_input_character};
1709          }          }
1710        } else {        } else {
1711          $value .= chr $self->{next_input_character};          $value .= chr $self->{next_input_character};
# Line 1711  sub _tokenize_attempt_to_consume_an_enti Line 1721  sub _tokenize_attempt_to_consume_an_enti
1721      } else {      } else {
1722        !!!parse-error (type => 'bare ero');        !!!parse-error (type => 'bare ero');
1723        ## NOTE: No characters are consumed in the spec.        ## NOTE: No characters are consumed in the spec.
1724        !!!back-token ({type => 'character', data => $value});        return {type => 'character', data => '&'.$value};
       return undef;  
1725      }      }
1726    } else {    } else {
1727      ## no characters are consumed      ## no characters are consumed
# Line 1907  sub _tree_construction_initial ($) { Line 1916  sub _tree_construction_initial ($) {
1916      } elsif ($token->{type} eq 'character') {      } elsif ($token->{type} eq 'character') {
1917        if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D        if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
1918          ## Ignore the token          ## Ignore the token
1919    
1920          unless (length $token->{data}) {          unless (length $token->{data}) {
1921            ## Stay in the phase            ## Stay in the phase
1922            !!!next-token;            !!!next-token;
# Line 1949  sub _tree_construction_root_element ($) Line 1959  sub _tree_construction_root_element ($)
1959          !!!next-token;          !!!next-token;
1960          redo B;          redo B;
1961        } elsif ($token->{type} eq 'character') {        } elsif ($token->{type} eq 'character') {
1962          if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {          if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
1963            $self->{document}->manakai_append_text ($1);            ## Ignore the token.
1964            ## ISSUE: DOM3 Core does not allow Document > Text  
1965            unless (length $token->{data}) {            unless (length $token->{data}) {
1966              ## Stay in the phase              ## Stay in the phase
1967              !!!next-token;              !!!next-token;
# Line 2451  sub _tree_construction_main ($) { Line 2461  sub _tree_construction_main ($) {
2461          !!!insert-element-t ($token->{tag_name}, $token->{attributes});          !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2462          pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.          pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2463          !!!next-token;          !!!next-token;
2464            ## TODO: Extracting |charset| from |meta|.
2465          return;          return;
2466        } elsif ($token->{tag_name} eq 'title') {        } elsif ($token->{tag_name} eq 'title') {
2467          !!!parse-error (type => 'in body:title');          !!!parse-error (type => 'in body:title');
# Line 3350  sub _tree_construction_main ($) { Line 3361  sub _tree_construction_main ($) {
3361                }                }
3362                !!!insert-element ($token->{tag_name}, $token->{attributes});                !!!insert-element ($token->{tag_name}, $token->{attributes});
3363                pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.                pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3364                  ## TODO: Extracting |charset| from |meta|.
3365                pop @{$self->{open_elements}}                pop @{$self->{open_elements}}
3366                    if $self->{insertion_mode} eq 'after head';                    if $self->{insertion_mode} eq 'after head';
3367                !!!next-token;                !!!next-token;
# Line 5310  sub get_inner_html ($$$) { Line 5322  sub get_inner_html ($$$) {
5322        if (not $in_cdata and {        if (not $in_cdata and {
5323          style => 1, script => 1, xmp => 1, iframe => 1,          style => 1, script => 1, xmp => 1, iframe => 1,
5324          noembed => 1, noframes => 1, noscript => 1,          noembed => 1, noframes => 1, noscript => 1,
5325            plaintext => 1,
5326        }->{$tag_name}) {        }->{$tag_name}) {
5327          unshift @node, 'cdata-out';          unshift @node, 'cdata-out';
5328          $in_cdata = 1;          $in_cdata = 1;

Legend:
Removed from v.1.25  
Changed lines
  Added in v.1.26

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24