247 |
} elsif ($self->{state} eq 'entity data') { |
} elsif ($self->{state} eq 'entity data') { |
248 |
## (cannot happen in CDATA state) |
## (cannot happen in CDATA state) |
249 |
|
|
250 |
my $token = $self->_tokenize_attempt_to_consume_an_entity; |
my $token = $self->_tokenize_attempt_to_consume_an_entity (0); |
251 |
|
|
252 |
$self->{state} = 'data'; |
$self->{state} = 'data'; |
253 |
# next-input-character is already done |
# next-input-character is already done |
899 |
redo A; |
redo A; |
900 |
} |
} |
901 |
} elsif ($self->{state} eq 'entity in attribute value') { |
} elsif ($self->{state} eq 'entity in attribute value') { |
902 |
my $token = $self->_tokenize_attempt_to_consume_an_entity; |
my $token = $self->_tokenize_attempt_to_consume_an_entity (1); |
903 |
|
|
904 |
unless (defined $token) { |
unless (defined $token) { |
905 |
$self->{current_attribute}->{value} .= '&'; |
$self->{current_attribute}->{value} .= '&'; |
1409 |
!!!parse-error (type => 'unclosed DOCTYPE'); |
!!!parse-error (type => 'unclosed DOCTYPE'); |
1410 |
|
|
1411 |
$self->{state} = 'data'; |
$self->{state} = 'data'; |
1412 |
## recomsume |
## reconsume |
1413 |
|
|
1414 |
delete $self->{current_token}->{correct}; |
delete $self->{current_token}->{correct}; |
1415 |
!!!emit ($self->{current_token}); # DOCTYPE |
!!!emit ($self->{current_token}); # DOCTYPE |
1452 |
!!!parse-error (type => 'unclosed DOCTYPE'); |
!!!parse-error (type => 'unclosed DOCTYPE'); |
1453 |
|
|
1454 |
$self->{state} = 'data'; |
$self->{state} = 'data'; |
1455 |
## recomsume |
## reconsume |
1456 |
|
|
1457 |
delete $self->{current_token}->{correct}; |
delete $self->{current_token}->{correct}; |
1458 |
!!!emit ($self->{current_token}); # DOCTYPE |
!!!emit ($self->{current_token}); # DOCTYPE |
1527 |
!!!parse-error (type => 'unclosed DOCTYPE'); |
!!!parse-error (type => 'unclosed DOCTYPE'); |
1528 |
|
|
1529 |
$self->{state} = 'data'; |
$self->{state} = 'data'; |
1530 |
## recomsume |
## reconsume |
1531 |
|
|
1532 |
delete $self->{current_token}->{correct}; |
delete $self->{current_token}->{correct}; |
1533 |
!!!emit ($self->{current_token}); # DOCTYPE |
!!!emit ($self->{current_token}); # DOCTYPE |
1570 |
die "$0: _get_next_token: unexpected case"; |
die "$0: _get_next_token: unexpected case"; |
1571 |
} # _get_next_token |
} # _get_next_token |
1572 |
|
|
1573 |
sub _tokenize_attempt_to_consume_an_entity ($) { |
sub _tokenize_attempt_to_consume_an_entity ($$) { |
1574 |
my $self = shift; |
my ($self, $in_attr) = @_; |
1575 |
|
|
1576 |
if ({ |
if ({ |
1577 |
0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF, |
0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF, |
1584 |
!!!next-input-character; |
!!!next-input-character; |
1585 |
if ($self->{next_input_character} == 0x0078 or # x |
if ($self->{next_input_character} == 0x0078 or # x |
1586 |
$self->{next_input_character} == 0x0058) { # X |
$self->{next_input_character} == 0x0058) { # X |
1587 |
my $num; |
my $code; |
1588 |
X: { |
X: { |
1589 |
my $x_char = $self->{next_input_character}; |
my $x_char = $self->{next_input_character}; |
1590 |
!!!next-input-character; |
!!!next-input-character; |
1591 |
if (0x0030 <= $self->{next_input_character} and |
if (0x0030 <= $self->{next_input_character} and |
1592 |
$self->{next_input_character} <= 0x0039) { # 0..9 |
$self->{next_input_character} <= 0x0039) { # 0..9 |
1593 |
$num ||= 0; |
$code ||= 0; |
1594 |
$num *= 0x10; |
$code *= 0x10; |
1595 |
$num += $self->{next_input_character} - 0x0030; |
$code += $self->{next_input_character} - 0x0030; |
1596 |
redo X; |
redo X; |
1597 |
} elsif (0x0061 <= $self->{next_input_character} and |
} elsif (0x0061 <= $self->{next_input_character} and |
1598 |
$self->{next_input_character} <= 0x0066) { # a..f |
$self->{next_input_character} <= 0x0066) { # a..f |
1599 |
## ISSUE: the spec says U+0078, which is apparently incorrect |
$code ||= 0; |
1600 |
$num ||= 0; |
$code *= 0x10; |
1601 |
$num *= 0x10; |
$code += $self->{next_input_character} - 0x0060 + 9; |
|
$num += $self->{next_input_character} - 0x0060 + 9; |
|
1602 |
redo X; |
redo X; |
1603 |
} elsif (0x0041 <= $self->{next_input_character} and |
} elsif (0x0041 <= $self->{next_input_character} and |
1604 |
$self->{next_input_character} <= 0x0046) { # A..F |
$self->{next_input_character} <= 0x0046) { # A..F |
1605 |
## ISSUE: the spec says U+0058, which is apparently incorrect |
$code ||= 0; |
1606 |
$num ||= 0; |
$code *= 0x10; |
1607 |
$num *= 0x10; |
$code += $self->{next_input_character} - 0x0040 + 9; |
|
$num += $self->{next_input_character} - 0x0040 + 9; |
|
1608 |
redo X; |
redo X; |
1609 |
} elsif (not defined $num) { # no hexadecimal digit |
} elsif (not defined $code) { # no hexadecimal digit |
1610 |
!!!parse-error (type => 'bare hcro'); |
!!!parse-error (type => 'bare hcro'); |
1611 |
$self->{next_input_character} = 0x0023; # # |
$self->{next_input_character} = 0x0023; # # |
1612 |
!!!back-next-input-character ($x_char); |
!!!back-next-input-character ($x_char); |
1617 |
!!!parse-error (type => 'no refc'); |
!!!parse-error (type => 'no refc'); |
1618 |
} |
} |
1619 |
|
|
1620 |
## TODO: check the definition for |a valid Unicode character|. |
if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) { |
1621 |
## <http://lists.whatwg.org/pipermail/whatwg-whatwg.org/2006-December/thread.html#8189> |
!!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code); |
1622 |
if ($num > 1114111 or $num == 0) { |
$code = 0xFFFD; |
1623 |
$num = 0xFFFD; # REPLACEMENT CHARACTER |
} elsif ($code > 0x10FFFF) { |
1624 |
## ISSUE: Why this is not an error? |
!!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code); |
1625 |
} elsif (0x80 <= $num and $num <= 0x9F) { |
$code = 0xFFFD; |
1626 |
!!!parse-error (type => sprintf 'c1 entity:U+%04X', $num); |
} elsif ($code == 0x000D) { |
1627 |
$num = $c1_entity_char->{$num}; |
!!!parse-error (type => 'CR character reference'); |
1628 |
|
$code = 0x000A; |
1629 |
|
} elsif (0x80 <= $code and $code <= 0x9F) { |
1630 |
|
!!!parse-error (type => sprintf 'c1 entity:U+%04X', $code); |
1631 |
|
$code = $c1_entity_char->{$code}; |
1632 |
} |
} |
1633 |
|
|
1634 |
return {type => 'character', data => chr $num}; |
return {type => 'character', data => chr $code}; |
1635 |
} # X |
} # X |
1636 |
} elsif (0x0030 <= $self->{next_input_character} and |
} elsif (0x0030 <= $self->{next_input_character} and |
1637 |
$self->{next_input_character} <= 0x0039) { # 0..9 |
$self->{next_input_character} <= 0x0039) { # 0..9 |
1652 |
!!!parse-error (type => 'no refc'); |
!!!parse-error (type => 'no refc'); |
1653 |
} |
} |
1654 |
|
|
1655 |
## TODO: check the definition for |a valid Unicode character|. |
if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) { |
1656 |
if ($code > 1114111 or $code == 0) { |
!!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code); |
1657 |
$code = 0xFFFD; # REPLACEMENT CHARACTER |
$code = 0xFFFD; |
1658 |
## ISSUE: Why this is not an error? |
} elsif ($code > 0x10FFFF) { |
1659 |
|
!!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code); |
1660 |
|
$code = 0xFFFD; |
1661 |
|
} elsif ($code == 0x000D) { |
1662 |
|
!!!parse-error (type => 'CR character reference'); |
1663 |
|
$code = 0x000A; |
1664 |
} elsif (0x80 <= $code and $code <= 0x9F) { |
} elsif (0x80 <= $code and $code <= 0x9F) { |
1665 |
!!!parse-error (type => sprintf 'c1 entity:U+%04X', $code); |
!!!parse-error (type => sprintf 'c1 entity:U+%04X', $code); |
1666 |
$code = $c1_entity_char->{$code}; |
$code = $c1_entity_char->{$code}; |
1696 |
$self->{next_input_character} == 0x003B)) { # ; |
$self->{next_input_character} == 0x003B)) { # ; |
1697 |
$entity_name .= chr $self->{next_input_character}; |
$entity_name .= chr $self->{next_input_character}; |
1698 |
if (defined $EntityChar->{$entity_name}) { |
if (defined $EntityChar->{$entity_name}) { |
|
$value = $EntityChar->{$entity_name}; |
|
1699 |
if ($self->{next_input_character} == 0x003B) { # ; |
if ($self->{next_input_character} == 0x003B) { # ; |
1700 |
|
$value = $EntityChar->{$entity_name}; |
1701 |
$match = 1; |
$match = 1; |
1702 |
!!!next-input-character; |
!!!next-input-character; |
1703 |
last; |
last; |
1704 |
} else { |
} elsif (not $in_attr) { |
1705 |
|
$value = $EntityChar->{$entity_name}; |
1706 |
$match = -1; |
$match = -1; |
1707 |
|
} else { |
1708 |
|
$value .= chr $self->{next_input_character}; |
1709 |
} |
} |
1710 |
} else { |
} else { |
1711 |
$value .= chr $self->{next_input_character}; |
$value .= chr $self->{next_input_character}; |
1721 |
} else { |
} else { |
1722 |
!!!parse-error (type => 'bare ero'); |
!!!parse-error (type => 'bare ero'); |
1723 |
## NOTE: No characters are consumed in the spec. |
## NOTE: No characters are consumed in the spec. |
1724 |
!!!back-token ({type => 'character', data => $value}); |
return {type => 'character', data => '&'.$value}; |
|
return undef; |
|
1725 |
} |
} |
1726 |
} else { |
} else { |
1727 |
## no characters are consumed |
## no characters are consumed |
1916 |
} elsif ($token->{type} eq 'character') { |
} elsif ($token->{type} eq 'character') { |
1917 |
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D |
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D |
1918 |
## Ignore the token |
## Ignore the token |
1919 |
|
|
1920 |
unless (length $token->{data}) { |
unless (length $token->{data}) { |
1921 |
## Stay in the phase |
## Stay in the phase |
1922 |
!!!next-token; |
!!!next-token; |
1959 |
!!!next-token; |
!!!next-token; |
1960 |
redo B; |
redo B; |
1961 |
} elsif ($token->{type} eq 'character') { |
} elsif ($token->{type} eq 'character') { |
1962 |
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { |
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D |
1963 |
$self->{document}->manakai_append_text ($1); |
## Ignore the token. |
1964 |
## ISSUE: DOM3 Core does not allow Document > Text |
|
1965 |
unless (length $token->{data}) { |
unless (length $token->{data}) { |
1966 |
## Stay in the phase |
## Stay in the phase |
1967 |
!!!next-token; |
!!!next-token; |
2461 |
!!!insert-element-t ($token->{tag_name}, $token->{attributes}); |
!!!insert-element-t ($token->{tag_name}, $token->{attributes}); |
2462 |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
2463 |
!!!next-token; |
!!!next-token; |
2464 |
|
## TODO: Extracting |charset| from |meta|. |
2465 |
return; |
return; |
2466 |
} elsif ($token->{tag_name} eq 'title') { |
} elsif ($token->{tag_name} eq 'title') { |
2467 |
!!!parse-error (type => 'in body:title'); |
!!!parse-error (type => 'in body:title'); |
3361 |
} |
} |
3362 |
!!!insert-element ($token->{tag_name}, $token->{attributes}); |
!!!insert-element ($token->{tag_name}, $token->{attributes}); |
3363 |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
3364 |
|
## TODO: Extracting |charset| from |meta|. |
3365 |
pop @{$self->{open_elements}} |
pop @{$self->{open_elements}} |
3366 |
if $self->{insertion_mode} eq 'after head'; |
if $self->{insertion_mode} eq 'after head'; |
3367 |
!!!next-token; |
!!!next-token; |
5322 |
if (not $in_cdata and { |
if (not $in_cdata and { |
5323 |
style => 1, script => 1, xmp => 1, iframe => 1, |
style => 1, script => 1, xmp => 1, iframe => 1, |
5324 |
noembed => 1, noframes => 1, noscript => 1, |
noembed => 1, noframes => 1, noscript => 1, |
5325 |
|
plaintext => 1, |
5326 |
}->{$tag_name}) { |
}->{$tag_name}) { |
5327 |
unshift @node, 'cdata-out'; |
unshift @node, 'cdata-out'; |
5328 |
$in_cdata = 1; |
$in_cdata = 1; |