| 97 |
$self->{input_encoding} = lc $charset; ## TODO: normalize name |
$self->{input_encoding} = lc $charset; ## TODO: normalize name |
| 98 |
$self->{confident} = 1; |
$self->{confident} = 1; |
| 99 |
} else { |
} else { |
| 100 |
$charset = 'windows-1252'; ## TODO: for now. |
## TODO: Implement HTML5 detection algorithm |
| 101 |
|
require Whatpm::Charset::UniversalCharDet; |
| 102 |
|
$charset = Whatpm::Charset::UniversalCharDet->detect_byte_string |
| 103 |
|
(substr ($$bytes_s, 0, 1024)); |
| 104 |
|
$charset ||= 'windows-1252'; |
| 105 |
$s = \ (Encode::decode ($charset, $$bytes_s)); |
$s = \ (Encode::decode ($charset, $$bytes_s)); |
| 106 |
$self->{input_encoding} = $charset; |
$self->{input_encoding} = $charset; |
| 107 |
$self->{confident} = 0; |
$self->{confident} = 0; |
| 340 |
## ->{system_identifier} (DOCTYPE_TOKEN) |
## ->{system_identifier} (DOCTYPE_TOKEN) |
| 341 |
## ->{correct} == 1 or 0 (DOCTYPE_TOKEN) |
## ->{correct} == 1 or 0 (DOCTYPE_TOKEN) |
| 342 |
## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN) |
## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN) |
| 343 |
|
## ->{name} |
| 344 |
|
## ->{value} |
| 345 |
|
## ->{has_reference} == 1 or 0 |
| 346 |
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN) |
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN) |
| 347 |
|
|
| 348 |
## Emitted token MUST immediately be handled by the tree construction state. |
## Emitted token MUST immediately be handled by the tree construction state. |
| 1118 |
$self->{current_attribute}->{value} .= '&'; |
$self->{current_attribute}->{value} .= '&'; |
| 1119 |
} else { |
} else { |
| 1120 |
$self->{current_attribute}->{value} .= $token->{data}; |
$self->{current_attribute}->{value} .= $token->{data}; |
| 1121 |
|
$self->{current_attribute}->{has_reference} = $token->{has_reference}; |
| 1122 |
## ISSUE: spec says "append the returned character token to the current attribute's value" |
## ISSUE: spec says "append the returned character token to the current attribute's value" |
| 1123 |
} |
} |
| 1124 |
|
|
| 1555 |
$self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE; |
$self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE; |
| 1556 |
!!!next-input-character; |
!!!next-input-character; |
| 1557 |
redo A; |
redo A; |
| 1558 |
|
} elsif ($self->{next_input_character} == 0x003E) { # > |
| 1559 |
|
!!!parse-error (type => 'unclosed PUBLIC literal'); |
| 1560 |
|
|
| 1561 |
|
$self->{state} = DATA_STATE; |
| 1562 |
|
!!!next-input-character; |
| 1563 |
|
|
| 1564 |
|
delete $self->{current_token}->{correct}; |
| 1565 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
| 1566 |
|
|
| 1567 |
|
redo A; |
| 1568 |
} elsif ($self->{next_input_character} == -1) { |
} elsif ($self->{next_input_character} == -1) { |
| 1569 |
!!!parse-error (type => 'unclosed PUBLIC literal'); |
!!!parse-error (type => 'unclosed PUBLIC literal'); |
| 1570 |
|
|
| 1587 |
$self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE; |
$self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE; |
| 1588 |
!!!next-input-character; |
!!!next-input-character; |
| 1589 |
redo A; |
redo A; |
| 1590 |
|
} elsif ($self->{next_input_character} == 0x003E) { # > |
| 1591 |
|
!!!parse-error (type => 'unclosed PUBLIC literal'); |
| 1592 |
|
|
| 1593 |
|
$self->{state} = DATA_STATE; |
| 1594 |
|
!!!next-input-character; |
| 1595 |
|
|
| 1596 |
|
delete $self->{current_token}->{correct}; |
| 1597 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
| 1598 |
|
|
| 1599 |
|
redo A; |
| 1600 |
} elsif ($self->{next_input_character} == -1) { |
} elsif ($self->{next_input_character} == -1) { |
| 1601 |
!!!parse-error (type => 'unclosed PUBLIC literal'); |
!!!parse-error (type => 'unclosed PUBLIC literal'); |
| 1602 |
|
|
| 1703 |
$self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; |
$self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; |
| 1704 |
!!!next-input-character; |
!!!next-input-character; |
| 1705 |
redo A; |
redo A; |
| 1706 |
|
} elsif ($self->{next_input_character} == 0x003E) { # > |
| 1707 |
|
!!!parse-error (type => 'unclosed SYSTEM literal'); ## this is the SYSTEM identifier state, so report the SYSTEM literal as unclosed (same type as the EOF branch below) |
| 1708 |
|
|
| 1709 |
|
$self->{state} = DATA_STATE; |
| 1710 |
|
!!!next-input-character; |
| 1711 |
|
|
| 1712 |
|
delete $self->{current_token}->{correct}; ## remove the "correct" flag from the DOCTYPE token before it is emitted |
| 1713 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
| 1714 |
|
|
| 1715 |
|
redo A; |
| 1716 |
} elsif ($self->{next_input_character} == -1) { |
} elsif ($self->{next_input_character} == -1) { |
| 1717 |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
| 1718 |
|
|
| 1735 |
$self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; |
$self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; |
| 1736 |
!!!next-input-character; |
!!!next-input-character; |
| 1737 |
redo A; |
redo A; |
| 1738 |
|
} elsif ($self->{next_input_character} == 0x003E) { # > |
| 1739 |
|
!!!parse-error (type => 'unclosed SYSTEM literal'); ## this is the SYSTEM identifier state, so report the SYSTEM literal as unclosed (same type as the EOF branch below) |
| 1740 |
|
|
| 1741 |
|
$self->{state} = DATA_STATE; |
| 1742 |
|
!!!next-input-character; |
| 1743 |
|
|
| 1744 |
|
delete $self->{current_token}->{correct}; ## remove the "correct" flag from the DOCTYPE token before it is emitted |
| 1745 |
|
!!!emit ($self->{current_token}); # DOCTYPE |
| 1746 |
|
|
| 1747 |
|
redo A; |
| 1748 |
} elsif ($self->{next_input_character} == -1) { |
} elsif ($self->{next_input_character} == -1) { |
| 1749 |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
| 1750 |
|
|
| 1885 |
$code = $c1_entity_char->{$code}; |
$code = $c1_entity_char->{$code}; |
| 1886 |
} |
} |
| 1887 |
|
|
| 1888 |
return {type => CHARACTER_TOKEN, data => chr $code}; |
return {type => CHARACTER_TOKEN, data => chr $code, |
| 1889 |
|
has_reference => 1}; |
| 1890 |
} # X |
} # X |
| 1891 |
} elsif (0x0030 <= $self->{next_input_character} and |
} elsif (0x0030 <= $self->{next_input_character} and |
| 1892 |
$self->{next_input_character} <= 0x0039) { # 0..9 |
$self->{next_input_character} <= 0x0039) { # 0..9 |
| 1921 |
$code = $c1_entity_char->{$code}; |
$code = $c1_entity_char->{$code}; |
| 1922 |
} |
} |
| 1923 |
|
|
| 1924 |
return {type => CHARACTER_TOKEN, data => chr $code}; |
return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1}; |
| 1925 |
} else { |
} else { |
| 1926 |
!!!parse-error (type => 'bare nero'); |
!!!parse-error (type => 'bare nero'); |
| 1927 |
!!!back-next-input-character ($self->{next_input_character}); |
!!!back-next-input-character ($self->{next_input_character}); |
| 1969 |
} |
} |
| 1970 |
|
|
| 1971 |
if ($match > 0) { |
if ($match > 0) { |
| 1972 |
return {type => CHARACTER_TOKEN, data => $value}; |
return {type => CHARACTER_TOKEN, data => $value, has_reference => 1}; |
| 1973 |
} elsif ($match < 0) { |
} elsif ($match < 0) { |
| 1974 |
!!!parse-error (type => 'no refc'); |
!!!parse-error (type => 'no refc'); |
| 1975 |
if ($in_attr and $match < -1) { |
if ($in_attr and $match < -1) { |
| 1976 |
return {type => CHARACTER_TOKEN, data => '&'.$entity_name}; |
return {type => CHARACTER_TOKEN, data => '&'.$entity_name}; |
| 1977 |
} else { |
} else { |
| 1978 |
return {type => CHARACTER_TOKEN, data => $value}; |
return {type => CHARACTER_TOKEN, data => $value, has_reference => 1}; |
| 1979 |
} |
} |
| 1980 |
} else { |
} else { |
| 1981 |
!!!parse-error (type => 'bare ero'); |
!!!parse-error (type => 'bare ero'); |
| 1982 |
## NOTE: No characters are consumed in the spec. |
## NOTE: "No characters are consumed" in the spec. |
| 1983 |
return {type => CHARACTER_TOKEN, data => '&'.$value}; |
return {type => CHARACTER_TOKEN, data => '&'.$value}; |
| 1984 |
} |
} |
| 1985 |
} else { |
} else { |
| 2233 |
# |
# |
| 2234 |
} elsif ($token->{type} == START_TAG_TOKEN) { |
} elsif ($token->{type} == START_TAG_TOKEN) { |
| 2235 |
if ($token->{tag_name} eq 'html' and |
if ($token->{tag_name} eq 'html' and |
| 2236 |
$token->{attributes}->{manifest}) { ## ISSUE: Spec spells as "application" |
$token->{attributes}->{manifest}) { |
| 2237 |
$self->{application_cache_selection} |
$self->{application_cache_selection} |
| 2238 |
->($token->{attributes}->{manifest}->{value}); |
->($token->{attributes}->{manifest}->{value}); |
| 2239 |
## ISSUE: No relative reference resolution? |
## ISSUE: No relative reference resolution? |
| 2911 |
push @{$self->{open_elements}}, [$self->{head_element}, 'head']; |
push @{$self->{open_elements}}, [$self->{head_element}, 'head']; |
| 2912 |
} |
} |
| 2913 |
!!!insert-element ($token->{tag_name}, $token->{attributes}); |
!!!insert-element ($token->{tag_name}, $token->{attributes}); |
| 2914 |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
| 2915 |
|
|
| 2916 |
unless ($self->{confident}) { |
unless ($self->{confident}) { |
| 2917 |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
| 2918 |
$self->{change_encoding} |
$self->{change_encoding} |
| 2919 |
->($self, $token->{attributes}->{charset}->{value}); |
->($self, $token->{attributes}->{charset}->{value}); |
| 2920 |
|
|
| 2921 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'charset') |
| 2922 |
|
->set_user_data (manakai_has_reference => |
| 2923 |
|
$token->{attributes}->{charset} |
| 2924 |
|
->{has_reference}); |
| 2925 |
} elsif ($token->{attributes}->{content}) { |
} elsif ($token->{attributes}->{content}) { |
| 2926 |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
| 2927 |
if ($token->{attributes}->{content}->{value} |
if ($token->{attributes}->{content}->{value} |
| 2930 |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
| 2931 |
$self->{change_encoding} |
$self->{change_encoding} |
| 2932 |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3); |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3); |
| 2933 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'content') |
| 2934 |
|
->set_user_data (manakai_has_reference => |
| 2935 |
|
$token->{attributes}->{content} |
| 2936 |
|
->{has_reference}); |
| 2937 |
} |
} |
| 2938 |
} |
} |
| 2939 |
|
} else { |
| 2940 |
|
if ($token->{attributes}->{charset}) { |
| 2941 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'charset') |
| 2942 |
|
->set_user_data (manakai_has_reference => |
| 2943 |
|
$token->{attributes}->{charset} |
| 2944 |
|
->{has_reference}); |
| 2945 |
|
} |
| 2946 |
|
if ($token->{attributes}->{content}) { |
| 2947 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'content') |
| 2948 |
|
->set_user_data (manakai_has_reference => |
| 2949 |
|
$token->{attributes}->{content} |
| 2950 |
|
->{has_reference}); |
| 2951 |
|
} |
| 2952 |
} |
} |
| 2953 |
|
|
| 2954 |
pop @{$self->{open_elements}} |
pop @{$self->{open_elements}} |
| 4521 |
} elsif ($token->{tag_name} eq 'meta') { |
} elsif ($token->{tag_name} eq 'meta') { |
| 4522 |
## NOTE: This is an "as if in head" code clone, only "-t" differs |
## NOTE: This is an "as if in head" code clone, only "-t" differs |
| 4523 |
!!!insert-element-t ($token->{tag_name}, $token->{attributes}); |
!!!insert-element-t ($token->{tag_name}, $token->{attributes}); |
| 4524 |
pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
| 4525 |
|
|
| 4526 |
unless ($self->{confident}) { |
unless ($self->{confident}) { |
| 4527 |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
| 4528 |
$self->{change_encoding} |
$self->{change_encoding} |
| 4529 |
->($self, $token->{attributes}->{charset}->{value}); |
->($self, $token->{attributes}->{charset}->{value}); |
| 4530 |
|
|
| 4531 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'charset') |
| 4532 |
|
->set_user_data (manakai_has_reference => |
| 4533 |
|
$token->{attributes}->{charset} |
| 4534 |
|
->{has_reference}); |
| 4535 |
} elsif ($token->{attributes}->{content}) { |
} elsif ($token->{attributes}->{content}) { |
| 4536 |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition. |
| 4537 |
if ($token->{attributes}->{content}->{value} |
if ($token->{attributes}->{content}->{value} |
| 4540 |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
| 4541 |
$self->{change_encoding} |
$self->{change_encoding} |
| 4542 |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3); |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3); |
| 4543 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'content') |
| 4544 |
|
->set_user_data (manakai_has_reference => |
| 4545 |
|
$token->{attributes}->{content} |
| 4546 |
|
->{has_reference}); |
| 4547 |
} |
} |
| 4548 |
} |
} |
| 4549 |
|
} else { |
| 4550 |
|
if ($token->{attributes}->{charset}) { |
| 4551 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'charset') |
| 4552 |
|
->set_user_data (manakai_has_reference => |
| 4553 |
|
$token->{attributes}->{charset} |
| 4554 |
|
->{has_reference}); |
| 4555 |
|
} |
| 4556 |
|
if ($token->{attributes}->{content}) { |
| 4557 |
|
$meta_el->[0]->get_attribute_node_ns (undef, 'content') |
| 4558 |
|
->set_user_data (manakai_has_reference => |
| 4559 |
|
$token->{attributes}->{content} |
| 4560 |
|
->{has_reference}); |
| 4561 |
|
} |
| 4562 |
} |
} |
| 4563 |
|
|
| 4564 |
!!!next-token; |
!!!next-token; |