| 874 |
my $self = shift; |
my $self = shift; |
| 875 |
$self->{state} = DATA_STATE; # MUST |
$self->{state} = DATA_STATE; # MUST |
| 876 |
#$self->{state_keyword}; # initialized when used |
#$self->{state_keyword}; # initialized when used |
| 877 |
|
#$self->{entity__value}; # initialized when used |
| 878 |
|
#$self->{entity__match}; # initialized when used |
| 879 |
$self->{content_model} = PCDATA_CONTENT_MODEL; # be |
$self->{content_model} = PCDATA_CONTENT_MODEL; # be |
| 880 |
undef $self->{current_token}; |
undef $self->{current_token}; |
| 881 |
undef $self->{current_attribute}; |
undef $self->{current_attribute}; |
| 882 |
undef $self->{last_emitted_start_tag_name}; |
undef $self->{last_emitted_start_tag_name}; |
| 883 |
undef $self->{last_attribute_value_state}; |
#$self->{prev_state}; # initialized when used |
| 884 |
delete $self->{self_closing}; |
delete $self->{self_closing}; |
|
$self->{char} = []; |
|
| 885 |
# $self->{next_char} |
# $self->{next_char} |
| 886 |
!!!next-input-character; |
!!!next-input-character; |
| 887 |
$self->{token} = []; |
$self->{token} = []; |
| 913 |
## has completed loading. If one has, then it MUST be executed |
## has completed loading. If one has, then it MUST be executed |
| 914 |
## and removed from the list. |
## and removed from the list. |
| 915 |
|
|
| 916 |
## NOTE: HTML5 "Writing HTML documents" section, applied to |
## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) |
| 917 |
## documents and not to user agents and conformance checkers, |
## (This requirement was dropped from HTML5 spec, unfortunately.) |
|
## contains some requirements that are not detected by the |
|
|
## parsing algorithm: |
|
|
## - Some requirements on character encoding declarations. ## TODO |
|
|
## - "Elements MUST NOT contain content that their content model disallows." |
|
|
## ... Some are parse errors, some are not (those will be reported by the conformance checker). |
|
|
## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO |
|
|
## - Text (in elements, attributes, and comments) SHOULD NOT contain |
|
|
## control characters other than space characters. ## TODO: (what is a control character? C0, C1 and DEL? Unicode control character?) |
|
|
|
|
|
## TODO: HTML5 imposes on authors two SHOULD-level requirements that cannot |
|
|
## be detected by the HTML5 parsing algorithm: |
|
|
## - Text, |
|
| 918 |
|
|
| 919 |
sub _get_next_token ($) { |
sub _get_next_token ($) { |
| 920 |
my $self = shift; |
my $self = shift; |
| 942 |
## "entity data state". In this implementation, the tokenizer |
## "entity data state". In this implementation, the tokenizer |
| 943 |
## is switched to the |ENTITY_STATE|, which is an implementation |
## is switched to the |ENTITY_STATE|, which is an implementation |
| 944 |
## of the "consume a character reference" algorithm. |
## of the "consume a character reference" algorithm. |
|
$self->{entity_in_attr} = 0; |
|
| 945 |
$self->{entity_additional} = -1; |
$self->{entity_additional} = -1; |
| 946 |
|
$self->{prev_state} = DATA_STATE; |
| 947 |
$self->{state} = ENTITY_STATE; |
$self->{state} = ENTITY_STATE; |
| 948 |
!!!next-input-character; |
!!!next-input-character; |
| 949 |
redo A; |
redo A; |
| 1687 |
redo A; |
redo A; |
| 1688 |
} elsif ($self->{next_char} == 0x0026) { # & |
} elsif ($self->{next_char} == 0x0026) { # & |
| 1689 |
!!!cp (96); |
!!!cp (96); |
|
$self->{last_attribute_value_state} = $self->{state}; |
|
| 1690 |
## NOTE: In the spec, the tokenizer is switched to the |
## NOTE: In the spec, the tokenizer is switched to the |
| 1691 |
## "entity in attribute value state". In this implementation, the |
## "entity in attribute value state". In this implementation, the |
| 1692 |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
| 1693 |
## implementation of the "consume a character reference" algorithm. |
## implementation of the "consume a character reference" algorithm. |
| 1694 |
$self->{entity_in_attr} = 1; |
$self->{prev_state} = $self->{state}; |
| 1695 |
$self->{entity_additional} = 0x0022; # " |
$self->{entity_additional} = 0x0022; # " |
| 1696 |
$self->{state} = ENTITY_STATE; |
$self->{state} = ENTITY_STATE; |
| 1697 |
!!!next-input-character; |
!!!next-input-character; |
| 1734 |
redo A; |
redo A; |
| 1735 |
} elsif ($self->{next_char} == 0x0026) { # & |
} elsif ($self->{next_char} == 0x0026) { # & |
| 1736 |
!!!cp (102); |
!!!cp (102); |
|
$self->{last_attribute_value_state} = $self->{state}; |
|
| 1737 |
## NOTE: In the spec, the tokenizer is switched to the |
## NOTE: In the spec, the tokenizer is switched to the |
| 1738 |
## "entity in attribute value state". In this implementation, the |
## "entity in attribute value state". In this implementation, the |
| 1739 |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
| 1740 |
## implementation of the "consume a character reference" algorithm. |
## implementation of the "consume a character reference" algorithm. |
|
$self->{entity_in_attr} = 1; |
|
| 1741 |
$self->{entity_additional} = 0x0027; # ' |
$self->{entity_additional} = 0x0027; # ' |
| 1742 |
|
$self->{prev_state} = $self->{state}; |
| 1743 |
$self->{state} = ENTITY_STATE; |
$self->{state} = ENTITY_STATE; |
| 1744 |
!!!next-input-character; |
!!!next-input-character; |
| 1745 |
redo A; |
redo A; |
| 1785 |
redo A; |
redo A; |
| 1786 |
} elsif ($self->{next_char} == 0x0026) { # & |
} elsif ($self->{next_char} == 0x0026) { # & |
| 1787 |
!!!cp (108); |
!!!cp (108); |
|
$self->{last_attribute_value_state} = $self->{state}; |
|
| 1788 |
## NOTE: In the spec, the tokenizer is switched to the |
## NOTE: In the spec, the tokenizer is switched to the |
| 1789 |
## "entity in attribute value state". In this implementation, the |
## "entity in attribute value state". In this implementation, the |
| 1790 |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
| 1791 |
## implementation of the "consume a character reference" algorithm. |
## implementation of the "consume a character reference" algorithm. |
|
$self->{entity_in_attr} = 1; |
|
| 1792 |
$self->{entity_additional} = -1; |
$self->{entity_additional} = -1; |
| 1793 |
|
$self->{prev_state} = $self->{state}; |
| 1794 |
$self->{state} = ENTITY_STATE; |
$self->{state} = ENTITY_STATE; |
| 1795 |
!!!next-input-character; |
!!!next-input-character; |
| 1796 |
redo A; |
redo A; |
| 2974 |
## appended to the parent element or the attribute value in later |
## appended to the parent element or the attribute value in later |
| 2975 |
## process of the tokenizer. |
## process of the tokenizer. |
| 2976 |
|
|
| 2977 |
if ($self->{entity_in_attr}) { |
if ($self->{prev_state} == DATA_STATE) { |
| 2978 |
$self->{current_attribute}->{value} .= '&'; |
$self->{state} = $self->{prev_state}; |
|
$self->{state} = $self->{last_attribute_value_state}; |
|
|
## Reconsume. |
|
|
redo A; |
|
|
} else { |
|
|
$self->{state} = DATA_STATE; |
|
| 2979 |
## Reconsume. |
## Reconsume. |
| 2980 |
!!!emit ({type => CHARACTER_TOKEN, data => '&', |
!!!emit ({type => CHARACTER_TOKEN, data => '&', |
| 2981 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2982 |
column => $self->{column_prev}, |
column => $self->{column_prev}, |
| 2983 |
}); |
}); |
| 2984 |
redo A; |
redo A; |
| 2985 |
|
} else { |
| 2986 |
|
$self->{current_attribute}->{value} .= '&'; |
| 2987 |
|
$self->{state} = $self->{prev_state}; |
| 2988 |
|
## Reconsume. |
| 2989 |
|
redo A; |
| 2990 |
} |
} |
| 2991 |
} elsif ($self->{state} == ENTITY_HASH_STATE) { |
} elsif ($self->{state} == ENTITY_HASH_STATE) { |
| 2992 |
if ($self->{next_char} == 0x0078 or # x |
if ($self->{next_char} == 0x0078 or # x |
| 3011 |
## and then "&#" is appended to the parent element or the attribute |
## and then "&#" is appended to the parent element or the attribute |
| 3012 |
## value in the later processing. |
## value in the later processing. |
| 3013 |
|
|
| 3014 |
if ($self->{entity_in_attr}) { |
if ($self->{prev_state} == DATA_STATE) { |
| 3015 |
$self->{current_attribute}->{value} .= '&#'; |
$self->{state} = $self->{prev_state}; |
|
$self->{state} = $self->{last_attribute_value_state}; |
|
|
## Reconsume. |
|
|
redo A; |
|
|
} else { |
|
|
$self->{state} = DATA_STATE; |
|
| 3016 |
## Reconsume. |
## Reconsume. |
| 3017 |
!!!emit ({type => CHARACTER_TOKEN, |
!!!emit ({type => CHARACTER_TOKEN, |
| 3018 |
data => '&#', |
data => '&#', |
| 3020 |
column => $self->{column_prev} - 1, |
column => $self->{column_prev} - 1, |
| 3021 |
}); |
}); |
| 3022 |
redo A; |
redo A; |
| 3023 |
|
} else { |
| 3024 |
|
$self->{current_attribute}->{value} .= '&#'; |
| 3025 |
|
$self->{state} = $self->{prev_state}; |
| 3026 |
|
## Reconsume. |
| 3027 |
|
redo A; |
| 3028 |
} |
} |
| 3029 |
} |
} |
| 3030 |
} elsif ($self->{state} == NCR_NUM_STATE) { |
} elsif ($self->{state} == NCR_NUM_STATE) { |
| 3076 |
$code = $c1_entity_char->{$code}; |
$code = $c1_entity_char->{$code}; |
| 3077 |
} |
} |
| 3078 |
|
|
| 3079 |
if ($self->{entity_in_attr}) { |
if ($self->{prev_state} == DATA_STATE) { |
| 3080 |
$self->{current_attribute}->{value} .= chr $code; |
$self->{state} = $self->{prev_state}; |
|
$self->{current_attribute}->{has_reference} = 1; |
|
|
$self->{state} = $self->{last_attribute_value_state}; |
|
|
## Reconsume. |
|
|
redo A; |
|
|
} else { |
|
|
$self->{state} = DATA_STATE; |
|
| 3081 |
## Reconsume. |
## Reconsume. |
| 3082 |
!!!emit ({type => CHARACTER_TOKEN, data => chr $code, |
!!!emit ({type => CHARACTER_TOKEN, data => chr $code, |
|
has_reference => 1, |
|
| 3083 |
line => $l, column => $c, |
line => $l, column => $c, |
| 3084 |
}); |
}); |
| 3085 |
redo A; |
redo A; |
| 3086 |
|
} else { |
| 3087 |
|
$self->{current_attribute}->{value} .= chr $code; |
| 3088 |
|
$self->{current_attribute}->{has_reference} = 1; |
| 3089 |
|
$self->{state} = $self->{prev_state}; |
| 3090 |
|
## Reconsume. |
| 3091 |
|
redo A; |
| 3092 |
} |
} |
| 3093 |
} elsif ($self->{state} == HEXREF_X_STATE) { |
} elsif ($self->{state} == HEXREF_X_STATE) { |
| 3094 |
if ((0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) or |
if ((0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) or |
| 3109 |
## and then "&#" followed by "X" or "x" is appended to the parent |
## and then "&#" followed by "X" or "x" is appended to the parent |
| 3110 |
## element or the attribute value in the later processing. |
## element or the attribute value in the later processing. |
| 3111 |
|
|
| 3112 |
if ($self->{entity_in_attr}) { |
if ($self->{prev_state} == DATA_STATE) { |
| 3113 |
$self->{current_attribute}->{value} .= '&' . $self->{state_keyword}; |
$self->{state} = $self->{prev_state}; |
|
$self->{state} = $self->{last_attribute_value_state}; |
|
|
## Reconsume. |
|
|
redo A; |
|
|
} else { |
|
|
$self->{state} = DATA_STATE; |
|
| 3114 |
## Reconsume. |
## Reconsume. |
| 3115 |
!!!emit ({type => CHARACTER_TOKEN, |
!!!emit ({type => CHARACTER_TOKEN, |
| 3116 |
data => '&' . $self->{state_keyword}, |
data => '&' . $self->{state_keyword}, |
| 3118 |
column => $self->{column_prev} - length $self->{state_keyword}, |
column => $self->{column_prev} - length $self->{state_keyword}, |
| 3119 |
}); |
}); |
| 3120 |
redo A; |
redo A; |
| 3121 |
|
} else { |
| 3122 |
|
$self->{current_attribute}->{value} .= '&' . $self->{state_keyword}; |
| 3123 |
|
$self->{state} = $self->{prev_state}; |
| 3124 |
|
## Reconsume. |
| 3125 |
|
redo A; |
| 3126 |
} |
} |
| 3127 |
} |
} |
| 3128 |
} elsif ($self->{state} == HEXREF_HEX_STATE) { |
} elsif ($self->{state} == HEXREF_HEX_STATE) { |
| 3188 |
$code = $c1_entity_char->{$code}; |
$code = $c1_entity_char->{$code}; |
| 3189 |
} |
} |
| 3190 |
|
|
| 3191 |
if ($self->{entity_in_attr}) { |
if ($self->{prev_state} == DATA_STATE) { |
| 3192 |
$self->{current_attribute}->{value} .= chr $code; |
$self->{state} = $self->{prev_state}; |
|
$self->{current_attribute}->{has_reference} = 1; |
|
|
$self->{state} = $self->{last_attribute_value_state}; |
|
|
## Reconsume. |
|
|
redo A; |
|
|
} else { |
|
|
$self->{state} = DATA_STATE; |
|
| 3193 |
## Reconsume. |
## Reconsume. |
| 3194 |
!!!emit ({type => CHARACTER_TOKEN, data => chr $code, |
!!!emit ({type => CHARACTER_TOKEN, data => chr $code, |
|
has_reference => 1, |
|
| 3195 |
line => $l, column => $c, |
line => $l, column => $c, |
| 3196 |
}); |
}); |
| 3197 |
redo A; |
redo A; |
| 3198 |
|
} else { |
| 3199 |
|
$self->{current_attribute}->{value} .= chr $code; |
| 3200 |
|
$self->{current_attribute}->{has_reference} = 1; |
| 3201 |
|
$self->{state} = $self->{prev_state}; |
| 3202 |
|
## Reconsume. |
| 3203 |
|
redo A; |
| 3204 |
} |
} |
| 3205 |
} elsif ($self->{state} == ENTITY_NAME_STATE) { |
} elsif ($self->{state} == ENTITY_NAME_STATE) { |
| 3206 |
if (length $self->{state_keyword} < 30 and |
if (length $self->{state_keyword} < 30 and |
| 3248 |
# |
# |
| 3249 |
} elsif ($self->{entity__match} < 0) { |
} elsif ($self->{entity__match} < 0) { |
| 3250 |
!!!parse-error (type => 'no refc'); |
!!!parse-error (type => 'no refc'); |
| 3251 |
if ($self->{entity_in_attr} and $self->{entity__match} < -1) { |
if ($self->{prev_state} != DATA_STATE and # in attribute |
| 3252 |
|
$self->{entity__match} < -1) { |
| 3253 |
!!!cp (1024); |
!!!cp (1024); |
| 3254 |
$data = '&' . $self->{state_keyword}; |
$data = '&' . $self->{state_keyword}; |
| 3255 |
# |
# |
| 3278 |
## that would not be consumed are appended in the data state or in an |
## that would not be consumed are appended in the data state or in an |
| 3279 |
## appropriate attribute value state anyway. |
## appropriate attribute value state anyway. |
| 3280 |
|
|
| 3281 |
if ($self->{entity_in_attr}) { |
if ($self->{prev_state} == DATA_STATE) { |
| 3282 |
$self->{current_attribute}->{value} .= $data; |
$self->{state} = $self->{prev_state}; |
|
$self->{current_attribute}->{has_reference} = 1 if $has_ref; |
|
|
$self->{state} = $self->{last_attribute_value_state}; |
|
|
## Reconsume. |
|
|
redo A; |
|
|
} else { |
|
|
$self->{state} = DATA_STATE; |
|
| 3283 |
## Reconsume. |
## Reconsume. |
| 3284 |
!!!emit ({type => CHARACTER_TOKEN, |
!!!emit ({type => CHARACTER_TOKEN, |
| 3285 |
data => $data, has_reference => $has_ref, |
data => $data, |
| 3286 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 3287 |
column => $self->{column_prev} + 1 - length $self->{state_keyword}, |
column => $self->{column_prev} + 1 - length $self->{state_keyword}, |
| 3288 |
}); |
}); |
| 3289 |
redo A; |
redo A; |
| 3290 |
|
} else { |
| 3291 |
|
$self->{current_attribute}->{value} .= $data; |
| 3292 |
|
$self->{current_attribute}->{has_reference} = 1 if $has_ref; |
| 3293 |
|
$self->{state} = $self->{prev_state}; |
| 3294 |
|
## Reconsume. |
| 3295 |
|
redo A; |
| 3296 |
} |
} |
| 3297 |
} else { |
} else { |
| 3298 |
die "$0: $self->{state}: Unknown state"; |
die "$0: $self->{state}: Unknown state"; |