874 |
my $self = shift; |
my $self = shift; |
875 |
$self->{state} = DATA_STATE; # MUST |
$self->{state} = DATA_STATE; # MUST |
876 |
#$self->{state_keyword}; # initialized when used |
#$self->{state_keyword}; # initialized when used |
877 |
|
#$self->{entity__value}; # initialized when used |
878 |
|
#$self->{entity__match}; # initialized when used |
879 |
$self->{content_model} = PCDATA_CONTENT_MODEL; # be |
$self->{content_model} = PCDATA_CONTENT_MODEL; # be |
880 |
undef $self->{current_token}; |
undef $self->{current_token}; |
881 |
undef $self->{current_attribute}; |
undef $self->{current_attribute}; |
882 |
undef $self->{last_emitted_start_tag_name}; |
undef $self->{last_emitted_start_tag_name}; |
883 |
undef $self->{last_attribute_value_state}; |
#$self->{prev_state}; # initialized when used |
884 |
delete $self->{self_closing}; |
delete $self->{self_closing}; |
|
$self->{char} = []; |
|
885 |
# $self->{next_char} |
# $self->{next_char} |
886 |
!!!next-input-character; |
!!!next-input-character; |
887 |
$self->{token} = []; |
$self->{token} = []; |
913 |
## has completed loading. If one has, then it MUST be executed |
## has completed loading. If one has, then it MUST be executed |
914 |
## and removed from the list. |
## and removed from the list. |
915 |
|
|
916 |
## NOTE: HTML5 "Writing HTML documents" section, applied to |
## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) |
917 |
## documents and not to user agents and conformance checkers, |
## (This requirement was dropped from HTML5 spec, unfortunately.) |
|
## contains some requirements that are not detected by the |
|
|
## parsing algorithm: |
|
|
## - Some requirements on character encoding declarations. ## TODO |
|
|
## - "Elements MUST NOT contain content that their content model disallows." |
|
|
## ... Some are parse error, some are not (will be reported by c.c.). |
|
|
## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO |
|
|
## - Text (in elements, attributes, and comments) SHOULD NOT contain |
|
|
## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?) |
|
|
|
|
|
## TODO: HTML5 poses authors two SHOULD-level requirements that cannot |
|
|
## be detected by the HTML5 parsing algorithm: |
|
|
## - Text, |
|
918 |
|
|
919 |
sub _get_next_token ($) { |
sub _get_next_token ($) { |
920 |
my $self = shift; |
my $self = shift; |
942 |
## "entity data state". In this implementation, the tokenizer |
## "entity data state". In this implementation, the tokenizer |
943 |
## is switched to the |ENTITY_STATE|, which is an implementation |
## is switched to the |ENTITY_STATE|, which is an implementation |
944 |
## of the "consume a character reference" algorithm. |
## of the "consume a character reference" algorithm. |
|
$self->{entity_in_attr} = 0; |
|
945 |
$self->{entity_additional} = -1; |
$self->{entity_additional} = -1; |
946 |
|
$self->{prev_state} = DATA_STATE; |
947 |
$self->{state} = ENTITY_STATE; |
$self->{state} = ENTITY_STATE; |
948 |
!!!next-input-character; |
!!!next-input-character; |
949 |
redo A; |
redo A; |
1687 |
redo A; |
redo A; |
1688 |
} elsif ($self->{next_char} == 0x0026) { # & |
} elsif ($self->{next_char} == 0x0026) { # & |
1689 |
!!!cp (96); |
!!!cp (96); |
|
$self->{last_attribute_value_state} = $self->{state}; |
|
1690 |
## NOTE: In the spec, the tokenizer is switched to the |
## NOTE: In the spec, the tokenizer is switched to the |
1691 |
## "entity in attribute value state". In this implementation, the |
## "entity in attribute value state". In this implementation, the |
1692 |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
1693 |
## implementation of the "consume a character reference" algorithm. |
## implementation of the "consume a character reference" algorithm. |
1694 |
$self->{entity_in_attr} = 1; |
$self->{prev_state} = $self->{state}; |
1695 |
$self->{entity_additional} = 0x0022; # " |
$self->{entity_additional} = 0x0022; # " |
1696 |
$self->{state} = ENTITY_STATE; |
$self->{state} = ENTITY_STATE; |
1697 |
!!!next-input-character; |
!!!next-input-character; |
1734 |
redo A; |
redo A; |
1735 |
} elsif ($self->{next_char} == 0x0026) { # & |
} elsif ($self->{next_char} == 0x0026) { # & |
1736 |
!!!cp (102); |
!!!cp (102); |
|
$self->{last_attribute_value_state} = $self->{state}; |
|
1737 |
## NOTE: In the spec, the tokenizer is switched to the |
## NOTE: In the spec, the tokenizer is switched to the |
1738 |
## "entity in attribute value state". In this implementation, the |
## "entity in attribute value state". In this implementation, the |
1739 |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
1740 |
## implementation of the "consume a character reference" algorithm. |
## implementation of the "consume a character reference" algorithm. |
|
$self->{entity_in_attr} = 1; |
|
1741 |
$self->{entity_additional} = 0x0027; # ' |
$self->{entity_additional} = 0x0027; # ' |
1742 |
|
$self->{prev_state} = $self->{state}; |
1743 |
$self->{state} = ENTITY_STATE; |
$self->{state} = ENTITY_STATE; |
1744 |
!!!next-input-character; |
!!!next-input-character; |
1745 |
redo A; |
redo A; |
1785 |
redo A; |
redo A; |
1786 |
} elsif ($self->{next_char} == 0x0026) { # & |
} elsif ($self->{next_char} == 0x0026) { # & |
1787 |
!!!cp (108); |
!!!cp (108); |
|
$self->{last_attribute_value_state} = $self->{state}; |
|
1788 |
## NOTE: In the spec, the tokenizer is switched to the |
## NOTE: In the spec, the tokenizer is switched to the |
1789 |
## "entity in attribute value state". In this implementation, the |
## "entity in attribute value state". In this implementation, the |
1790 |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
1791 |
## implementation of the "consume a character reference" algorithm. |
## implementation of the "consume a character reference" algorithm. |
|
$self->{entity_in_attr} = 1; |
|
1792 |
$self->{entity_additional} = -1; |
$self->{entity_additional} = -1; |
1793 |
|
$self->{prev_state} = $self->{state}; |
1794 |
$self->{state} = ENTITY_STATE; |
$self->{state} = ENTITY_STATE; |
1795 |
!!!next-input-character; |
!!!next-input-character; |
1796 |
redo A; |
redo A; |
2974 |
## appended to the parent element or the attribute value in later |
## appended to the parent element or the attribute value in later |
2975 |
## process of the tokenizer. |
## process of the tokenizer. |
2976 |
|
|
2977 |
if ($self->{entity_in_attr}) { |
if ($self->{prev_state} == DATA_STATE) { |
2978 |
$self->{current_attribute}->{value} .= '&'; |
$self->{state} = $self->{prev_state}; |
|
$self->{state} = $self->{last_attribute_value_state}; |
|
|
## Reconsume. |
|
|
redo A; |
|
|
} else { |
|
|
$self->{state} = DATA_STATE; |
|
2979 |
## Reconsume. |
## Reconsume. |
2980 |
!!!emit ({type => CHARACTER_TOKEN, data => '&', |
!!!emit ({type => CHARACTER_TOKEN, data => '&', |
2981 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
2982 |
column => $self->{column_prev}, |
column => $self->{column_prev}, |
2983 |
}); |
}); |
2984 |
redo A; |
redo A; |
2985 |
|
} else { |
2986 |
|
$self->{current_attribute}->{value} .= '&'; |
2987 |
|
$self->{state} = $self->{prev_state}; |
2988 |
|
## Reconsume. |
2989 |
|
redo A; |
2990 |
} |
} |
2991 |
} elsif ($self->{state} == ENTITY_HASH_STATE) { |
} elsif ($self->{state} == ENTITY_HASH_STATE) { |
2992 |
if ($self->{next_char} == 0x0078 or # x |
if ($self->{next_char} == 0x0078 or # x |
3011 |
## and then "&#" is appended to the parent element or the attribute |
## and then "&#" is appended to the parent element or the attribute |
3012 |
## value in the later processing. |
## value in the later processing. |
3013 |
|
|
3014 |
if ($self->{entity_in_attr}) { |
if ($self->{prev_state} == DATA_STATE) { |
3015 |
$self->{current_attribute}->{value} .= '&#'; |
$self->{state} = $self->{prev_state}; |
|
$self->{state} = $self->{last_attribute_value_state}; |
|
|
## Reconsume. |
|
|
redo A; |
|
|
} else { |
|
|
$self->{state} = DATA_STATE; |
|
3016 |
## Reconsume. |
## Reconsume. |
3017 |
!!!emit ({type => CHARACTER_TOKEN, |
!!!emit ({type => CHARACTER_TOKEN, |
3018 |
data => '&#', |
data => '&#', |
3020 |
column => $self->{column_prev} - 1, |
column => $self->{column_prev} - 1, |
3021 |
}); |
}); |
3022 |
redo A; |
redo A; |
3023 |
|
} else { |
3024 |
|
$self->{current_attribute}->{value} .= '&#'; |
3025 |
|
$self->{state} = $self->{prev_state}; |
3026 |
|
## Reconsume. |
3027 |
|
redo A; |
3028 |
} |
} |
3029 |
} |
} |
3030 |
} elsif ($self->{state} == NCR_NUM_STATE) { |
} elsif ($self->{state} == NCR_NUM_STATE) { |
3076 |
$code = $c1_entity_char->{$code}; |
$code = $c1_entity_char->{$code}; |
3077 |
} |
} |
3078 |
|
|
3079 |
if ($self->{entity_in_attr}) { |
if ($self->{prev_state} == DATA_STATE) { |
3080 |
$self->{current_attribute}->{value} .= chr $code; |
$self->{state} = $self->{prev_state}; |
|
$self->{current_attribute}->{has_reference} = 1; |
|
|
$self->{state} = $self->{last_attribute_value_state}; |
|
|
## Reconsume. |
|
|
redo A; |
|
|
} else { |
|
|
$self->{state} = DATA_STATE; |
|
3081 |
## Reconsume. |
## Reconsume. |
3082 |
!!!emit ({type => CHARACTER_TOKEN, data => chr $code, |
!!!emit ({type => CHARACTER_TOKEN, data => chr $code, |
|
has_reference => 1, |
|
3083 |
line => $l, column => $c, |
line => $l, column => $c, |
3084 |
}); |
}); |
3085 |
redo A; |
redo A; |
3086 |
|
} else { |
3087 |
|
$self->{current_attribute}->{value} .= chr $code; |
3088 |
|
$self->{current_attribute}->{has_reference} = 1; |
3089 |
|
$self->{state} = $self->{prev_state}; |
3090 |
|
## Reconsume. |
3091 |
|
redo A; |
3092 |
} |
} |
3093 |
} elsif ($self->{state} == HEXREF_X_STATE) { |
} elsif ($self->{state} == HEXREF_X_STATE) { |
3094 |
if ((0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) or |
if ((0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) or |
3109 |
## and then "&#" followed by "X" or "x" is appended to the parent |
## and then "&#" followed by "X" or "x" is appended to the parent |
3110 |
## element or the attribute value in the later processing. |
## element or the attribute value in the later processing. |
3111 |
|
|
3112 |
if ($self->{entity_in_attr}) { |
if ($self->{prev_state} == DATA_STATE) { |
3113 |
$self->{current_attribute}->{value} .= '&' . $self->{state_keyword}; |
$self->{state} = $self->{prev_state}; |
|
$self->{state} = $self->{last_attribute_value_state}; |
|
|
## Reconsume. |
|
|
redo A; |
|
|
} else { |
|
|
$self->{state} = DATA_STATE; |
|
3114 |
## Reconsume. |
## Reconsume. |
3115 |
!!!emit ({type => CHARACTER_TOKEN, |
!!!emit ({type => CHARACTER_TOKEN, |
3116 |
data => '&' . $self->{state_keyword}, |
data => '&' . $self->{state_keyword}, |
3118 |
column => $self->{column_prev} - length $self->{state_keyword}, |
column => $self->{column_prev} - length $self->{state_keyword}, |
3119 |
}); |
}); |
3120 |
redo A; |
redo A; |
3121 |
|
} else { |
3122 |
|
$self->{current_attribute}->{value} .= '&' . $self->{state_keyword}; |
3123 |
|
$self->{state} = $self->{prev_state}; |
3124 |
|
## Reconsume. |
3125 |
|
redo A; |
3126 |
} |
} |
3127 |
} |
} |
3128 |
} elsif ($self->{state} == HEXREF_HEX_STATE) { |
} elsif ($self->{state} == HEXREF_HEX_STATE) { |
3188 |
$code = $c1_entity_char->{$code}; |
$code = $c1_entity_char->{$code}; |
3189 |
} |
} |
3190 |
|
|
3191 |
if ($self->{entity_in_attr}) { |
if ($self->{prev_state} == DATA_STATE) { |
3192 |
$self->{current_attribute}->{value} .= chr $code; |
$self->{state} = $self->{prev_state}; |
|
$self->{current_attribute}->{has_reference} = 1; |
|
|
$self->{state} = $self->{last_attribute_value_state}; |
|
|
## Reconsume. |
|
|
redo A; |
|
|
} else { |
|
|
$self->{state} = DATA_STATE; |
|
3193 |
## Reconsume. |
## Reconsume. |
3194 |
!!!emit ({type => CHARACTER_TOKEN, data => chr $code, |
!!!emit ({type => CHARACTER_TOKEN, data => chr $code, |
|
has_reference => 1, |
|
3195 |
line => $l, column => $c, |
line => $l, column => $c, |
3196 |
}); |
}); |
3197 |
redo A; |
redo A; |
3198 |
|
} else { |
3199 |
|
$self->{current_attribute}->{value} .= chr $code; |
3200 |
|
$self->{current_attribute}->{has_reference} = 1; |
3201 |
|
$self->{state} = $self->{prev_state}; |
3202 |
|
## Reconsume. |
3203 |
|
redo A; |
3204 |
} |
} |
3205 |
} elsif ($self->{state} == ENTITY_NAME_STATE) { |
} elsif ($self->{state} == ENTITY_NAME_STATE) { |
3206 |
if (length $self->{state_keyword} < 30 and |
if (length $self->{state_keyword} < 30 and |
3248 |
# |
# |
3249 |
} elsif ($self->{entity__match} < 0) { |
} elsif ($self->{entity__match} < 0) { |
3250 |
!!!parse-error (type => 'no refc'); |
!!!parse-error (type => 'no refc'); |
3251 |
if ($self->{entity_in_attr} and $self->{entity__match} < -1) { |
if ($self->{prev_state} != DATA_STATE and # in attribute |
3252 |
|
$self->{entity__match} < -1) { |
3253 |
!!!cp (1024); |
!!!cp (1024); |
3254 |
$data = '&' . $self->{state_keyword}; |
$data = '&' . $self->{state_keyword}; |
3255 |
# |
# |
3278 |
## that would not be consumed are appended in the data state or in an |
## that would not be consumed are appended in the data state or in an |
3279 |
## appropriate attribute value state anyway. |
## appropriate attribute value state anyway. |
3280 |
|
|
3281 |
if ($self->{entity_in_attr}) { |
if ($self->{prev_state} == DATA_STATE) { |
3282 |
$self->{current_attribute}->{value} .= $data; |
$self->{state} = $self->{prev_state}; |
|
$self->{current_attribute}->{has_reference} = 1 if $has_ref; |
|
|
$self->{state} = $self->{last_attribute_value_state}; |
|
|
## Reconsume. |
|
|
redo A; |
|
|
} else { |
|
|
$self->{state} = DATA_STATE; |
|
3283 |
## Reconsume. |
## Reconsume. |
3284 |
!!!emit ({type => CHARACTER_TOKEN, |
!!!emit ({type => CHARACTER_TOKEN, |
3285 |
data => $data, has_reference => $has_ref, |
data => $data, |
3286 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
3287 |
column => $self->{column_prev} + 1 - length $self->{state_keyword}, |
column => $self->{column_prev} + 1 - length $self->{state_keyword}, |
3288 |
}); |
}); |
3289 |
redo A; |
redo A; |
3290 |
|
} else { |
3291 |
|
$self->{current_attribute}->{value} .= $data; |
3292 |
|
$self->{current_attribute}->{has_reference} = 1 if $has_ref; |
3293 |
|
$self->{state} = $self->{prev_state}; |
3294 |
|
## Reconsume. |
3295 |
|
redo A; |
3296 |
} |
} |
3297 |
} else { |
} else { |
3298 |
die "$0: $self->{state}: Unknown state"; |
die "$0: $self->{state}: Unknown state"; |