| 669 |
$self->{column} = 0; |
$self->{column} = 0; |
| 670 |
} elsif ($self->{next_char} == 0x000D) { # CR |
} elsif ($self->{next_char} == 0x000D) { # CR |
| 671 |
!!!cp ('j2'); |
!!!cp ('j2'); |
| 672 |
|
## TODO: support for abort/streaming |
| 673 |
my $next = $input->getc; |
my $next = $input->getc; |
| 674 |
if (defined $next and $next ne "\x0A") { |
if (defined $next and $next ne "\x0A") { |
| 675 |
$self->{next_next_char} = $next; |
$self->{next_next_char} = $next; |
| 770 |
sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } |
sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } |
| 771 |
|
|
| 772 |
sub DATA_STATE () { 0 } |
sub DATA_STATE () { 0 } |
| 773 |
sub ENTITY_DATA_STATE () { 1 } |
#sub ENTITY_DATA_STATE () { 1 } |
| 774 |
sub TAG_OPEN_STATE () { 2 } |
sub TAG_OPEN_STATE () { 2 } |
| 775 |
sub CLOSE_TAG_OPEN_STATE () { 3 } |
sub CLOSE_TAG_OPEN_STATE () { 3 } |
| 776 |
sub TAG_NAME_STATE () { 4 } |
sub TAG_NAME_STATE () { 4 } |
| 781 |
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 } |
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 } |
| 782 |
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 } |
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 } |
| 783 |
sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 } |
sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 } |
| 784 |
sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 } |
#sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 } |
| 785 |
sub MARKUP_DECLARATION_OPEN_STATE () { 13 } |
sub MARKUP_DECLARATION_OPEN_STATE () { 13 } |
| 786 |
sub COMMENT_START_STATE () { 14 } |
sub COMMENT_START_STATE () { 14 } |
| 787 |
sub COMMENT_START_DASH_STATE () { 15 } |
sub COMMENT_START_DASH_STATE () { 15 } |
| 813 |
sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec |
sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec |
| 814 |
sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec |
sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec |
| 815 |
sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec |
sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec |
| 816 |
|
## NOTE: "Entity data state", "entity in attribute value state", and |
| 817 |
|
## "consume a character reference" algorithm are jointly implemented |
| 818 |
|
## using the following six states: |
| 819 |
|
sub ENTITY_STATE () { 44 } |
| 820 |
|
sub ENTITY_HASH_STATE () { 45 } |
| 821 |
|
sub NCR_NUM_STATE () { 46 } |
| 822 |
|
sub HEXREF_X_STATE () { 47 } |
| 823 |
|
sub HEXREF_HEX_STATE () { 48 } |
| 824 |
|
sub ENTITY_NAME_STATE () { 49 } |
| 825 |
|
|
| 826 |
sub DOCTYPE_TOKEN () { 1 } |
sub DOCTYPE_TOKEN () { 1 } |
| 827 |
sub COMMENT_TOKEN () { 2 } |
sub COMMENT_TOKEN () { 2 } |
| 875 |
my $self = shift; |
my $self = shift; |
| 876 |
$self->{state} = DATA_STATE; # MUST |
$self->{state} = DATA_STATE; # MUST |
| 877 |
#$self->{state_keyword}; # initialized when used |
#$self->{state_keyword}; # initialized when used |
| 878 |
|
#$self->{entity__value}; # initialized when used |
| 879 |
|
#$self->{entity__match}; # initialized when used |
| 880 |
$self->{content_model} = PCDATA_CONTENT_MODEL; # be |
$self->{content_model} = PCDATA_CONTENT_MODEL; # be |
| 881 |
undef $self->{current_token}; |
undef $self->{current_token}; |
| 882 |
undef $self->{current_attribute}; |
undef $self->{current_attribute}; |
| 883 |
undef $self->{last_emitted_start_tag_name}; |
undef $self->{last_emitted_start_tag_name}; |
| 884 |
undef $self->{last_attribute_value_state}; |
#$self->{prev_state}; # initialized when used |
| 885 |
delete $self->{self_closing}; |
delete $self->{self_closing}; |
|
$self->{char} = []; |
|
| 886 |
# $self->{next_char} |
# $self->{next_char} |
| 887 |
!!!next-input-character; |
!!!next-input-character; |
| 888 |
$self->{token} = []; |
$self->{token} = []; |
| 914 |
## has completed loading. If one has, then it MUST be executed |
## has completed loading. If one has, then it MUST be executed |
| 915 |
## and removed from the list. |
## and removed from the list. |
| 916 |
|
|
| 917 |
## NOTE: HTML5 "Writing HTML documents" section, applied to |
## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) |
| 918 |
## documents and not to user agents and conformance checkers, |
## (This requirement was dropped from HTML5 spec, unfortunately.) |
|
## contains some requirements that are not detected by the |
|
|
## parsing algorithm: |
|
|
## - Some requirements on character encoding declarations. ## TODO |
|
|
## - "Elements MUST NOT contain content that their content model disallows." |
|
|
## ... Some are parse error, some are not (will be reported by c.c.). |
|
|
## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO |
|
|
## - Text (in elements, attributes, and comments) SHOULD NOT contain |
|
|
## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?) |
|
|
|
|
|
## TODO: HTML5 poses authors two SHOULD-level requirements that cannot |
|
|
## be detected by the HTML5 parsing algorithm: |
|
|
## - Text, |
|
| 919 |
|
|
| 920 |
sub _get_next_token ($) { |
sub _get_next_token ($) { |
| 921 |
my $self = shift; |
my $self = shift; |
| 939 |
if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA |
if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA |
| 940 |
not $self->{escape}) { |
not $self->{escape}) { |
| 941 |
!!!cp (1); |
!!!cp (1); |
| 942 |
$self->{state} = ENTITY_DATA_STATE; |
## NOTE: In the spec, the tokenizer is switched to the |
| 943 |
|
## "entity data state". In this implementation, the tokenizer |
| 944 |
|
## is switched to the |ENTITY_STATE|, which is an implementation |
| 945 |
|
## of the "consume a character reference" algorithm. |
| 946 |
|
$self->{entity_additional} = -1; |
| 947 |
|
$self->{prev_state} = DATA_STATE; |
| 948 |
|
$self->{state} = ENTITY_STATE; |
| 949 |
!!!next-input-character; |
!!!next-input-character; |
| 950 |
redo A; |
redo A; |
| 951 |
} else { |
} else { |
| 1015 |
!!!emit ($token); |
!!!emit ($token); |
| 1016 |
|
|
| 1017 |
redo A; |
redo A; |
|
} elsif ($self->{state} == ENTITY_DATA_STATE) { |
|
|
## (cannot happen in CDATA state) |
|
|
|
|
|
my ($l, $c) = ($self->{line_prev}, $self->{column_prev}); |
|
|
|
|
|
my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1); |
|
|
|
|
|
$self->{state} = DATA_STATE; |
|
|
# next-input-character is already done |
|
|
|
|
|
unless (defined $token) { |
|
|
!!!cp (13); |
|
|
!!!emit ({type => CHARACTER_TOKEN, data => '&', |
|
|
line => $l, column => $c, |
|
|
}); |
|
|
} else { |
|
|
!!!cp (14); |
|
|
!!!emit ($token); |
|
|
} |
|
|
|
|
|
redo A; |
|
| 1018 |
} elsif ($self->{state} == TAG_OPEN_STATE) { |
} elsif ($self->{state} == TAG_OPEN_STATE) { |
| 1019 |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
| 1020 |
if ($self->{next_char} == 0x002F) { # / |
if ($self->{next_char} == 0x002F) { # / |
| 1688 |
redo A; |
redo A; |
| 1689 |
} elsif ($self->{next_char} == 0x0026) { # & |
} elsif ($self->{next_char} == 0x0026) { # & |
| 1690 |
!!!cp (96); |
!!!cp (96); |
| 1691 |
$self->{last_attribute_value_state} = $self->{state}; |
## NOTE: In the spec, the tokenizer is switched to the |
| 1692 |
$self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE; |
## "entity in attribute value state". In this implementation, the |
| 1693 |
|
## tokenizer is switched to the |ENTITY_STATE|, which is an |
| 1694 |
|
## implementation of the "consume a character reference" algorithm. |
| 1695 |
|
$self->{prev_state} = $self->{state}; |
| 1696 |
|
$self->{entity_additional} = 0x0022; # " |
| 1697 |
|
$self->{state} = ENTITY_STATE; |
| 1698 |
!!!next-input-character; |
!!!next-input-character; |
| 1699 |
redo A; |
redo A; |
| 1700 |
} elsif ($self->{next_char} == -1) { |
} elsif ($self->{next_char} == -1) { |
| 1735 |
redo A; |
redo A; |
| 1736 |
} elsif ($self->{next_char} == 0x0026) { # & |
} elsif ($self->{next_char} == 0x0026) { # & |
| 1737 |
!!!cp (102); |
!!!cp (102); |
| 1738 |
$self->{last_attribute_value_state} = $self->{state}; |
## NOTE: In the spec, the tokenizer is switched to the |
| 1739 |
$self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE; |
## "entity in attribute value state". In this implementation, the |
| 1740 |
|
## tokenizer is switched to the |ENTITY_STATE|, which is an |
| 1741 |
|
## implementation of the "consume a character reference" algorithm. |
| 1742 |
|
$self->{entity_additional} = 0x0027; # ' |
| 1743 |
|
$self->{prev_state} = $self->{state}; |
| 1744 |
|
$self->{state} = ENTITY_STATE; |
| 1745 |
!!!next-input-character; |
!!!next-input-character; |
| 1746 |
redo A; |
redo A; |
| 1747 |
} elsif ($self->{next_char} == -1) { |
} elsif ($self->{next_char} == -1) { |
| 1786 |
redo A; |
redo A; |
| 1787 |
} elsif ($self->{next_char} == 0x0026) { # & |
} elsif ($self->{next_char} == 0x0026) { # & |
| 1788 |
!!!cp (108); |
!!!cp (108); |
| 1789 |
$self->{last_attribute_value_state} = $self->{state}; |
## NOTE: In the spec, the tokenizer is switched to the |
| 1790 |
$self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE; |
## "entity in attribute value state". In this implementation, the |
| 1791 |
|
## tokenizer is switched to the |ENTITY_STATE|, which is an |
| 1792 |
|
## implementation of the "consume a character reference" algorithm. |
| 1793 |
|
$self->{entity_additional} = -1; |
| 1794 |
|
$self->{prev_state} = $self->{state}; |
| 1795 |
|
$self->{state} = ENTITY_STATE; |
| 1796 |
!!!next-input-character; |
!!!next-input-character; |
| 1797 |
redo A; |
redo A; |
| 1798 |
} elsif ($self->{next_char} == 0x003E) { # > |
} elsif ($self->{next_char} == 0x003E) { # > |
| 1856 |
!!!next-input-character; |
!!!next-input-character; |
| 1857 |
redo A; |
redo A; |
| 1858 |
} |
} |
|
} elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) { |
|
|
my $token = $self->_tokenize_attempt_to_consume_an_entity |
|
|
(1, |
|
|
$self->{last_attribute_value_state} |
|
|
== ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # " |
|
|
$self->{last_attribute_value_state} |
|
|
== ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # ' |
|
|
-1); |
|
|
|
|
|
unless (defined $token) { |
|
|
!!!cp (117); |
|
|
$self->{current_attribute}->{value} .= '&'; |
|
|
} else { |
|
|
!!!cp (118); |
|
|
$self->{current_attribute}->{value} .= $token->{data}; |
|
|
$self->{current_attribute}->{has_reference} = $token->{has_reference}; |
|
|
## ISSUE: spec says "append the returned character token to the current attribute's value" |
|
|
} |
|
|
|
|
|
$self->{state} = $self->{last_attribute_value_state}; |
|
|
# next-input-character is already done |
|
|
redo A; |
|
| 1859 |
} elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) { |
} elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) { |
| 1860 |
if ($self->{next_char} == 0x0009 or # HT |
if ($self->{next_char} == 0x0009 or # HT |
| 1861 |
$self->{next_char} == 0x000A or # LF |
$self->{next_char} == 0x000A or # LF |
| 1975 |
} |
} |
| 1976 |
} elsif ($self->{state} == BOGUS_COMMENT_STATE) { |
} elsif ($self->{state} == BOGUS_COMMENT_STATE) { |
| 1977 |
## (only happen if PCDATA state) |
## (only happen if PCDATA state) |
|
|
|
|
## NOTE: Set by the previous state |
|
|
#my $token = {type => COMMENT_TOKEN, data => ''}; |
|
|
|
|
|
BC: { |
|
|
if ($self->{next_char} == 0x003E) { # > |
|
|
!!!cp (124); |
|
|
$self->{state} = DATA_STATE; |
|
|
!!!next-input-character; |
|
|
|
|
|
!!!emit ($self->{current_token}); # comment |
|
|
|
|
|
redo A; |
|
|
} elsif ($self->{next_char} == -1) { |
|
|
!!!cp (125); |
|
|
$self->{state} = DATA_STATE; |
|
|
## reconsume |
|
| 1978 |
|
|
| 1979 |
!!!emit ($self->{current_token}); # comment |
## NOTE: Unlike spec's "bogus comment state", this implementation |
| 1980 |
|
## consumes characters one-by-one basis. |
| 1981 |
|
|
| 1982 |
|
if ($self->{next_char} == 0x003E) { # > |
| 1983 |
|
!!!cp (124); |
| 1984 |
|
$self->{state} = DATA_STATE; |
| 1985 |
|
!!!next-input-character; |
| 1986 |
|
|
| 1987 |
redo A; |
!!!emit ($self->{current_token}); # comment |
| 1988 |
} else { |
redo A; |
| 1989 |
!!!cp (126); |
} elsif ($self->{next_char} == -1) { |
| 1990 |
$self->{current_token}->{data} .= chr ($self->{next_char}); # comment |
!!!cp (125); |
| 1991 |
!!!next-input-character; |
$self->{state} = DATA_STATE; |
| 1992 |
redo BC; |
## reconsume |
|
} |
|
|
} # BC |
|
| 1993 |
|
|
| 1994 |
die "$0: _get_next_token: unexpected case [BC]"; |
!!!emit ($self->{current_token}); # comment |
| 1995 |
|
redo A; |
| 1996 |
|
} else { |
| 1997 |
|
!!!cp (126); |
| 1998 |
|
$self->{current_token}->{data} .= chr ($self->{next_char}); # comment |
| 1999 |
|
## Stay in the state. |
| 2000 |
|
!!!next-input-character; |
| 2001 |
|
redo A; |
| 2002 |
|
} |
| 2003 |
} elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) { |
} elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) { |
| 2004 |
## (only happen if PCDATA state) |
## (only happen if PCDATA state) |
| 2005 |
|
|
| 2935 |
## Reconsume. |
## Reconsume. |
| 2936 |
redo A; |
redo A; |
| 2937 |
} |
} |
| 2938 |
} else { |
} elsif ($self->{state} == ENTITY_STATE) { |
| 2939 |
die "$0: $self->{state}: Unknown state"; |
if ({ |
| 2940 |
} |
0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF, |
| 2941 |
} # A |
0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & |
| 2942 |
|
$self->{entity_additional} => 1, |
| 2943 |
die "$0: _get_next_token: unexpected case"; |
}->{$self->{next_char}}) { |
| 2944 |
} # _get_next_token |
!!!cp (1001); |
| 2945 |
|
## Don't consume |
| 2946 |
sub _tokenize_attempt_to_consume_an_entity ($$$) { |
## No error |
| 2947 |
my ($self, $in_attr, $additional) = @_; |
## Return nothing. |
| 2948 |
|
# |
| 2949 |
|
} elsif ($self->{next_char} == 0x0023) { # # |
| 2950 |
|
!!!cp (999); |
| 2951 |
|
$self->{state} = ENTITY_HASH_STATE; |
| 2952 |
|
$self->{state_keyword} = '#'; |
| 2953 |
|
!!!next-input-character; |
| 2954 |
|
redo A; |
| 2955 |
|
} elsif ((0x0041 <= $self->{next_char} and |
| 2956 |
|
$self->{next_char} <= 0x005A) or # A..Z |
| 2957 |
|
(0x0061 <= $self->{next_char} and |
| 2958 |
|
$self->{next_char} <= 0x007A)) { # a..z |
| 2959 |
|
!!!cp (998); |
| 2960 |
|
require Whatpm::_NamedEntityList; |
| 2961 |
|
$self->{state} = ENTITY_NAME_STATE; |
| 2962 |
|
$self->{state_keyword} = chr $self->{next_char}; |
| 2963 |
|
$self->{entity__value} = $self->{state_keyword}; |
| 2964 |
|
$self->{entity__match} = 0; |
| 2965 |
|
!!!next-input-character; |
| 2966 |
|
redo A; |
| 2967 |
|
} else { |
| 2968 |
|
!!!cp (1027); |
| 2969 |
|
!!!parse-error (type => 'bare ero'); |
| 2970 |
|
## Return nothing. |
| 2971 |
|
# |
| 2972 |
|
} |
| 2973 |
|
|
| 2974 |
my ($l, $c) = ($self->{line_prev}, $self->{column_prev}); |
## NOTE: No character is consumed by the "consume a character |
| 2975 |
|
## reference" algorithm. In other word, there is an "&" character |
| 2976 |
|
## that does not introduce a character reference, which would be |
| 2977 |
|
## appended to the parent element or the attribute value in later |
| 2978 |
|
## process of the tokenizer. |
| 2979 |
|
|
| 2980 |
|
if ($self->{prev_state} == DATA_STATE) { |
| 2981 |
|
!!!cp (997); |
| 2982 |
|
$self->{state} = $self->{prev_state}; |
| 2983 |
|
## Reconsume. |
| 2984 |
|
!!!emit ({type => CHARACTER_TOKEN, data => '&', |
| 2985 |
|
line => $self->{line_prev}, |
| 2986 |
|
column => $self->{column_prev}, |
| 2987 |
|
}); |
| 2988 |
|
redo A; |
| 2989 |
|
} else { |
| 2990 |
|
!!!cp (996); |
| 2991 |
|
$self->{current_attribute}->{value} .= '&'; |
| 2992 |
|
$self->{state} = $self->{prev_state}; |
| 2993 |
|
## Reconsume. |
| 2994 |
|
redo A; |
| 2995 |
|
} |
| 2996 |
|
} elsif ($self->{state} == ENTITY_HASH_STATE) { |
| 2997 |
|
if ($self->{next_char} == 0x0078 or # x |
| 2998 |
|
$self->{next_char} == 0x0058) { # X |
| 2999 |
|
!!!cp (995); |
| 3000 |
|
$self->{state} = HEXREF_X_STATE; |
| 3001 |
|
$self->{state_keyword} .= chr $self->{next_char}; |
| 3002 |
|
!!!next-input-character; |
| 3003 |
|
redo A; |
| 3004 |
|
} elsif (0x0030 <= $self->{next_char} and |
| 3005 |
|
$self->{next_char} <= 0x0039) { # 0..9 |
| 3006 |
|
!!!cp (994); |
| 3007 |
|
$self->{state} = NCR_NUM_STATE; |
| 3008 |
|
$self->{state_keyword} = $self->{next_char} - 0x0030; |
| 3009 |
|
!!!next-input-character; |
| 3010 |
|
redo A; |
| 3011 |
|
} else { |
| 3012 |
|
!!!parse-error (type => 'bare nero', |
| 3013 |
|
line => $self->{line_prev}, |
| 3014 |
|
column => $self->{column_prev} - 1); |
| 3015 |
|
|
| 3016 |
if ({ |
## NOTE: According to the spec algorithm, nothing is returned, |
| 3017 |
0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF, |
## and then "&#" is appended to the parent element or the attribute |
| 3018 |
0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR |
## value in the later processing. |
| 3019 |
$additional => 1, |
|
| 3020 |
}->{$self->{next_char}}) { |
if ($self->{prev_state} == DATA_STATE) { |
| 3021 |
!!!cp (1001); |
!!!cp (1019); |
| 3022 |
## Don't consume |
$self->{state} = $self->{prev_state}; |
| 3023 |
## No error |
## Reconsume. |
| 3024 |
return undef; |
!!!emit ({type => CHARACTER_TOKEN, |
| 3025 |
} elsif ($self->{next_char} == 0x0023) { # # |
data => '&#', |
| 3026 |
!!!next-input-character; |
line => $self->{line_prev}, |
| 3027 |
if ($self->{next_char} == 0x0078 or # x |
column => $self->{column_prev} - 1, |
| 3028 |
$self->{next_char} == 0x0058) { # X |
}); |
| 3029 |
my $code; |
redo A; |
|
X: { |
|
|
my $x_char = $self->{next_char}; |
|
|
!!!next-input-character; |
|
|
if (0x0030 <= $self->{next_char} and |
|
|
$self->{next_char} <= 0x0039) { # 0..9 |
|
|
!!!cp (1002); |
|
|
$code ||= 0; |
|
|
$code *= 0x10; |
|
|
$code += $self->{next_char} - 0x0030; |
|
|
redo X; |
|
|
} elsif (0x0061 <= $self->{next_char} and |
|
|
$self->{next_char} <= 0x0066) { # a..f |
|
|
!!!cp (1003); |
|
|
$code ||= 0; |
|
|
$code *= 0x10; |
|
|
$code += $self->{next_char} - 0x0060 + 9; |
|
|
redo X; |
|
|
} elsif (0x0041 <= $self->{next_char} and |
|
|
$self->{next_char} <= 0x0046) { # A..F |
|
|
!!!cp (1004); |
|
|
$code ||= 0; |
|
|
$code *= 0x10; |
|
|
$code += $self->{next_char} - 0x0040 + 9; |
|
|
redo X; |
|
|
} elsif (not defined $code) { # no hexadecimal digit |
|
|
!!!cp (1005); |
|
|
!!!parse-error (type => 'bare hcro', line => $l, column => $c); |
|
|
!!!back-next-input-character ($x_char, $self->{next_char}); |
|
|
$self->{next_char} = 0x0023; # # |
|
|
return undef; |
|
|
} elsif ($self->{next_char} == 0x003B) { # ; |
|
|
!!!cp (1006); |
|
|
!!!next-input-character; |
|
| 3030 |
} else { |
} else { |
| 3031 |
!!!cp (1007); |
!!!cp (993); |
| 3032 |
!!!parse-error (type => 'no refc', line => $l, column => $c); |
$self->{current_attribute}->{value} .= '&#'; |
| 3033 |
|
$self->{state} = $self->{prev_state}; |
| 3034 |
|
## Reconsume. |
| 3035 |
|
redo A; |
| 3036 |
} |
} |
| 3037 |
|
} |
| 3038 |
if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) { |
} elsif ($self->{state} == NCR_NUM_STATE) { |
| 3039 |
!!!cp (1008); |
if (0x0030 <= $self->{next_char} and |
| 3040 |
!!!parse-error (type => 'invalid character reference', |
$self->{next_char} <= 0x0039) { # 0..9 |
|
text => (sprintf 'U+%04X', $code), |
|
|
line => $l, column => $c); |
|
|
$code = 0xFFFD; |
|
|
} elsif ($code > 0x10FFFF) { |
|
|
!!!cp (1009); |
|
|
!!!parse-error (type => 'invalid character reference', |
|
|
text => (sprintf 'U-%08X', $code), |
|
|
line => $l, column => $c); |
|
|
$code = 0xFFFD; |
|
|
} elsif ($code == 0x000D) { |
|
|
!!!cp (1010); |
|
|
!!!parse-error (type => 'CR character reference', line => $l, column => $c); |
|
|
$code = 0x000A; |
|
|
} elsif (0x80 <= $code and $code <= 0x9F) { |
|
|
!!!cp (1011); |
|
|
!!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c); |
|
|
$code = $c1_entity_char->{$code}; |
|
|
} |
|
|
|
|
|
return {type => CHARACTER_TOKEN, data => chr $code, |
|
|
has_reference => 1, |
|
|
line => $l, column => $c, |
|
|
}; |
|
|
} # X |
|
|
} elsif (0x0030 <= $self->{next_char} and |
|
|
$self->{next_char} <= 0x0039) { # 0..9 |
|
|
my $code = $self->{next_char} - 0x0030; |
|
|
!!!next-input-character; |
|
|
|
|
|
while (0x0030 <= $self->{next_char} and |
|
|
$self->{next_char} <= 0x0039) { # 0..9 |
|
| 3041 |
!!!cp (1012); |
!!!cp (1012); |
| 3042 |
$code *= 10; |
$self->{state_keyword} *= 10; |
| 3043 |
$code += $self->{next_char} - 0x0030; |
$self->{state_keyword} += $self->{next_char} - 0x0030; |
| 3044 |
|
|
| 3045 |
|
## Stay in the state. |
| 3046 |
!!!next-input-character; |
!!!next-input-character; |
| 3047 |
} |
redo A; |
| 3048 |
|
} elsif ($self->{next_char} == 0x003B) { # ; |
|
if ($self->{next_char} == 0x003B) { # ; |
|
| 3049 |
!!!cp (1013); |
!!!cp (1013); |
| 3050 |
!!!next-input-character; |
!!!next-input-character; |
| 3051 |
|
# |
| 3052 |
} else { |
} else { |
| 3053 |
!!!cp (1014); |
!!!cp (1014); |
| 3054 |
!!!parse-error (type => 'no refc', line => $l, column => $c); |
!!!parse-error (type => 'no refc'); |
| 3055 |
|
## Reconsume. |
| 3056 |
|
# |
| 3057 |
} |
} |
| 3058 |
|
|
| 3059 |
|
my $code = $self->{state_keyword}; |
| 3060 |
|
my $l = $self->{line_prev}; |
| 3061 |
|
my $c = $self->{column_prev}; |
| 3062 |
if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) { |
if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) { |
| 3063 |
!!!cp (1015); |
!!!cp (1015); |
| 3064 |
!!!parse-error (type => 'invalid character reference', |
!!!parse-error (type => 'invalid character reference', |
| 3083 |
line => $l, column => $c); |
line => $l, column => $c); |
| 3084 |
$code = $c1_entity_char->{$code}; |
$code = $c1_entity_char->{$code}; |
| 3085 |
} |
} |
| 3086 |
|
|
| 3087 |
return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1, |
if ($self->{prev_state} == DATA_STATE) { |
| 3088 |
line => $l, column => $c, |
!!!cp (992); |
| 3089 |
}; |
$self->{state} = $self->{prev_state}; |
| 3090 |
} else { |
## Reconsume. |
| 3091 |
!!!cp (1019); |
!!!emit ({type => CHARACTER_TOKEN, data => chr $code, |
| 3092 |
!!!parse-error (type => 'bare nero', line => $l, column => $c); |
line => $l, column => $c, |
| 3093 |
!!!back-next-input-character ($self->{next_char}); |
}); |
| 3094 |
$self->{next_char} = 0x0023; # # |
redo A; |
| 3095 |
return undef; |
} else { |
| 3096 |
} |
!!!cp (991); |
| 3097 |
} elsif ((0x0041 <= $self->{next_char} and |
$self->{current_attribute}->{value} .= chr $code; |
| 3098 |
$self->{next_char} <= 0x005A) or |
$self->{current_attribute}->{has_reference} = 1; |
| 3099 |
(0x0061 <= $self->{next_char} and |
$self->{state} = $self->{prev_state}; |
| 3100 |
$self->{next_char} <= 0x007A)) { |
## Reconsume. |
| 3101 |
my $entity_name = chr $self->{next_char}; |
redo A; |
| 3102 |
!!!next-input-character; |
} |
| 3103 |
|
} elsif ($self->{state} == HEXREF_X_STATE) { |
| 3104 |
my $value = $entity_name; |
if ((0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) or |
| 3105 |
my $match = 0; |
(0x0041 <= $self->{next_char} and $self->{next_char} <= 0x0046) or |
| 3106 |
require Whatpm::_NamedEntityList; |
(0x0061 <= $self->{next_char} and $self->{next_char} <= 0x0066)) { |
| 3107 |
our $EntityChar; |
# 0..9, A..F, a..f |
| 3108 |
|
!!!cp (990); |
| 3109 |
while (length $entity_name < 30 and |
$self->{state} = HEXREF_HEX_STATE; |
| 3110 |
## NOTE: Some number greater than the maximum length of entity name |
$self->{state_keyword} = 0; |
| 3111 |
((0x0041 <= $self->{next_char} and # a |
## Reconsume. |
| 3112 |
$self->{next_char} <= 0x005A) or # x |
redo A; |
| 3113 |
(0x0061 <= $self->{next_char} and # a |
} else { |
| 3114 |
$self->{next_char} <= 0x007A) or # z |
!!!parse-error (type => 'bare hcro', |
| 3115 |
(0x0030 <= $self->{next_char} and # 0 |
line => $self->{line_prev}, |
| 3116 |
$self->{next_char} <= 0x0039) or # 9 |
column => $self->{column_prev} - 2); |
| 3117 |
$self->{next_char} == 0x003B)) { # ; |
|
| 3118 |
$entity_name .= chr $self->{next_char}; |
## NOTE: According to the spec algorithm, nothing is returned, |
| 3119 |
if (defined $EntityChar->{$entity_name}) { |
## and then "&#" followed by "X" or "x" is appended to the parent |
| 3120 |
if ($self->{next_char} == 0x003B) { # ; |
## element or the attribute value in the later processing. |
| 3121 |
!!!cp (1020); |
|
| 3122 |
$value = $EntityChar->{$entity_name}; |
if ($self->{prev_state} == DATA_STATE) { |
| 3123 |
$match = 1; |
!!!cp (1005); |
| 3124 |
!!!next-input-character; |
$self->{state} = $self->{prev_state}; |
| 3125 |
last; |
## Reconsume. |
| 3126 |
|
!!!emit ({type => CHARACTER_TOKEN, |
| 3127 |
|
data => '&' . $self->{state_keyword}, |
| 3128 |
|
line => $self->{line_prev}, |
| 3129 |
|
column => $self->{column_prev} - length $self->{state_keyword}, |
| 3130 |
|
}); |
| 3131 |
|
redo A; |
| 3132 |
|
} else { |
| 3133 |
|
!!!cp (989); |
| 3134 |
|
$self->{current_attribute}->{value} .= '&' . $self->{state_keyword}; |
| 3135 |
|
$self->{state} = $self->{prev_state}; |
| 3136 |
|
## Reconsume. |
| 3137 |
|
redo A; |
| 3138 |
|
} |
| 3139 |
|
} |
| 3140 |
|
} elsif ($self->{state} == HEXREF_HEX_STATE) { |
| 3141 |
|
if (0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) { |
| 3142 |
|
# 0..9 |
| 3143 |
|
!!!cp (1002); |
| 3144 |
|
$self->{state_keyword} *= 0x10; |
| 3145 |
|
$self->{state_keyword} += $self->{next_char} - 0x0030; |
| 3146 |
|
## Stay in the state. |
| 3147 |
|
!!!next-input-character; |
| 3148 |
|
redo A; |
| 3149 |
|
} elsif (0x0061 <= $self->{next_char} and |
| 3150 |
|
$self->{next_char} <= 0x0066) { # a..f |
| 3151 |
|
!!!cp (1003); |
| 3152 |
|
$self->{state_keyword} *= 0x10; |
| 3153 |
|
$self->{state_keyword} += $self->{next_char} - 0x0060 + 9; |
| 3154 |
|
## Stay in the state. |
| 3155 |
|
!!!next-input-character; |
| 3156 |
|
redo A; |
| 3157 |
|
} elsif (0x0041 <= $self->{next_char} and |
| 3158 |
|
$self->{next_char} <= 0x0046) { # A..F |
| 3159 |
|
!!!cp (1004); |
| 3160 |
|
$self->{state_keyword} *= 0x10; |
| 3161 |
|
$self->{state_keyword} += $self->{next_char} - 0x0040 + 9; |
| 3162 |
|
## Stay in the state. |
| 3163 |
|
!!!next-input-character; |
| 3164 |
|
redo A; |
| 3165 |
|
} elsif ($self->{next_char} == 0x003B) { # ; |
| 3166 |
|
!!!cp (1006); |
| 3167 |
|
!!!next-input-character; |
| 3168 |
|
# |
| 3169 |
|
} else { |
| 3170 |
|
!!!cp (1007); |
| 3171 |
|
!!!parse-error (type => 'no refc', |
| 3172 |
|
line => $self->{line}, |
| 3173 |
|
column => $self->{column}); |
| 3174 |
|
## Reconsume. |
| 3175 |
|
# |
| 3176 |
|
} |
| 3177 |
|
|
| 3178 |
|
my $code = $self->{state_keyword}; |
| 3179 |
|
my $l = $self->{line_prev}; |
| 3180 |
|
my $c = $self->{column_prev}; |
| 3181 |
|
if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) { |
| 3182 |
|
!!!cp (1008); |
| 3183 |
|
!!!parse-error (type => 'invalid character reference', |
| 3184 |
|
text => (sprintf 'U+%04X', $code), |
| 3185 |
|
line => $l, column => $c); |
| 3186 |
|
$code = 0xFFFD; |
| 3187 |
|
} elsif ($code > 0x10FFFF) { |
| 3188 |
|
!!!cp (1009); |
| 3189 |
|
!!!parse-error (type => 'invalid character reference', |
| 3190 |
|
text => (sprintf 'U-%08X', $code), |
| 3191 |
|
line => $l, column => $c); |
| 3192 |
|
$code = 0xFFFD; |
| 3193 |
|
} elsif ($code == 0x000D) { |
| 3194 |
|
!!!cp (1010); |
| 3195 |
|
!!!parse-error (type => 'CR character reference', line => $l, column => $c); |
| 3196 |
|
$code = 0x000A; |
| 3197 |
|
} elsif (0x80 <= $code and $code <= 0x9F) { |
| 3198 |
|
!!!cp (1011); |
| 3199 |
|
!!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c); |
| 3200 |
|
$code = $c1_entity_char->{$code}; |
| 3201 |
|
} |
| 3202 |
|
|
| 3203 |
|
if ($self->{prev_state} == DATA_STATE) { |
| 3204 |
|
!!!cp (988); |
| 3205 |
|
$self->{state} = $self->{prev_state}; |
| 3206 |
|
## Reconsume. |
| 3207 |
|
!!!emit ({type => CHARACTER_TOKEN, data => chr $code, |
| 3208 |
|
line => $l, column => $c, |
| 3209 |
|
}); |
| 3210 |
|
redo A; |
| 3211 |
|
} else { |
| 3212 |
|
!!!cp (987); |
| 3213 |
|
$self->{current_attribute}->{value} .= chr $code; |
| 3214 |
|
$self->{current_attribute}->{has_reference} = 1; |
| 3215 |
|
$self->{state} = $self->{prev_state}; |
| 3216 |
|
## Reconsume. |
| 3217 |
|
redo A; |
| 3218 |
|
} |
| 3219 |
|
} elsif ($self->{state} == ENTITY_NAME_STATE) { |
| 3220 |
|
if (length $self->{state_keyword} < 30 and |
| 3221 |
|
## NOTE: Some number greater than the maximum length of entity name |
| 3222 |
|
((0x0041 <= $self->{next_char} and # a |
| 3223 |
|
$self->{next_char} <= 0x005A) or # x |
| 3224 |
|
(0x0061 <= $self->{next_char} and # a |
| 3225 |
|
$self->{next_char} <= 0x007A) or # z |
| 3226 |
|
(0x0030 <= $self->{next_char} and # 0 |
| 3227 |
|
$self->{next_char} <= 0x0039) or # 9 |
| 3228 |
|
$self->{next_char} == 0x003B)) { # ; |
| 3229 |
|
our $EntityChar; |
| 3230 |
|
$self->{state_keyword} .= chr $self->{next_char}; |
| 3231 |
|
if (defined $EntityChar->{$self->{state_keyword}}) { |
| 3232 |
|
if ($self->{next_char} == 0x003B) { # ; |
| 3233 |
|
!!!cp (1020); |
| 3234 |
|
$self->{entity__value} = $EntityChar->{$self->{state_keyword}}; |
| 3235 |
|
$self->{entity__match} = 1; |
| 3236 |
|
!!!next-input-character; |
| 3237 |
|
# |
| 3238 |
|
} else { |
| 3239 |
|
!!!cp (1021); |
| 3240 |
|
$self->{entity__value} = $EntityChar->{$self->{state_keyword}}; |
| 3241 |
|
$self->{entity__match} = -1; |
| 3242 |
|
## Stay in the state. |
| 3243 |
|
!!!next-input-character; |
| 3244 |
|
redo A; |
| 3245 |
|
} |
| 3246 |
} else { |
} else { |
| 3247 |
!!!cp (1021); |
!!!cp (1022); |
| 3248 |
$value = $EntityChar->{$entity_name}; |
$self->{entity__value} .= chr $self->{next_char}; |
| 3249 |
$match = -1; |
$self->{entity__match} *= 2; |
| 3250 |
|
## Stay in the state. |
| 3251 |
!!!next-input-character; |
!!!next-input-character; |
| 3252 |
|
redo A; |
| 3253 |
|
} |
| 3254 |
|
} |
| 3255 |
|
|
| 3256 |
|
my $data; |
| 3257 |
|
my $has_ref; |
| 3258 |
|
if ($self->{entity__match} > 0) { |
| 3259 |
|
!!!cp (1023); |
| 3260 |
|
$data = $self->{entity__value}; |
| 3261 |
|
$has_ref = 1; |
| 3262 |
|
# |
| 3263 |
|
} elsif ($self->{entity__match} < 0) { |
| 3264 |
|
!!!parse-error (type => 'no refc'); |
| 3265 |
|
if ($self->{prev_state} != DATA_STATE and # in attribute |
| 3266 |
|
$self->{entity__match} < -1) { |
| 3267 |
|
!!!cp (1024); |
| 3268 |
|
$data = '&' . $self->{state_keyword}; |
| 3269 |
|
# |
| 3270 |
|
} else { |
| 3271 |
|
!!!cp (1025); |
| 3272 |
|
$data = $self->{entity__value}; |
| 3273 |
|
$has_ref = 1; |
| 3274 |
|
# |
| 3275 |
} |
} |
| 3276 |
} else { |
} else { |
| 3277 |
!!!cp (1022); |
!!!cp (1026); |
| 3278 |
$value .= chr $self->{next_char}; |
!!!parse-error (type => 'bare ero', |
| 3279 |
$match *= 2; |
line => $self->{line_prev}, |
| 3280 |
!!!next-input-character; |
column => $self->{column_prev}); |
| 3281 |
|
$data = '&' . $self->{state_keyword}; |
| 3282 |
|
# |
| 3283 |
} |
} |
| 3284 |
} |
|
| 3285 |
|
## NOTE: In these cases, when a character reference is found, |
| 3286 |
if ($match > 0) { |
## it is consumed and a character token is returned, or, otherwise, |
| 3287 |
!!!cp (1023); |
## nothing is consumed and returned, according to the spec algorithm. |
| 3288 |
return {type => CHARACTER_TOKEN, data => $value, has_reference => 1, |
## In this implementation, anything that has been examined by the |
| 3289 |
line => $l, column => $c, |
## tokenizer is appended to the parent element or the attribute value |
| 3290 |
}; |
## as string, either literal string when no character reference or |
| 3291 |
} elsif ($match < 0) { |
## entity-replaced string otherwise, in this stage, since any characters |
| 3292 |
!!!parse-error (type => 'no refc', line => $l, column => $c); |
## that would not be consumed are appended in the data state or in an |
| 3293 |
if ($in_attr and $match < -1) { |
## appropriate attribute value state anyway. |
| 3294 |
!!!cp (1024); |
|
| 3295 |
return {type => CHARACTER_TOKEN, data => '&'.$entity_name, |
if ($self->{prev_state} == DATA_STATE) { |
| 3296 |
line => $l, column => $c, |
!!!cp (986); |
| 3297 |
}; |
$self->{state} = $self->{prev_state}; |
| 3298 |
} else { |
## Reconsume. |
| 3299 |
!!!cp (1025); |
!!!emit ({type => CHARACTER_TOKEN, |
| 3300 |
return {type => CHARACTER_TOKEN, data => $value, has_reference => 1, |
data => $data, |
| 3301 |
line => $l, column => $c, |
line => $self->{line_prev}, |
| 3302 |
}; |
column => $self->{column_prev} + 1 - length $self->{state_keyword}, |
| 3303 |
|
}); |
| 3304 |
|
redo A; |
| 3305 |
|
} else { |
| 3306 |
|
!!!cp (985); |
| 3307 |
|
$self->{current_attribute}->{value} .= $data; |
| 3308 |
|
$self->{current_attribute}->{has_reference} = 1 if $has_ref; |
| 3309 |
|
$self->{state} = $self->{prev_state}; |
| 3310 |
|
## Reconsume. |
| 3311 |
|
redo A; |
| 3312 |
} |
} |
| 3313 |
} else { |
} else { |
| 3314 |
!!!cp (1026); |
die "$0: $self->{state}: Unknown state"; |
|
!!!parse-error (type => 'bare ero', line => $l, column => $c); |
|
|
## NOTE: "No characters are consumed" in the spec. |
|
|
return {type => CHARACTER_TOKEN, data => '&'.$value, |
|
|
line => $l, column => $c, |
|
|
}; |
|
| 3315 |
} |
} |
| 3316 |
} else { |
} # A |
| 3317 |
!!!cp (1027); |
|
| 3318 |
## no characters are consumed |
die "$0: _get_next_token: unexpected case"; |
| 3319 |
!!!parse-error (type => 'bare ero', line => $l, column => $c); |
} # _get_next_token |
|
return undef; |
|
|
} |
|
|
} # _tokenize_attempt_to_consume_an_entity |
|
| 3320 |
|
|
| 3321 |
sub _initialize_tree_constructor ($) { |
sub _initialize_tree_constructor ($) { |
| 3322 |
my $self = shift; |
my $self = shift; |