| 769 |
sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } |
sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } |
| 770 |
|
|
| 771 |
sub DATA_STATE () { 0 } |
sub DATA_STATE () { 0 } |
| 772 |
sub ENTITY_DATA_STATE () { 1 } |
#sub ENTITY_DATA_STATE () { 1 } |
| 773 |
sub TAG_OPEN_STATE () { 2 } |
sub TAG_OPEN_STATE () { 2 } |
| 774 |
sub CLOSE_TAG_OPEN_STATE () { 3 } |
sub CLOSE_TAG_OPEN_STATE () { 3 } |
| 775 |
sub TAG_NAME_STATE () { 4 } |
sub TAG_NAME_STATE () { 4 } |
| 780 |
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 } |
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 } |
| 781 |
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 } |
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 } |
| 782 |
sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 } |
sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 } |
| 783 |
sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 } |
#sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 } |
| 784 |
sub MARKUP_DECLARATION_OPEN_STATE () { 13 } |
sub MARKUP_DECLARATION_OPEN_STATE () { 13 } |
| 785 |
sub COMMENT_START_STATE () { 14 } |
sub COMMENT_START_STATE () { 14 } |
| 786 |
sub COMMENT_START_DASH_STATE () { 15 } |
sub COMMENT_START_DASH_STATE () { 15 } |
| 812 |
sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec |
sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec |
| 813 |
sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec |
sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec |
| 814 |
sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec |
sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec |
| 815 |
sub ENTITY_STATE () { 44 } # "consume a character reference" in the spec |
## NOTE: "Entity data state", "entity in attribute value state", and |
| 816 |
|
## "consume a character reference" algorithm are jointly implemented |
| 817 |
|
## using the following six states: |
| 818 |
|
sub ENTITY_STATE () { 44 } |
| 819 |
|
sub ENTITY_HASH_STATE () { 45 } |
| 820 |
|
sub NCR_NUM_STATE () { 46 } |
| 821 |
|
sub HEXREF_X_STATE () { 47 } |
| 822 |
|
sub HEXREF_HEX_STATE () { 48 } |
| 823 |
|
sub ENTITY_NAME_STATE () { 49 } |
| 824 |
|
|
| 825 |
sub DOCTYPE_TOKEN () { 1 } |
sub DOCTYPE_TOKEN () { 1 } |
| 826 |
sub COMMENT_TOKEN () { 2 } |
sub COMMENT_TOKEN () { 2 } |
| 953 |
## "entity data state". In this implementation, the tokenizer |
## "entity data state". In this implementation, the tokenizer |
| 954 |
## is switched to the |ENTITY_STATE|, which is an implementation |
## is switched to the |ENTITY_STATE|, which is an implementation |
| 955 |
## of the "consume a character reference" algorithm. |
## of the "consume a character reference" algorithm. |
|
#$self->{state} = ENTITY_DATA_STATE; |
|
| 956 |
$self->{entity_in_attr} = 0; |
$self->{entity_in_attr} = 0; |
| 957 |
$self->{entity_additional} = -1; |
$self->{entity_additional} = -1; |
| 958 |
$self->{state} = ENTITY_STATE; |
$self->{state} = ENTITY_STATE; |
| 1025 |
!!!emit ($token); |
!!!emit ($token); |
| 1026 |
|
|
| 1027 |
redo A; |
redo A; |
|
} elsif ($self->{state} == ENTITY_DATA_STATE) { |
|
|
my ($l, $c) = ($self->{line_prev}, $self->{column_prev}); |
|
|
|
|
|
my $token = $self->{entity_return}; |
|
|
|
|
|
$self->{state} = DATA_STATE; |
|
|
# next-input-character is already done |
|
|
|
|
|
unless (defined $token) { |
|
|
!!!cp (13); |
|
|
!!!emit ({type => CHARACTER_TOKEN, data => '&', |
|
|
line => $l, column => $c, |
|
|
}); |
|
|
} else { |
|
|
!!!cp (14); |
|
|
!!!emit ($token); |
|
|
} |
|
|
|
|
|
redo A; |
|
| 1028 |
} elsif ($self->{state} == TAG_OPEN_STATE) { |
} elsif ($self->{state} == TAG_OPEN_STATE) { |
| 1029 |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
| 1030 |
if ($self->{next_char} == 0x002F) { # / |
if ($self->{next_char} == 0x002F) { # / |
| 1703 |
## "entity in attribute value state". In this implementation, the |
## "entity in attribute value state". In this implementation, the |
| 1704 |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
| 1705 |
## implementation of the "consume a character reference" algorithm. |
## implementation of the "consume a character reference" algorithm. |
|
#$self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE; |
|
| 1706 |
$self->{entity_in_attr} = 1; |
$self->{entity_in_attr} = 1; |
| 1707 |
$self->{entity_additional} = 0x0022; # " |
$self->{entity_additional} = 0x0022; # " |
| 1708 |
$self->{state} = ENTITY_STATE; |
$self->{state} = ENTITY_STATE; |
| 1751 |
## "entity in attribute value state". In this implementation, the |
## "entity in attribute value state". In this implementation, the |
| 1752 |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
| 1753 |
## implementation of the "consume a character reference" algorithm. |
## implementation of the "consume a character reference" algorithm. |
|
#$self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE; |
|
| 1754 |
$self->{entity_in_attr} = 1; |
$self->{entity_in_attr} = 1; |
| 1755 |
$self->{entity_additional} = 0x0027; # ' |
$self->{entity_additional} = 0x0027; # ' |
| 1756 |
$self->{state} = ENTITY_STATE; |
$self->{state} = ENTITY_STATE; |
| 1803 |
## "entity in attribute value state". In this implementation, the |
## "entity in attribute value state". In this implementation, the |
| 1804 |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
| 1805 |
## implementation of the "consume a character reference" algorithm. |
## implementation of the "consume a character reference" algorithm. |
|
#$self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE; |
|
| 1806 |
$self->{entity_in_attr} = 1; |
$self->{entity_in_attr} = 1; |
| 1807 |
$self->{entity_additional} = -1; |
$self->{entity_additional} = -1; |
| 1808 |
$self->{state} = ENTITY_STATE; |
$self->{state} = ENTITY_STATE; |
| 1869 |
!!!next-input-character; |
!!!next-input-character; |
| 1870 |
redo A; |
redo A; |
| 1871 |
} |
} |
|
} elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) { |
|
|
my $token = $self->{entity_return}; |
|
|
|
|
|
unless (defined $token) { |
|
|
!!!cp (117); |
|
|
$self->{current_attribute}->{value} .= '&'; |
|
|
} else { |
|
|
!!!cp (118); |
|
|
$self->{current_attribute}->{value} .= $token->{data}; |
|
|
$self->{current_attribute}->{has_reference} = $token->{has_reference}; |
|
|
## ISSUE: spec says "append the returned character token to the current attribute's value" |
|
|
} |
|
|
|
|
|
$self->{state} = $self->{last_attribute_value_state}; |
|
|
# next-input-character is already done |
|
|
redo A; |
|
| 1872 |
} elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) { |
} elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) { |
| 1873 |
if ($self->{next_char} == 0x0009 or # HT |
if ($self->{next_char} == 0x0009 or # HT |
| 1874 |
$self->{next_char} == 0x000A or # LF |
$self->{next_char} == 0x000A or # LF |
| 2948 |
## Reconsume. |
## Reconsume. |
| 2949 |
redo A; |
redo A; |
| 2950 |
} |
} |
|
|
|
| 2951 |
} elsif ($self->{state} == ENTITY_STATE) { |
} elsif ($self->{state} == ENTITY_STATE) { |
| 2952 |
my $in_attr = $self->{entity_in_attr}; |
if ({ |
| 2953 |
my $additional = $self->{entity_additional}; |
0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF, |
| 2954 |
|
0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & |
| 2955 |
|
$self->{entity_additional} => 1, |
| 2956 |
|
}->{$self->{next_char}}) { |
| 2957 |
|
!!!cp (1001); |
| 2958 |
|
## Don't consume |
| 2959 |
|
## No error |
| 2960 |
|
## Return nothing. |
| 2961 |
|
# |
| 2962 |
|
} elsif ($self->{next_char} == 0x0023) { # # |
| 2963 |
|
$self->{state} = ENTITY_HASH_STATE; |
| 2964 |
|
$self->{state_keyword} = '#'; |
| 2965 |
|
!!!next-input-character; |
| 2966 |
|
redo A; |
| 2967 |
|
} elsif ((0x0041 <= $self->{next_char} and |
| 2968 |
|
$self->{next_char} <= 0x005A) or # A..Z |
| 2969 |
|
(0x0061 <= $self->{next_char} and |
| 2970 |
|
$self->{next_char} <= 0x007A)) { # a..z |
| 2971 |
|
require Whatpm::_NamedEntityList; |
| 2972 |
|
$self->{state} = ENTITY_NAME_STATE; |
| 2973 |
|
$self->{state_keyword} = chr $self->{next_char}; |
| 2974 |
|
$self->{entity__value} = $self->{state_keyword}; |
| 2975 |
|
$self->{entity__match} = 0; |
| 2976 |
|
!!!next-input-character; |
| 2977 |
|
redo A; |
| 2978 |
|
} else { |
| 2979 |
|
!!!cp (1027); |
| 2980 |
|
!!!parse-error (type => 'bare ero'); |
| 2981 |
|
## Return nothing. |
| 2982 |
|
# |
| 2983 |
|
} |
| 2984 |
|
|
| 2985 |
my ($l, $c) = ($self->{line_prev}, $self->{column_prev}); |
## NOTE: No character is consumed by the "consume a character |
| 2986 |
|
## reference" algorithm. In other words, there is an "&" character |
| 2987 |
|
## that does not introduce a character reference, which would be |
| 2988 |
|
## appended to the parent element or the attribute value in later |
| 2989 |
|
## process of the tokenizer. |
| 2990 |
|
|
| 2991 |
if ({ |
if ($self->{entity_in_attr}) { |
| 2992 |
0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF, |
$self->{current_attribute}->{value} .= '&'; |
| 2993 |
0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR |
$self->{state} = $self->{last_attribute_value_state}; |
| 2994 |
$additional => 1, |
## Reconsume. |
| 2995 |
}->{$self->{next_char}}) { |
redo A; |
| 2996 |
!!!cp (1001); |
} else { |
| 2997 |
## Don't consume |
$self->{state} = DATA_STATE; |
| 2998 |
## No error |
## Reconsume. |
| 2999 |
$self->{entity_return} = undef; |
!!!emit ({type => CHARACTER_TOKEN, data => '&', |
| 3000 |
$self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE; |
line => $self->{line_prev}, |
| 3001 |
redo A; |
column => $self->{column_prev}, |
| 3002 |
} elsif ($self->{next_char} == 0x0023) { # # |
}); |
| 3003 |
!!!next-input-character; |
redo A; |
| 3004 |
if ($self->{next_char} == 0x0078 or # x |
} |
| 3005 |
$self->{next_char} == 0x0058) { # X |
} elsif ($self->{state} == ENTITY_HASH_STATE) { |
| 3006 |
my $code; |
if ($self->{next_char} == 0x0078 or # x |
| 3007 |
X: { |
$self->{next_char} == 0x0058) { # X |
| 3008 |
my $x_char = $self->{next_char}; |
$self->{state} = HEXREF_X_STATE; |
| 3009 |
!!!next-input-character; |
$self->{state_keyword} .= chr $self->{next_char}; |
| 3010 |
if (0x0030 <= $self->{next_char} and |
!!!next-input-character; |
| 3011 |
$self->{next_char} <= 0x0039) { # 0..9 |
redo A; |
| 3012 |
!!!cp (1002); |
} elsif (0x0030 <= $self->{next_char} and |
| 3013 |
$code ||= 0; |
$self->{next_char} <= 0x0039) { # 0..9 |
| 3014 |
$code *= 0x10; |
$self->{state} = NCR_NUM_STATE; |
| 3015 |
$code += $self->{next_char} - 0x0030; |
$self->{state_keyword} = $self->{next_char} - 0x0030; |
| 3016 |
redo X; |
!!!next-input-character; |
| 3017 |
} elsif (0x0061 <= $self->{next_char} and |
redo A; |
| 3018 |
$self->{next_char} <= 0x0066) { # a..f |
} else { |
| 3019 |
!!!cp (1003); |
!!!cp (1019); |
| 3020 |
$code ||= 0; |
!!!parse-error (type => 'bare nero', |
| 3021 |
$code *= 0x10; |
line => $self->{line_prev}, |
| 3022 |
$code += $self->{next_char} - 0x0060 + 9; |
column => $self->{column_prev} - 1); |
| 3023 |
redo X; |
|
| 3024 |
} elsif (0x0041 <= $self->{next_char} and |
## NOTE: According to the spec algorithm, nothing is returned, |
| 3025 |
$self->{next_char} <= 0x0046) { # A..F |
## and then "&#" is appended to the parent element or the attribute |
| 3026 |
!!!cp (1004); |
## value in the later processing. |
| 3027 |
$code ||= 0; |
|
| 3028 |
$code *= 0x10; |
if ($self->{entity_in_attr}) { |
| 3029 |
$code += $self->{next_char} - 0x0040 + 9; |
$self->{current_attribute}->{value} .= '&#'; |
| 3030 |
redo X; |
$self->{state} = $self->{last_attribute_value_state}; |
| 3031 |
} elsif (not defined $code) { # no hexadecimal digit |
## Reconsume. |
|
!!!cp (1005); |
|
|
!!!parse-error (type => 'bare hcro', line => $l, column => $c); |
|
|
!!!back-next-input-character ($x_char, $self->{next_char}); |
|
|
$self->{next_char} = 0x0023; # # |
|
|
$self->{entity_return} = undef; |
|
|
$self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE; |
|
| 3032 |
redo A; |
redo A; |
|
} elsif ($self->{next_char} == 0x003B) { # ; |
|
|
!!!cp (1006); |
|
|
!!!next-input-character; |
|
| 3033 |
} else { |
} else { |
| 3034 |
!!!cp (1007); |
$self->{state} = DATA_STATE; |
| 3035 |
!!!parse-error (type => 'no refc', line => $l, column => $c); |
## Reconsume. |
| 3036 |
|
!!!emit ({type => CHARACTER_TOKEN, |
| 3037 |
|
data => '&#', |
| 3038 |
|
line => $self->{line_prev}, |
| 3039 |
|
column => $self->{column_prev} - 1, |
| 3040 |
|
}); |
| 3041 |
|
redo A; |
| 3042 |
} |
} |
| 3043 |
|
} |
| 3044 |
if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) { |
} elsif ($self->{state} == NCR_NUM_STATE) { |
| 3045 |
!!!cp (1008); |
if (0x0030 <= $self->{next_char} and |
| 3046 |
!!!parse-error (type => 'invalid character reference', |
$self->{next_char} <= 0x0039) { # 0..9 |
|
text => (sprintf 'U+%04X', $code), |
|
|
line => $l, column => $c); |
|
|
$code = 0xFFFD; |
|
|
} elsif ($code > 0x10FFFF) { |
|
|
!!!cp (1009); |
|
|
!!!parse-error (type => 'invalid character reference', |
|
|
text => (sprintf 'U-%08X', $code), |
|
|
line => $l, column => $c); |
|
|
$code = 0xFFFD; |
|
|
} elsif ($code == 0x000D) { |
|
|
!!!cp (1010); |
|
|
!!!parse-error (type => 'CR character reference', line => $l, column => $c); |
|
|
$code = 0x000A; |
|
|
} elsif (0x80 <= $code and $code <= 0x9F) { |
|
|
!!!cp (1011); |
|
|
!!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c); |
|
|
$code = $c1_entity_char->{$code}; |
|
|
} |
|
|
|
|
|
$self->{entity_return} = {type => CHARACTER_TOKEN, data => chr $code, |
|
|
has_reference => 1, |
|
|
line => $l, column => $c, |
|
|
}; |
|
|
$self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE; |
|
|
redo A; |
|
|
} # X |
|
|
} elsif (0x0030 <= $self->{next_char} and |
|
|
$self->{next_char} <= 0x0039) { # 0..9 |
|
|
my $code = $self->{next_char} - 0x0030; |
|
|
!!!next-input-character; |
|
|
|
|
|
while (0x0030 <= $self->{next_char} and |
|
|
$self->{next_char} <= 0x0039) { # 0..9 |
|
| 3047 |
!!!cp (1012); |
!!!cp (1012); |
| 3048 |
$code *= 10; |
$self->{state_keyword} *= 10; |
| 3049 |
$code += $self->{next_char} - 0x0030; |
$self->{state_keyword} += $self->{next_char} - 0x0030; |
| 3050 |
|
|
| 3051 |
|
## Stay in the state. |
| 3052 |
!!!next-input-character; |
!!!next-input-character; |
| 3053 |
} |
redo A; |
| 3054 |
|
} elsif ($self->{next_char} == 0x003B) { # ; |
|
if ($self->{next_char} == 0x003B) { # ; |
|
| 3055 |
!!!cp (1013); |
!!!cp (1013); |
| 3056 |
!!!next-input-character; |
!!!next-input-character; |
| 3057 |
|
# |
| 3058 |
} else { |
} else { |
| 3059 |
!!!cp (1014); |
!!!cp (1014); |
| 3060 |
!!!parse-error (type => 'no refc', line => $l, column => $c); |
!!!parse-error (type => 'no refc'); |
| 3061 |
|
## Reconsume. |
| 3062 |
|
# |
| 3063 |
} |
} |
| 3064 |
|
|
| 3065 |
|
my $code = $self->{state_keyword}; |
| 3066 |
|
my $l = $self->{line_prev}; |
| 3067 |
|
my $c = $self->{column_prev}; |
| 3068 |
if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) { |
if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) { |
| 3069 |
!!!cp (1015); |
!!!cp (1015); |
| 3070 |
!!!parse-error (type => 'invalid character reference', |
!!!parse-error (type => 'invalid character reference', |
| 3089 |
line => $l, column => $c); |
line => $l, column => $c); |
| 3090 |
$code = $c1_entity_char->{$code}; |
$code = $c1_entity_char->{$code}; |
| 3091 |
} |
} |
| 3092 |
|
|
| 3093 |
$self->{entity_return} = {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1, |
if ($self->{entity_in_attr}) { |
| 3094 |
line => $l, column => $c, |
$self->{current_attribute}->{value} .= chr $code; |
| 3095 |
}; |
$self->{current_attribute}->{has_reference} = 1; |
| 3096 |
$self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE; |
$self->{state} = $self->{last_attribute_value_state}; |
| 3097 |
redo A; |
## Reconsume. |
| 3098 |
} else { |
redo A; |
| 3099 |
!!!cp (1019); |
} else { |
| 3100 |
!!!parse-error (type => 'bare nero', line => $l, column => $c); |
$self->{state} = DATA_STATE; |
| 3101 |
!!!back-next-input-character ($self->{next_char}); |
## Reconsume. |
| 3102 |
$self->{next_char} = 0x0023; # # |
!!!emit ({type => CHARACTER_TOKEN, data => chr $code, |
| 3103 |
$self->{entity_return} = undef; |
has_reference => 1, |
| 3104 |
$self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE; |
line => $l, column => $c, |
| 3105 |
redo A; |
}); |
| 3106 |
} |
redo A; |
| 3107 |
} elsif ((0x0041 <= $self->{next_char} and |
} |
| 3108 |
$self->{next_char} <= 0x005A) or |
} elsif ($self->{state} == HEXREF_X_STATE) { |
| 3109 |
(0x0061 <= $self->{next_char} and |
if ((0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) or |
| 3110 |
$self->{next_char} <= 0x007A)) { |
(0x0041 <= $self->{next_char} and $self->{next_char} <= 0x0046) or |
| 3111 |
my $entity_name = chr $self->{next_char}; |
(0x0061 <= $self->{next_char} and $self->{next_char} <= 0x0066)) { |
| 3112 |
!!!next-input-character; |
# 0..9, A..F, a..f |
| 3113 |
|
$self->{state} = HEXREF_HEX_STATE; |
| 3114 |
my $value = $entity_name; |
$self->{state_keyword} = 0; |
| 3115 |
my $match = 0; |
## Reconsume. |
| 3116 |
require Whatpm::_NamedEntityList; |
redo A; |
| 3117 |
our $EntityChar; |
} else { |
| 3118 |
|
!!!cp (1005); |
| 3119 |
while (length $entity_name < 30 and |
!!!parse-error (type => 'bare hcro', |
| 3120 |
## NOTE: Some number greater than the maximum length of entity name |
line => $self->{line_prev}, |
| 3121 |
((0x0041 <= $self->{next_char} and # a |
column => $self->{column_prev} - 2); |
| 3122 |
$self->{next_char} <= 0x005A) or # x |
|
| 3123 |
(0x0061 <= $self->{next_char} and # a |
## NOTE: According to the spec algorithm, nothing is returned, |
| 3124 |
$self->{next_char} <= 0x007A) or # z |
## and then "&#" followed by "X" or "x" is appended to the parent |
| 3125 |
(0x0030 <= $self->{next_char} and # 0 |
## element or the attribute value in the later processing. |
| 3126 |
$self->{next_char} <= 0x0039) or # 9 |
|
| 3127 |
$self->{next_char} == 0x003B)) { # ; |
if ($self->{entity_in_attr}) { |
| 3128 |
$entity_name .= chr $self->{next_char}; |
$self->{current_attribute}->{value} .= '&' . $self->{state_keyword}; |
| 3129 |
if (defined $EntityChar->{$entity_name}) { |
$self->{state} = $self->{last_attribute_value_state}; |
| 3130 |
if ($self->{next_char} == 0x003B) { # ; |
## Reconsume. |
| 3131 |
!!!cp (1020); |
redo A; |
|
$value = $EntityChar->{$entity_name}; |
|
|
$match = 1; |
|
|
!!!next-input-character; |
|
|
last; |
|
| 3132 |
} else { |
} else { |
| 3133 |
!!!cp (1021); |
$self->{state} = DATA_STATE; |
| 3134 |
$value = $EntityChar->{$entity_name}; |
## Reconsume. |
| 3135 |
$match = -1; |
!!!emit ({type => CHARACTER_TOKEN, |
| 3136 |
!!!next-input-character; |
data => '&' . $self->{state_keyword}, |
| 3137 |
|
line => $self->{line_prev}, |
| 3138 |
|
column => $self->{column_prev} - length $self->{state_keyword}, |
| 3139 |
|
}); |
| 3140 |
|
redo A; |
| 3141 |
} |
} |
| 3142 |
} else { |
} |
| 3143 |
!!!cp (1022); |
} elsif ($self->{state} == HEXREF_HEX_STATE) { |
| 3144 |
$value .= chr $self->{next_char}; |
if (0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) { |
| 3145 |
$match *= 2; |
# 0..9 |
| 3146 |
|
!!!cp (1002); |
| 3147 |
|
$self->{state_keyword} *= 0x10; |
| 3148 |
|
$self->{state_keyword} += $self->{next_char} - 0x0030; |
| 3149 |
|
## Stay in the state. |
| 3150 |
!!!next-input-character; |
!!!next-input-character; |
| 3151 |
|
redo A; |
| 3152 |
|
} elsif (0x0061 <= $self->{next_char} and |
| 3153 |
|
$self->{next_char} <= 0x0066) { # a..f |
| 3154 |
|
!!!cp (1003); |
| 3155 |
|
$self->{state_keyword} *= 0x10; |
| 3156 |
|
$self->{state_keyword} += $self->{next_char} - 0x0060 + 9; |
| 3157 |
|
## Stay in the state. |
| 3158 |
|
!!!next-input-character; |
| 3159 |
|
redo A; |
| 3160 |
|
} elsif (0x0041 <= $self->{next_char} and |
| 3161 |
|
$self->{next_char} <= 0x0046) { # A..F |
| 3162 |
|
!!!cp (1004); |
| 3163 |
|
$self->{state_keyword} *= 0x10; |
| 3164 |
|
$self->{state_keyword} += $self->{next_char} - 0x0040 + 9; |
| 3165 |
|
## Stay in the state. |
| 3166 |
|
!!!next-input-character; |
| 3167 |
|
redo A; |
| 3168 |
|
} elsif ($self->{next_char} == 0x003B) { # ; |
| 3169 |
|
!!!cp (1006); |
| 3170 |
|
!!!next-input-character; |
| 3171 |
|
# |
| 3172 |
|
} else { |
| 3173 |
|
!!!cp (1007); |
| 3174 |
|
!!!parse-error (type => 'no refc', |
| 3175 |
|
line => $self->{line}, |
| 3176 |
|
column => $self->{column}); |
| 3177 |
|
## Reconsume. |
| 3178 |
|
# |
| 3179 |
} |
} |
| 3180 |
} |
|
| 3181 |
|
my $code = $self->{state_keyword}; |
| 3182 |
if ($match > 0) { |
my $l = $self->{line_prev}; |
| 3183 |
!!!cp (1023); |
my $c = $self->{column_prev}; |
| 3184 |
$self->{entity_return} = {type => CHARACTER_TOKEN, data => $value, has_reference => 1, |
if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) { |
| 3185 |
line => $l, column => $c, |
!!!cp (1008); |
| 3186 |
}; |
!!!parse-error (type => 'invalid character reference', |
| 3187 |
$self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE; |
text => (sprintf 'U+%04X', $code), |
| 3188 |
redo A; |
line => $l, column => $c); |
| 3189 |
} elsif ($match < 0) { |
$code = 0xFFFD; |
| 3190 |
!!!parse-error (type => 'no refc', line => $l, column => $c); |
} elsif ($code > 0x10FFFF) { |
| 3191 |
if ($in_attr and $match < -1) { |
!!!cp (1009); |
| 3192 |
!!!cp (1024); |
!!!parse-error (type => 'invalid character reference', |
| 3193 |
$self->{entity_return} = {type => CHARACTER_TOKEN, data => '&'.$entity_name, |
text => (sprintf 'U-%08X', $code), |
| 3194 |
line => $l, column => $c, |
line => $l, column => $c); |
| 3195 |
}; |
$code = 0xFFFD; |
| 3196 |
$self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE; |
} elsif ($code == 0x000D) { |
| 3197 |
|
!!!cp (1010); |
| 3198 |
|
!!!parse-error (type => 'CR character reference', line => $l, column => $c); |
| 3199 |
|
$code = 0x000A; |
| 3200 |
|
} elsif (0x80 <= $code and $code <= 0x9F) { |
| 3201 |
|
!!!cp (1011); |
| 3202 |
|
!!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c); |
| 3203 |
|
$code = $c1_entity_char->{$code}; |
| 3204 |
|
} |
| 3205 |
|
|
| 3206 |
|
if ($self->{entity_in_attr}) { |
| 3207 |
|
$self->{current_attribute}->{value} .= chr $code; |
| 3208 |
|
$self->{current_attribute}->{has_reference} = 1; |
| 3209 |
|
$self->{state} = $self->{last_attribute_value_state}; |
| 3210 |
|
## Reconsume. |
| 3211 |
redo A; |
redo A; |
| 3212 |
} else { |
} else { |
| 3213 |
!!!cp (1025); |
$self->{state} = DATA_STATE; |
| 3214 |
$self->{entity_return} = {type => CHARACTER_TOKEN, data => $value, has_reference => 1, |
## Reconsume. |
| 3215 |
line => $l, column => $c, |
!!!emit ({type => CHARACTER_TOKEN, data => chr $code, |
| 3216 |
}; |
has_reference => 1, |
| 3217 |
$self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE; |
line => $l, column => $c, |
| 3218 |
|
}); |
| 3219 |
redo A; |
redo A; |
| 3220 |
} |
} |
| 3221 |
} else { |
} elsif ($self->{state} == ENTITY_NAME_STATE) { |
| 3222 |
!!!cp (1026); |
if (length $self->{state_keyword} < 30 and |
| 3223 |
!!!parse-error (type => 'bare ero', line => $l, column => $c); |
## NOTE: Some number greater than the maximum length of entity name |
| 3224 |
## NOTE: "No characters are consumed" in the spec. |
((0x0041 <= $self->{next_char} and # a |
| 3225 |
$self->{entity_return} = {type => CHARACTER_TOKEN, data => '&'.$value, |
$self->{next_char} <= 0x005A) or # x |
| 3226 |
line => $l, column => $c, |
(0x0061 <= $self->{next_char} and # a |
| 3227 |
}; |
$self->{next_char} <= 0x007A) or # z |
| 3228 |
$self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE; |
(0x0030 <= $self->{next_char} and # 0 |
| 3229 |
redo A; |
$self->{next_char} <= 0x0039) or # 9 |
| 3230 |
} |
$self->{next_char} == 0x003B)) { # ; |
| 3231 |
} else { |
our $EntityChar; |
| 3232 |
!!!cp (1027); |
$self->{state_keyword} .= chr $self->{next_char}; |
| 3233 |
## no characters are consumed |
if (defined $EntityChar->{$self->{state_keyword}}) { |
| 3234 |
!!!parse-error (type => 'bare ero', line => $l, column => $c); |
if ($self->{next_char} == 0x003B) { # ; |
| 3235 |
$self->{entity_return} = undef; |
!!!cp (1020); |
| 3236 |
$self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE; |
$self->{entity__value} = $EntityChar->{$self->{state_keyword}}; |
| 3237 |
redo A; |
$self->{entity__match} = 1; |
| 3238 |
} |
!!!next-input-character; |
| 3239 |
|
# |
| 3240 |
|
} else { |
| 3241 |
|
!!!cp (1021); |
| 3242 |
|
$self->{entity__value} = $EntityChar->{$self->{state_keyword}}; |
| 3243 |
|
$self->{entity__match} = -1; |
| 3244 |
|
## Stay in the state. |
| 3245 |
|
!!!next-input-character; |
| 3246 |
|
redo A; |
| 3247 |
|
} |
| 3248 |
|
} else { |
| 3249 |
|
!!!cp (1022); |
| 3250 |
|
$self->{entity__value} .= chr $self->{next_char}; |
| 3251 |
|
$self->{entity__match} *= 2; |
| 3252 |
|
## Stay in the state. |
| 3253 |
|
!!!next-input-character; |
| 3254 |
|
redo A; |
| 3255 |
|
} |
| 3256 |
|
} |
| 3257 |
|
|
| 3258 |
|
my $data; |
| 3259 |
|
my $has_ref; |
| 3260 |
|
if ($self->{entity__match} > 0) { |
| 3261 |
|
!!!cp (1023); |
| 3262 |
|
$data = $self->{entity__value}; |
| 3263 |
|
$has_ref = 1; |
| 3264 |
|
# |
| 3265 |
|
} elsif ($self->{entity__match} < 0) { |
| 3266 |
|
!!!parse-error (type => 'no refc'); |
| 3267 |
|
if ($self->{entity_in_attr} and $self->{entity__match} < -1) { |
| 3268 |
|
!!!cp (1024); |
| 3269 |
|
$data = '&' . $self->{state_keyword}; |
| 3270 |
|
# |
| 3271 |
|
} else { |
| 3272 |
|
!!!cp (1025); |
| 3273 |
|
$data = $self->{entity__value}; |
| 3274 |
|
$has_ref = 1; |
| 3275 |
|
# |
| 3276 |
|
} |
| 3277 |
|
} else { |
| 3278 |
|
!!!cp (1026); |
| 3279 |
|
!!!parse-error (type => 'bare ero', |
| 3280 |
|
line => $self->{line_prev}, |
| 3281 |
|
column => $self->{column_prev}); |
| 3282 |
|
$data = '&' . $self->{state_keyword}; |
| 3283 |
|
# |
| 3284 |
|
} |
| 3285 |
|
|
| 3286 |
|
## NOTE: In these cases, when a character reference is found, |
| 3287 |
|
## it is consumed and a character token is returned, or, otherwise, |
| 3288 |
|
## nothing is consumed and returned, according to the spec algorithm. |
| 3289 |
|
## In this implementation, anything that has been examined by the |
| 3290 |
|
## tokenizer is appended to the parent element or the attribute value |
| 3291 |
|
## as string, either literal string when no character reference or |
| 3292 |
|
## entity-replaced string otherwise, in this stage, since any characters |
| 3293 |
|
## that would not be consumed are appended in the data state or in an |
| 3294 |
|
## appropriate attribute value state anyway. |
| 3295 |
|
|
| 3296 |
|
if ($self->{entity_in_attr}) { |
| 3297 |
|
$self->{current_attribute}->{value} .= $data; |
| 3298 |
|
$self->{current_attribute}->{has_reference} = 1 if $has_ref; |
| 3299 |
|
$self->{state} = $self->{last_attribute_value_state}; |
| 3300 |
|
## Reconsume. |
| 3301 |
|
redo A; |
| 3302 |
|
} else { |
| 3303 |
|
$self->{state} = DATA_STATE; |
| 3304 |
|
## Reconsume. |
| 3305 |
|
!!!emit ({type => CHARACTER_TOKEN, |
| 3306 |
|
data => $data, has_reference => $has_ref, |
| 3307 |
|
line => $self->{line_prev}, |
| 3308 |
|
column => $self->{column_prev} + 1 - length $self->{state_keyword}, |
| 3309 |
|
}); |
| 3310 |
|
redo A; |
| 3311 |
|
} |
| 3312 |
} else { |
} else { |
| 3313 |
die "$0: $self->{state}: Unknown state"; |
die "$0: $self->{state}: Unknown state"; |
| 3314 |
} |
} |