769 |
## Content-model value for PCDATA: the bitwise OR of the |CM_ENTITY| and
## |CM_FULL_MARKUP| flags.  The empty prototype lets the name be used
## like a constant, without parentheses.
## NOTE(review): |CM_ENTITY| and |CM_FULL_MARKUP| are defined outside
## this excerpt; presumably they are single-bit flags - confirm.
sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } |
sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } |
770 |
|
|
771 |
## Tokenizer state identifiers.  The tokenizer compares |$self->{state}|
## against these with |==|, so every state needs its own integer.
## Of each pair of lines below, the second one comments out
## |ENTITY_DATA_STATE| (1) and |ENTITY_IN_ATTRIBUTE_VALUE_STATE| (12);
## the numbers 1 and 12 are then simply left unused.
sub DATA_STATE () { 0 } |
sub DATA_STATE () { 0 } |
772 |
sub ENTITY_DATA_STATE () { 1 } |
#sub ENTITY_DATA_STATE () { 1 } |
773 |
sub TAG_OPEN_STATE () { 2 } |
sub TAG_OPEN_STATE () { 2 } |
774 |
sub CLOSE_TAG_OPEN_STATE () { 3 } |
sub CLOSE_TAG_OPEN_STATE () { 3 } |
775 |
sub TAG_NAME_STATE () { 4 } |
sub TAG_NAME_STATE () { 4 } |
780 |
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 } |
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 } |
781 |
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 } |
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 } |
782 |
sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 } |
sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 } |
783 |
sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 } |
#sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 } |
784 |
sub MARKUP_DECLARATION_OPEN_STATE () { 13 } |
sub MARKUP_DECLARATION_OPEN_STATE () { 13 } |
785 |
sub COMMENT_START_STATE () { 14 } |
sub COMMENT_START_STATE () { 14 } |
786 |
sub COMMENT_START_DASH_STATE () { 15 } |
sub COMMENT_START_DASH_STATE () { 15 } |
812 |
## Tokenizer state identifiers 41 to 49.  States 44 to 49 are the
## character reference states: |ENTITY_STATE| is entered after an "&",
## with |$self->{entity_in_attr}| and |$self->{entity_additional}| set
## by the state that saw the "&".
sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec |
sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec |
813 |
sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec |
sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec |
814 |
sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec |
sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec |
815 |
sub ENTITY_STATE () { 44 } # "consume a character reference" in the spec |
## NOTE: "Entity data state", "entity in attribute value state", and |
816 |
|
## "consume a character reference" algorithm are jointly implemented |
817 |
|
## using the following six states: |
818 |
|
sub ENTITY_STATE () { 44 } |
819 |
|
sub ENTITY_HASH_STATE () { 45 } |
820 |
|
sub NCR_NUM_STATE () { 46 } |
821 |
|
sub HEXREF_X_STATE () { 47 } |
822 |
|
sub HEXREF_HEX_STATE () { 48 } |
823 |
|
sub ENTITY_NAME_STATE () { 49 } |
824 |
|
|
825 |
## Token type identifiers.
## NOTE(review): presumably stored in the |type| field of emitted
## tokens, as |CHARACTER_TOKEN| is elsewhere in this file - confirm.
sub DOCTYPE_TOKEN () { 1 } |
sub DOCTYPE_TOKEN () { 1 } |
826 |
sub COMMENT_TOKEN () { 2 } |
sub COMMENT_TOKEN () { 2 } |
953 |
## "entity data state". In this implementation, the tokenizer |
## "entity data state". In this implementation, the tokenizer |
954 |
## is switched to the |ENTITY_STATE|, which is an implementation |
## is switched to the |ENTITY_STATE|, which is an implementation |
955 |
## of the "consume a character reference" algorithm. |
## of the "consume a character reference" algorithm. |
|
#$self->{state} = ENTITY_DATA_STATE; |
|
956 |
$self->{entity_in_attr} = 0; |
$self->{entity_in_attr} = 0; |
957 |
$self->{entity_additional} = -1; |
$self->{entity_additional} = -1; |
958 |
$self->{state} = ENTITY_STATE; |
$self->{state} = ENTITY_STATE; |
1025 |
!!!emit ($token); |
!!!emit ($token); |
1026 |
|
|
1027 |
redo A; |
redo A; |
|
} elsif ($self->{state} == ENTITY_DATA_STATE) { |
|
|
my ($l, $c) = ($self->{line_prev}, $self->{column_prev}); |
|
|
|
|
|
my $token = $self->{entity_return}; |
|
|
|
|
|
$self->{state} = DATA_STATE; |
|
|
# next-input-character is already done |
|
|
|
|
|
unless (defined $token) { |
|
|
!!!cp (13); |
|
|
!!!emit ({type => CHARACTER_TOKEN, data => '&', |
|
|
line => $l, column => $c, |
|
|
}); |
|
|
} else { |
|
|
!!!cp (14); |
|
|
!!!emit ($token); |
|
|
} |
|
|
|
|
|
redo A; |
|
1028 |
} elsif ($self->{state} == TAG_OPEN_STATE) { |
} elsif ($self->{state} == TAG_OPEN_STATE) { |
1029 |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
1030 |
if ($self->{next_char} == 0x002F) { # / |
if ($self->{next_char} == 0x002F) { # / |
1703 |
## "entity in attribute value state". In this implementation, the |
## "entity in attribute value state". In this implementation, the |
1704 |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
1705 |
## implementation of the "consume a character reference" algorithm. |
## implementation of the "consume a character reference" algorithm. |
|
#$self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE; |
|
1706 |
$self->{entity_in_attr} = 1; |
$self->{entity_in_attr} = 1; |
1707 |
$self->{entity_additional} = 0x0022; # " |
$self->{entity_additional} = 0x0022; # " |
1708 |
$self->{state} = ENTITY_STATE; |
$self->{state} = ENTITY_STATE; |
1751 |
## "entity in attribute value state". In this implementation, the |
## "entity in attribute value state". In this implementation, the |
1752 |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
1753 |
## implementation of the "consume a character reference" algorithm. |
## implementation of the "consume a character reference" algorithm. |
|
#$self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE; |
|
1754 |
$self->{entity_in_attr} = 1; |
$self->{entity_in_attr} = 1; |
1755 |
$self->{entity_additional} = 0x0027; # ' |
$self->{entity_additional} = 0x0027; # ' |
1756 |
$self->{state} = ENTITY_STATE; |
$self->{state} = ENTITY_STATE; |
1803 |
## "entity in attribute value state". In this implementation, the |
## "entity in attribute value state". In this implementation, the |
1804 |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
1805 |
## implementation of the "consume a character reference" algorithm. |
## implementation of the "consume a character reference" algorithm. |
|
#$self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE; |
|
1806 |
$self->{entity_in_attr} = 1; |
$self->{entity_in_attr} = 1; |
1807 |
$self->{entity_additional} = -1; |
$self->{entity_additional} = -1; |
1808 |
$self->{state} = ENTITY_STATE; |
$self->{state} = ENTITY_STATE; |
1869 |
!!!next-input-character; |
!!!next-input-character; |
1870 |
redo A; |
redo A; |
1871 |
} |
} |
|
} elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) { |
|
|
my $token = $self->{entity_return}; |
|
|
|
|
|
unless (defined $token) { |
|
|
!!!cp (117); |
|
|
$self->{current_attribute}->{value} .= '&'; |
|
|
} else { |
|
|
!!!cp (118); |
|
|
$self->{current_attribute}->{value} .= $token->{data}; |
|
|
$self->{current_attribute}->{has_reference} = $token->{has_reference}; |
|
|
## ISSUE: spec says "append the returned character token to the current attribute's value" |
|
|
} |
|
|
|
|
|
$self->{state} = $self->{last_attribute_value_state}; |
|
|
# next-input-character is already done |
|
|
redo A; |
|
1872 |
} elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) { |
} elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) { |
1873 |
if ($self->{next_char} == 0x0009 or # HT |
if ($self->{next_char} == 0x0009 or # HT |
1874 |
$self->{next_char} == 0x000A or # LF |
$self->{next_char} == 0x000A or # LF |
2948 |
## Reconsume. |
## Reconsume. |
2949 |
redo A; |
redo A; |
2950 |
} |
} |
|
|
|
2951 |
} elsif ($self->{state} == ENTITY_STATE) { |
} elsif ($self->{state} == ENTITY_STATE) { |
2952 |
my $in_attr = $self->{entity_in_attr}; |
if ({ |
2953 |
my $additional = $self->{entity_additional}; |
0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF, |
2954 |
|
0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & |
2955 |
|
$self->{entity_additional} => 1, |
2956 |
|
}->{$self->{next_char}}) { |
2957 |
|
!!!cp (1001); |
2958 |
|
## Don't consume |
2959 |
|
## No error |
2960 |
|
## Return nothing. |
2961 |
|
# |
2962 |
|
} elsif ($self->{next_char} == 0x0023) { # # |
2963 |
|
$self->{state} = ENTITY_HASH_STATE; |
2964 |
|
$self->{state_keyword} = '#'; |
2965 |
|
!!!next-input-character; |
2966 |
|
redo A; |
2967 |
|
} elsif ((0x0041 <= $self->{next_char} and |
2968 |
|
$self->{next_char} <= 0x005A) or # A..Z |
2969 |
|
(0x0061 <= $self->{next_char} and |
2970 |
|
$self->{next_char} <= 0x007A)) { # a..z |
2971 |
|
require Whatpm::_NamedEntityList; |
2972 |
|
$self->{state} = ENTITY_NAME_STATE; |
2973 |
|
$self->{state_keyword} = chr $self->{next_char}; |
2974 |
|
$self->{entity__value} = $self->{state_keyword}; |
2975 |
|
$self->{entity__match} = 0; |
2976 |
|
!!!next-input-character; |
2977 |
|
redo A; |
2978 |
|
} else { |
2979 |
|
!!!cp (1027); |
2980 |
|
!!!parse-error (type => 'bare ero'); |
2981 |
|
## Return nothing. |
2982 |
|
# |
2983 |
|
} |
2984 |
|
|
2985 |
my ($l, $c) = ($self->{line_prev}, $self->{column_prev}); |
## NOTE: No character is consumed by the "consume a character |
2986 |
|
## reference" algorithm. In other words, there is an "&" character |
2987 |
|
## that does not introduce a character reference, which would be |
2988 |
|
## appended to the parent element or the attribute value in later |
2989 |
|
## process of the tokenizer. |
2990 |
|
|
2991 |
if ({ |
if ($self->{entity_in_attr}) { |
2992 |
0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF, |
$self->{current_attribute}->{value} .= '&'; |
2993 |
0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR |
$self->{state} = $self->{last_attribute_value_state}; |
2994 |
$additional => 1, |
## Reconsume. |
2995 |
}->{$self->{next_char}}) { |
redo A; |
2996 |
!!!cp (1001); |
} else { |
2997 |
## Don't consume |
$self->{state} = DATA_STATE; |
2998 |
## No error |
## Reconsume. |
2999 |
$self->{entity_return} = undef; |
!!!emit ({type => CHARACTER_TOKEN, data => '&', |
3000 |
$self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE; |
line => $self->{line_prev}, |
3001 |
redo A; |
column => $self->{column_prev}, |
3002 |
} elsif ($self->{next_char} == 0x0023) { # # |
}); |
3003 |
!!!next-input-character; |
redo A; |
3004 |
if ($self->{next_char} == 0x0078 or # x |
} |
3005 |
$self->{next_char} == 0x0058) { # X |
} elsif ($self->{state} == ENTITY_HASH_STATE) { |
3006 |
my $code; |
if ($self->{next_char} == 0x0078 or # x |
3007 |
X: { |
$self->{next_char} == 0x0058) { # X |
3008 |
my $x_char = $self->{next_char}; |
$self->{state} = HEXREF_X_STATE; |
3009 |
!!!next-input-character; |
$self->{state_keyword} .= chr $self->{next_char}; |
3010 |
if (0x0030 <= $self->{next_char} and |
!!!next-input-character; |
3011 |
$self->{next_char} <= 0x0039) { # 0..9 |
redo A; |
3012 |
!!!cp (1002); |
} elsif (0x0030 <= $self->{next_char} and |
3013 |
$code ||= 0; |
$self->{next_char} <= 0x0039) { # 0..9 |
3014 |
$code *= 0x10; |
$self->{state} = NCR_NUM_STATE; |
3015 |
$code += $self->{next_char} - 0x0030; |
$self->{state_keyword} = $self->{next_char} - 0x0030; |
3016 |
redo X; |
!!!next-input-character; |
3017 |
} elsif (0x0061 <= $self->{next_char} and |
redo A; |
3018 |
$self->{next_char} <= 0x0066) { # a..f |
} else { |
3019 |
!!!cp (1003); |
!!!cp (1019); |
3020 |
$code ||= 0; |
!!!parse-error (type => 'bare nero', |
3021 |
$code *= 0x10; |
line => $self->{line_prev}, |
3022 |
$code += $self->{next_char} - 0x0060 + 9; |
column => $self->{column_prev} - 1); |
3023 |
redo X; |
|
3024 |
} elsif (0x0041 <= $self->{next_char} and |
## NOTE: According to the spec algorithm, nothing is returned, |
3025 |
$self->{next_char} <= 0x0046) { # A..F |
## and then "&#" is appended to the parent element or the attribute |
3026 |
!!!cp (1004); |
## value in the later processing. |
3027 |
$code ||= 0; |
|
3028 |
$code *= 0x10; |
if ($self->{entity_in_attr}) { |
3029 |
$code += $self->{next_char} - 0x0040 + 9; |
$self->{current_attribute}->{value} .= '&#'; |
3030 |
redo X; |
$self->{state} = $self->{last_attribute_value_state}; |
3031 |
} elsif (not defined $code) { # no hexadecimal digit |
## Reconsume. |
|
!!!cp (1005); |
|
|
!!!parse-error (type => 'bare hcro', line => $l, column => $c); |
|
|
!!!back-next-input-character ($x_char, $self->{next_char}); |
|
|
$self->{next_char} = 0x0023; # # |
|
|
$self->{entity_return} = undef; |
|
|
$self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE; |
|
3032 |
redo A; |
redo A; |
|
} elsif ($self->{next_char} == 0x003B) { # ; |
|
|
!!!cp (1006); |
|
|
!!!next-input-character; |
|
3033 |
} else { |
} else { |
3034 |
!!!cp (1007); |
$self->{state} = DATA_STATE; |
3035 |
!!!parse-error (type => 'no refc', line => $l, column => $c); |
## Reconsume. |
3036 |
|
!!!emit ({type => CHARACTER_TOKEN, |
3037 |
|
data => '&#', |
3038 |
|
line => $self->{line_prev}, |
3039 |
|
column => $self->{column_prev} - 1, |
3040 |
|
}); |
3041 |
|
redo A; |
3042 |
} |
} |
3043 |
|
} |
3044 |
if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) { |
} elsif ($self->{state} == NCR_NUM_STATE) { |
3045 |
!!!cp (1008); |
if (0x0030 <= $self->{next_char} and |
3046 |
!!!parse-error (type => 'invalid character reference', |
$self->{next_char} <= 0x0039) { # 0..9 |
|
text => (sprintf 'U+%04X', $code), |
|
|
line => $l, column => $c); |
|
|
$code = 0xFFFD; |
|
|
} elsif ($code > 0x10FFFF) { |
|
|
!!!cp (1009); |
|
|
!!!parse-error (type => 'invalid character reference', |
|
|
text => (sprintf 'U-%08X', $code), |
|
|
line => $l, column => $c); |
|
|
$code = 0xFFFD; |
|
|
} elsif ($code == 0x000D) { |
|
|
!!!cp (1010); |
|
|
!!!parse-error (type => 'CR character reference', line => $l, column => $c); |
|
|
$code = 0x000A; |
|
|
} elsif (0x80 <= $code and $code <= 0x9F) { |
|
|
!!!cp (1011); |
|
|
!!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c); |
|
|
$code = $c1_entity_char->{$code}; |
|
|
} |
|
|
|
|
|
$self->{entity_return} = {type => CHARACTER_TOKEN, data => chr $code, |
|
|
has_reference => 1, |
|
|
line => $l, column => $c, |
|
|
}; |
|
|
$self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE; |
|
|
redo A; |
|
|
} # X |
|
|
} elsif (0x0030 <= $self->{next_char} and |
|
|
$self->{next_char} <= 0x0039) { # 0..9 |
|
|
my $code = $self->{next_char} - 0x0030; |
|
|
!!!next-input-character; |
|
|
|
|
|
while (0x0030 <= $self->{next_char} and |
|
|
$self->{next_char} <= 0x0039) { # 0..9 |
|
3047 |
!!!cp (1012); |
!!!cp (1012); |
3048 |
$code *= 10; |
$self->{state_keyword} *= 10; |
3049 |
$code += $self->{next_char} - 0x0030; |
$self->{state_keyword} += $self->{next_char} - 0x0030; |
3050 |
|
|
3051 |
|
## Stay in the state. |
3052 |
!!!next-input-character; |
!!!next-input-character; |
3053 |
} |
redo A; |
3054 |
|
} elsif ($self->{next_char} == 0x003B) { # ; |
|
if ($self->{next_char} == 0x003B) { # ; |
|
3055 |
!!!cp (1013); |
!!!cp (1013); |
3056 |
!!!next-input-character; |
!!!next-input-character; |
3057 |
|
# |
3058 |
} else { |
} else { |
3059 |
!!!cp (1014); |
!!!cp (1014); |
3060 |
!!!parse-error (type => 'no refc', line => $l, column => $c); |
!!!parse-error (type => 'no refc'); |
3061 |
|
## Reconsume. |
3062 |
|
# |
3063 |
} |
} |
3064 |
|
|
3065 |
|
my $code = $self->{state_keyword}; |
3066 |
|
my $l = $self->{line_prev}; |
3067 |
|
my $c = $self->{column_prev}; |
3068 |
if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) { |
if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) { |
3069 |
!!!cp (1015); |
!!!cp (1015); |
3070 |
!!!parse-error (type => 'invalid character reference', |
!!!parse-error (type => 'invalid character reference', |
3089 |
line => $l, column => $c); |
line => $l, column => $c); |
3090 |
$code = $c1_entity_char->{$code}; |
$code = $c1_entity_char->{$code}; |
3091 |
} |
} |
3092 |
|
|
3093 |
$self->{entity_return} = {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1, |
if ($self->{entity_in_attr}) { |
3094 |
line => $l, column => $c, |
$self->{current_attribute}->{value} .= chr $code; |
3095 |
}; |
$self->{current_attribute}->{has_reference} = 1; |
3096 |
$self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE; |
$self->{state} = $self->{last_attribute_value_state}; |
3097 |
redo A; |
## Reconsume. |
3098 |
} else { |
redo A; |
3099 |
!!!cp (1019); |
} else { |
3100 |
!!!parse-error (type => 'bare nero', line => $l, column => $c); |
$self->{state} = DATA_STATE; |
3101 |
!!!back-next-input-character ($self->{next_char}); |
## Reconsume. |
3102 |
$self->{next_char} = 0x0023; # # |
!!!emit ({type => CHARACTER_TOKEN, data => chr $code, |
3103 |
$self->{entity_return} = undef; |
has_reference => 1, |
3104 |
$self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE; |
line => $l, column => $c, |
3105 |
redo A; |
}); |
3106 |
} |
redo A; |
3107 |
} elsif ((0x0041 <= $self->{next_char} and |
} |
3108 |
$self->{next_char} <= 0x005A) or |
} elsif ($self->{state} == HEXREF_X_STATE) { |
3109 |
(0x0061 <= $self->{next_char} and |
if ((0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) or |
3110 |
$self->{next_char} <= 0x007A)) { |
(0x0041 <= $self->{next_char} and $self->{next_char} <= 0x0046) or |
3111 |
my $entity_name = chr $self->{next_char}; |
(0x0061 <= $self->{next_char} and $self->{next_char} <= 0x0066)) { |
3112 |
!!!next-input-character; |
# 0..9, A..F, a..f |
3113 |
|
$self->{state} = HEXREF_HEX_STATE; |
3114 |
my $value = $entity_name; |
$self->{state_keyword} = 0; |
3115 |
my $match = 0; |
## Reconsume. |
3116 |
require Whatpm::_NamedEntityList; |
redo A; |
3117 |
our $EntityChar; |
} else { |
3118 |
|
!!!cp (1005); |
3119 |
while (length $entity_name < 30 and |
!!!parse-error (type => 'bare hcro', |
3120 |
## NOTE: Some number greater than the maximum length of entity name |
line => $self->{line_prev}, |
3121 |
((0x0041 <= $self->{next_char} and # a |
column => $self->{column_prev} - 2); |
3122 |
$self->{next_char} <= 0x005A) or # x |
|
3123 |
(0x0061 <= $self->{next_char} and # a |
## NOTE: According to the spec algorithm, nothing is returned, |
3124 |
$self->{next_char} <= 0x007A) or # z |
## and then "&#" followed by "X" or "x" is appended to the parent |
3125 |
(0x0030 <= $self->{next_char} and # 0 |
## element or the attribute value in the later processing. |
3126 |
$self->{next_char} <= 0x0039) or # 9 |
|
3127 |
$self->{next_char} == 0x003B)) { # ; |
if ($self->{entity_in_attr}) { |
3128 |
$entity_name .= chr $self->{next_char}; |
$self->{current_attribute}->{value} .= '&' . $self->{state_keyword}; |
3129 |
if (defined $EntityChar->{$entity_name}) { |
$self->{state} = $self->{last_attribute_value_state}; |
3130 |
if ($self->{next_char} == 0x003B) { # ; |
## Reconsume. |
3131 |
!!!cp (1020); |
redo A; |
|
$value = $EntityChar->{$entity_name}; |
|
|
$match = 1; |
|
|
!!!next-input-character; |
|
|
last; |
|
3132 |
} else { |
} else { |
3133 |
!!!cp (1021); |
$self->{state} = DATA_STATE; |
3134 |
$value = $EntityChar->{$entity_name}; |
## Reconsume. |
3135 |
$match = -1; |
!!!emit ({type => CHARACTER_TOKEN, |
3136 |
!!!next-input-character; |
data => '&' . $self->{state_keyword}, |
3137 |
|
line => $self->{line_prev}, |
3138 |
|
column => $self->{column_prev} - length $self->{state_keyword}, |
3139 |
|
}); |
3140 |
|
redo A; |
3141 |
} |
} |
3142 |
} else { |
} |
3143 |
!!!cp (1022); |
} elsif ($self->{state} == HEXREF_HEX_STATE) { |
3144 |
$value .= chr $self->{next_char}; |
if (0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) { |
3145 |
$match *= 2; |
# 0..9 |
3146 |
|
!!!cp (1002); |
3147 |
|
$self->{state_keyword} *= 0x10; |
3148 |
|
$self->{state_keyword} += $self->{next_char} - 0x0030; |
3149 |
|
## Stay in the state. |
3150 |
!!!next-input-character; |
!!!next-input-character; |
3151 |
|
redo A; |
3152 |
|
} elsif (0x0061 <= $self->{next_char} and |
3153 |
|
$self->{next_char} <= 0x0066) { # a..f |
3154 |
|
!!!cp (1003); |
3155 |
|
$self->{state_keyword} *= 0x10; |
3156 |
|
$self->{state_keyword} += $self->{next_char} - 0x0060 + 9; |
3157 |
|
## Stay in the state. |
3158 |
|
!!!next-input-character; |
3159 |
|
redo A; |
3160 |
|
} elsif (0x0041 <= $self->{next_char} and |
3161 |
|
$self->{next_char} <= 0x0046) { # A..F |
3162 |
|
!!!cp (1004); |
3163 |
|
$self->{state_keyword} *= 0x10; |
3164 |
|
$self->{state_keyword} += $self->{next_char} - 0x0040 + 9; |
3165 |
|
## Stay in the state. |
3166 |
|
!!!next-input-character; |
3167 |
|
redo A; |
3168 |
|
} elsif ($self->{next_char} == 0x003B) { # ; |
3169 |
|
!!!cp (1006); |
3170 |
|
!!!next-input-character; |
3171 |
|
# |
3172 |
|
} else { |
3173 |
|
!!!cp (1007); |
3174 |
|
!!!parse-error (type => 'no refc', |
3175 |
|
line => $self->{line}, |
3176 |
|
column => $self->{column}); |
3177 |
|
## Reconsume. |
3178 |
|
# |
3179 |
} |
} |
3180 |
} |
|
3181 |
|
my $code = $self->{state_keyword}; |
3182 |
if ($match > 0) { |
my $l = $self->{line_prev}; |
3183 |
!!!cp (1023); |
my $c = $self->{column_prev}; |
3184 |
$self->{entity_return} = {type => CHARACTER_TOKEN, data => $value, has_reference => 1, |
if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) { |
3185 |
line => $l, column => $c, |
!!!cp (1008); |
3186 |
}; |
!!!parse-error (type => 'invalid character reference', |
3187 |
$self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE; |
text => (sprintf 'U+%04X', $code), |
3188 |
redo A; |
line => $l, column => $c); |
3189 |
} elsif ($match < 0) { |
$code = 0xFFFD; |
3190 |
!!!parse-error (type => 'no refc', line => $l, column => $c); |
} elsif ($code > 0x10FFFF) { |
3191 |
if ($in_attr and $match < -1) { |
!!!cp (1009); |
3192 |
!!!cp (1024); |
!!!parse-error (type => 'invalid character reference', |
3193 |
$self->{entity_return} = {type => CHARACTER_TOKEN, data => '&'.$entity_name, |
text => (sprintf 'U-%08X', $code), |
3194 |
line => $l, column => $c, |
line => $l, column => $c); |
3195 |
}; |
$code = 0xFFFD; |
3196 |
$self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE; |
} elsif ($code == 0x000D) { |
3197 |
|
!!!cp (1010); |
3198 |
|
!!!parse-error (type => 'CR character reference', line => $l, column => $c); |
3199 |
|
$code = 0x000A; |
3200 |
|
} elsif (0x80 <= $code and $code <= 0x9F) { |
3201 |
|
!!!cp (1011); |
3202 |
|
!!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c); |
3203 |
|
$code = $c1_entity_char->{$code}; |
3204 |
|
} |
3205 |
|
|
3206 |
|
if ($self->{entity_in_attr}) { |
3207 |
|
$self->{current_attribute}->{value} .= chr $code; |
3208 |
|
$self->{current_attribute}->{has_reference} = 1; |
3209 |
|
$self->{state} = $self->{last_attribute_value_state}; |
3210 |
|
## Reconsume. |
3211 |
redo A; |
redo A; |
3212 |
} else { |
} else { |
3213 |
!!!cp (1025); |
$self->{state} = DATA_STATE; |
3214 |
$self->{entity_return} = {type => CHARACTER_TOKEN, data => $value, has_reference => 1, |
## Reconsume. |
3215 |
line => $l, column => $c, |
!!!emit ({type => CHARACTER_TOKEN, data => chr $code, |
3216 |
}; |
has_reference => 1, |
3217 |
$self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE; |
line => $l, column => $c, |
3218 |
|
}); |
3219 |
redo A; |
redo A; |
3220 |
} |
} |
3221 |
} else { |
} elsif ($self->{state} == ENTITY_NAME_STATE) { |
3222 |
!!!cp (1026); |
if (length $self->{state_keyword} < 30 and |
3223 |
!!!parse-error (type => 'bare ero', line => $l, column => $c); |
## NOTE: Some number greater than the maximum length of entity name |
3224 |
## NOTE: "No characters are consumed" in the spec. |
((0x0041 <= $self->{next_char} and # a |
3225 |
$self->{entity_return} = {type => CHARACTER_TOKEN, data => '&'.$value, |
$self->{next_char} <= 0x005A) or # x |
3226 |
line => $l, column => $c, |
(0x0061 <= $self->{next_char} and # a |
3227 |
}; |
$self->{next_char} <= 0x007A) or # z |
3228 |
$self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE; |
(0x0030 <= $self->{next_char} and # 0 |
3229 |
redo A; |
$self->{next_char} <= 0x0039) or # 9 |
3230 |
} |
$self->{next_char} == 0x003B)) { # ; |
3231 |
} else { |
our $EntityChar; |
3232 |
!!!cp (1027); |
$self->{state_keyword} .= chr $self->{next_char}; |
3233 |
## no characters are consumed |
if (defined $EntityChar->{$self->{state_keyword}}) { |
3234 |
!!!parse-error (type => 'bare ero', line => $l, column => $c); |
if ($self->{next_char} == 0x003B) { # ; |
3235 |
$self->{entity_return} = undef; |
!!!cp (1020); |
3236 |
$self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE; |
$self->{entity__value} = $EntityChar->{$self->{state_keyword}}; |
3237 |
redo A; |
$self->{entity__match} = 1; |
3238 |
} |
!!!next-input-character; |
3239 |
|
# |
3240 |
|
} else { |
3241 |
|
!!!cp (1021); |
3242 |
|
$self->{entity__value} = $EntityChar->{$self->{state_keyword}}; |
3243 |
|
$self->{entity__match} = -1; |
3244 |
|
## Stay in the state. |
3245 |
|
!!!next-input-character; |
3246 |
|
redo A; |
3247 |
|
} |
3248 |
|
} else { |
3249 |
|
!!!cp (1022); |
3250 |
|
$self->{entity__value} .= chr $self->{next_char}; |
3251 |
|
$self->{entity__match} *= 2; |
3252 |
|
## Stay in the state. |
3253 |
|
!!!next-input-character; |
3254 |
|
redo A; |
3255 |
|
} |
3256 |
|
} |
3257 |
|
|
3258 |
|
my $data; |
3259 |
|
my $has_ref; |
3260 |
|
if ($self->{entity__match} > 0) { |
3261 |
|
!!!cp (1023); |
3262 |
|
$data = $self->{entity__value}; |
3263 |
|
$has_ref = 1; |
3264 |
|
# |
3265 |
|
} elsif ($self->{entity__match} < 0) { |
3266 |
|
!!!parse-error (type => 'no refc'); |
3267 |
|
if ($self->{entity_in_attr} and $self->{entity__match} < -1) { |
3268 |
|
!!!cp (1024); |
3269 |
|
$data = '&' . $self->{state_keyword}; |
3270 |
|
# |
3271 |
|
} else { |
3272 |
|
!!!cp (1025); |
3273 |
|
$data = $self->{entity__value}; |
3274 |
|
$has_ref = 1; |
3275 |
|
# |
3276 |
|
} |
3277 |
|
} else { |
3278 |
|
!!!cp (1026); |
3279 |
|
!!!parse-error (type => 'bare ero', |
3280 |
|
line => $self->{line_prev}, |
3281 |
|
column => $self->{column_prev}); |
3282 |
|
$data = '&' . $self->{state_keyword}; |
3283 |
|
# |
3284 |
|
} |
3285 |
|
|
3286 |
|
## NOTE: In these cases, when a character reference is found, |
3287 |
|
## it is consumed and a character token is returned, or, otherwise, |
3288 |
|
## nothing is consumed and returned, according to the spec algorithm. |
3289 |
|
## In this implementation, anything that has been examined by the |
3290 |
|
## tokenizer is appended to the parent element or the attribute value |
3291 |
|
## as string, either literal string when no character reference or |
3292 |
|
## entity-replaced string otherwise, in this stage, since any characters |
3293 |
|
## that would not be consumed are appended in the data state or in an |
3294 |
|
## appropriate attribute value state anyway. |
3295 |
|
|
3296 |
|
if ($self->{entity_in_attr}) { |
3297 |
|
$self->{current_attribute}->{value} .= $data; |
3298 |
|
$self->{current_attribute}->{has_reference} = 1 if $has_ref; |
3299 |
|
$self->{state} = $self->{last_attribute_value_state}; |
3300 |
|
## Reconsume. |
3301 |
|
redo A; |
3302 |
|
} else { |
3303 |
|
$self->{state} = DATA_STATE; |
3304 |
|
## Reconsume. |
3305 |
|
!!!emit ({type => CHARACTER_TOKEN, |
3306 |
|
data => $data, has_reference => $has_ref, |
3307 |
|
line => $self->{line_prev}, |
3308 |
|
column => $self->{column_prev} + 1 - length $self->{state_keyword}, |
3309 |
|
}); |
3310 |
|
redo A; |
3311 |
|
} |
3312 |
} else { |
} else { |
3313 |
die "$0: $self->{state}: Unknown state"; |
die "$0: $self->{state}: Unknown state"; |
3314 |
} |
} |