| 15 |
CHARACTER_TOKEN |
CHARACTER_TOKEN |
| 16 |
PI_TOKEN |
PI_TOKEN |
| 17 |
ABORT_TOKEN |
ABORT_TOKEN |
| 18 |
|
END_OF_DOCTYPE_TOKEN |
| 19 |
); |
); |
| 20 |
|
|
| 21 |
our %EXPORT_TAGS = ( |
our %EXPORT_TAGS = ( |
| 28 |
CHARACTER_TOKEN |
CHARACTER_TOKEN |
| 29 |
PI_TOKEN |
PI_TOKEN |
| 30 |
ABORT_TOKEN |
ABORT_TOKEN |
| 31 |
|
END_OF_DOCTYPE_TOKEN |
| 32 |
)], |
)], |
| 33 |
); |
); |
| 34 |
} |
} |
| 35 |
|
|
| 36 |
|
## NOTE: Differences from the XML5 draft are marked as "XML5:". |
| 37 |
|
|
| 38 |
## Token types |
## Token types |
| 39 |
|
|
| 40 |
sub DOCTYPE_TOKEN () { 1 } |
sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token. |
| 41 |
sub COMMENT_TOKEN () { 2 } |
sub COMMENT_TOKEN () { 2 } |
| 42 |
sub START_TAG_TOKEN () { 3 } |
sub START_TAG_TOKEN () { 3 } |
| 43 |
sub END_TAG_TOKEN () { 4 } |
sub END_TAG_TOKEN () { 4 } |
| 44 |
sub END_OF_FILE_TOKEN () { 5 } |
sub END_OF_FILE_TOKEN () { 5 } |
| 45 |
sub CHARACTER_TOKEN () { 6 } |
sub CHARACTER_TOKEN () { 6 } |
| 46 |
sub PI_TOKEN () { 7 } # XML5 |
sub PI_TOKEN () { 7 } ## NOTE: XML only. |
| 47 |
sub ABORT_TOKEN () { 8 } # Not a token actually |
sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing. |
| 48 |
|
sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only |
| 49 |
|
|
| 50 |
|
## XML5: XML5 has "empty tag token". In this implementation, it is |
| 51 |
|
## represented as a start tag token with $self->{self_closing} flag |
| 52 |
|
## set to true. |
| 53 |
|
|
| 54 |
|
## XML5: XML5 has "short end tag token". In this implementation, it |
| 55 |
|
## is represented as an end tag token with $token->{tag_name} flag set |
| 56 |
|
## to an empty string. |
| 57 |
|
|
| 58 |
package Whatpm::HTML; |
package Whatpm::HTML; |
| 59 |
|
|
| 127 |
sub ENTITY_NAME_STATE () { 49 } |
sub ENTITY_NAME_STATE () { 49 } |
| 128 |
sub PCDATA_STATE () { 50 } # "data state" in the spec |
sub PCDATA_STATE () { 50 } # "data state" in the spec |
| 129 |
|
|
| 130 |
## XML states |
## XML-only states |
| 131 |
sub PI_STATE () { 51 } |
sub PI_STATE () { 51 } |
| 132 |
sub PI_TARGET_STATE () { 52 } |
sub PI_TARGET_STATE () { 52 } |
| 133 |
sub PI_TARGET_AFTER_STATE () { 53 } |
sub PI_TARGET_AFTER_STATE () { 53 } |
| 134 |
sub PI_DATA_STATE () { 54 } |
sub PI_DATA_STATE () { 54 } |
| 135 |
sub PI_AFTER_STATE () { 55 } |
sub PI_AFTER_STATE () { 55 } |
| 136 |
sub PI_DATA_AFTER_STATE () { 56 } |
sub PI_DATA_AFTER_STATE () { 56 } |
| 137 |
|
sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 } |
| 138 |
|
sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 } |
| 139 |
|
sub DOCTYPE_TAG_STATE () { 59 } |
| 140 |
|
sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 60 } |
| 141 |
|
|
| 142 |
## Tree constructor state constants (see Whatpm::HTML for the full |
## Tree constructor state constants (see Whatpm::HTML for the full |
| 143 |
## list and descriptions) |
## list and descriptions) |
| 203 |
#$self->{is_xml} (if XML) |
#$self->{is_xml} (if XML) |
| 204 |
|
|
| 205 |
$self->{state} = DATA_STATE; # MUST |
$self->{state} = DATA_STATE; # MUST |
| 206 |
$self->{s_kwd} = ''; # state keyword |
$self->{s_kwd} = ''; # Data state keyword |
| 207 |
|
#$self->{kwd} = ''; # State-dependent keyword; initialized when used |
| 208 |
#$self->{entity__value}; # initialized when used |
#$self->{entity__value}; # initialized when used |
| 209 |
#$self->{entity__match}; # initialized when used |
#$self->{entity__match}; # initialized when used |
| 210 |
$self->{content_model} = PCDATA_CONTENT_MODEL; # be |
$self->{content_model} = PCDATA_CONTENT_MODEL; # be |
| 249 |
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN) |
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN) |
| 250 |
## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN) |
## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN) |
| 251 |
## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1. |
## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1. |
| 252 |
|
## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN) |
| 253 |
|
|
| 254 |
## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|. |
## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|. |
| 255 |
## |->{self_closing}| is used to save the value of |$self->{self_closing}| |
## |->{self_closing}| is used to save the value of |$self->{self_closing}| |
| 256 |
## while the token is pushed back to the stack. |
## while the token is pushed back to the stack. |
| 270 |
0x0009 => 1, # CHARACTER TABULATION (HT) |
0x0009 => 1, # CHARACTER TABULATION (HT) |
| 271 |
0x000A => 1, # LINE FEED (LF) |
0x000A => 1, # LINE FEED (LF) |
| 272 |
#0x000B => 0, # LINE TABULATION (VT) |
#0x000B => 0, # LINE TABULATION (VT) |
| 273 |
0x000C => 1, # FORM FEED (FF) |
0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character. |
| 274 |
#0x000D => 1, # CARRIAGE RETURN (CR) |
#0x000D => 1, # CARRIAGE RETURN (CR) |
| 275 |
0x0020 => 1, # SPACE (SP) |
0x0020 => 1, # SPACE (SP) |
| 276 |
}; |
}; |
| 550 |
redo A; |
redo A; |
| 551 |
} elsif ($self->{nc} == 0x0021) { # ! |
} elsif ($self->{nc} == 0x0021) { # ! |
| 552 |
|
|
| 553 |
$self->{s_kwd} = '<' unless $self->{escape}; |
$self->{s_kwd} = $self->{escaped} ? '' : '<'; |
| 554 |
# |
# |
| 555 |
} else { |
} else { |
| 556 |
|
|
| 557 |
|
$self->{s_kwd} = ''; |
| 558 |
# |
# |
| 559 |
} |
} |
| 560 |
|
|
| 561 |
## reconsume |
## reconsume |
| 562 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
|
$self->{s_kwd} = ''; |
|
| 563 |
return ({type => CHARACTER_TOKEN, data => '<', |
return ({type => CHARACTER_TOKEN, data => '<', |
| 564 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 565 |
column => $self->{column_prev}, |
column => $self->{column_prev}, |
| 740 |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
| 741 |
if (defined $self->{last_stag_name}) { |
if (defined $self->{last_stag_name}) { |
| 742 |
$self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE; |
$self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE; |
| 743 |
$self->{s_kwd} = ''; |
$self->{kwd} = ''; |
| 744 |
## Reconsume. |
## Reconsume. |
| 745 |
redo A; |
redo A; |
| 746 |
} else { |
} else { |
| 893 |
redo A; |
redo A; |
| 894 |
} |
} |
| 895 |
} elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) { |
} elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) { |
| 896 |
my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1; |
my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1; |
| 897 |
if (length $ch) { |
if (length $ch) { |
| 898 |
my $CH = $ch; |
my $CH = $ch; |
| 899 |
$ch =~ tr/a-z/A-Z/; |
$ch =~ tr/a-z/A-Z/; |
| 901 |
if ($nch eq $ch or $nch eq $CH) { |
if ($nch eq $ch or $nch eq $CH) { |
| 902 |
|
|
| 903 |
## Stay in the state. |
## Stay in the state. |
| 904 |
$self->{s_kwd} .= $nch; |
$self->{kwd} .= $nch; |
| 905 |
|
|
| 906 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 907 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 920 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 921 |
## Reconsume. |
## Reconsume. |
| 922 |
return ({type => CHARACTER_TOKEN, |
return ({type => CHARACTER_TOKEN, |
| 923 |
data => '</' . $self->{s_kwd}, |
data => '</' . $self->{kwd}, |
| 924 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 925 |
column => $self->{column_prev} - 1 - length $self->{s_kwd}, |
column => $self->{column_prev} - 1 - length $self->{kwd}, |
| 926 |
}); |
}); |
| 927 |
redo A; |
redo A; |
| 928 |
} |
} |
| 938 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 939 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 940 |
return ({type => CHARACTER_TOKEN, |
return ({type => CHARACTER_TOKEN, |
| 941 |
data => '</' . $self->{s_kwd}, |
data => '</' . $self->{kwd}, |
| 942 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 943 |
column => $self->{column_prev} - 1 - length $self->{s_kwd}, |
column => $self->{column_prev} - 1 - length $self->{kwd}, |
| 944 |
}); |
}); |
| 945 |
redo A; |
redo A; |
| 946 |
} else { |
} else { |
| 949 |
= {type => END_TAG_TOKEN, |
= {type => END_TAG_TOKEN, |
| 950 |
tag_name => $self->{last_stag_name}, |
tag_name => $self->{last_stag_name}, |
| 951 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 952 |
column => $self->{column_prev} - 1 - length $self->{s_kwd}}; |
column => $self->{column_prev} - 1 - length $self->{kwd}}; |
| 953 |
$self->{state} = TAG_NAME_STATE; |
$self->{state} = TAG_NAME_STATE; |
| 954 |
## Reconsume. |
## Reconsume. |
| 955 |
redo A; |
redo A; |
| 2188 |
redo A; |
redo A; |
| 2189 |
} |
} |
| 2190 |
} elsif ($self->{state} == BOGUS_COMMENT_STATE) { |
} elsif ($self->{state} == BOGUS_COMMENT_STATE) { |
|
## (only happen if PCDATA state) |
|
|
|
|
| 2191 |
## NOTE: Unlike spec's "bogus comment state", this implementation |
## NOTE: Unlike spec's "bogus comment state", this implementation |
| 2192 |
## consumes characters one-by-one basis. |
## consumes characters one-by-one basis. |
| 2193 |
|
|
| 2194 |
if ($self->{nc} == 0x003E) { # > |
if ($self->{nc} == 0x003E) { # > |
| 2195 |
|
if ($self->{in_subset}) { |
| 2196 |
$self->{state} = DATA_STATE; |
|
| 2197 |
$self->{s_kwd} = ''; |
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2198 |
|
} else { |
| 2199 |
|
|
| 2200 |
|
$self->{state} = DATA_STATE; |
| 2201 |
|
$self->{s_kwd} = ''; |
| 2202 |
|
} |
| 2203 |
|
|
| 2204 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2205 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2215 |
return ($self->{ct}); # comment |
return ($self->{ct}); # comment |
| 2216 |
redo A; |
redo A; |
| 2217 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
| 2218 |
|
if ($self->{in_subset}) { |
| 2219 |
$self->{state} = DATA_STATE; |
|
| 2220 |
$self->{s_kwd} = ''; |
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2221 |
|
} else { |
| 2222 |
|
|
| 2223 |
|
$self->{state} = DATA_STATE; |
| 2224 |
|
$self->{s_kwd} = ''; |
| 2225 |
|
} |
| 2226 |
## reconsume |
## reconsume |
| 2227 |
|
|
| 2228 |
return ($self->{ct}); # comment |
return ($self->{ct}); # comment |
| 2249 |
redo A; |
redo A; |
| 2250 |
} |
} |
| 2251 |
} elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) { |
} elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) { |
| 2252 |
## (only happen if PCDATA state) |
## XML5: "Markup declaration state" and "DOCTYPE markup |
| 2253 |
|
## declaration state". |
| 2254 |
|
|
| 2255 |
if ($self->{nc} == 0x002D) { # - |
if ($self->{nc} == 0x002D) { # - |
| 2256 |
|
|
| 2272 |
## ASCII case-insensitive. |
## ASCII case-insensitive. |
| 2273 |
|
|
| 2274 |
$self->{state} = MD_DOCTYPE_STATE; |
$self->{state} = MD_DOCTYPE_STATE; |
| 2275 |
$self->{s_kwd} = chr $self->{nc}; |
$self->{kwd} = chr $self->{nc}; |
| 2276 |
|
|
| 2277 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2278 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2291 |
$self->{nc} == 0x005B) { # [ |
$self->{nc} == 0x005B) { # [ |
| 2292 |
|
|
| 2293 |
$self->{state} = MD_CDATA_STATE; |
$self->{state} = MD_CDATA_STATE; |
| 2294 |
$self->{s_kwd} = '['; |
$self->{kwd} = '['; |
| 2295 |
|
|
| 2296 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2297 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2361 |
0x0054, # T |
0x0054, # T |
| 2362 |
0x0059, # Y |
0x0059, # Y |
| 2363 |
0x0050, # P |
0x0050, # P |
| 2364 |
]->[length $self->{s_kwd}] or |
]->[length $self->{kwd}] or |
| 2365 |
$self->{nc} == [ |
$self->{nc} == [ |
| 2366 |
undef, |
undef, |
| 2367 |
0x006F, # o |
0x006F, # o |
| 2369 |
0x0074, # t |
0x0074, # t |
| 2370 |
0x0079, # y |
0x0079, # y |
| 2371 |
0x0070, # p |
0x0070, # p |
| 2372 |
]->[length $self->{s_kwd}]) { |
]->[length $self->{kwd}]) { |
| 2373 |
|
|
| 2374 |
## Stay in the state. |
## Stay in the state. |
| 2375 |
$self->{s_kwd} .= chr $self->{nc}; |
$self->{kwd} .= chr $self->{nc}; |
| 2376 |
|
|
| 2377 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2378 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2385 |
} |
} |
| 2386 |
|
|
| 2387 |
redo A; |
redo A; |
| 2388 |
} elsif ((length $self->{s_kwd}) == 6 and |
} elsif ((length $self->{kwd}) == 6 and |
| 2389 |
($self->{nc} == 0x0045 or # E |
($self->{nc} == 0x0045 or # E |
| 2390 |
$self->{nc} == 0x0065)) { # e |
$self->{nc} == 0x0065)) { # e |
| 2391 |
if ($self->{s_kwd} ne 'DOCTYP') { |
if ($self->{is_xml} and |
| 2392 |
|
($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) { |
| 2393 |
|
|
| 2394 |
## XML5: case-sensitive. |
## XML5: case-sensitive. |
| 2395 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO |
| 2421 |
|
|
| 2422 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', |
| 2423 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2424 |
column => $self->{column_prev} - 1 - length $self->{s_kwd}); |
column => $self->{column_prev} - 1 - length $self->{kwd}); |
| 2425 |
$self->{state} = BOGUS_COMMENT_STATE; |
$self->{state} = BOGUS_COMMENT_STATE; |
| 2426 |
## Reconsume. |
## Reconsume. |
| 2427 |
$self->{ct} = {type => COMMENT_TOKEN, |
$self->{ct} = {type => COMMENT_TOKEN, |
| 2428 |
data => $self->{s_kwd}, |
data => $self->{kwd}, |
| 2429 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2430 |
column => $self->{column_prev} - 1 - length $self->{s_kwd}, |
column => $self->{column_prev} - 1 - length $self->{kwd}, |
| 2431 |
}; |
}; |
| 2432 |
redo A; |
redo A; |
| 2433 |
} |
} |
| 2438 |
'[CD' => 0x0041, # A |
'[CD' => 0x0041, # A |
| 2439 |
'[CDA' => 0x0054, # T |
'[CDA' => 0x0054, # T |
| 2440 |
'[CDAT' => 0x0041, # A |
'[CDAT' => 0x0041, # A |
| 2441 |
}->{$self->{s_kwd}}) { |
}->{$self->{kwd}}) { |
| 2442 |
|
|
| 2443 |
## Stay in the state. |
## Stay in the state. |
| 2444 |
$self->{s_kwd} .= chr $self->{nc}; |
$self->{kwd} .= chr $self->{nc}; |
| 2445 |
|
|
| 2446 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2447 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2454 |
} |
} |
| 2455 |
|
|
| 2456 |
redo A; |
redo A; |
| 2457 |
} elsif ($self->{s_kwd} eq '[CDATA' and |
} elsif ($self->{kwd} eq '[CDATA' and |
| 2458 |
$self->{nc} == 0x005B) { # [ |
$self->{nc} == 0x005B) { # [ |
| 2459 |
if ($self->{is_xml} and |
if ($self->{is_xml} and |
| 2460 |
not $self->{tainted} and |
not $self->{tainted} and |
| 2489 |
|
|
| 2490 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', |
| 2491 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2492 |
column => $self->{column_prev} - 1 - length $self->{s_kwd}); |
column => $self->{column_prev} - 1 - length $self->{kwd}); |
| 2493 |
$self->{state} = BOGUS_COMMENT_STATE; |
$self->{state} = BOGUS_COMMENT_STATE; |
| 2494 |
## Reconsume. |
## Reconsume. |
| 2495 |
$self->{ct} = {type => COMMENT_TOKEN, |
$self->{ct} = {type => COMMENT_TOKEN, |
| 2496 |
data => $self->{s_kwd}, |
data => $self->{kwd}, |
| 2497 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2498 |
column => $self->{column_prev} - 1 - length $self->{s_kwd}, |
column => $self->{column_prev} - 1 - length $self->{kwd}, |
| 2499 |
}; |
}; |
| 2500 |
redo A; |
redo A; |
| 2501 |
} |
} |
| 2516 |
|
|
| 2517 |
redo A; |
redo A; |
| 2518 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
|
|
|
| 2519 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment'); |
| 2520 |
$self->{state} = DATA_STATE; |
if ($self->{in_subset}) { |
| 2521 |
$self->{s_kwd} = ''; |
|
| 2522 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2523 |
|
} else { |
| 2524 |
|
|
| 2525 |
|
$self->{state} = DATA_STATE; |
| 2526 |
|
$self->{s_kwd} = ''; |
| 2527 |
|
} |
| 2528 |
|
|
| 2529 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2530 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2541 |
|
|
| 2542 |
redo A; |
redo A; |
| 2543 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
|
|
|
| 2544 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
| 2545 |
$self->{state} = DATA_STATE; |
if ($self->{in_subset}) { |
| 2546 |
$self->{s_kwd} = ''; |
|
| 2547 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2548 |
|
} else { |
| 2549 |
|
|
| 2550 |
|
$self->{state} = DATA_STATE; |
| 2551 |
|
$self->{s_kwd} = ''; |
| 2552 |
|
} |
| 2553 |
## reconsume |
## reconsume |
| 2554 |
|
|
| 2555 |
return ($self->{ct}); # comment |
return ($self->{ct}); # comment |
| 2590 |
|
|
| 2591 |
redo A; |
redo A; |
| 2592 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
|
|
|
| 2593 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment'); |
| 2594 |
$self->{state} = DATA_STATE; |
if ($self->{in_subset}) { |
| 2595 |
$self->{s_kwd} = ''; |
|
| 2596 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2597 |
|
} else { |
| 2598 |
|
|
| 2599 |
|
$self->{state} = DATA_STATE; |
| 2600 |
|
$self->{s_kwd} = ''; |
| 2601 |
|
} |
| 2602 |
|
|
| 2603 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2604 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2615 |
|
|
| 2616 |
redo A; |
redo A; |
| 2617 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
|
|
|
| 2618 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
| 2619 |
$self->{state} = DATA_STATE; |
if ($self->{in_subset}) { |
| 2620 |
$self->{s_kwd} = ''; |
|
| 2621 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2622 |
|
} else { |
| 2623 |
|
|
| 2624 |
|
$self->{state} = DATA_STATE; |
| 2625 |
|
$self->{s_kwd} = ''; |
| 2626 |
|
} |
| 2627 |
## reconsume |
## reconsume |
| 2628 |
|
|
| 2629 |
return ($self->{ct}); # comment |
return ($self->{ct}); # comment |
| 2664 |
|
|
| 2665 |
redo A; |
redo A; |
| 2666 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
|
|
|
| 2667 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
| 2668 |
$self->{state} = DATA_STATE; |
if ($self->{in_subset}) { |
| 2669 |
$self->{s_kwd} = ''; |
|
| 2670 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2671 |
|
} else { |
| 2672 |
|
|
| 2673 |
|
$self->{state} = DATA_STATE; |
| 2674 |
|
$self->{s_kwd} = ''; |
| 2675 |
|
} |
| 2676 |
## reconsume |
## reconsume |
| 2677 |
|
|
| 2678 |
return ($self->{ct}); # comment |
return ($self->{ct}); # comment |
| 2718 |
|
|
| 2719 |
redo A; |
redo A; |
| 2720 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
|
|
|
| 2721 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
| 2722 |
$self->{s_kwd} = ''; |
if ($self->{in_subset}) { |
| 2723 |
$self->{state} = DATA_STATE; |
|
| 2724 |
$self->{s_kwd} = ''; |
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2725 |
|
} else { |
| 2726 |
|
|
| 2727 |
|
$self->{state} = DATA_STATE; |
| 2728 |
|
$self->{s_kwd} = ''; |
| 2729 |
|
} |
| 2730 |
## reconsume |
## reconsume |
| 2731 |
|
|
| 2732 |
return ($self->{ct}); # comment |
return ($self->{ct}); # comment |
| 2751 |
} |
} |
| 2752 |
} elsif ($self->{state} == COMMENT_END_STATE) { |
} elsif ($self->{state} == COMMENT_END_STATE) { |
| 2753 |
if ($self->{nc} == 0x003E) { # > |
if ($self->{nc} == 0x003E) { # > |
| 2754 |
|
if ($self->{in_subset}) { |
| 2755 |
$self->{state} = DATA_STATE; |
|
| 2756 |
$self->{s_kwd} = ''; |
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2757 |
|
} else { |
| 2758 |
|
|
| 2759 |
|
$self->{state} = DATA_STATE; |
| 2760 |
|
$self->{s_kwd} = ''; |
| 2761 |
|
} |
| 2762 |
|
|
| 2763 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2764 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2795 |
|
|
| 2796 |
redo A; |
redo A; |
| 2797 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
|
|
|
| 2798 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
| 2799 |
$self->{state} = DATA_STATE; |
if ($self->{in_subset}) { |
| 2800 |
$self->{s_kwd} = ''; |
|
| 2801 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2802 |
|
} else { |
| 2803 |
|
|
| 2804 |
|
$self->{state} = DATA_STATE; |
| 2805 |
|
$self->{s_kwd} = ''; |
| 2806 |
|
} |
| 2807 |
## reconsume |
## reconsume |
| 2808 |
|
|
| 2809 |
return ($self->{ct}); # comment |
return ($self->{ct}); # comment |
| 2848 |
redo A; |
redo A; |
| 2849 |
} else { |
} else { |
| 2850 |
|
|
| 2851 |
|
## XML5: Unless EOF, swith to the bogus comment state. |
| 2852 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name'); |
| 2853 |
$self->{state} = BEFORE_DOCTYPE_NAME_STATE; |
$self->{state} = BEFORE_DOCTYPE_NAME_STATE; |
| 2854 |
## reconsume |
## reconsume |
| 2855 |
redo A; |
redo A; |
| 2856 |
} |
} |
| 2857 |
} elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) { |
} elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) { |
| 2858 |
|
## XML5: "DOCTYPE root name before state". |
| 2859 |
|
|
| 2860 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 2861 |
|
|
| 2862 |
## Stay in the state |
## Stay in the state |
| 2874 |
redo A; |
redo A; |
| 2875 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
| 2876 |
|
|
| 2877 |
|
## XML5: No parse error. |
| 2878 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name'); |
| 2879 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2880 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 2903 |
return ($self->{ct}); # DOCTYPE (quirks) |
return ($self->{ct}); # DOCTYPE (quirks) |
| 2904 |
|
|
| 2905 |
redo A; |
redo A; |
| 2906 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 2907 |
|
|
| 2908 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name'); |
| 2909 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2910 |
|
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE |
| 2911 |
|
$self->{in_subset} = 1; |
| 2912 |
|
|
| 2913 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2914 |
|
$self->{line_prev} = $self->{line}; |
| 2915 |
|
$self->{column_prev} = $self->{column}; |
| 2916 |
|
$self->{column}++; |
| 2917 |
|
$self->{nc} |
| 2918 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 2919 |
|
} else { |
| 2920 |
|
$self->{set_nc}->($self); |
| 2921 |
|
} |
| 2922 |
|
|
| 2923 |
|
return ($self->{ct}); # DOCTYPE |
| 2924 |
|
redo A; |
| 2925 |
} else { |
} else { |
| 2926 |
|
|
| 2927 |
$self->{ct}->{name} = chr $self->{nc}; |
$self->{ct}->{name} = chr $self->{nc}; |
| 2941 |
redo A; |
redo A; |
| 2942 |
} |
} |
| 2943 |
} elsif ($self->{state} == DOCTYPE_NAME_STATE) { |
} elsif ($self->{state} == DOCTYPE_NAME_STATE) { |
| 2944 |
## ISSUE: Redundant "First," in the spec. |
## XML5: "DOCTYPE root name state". |
| 2945 |
|
|
| 2946 |
|
## ISSUE: Redundant "First," in the spec. |
| 2947 |
|
|
| 2948 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 2949 |
|
|
| 2950 |
$self->{state} = AFTER_DOCTYPE_NAME_STATE; |
$self->{state} = AFTER_DOCTYPE_NAME_STATE; |
| 2990 |
return ($self->{ct}); # DOCTYPE |
return ($self->{ct}); # DOCTYPE |
| 2991 |
|
|
| 2992 |
redo A; |
redo A; |
| 2993 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 2994 |
|
|
| 2995 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2996 |
|
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE |
| 2997 |
|
$self->{in_subset} = 1; |
| 2998 |
|
|
| 2999 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3000 |
|
$self->{line_prev} = $self->{line}; |
| 3001 |
|
$self->{column_prev} = $self->{column}; |
| 3002 |
|
$self->{column}++; |
| 3003 |
|
$self->{nc} |
| 3004 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 3005 |
|
} else { |
| 3006 |
|
$self->{set_nc}->($self); |
| 3007 |
|
} |
| 3008 |
|
|
| 3009 |
|
return ($self->{ct}); # DOCTYPE |
| 3010 |
|
redo A; |
| 3011 |
} else { |
} else { |
| 3012 |
|
|
| 3013 |
$self->{ct}->{name} |
$self->{ct}->{name} |
| 3027 |
redo A; |
redo A; |
| 3028 |
} |
} |
| 3029 |
} elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) { |
} elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) { |
| 3030 |
|
## XML5: Corresponding to XML5's "DOCTYPE root name after |
| 3031 |
|
## state", but implemented differently. |
| 3032 |
|
|
| 3033 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 3034 |
|
|
| 3035 |
## Stay in the state |
## Stay in the state |
| 3077 |
redo A; |
redo A; |
| 3078 |
} elsif ($self->{nc} == 0x0050 or # P |
} elsif ($self->{nc} == 0x0050 or # P |
| 3079 |
$self->{nc} == 0x0070) { # p |
$self->{nc} == 0x0070) { # p |
| 3080 |
|
|
| 3081 |
$self->{state} = PUBLIC_STATE; |
$self->{state} = PUBLIC_STATE; |
| 3082 |
$self->{s_kwd} = chr $self->{nc}; |
$self->{kwd} = chr $self->{nc}; |
| 3083 |
|
|
| 3084 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3085 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 3094 |
redo A; |
redo A; |
| 3095 |
} elsif ($self->{nc} == 0x0053 or # S |
} elsif ($self->{nc} == 0x0053 or # S |
| 3096 |
$self->{nc} == 0x0073) { # s |
$self->{nc} == 0x0073) { # s |
| 3097 |
|
|
| 3098 |
$self->{state} = SYSTEM_STATE; |
$self->{state} = SYSTEM_STATE; |
| 3099 |
$self->{s_kwd} = chr $self->{nc}; |
$self->{kwd} = chr $self->{nc}; |
| 3100 |
|
|
| 3101 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3102 |
|
$self->{line_prev} = $self->{line}; |
| 3103 |
|
$self->{column_prev} = $self->{column}; |
| 3104 |
|
$self->{column}++; |
| 3105 |
|
$self->{nc} |
| 3106 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 3107 |
|
} else { |
| 3108 |
|
$self->{set_nc}->($self); |
| 3109 |
|
} |
| 3110 |
|
|
| 3111 |
|
redo A; |
| 3112 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 3113 |
|
|
| 3114 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 3115 |
|
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE |
| 3116 |
|
$self->{in_subset} = 1; |
| 3117 |
|
|
| 3118 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3119 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 3125 |
$self->{set_nc}->($self); |
$self->{set_nc}->($self); |
| 3126 |
} |
} |
| 3127 |
|
|
| 3128 |
|
return ($self->{ct}); # DOCTYPE |
| 3129 |
redo A; |
redo A; |
| 3130 |
} else { |
} else { |
| 3131 |
|
|
| 3154 |
0x0042, # B |
0x0042, # B |
| 3155 |
0x004C, # L |
0x004C, # L |
| 3156 |
0x0049, # I |
0x0049, # I |
| 3157 |
]->[length $self->{s_kwd}] or |
]->[length $self->{kwd}] or |
| 3158 |
$self->{nc} == [ |
$self->{nc} == [ |
| 3159 |
undef, |
undef, |
| 3160 |
0x0075, # u |
0x0075, # u |
| 3161 |
0x0062, # b |
0x0062, # b |
| 3162 |
0x006C, # l |
0x006C, # l |
| 3163 |
0x0069, # i |
0x0069, # i |
| 3164 |
]->[length $self->{s_kwd}]) { |
]->[length $self->{kwd}]) { |
| 3165 |
|
|
| 3166 |
## Stay in the state. |
## Stay in the state. |
| 3167 |
$self->{s_kwd} .= chr $self->{nc}; |
$self->{kwd} .= chr $self->{nc}; |
| 3168 |
|
|
| 3169 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3170 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 3177 |
} |
} |
| 3178 |
|
|
| 3179 |
redo A; |
redo A; |
| 3180 |
} elsif ((length $self->{s_kwd}) == 5 and |
} elsif ((length $self->{kwd}) == 5 and |
| 3181 |
($self->{nc} == 0x0043 or # C |
($self->{nc} == 0x0043 or # C |
| 3182 |
$self->{nc} == 0x0063)) { # c |
$self->{nc} == 0x0063)) { # c |
| 3183 |
|
if ($self->{is_xml} and |
| 3184 |
|
($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c |
| 3185 |
|
|
| 3186 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type |
| 3187 |
|
text => 'PUBLIC', |
| 3188 |
|
line => $self->{line_prev}, |
| 3189 |
|
column => $self->{column_prev} - 4); |
| 3190 |
|
} else { |
| 3191 |
|
|
| 3192 |
|
} |
| 3193 |
$self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE; |
$self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE; |
| 3194 |
|
|
| 3195 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3207 |
|
|
| 3208 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', |
| 3209 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 3210 |
column => $self->{column_prev} + 1 - length $self->{s_kwd}); |
column => $self->{column_prev} + 1 - length $self->{kwd}); |
| 3211 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
| 3212 |
|
|
| 3213 |
$self->{state} = BOGUS_DOCTYPE_STATE; |
$self->{state} = BOGUS_DOCTYPE_STATE; |
| 3222 |
0x0053, # S |
0x0053, # S |
| 3223 |
0x0054, # T |
0x0054, # T |
| 3224 |
0x0045, # E |
0x0045, # E |
| 3225 |
]->[length $self->{s_kwd}] or |
]->[length $self->{kwd}] or |
| 3226 |
$self->{nc} == [ |
$self->{nc} == [ |
| 3227 |
undef, |
undef, |
| 3228 |
0x0079, # y |
0x0079, # y |
| 3229 |
0x0073, # s |
0x0073, # s |
| 3230 |
0x0074, # t |
0x0074, # t |
| 3231 |
0x0065, # e |
0x0065, # e |
| 3232 |
]->[length $self->{s_kwd}]) { |
]->[length $self->{kwd}]) { |
| 3233 |
|
|
| 3234 |
## Stay in the state. |
## Stay in the state. |
| 3235 |
$self->{s_kwd} .= chr $self->{nc}; |
$self->{kwd} .= chr $self->{nc}; |
| 3236 |
|
|
| 3237 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3238 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 3245 |
} |
} |
| 3246 |
|
|
| 3247 |
redo A; |
redo A; |
| 3248 |
} elsif ((length $self->{s_kwd}) == 5 and |
} elsif ((length $self->{kwd}) == 5 and |
| 3249 |
($self->{nc} == 0x004D or # M |
($self->{nc} == 0x004D or # M |
| 3250 |
$self->{nc} == 0x006D)) { # m |
$self->{nc} == 0x006D)) { # m |
| 3251 |
|
if ($self->{is_xml} and |
| 3252 |
|
($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m |
| 3253 |
|
|
| 3254 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type |
| 3255 |
|
text => 'SYSTEM', |
| 3256 |
|
line => $self->{line_prev}, |
| 3257 |
|
column => $self->{column_prev} - 4); |
| 3258 |
|
} else { |
| 3259 |
|
|
| 3260 |
|
} |
| 3261 |
$self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE; |
$self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE; |
| 3262 |
|
|
| 3263 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3275 |
|
|
| 3276 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', |
| 3277 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 3278 |
column => $self->{column_prev} + 1 - length $self->{s_kwd}); |
column => $self->{column_prev} + 1 - length $self->{kwd}); |
| 3279 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
| 3280 |
|
|
| 3281 |
$self->{state} = BOGUS_DOCTYPE_STATE; |
$self->{state} = BOGUS_DOCTYPE_STATE; |
| 3364 |
return ($self->{ct}); # DOCTYPE |
return ($self->{ct}); # DOCTYPE |
| 3365 |
|
|
| 3366 |
redo A; |
redo A; |
| 3367 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 3368 |
|
|
| 3369 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal'); |
| 3370 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 3371 |
|
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE |
| 3372 |
|
$self->{in_subset} = 1; |
| 3373 |
|
|
| 3374 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3375 |
|
$self->{line_prev} = $self->{line}; |
| 3376 |
|
$self->{column_prev} = $self->{column}; |
| 3377 |
|
$self->{column}++; |
| 3378 |
|
$self->{nc} |
| 3379 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 3380 |
|
} else { |
| 3381 |
|
$self->{set_nc}->($self); |
| 3382 |
|
} |
| 3383 |
|
|
| 3384 |
|
return ($self->{ct}); # DOCTYPE |
| 3385 |
|
redo A; |
| 3386 |
} else { |
} else { |
| 3387 |
|
|
| 3388 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC'); |
| 3593 |
|
|
| 3594 |
redo A; |
redo A; |
| 3595 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
| 3596 |
|
if ($self->{is_xml}) { |
| 3597 |
|
|
| 3598 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal'); |
| 3599 |
|
} else { |
| 3600 |
|
|
| 3601 |
|
} |
| 3602 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3603 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 3604 |
|
|
| 3628 |
return ($self->{ct}); # DOCTYPE |
return ($self->{ct}); # DOCTYPE |
| 3629 |
|
|
| 3630 |
redo A; |
redo A; |
| 3631 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 3632 |
|
|
| 3633 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal'); |
| 3634 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 3635 |
|
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE |
| 3636 |
|
$self->{in_subset} = 1; |
| 3637 |
|
|
| 3638 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3639 |
|
$self->{line_prev} = $self->{line}; |
| 3640 |
|
$self->{column_prev} = $self->{column}; |
| 3641 |
|
$self->{column}++; |
| 3642 |
|
$self->{nc} |
| 3643 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 3644 |
|
} else { |
| 3645 |
|
$self->{set_nc}->($self); |
| 3646 |
|
} |
| 3647 |
|
|
| 3648 |
|
return ($self->{ct}); # DOCTYPE |
| 3649 |
|
redo A; |
| 3650 |
} else { |
} else { |
| 3651 |
|
|
| 3652 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal'); |
| 3747 |
return ($self->{ct}); # DOCTYPE |
return ($self->{ct}); # DOCTYPE |
| 3748 |
|
|
| 3749 |
redo A; |
redo A; |
| 3750 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 3751 |
|
|
| 3752 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal'); |
| 3753 |
|
|
| 3754 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 3755 |
|
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE |
| 3756 |
|
$self->{in_subset} = 1; |
| 3757 |
|
|
| 3758 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3759 |
|
$self->{line_prev} = $self->{line}; |
| 3760 |
|
$self->{column_prev} = $self->{column}; |
| 3761 |
|
$self->{column}++; |
| 3762 |
|
$self->{nc} |
| 3763 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 3764 |
|
} else { |
| 3765 |
|
$self->{set_nc}->($self); |
| 3766 |
|
} |
| 3767 |
|
|
| 3768 |
|
return ($self->{ct}); # DOCTYPE |
| 3769 |
|
redo A; |
| 3770 |
} else { |
} else { |
| 3771 |
|
|
| 3772 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM'); |
| 3802 |
} |
} |
| 3803 |
|
|
| 3804 |
redo A; |
redo A; |
| 3805 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # > |
| 3806 |
|
|
| 3807 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); |
| 3808 |
|
|
| 3873 |
} |
} |
| 3874 |
|
|
| 3875 |
redo A; |
redo A; |
| 3876 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # > |
| 3877 |
|
|
| 3878 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); |
| 3879 |
|
|
| 3974 |
return ($self->{ct}); # DOCTYPE |
return ($self->{ct}); # DOCTYPE |
| 3975 |
|
|
| 3976 |
redo A; |
redo A; |
| 3977 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 3978 |
|
|
| 3979 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 3980 |
|
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE |
| 3981 |
|
$self->{in_subset} = 1; |
| 3982 |
|
|
| 3983 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3984 |
|
$self->{line_prev} = $self->{line}; |
| 3985 |
|
$self->{column_prev} = $self->{column}; |
| 3986 |
|
$self->{column}++; |
| 3987 |
|
$self->{nc} |
| 3988 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 3989 |
|
} else { |
| 3990 |
|
$self->{set_nc}->($self); |
| 3991 |
|
} |
| 3992 |
|
|
| 3993 |
|
return ($self->{ct}); # DOCTYPE |
| 3994 |
|
redo A; |
| 3995 |
} else { |
} else { |
| 3996 |
|
|
| 3997 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal'); |
| 4031 |
return ($self->{ct}); # DOCTYPE |
return ($self->{ct}); # DOCTYPE |
| 4032 |
|
|
| 4033 |
redo A; |
redo A; |
| 4034 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 4035 |
|
|
| 4036 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 4037 |
|
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE |
| 4038 |
|
$self->{in_subset} = 1; |
| 4039 |
|
|
| 4040 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4041 |
|
$self->{line_prev} = $self->{line}; |
| 4042 |
|
$self->{column_prev} = $self->{column}; |
| 4043 |
|
$self->{column}++; |
| 4044 |
|
$self->{nc} |
| 4045 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4046 |
|
} else { |
| 4047 |
|
$self->{set_nc}->($self); |
| 4048 |
|
} |
| 4049 |
|
|
| 4050 |
|
return ($self->{ct}); # DOCTYPE |
| 4051 |
|
redo A; |
| 4052 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
| 4053 |
|
|
| 4054 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 4061 |
} else { |
} else { |
| 4062 |
|
|
| 4063 |
my $s = ''; |
my $s = ''; |
| 4064 |
$self->{read_until}->($s, q[>], 0); |
$self->{read_until}->($s, q{>[}, 0); |
| 4065 |
|
|
| 4066 |
## Stay in the state |
## Stay in the state |
| 4067 |
|
|
| 4229 |
} elsif ($self->{nc} == 0x0023) { # # |
} elsif ($self->{nc} == 0x0023) { # # |
| 4230 |
|
|
| 4231 |
$self->{state} = ENTITY_HASH_STATE; |
$self->{state} = ENTITY_HASH_STATE; |
| 4232 |
$self->{s_kwd} = '#'; |
$self->{kwd} = '#'; |
| 4233 |
|
|
| 4234 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4235 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 4249 |
|
|
| 4250 |
require Whatpm::_NamedEntityList; |
require Whatpm::_NamedEntityList; |
| 4251 |
$self->{state} = ENTITY_NAME_STATE; |
$self->{state} = ENTITY_NAME_STATE; |
| 4252 |
$self->{s_kwd} = chr $self->{nc}; |
$self->{kwd} = chr $self->{nc}; |
| 4253 |
$self->{entity__value} = $self->{s_kwd}; |
$self->{entity__value} = $self->{kwd}; |
| 4254 |
$self->{entity__match} = 0; |
$self->{entity__match} = 0; |
| 4255 |
|
|
| 4256 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4300 |
$self->{nc} == 0x0058) { # X |
$self->{nc} == 0x0058) { # X |
| 4301 |
|
|
| 4302 |
$self->{state} = HEXREF_X_STATE; |
$self->{state} = HEXREF_X_STATE; |
| 4303 |
$self->{s_kwd} .= chr $self->{nc}; |
$self->{kwd} .= chr $self->{nc}; |
| 4304 |
|
|
| 4305 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4306 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 4317 |
$self->{nc} <= 0x0039) { # 0..9 |
$self->{nc} <= 0x0039) { # 0..9 |
| 4318 |
|
|
| 4319 |
$self->{state} = NCR_NUM_STATE; |
$self->{state} = NCR_NUM_STATE; |
| 4320 |
$self->{s_kwd} = $self->{nc} - 0x0030; |
$self->{kwd} = $self->{nc} - 0x0030; |
| 4321 |
|
|
| 4322 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4323 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 4363 |
if (0x0030 <= $self->{nc} and |
if (0x0030 <= $self->{nc} and |
| 4364 |
$self->{nc} <= 0x0039) { # 0..9 |
$self->{nc} <= 0x0039) { # 0..9 |
| 4365 |
|
|
| 4366 |
$self->{s_kwd} *= 10; |
$self->{kwd} *= 10; |
| 4367 |
$self->{s_kwd} += $self->{nc} - 0x0030; |
$self->{kwd} += $self->{nc} - 0x0030; |
| 4368 |
|
|
| 4369 |
## Stay in the state. |
## Stay in the state. |
| 4370 |
|
|
| 4400 |
# |
# |
| 4401 |
} |
} |
| 4402 |
|
|
| 4403 |
my $code = $self->{s_kwd}; |
my $code = $self->{kwd}; |
| 4404 |
my $l = $self->{line_prev}; |
my $l = $self->{line_prev}; |
| 4405 |
my $c = $self->{column_prev}; |
my $c = $self->{column_prev}; |
| 4406 |
if ($charref_map->{$code}) { |
if ($charref_map->{$code}) { |
| 4443 |
# 0..9, A..F, a..f |
# 0..9, A..F, a..f |
| 4444 |
|
|
| 4445 |
$self->{state} = HEXREF_HEX_STATE; |
$self->{state} = HEXREF_HEX_STATE; |
| 4446 |
$self->{s_kwd} = 0; |
$self->{kwd} = 0; |
| 4447 |
## Reconsume. |
## Reconsume. |
| 4448 |
redo A; |
redo A; |
| 4449 |
} else { |
} else { |
| 4461 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 4462 |
## Reconsume. |
## Reconsume. |
| 4463 |
return ({type => CHARACTER_TOKEN, |
return ({type => CHARACTER_TOKEN, |
| 4464 |
data => '&' . $self->{s_kwd}, |
data => '&' . $self->{kwd}, |
| 4465 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 4466 |
column => $self->{column_prev} - length $self->{s_kwd}, |
column => $self->{column_prev} - length $self->{kwd}, |
| 4467 |
}); |
}); |
| 4468 |
redo A; |
redo A; |
| 4469 |
} else { |
} else { |
| 4470 |
|
|
| 4471 |
$self->{ca}->{value} .= '&' . $self->{s_kwd}; |
$self->{ca}->{value} .= '&' . $self->{kwd}; |
| 4472 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
| 4473 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 4474 |
## Reconsume. |
## Reconsume. |
| 4479 |
if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) { |
if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) { |
| 4480 |
# 0..9 |
# 0..9 |
| 4481 |
|
|
| 4482 |
$self->{s_kwd} *= 0x10; |
$self->{kwd} *= 0x10; |
| 4483 |
$self->{s_kwd} += $self->{nc} - 0x0030; |
$self->{kwd} += $self->{nc} - 0x0030; |
| 4484 |
## Stay in the state. |
## Stay in the state. |
| 4485 |
|
|
| 4486 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4497 |
} elsif (0x0061 <= $self->{nc} and |
} elsif (0x0061 <= $self->{nc} and |
| 4498 |
$self->{nc} <= 0x0066) { # a..f |
$self->{nc} <= 0x0066) { # a..f |
| 4499 |
|
|
| 4500 |
$self->{s_kwd} *= 0x10; |
$self->{kwd} *= 0x10; |
| 4501 |
$self->{s_kwd} += $self->{nc} - 0x0060 + 9; |
$self->{kwd} += $self->{nc} - 0x0060 + 9; |
| 4502 |
## Stay in the state. |
## Stay in the state. |
| 4503 |
|
|
| 4504 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4515 |
} elsif (0x0041 <= $self->{nc} and |
} elsif (0x0041 <= $self->{nc} and |
| 4516 |
$self->{nc} <= 0x0046) { # A..F |
$self->{nc} <= 0x0046) { # A..F |
| 4517 |
|
|
| 4518 |
$self->{s_kwd} *= 0x10; |
$self->{kwd} *= 0x10; |
| 4519 |
$self->{s_kwd} += $self->{nc} - 0x0040 + 9; |
$self->{kwd} += $self->{nc} - 0x0040 + 9; |
| 4520 |
## Stay in the state. |
## Stay in the state. |
| 4521 |
|
|
| 4522 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4553 |
# |
# |
| 4554 |
} |
} |
| 4555 |
|
|
| 4556 |
my $code = $self->{s_kwd}; |
my $code = $self->{kwd}; |
| 4557 |
my $l = $self->{line_prev}; |
my $l = $self->{line_prev}; |
| 4558 |
my $c = $self->{column_prev}; |
my $c = $self->{column_prev}; |
| 4559 |
if ($charref_map->{$code}) { |
if ($charref_map->{$code}) { |
| 4590 |
redo A; |
redo A; |
| 4591 |
} |
} |
| 4592 |
} elsif ($self->{state} == ENTITY_NAME_STATE) { |
} elsif ($self->{state} == ENTITY_NAME_STATE) { |
| 4593 |
if (length $self->{s_kwd} < 30 and |
if (length $self->{kwd} < 30 and |
| 4594 |
## NOTE: Some number greater than the maximum length of entity name |
## NOTE: Some number greater than the maximum length of entity name |
| 4595 |
((0x0041 <= $self->{nc} and # a |
((0x0041 <= $self->{nc} and # a |
| 4596 |
$self->{nc} <= 0x005A) or # x |
$self->{nc} <= 0x005A) or # x |
| 4600 |
$self->{nc} <= 0x0039) or # 9 |
$self->{nc} <= 0x0039) or # 9 |
| 4601 |
$self->{nc} == 0x003B)) { # ; |
$self->{nc} == 0x003B)) { # ; |
| 4602 |
our $EntityChar; |
our $EntityChar; |
| 4603 |
$self->{s_kwd} .= chr $self->{nc}; |
$self->{kwd} .= chr $self->{nc}; |
| 4604 |
if (defined $EntityChar->{$self->{s_kwd}}) { |
if (defined $EntityChar->{$self->{kwd}}) { |
| 4605 |
if ($self->{nc} == 0x003B) { # ; |
if ($self->{nc} == 0x003B) { # ; |
| 4606 |
|
|
| 4607 |
$self->{entity__value} = $EntityChar->{$self->{s_kwd}}; |
$self->{entity__value} = $EntityChar->{$self->{kwd}}; |
| 4608 |
$self->{entity__match} = 1; |
$self->{entity__match} = 1; |
| 4609 |
|
|
| 4610 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4620 |
# |
# |
| 4621 |
} else { |
} else { |
| 4622 |
|
|
| 4623 |
$self->{entity__value} = $EntityChar->{$self->{s_kwd}}; |
$self->{entity__value} = $EntityChar->{$self->{kwd}}; |
| 4624 |
$self->{entity__match} = -1; |
$self->{entity__match} = -1; |
| 4625 |
## Stay in the state. |
## Stay in the state. |
| 4626 |
|
|
| 4668 |
if ($self->{prev_state} != DATA_STATE and # in attribute |
if ($self->{prev_state} != DATA_STATE and # in attribute |
| 4669 |
$self->{entity__match} < -1) { |
$self->{entity__match} < -1) { |
| 4670 |
|
|
| 4671 |
$data = '&' . $self->{s_kwd}; |
$data = '&' . $self->{kwd}; |
| 4672 |
# |
# |
| 4673 |
} else { |
} else { |
| 4674 |
|
|
| 4680 |
|
|
| 4681 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero', |
| 4682 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 4683 |
column => $self->{column_prev} - length $self->{s_kwd}); |
column => $self->{column_prev} - length $self->{kwd}); |
| 4684 |
$data = '&' . $self->{s_kwd}; |
$data = '&' . $self->{kwd}; |
| 4685 |
# |
# |
| 4686 |
} |
} |
| 4687 |
|
|
| 4704 |
data => $data, |
data => $data, |
| 4705 |
has_reference => $has_ref, |
has_reference => $has_ref, |
| 4706 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 4707 |
column => $self->{column_prev} + 1 - length $self->{s_kwd}, |
column => $self->{column_prev} + 1 - length $self->{kwd}, |
| 4708 |
}); |
}); |
| 4709 |
redo A; |
redo A; |
| 4710 |
} else { |
} else { |
| 4774 |
redo A; |
redo A; |
| 4775 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
| 4776 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type |
| 4777 |
$self->{state} = DATA_STATE; |
if ($self->{in_subset}) { |
| 4778 |
$self->{s_kwd} = ''; |
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 4779 |
|
} else { |
| 4780 |
|
$self->{state} = DATA_STATE; |
| 4781 |
|
$self->{s_kwd} = ''; |
| 4782 |
|
} |
| 4783 |
## Reconsume. |
## Reconsume. |
| 4784 |
return ($self->{ct}); # pi |
return ($self->{ct}); # pi |
| 4785 |
redo A; |
redo A; |
| 4850 |
redo A; |
redo A; |
| 4851 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
| 4852 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type |
| 4853 |
$self->{state} = DATA_STATE; |
if ($self->{in_subset}) { |
| 4854 |
$self->{s_kwd} = ''; |
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 4855 |
|
} else { |
| 4856 |
|
$self->{state} = DATA_STATE; |
| 4857 |
|
$self->{s_kwd} = ''; |
| 4858 |
|
} |
| 4859 |
## Reprocess. |
## Reprocess. |
| 4860 |
return ($self->{ct}); # pi |
return ($self->{ct}); # pi |
| 4861 |
redo A; |
redo A; |
| 4880 |
} |
} |
| 4881 |
} elsif ($self->{state} == PI_AFTER_STATE) { |
} elsif ($self->{state} == PI_AFTER_STATE) { |
| 4882 |
if ($self->{nc} == 0x003E) { # > |
if ($self->{nc} == 0x003E) { # > |
| 4883 |
$self->{state} = DATA_STATE; |
if ($self->{in_subset}) { |
| 4884 |
$self->{s_kwd} = ''; |
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 4885 |
|
} else { |
| 4886 |
|
$self->{state} = DATA_STATE; |
| 4887 |
|
$self->{s_kwd} = ''; |
| 4888 |
|
} |
| 4889 |
|
|
| 4890 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4891 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 4930 |
} elsif ($self->{state} == PI_DATA_AFTER_STATE) { |
} elsif ($self->{state} == PI_DATA_AFTER_STATE) { |
| 4931 |
## XML5: Same as "pi after state" in XML5 |
## XML5: Same as "pi after state" in XML5 |
| 4932 |
if ($self->{nc} == 0x003E) { # > |
if ($self->{nc} == 0x003E) { # > |
| 4933 |
$self->{state} = DATA_STATE; |
if ($self->{in_subset}) { |
| 4934 |
$self->{s_kwd} = ''; |
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 4935 |
|
} else { |
| 4936 |
|
$self->{state} = DATA_STATE; |
| 4937 |
|
$self->{s_kwd} = ''; |
| 4938 |
|
} |
| 4939 |
|
|
| 4940 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4941 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 4970 |
## Reprocess. |
## Reprocess. |
| 4971 |
redo A; |
redo A; |
| 4972 |
} |
} |
| 4973 |
|
|
| 4974 |
|
} elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) { |
| 4975 |
|
if ($self->{nc} == 0x003C) { # < |
| 4976 |
|
$self->{state} = DOCTYPE_TAG_STATE; |
| 4977 |
|
|
| 4978 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4979 |
|
$self->{line_prev} = $self->{line}; |
| 4980 |
|
$self->{column_prev} = $self->{column}; |
| 4981 |
|
$self->{column}++; |
| 4982 |
|
$self->{nc} |
| 4983 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4984 |
|
} else { |
| 4985 |
|
$self->{set_nc}->($self); |
| 4986 |
|
} |
| 4987 |
|
|
| 4988 |
|
redo A; |
| 4989 |
|
} elsif ($self->{nc} == 0x0025) { # % |
| 4990 |
|
## XML5: Not defined yet. |
| 4991 |
|
|
| 4992 |
|
## TODO: |
| 4993 |
|
|
| 4994 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4995 |
|
$self->{line_prev} = $self->{line}; |
| 4996 |
|
$self->{column_prev} = $self->{column}; |
| 4997 |
|
$self->{column}++; |
| 4998 |
|
$self->{nc} |
| 4999 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 5000 |
|
} else { |
| 5001 |
|
$self->{set_nc}->($self); |
| 5002 |
|
} |
| 5003 |
|
|
| 5004 |
|
redo A; |
| 5005 |
|
} elsif ($self->{nc} == 0x005D) { # ] |
| 5006 |
|
delete $self->{in_subset}; |
| 5007 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE; |
| 5008 |
|
|
| 5009 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 5010 |
|
$self->{line_prev} = $self->{line}; |
| 5011 |
|
$self->{column_prev} = $self->{column}; |
| 5012 |
|
$self->{column}++; |
| 5013 |
|
$self->{nc} |
| 5014 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 5015 |
|
} else { |
| 5016 |
|
$self->{set_nc}->($self); |
| 5017 |
|
} |
| 5018 |
|
|
| 5019 |
|
redo A; |
| 5020 |
|
} elsif ($is_space->{$self->{nc}}) { |
| 5021 |
|
## Stay in the state. |
| 5022 |
|
|
| 5023 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 5024 |
|
$self->{line_prev} = $self->{line}; |
| 5025 |
|
$self->{column_prev} = $self->{column}; |
| 5026 |
|
$self->{column}++; |
| 5027 |
|
$self->{nc} |
| 5028 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 5029 |
|
} else { |
| 5030 |
|
$self->{set_nc}->($self); |
| 5031 |
|
} |
| 5032 |
|
|
| 5033 |
|
redo A; |
| 5034 |
|
} elsif ($self->{nc} == -1) { |
| 5035 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type |
| 5036 |
|
delete $self->{in_subset}; |
| 5037 |
|
$self->{state} = DATA_STATE; |
| 5038 |
|
$self->{s_kwd} = ''; |
| 5039 |
|
## Reconsume. |
| 5040 |
|
return ({type => END_OF_DOCTYPE_TOKEN}); |
| 5041 |
|
redo A; |
| 5042 |
|
} else { |
| 5043 |
|
unless ($self->{internal_subset_tainted}) { |
| 5044 |
|
## XML5: No parse error. |
| 5045 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset'); |
| 5046 |
|
$self->{internal_subset_tainted} = 1; |
| 5047 |
|
} |
| 5048 |
|
## Stay in the state. |
| 5049 |
|
|
| 5050 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 5051 |
|
$self->{line_prev} = $self->{line}; |
| 5052 |
|
$self->{column_prev} = $self->{column}; |
| 5053 |
|
$self->{column}++; |
| 5054 |
|
$self->{nc} |
| 5055 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 5056 |
|
} else { |
| 5057 |
|
$self->{set_nc}->($self); |
| 5058 |
|
} |
| 5059 |
|
|
| 5060 |
|
redo A; |
| 5061 |
|
} |
| 5062 |
|
} elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) { |
| 5063 |
|
if ($self->{nc} == 0x003E) { # > |
| 5064 |
|
$self->{state} = DATA_STATE; |
| 5065 |
|
$self->{s_kwd} = ''; |
| 5066 |
|
|
| 5067 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 5068 |
|
$self->{line_prev} = $self->{line}; |
| 5069 |
|
$self->{column_prev} = $self->{column}; |
| 5070 |
|
$self->{column}++; |
| 5071 |
|
$self->{nc} |
| 5072 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 5073 |
|
} else { |
| 5074 |
|
$self->{set_nc}->($self); |
| 5075 |
|
} |
| 5076 |
|
|
| 5077 |
|
return ({type => END_OF_DOCTYPE_TOKEN}); |
| 5078 |
|
redo A; |
| 5079 |
|
} elsif ($self->{nc} == -1) { |
| 5080 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); |
| 5081 |
|
$self->{state} = DATA_STATE; |
| 5082 |
|
$self->{s_kwd} = ''; |
| 5083 |
|
## Reconsume. |
| 5084 |
|
return ({type => END_OF_DOCTYPE_TOKEN}); |
| 5085 |
|
redo A; |
| 5086 |
|
} else { |
| 5087 |
|
## XML5: No parse error and stay in the state. |
| 5088 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type |
| 5089 |
|
|
| 5090 |
|
$self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE; |
| 5091 |
|
|
| 5092 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 5093 |
|
$self->{line_prev} = $self->{line}; |
| 5094 |
|
$self->{column_prev} = $self->{column}; |
| 5095 |
|
$self->{column}++; |
| 5096 |
|
$self->{nc} |
| 5097 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 5098 |
|
} else { |
| 5099 |
|
$self->{set_nc}->($self); |
| 5100 |
|
} |
| 5101 |
|
|
| 5102 |
|
redo A; |
| 5103 |
|
} |
| 5104 |
|
} elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) { |
| 5105 |
|
if ($self->{nc} == 0x003E) { # > |
| 5106 |
|
$self->{state} = DATA_STATE; |
| 5107 |
|
$self->{s_kwd} = ''; |
| 5108 |
|
|
| 5109 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 5110 |
|
$self->{line_prev} = $self->{line}; |
| 5111 |
|
$self->{column_prev} = $self->{column}; |
| 5112 |
|
$self->{column}++; |
| 5113 |
|
$self->{nc} |
| 5114 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 5115 |
|
} else { |
| 5116 |
|
$self->{set_nc}->($self); |
| 5117 |
|
} |
| 5118 |
|
|
| 5119 |
|
return ({type => END_OF_DOCTYPE_TOKEN}); |
| 5120 |
|
redo A; |
| 5121 |
|
} elsif ($self->{nc} == -1) { |
| 5122 |
|
$self->{state} = DATA_STATE; |
| 5123 |
|
$self->{s_kwd} = ''; |
| 5124 |
|
## Reconsume. |
| 5125 |
|
return ({type => END_OF_DOCTYPE_TOKEN}); |
| 5126 |
|
redo A; |
| 5127 |
|
} else { |
| 5128 |
|
## Stay in the state. |
| 5129 |
|
|
| 5130 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 5131 |
|
$self->{line_prev} = $self->{line}; |
| 5132 |
|
$self->{column_prev} = $self->{column}; |
| 5133 |
|
$self->{column}++; |
| 5134 |
|
$self->{nc} |
| 5135 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 5136 |
|
} else { |
| 5137 |
|
$self->{set_nc}->($self); |
| 5138 |
|
} |
| 5139 |
|
|
| 5140 |
|
redo A; |
| 5141 |
|
} |
| 5142 |
|
} elsif ($self->{state} == DOCTYPE_TAG_STATE) { |
| 5143 |
|
if ($self->{nc} == 0x0021) { # ! |
| 5144 |
|
$self->{state} = MARKUP_DECLARATION_OPEN_STATE; |
| 5145 |
|
|
| 5146 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 5147 |
|
$self->{line_prev} = $self->{line}; |
| 5148 |
|
$self->{column_prev} = $self->{column}; |
| 5149 |
|
$self->{column}++; |
| 5150 |
|
$self->{nc} |
| 5151 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 5152 |
|
} else { |
| 5153 |
|
$self->{set_nc}->($self); |
| 5154 |
|
} |
| 5155 |
|
|
| 5156 |
|
redo A; |
| 5157 |
|
} elsif ($self->{nc} == 0x003F) { # ? |
| 5158 |
|
$self->{state} = PI_STATE; |
| 5159 |
|
|
| 5160 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 5161 |
|
$self->{line_prev} = $self->{line}; |
| 5162 |
|
$self->{column_prev} = $self->{column}; |
| 5163 |
|
$self->{column}++; |
| 5164 |
|
$self->{nc} |
| 5165 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 5166 |
|
} else { |
| 5167 |
|
$self->{set_nc}->($self); |
| 5168 |
|
} |
| 5169 |
|
|
| 5170 |
|
redo A; |
| 5171 |
|
} elsif ($self->{nc} == -1) { |
| 5172 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago'); |
| 5173 |
|
$self->{state} = DATA_STATE; |
| 5174 |
|
$self->{s_kwd} = ''; |
| 5175 |
|
## Reconsume. |
| 5176 |
|
redo A; |
| 5177 |
|
} else { |
| 5178 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error. |
| 5179 |
|
line => $self->{line_prev}, |
| 5180 |
|
column => $self->{column_prev}); |
| 5181 |
|
$self->{state} = BOGUS_COMMENT_STATE; |
| 5182 |
|
$self->{ct} = {type => COMMENT_TOKEN, |
| 5183 |
|
data => '', |
| 5184 |
|
}; ## NOTE: Will be discarded. |
| 5185 |
|
|
| 5186 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 5187 |
|
$self->{line_prev} = $self->{line}; |
| 5188 |
|
$self->{column_prev} = $self->{column}; |
| 5189 |
|
$self->{column}++; |
| 5190 |
|
$self->{nc} |
| 5191 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 5192 |
|
} else { |
| 5193 |
|
$self->{set_nc}->($self); |
| 5194 |
|
} |
| 5195 |
|
|
| 5196 |
|
redo A; |
| 5197 |
|
} |
| 5198 |
|
|
| 5199 |
} else { |
} else { |
| 5200 |
die "$0: $self->{state}: Unknown state"; |
die "$0: $self->{state}: Unknown state"; |