| 114 |
sub ENTITY_NAME_STATE () { 49 } |
sub ENTITY_NAME_STATE () { 49 } |
| 115 |
sub PCDATA_STATE () { 50 } # "data state" in the spec |
sub PCDATA_STATE () { 50 } # "data state" in the spec |
| 116 |
|
|
| 117 |
|
## XML states |
## These constants number the tokenizer states used while reading an XML
## processing instruction ("<?target data?>").  The empty prototype makes
## each one a constant function.
| 118 |
|
## Entered from the tag open handling when "?" follows "<" and
## |$self->{is_xml}| is true.  A space, "?" or EOF here reports 'bare pio'
## and falls back to a bogus comment whose data starts with "?"; any other
## character starts a PI_TOKEN with that character as its target.
sub PI_STATE () { 51 } |
| 119 |
|
## Accumulates |->{target}|.  A space moves to PI_TARGET_AFTER_STATE, "?"
## moves to PI_AFTER_STATE, and EOF reports 'no pic' and emits the token.
sub PI_TARGET_STATE () { 52 } |
| 120 |
|
## Skips spaces after the target; the first non-space character is
## reprocessed in PI_DATA_STATE.
sub PI_TARGET_AFTER_STATE () { 53 } |
| 121 |
|
## Accumulates |->{data}|.  "?" moves to PI_DATA_AFTER_STATE, and EOF
## reports 'no pic' and emits the token.
sub PI_DATA_STATE () { 54 } |
| 122 |
|
## Reached when "?" directly follows the target.  ">" emits the token and
## returns to DATA_STATE; anything else reports 'no s after target',
## appends "?" to the data and continues in a data state.
sub PI_AFTER_STATE () { 55 } |
| 123 |
|
## Reached when "?" is seen inside the data.  ">" emits the token and
## returns to DATA_STATE; another "?" appends "?" and stays here; any other
## character appends "?" and is reprocessed in PI_DATA_STATE.
sub PI_DATA_AFTER_STATE () { 56 } |
| 124 |
|
|
| 125 |
## Tree constructor state constants (see Whatpm::HTML for the full |
## Tree constructor state constants (see Whatpm::HTML for the full |
| 126 |
## list and descriptions) |
## list and descriptions) |
| 127 |
|
|
| 227 |
## ->{value} |
## ->{value} |
| 228 |
## ->{has_reference} == 1 or 0 |
## ->{has_reference} == 1 or 0 |
| 229 |
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN) |
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN) |
| 230 |
|
## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN) |
| 231 |
## NOTE: The "self-closing flag" is held as |$self->{self_closing}|. |
## NOTE: The "self-closing flag" is held as |$self->{self_closing}|. |
| 232 |
## |->{self_closing}| is used to save the value of |$self->{self_closing}| |
## |->{self_closing}| is used to save the value of |$self->{self_closing}| |
| 233 |
## while the token is pushed back to the stack. |
## while the token is pushed back to the stack. |
| 638 |
|
|
| 639 |
redo A; |
redo A; |
| 640 |
} elsif ($self->{nc} == 0x003F) { # ? |
} elsif ($self->{nc} == 0x003F) { # ? |
| 641 |
|
if ($self->{is_xml}) { |
| 642 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'pio', |
|
| 643 |
line => $self->{line_prev}, |
$self->{state} = PI_STATE; |
| 644 |
column => $self->{column_prev}); |
|
| 645 |
$self->{state} = BOGUS_COMMENT_STATE; |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 646 |
$self->{ct} = {type => COMMENT_TOKEN, data => '', |
$self->{line_prev} = $self->{line}; |
| 647 |
line => $self->{line_prev}, |
$self->{column_prev} = $self->{column}; |
| 648 |
column => $self->{column_prev}, |
$self->{column}++; |
| 649 |
}; |
$self->{nc} |
| 650 |
## $self->{nc} is intentionally left as is |
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 651 |
redo A; |
} else { |
| 652 |
} else { |
$self->{set_nc}->($self); |
| 653 |
|
} |
| 654 |
|
|
| 655 |
|
redo A; |
| 656 |
|
} else { |
| 657 |
|
|
| 658 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'pio', |
| 659 |
|
line => $self->{line_prev}, |
| 660 |
|
column => $self->{column_prev}); |
| 661 |
|
$self->{state} = BOGUS_COMMENT_STATE; |
| 662 |
|
$self->{ct} = {type => COMMENT_TOKEN, data => '', |
| 663 |
|
line => $self->{line_prev}, |
| 664 |
|
column => $self->{column_prev}, |
| 665 |
|
}; |
| 666 |
|
## $self->{nc} is intentionally left as is |
| 667 |
|
redo A; |
| 668 |
|
} |
| 669 |
|
} elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) { |
| 670 |
|
|
| 671 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', |
| 672 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 681 |
}); |
}); |
| 682 |
|
|
| 683 |
redo A; |
redo A; |
| 684 |
|
} else { |
| 685 |
|
## XML5: "<:" is a parse error. |
| 686 |
|
|
| 687 |
|
$self->{ct} = {type => START_TAG_TOKEN, |
| 688 |
|
tag_name => chr ($self->{nc}), |
| 689 |
|
line => $self->{line_prev}, |
| 690 |
|
column => $self->{column_prev}}; |
| 691 |
|
$self->{state} = TAG_NAME_STATE; |
| 692 |
|
|
| 693 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 694 |
|
$self->{line_prev} = $self->{line}; |
| 695 |
|
$self->{column_prev} = $self->{column}; |
| 696 |
|
$self->{column}++; |
| 697 |
|
$self->{nc} |
| 698 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 699 |
|
} else { |
| 700 |
|
$self->{set_nc}->($self); |
| 701 |
|
} |
| 702 |
|
|
| 703 |
|
redo A; |
| 704 |
} |
} |
| 705 |
} else { |
} else { |
| 706 |
die "$0: $self->{content_model} in tag open"; |
die "$0: $self->{content_model} in tag open"; |
| 2273 |
redo A; |
redo A; |
| 2274 |
} elsif ($self->{s_kwd} eq '[CDATA' and |
} elsif ($self->{s_kwd} eq '[CDATA' and |
| 2275 |
$self->{nc} == 0x005B) { # [ |
$self->{nc} == 0x005B) { # [ |
| 2276 |
|
if ($self->{is_xml} and |
| 2277 |
|
not $self->{tainted} and |
| 2278 |
|
@{$self->{open_elements} or []} == 0) { |
| 2279 |
|
|
| 2280 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element', |
| 2281 |
|
line => $self->{line_prev}, |
| 2282 |
|
column => $self->{column_prev} - 7); |
| 2283 |
|
$self->{tainted} = 1; |
| 2284 |
|
} else { |
| 2285 |
|
|
| 2286 |
|
} |
| 2287 |
|
|
| 2288 |
$self->{ct} = {type => CHARACTER_TOKEN, |
$self->{ct} = {type => CHARACTER_TOKEN, |
| 2289 |
data => '', |
data => '', |
| 2290 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 3688 |
|
|
| 3689 |
redo A; |
redo A; |
| 3690 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
| 3691 |
|
if ($self->{is_xml}) { |
| 3692 |
|
|
| 3693 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type |
| 3694 |
|
} else { |
| 3695 |
|
|
| 3696 |
|
} |
| 3697 |
|
|
| 3698 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3699 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 3700 |
|
|
| 4016 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 4017 |
## Reconsume. |
## Reconsume. |
| 4018 |
return ({type => CHARACTER_TOKEN, data => chr $code, |
return ({type => CHARACTER_TOKEN, data => chr $code, |
| 4019 |
|
has_reference => 1, |
| 4020 |
line => $l, column => $c, |
line => $l, column => $c, |
| 4021 |
}); |
}); |
| 4022 |
redo A; |
redo A; |
| 4169 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 4170 |
## Reconsume. |
## Reconsume. |
| 4171 |
return ({type => CHARACTER_TOKEN, data => chr $code, |
return ({type => CHARACTER_TOKEN, data => chr $code, |
| 4172 |
|
has_reference => 1, |
| 4173 |
line => $l, column => $c, |
line => $l, column => $c, |
| 4174 |
}); |
}); |
| 4175 |
redo A; |
redo A; |
| 4295 |
## Reconsume. |
## Reconsume. |
| 4296 |
return ({type => CHARACTER_TOKEN, |
return ({type => CHARACTER_TOKEN, |
| 4297 |
data => $data, |
data => $data, |
| 4298 |
|
has_reference => $has_ref, |
| 4299 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 4300 |
column => $self->{column_prev} + 1 - length $self->{s_kwd}, |
column => $self->{column_prev} + 1 - length $self->{s_kwd}, |
| 4301 |
}); |
}); |
| 4309 |
## Reconsume. |
## Reconsume. |
| 4310 |
redo A; |
redo A; |
| 4311 |
} |
} |
| 4312 |
|
|
| 4313 |
|
## XML-only states |
| 4314 |
|
|
| 4315 |
|
} elsif ($self->{state} == PI_STATE) { |
| 4316 |
|
if ($is_space->{$self->{nc}} or |
| 4317 |
|
$self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else" |
| 4318 |
|
$self->{nc} == -1) { |
| 4319 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type |
| 4320 |
|
line => $self->{line_prev}, |
| 4321 |
|
column => $self->{column_prev} |
| 4322 |
|
- 1 * ($self->{nc} != -1)); |
| 4323 |
|
$self->{state} = BOGUS_COMMENT_STATE; |
| 4324 |
|
## Reconsume. |
| 4325 |
|
$self->{ct} = {type => COMMENT_TOKEN, |
| 4326 |
|
data => '?', |
| 4327 |
|
line => $self->{line_prev}, |
| 4328 |
|
column => $self->{column_prev} |
| 4329 |
|
- 1 * ($self->{nc} != -1), |
| 4330 |
|
}; |
| 4331 |
|
redo A; |
| 4332 |
|
} else { |
| 4333 |
|
$self->{ct} = {type => PI_TOKEN, |
| 4334 |
|
target => chr $self->{nc}, |
| 4335 |
|
data => '', |
| 4336 |
|
line => $self->{line_prev}, |
| 4337 |
|
column => $self->{column_prev} - 1, |
| 4338 |
|
}; |
| 4339 |
|
$self->{state} = PI_TARGET_STATE; |
| 4340 |
|
|
| 4341 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4342 |
|
$self->{line_prev} = $self->{line}; |
| 4343 |
|
$self->{column_prev} = $self->{column}; |
| 4344 |
|
$self->{column}++; |
| 4345 |
|
$self->{nc} |
| 4346 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4347 |
|
} else { |
| 4348 |
|
$self->{set_nc}->($self); |
| 4349 |
|
} |
| 4350 |
|
|
| 4351 |
|
redo A; |
| 4352 |
|
} |
| 4353 |
|
} elsif ($self->{state} == PI_TARGET_STATE) { |
| 4354 |
|
if ($is_space->{$self->{nc}}) { |
| 4355 |
|
$self->{state} = PI_TARGET_AFTER_STATE; |
| 4356 |
|
|
| 4357 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4358 |
|
$self->{line_prev} = $self->{line}; |
| 4359 |
|
$self->{column_prev} = $self->{column}; |
| 4360 |
|
$self->{column}++; |
| 4361 |
|
$self->{nc} |
| 4362 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4363 |
|
} else { |
| 4364 |
|
$self->{set_nc}->($self); |
| 4365 |
|
} |
| 4366 |
|
|
| 4367 |
|
redo A; |
| 4368 |
|
} elsif ($self->{nc} == -1) { |
| 4369 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type |
| 4370 |
|
$self->{state} = DATA_STATE; |
| 4371 |
|
$self->{s_kwd} = ''; |
| 4372 |
|
## Reconsume. |
| 4373 |
|
return ($self->{ct}); # pi |
| 4374 |
|
redo A; |
| 4375 |
|
} elsif ($self->{nc} == 0x003F) { # ? |
| 4376 |
|
$self->{state} = PI_AFTER_STATE; |
| 4377 |
|
|
| 4378 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4379 |
|
$self->{line_prev} = $self->{line}; |
| 4380 |
|
$self->{column_prev} = $self->{column}; |
| 4381 |
|
$self->{column}++; |
| 4382 |
|
$self->{nc} |
| 4383 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4384 |
|
} else { |
| 4385 |
|
$self->{set_nc}->($self); |
| 4386 |
|
} |
| 4387 |
|
|
| 4388 |
|
redo A; |
| 4389 |
|
} else { |
| 4390 |
|
## XML5: typo ("tag name" -> "target") |
| 4391 |
|
$self->{ct}->{target} .= chr $self->{nc}; # pi |
| 4392 |
|
|
| 4393 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4394 |
|
$self->{line_prev} = $self->{line}; |
| 4395 |
|
$self->{column_prev} = $self->{column}; |
| 4396 |
|
$self->{column}++; |
| 4397 |
|
$self->{nc} |
| 4398 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4399 |
|
} else { |
| 4400 |
|
$self->{set_nc}->($self); |
| 4401 |
|
} |
| 4402 |
|
|
| 4403 |
|
redo A; |
| 4404 |
|
} |
| 4405 |
|
} elsif ($self->{state} == PI_TARGET_AFTER_STATE) { |
| 4406 |
|
if ($is_space->{$self->{nc}}) { |
| 4407 |
|
## Stay in the state. |
| 4408 |
|
|
| 4409 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4410 |
|
$self->{line_prev} = $self->{line}; |
| 4411 |
|
$self->{column_prev} = $self->{column}; |
| 4412 |
|
$self->{column}++; |
| 4413 |
|
$self->{nc} |
| 4414 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4415 |
|
} else { |
| 4416 |
|
$self->{set_nc}->($self); |
| 4417 |
|
} |
| 4418 |
|
|
| 4419 |
|
redo A; |
| 4420 |
|
} else { |
| 4421 |
|
$self->{state} = PI_DATA_STATE; |
| 4422 |
|
## Reprocess. |
| 4423 |
|
redo A; |
| 4424 |
|
} |
| 4425 |
|
} elsif ($self->{state} == PI_DATA_STATE) { |
| 4426 |
|
if ($self->{nc} == 0x003F) { # ? |
| 4427 |
|
$self->{state} = PI_DATA_AFTER_STATE; |
| 4428 |
|
|
| 4429 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4430 |
|
$self->{line_prev} = $self->{line}; |
| 4431 |
|
$self->{column_prev} = $self->{column}; |
| 4432 |
|
$self->{column}++; |
| 4433 |
|
$self->{nc} |
| 4434 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4435 |
|
} else { |
| 4436 |
|
$self->{set_nc}->($self); |
| 4437 |
|
} |
| 4438 |
|
|
| 4439 |
|
redo A; |
| 4440 |
|
} elsif ($self->{nc} == -1) { |
| 4441 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type |
| 4442 |
|
$self->{state} = DATA_STATE; |
| 4443 |
|
$self->{s_kwd} = ''; |
| 4444 |
|
## Reprocess. |
| 4445 |
|
return ($self->{ct}); # pi |
| 4446 |
|
redo A; |
| 4447 |
|
} else { |
| 4448 |
|
$self->{ct}->{data} .= chr $self->{nc}; # pi |
| 4449 |
|
$self->{read_until}->($self->{ct}->{data}, q[?], |
| 4450 |
|
length $self->{ct}->{data}); |
| 4451 |
|
## Stay in the state. |
| 4452 |
|
|
| 4453 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4454 |
|
$self->{line_prev} = $self->{line}; |
| 4455 |
|
$self->{column_prev} = $self->{column}; |
| 4456 |
|
$self->{column}++; |
| 4457 |
|
$self->{nc} |
| 4458 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4459 |
|
} else { |
| 4460 |
|
$self->{set_nc}->($self); |
| 4461 |
|
} |
| 4462 |
|
|
| 4463 |
|
## Reprocess. |
| 4464 |
|
redo A; |
| 4465 |
|
} |
| 4466 |
|
} elsif ($self->{state} == PI_AFTER_STATE) { |
| 4467 |
|
if ($self->{nc} == 0x003E) { # > |
| 4468 |
|
$self->{state} = DATA_STATE; |
| 4469 |
|
$self->{s_kwd} = ''; |
| 4470 |
|
|
| 4471 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4472 |
|
$self->{line_prev} = $self->{line}; |
| 4473 |
|
$self->{column_prev} = $self->{column}; |
| 4474 |
|
$self->{column}++; |
| 4475 |
|
$self->{nc} |
| 4476 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4477 |
|
} else { |
| 4478 |
|
$self->{set_nc}->($self); |
| 4479 |
|
} |
| 4480 |
|
|
| 4481 |
|
return ($self->{ct}); # pi |
| 4482 |
|
redo A; |
| 4483 |
|
} elsif ($self->{nc} == 0x003F) { # ? |
| 4484 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type |
| 4485 |
|
line => $self->{line_prev}, |
| 4486 |
|
column => $self->{column_prev}); ## XML5: no error |
| 4487 |
|
$self->{ct}->{data} .= '?'; |
| 4488 |
|
$self->{state} = PI_DATA_AFTER_STATE; |
| 4489 |
|
|
| 4490 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4491 |
|
$self->{line_prev} = $self->{line}; |
| 4492 |
|
$self->{column_prev} = $self->{column}; |
| 4493 |
|
$self->{column}++; |
| 4494 |
|
$self->{nc} |
| 4495 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4496 |
|
} else { |
| 4497 |
|
$self->{set_nc}->($self); |
| 4498 |
|
} |
| 4499 |
|
|
| 4500 |
|
redo A; |
| 4501 |
|
} else { |
| 4502 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type |
| 4503 |
|
line => $self->{line_prev}, |
| 4504 |
|
column => $self->{column_prev} |
| 4505 |
|
+ 1 * ($self->{nc} == -1)); ## XML5: no error |
| 4506 |
|
$self->{ct}->{data} .= '?'; ## XML5: not appended |
| 4507 |
|
$self->{state} = PI_DATA_STATE; |
| 4508 |
|
## Reprocess. |
| 4509 |
|
redo A; |
| 4510 |
|
} |
| 4511 |
|
} elsif ($self->{state} == PI_DATA_AFTER_STATE) { |
| 4512 |
|
## XML5: Same as "pi after state" in XML5 |
| 4513 |
|
if ($self->{nc} == 0x003E) { # > |
| 4514 |
|
$self->{state} = DATA_STATE; |
| 4515 |
|
$self->{s_kwd} = ''; |
| 4516 |
|
|
| 4517 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4518 |
|
$self->{line_prev} = $self->{line}; |
| 4519 |
|
$self->{column_prev} = $self->{column}; |
| 4520 |
|
$self->{column}++; |
| 4521 |
|
$self->{nc} |
| 4522 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4523 |
|
} else { |
| 4524 |
|
$self->{set_nc}->($self); |
| 4525 |
|
} |
| 4526 |
|
|
| 4527 |
|
return ($self->{ct}); # pi |
| 4528 |
|
redo A; |
| 4529 |
|
} elsif ($self->{nc} == 0x003F) { # ? |
| 4530 |
|
$self->{ct}->{data} .= '?'; |
| 4531 |
|
## Stay in the state. |
| 4532 |
|
|
| 4533 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4534 |
|
$self->{line_prev} = $self->{line}; |
| 4535 |
|
$self->{column_prev} = $self->{column}; |
| 4536 |
|
$self->{column}++; |
| 4537 |
|
$self->{nc} |
| 4538 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4539 |
|
} else { |
| 4540 |
|
$self->{set_nc}->($self); |
| 4541 |
|
} |
| 4542 |
|
|
| 4543 |
|
redo A; |
| 4544 |
|
} else { |
| 4545 |
|
$self->{ct}->{data} .= '?'; ## XML5: not appended |
| 4546 |
|
$self->{state} = PI_DATA_STATE; |
| 4547 |
|
## Reprocess. |
| 4548 |
|
redo A; |
| 4549 |
|
} |
| 4550 |
|
|
| 4551 |
} else { |
} else { |
| 4552 |
die "$0: $self->{state}: Unknown state"; |
die "$0: $self->{state}: Unknown state"; |
| 4553 |
} |
} |