| 114 |
## Tokenizer state identifier.  The empty prototype makes it an
## inlinable constant.  Presumably the state that reads an entity name;
## it is not referenced in the visible part of the file -- confirm.
sub ENTITY_NAME_STATE () { 49 } |
sub ENTITY_NAME_STATE () { 49 } |
| 115 |
## Tokenizer state identifier.  It is assigned to |$self->{state}| after
## character data has been read while the content model is
## |PCDATA_CONTENT_MODEL|.
sub PCDATA_STATE () { 50 } # "data state" in the spec |
sub PCDATA_STATE () { 50 } # "data state" in the spec |
| 116 |
|
|
| 117 |
|
## XML states |
## Tokenizer state identifiers used for XML processing instructions.
## Each is a constant sub (empty prototype) returning a distinct integer
## that is stored in and compared against |$self->{state}|.
| 118 |
|
## Entered from the tag open state when "<" is followed by "?" and
## |$self->{is_xml}| is true.
sub PI_STATE () { 51 } |
| 119 |
|
## Presumably: reading the PI target name -- not used in the visible code.
sub PI_TARGET_STATE () { 52 } |
| 120 |
|
## Presumably: after the PI target -- not used in the visible code.
sub PI_TARGET_AFTER_STATE () { 53 } |
| 121 |
|
## Presumably: reading the PI data -- not used in the visible code.
sub PI_DATA_STATE () { 54 } |
| 122 |
|
## Presumably: after a "?" that may close the PI -- TODO confirm.
sub PI_AFTER_STATE () { 55 } |
| 123 |
|
## Presumably: after a "?" seen inside the PI data -- TODO confirm.
sub PI_DATA_AFTER_STATE () { 56 } |
| 124 |
|
|
| 125 |
## Tree constructor state constants (see Whatpm::HTML for the full |
## Tree constructor state constants (see Whatpm::HTML for the full |
| 126 |
## list and descriptions) |
## list and descriptions) |
| 127 |
|
|
| 186 |
#$self->{is_xml} (if XML) |
#$self->{is_xml} (if XML) |
| 187 |
|
|
| 188 |
$self->{state} = DATA_STATE; # MUST |
$self->{state} = DATA_STATE; # MUST |
| 189 |
#$self->{s_kwd}; # state keyword - initialized when used |
$self->{s_kwd} = ''; # state keyword |
| 190 |
#$self->{entity__value}; # initialized when used |
#$self->{entity__value}; # initialized when used |
| 191 |
#$self->{entity__match}; # initialized when used |
#$self->{entity__match}; # initialized when used |
| 192 |
$self->{content_model} = PCDATA_CONTENT_MODEL; # be |
$self->{content_model} = PCDATA_CONTENT_MODEL; # be |
| 216 |
|
|
| 217 |
## A token has: |
## A token has: |
| 218 |
## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN, |
## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN, |
| 219 |
## CHARACTER_TOKEN, or END_OF_FILE_TOKEN |
## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN |
| 220 |
## ->{name} (DOCTYPE_TOKEN) |
## ->{name} (DOCTYPE_TOKEN) |
| 221 |
## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN) |
## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN) |
| 222 |
|
## ->{target} (PI_TOKEN) |
| 223 |
## ->{pubid} (DOCTYPE_TOKEN) |
## ->{pubid} (DOCTYPE_TOKEN) |
| 224 |
## ->{sysid} (DOCTYPE_TOKEN) |
## ->{sysid} (DOCTYPE_TOKEN) |
| 225 |
## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag |
## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag |
| 227 |
## ->{name} |
## ->{name} |
| 228 |
## ->{value} |
## ->{value} |
| 229 |
## ->{has_reference} == 1 or 0 |
## ->{has_reference} == 1 or 0 |
| 230 |
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN) |
## ->{index}: Index of the attribute in a tag. |
| 231 |
|
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN) |
| 232 |
|
## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN) |
| 233 |
|
## ->{last_index} (START_TAG_TOKEN, END_TAG_TOKEN): Next attribute's index - 1. |
| 234 |
## NOTE: The "self-closing flag" is held in |$self->{self_closing}|. |
## NOTE: The "self-closing flag" is held in |$self->{self_closing}|. |
| 235 |
## |->{self_closing}| is used to save the value of |$self->{self_closing}| |
## |->{self_closing}| is used to save the value of |$self->{self_closing}| |
| 236 |
## while the token is pushed back to the stack. |
## while the token is pushed back to the stack. |
| 374 |
} |
} |
| 375 |
} elsif ($self->{nc} == 0x002D) { # - |
} elsif ($self->{nc} == 0x002D) { # - |
| 376 |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
| 377 |
$self->{s_kwd} .= '-'; |
if ($self->{s_kwd} eq '<!-') { |
|
|
|
|
if ($self->{s_kwd} eq '<!--') { |
|
| 378 |
|
|
| 379 |
$self->{escape} = 1; # unless $self->{escape}; |
$self->{escape} = 1; # unless $self->{escape}; |
| 380 |
$self->{s_kwd} = '--'; |
$self->{s_kwd} = '--'; |
| 381 |
# |
# |
| 382 |
} elsif ($self->{s_kwd} eq '---') { |
} elsif ($self->{s_kwd} eq '-') { |
| 383 |
|
|
| 384 |
$self->{s_kwd} = '--'; |
$self->{s_kwd} = '--'; |
| 385 |
# |
# |
| 386 |
|
} elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') { |
| 387 |
|
|
| 388 |
|
$self->{s_kwd} .= '-'; |
| 389 |
|
# |
| 390 |
} else { |
} else { |
| 391 |
|
|
| 392 |
|
$self->{s_kwd} = '-'; |
| 393 |
# |
# |
| 394 |
} |
} |
| 395 |
} |
} |
| 435 |
if ($self->{s_kwd} eq '--') { |
if ($self->{s_kwd} eq '--') { |
| 436 |
|
|
| 437 |
delete $self->{escape}; |
delete $self->{escape}; |
| 438 |
|
# |
| 439 |
} else { |
} else { |
| 440 |
|
|
| 441 |
|
# |
| 442 |
} |
} |
| 443 |
|
} elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') { |
| 444 |
|
|
| 445 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type |
| 446 |
|
line => $self->{line_prev}, |
| 447 |
|
column => $self->{column_prev} - 1); |
| 448 |
|
# |
| 449 |
} else { |
} else { |
| 450 |
|
|
| 451 |
|
# |
| 452 |
} |
} |
| 453 |
|
|
| 454 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 455 |
# |
# |
| 456 |
|
} elsif ($self->{nc} == 0x005D) { # ] |
| 457 |
|
if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') { |
| 458 |
|
|
| 459 |
|
$self->{s_kwd} .= ']'; |
| 460 |
|
} elsif ($self->{s_kwd} eq ']]') { |
| 461 |
|
|
| 462 |
|
# |
| 463 |
|
} else { |
| 464 |
|
|
| 465 |
|
$self->{s_kwd} = ''; |
| 466 |
|
} |
| 467 |
|
# |
| 468 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
| 469 |
|
|
| 470 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 482 |
data => chr $self->{nc}, |
data => chr $self->{nc}, |
| 483 |
line => $self->{line}, column => $self->{column}, |
line => $self->{line}, column => $self->{column}, |
| 484 |
}; |
}; |
| 485 |
if ($self->{read_until}->($token->{data}, q[-!<>&], |
if ($self->{read_until}->($token->{data}, q{-!<>&\]}, |
| 486 |
length $token->{data})) { |
length $token->{data})) { |
| 487 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 488 |
} |
} |
| 489 |
|
|
| 490 |
## Stay in the data state. |
## Stay in the data state. |
| 491 |
if ($self->{content_model} == PCDATA_CONTENT_MODEL) { |
if (not $self->{is_xml} and |
| 492 |
|
$self->{content_model} == PCDATA_CONTENT_MODEL) { |
| 493 |
|
|
| 494 |
$self->{state} = PCDATA_STATE; |
$self->{state} = PCDATA_STATE; |
| 495 |
} else { |
} else { |
| 510 |
return ($token); |
return ($token); |
| 511 |
redo A; |
redo A; |
| 512 |
} elsif ($self->{state} == TAG_OPEN_STATE) { |
} elsif ($self->{state} == TAG_OPEN_STATE) { |
| 513 |
|
## XML5: "tag state". |
| 514 |
|
|
| 515 |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
| 516 |
if ($self->{nc} == 0x002F) { # / |
if ($self->{nc} == 0x002F) { # / |
| 517 |
|
|
| 539 |
|
|
| 540 |
## reconsume |
## reconsume |
| 541 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 542 |
|
$self->{s_kwd} = ''; |
| 543 |
return ({type => CHARACTER_TOKEN, data => '<', |
return ({type => CHARACTER_TOKEN, data => '<', |
| 544 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 545 |
column => $self->{column_prev}, |
column => $self->{column_prev}, |
| 581 |
|
|
| 582 |
$self->{ct} |
$self->{ct} |
| 583 |
= {type => START_TAG_TOKEN, |
= {type => START_TAG_TOKEN, |
| 584 |
tag_name => chr ($self->{nc} + 0x0020), |
tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)), |
| 585 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 586 |
column => $self->{column_prev}}; |
column => $self->{column_prev}}; |
| 587 |
$self->{state} = TAG_NAME_STATE; |
$self->{state} = TAG_NAME_STATE; |
| 623 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 624 |
column => $self->{column_prev}); |
column => $self->{column_prev}); |
| 625 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 626 |
|
$self->{s_kwd} = ''; |
| 627 |
|
|
| 628 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 629 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 643 |
|
|
| 644 |
redo A; |
redo A; |
| 645 |
} elsif ($self->{nc} == 0x003F) { # ? |
} elsif ($self->{nc} == 0x003F) { # ? |
| 646 |
|
if ($self->{is_xml}) { |
| 647 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'pio', |
|
| 648 |
line => $self->{line_prev}, |
$self->{state} = PI_STATE; |
| 649 |
column => $self->{column_prev}); |
|
| 650 |
$self->{state} = BOGUS_COMMENT_STATE; |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 651 |
$self->{ct} = {type => COMMENT_TOKEN, data => '', |
$self->{line_prev} = $self->{line}; |
| 652 |
line => $self->{line_prev}, |
$self->{column_prev} = $self->{column}; |
| 653 |
column => $self->{column_prev}, |
$self->{column}++; |
| 654 |
}; |
$self->{nc} |
| 655 |
## $self->{nc} is intentionally left as is |
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 656 |
redo A; |
} else { |
| 657 |
} else { |
$self->{set_nc}->($self); |
| 658 |
|
} |
| 659 |
|
|
| 660 |
|
redo A; |
| 661 |
|
} else { |
| 662 |
|
|
| 663 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'pio', |
| 664 |
|
line => $self->{line_prev}, |
| 665 |
|
column => $self->{column_prev}); |
| 666 |
|
$self->{state} = BOGUS_COMMENT_STATE; |
| 667 |
|
$self->{ct} = {type => COMMENT_TOKEN, data => '', |
| 668 |
|
line => $self->{line_prev}, |
| 669 |
|
column => $self->{column_prev}, |
| 670 |
|
}; |
| 671 |
|
## $self->{nc} is intentionally left as is |
| 672 |
|
redo A; |
| 673 |
|
} |
| 674 |
|
} elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) { |
| 675 |
|
|
| 676 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', |
| 677 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 678 |
column => $self->{column_prev}); |
column => $self->{column_prev}); |
| 679 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 680 |
|
$self->{s_kwd} = ''; |
| 681 |
## reconsume |
## reconsume |
| 682 |
|
|
| 683 |
return ({type => CHARACTER_TOKEN, data => '<', |
return ({type => CHARACTER_TOKEN, data => '<', |
| 686 |
}); |
}); |
| 687 |
|
|
| 688 |
redo A; |
redo A; |
| 689 |
|
} else { |
| 690 |
|
## XML5: "<:" is a parse error. |
| 691 |
|
|
| 692 |
|
$self->{ct} = {type => START_TAG_TOKEN, |
| 693 |
|
tag_name => chr ($self->{nc}), |
| 694 |
|
line => $self->{line_prev}, |
| 695 |
|
column => $self->{column_prev}}; |
| 696 |
|
$self->{state} = TAG_NAME_STATE; |
| 697 |
|
|
| 698 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 699 |
|
$self->{line_prev} = $self->{line}; |
| 700 |
|
$self->{column_prev} = $self->{column}; |
| 701 |
|
$self->{column}++; |
| 702 |
|
$self->{nc} |
| 703 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 704 |
|
} else { |
| 705 |
|
$self->{set_nc}->($self); |
| 706 |
|
} |
| 707 |
|
|
| 708 |
|
redo A; |
| 709 |
} |
} |
| 710 |
} else { |
} else { |
| 711 |
die "$0: $self->{content_model} in tag open"; |
die "$0: $self->{content_model} in tag open"; |
| 714 |
## NOTE: The "close tag open state" in the spec is implemented as |
## NOTE: The "close tag open state" in the spec is implemented as |
| 715 |
## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|. |
## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|. |
| 716 |
|
|
| 717 |
|
## XML5: "end tag state". |
| 718 |
|
|
| 719 |
my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</" |
my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</" |
| 720 |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
| 721 |
if (defined $self->{last_stag_name}) { |
if (defined $self->{last_stag_name}) { |
| 728 |
## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>. |
## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>. |
| 729 |
|
|
| 730 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 731 |
|
$self->{s_kwd} = ''; |
| 732 |
## Reconsume. |
## Reconsume. |
| 733 |
return ({type => CHARACTER_TOKEN, data => '</', |
return ({type => CHARACTER_TOKEN, data => '</', |
| 734 |
line => $l, column => $c, |
line => $l, column => $c, |
| 742 |
|
|
| 743 |
$self->{ct} |
$self->{ct} |
| 744 |
= {type => END_TAG_TOKEN, |
= {type => END_TAG_TOKEN, |
| 745 |
tag_name => chr ($self->{nc} + 0x0020), |
tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)), |
| 746 |
line => $l, column => $c}; |
line => $l, column => $c}; |
| 747 |
$self->{state} = TAG_NAME_STATE; |
$self->{state} = TAG_NAME_STATE; |
| 748 |
|
|
| 777 |
|
|
| 778 |
redo A; |
redo A; |
| 779 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
|
|
|
| 780 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag', |
| 781 |
line => $self->{line_prev}, ## "<" in "</>" |
line => $self->{line_prev}, ## "<" in "</>" |
| 782 |
column => $self->{column_prev} - 1); |
column => $self->{column_prev} - 1); |
| 783 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 784 |
|
$self->{s_kwd} = ''; |
| 785 |
|
if ($self->{is_xml}) { |
| 786 |
|
|
| 787 |
|
## XML5: No parse error. |
| 788 |
|
|
| 789 |
|
## NOTE: This parser raises a parse error, since it supports |
| 790 |
|
## XML1, not XML5. |
| 791 |
|
|
| 792 |
|
## NOTE: A short end tag token. |
| 793 |
|
my $ct = {type => END_TAG_TOKEN, |
| 794 |
|
tag_name => '', |
| 795 |
|
line => $self->{line_prev}, |
| 796 |
|
column => $self->{column_prev} - 1, |
| 797 |
|
}; |
| 798 |
|
|
| 799 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 800 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 801 |
$self->{column_prev} = $self->{column}; |
$self->{column_prev} = $self->{column}; |
| 806 |
$self->{set_nc}->($self); |
$self->{set_nc}->($self); |
| 807 |
} |
} |
| 808 |
|
|
| 809 |
|
return ($ct); |
| 810 |
|
} else { |
| 811 |
|
|
| 812 |
|
|
| 813 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 814 |
|
$self->{line_prev} = $self->{line}; |
| 815 |
|
$self->{column_prev} = $self->{column}; |
| 816 |
|
$self->{column}++; |
| 817 |
|
$self->{nc} |
| 818 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 819 |
|
} else { |
| 820 |
|
$self->{set_nc}->($self); |
| 821 |
|
} |
| 822 |
|
|
| 823 |
|
} |
| 824 |
redo A; |
redo A; |
| 825 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
| 826 |
|
|
| 827 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago'); |
| 828 |
|
$self->{s_kwd} = ''; |
| 829 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 830 |
# reconsume |
# reconsume |
| 831 |
|
|
| 834 |
}); |
}); |
| 835 |
|
|
| 836 |
redo A; |
redo A; |
| 837 |
} else { |
} elsif (not $self->{is_xml} or |
| 838 |
|
$is_space->{$self->{nc}}) { |
| 839 |
|
|
| 840 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag', |
| 841 |
|
line => $self->{line_prev}, # "<" of "</" |
| 842 |
|
column => $self->{column_prev} - 1); |
| 843 |
$self->{state} = BOGUS_COMMENT_STATE; |
$self->{state} = BOGUS_COMMENT_STATE; |
| 844 |
$self->{ct} = {type => COMMENT_TOKEN, data => '', |
$self->{ct} = {type => COMMENT_TOKEN, data => '', |
| 845 |
line => $self->{line_prev}, # "<" of "</" |
line => $self->{line_prev}, # "<" of "</" |
| 852 |
## generated from the bogus end tag, as defined in the |
## generated from the bogus end tag, as defined in the |
| 853 |
## "bogus comment state" entry. |
## "bogus comment state" entry. |
| 854 |
redo A; |
redo A; |
| 855 |
|
} else { |
| 856 |
|
## XML5: "</:" is a parse error. |
| 857 |
|
|
| 858 |
|
$self->{ct} = {type => END_TAG_TOKEN, |
| 859 |
|
tag_name => chr ($self->{nc}), |
| 860 |
|
line => $l, column => $c}; |
| 861 |
|
$self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state". |
| 862 |
|
|
| 863 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 864 |
|
$self->{line_prev} = $self->{line}; |
| 865 |
|
$self->{column_prev} = $self->{column}; |
| 866 |
|
$self->{column}++; |
| 867 |
|
$self->{nc} |
| 868 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 869 |
|
} else { |
| 870 |
|
$self->{set_nc}->($self); |
| 871 |
|
} |
| 872 |
|
|
| 873 |
|
redo A; |
| 874 |
} |
} |
| 875 |
} elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) { |
} elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) { |
| 876 |
my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1; |
my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1; |
| 897 |
} else { |
} else { |
| 898 |
|
|
| 899 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 900 |
|
$self->{s_kwd} = ''; |
| 901 |
## Reconsume. |
## Reconsume. |
| 902 |
return ({type => CHARACTER_TOKEN, |
return ({type => CHARACTER_TOKEN, |
| 903 |
data => '</' . $self->{s_kwd}, |
data => '</' . $self->{s_kwd}, |
| 916 |
|
|
| 917 |
## Reconsume. |
## Reconsume. |
| 918 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 919 |
|
$self->{s_kwd} = ''; |
| 920 |
return ({type => CHARACTER_TOKEN, |
return ({type => CHARACTER_TOKEN, |
| 921 |
data => '</' . $self->{s_kwd}, |
data => '</' . $self->{s_kwd}, |
| 922 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 968 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 969 |
} |
} |
| 970 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 971 |
|
$self->{s_kwd} = ''; |
| 972 |
|
|
| 973 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 974 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 987 |
} elsif (0x0041 <= $self->{nc} and |
} elsif (0x0041 <= $self->{nc} and |
| 988 |
$self->{nc} <= 0x005A) { # A..Z |
$self->{nc} <= 0x005A) { # A..Z |
| 989 |
|
|
| 990 |
$self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020); |
$self->{ct}->{tag_name} |
| 991 |
|
.= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)); |
| 992 |
# start tag or end tag |
# start tag or end tag |
| 993 |
## Stay in this state |
## Stay in this state |
| 994 |
|
|
| 1021 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 1022 |
} |
} |
| 1023 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1024 |
|
$self->{s_kwd} = ''; |
| 1025 |
# reconsume |
# reconsume |
| 1026 |
|
|
| 1027 |
return ($self->{ct}); # start tag or end tag |
return ($self->{ct}); # start tag or end tag |
| 1061 |
redo A; |
redo A; |
| 1062 |
} |
} |
| 1063 |
} elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) { |
} elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) { |
| 1064 |
|
## XML5: "Tag attribute name before state". |
| 1065 |
|
|
| 1066 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 1067 |
|
|
| 1068 |
## Stay in the state |
## Stay in the state |
| 1094 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 1095 |
} |
} |
| 1096 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1097 |
|
$self->{s_kwd} = ''; |
| 1098 |
|
|
| 1099 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 1100 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 1114 |
$self->{nc} <= 0x005A) { # A..Z |
$self->{nc} <= 0x005A) { # A..Z |
| 1115 |
|
|
| 1116 |
$self->{ca} |
$self->{ca} |
| 1117 |
= {name => chr ($self->{nc} + 0x0020), |
= {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)), |
| 1118 |
value => '', |
value => '', |
| 1119 |
line => $self->{line}, column => $self->{column}}; |
line => $self->{line}, column => $self->{column}}; |
| 1120 |
$self->{state} = ATTRIBUTE_NAME_STATE; |
$self->{state} = ATTRIBUTE_NAME_STATE; |
| 1162 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 1163 |
} |
} |
| 1164 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1165 |
|
$self->{s_kwd} = ''; |
| 1166 |
# reconsume |
# reconsume |
| 1167 |
|
|
| 1168 |
return ($self->{ct}); # start tag or end tag |
return ($self->{ct}); # start tag or end tag |
| 1175 |
0x003D => 1, # = |
0x003D => 1, # = |
| 1176 |
}->{$self->{nc}}) { |
}->{$self->{nc}}) { |
| 1177 |
|
|
| 1178 |
|
## XML5: Not a parse error. |
| 1179 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name'); |
| 1180 |
} else { |
} else { |
| 1181 |
|
|
| 1182 |
|
## XML5: ":" raises a parse error and is ignored. |
| 1183 |
} |
} |
| 1184 |
$self->{ca} |
$self->{ca} |
| 1185 |
= {name => chr ($self->{nc}), |
= {name => chr ($self->{nc}), |
| 1200 |
redo A; |
redo A; |
| 1201 |
} |
} |
| 1202 |
} elsif ($self->{state} == ATTRIBUTE_NAME_STATE) { |
} elsif ($self->{state} == ATTRIBUTE_NAME_STATE) { |
| 1203 |
|
## XML5: "Tag attribute name state". |
| 1204 |
|
|
| 1205 |
my $before_leave = sub { |
my $before_leave = sub { |
| 1206 |
if (exists $self->{ct}->{attributes} # start tag or end tag |
if (exists $self->{ct}->{attributes} # start tag or end tag |
| 1207 |
->{$self->{ca}->{name}}) { # MUST |
->{$self->{ca}->{name}}) { # MUST |
| 1212 |
|
|
| 1213 |
$self->{ct}->{attributes}->{$self->{ca}->{name}} |
$self->{ct}->{attributes}->{$self->{ca}->{name}} |
| 1214 |
= $self->{ca}; |
= $self->{ca}; |
| 1215 |
|
$self->{ca}->{index} = ++$self->{ct}->{last_index}; |
| 1216 |
} |
} |
| 1217 |
}; # $before_leave |
}; # $before_leave |
| 1218 |
|
|
| 1249 |
|
|
| 1250 |
redo A; |
redo A; |
| 1251 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
| 1252 |
|
if ($self->{is_xml}) { |
| 1253 |
|
|
| 1254 |
|
## XML5: Not a parse error. |
| 1255 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type |
| 1256 |
|
} else { |
| 1257 |
|
|
| 1258 |
|
} |
| 1259 |
|
|
| 1260 |
$before_leave->(); |
$before_leave->(); |
| 1261 |
if ($self->{ct}->{type} == START_TAG_TOKEN) { |
if ($self->{ct}->{type} == START_TAG_TOKEN) { |
| 1262 |
|
|
| 1271 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 1272 |
} |
} |
| 1273 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1274 |
|
$self->{s_kwd} = ''; |
| 1275 |
|
|
| 1276 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 1277 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 1290 |
} elsif (0x0041 <= $self->{nc} and |
} elsif (0x0041 <= $self->{nc} and |
| 1291 |
$self->{nc} <= 0x005A) { # A..Z |
$self->{nc} <= 0x005A) { # A..Z |
| 1292 |
|
|
| 1293 |
$self->{ca}->{name} .= chr ($self->{nc} + 0x0020); |
$self->{ca}->{name} |
| 1294 |
|
.= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)); |
| 1295 |
## Stay in the state |
## Stay in the state |
| 1296 |
|
|
| 1297 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 1306 |
|
|
| 1307 |
redo A; |
redo A; |
| 1308 |
} elsif ($self->{nc} == 0x002F) { # / |
} elsif ($self->{nc} == 0x002F) { # / |
| 1309 |
|
if ($self->{is_xml}) { |
| 1310 |
|
|
| 1311 |
|
## XML5: Not a parse error. |
| 1312 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type |
| 1313 |
|
} else { |
| 1314 |
|
|
| 1315 |
|
} |
| 1316 |
|
|
| 1317 |
$before_leave->(); |
$before_leave->(); |
| 1318 |
$self->{state} = SELF_CLOSING_START_TAG_STATE; |
$self->{state} = SELF_CLOSING_START_TAG_STATE; |
| 1347 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 1348 |
} |
} |
| 1349 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1350 |
|
$self->{s_kwd} = ''; |
| 1351 |
# reconsume |
# reconsume |
| 1352 |
|
|
| 1353 |
return ($self->{ct}); # start tag or end tag |
return ($self->{ct}); # start tag or end tag |
| 1357 |
if ($self->{nc} == 0x0022 or # " |
if ($self->{nc} == 0x0022 or # " |
| 1358 |
$self->{nc} == 0x0027) { # ' |
$self->{nc} == 0x0027) { # ' |
| 1359 |
|
|
| 1360 |
|
## XML5: Not a parse error. |
| 1361 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name'); |
| 1362 |
} else { |
} else { |
| 1363 |
|
|
| 1378 |
redo A; |
redo A; |
| 1379 |
} |
} |
| 1380 |
} elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) { |
} elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) { |
| 1381 |
|
## XML5: "Tag attribute name after state". |
| 1382 |
|
|
| 1383 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 1384 |
|
|
| 1385 |
## Stay in the state |
## Stay in the state |
| 1411 |
|
|
| 1412 |
redo A; |
redo A; |
| 1413 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
| 1414 |
|
if ($self->{is_xml}) { |
| 1415 |
|
|
| 1416 |
|
## XML5: Not a parse error. |
| 1417 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type |
| 1418 |
|
} else { |
| 1419 |
|
|
| 1420 |
|
} |
| 1421 |
|
|
| 1422 |
if ($self->{ct}->{type} == START_TAG_TOKEN) { |
if ($self->{ct}->{type} == START_TAG_TOKEN) { |
| 1423 |
|
|
| 1424 |
$self->{last_stag_name} = $self->{ct}->{tag_name}; |
$self->{last_stag_name} = $self->{ct}->{tag_name}; |
| 1435 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 1436 |
} |
} |
| 1437 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1438 |
|
$self->{s_kwd} = ''; |
| 1439 |
|
|
| 1440 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 1441 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 1455 |
$self->{nc} <= 0x005A) { # A..Z |
$self->{nc} <= 0x005A) { # A..Z |
| 1456 |
|
|
| 1457 |
$self->{ca} |
$self->{ca} |
| 1458 |
= {name => chr ($self->{nc} + 0x0020), |
= {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)), |
| 1459 |
value => '', |
value => '', |
| 1460 |
line => $self->{line}, column => $self->{column}}; |
line => $self->{line}, column => $self->{column}}; |
| 1461 |
$self->{state} = ATTRIBUTE_NAME_STATE; |
$self->{state} = ATTRIBUTE_NAME_STATE; |
| 1472 |
|
|
| 1473 |
redo A; |
redo A; |
| 1474 |
} elsif ($self->{nc} == 0x002F) { # / |
} elsif ($self->{nc} == 0x002F) { # / |
| 1475 |
|
if ($self->{is_xml}) { |
| 1476 |
|
|
| 1477 |
|
## XML5: Not a parse error. |
| 1478 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type |
| 1479 |
|
} else { |
| 1480 |
|
|
| 1481 |
|
} |
| 1482 |
|
|
| 1483 |
$self->{state} = SELF_CLOSING_START_TAG_STATE; |
$self->{state} = SELF_CLOSING_START_TAG_STATE; |
| 1484 |
|
|
| 1510 |
} else { |
} else { |
| 1511 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 1512 |
} |
} |
| 1513 |
|
$self->{s_kwd} = ''; |
| 1514 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1515 |
# reconsume |
# reconsume |
| 1516 |
|
|
| 1518 |
|
|
| 1519 |
redo A; |
redo A; |
| 1520 |
} else { |
} else { |
| 1521 |
|
if ($self->{is_xml}) { |
| 1522 |
|
|
| 1523 |
|
## XML5: Not a parse error. |
| 1524 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type |
| 1525 |
|
} else { |
| 1526 |
|
|
| 1527 |
|
} |
| 1528 |
|
|
| 1529 |
if ($self->{nc} == 0x0022 or # " |
if ($self->{nc} == 0x0022 or # " |
| 1530 |
$self->{nc} == 0x0027) { # ' |
$self->{nc} == 0x0027) { # ' |
| 1531 |
|
|
| 1532 |
|
## XML5: Not a parse error. |
| 1533 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name'); |
| 1534 |
} else { |
} else { |
| 1535 |
|
|
| 1553 |
redo A; |
redo A; |
| 1554 |
} |
} |
| 1555 |
} elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) { |
} elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) { |
| 1556 |
|
## XML5: "Tag attribute value before state". |
| 1557 |
|
|
| 1558 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 1559 |
|
|
| 1560 |
## Stay in the state |
## Stay in the state |
| 1623 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 1624 |
} |
} |
| 1625 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1626 |
|
$self->{s_kwd} = ''; |
| 1627 |
|
|
| 1628 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 1629 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 1657 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 1658 |
} |
} |
| 1659 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1660 |
|
$self->{s_kwd} = ''; |
| 1661 |
## reconsume |
## reconsume |
| 1662 |
|
|
| 1663 |
return ($self->{ct}); # start tag or end tag |
return ($self->{ct}); # start tag or end tag |
| 1666 |
} else { |
} else { |
| 1667 |
if ($self->{nc} == 0x003D) { # = |
if ($self->{nc} == 0x003D) { # = |
| 1668 |
|
|
| 1669 |
|
## XML5: Not a parse error. |
| 1670 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value'); |
| 1671 |
|
} elsif ($self->{is_xml}) { |
| 1672 |
|
|
| 1673 |
|
## XML5: No parse error. |
| 1674 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO |
| 1675 |
} else { |
} else { |
| 1676 |
|
|
| 1677 |
} |
} |
| 1691 |
redo A; |
redo A; |
| 1692 |
} |
} |
| 1693 |
} elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) { |
} elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) { |
| 1694 |
|
## XML5: "Tag attribute value double quoted state". |
| 1695 |
|
|
| 1696 |
if ($self->{nc} == 0x0022) { # " |
if ($self->{nc} == 0x0022) { # " |
| 1697 |
|
|
| 1698 |
|
## XML5: "Tag attribute name before state". |
| 1699 |
$self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; |
$self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; |
| 1700 |
|
|
| 1701 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 1711 |
redo A; |
redo A; |
| 1712 |
} elsif ($self->{nc} == 0x0026) { # & |
} elsif ($self->{nc} == 0x0026) { # & |
| 1713 |
|
|
| 1714 |
|
## XML5: Not defined yet. |
| 1715 |
|
|
| 1716 |
## NOTE: In the spec, the tokenizer is switched to the |
## NOTE: In the spec, the tokenizer is switched to the |
| 1717 |
## "entity in attribute value state". In this implementation, the |
## "entity in attribute value state". In this implementation, the |
| 1718 |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
| 1750 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 1751 |
} |
} |
| 1752 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1753 |
|
$self->{s_kwd} = ''; |
| 1754 |
## reconsume |
## reconsume |
| 1755 |
|
|
| 1756 |
return ($self->{ct}); # start tag or end tag |
return ($self->{ct}); # start tag or end tag |
| 1757 |
|
|
| 1758 |
redo A; |
redo A; |
| 1759 |
} else { |
} else { |
| 1760 |
|
if ($self->{is_xml} and $self->{nc} == 0x003C) { # < |
| 1761 |
|
|
| 1762 |
|
## XML5: Not a parse error. |
| 1763 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type |
| 1764 |
|
} else { |
| 1765 |
|
|
| 1766 |
|
} |
| 1767 |
$self->{ca}->{value} .= chr ($self->{nc}); |
$self->{ca}->{value} .= chr ($self->{nc}); |
| 1768 |
$self->{read_until}->($self->{ca}->{value}, |
$self->{read_until}->($self->{ca}->{value}, |
| 1769 |
q["&], |
q["&<], |
| 1770 |
length $self->{ca}->{value}); |
length $self->{ca}->{value}); |
| 1771 |
|
|
| 1772 |
## Stay in the state |
## Stay in the state |
| 1784 |
redo A; |
redo A; |
| 1785 |
} |
} |
| 1786 |
} elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) { |
} elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) { |
| 1787 |
|
## XML5: "Tag attribute value single quoted state". |
| 1788 |
|
|
| 1789 |
if ($self->{nc} == 0x0027) { # ' |
if ($self->{nc} == 0x0027) { # ' |
| 1790 |
|
|
| 1791 |
|
## XML5: "Before attribute name state" (sic). |
| 1792 |
$self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; |
$self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; |
| 1793 |
|
|
| 1794 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 1804 |
redo A; |
redo A; |
| 1805 |
} elsif ($self->{nc} == 0x0026) { # & |
} elsif ($self->{nc} == 0x0026) { # & |
| 1806 |
|
|
| 1807 |
|
## XML5: Not defined yet. |
| 1808 |
|
|
| 1809 |
## NOTE: In the spec, the tokenizer is switched to the |
## NOTE: In the spec, the tokenizer is switched to the |
| 1810 |
## "entity in attribute value state". In this implementation, the |
## "entity in attribute value state". In this implementation, the |
| 1811 |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
| 1843 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 1844 |
} |
} |
| 1845 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1846 |
|
$self->{s_kwd} = ''; |
| 1847 |
## reconsume |
## reconsume |
| 1848 |
|
|
| 1849 |
return ($self->{ct}); # start tag or end tag |
return ($self->{ct}); # start tag or end tag |
| 1850 |
|
|
| 1851 |
redo A; |
redo A; |
| 1852 |
} else { |
} else { |
| 1853 |
|
if ($self->{is_xml} and $self->{nc} == 0x003C) { # < |
| 1854 |
|
|
| 1855 |
|
## XML5: Not a parse error. |
| 1856 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type |
| 1857 |
|
} else { |
| 1858 |
|
|
| 1859 |
|
} |
| 1860 |
$self->{ca}->{value} .= chr ($self->{nc}); |
$self->{ca}->{value} .= chr ($self->{nc}); |
| 1861 |
$self->{read_until}->($self->{ca}->{value}, |
$self->{read_until}->($self->{ca}->{value}, |
| 1862 |
q['&], |
q['&<], |
| 1863 |
length $self->{ca}->{value}); |
length $self->{ca}->{value}); |
| 1864 |
|
|
| 1865 |
## Stay in the state |
## Stay in the state |
| 1877 |
redo A; |
redo A; |
| 1878 |
} |
} |
| 1879 |
} elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) { |
} elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) { |
| 1880 |
|
## XML5: "Tag attribute value unquoted state". |
| 1881 |
|
|
| 1882 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 1883 |
|
|
| 1884 |
|
## XML5: "Tag attribute name before state". |
| 1885 |
$self->{state} = BEFORE_ATTRIBUTE_NAME_STATE; |
$self->{state} = BEFORE_ATTRIBUTE_NAME_STATE; |
| 1886 |
|
|
| 1887 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 1897 |
redo A; |
redo A; |
| 1898 |
} elsif ($self->{nc} == 0x0026) { # & |
} elsif ($self->{nc} == 0x0026) { # & |
| 1899 |
|
|
| 1900 |
|
|
| 1901 |
|
## XML5: Not defined yet. |
| 1902 |
|
|
| 1903 |
## NOTE: In the spec, the tokenizer is switched to the |
## NOTE: In the spec, the tokenizer is switched to the |
| 1904 |
## "entity in attribute value state". In this implementation, the |
## "entity in attribute value state". In this implementation, the |
| 1905 |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
| 1936 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 1937 |
} |
} |
| 1938 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1939 |
|
$self->{s_kwd} = ''; |
| 1940 |
|
|
| 1941 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 1942 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 1970 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 1971 |
} |
} |
| 1972 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1973 |
|
$self->{s_kwd} = ''; |
| 1974 |
## reconsume |
## reconsume |
| 1975 |
|
|
| 1976 |
return ($self->{ct}); # start tag or end tag |
return ($self->{ct}); # start tag or end tag |
| 1983 |
0x003D => 1, # = |
0x003D => 1, # = |
| 1984 |
}->{$self->{nc}}) { |
}->{$self->{nc}}) { |
| 1985 |
|
|
| 1986 |
|
## XML5: Not a parse error. |
| 1987 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value'); |
| 1988 |
} else { |
} else { |
| 1989 |
|
|
| 2040 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 2041 |
} |
} |
| 2042 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2043 |
|
$self->{s_kwd} = ''; |
| 2044 |
|
|
| 2045 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2046 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2088 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 2089 |
} |
} |
| 2090 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2091 |
|
$self->{s_kwd} = ''; |
| 2092 |
## Reconsume. |
## Reconsume. |
| 2093 |
return ($self->{ct}); # start tag or end tag |
return ($self->{ct}); # start tag or end tag |
| 2094 |
redo A; |
redo A; |
| 2100 |
redo A; |
redo A; |
| 2101 |
} |
} |
| 2102 |
} elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) { |
} elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) { |
| 2103 |
|
## XML5: "Empty tag state". |
| 2104 |
|
|
| 2105 |
if ($self->{nc} == 0x003E) { # > |
if ($self->{nc} == 0x003E) { # > |
| 2106 |
if ($self->{ct}->{type} == END_TAG_TOKEN) { |
if ($self->{ct}->{type} == END_TAG_TOKEN) { |
| 2107 |
|
|
| 2121 |
} |
} |
| 2122 |
|
|
| 2123 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2124 |
|
$self->{s_kwd} = ''; |
| 2125 |
|
|
| 2126 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2127 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2153 |
} else { |
} else { |
| 2154 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 2155 |
} |
} |
| 2156 |
|
## XML5: "Tag attribute name before state". |
| 2157 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2158 |
|
$self->{s_kwd} = ''; |
| 2159 |
## Reconsume. |
## Reconsume. |
| 2160 |
return ($self->{ct}); # start tag or end tag |
return ($self->{ct}); # start tag or end tag |
| 2161 |
redo A; |
redo A; |
| 2176 |
if ($self->{nc} == 0x003E) { # > |
if ($self->{nc} == 0x003E) { # > |
| 2177 |
|
|
| 2178 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2179 |
|
$self->{s_kwd} = ''; |
| 2180 |
|
|
| 2181 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2182 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2194 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
| 2195 |
|
|
| 2196 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2197 |
|
$self->{s_kwd} = ''; |
| 2198 |
## reconsume |
## reconsume |
| 2199 |
|
|
| 2200 |
return ($self->{ct}); # comment |
return ($self->{ct}); # comment |
| 2296 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2297 |
column => $self->{column_prev} - 2, |
column => $self->{column_prev} - 2, |
| 2298 |
}; |
}; |
| 2299 |
$self->{state} = COMMENT_START_STATE; |
$self->{state} = COMMENT_START_STATE; ## XML5: "comment state". |
| 2300 |
|
|
| 2301 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2302 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2359 |
} elsif ((length $self->{s_kwd}) == 6 and |
} elsif ((length $self->{s_kwd}) == 6 and |
| 2360 |
($self->{nc} == 0x0045 or # E |
($self->{nc} == 0x0045 or # E |
| 2361 |
$self->{nc} == 0x0065)) { # e |
$self->{nc} == 0x0065)) { # e |
| 2362 |
|
if ($self->{s_kwd} ne 'DOCTYP') { |
| 2363 |
|
|
| 2364 |
|
## XML5: case-sensitive. |
| 2365 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO |
| 2366 |
|
text => 'DOCTYPE', |
| 2367 |
|
line => $self->{line_prev}, |
| 2368 |
|
column => $self->{column_prev} - 5); |
| 2369 |
|
} else { |
| 2370 |
|
|
| 2371 |
|
} |
| 2372 |
$self->{state} = DOCTYPE_STATE; |
$self->{state} = DOCTYPE_STATE; |
| 2373 |
$self->{ct} = {type => DOCTYPE_TOKEN, |
$self->{ct} = {type => DOCTYPE_TOKEN, |
| 2374 |
quirks => 1, |
quirks => 1, |
| 2426 |
redo A; |
redo A; |
| 2427 |
} elsif ($self->{s_kwd} eq '[CDATA' and |
} elsif ($self->{s_kwd} eq '[CDATA' and |
| 2428 |
$self->{nc} == 0x005B) { # [ |
$self->{nc} == 0x005B) { # [ |
| 2429 |
|
if ($self->{is_xml} and |
| 2430 |
|
not $self->{tainted} and |
| 2431 |
|
@{$self->{open_elements} or []} == 0) { |
| 2432 |
|
|
| 2433 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element', |
| 2434 |
|
line => $self->{line_prev}, |
| 2435 |
|
column => $self->{column_prev} - 7); |
| 2436 |
|
$self->{tainted} = 1; |
| 2437 |
|
} else { |
| 2438 |
|
|
| 2439 |
|
} |
| 2440 |
|
|
| 2441 |
$self->{ct} = {type => CHARACTER_TOKEN, |
$self->{ct} = {type => CHARACTER_TOKEN, |
| 2442 |
data => '', |
data => '', |
| 2443 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2489 |
|
|
| 2490 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment'); |
| 2491 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2492 |
|
$self->{s_kwd} = ''; |
| 2493 |
|
|
| 2494 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2495 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2509 |
|
|
| 2510 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
| 2511 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2512 |
|
$self->{s_kwd} = ''; |
| 2513 |
## reconsume |
## reconsume |
| 2514 |
|
|
| 2515 |
return ($self->{ct}); # comment |
return ($self->{ct}); # comment |
| 2553 |
|
|
| 2554 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment'); |
| 2555 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2556 |
|
$self->{s_kwd} = ''; |
| 2557 |
|
|
| 2558 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2559 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2573 |
|
|
| 2574 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
| 2575 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2576 |
|
$self->{s_kwd} = ''; |
| 2577 |
## reconsume |
## reconsume |
| 2578 |
|
|
| 2579 |
return ($self->{ct}); # comment |
return ($self->{ct}); # comment |
| 2617 |
|
|
| 2618 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
| 2619 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2620 |
|
$self->{s_kwd} = ''; |
| 2621 |
## reconsume |
## reconsume |
| 2622 |
|
|
| 2623 |
return ($self->{ct}); # comment |
return ($self->{ct}); # comment |
| 2645 |
redo A; |
redo A; |
| 2646 |
} |
} |
| 2647 |
} elsif ($self->{state} == COMMENT_END_DASH_STATE) { |
} elsif ($self->{state} == COMMENT_END_DASH_STATE) { |
| 2648 |
|
## XML5: "comment dash state". |
| 2649 |
|
|
| 2650 |
if ($self->{nc} == 0x002D) { # - |
if ($self->{nc} == 0x002D) { # - |
| 2651 |
|
|
| 2652 |
$self->{state} = COMMENT_END_STATE; |
$self->{state} = COMMENT_END_STATE; |
| 2665 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
| 2666 |
|
|
| 2667 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
| 2668 |
|
$self->{s_kwd} = ''; |
| 2669 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2670 |
|
$self->{s_kwd} = ''; |
| 2671 |
## reconsume |
## reconsume |
| 2672 |
|
|
| 2673 |
return ($self->{ct}); # comment |
return ($self->{ct}); # comment |
| 2694 |
if ($self->{nc} == 0x003E) { # > |
if ($self->{nc} == 0x003E) { # > |
| 2695 |
|
|
| 2696 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2697 |
|
$self->{s_kwd} = ''; |
| 2698 |
|
|
| 2699 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2700 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2712 |
redo A; |
redo A; |
| 2713 |
} elsif ($self->{nc} == 0x002D) { # - |
} elsif ($self->{nc} == 0x002D) { # - |
| 2714 |
|
|
| 2715 |
|
## XML5: Not a parse error. |
| 2716 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment', |
| 2717 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2718 |
column => $self->{column_prev}); |
column => $self->{column_prev}); |
| 2734 |
|
|
| 2735 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
| 2736 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2737 |
|
$self->{s_kwd} = ''; |
| 2738 |
## reconsume |
## reconsume |
| 2739 |
|
|
| 2740 |
return ($self->{ct}); # comment |
return ($self->{ct}); # comment |
| 2742 |
redo A; |
redo A; |
| 2743 |
} else { |
} else { |
| 2744 |
|
|
| 2745 |
|
## XML5: Not a parse error. |
| 2746 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment', |
| 2747 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2748 |
column => $self->{column_prev}); |
column => $self->{column_prev}); |
| 2804 |
|
|
| 2805 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name'); |
| 2806 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2807 |
|
$self->{s_kwd} = ''; |
| 2808 |
|
|
| 2809 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2810 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2824 |
|
|
| 2825 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name'); |
| 2826 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2827 |
|
$self->{s_kwd} = ''; |
| 2828 |
## reconsume |
## reconsume |
| 2829 |
|
|
| 2830 |
return ($self->{ct}); # DOCTYPE (quirks) |
return ($self->{ct}); # DOCTYPE (quirks) |
| 2868 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
| 2869 |
|
|
| 2870 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2871 |
|
$self->{s_kwd} = ''; |
| 2872 |
|
|
| 2873 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2874 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2888 |
|
|
| 2889 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); |
| 2890 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2891 |
|
$self->{s_kwd} = ''; |
| 2892 |
## reconsume |
## reconsume |
| 2893 |
|
|
| 2894 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
| 2932 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
| 2933 |
|
|
| 2934 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2935 |
|
$self->{s_kwd} = ''; |
| 2936 |
|
|
| 2937 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2938 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2952 |
|
|
| 2953 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); |
| 2954 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2955 |
|
$self->{s_kwd} = ''; |
| 2956 |
## reconsume |
## reconsume |
| 2957 |
|
|
| 2958 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
| 3181 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal'); |
| 3182 |
|
|
| 3183 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3184 |
|
$self->{s_kwd} = ''; |
| 3185 |
|
|
| 3186 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3187 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 3203 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); |
| 3204 |
|
|
| 3205 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3206 |
|
$self->{s_kwd} = ''; |
| 3207 |
## reconsume |
## reconsume |
| 3208 |
|
|
| 3209 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
| 3250 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal'); |
| 3251 |
|
|
| 3252 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3253 |
|
$self->{s_kwd} = ''; |
| 3254 |
|
|
| 3255 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3256 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 3272 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal'); |
| 3273 |
|
|
| 3274 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3275 |
|
$self->{s_kwd} = ''; |
| 3276 |
## reconsume |
## reconsume |
| 3277 |
|
|
| 3278 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
| 3321 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal'); |
| 3322 |
|
|
| 3323 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3324 |
|
$self->{s_kwd} = ''; |
| 3325 |
|
|
| 3326 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3327 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 3343 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal'); |
| 3344 |
|
|
| 3345 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3346 |
|
$self->{s_kwd} = ''; |
| 3347 |
## reconsume |
## reconsume |
| 3348 |
|
|
| 3349 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
| 3422 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
| 3423 |
|
|
| 3424 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3425 |
|
$self->{s_kwd} = ''; |
| 3426 |
|
|
| 3427 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3428 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 3443 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); |
| 3444 |
|
|
| 3445 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3446 |
|
$self->{s_kwd} = ''; |
| 3447 |
## reconsume |
## reconsume |
| 3448 |
|
|
| 3449 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
| 3521 |
|
|
| 3522 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal'); |
| 3523 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3524 |
|
$self->{s_kwd} = ''; |
| 3525 |
|
|
| 3526 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3527 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 3543 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); |
| 3544 |
|
|
| 3545 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3546 |
|
$self->{s_kwd} = ''; |
| 3547 |
## reconsume |
## reconsume |
| 3548 |
|
|
| 3549 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
| 3590 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); |
| 3591 |
|
|
| 3592 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3593 |
|
$self->{s_kwd} = ''; |
| 3594 |
|
|
| 3595 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3596 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 3612 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); |
| 3613 |
|
|
| 3614 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3615 |
|
$self->{s_kwd} = ''; |
| 3616 |
## reconsume |
## reconsume |
| 3617 |
|
|
| 3618 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
| 3661 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); |
| 3662 |
|
|
| 3663 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3664 |
|
$self->{s_kwd} = ''; |
| 3665 |
|
|
| 3666 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3667 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 3683 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); |
| 3684 |
|
|
| 3685 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3686 |
|
$self->{s_kwd} = ''; |
| 3687 |
## reconsume |
## reconsume |
| 3688 |
|
|
| 3689 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
| 3730 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
| 3731 |
|
|
| 3732 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3733 |
|
$self->{s_kwd} = ''; |
| 3734 |
|
|
| 3735 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3736 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 3750 |
|
|
| 3751 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); |
| 3752 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3753 |
|
$self->{s_kwd} = ''; |
| 3754 |
## reconsume |
## reconsume |
| 3755 |
|
|
| 3756 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
| 3780 |
if ($self->{nc} == 0x003E) { # > |
if ($self->{nc} == 0x003E) { # > |
| 3781 |
|
|
| 3782 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3783 |
|
$self->{s_kwd} = ''; |
| 3784 |
|
|
| 3785 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3786 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 3799 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
| 3800 |
|
|
| 3801 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3802 |
|
$self->{s_kwd} = ''; |
| 3803 |
## reconsume |
## reconsume |
| 3804 |
|
|
| 3805 |
return ($self->{ct}); # DOCTYPE |
return ($self->{ct}); # DOCTYPE |
| 3828 |
## NOTE: "CDATA section state" in the state is jointly implemented |
## NOTE: "CDATA section state" in the state is jointly implemented |
| 3829 |
## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|, |
## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|, |
| 3830 |
## and |CDATA_SECTION_MSE2_STATE|. |
## and |CDATA_SECTION_MSE2_STATE|. |
| 3831 |
|
|
| 3832 |
|
## XML5: "CDATA state". |
| 3833 |
|
|
| 3834 |
if ($self->{nc} == 0x005D) { # ] |
if ($self->{nc} == 0x005D) { # ] |
| 3835 |
|
|
| 3847 |
|
|
| 3848 |
redo A; |
redo A; |
| 3849 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
| 3850 |
|
if ($self->{is_xml}) { |
| 3851 |
|
|
| 3852 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type |
| 3853 |
|
} else { |
| 3854 |
|
|
| 3855 |
|
} |
| 3856 |
|
|
| 3857 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3858 |
|
$self->{s_kwd} = ''; |
| 3859 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
## Reconsume. |
|
$self->{line_prev} = $self->{line}; |
|
|
$self->{column_prev} = $self->{column}; |
|
|
$self->{column}++; |
|
|
$self->{nc} |
|
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
|
|
} else { |
|
|
$self->{set_nc}->($self); |
|
|
} |
|
|
|
|
| 3860 |
if (length $self->{ct}->{data}) { # character |
if (length $self->{ct}->{data}) { # character |
| 3861 |
|
|
| 3862 |
return ($self->{ct}); # character |
return ($self->{ct}); # character |
| 3889 |
|
|
| 3890 |
## ISSUE: "text tokens" in spec. |
## ISSUE: "text tokens" in spec. |
| 3891 |
} elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) { |
} elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) { |
| 3892 |
|
## XML5: "CDATA bracket state". |
| 3893 |
|
|
| 3894 |
if ($self->{nc} == 0x005D) { # ] |
if ($self->{nc} == 0x005D) { # ] |
| 3895 |
|
|
| 3896 |
$self->{state} = CDATA_SECTION_MSE2_STATE; |
$self->{state} = CDATA_SECTION_MSE2_STATE; |
| 3908 |
redo A; |
redo A; |
| 3909 |
} else { |
} else { |
| 3910 |
|
|
| 3911 |
|
## XML5: If EOF, "]" is not appended and changed to the data state. |
| 3912 |
$self->{ct}->{data} .= ']'; |
$self->{ct}->{data} .= ']'; |
| 3913 |
$self->{state} = CDATA_SECTION_STATE; |
$self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state. |
| 3914 |
## Reconsume. |
## Reconsume. |
| 3915 |
redo A; |
redo A; |
| 3916 |
} |
} |
| 3917 |
} elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) { |
} elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) { |
| 3918 |
|
## XML5: "CDATA end state". |
| 3919 |
|
|
| 3920 |
if ($self->{nc} == 0x003E) { # > |
if ($self->{nc} == 0x003E) { # > |
| 3921 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3922 |
|
$self->{s_kwd} = ''; |
| 3923 |
|
|
| 3924 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3925 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 3959 |
|
|
| 3960 |
$self->{ct}->{data} .= ']]'; # character |
$self->{ct}->{data} .= ']]'; # character |
| 3961 |
$self->{state} = CDATA_SECTION_STATE; |
$self->{state} = CDATA_SECTION_STATE; |
| 3962 |
## Reconsume. |
## Reconsume. ## XML5: Emit. |
| 3963 |
redo A; |
redo A; |
| 3964 |
} |
} |
| 3965 |
} elsif ($self->{state} == ENTITY_STATE) { |
} elsif ($self->{state} == ENTITY_STATE) { |
| 4027 |
if ($self->{prev_state} == DATA_STATE) { |
if ($self->{prev_state} == DATA_STATE) { |
| 4028 |
|
|
| 4029 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
| 4030 |
|
$self->{s_kwd} = ''; |
| 4031 |
## Reconsume. |
## Reconsume. |
| 4032 |
return ({type => CHARACTER_TOKEN, data => '&', |
return ({type => CHARACTER_TOKEN, data => '&', |
| 4033 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 4038 |
|
|
| 4039 |
$self->{ca}->{value} .= '&'; |
$self->{ca}->{value} .= '&'; |
| 4040 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
| 4041 |
|
$self->{s_kwd} = ''; |
| 4042 |
## Reconsume. |
## Reconsume. |
| 4043 |
redo A; |
redo A; |
| 4044 |
} |
} |
| 4089 |
if ($self->{prev_state} == DATA_STATE) { |
if ($self->{prev_state} == DATA_STATE) { |
| 4090 |
|
|
| 4091 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
| 4092 |
|
$self->{s_kwd} = ''; |
| 4093 |
## Reconsume. |
## Reconsume. |
| 4094 |
return ({type => CHARACTER_TOKEN, |
return ({type => CHARACTER_TOKEN, |
| 4095 |
data => '&#', |
data => '&#', |
| 4101 |
|
|
| 4102 |
$self->{ca}->{value} .= '&#'; |
$self->{ca}->{value} .= '&#'; |
| 4103 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
| 4104 |
|
$self->{s_kwd} = ''; |
| 4105 |
## Reconsume. |
## Reconsume. |
| 4106 |
redo A; |
redo A; |
| 4107 |
} |
} |
| 4167 |
if ($self->{prev_state} == DATA_STATE) { |
if ($self->{prev_state} == DATA_STATE) { |
| 4168 |
|
|
| 4169 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
| 4170 |
|
$self->{s_kwd} = ''; |
| 4171 |
## Reconsume. |
## Reconsume. |
| 4172 |
return ({type => CHARACTER_TOKEN, data => chr $code, |
return ({type => CHARACTER_TOKEN, data => chr $code, |
| 4173 |
|
has_reference => 1, |
| 4174 |
line => $l, column => $c, |
line => $l, column => $c, |
| 4175 |
}); |
}); |
| 4176 |
redo A; |
redo A; |
| 4179 |
$self->{ca}->{value} .= chr $code; |
$self->{ca}->{value} .= chr $code; |
| 4180 |
$self->{ca}->{has_reference} = 1; |
$self->{ca}->{has_reference} = 1; |
| 4181 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
| 4182 |
|
$self->{s_kwd} = ''; |
| 4183 |
## Reconsume. |
## Reconsume. |
| 4184 |
redo A; |
redo A; |
| 4185 |
} |
} |
| 4205 |
if ($self->{prev_state} == DATA_STATE) { |
if ($self->{prev_state} == DATA_STATE) { |
| 4206 |
|
|
| 4207 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
| 4208 |
|
$self->{s_kwd} = ''; |
| 4209 |
## Reconsume. |
## Reconsume. |
| 4210 |
return ({type => CHARACTER_TOKEN, |
return ({type => CHARACTER_TOKEN, |
| 4211 |
data => '&' . $self->{s_kwd}, |
data => '&' . $self->{s_kwd}, |
| 4217 |
|
|
| 4218 |
$self->{ca}->{value} .= '&' . $self->{s_kwd}; |
$self->{ca}->{value} .= '&' . $self->{s_kwd}; |
| 4219 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
| 4220 |
|
$self->{s_kwd} = ''; |
| 4221 |
## Reconsume. |
## Reconsume. |
| 4222 |
redo A; |
redo A; |
| 4223 |
} |
} |
| 4320 |
if ($self->{prev_state} == DATA_STATE) { |
if ($self->{prev_state} == DATA_STATE) { |
| 4321 |
|
|
| 4322 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
| 4323 |
|
$self->{s_kwd} = ''; |
| 4324 |
## Reconsume. |
## Reconsume. |
| 4325 |
return ({type => CHARACTER_TOKEN, data => chr $code, |
return ({type => CHARACTER_TOKEN, data => chr $code, |
| 4326 |
|
has_reference => 1, |
| 4327 |
line => $l, column => $c, |
line => $l, column => $c, |
| 4328 |
}); |
}); |
| 4329 |
redo A; |
redo A; |
| 4332 |
$self->{ca}->{value} .= chr $code; |
$self->{ca}->{value} .= chr $code; |
| 4333 |
$self->{ca}->{has_reference} = 1; |
$self->{ca}->{has_reference} = 1; |
| 4334 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
| 4335 |
|
$self->{s_kwd} = ''; |
| 4336 |
## Reconsume. |
## Reconsume. |
| 4337 |
redo A; |
redo A; |
| 4338 |
} |
} |
| 4445 |
if ($self->{prev_state} == DATA_STATE) { |
if ($self->{prev_state} == DATA_STATE) { |
| 4446 |
|
|
| 4447 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
| 4448 |
|
$self->{s_kwd} = ''; |
| 4449 |
## Reconsume. |
## Reconsume. |
| 4450 |
return ({type => CHARACTER_TOKEN, |
return ({type => CHARACTER_TOKEN, |
| 4451 |
data => $data, |
data => $data, |
| 4452 |
|
has_reference => $has_ref, |
| 4453 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 4454 |
column => $self->{column_prev} + 1 - length $self->{s_kwd}, |
column => $self->{column_prev} + 1 - length $self->{s_kwd}, |
| 4455 |
}); |
}); |
| 4459 |
$self->{ca}->{value} .= $data; |
$self->{ca}->{value} .= $data; |
| 4460 |
$self->{ca}->{has_reference} = 1 if $has_ref; |
$self->{ca}->{has_reference} = 1 if $has_ref; |
| 4461 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
| 4462 |
|
$self->{s_kwd} = ''; |
| 4463 |
|
## Reconsume. |
| 4464 |
|
redo A; |
| 4465 |
|
} |
| 4466 |
|
|
| 4467 |
|
## XML-only states |
| 4468 |
|
|
| 4469 |
|
} elsif ($self->{state} == PI_STATE) { |
| 4470 |
|
if ($is_space->{$self->{nc}} or |
| 4471 |
|
$self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else" |
| 4472 |
|
$self->{nc} == -1) { |
| 4473 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type |
| 4474 |
|
line => $self->{line_prev}, |
| 4475 |
|
column => $self->{column_prev} |
| 4476 |
|
- 1 * ($self->{nc} != -1)); |
| 4477 |
|
$self->{state} = BOGUS_COMMENT_STATE; |
| 4478 |
|
## Reconsume. |
| 4479 |
|
$self->{ct} = {type => COMMENT_TOKEN, |
| 4480 |
|
data => '?', |
| 4481 |
|
line => $self->{line_prev}, |
| 4482 |
|
column => $self->{column_prev} |
| 4483 |
|
- 1 * ($self->{nc} != -1), |
| 4484 |
|
}; |
| 4485 |
|
redo A; |
| 4486 |
|
} else { |
| 4487 |
|
$self->{ct} = {type => PI_TOKEN, |
| 4488 |
|
target => chr $self->{nc}, |
| 4489 |
|
data => '', |
| 4490 |
|
line => $self->{line_prev}, |
| 4491 |
|
column => $self->{column_prev} - 1, |
| 4492 |
|
}; |
| 4493 |
|
$self->{state} = PI_TARGET_STATE; |
| 4494 |
|
|
| 4495 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4496 |
|
$self->{line_prev} = $self->{line}; |
| 4497 |
|
$self->{column_prev} = $self->{column}; |
| 4498 |
|
$self->{column}++; |
| 4499 |
|
$self->{nc} |
| 4500 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4501 |
|
} else { |
| 4502 |
|
$self->{set_nc}->($self); |
| 4503 |
|
} |
| 4504 |
|
|
| 4505 |
|
redo A; |
| 4506 |
|
} |
| 4507 |
|
} elsif ($self->{state} == PI_TARGET_STATE) { |
| 4508 |
|
if ($is_space->{$self->{nc}}) { |
| 4509 |
|
$self->{state} = PI_TARGET_AFTER_STATE; |
| 4510 |
|
|
| 4511 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4512 |
|
$self->{line_prev} = $self->{line}; |
| 4513 |
|
$self->{column_prev} = $self->{column}; |
| 4514 |
|
$self->{column}++; |
| 4515 |
|
$self->{nc} |
| 4516 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4517 |
|
} else { |
| 4518 |
|
$self->{set_nc}->($self); |
| 4519 |
|
} |
| 4520 |
|
|
| 4521 |
|
redo A; |
| 4522 |
|
} elsif ($self->{nc} == -1) { |
| 4523 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type |
| 4524 |
|
$self->{state} = DATA_STATE; |
| 4525 |
|
$self->{s_kwd} = ''; |
| 4526 |
## Reconsume. |
## Reconsume. |
| 4527 |
|
return ($self->{ct}); # pi |
| 4528 |
|
redo A; |
| 4529 |
|
} elsif ($self->{nc} == 0x003F) { # ? |
| 4530 |
|
$self->{state} = PI_AFTER_STATE; |
| 4531 |
|
|
| 4532 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4533 |
|
$self->{line_prev} = $self->{line}; |
| 4534 |
|
$self->{column_prev} = $self->{column}; |
| 4535 |
|
$self->{column}++; |
| 4536 |
|
$self->{nc} |
| 4537 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4538 |
|
} else { |
| 4539 |
|
$self->{set_nc}->($self); |
| 4540 |
|
} |
| 4541 |
|
|
| 4542 |
|
redo A; |
| 4543 |
|
} else { |
| 4544 |
|
## XML5: typo ("tag name" -> "target") |
| 4545 |
|
$self->{ct}->{target} .= chr $self->{nc}; # pi |
| 4546 |
|
|
| 4547 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4548 |
|
$self->{line_prev} = $self->{line}; |
| 4549 |
|
$self->{column_prev} = $self->{column}; |
| 4550 |
|
$self->{column}++; |
| 4551 |
|
$self->{nc} |
| 4552 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4553 |
|
} else { |
| 4554 |
|
$self->{set_nc}->($self); |
| 4555 |
|
} |
| 4556 |
|
|
| 4557 |
redo A; |
redo A; |
| 4558 |
} |
} |
| 4559 |
|
} elsif ($self->{state} == PI_TARGET_AFTER_STATE) { |
| 4560 |
|
if ($is_space->{$self->{nc}}) { |
| 4561 |
|
## Stay in the state. |
| 4562 |
|
|
| 4563 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4564 |
|
$self->{line_prev} = $self->{line}; |
| 4565 |
|
$self->{column_prev} = $self->{column}; |
| 4566 |
|
$self->{column}++; |
| 4567 |
|
$self->{nc} |
| 4568 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4569 |
|
} else { |
| 4570 |
|
$self->{set_nc}->($self); |
| 4571 |
|
} |
| 4572 |
|
|
| 4573 |
|
redo A; |
| 4574 |
|
} else { |
| 4575 |
|
$self->{state} = PI_DATA_STATE; |
| 4576 |
|
## Reprocess. |
| 4577 |
|
redo A; |
| 4578 |
|
} |
| 4579 |
|
} elsif ($self->{state} == PI_DATA_STATE) { |
| 4580 |
|
if ($self->{nc} == 0x003F) { # ? |
| 4581 |
|
$self->{state} = PI_DATA_AFTER_STATE; |
| 4582 |
|
|
| 4583 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4584 |
|
$self->{line_prev} = $self->{line}; |
| 4585 |
|
$self->{column_prev} = $self->{column}; |
| 4586 |
|
$self->{column}++; |
| 4587 |
|
$self->{nc} |
| 4588 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4589 |
|
} else { |
| 4590 |
|
$self->{set_nc}->($self); |
| 4591 |
|
} |
| 4592 |
|
|
| 4593 |
|
redo A; |
| 4594 |
|
} elsif ($self->{nc} == -1) { |
| 4595 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type |
| 4596 |
|
$self->{state} = DATA_STATE; |
| 4597 |
|
$self->{s_kwd} = ''; |
| 4598 |
|
## Reprocess. |
| 4599 |
|
return ($self->{ct}); # pi |
| 4600 |
|
redo A; |
| 4601 |
|
} else { |
| 4602 |
|
$self->{ct}->{data} .= chr $self->{nc}; # pi |
| 4603 |
|
$self->{read_until}->($self->{ct}->{data}, q[?], |
| 4604 |
|
length $self->{ct}->{data}); |
| 4605 |
|
## Stay in the state. |
| 4606 |
|
|
| 4607 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4608 |
|
$self->{line_prev} = $self->{line}; |
| 4609 |
|
$self->{column_prev} = $self->{column}; |
| 4610 |
|
$self->{column}++; |
| 4611 |
|
$self->{nc} |
| 4612 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4613 |
|
} else { |
| 4614 |
|
$self->{set_nc}->($self); |
| 4615 |
|
} |
| 4616 |
|
|
| 4617 |
|
## Reprocess. |
| 4618 |
|
redo A; |
| 4619 |
|
} |
| 4620 |
|
} elsif ($self->{state} == PI_AFTER_STATE) { |
| 4621 |
|
if ($self->{nc} == 0x003E) { # > |
| 4622 |
|
$self->{state} = DATA_STATE; |
| 4623 |
|
$self->{s_kwd} = ''; |
| 4624 |
|
|
| 4625 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4626 |
|
$self->{line_prev} = $self->{line}; |
| 4627 |
|
$self->{column_prev} = $self->{column}; |
| 4628 |
|
$self->{column}++; |
| 4629 |
|
$self->{nc} |
| 4630 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4631 |
|
} else { |
| 4632 |
|
$self->{set_nc}->($self); |
| 4633 |
|
} |
| 4634 |
|
|
| 4635 |
|
return ($self->{ct}); # pi |
| 4636 |
|
redo A; |
| 4637 |
|
} elsif ($self->{nc} == 0x003F) { # ? |
| 4638 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type |
| 4639 |
|
line => $self->{line_prev}, |
| 4640 |
|
column => $self->{column_prev}); ## XML5: no error |
| 4641 |
|
$self->{ct}->{data} .= '?'; |
| 4642 |
|
$self->{state} = PI_DATA_AFTER_STATE; |
| 4643 |
|
|
| 4644 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4645 |
|
$self->{line_prev} = $self->{line}; |
| 4646 |
|
$self->{column_prev} = $self->{column}; |
| 4647 |
|
$self->{column}++; |
| 4648 |
|
$self->{nc} |
| 4649 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4650 |
|
} else { |
| 4651 |
|
$self->{set_nc}->($self); |
| 4652 |
|
} |
| 4653 |
|
|
| 4654 |
|
redo A; |
| 4655 |
|
} else { |
| 4656 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type |
| 4657 |
|
line => $self->{line_prev}, |
| 4658 |
|
column => $self->{column_prev} |
| 4659 |
|
+ 1 * ($self->{nc} == -1)); ## XML5: no error |
| 4660 |
|
$self->{ct}->{data} .= '?'; ## XML5: not appended |
| 4661 |
|
$self->{state} = PI_DATA_STATE; |
| 4662 |
|
## Reprocess. |
| 4663 |
|
redo A; |
| 4664 |
|
} |
| 4665 |
|
} elsif ($self->{state} == PI_DATA_AFTER_STATE) { |
| 4666 |
|
## XML5: Same as "pi after state" in XML5 |
| 4667 |
|
if ($self->{nc} == 0x003E) { # > |
| 4668 |
|
$self->{state} = DATA_STATE; |
| 4669 |
|
$self->{s_kwd} = ''; |
| 4670 |
|
|
| 4671 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4672 |
|
$self->{line_prev} = $self->{line}; |
| 4673 |
|
$self->{column_prev} = $self->{column}; |
| 4674 |
|
$self->{column}++; |
| 4675 |
|
$self->{nc} |
| 4676 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4677 |
|
} else { |
| 4678 |
|
$self->{set_nc}->($self); |
| 4679 |
|
} |
| 4680 |
|
|
| 4681 |
|
return ($self->{ct}); # pi |
| 4682 |
|
redo A; |
| 4683 |
|
} elsif ($self->{nc} == 0x003F) { # ? |
| 4684 |
|
$self->{ct}->{data} .= '?'; |
| 4685 |
|
## Stay in the state. |
| 4686 |
|
|
| 4687 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4688 |
|
$self->{line_prev} = $self->{line}; |
| 4689 |
|
$self->{column_prev} = $self->{column}; |
| 4690 |
|
$self->{column}++; |
| 4691 |
|
$self->{nc} |
| 4692 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4693 |
|
} else { |
| 4694 |
|
$self->{set_nc}->($self); |
| 4695 |
|
} |
| 4696 |
|
|
| 4697 |
|
redo A; |
| 4698 |
|
} else { |
| 4699 |
|
$self->{ct}->{data} .= '?'; ## XML5: not appended |
| 4700 |
|
$self->{state} = PI_DATA_STATE; |
| 4701 |
|
## Reprocess. |
| 4702 |
|
redo A; |
| 4703 |
|
} |
| 4704 |
|
|
| 4705 |
} else { |
} else { |
| 4706 |
die "$0: $self->{state}: Unknown state"; |
die "$0: $self->{state}: Unknown state"; |
| 4707 |
} |
} |