| 31 |
); |
); |
| 32 |
} |
} |
| 33 |
|
|
| 34 |
|
## NOTE: Differences from the XML5 draft are marked as "XML5:". |
| 35 |
|
|
| 36 |
## Token types |
## Token types |
| 37 |
|
|
| 38 |
sub DOCTYPE_TOKEN () { 1 } |
sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token. |
| 39 |
sub COMMENT_TOKEN () { 2 } |
sub COMMENT_TOKEN () { 2 } |
| 40 |
sub START_TAG_TOKEN () { 3 } |
sub START_TAG_TOKEN () { 3 } |
| 41 |
sub END_TAG_TOKEN () { 4 } |
sub END_TAG_TOKEN () { 4 } |
| 42 |
sub END_OF_FILE_TOKEN () { 5 } |
sub END_OF_FILE_TOKEN () { 5 } |
| 43 |
sub CHARACTER_TOKEN () { 6 } |
sub CHARACTER_TOKEN () { 6 } |
| 44 |
sub PI_TOKEN () { 7 } # XML5 |
sub PI_TOKEN () { 7 } ## NOTE: XML only. |
| 45 |
sub ABORT_TOKEN () { 8 } # Not a token actually |
sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing. |
| 46 |
|
|
| 47 |
|
## XML5: XML5 has "empty tag token". In this implementation, it is |
| 48 |
|
## represented as a start tag token with $self->{self_closing} flag |
| 49 |
|
## set to true. |
| 50 |
|
|
| 51 |
|
## XML5: XML5 has "short end tag token". In this implementation, it |
| 52 |
|
## is represented as an end tag token with $token->{tag_name} flag set |
| 53 |
|
## to an empty string. |
| 54 |
|
|
| 55 |
package Whatpm::HTML; |
package Whatpm::HTML; |
| 56 |
|
|
| 124 |
sub ENTITY_NAME_STATE () { 49 } |
sub ENTITY_NAME_STATE () { 49 } |
| 125 |
sub PCDATA_STATE () { 50 } # "data state" in the spec |
sub PCDATA_STATE () { 50 } # "data state" in the spec |
| 126 |
|
|
| 127 |
|
## XML-only states |
| 128 |
|
sub PI_STATE () { 51 } |
| 129 |
|
sub PI_TARGET_STATE () { 52 } |
| 130 |
|
sub PI_TARGET_AFTER_STATE () { 53 } |
| 131 |
|
sub PI_DATA_STATE () { 54 } |
| 132 |
|
sub PI_AFTER_STATE () { 55 } |
| 133 |
|
sub PI_DATA_AFTER_STATE () { 56 } |
| 134 |
|
sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 } |
| 135 |
|
sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 } |
| 136 |
|
|
| 137 |
## Tree constructor state constants (see Whatpm::HTML for the full |
## Tree constructor state constants (see Whatpm::HTML for the full |
| 138 |
## list and descriptions) |
## list and descriptions) |
| 139 |
|
|
| 198 |
#$self->{is_xml} (if XML) |
#$self->{is_xml} (if XML) |
| 199 |
|
|
| 200 |
$self->{state} = DATA_STATE; # MUST |
$self->{state} = DATA_STATE; # MUST |
| 201 |
$self->{s_kwd} = ''; # state keyword |
$self->{s_kwd} = ''; # Data state keyword |
| 202 |
|
#$self->{kwd} = ''; # State-dependent keyword; initialized when used |
| 203 |
#$self->{entity__value}; # initialized when used |
#$self->{entity__value}; # initialized when used |
| 204 |
#$self->{entity__match}; # initialized when used |
#$self->{entity__match}; # initialized when used |
| 205 |
$self->{content_model} = PCDATA_CONTENT_MODEL; # be |
$self->{content_model} = PCDATA_CONTENT_MODEL; # be |
| 229 |
|
|
| 230 |
## A token has: |
## A token has: |
| 231 |
## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN, |
## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN, |
| 232 |
## CHARACTER_TOKEN, or END_OF_FILE_TOKEN |
## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN |
| 233 |
## ->{name} (DOCTYPE_TOKEN) |
## ->{name} (DOCTYPE_TOKEN) |
| 234 |
## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN) |
## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN) |
| 235 |
|
## ->{target} (PI_TOKEN) |
| 236 |
## ->{pubid} (DOCTYPE_TOKEN) |
## ->{pubid} (DOCTYPE_TOKEN) |
| 237 |
## ->{sysid} (DOCTYPE_TOKEN) |
## ->{sysid} (DOCTYPE_TOKEN) |
| 238 |
## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag |
## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag |
| 240 |
## ->{name} |
## ->{name} |
| 241 |
## ->{value} |
## ->{value} |
| 242 |
## ->{has_reference} == 1 or 0 |
## ->{has_reference} == 1 or 0 |
| 243 |
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN) |
## ->{index}: Index of the attribute in a tag. |
| 244 |
|
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN) |
| 245 |
|
## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN) |
| 246 |
|
## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1. |
| 247 |
|
## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN) |
| 248 |
|
|
| 249 |
## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|. |
## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|. |
| 250 |
## |->{self_closing}| is used to save the value of |$self->{self_closing}| |
## |->{self_closing}| is used to save the value of |$self->{self_closing}| |
| 251 |
## while the token is pushed back to the stack. |
## while the token is pushed back to the stack. |
| 265 |
0x0009 => 1, # CHARACTER TABULATION (HT) |
0x0009 => 1, # CHARACTER TABULATION (HT) |
| 266 |
0x000A => 1, # LINE FEED (LF) |
0x000A => 1, # LINE FEED (LF) |
| 267 |
#0x000B => 0, # LINE TABULATION (VT) |
#0x000B => 0, # LINE TABULATION (VT) |
| 268 |
0x000C => 1, # FORM FEED (FF) |
0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character. |
| 269 |
#0x000D => 1, # CARRIAGE RETURN (CR) |
#0x000D => 1, # CARRIAGE RETURN (CR) |
| 270 |
0x0020 => 1, # SPACE (SP) |
0x0020 => 1, # SPACE (SP) |
| 271 |
}; |
}; |
| 525 |
return ($token); |
return ($token); |
| 526 |
redo A; |
redo A; |
| 527 |
} elsif ($self->{state} == TAG_OPEN_STATE) { |
} elsif ($self->{state} == TAG_OPEN_STATE) { |
| 528 |
|
## XML5: "tag state". |
| 529 |
|
|
| 530 |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
| 531 |
if ($self->{nc} == 0x002F) { # / |
if ($self->{nc} == 0x002F) { # / |
| 532 |
|
|
| 545 |
redo A; |
redo A; |
| 546 |
} elsif ($self->{nc} == 0x0021) { # ! |
} elsif ($self->{nc} == 0x0021) { # ! |
| 547 |
|
|
| 548 |
$self->{s_kwd} = '<' unless $self->{escape}; |
$self->{s_kwd} = $self->{escaped} ? '' : '<'; |
| 549 |
# |
# |
| 550 |
} else { |
} else { |
| 551 |
|
|
| 552 |
|
$self->{s_kwd} = ''; |
| 553 |
# |
# |
| 554 |
} |
} |
| 555 |
|
|
| 556 |
## reconsume |
## reconsume |
| 557 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
|
$self->{s_kwd} = ''; |
|
| 558 |
return ({type => CHARACTER_TOKEN, data => '<', |
return ({type => CHARACTER_TOKEN, data => '<', |
| 559 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 560 |
column => $self->{column_prev}, |
column => $self->{column_prev}, |
| 658 |
|
|
| 659 |
redo A; |
redo A; |
| 660 |
} elsif ($self->{nc} == 0x003F) { # ? |
} elsif ($self->{nc} == 0x003F) { # ? |
| 661 |
|
if ($self->{is_xml}) { |
| 662 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'pio', |
|
| 663 |
line => $self->{line_prev}, |
$self->{state} = PI_STATE; |
| 664 |
column => $self->{column_prev}); |
|
| 665 |
$self->{state} = BOGUS_COMMENT_STATE; |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 666 |
$self->{ct} = {type => COMMENT_TOKEN, data => '', |
$self->{line_prev} = $self->{line}; |
| 667 |
line => $self->{line_prev}, |
$self->{column_prev} = $self->{column}; |
| 668 |
column => $self->{column_prev}, |
$self->{column}++; |
| 669 |
}; |
$self->{nc} |
| 670 |
## $self->{nc} is intentionally left as is |
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 671 |
redo A; |
} else { |
| 672 |
} else { |
$self->{set_nc}->($self); |
| 673 |
|
} |
| 674 |
|
|
| 675 |
|
redo A; |
| 676 |
|
} else { |
| 677 |
|
|
| 678 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'pio', |
| 679 |
|
line => $self->{line_prev}, |
| 680 |
|
column => $self->{column_prev}); |
| 681 |
|
$self->{state} = BOGUS_COMMENT_STATE; |
| 682 |
|
$self->{ct} = {type => COMMENT_TOKEN, data => '', |
| 683 |
|
line => $self->{line_prev}, |
| 684 |
|
column => $self->{column_prev}, |
| 685 |
|
}; |
| 686 |
|
## $self->{nc} is intentionally left as is |
| 687 |
|
redo A; |
| 688 |
|
} |
| 689 |
|
} elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) { |
| 690 |
|
|
| 691 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', |
| 692 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 701 |
}); |
}); |
| 702 |
|
|
| 703 |
redo A; |
redo A; |
| 704 |
|
} else { |
| 705 |
|
## XML5: "<:" is a parse error. |
| 706 |
|
|
| 707 |
|
$self->{ct} = {type => START_TAG_TOKEN, |
| 708 |
|
tag_name => chr ($self->{nc}), |
| 709 |
|
line => $self->{line_prev}, |
| 710 |
|
column => $self->{column_prev}}; |
| 711 |
|
$self->{state} = TAG_NAME_STATE; |
| 712 |
|
|
| 713 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 714 |
|
$self->{line_prev} = $self->{line}; |
| 715 |
|
$self->{column_prev} = $self->{column}; |
| 716 |
|
$self->{column}++; |
| 717 |
|
$self->{nc} |
| 718 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 719 |
|
} else { |
| 720 |
|
$self->{set_nc}->($self); |
| 721 |
|
} |
| 722 |
|
|
| 723 |
|
redo A; |
| 724 |
} |
} |
| 725 |
} else { |
} else { |
| 726 |
die "$0: $self->{content_model} in tag open"; |
die "$0: $self->{content_model} in tag open"; |
| 729 |
## NOTE: The "close tag open state" in the spec is implemented as |
## NOTE: The "close tag open state" in the spec is implemented as |
| 730 |
## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|. |
## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|. |
| 731 |
|
|
| 732 |
|
## XML5: "end tag state". |
| 733 |
|
|
| 734 |
my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</" |
my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</" |
| 735 |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
| 736 |
if (defined $self->{last_stag_name}) { |
if (defined $self->{last_stag_name}) { |
| 737 |
$self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE; |
$self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE; |
| 738 |
$self->{s_kwd} = ''; |
$self->{kwd} = ''; |
| 739 |
## Reconsume. |
## Reconsume. |
| 740 |
redo A; |
redo A; |
| 741 |
} else { |
} else { |
| 792 |
|
|
| 793 |
redo A; |
redo A; |
| 794 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
|
|
|
| 795 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag', |
| 796 |
line => $self->{line_prev}, ## "<" in "</>" |
line => $self->{line_prev}, ## "<" in "</>" |
| 797 |
column => $self->{column_prev} - 1); |
column => $self->{column_prev} - 1); |
| 798 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 799 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 800 |
|
if ($self->{is_xml}) { |
| 801 |
|
|
| 802 |
|
## XML5: No parse error. |
| 803 |
|
|
| 804 |
|
## NOTE: This parser raises a parse error, since it supports |
| 805 |
|
## XML1, not XML5. |
| 806 |
|
|
| 807 |
|
## NOTE: A short end tag token. |
| 808 |
|
my $ct = {type => END_TAG_TOKEN, |
| 809 |
|
tag_name => '', |
| 810 |
|
line => $self->{line_prev}, |
| 811 |
|
column => $self->{column_prev} - 1, |
| 812 |
|
}; |
| 813 |
|
|
| 814 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 815 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 816 |
$self->{column_prev} = $self->{column}; |
$self->{column_prev} = $self->{column}; |
| 821 |
$self->{set_nc}->($self); |
$self->{set_nc}->($self); |
| 822 |
} |
} |
| 823 |
|
|
| 824 |
|
return ($ct); |
| 825 |
|
} else { |
| 826 |
|
|
| 827 |
|
|
| 828 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 829 |
|
$self->{line_prev} = $self->{line}; |
| 830 |
|
$self->{column_prev} = $self->{column}; |
| 831 |
|
$self->{column}++; |
| 832 |
|
$self->{nc} |
| 833 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 834 |
|
} else { |
| 835 |
|
$self->{set_nc}->($self); |
| 836 |
|
} |
| 837 |
|
|
| 838 |
|
} |
| 839 |
redo A; |
redo A; |
| 840 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
| 841 |
|
|
| 849 |
}); |
}); |
| 850 |
|
|
| 851 |
redo A; |
redo A; |
| 852 |
} else { |
} elsif (not $self->{is_xml} or |
| 853 |
|
$is_space->{$self->{nc}}) { |
| 854 |
|
|
| 855 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag', |
| 856 |
|
line => $self->{line_prev}, # "<" of "</" |
| 857 |
|
column => $self->{column_prev} - 1); |
| 858 |
$self->{state} = BOGUS_COMMENT_STATE; |
$self->{state} = BOGUS_COMMENT_STATE; |
| 859 |
$self->{ct} = {type => COMMENT_TOKEN, data => '', |
$self->{ct} = {type => COMMENT_TOKEN, data => '', |
| 860 |
line => $self->{line_prev}, # "<" of "</" |
line => $self->{line_prev}, # "<" of "</" |
| 867 |
## generated from the bogus end tag, as defined in the |
## generated from the bogus end tag, as defined in the |
| 868 |
## "bogus comment state" entry. |
## "bogus comment state" entry. |
| 869 |
redo A; |
redo A; |
| 870 |
|
} else { |
| 871 |
|
## XML5: "</:" is a parse error. |
| 872 |
|
|
| 873 |
|
$self->{ct} = {type => END_TAG_TOKEN, |
| 874 |
|
tag_name => chr ($self->{nc}), |
| 875 |
|
line => $l, column => $c}; |
| 876 |
|
$self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state". |
| 877 |
|
|
| 878 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 879 |
|
$self->{line_prev} = $self->{line}; |
| 880 |
|
$self->{column_prev} = $self->{column}; |
| 881 |
|
$self->{column}++; |
| 882 |
|
$self->{nc} |
| 883 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 884 |
|
} else { |
| 885 |
|
$self->{set_nc}->($self); |
| 886 |
|
} |
| 887 |
|
|
| 888 |
|
redo A; |
| 889 |
} |
} |
| 890 |
} elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) { |
} elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) { |
| 891 |
my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1; |
my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1; |
| 892 |
if (length $ch) { |
if (length $ch) { |
| 893 |
my $CH = $ch; |
my $CH = $ch; |
| 894 |
$ch =~ tr/a-z/A-Z/; |
$ch =~ tr/a-z/A-Z/; |
| 896 |
if ($nch eq $ch or $nch eq $CH) { |
if ($nch eq $ch or $nch eq $CH) { |
| 897 |
|
|
| 898 |
## Stay in the state. |
## Stay in the state. |
| 899 |
$self->{s_kwd} .= $nch; |
$self->{kwd} .= $nch; |
| 900 |
|
|
| 901 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 902 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 915 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 916 |
## Reconsume. |
## Reconsume. |
| 917 |
return ({type => CHARACTER_TOKEN, |
return ({type => CHARACTER_TOKEN, |
| 918 |
data => '</' . $self->{s_kwd}, |
data => '</' . $self->{kwd}, |
| 919 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 920 |
column => $self->{column_prev} - 1 - length $self->{s_kwd}, |
column => $self->{column_prev} - 1 - length $self->{kwd}, |
| 921 |
}); |
}); |
| 922 |
redo A; |
redo A; |
| 923 |
} |
} |
| 933 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 934 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 935 |
return ({type => CHARACTER_TOKEN, |
return ({type => CHARACTER_TOKEN, |
| 936 |
data => '</' . $self->{s_kwd}, |
data => '</' . $self->{kwd}, |
| 937 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 938 |
column => $self->{column_prev} - 1 - length $self->{s_kwd}, |
column => $self->{column_prev} - 1 - length $self->{kwd}, |
| 939 |
}); |
}); |
| 940 |
redo A; |
redo A; |
| 941 |
} else { |
} else { |
| 944 |
= {type => END_TAG_TOKEN, |
= {type => END_TAG_TOKEN, |
| 945 |
tag_name => $self->{last_stag_name}, |
tag_name => $self->{last_stag_name}, |
| 946 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 947 |
column => $self->{column_prev} - 1 - length $self->{s_kwd}}; |
column => $self->{column_prev} - 1 - length $self->{kwd}}; |
| 948 |
$self->{state} = TAG_NAME_STATE; |
$self->{state} = TAG_NAME_STATE; |
| 949 |
## Reconsume. |
## Reconsume. |
| 950 |
redo A; |
redo A; |
| 1076 |
redo A; |
redo A; |
| 1077 |
} |
} |
| 1078 |
} elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) { |
} elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) { |
| 1079 |
|
## XML5: "Tag attribute name before state". |
| 1080 |
|
|
| 1081 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 1082 |
|
|
| 1083 |
## Stay in the state |
## Stay in the state |
| 1190 |
0x003D => 1, # = |
0x003D => 1, # = |
| 1191 |
}->{$self->{nc}}) { |
}->{$self->{nc}}) { |
| 1192 |
|
|
| 1193 |
|
## XML5: Not a parse error. |
| 1194 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name'); |
| 1195 |
} else { |
} else { |
| 1196 |
|
|
| 1197 |
|
## XML5: ":" raises a parse error and is ignored. |
| 1198 |
} |
} |
| 1199 |
$self->{ca} |
$self->{ca} |
| 1200 |
= {name => chr ($self->{nc}), |
= {name => chr ($self->{nc}), |
| 1215 |
redo A; |
redo A; |
| 1216 |
} |
} |
| 1217 |
} elsif ($self->{state} == ATTRIBUTE_NAME_STATE) { |
} elsif ($self->{state} == ATTRIBUTE_NAME_STATE) { |
| 1218 |
|
## XML5: "Tag attribute name state". |
| 1219 |
|
|
| 1220 |
my $before_leave = sub { |
my $before_leave = sub { |
| 1221 |
if (exists $self->{ct}->{attributes} # start tag or end tag |
if (exists $self->{ct}->{attributes} # start tag or end tag |
| 1222 |
->{$self->{ca}->{name}}) { # MUST |
->{$self->{ca}->{name}}) { # MUST |
| 1227 |
|
|
| 1228 |
$self->{ct}->{attributes}->{$self->{ca}->{name}} |
$self->{ct}->{attributes}->{$self->{ca}->{name}} |
| 1229 |
= $self->{ca}; |
= $self->{ca}; |
| 1230 |
|
$self->{ca}->{index} = ++$self->{ct}->{last_index}; |
| 1231 |
} |
} |
| 1232 |
}; # $before_leave |
}; # $before_leave |
| 1233 |
|
|
| 1264 |
|
|
| 1265 |
redo A; |
redo A; |
| 1266 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
| 1267 |
|
if ($self->{is_xml}) { |
| 1268 |
|
|
| 1269 |
|
## XML5: Not a parse error. |
| 1270 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type |
| 1271 |
|
} else { |
| 1272 |
|
|
| 1273 |
|
} |
| 1274 |
|
|
| 1275 |
$before_leave->(); |
$before_leave->(); |
| 1276 |
if ($self->{ct}->{type} == START_TAG_TOKEN) { |
if ($self->{ct}->{type} == START_TAG_TOKEN) { |
| 1277 |
|
|
| 1321 |
|
|
| 1322 |
redo A; |
redo A; |
| 1323 |
} elsif ($self->{nc} == 0x002F) { # / |
} elsif ($self->{nc} == 0x002F) { # / |
| 1324 |
|
if ($self->{is_xml}) { |
| 1325 |
|
|
| 1326 |
|
## XML5: Not a parse error. |
| 1327 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type |
| 1328 |
|
} else { |
| 1329 |
|
|
| 1330 |
|
} |
| 1331 |
|
|
| 1332 |
$before_leave->(); |
$before_leave->(); |
| 1333 |
$self->{state} = SELF_CLOSING_START_TAG_STATE; |
$self->{state} = SELF_CLOSING_START_TAG_STATE; |
| 1372 |
if ($self->{nc} == 0x0022 or # " |
if ($self->{nc} == 0x0022 or # " |
| 1373 |
$self->{nc} == 0x0027) { # ' |
$self->{nc} == 0x0027) { # ' |
| 1374 |
|
|
| 1375 |
|
## XML5: Not a parse error. |
| 1376 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name'); |
| 1377 |
} else { |
} else { |
| 1378 |
|
|
| 1393 |
redo A; |
redo A; |
| 1394 |
} |
} |
| 1395 |
} elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) { |
} elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) { |
| 1396 |
|
## XML5: "Tag attribute name after state". |
| 1397 |
|
|
| 1398 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 1399 |
|
|
| 1400 |
## Stay in the state |
## Stay in the state |
| 1426 |
|
|
| 1427 |
redo A; |
redo A; |
| 1428 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
| 1429 |
|
if ($self->{is_xml}) { |
| 1430 |
|
|
| 1431 |
|
## XML5: Not a parse error. |
| 1432 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type |
| 1433 |
|
} else { |
| 1434 |
|
|
| 1435 |
|
} |
| 1436 |
|
|
| 1437 |
if ($self->{ct}->{type} == START_TAG_TOKEN) { |
if ($self->{ct}->{type} == START_TAG_TOKEN) { |
| 1438 |
|
|
| 1439 |
$self->{last_stag_name} = $self->{ct}->{tag_name}; |
$self->{last_stag_name} = $self->{ct}->{tag_name}; |
| 1487 |
|
|
| 1488 |
redo A; |
redo A; |
| 1489 |
} elsif ($self->{nc} == 0x002F) { # / |
} elsif ($self->{nc} == 0x002F) { # / |
| 1490 |
|
if ($self->{is_xml}) { |
| 1491 |
|
|
| 1492 |
|
## XML5: Not a parse error. |
| 1493 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type |
| 1494 |
|
} else { |
| 1495 |
|
|
| 1496 |
|
} |
| 1497 |
|
|
| 1498 |
$self->{state} = SELF_CLOSING_START_TAG_STATE; |
$self->{state} = SELF_CLOSING_START_TAG_STATE; |
| 1499 |
|
|
| 1533 |
|
|
| 1534 |
redo A; |
redo A; |
| 1535 |
} else { |
} else { |
| 1536 |
|
if ($self->{is_xml}) { |
| 1537 |
|
|
| 1538 |
|
## XML5: Not a parse error. |
| 1539 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type |
| 1540 |
|
} else { |
| 1541 |
|
|
| 1542 |
|
} |
| 1543 |
|
|
| 1544 |
if ($self->{nc} == 0x0022 or # " |
if ($self->{nc} == 0x0022 or # " |
| 1545 |
$self->{nc} == 0x0027) { # ' |
$self->{nc} == 0x0027) { # ' |
| 1546 |
|
|
| 1547 |
|
## XML5: Not a parse error. |
| 1548 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name'); |
| 1549 |
} else { |
} else { |
| 1550 |
|
|
| 1568 |
redo A; |
redo A; |
| 1569 |
} |
} |
| 1570 |
} elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) { |
} elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) { |
| 1571 |
|
## XML5: "Tag attribute value before state". |
| 1572 |
|
|
| 1573 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 1574 |
|
|
| 1575 |
## Stay in the state |
## Stay in the state |
| 1681 |
} else { |
} else { |
| 1682 |
if ($self->{nc} == 0x003D) { # = |
if ($self->{nc} == 0x003D) { # = |
| 1683 |
|
|
| 1684 |
|
## XML5: Not a parse error. |
| 1685 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value'); |
| 1686 |
|
} elsif ($self->{is_xml}) { |
| 1687 |
|
|
| 1688 |
|
## XML5: No parse error. |
| 1689 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO |
| 1690 |
} else { |
} else { |
| 1691 |
|
|
| 1692 |
} |
} |
| 1706 |
redo A; |
redo A; |
| 1707 |
} |
} |
| 1708 |
} elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) { |
} elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) { |
| 1709 |
|
## XML5: "Tag attribute value double quoted state". |
| 1710 |
|
|
| 1711 |
if ($self->{nc} == 0x0022) { # " |
if ($self->{nc} == 0x0022) { # " |
| 1712 |
|
|
| 1713 |
|
## XML5: "Tag attribute name before state". |
| 1714 |
$self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; |
$self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; |
| 1715 |
|
|
| 1716 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 1726 |
redo A; |
redo A; |
| 1727 |
} elsif ($self->{nc} == 0x0026) { # & |
} elsif ($self->{nc} == 0x0026) { # & |
| 1728 |
|
|
| 1729 |
|
## XML5: Not defined yet. |
| 1730 |
|
|
| 1731 |
## NOTE: In the spec, the tokenizer is switched to the |
## NOTE: In the spec, the tokenizer is switched to the |
| 1732 |
## "entity in attribute value state". In this implementation, the |
## "entity in attribute value state". In this implementation, the |
| 1733 |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
| 1772 |
|
|
| 1773 |
redo A; |
redo A; |
| 1774 |
} else { |
} else { |
| 1775 |
|
if ($self->{is_xml} and $self->{nc} == 0x003C) { # < |
| 1776 |
|
|
| 1777 |
|
## XML5: Not a parse error. |
| 1778 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type |
| 1779 |
|
} else { |
| 1780 |
|
|
| 1781 |
|
} |
| 1782 |
$self->{ca}->{value} .= chr ($self->{nc}); |
$self->{ca}->{value} .= chr ($self->{nc}); |
| 1783 |
$self->{read_until}->($self->{ca}->{value}, |
$self->{read_until}->($self->{ca}->{value}, |
| 1784 |
q["&], |
q["&<], |
| 1785 |
length $self->{ca}->{value}); |
length $self->{ca}->{value}); |
| 1786 |
|
|
| 1787 |
## Stay in the state |
## Stay in the state |
| 1799 |
redo A; |
redo A; |
| 1800 |
} |
} |
| 1801 |
} elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) { |
} elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) { |
| 1802 |
|
## XML5: "Tag attribute value single quoted state". |
| 1803 |
|
|
| 1804 |
if ($self->{nc} == 0x0027) { # ' |
if ($self->{nc} == 0x0027) { # ' |
| 1805 |
|
|
| 1806 |
|
## XML5: "Before attribute name state" (sic). |
| 1807 |
$self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; |
$self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; |
| 1808 |
|
|
| 1809 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 1819 |
redo A; |
redo A; |
| 1820 |
} elsif ($self->{nc} == 0x0026) { # & |
} elsif ($self->{nc} == 0x0026) { # & |
| 1821 |
|
|
| 1822 |
|
## XML5: Not defined yet. |
| 1823 |
|
|
| 1824 |
## NOTE: In the spec, the tokenizer is switched to the |
## NOTE: In the spec, the tokenizer is switched to the |
| 1825 |
## "entity in attribute value state". In this implementation, the |
## "entity in attribute value state". In this implementation, the |
| 1826 |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
| 1865 |
|
|
| 1866 |
redo A; |
redo A; |
| 1867 |
} else { |
} else { |
| 1868 |
|
if ($self->{is_xml} and $self->{nc} == 0x003C) { # < |
| 1869 |
|
|
| 1870 |
|
## XML5: Not a parse error. |
| 1871 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type |
| 1872 |
|
} else { |
| 1873 |
|
|
| 1874 |
|
} |
| 1875 |
$self->{ca}->{value} .= chr ($self->{nc}); |
$self->{ca}->{value} .= chr ($self->{nc}); |
| 1876 |
$self->{read_until}->($self->{ca}->{value}, |
$self->{read_until}->($self->{ca}->{value}, |
| 1877 |
q['&], |
q['&<], |
| 1878 |
length $self->{ca}->{value}); |
length $self->{ca}->{value}); |
| 1879 |
|
|
| 1880 |
## Stay in the state |
## Stay in the state |
| 1892 |
redo A; |
redo A; |
| 1893 |
} |
} |
| 1894 |
} elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) { |
} elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) { |
| 1895 |
|
## XML5: "Tag attribute value unquoted state". |
| 1896 |
|
|
| 1897 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 1898 |
|
|
| 1899 |
|
## XML5: "Tag attribute name before state". |
| 1900 |
$self->{state} = BEFORE_ATTRIBUTE_NAME_STATE; |
$self->{state} = BEFORE_ATTRIBUTE_NAME_STATE; |
| 1901 |
|
|
| 1902 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 1912 |
redo A; |
redo A; |
| 1913 |
} elsif ($self->{nc} == 0x0026) { # & |
} elsif ($self->{nc} == 0x0026) { # & |
| 1914 |
|
|
| 1915 |
|
|
| 1916 |
|
## XML5: Not defined yet. |
| 1917 |
|
|
| 1918 |
## NOTE: In the spec, the tokenizer is switched to the |
## NOTE: In the spec, the tokenizer is switched to the |
| 1919 |
## "entity in attribute value state". In this implementation, the |
## "entity in attribute value state". In this implementation, the |
| 1920 |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
## tokenizer is switched to the |ENTITY_STATE|, which is an |
| 1998 |
0x003D => 1, # = |
0x003D => 1, # = |
| 1999 |
}->{$self->{nc}}) { |
}->{$self->{nc}}) { |
| 2000 |
|
|
| 2001 |
|
## XML5: Not a parse error. |
| 2002 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value'); |
| 2003 |
} else { |
} else { |
| 2004 |
|
|
| 2115 |
redo A; |
redo A; |
| 2116 |
} |
} |
| 2117 |
} elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) { |
} elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) { |
| 2118 |
|
## XML5: "Empty tag state". |
| 2119 |
|
|
| 2120 |
if ($self->{nc} == 0x003E) { # > |
if ($self->{nc} == 0x003E) { # > |
| 2121 |
if ($self->{ct}->{type} == END_TAG_TOKEN) { |
if ($self->{ct}->{type} == END_TAG_TOKEN) { |
| 2122 |
|
|
| 2168 |
} else { |
} else { |
| 2169 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
| 2170 |
} |
} |
| 2171 |
|
## XML5: "Tag attribute name before state". |
| 2172 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2173 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 2174 |
## Reconsume. |
## Reconsume. |
| 2258 |
## ASCII case-insensitive. |
## ASCII case-insensitive. |
| 2259 |
|
|
| 2260 |
$self->{state} = MD_DOCTYPE_STATE; |
$self->{state} = MD_DOCTYPE_STATE; |
| 2261 |
$self->{s_kwd} = chr $self->{nc}; |
$self->{kwd} = chr $self->{nc}; |
| 2262 |
|
|
| 2263 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2264 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2277 |
$self->{nc} == 0x005B) { # [ |
$self->{nc} == 0x005B) { # [ |
| 2278 |
|
|
| 2279 |
$self->{state} = MD_CDATA_STATE; |
$self->{state} = MD_CDATA_STATE; |
| 2280 |
$self->{s_kwd} = '['; |
$self->{kwd} = '['; |
| 2281 |
|
|
| 2282 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2283 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2311 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2312 |
column => $self->{column_prev} - 2, |
column => $self->{column_prev} - 2, |
| 2313 |
}; |
}; |
| 2314 |
$self->{state} = COMMENT_START_STATE; |
$self->{state} = COMMENT_START_STATE; ## XML5: "comment state". |
| 2315 |
|
|
| 2316 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2317 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2347 |
0x0054, # T |
0x0054, # T |
| 2348 |
0x0059, # Y |
0x0059, # Y |
| 2349 |
0x0050, # P |
0x0050, # P |
| 2350 |
]->[length $self->{s_kwd}] or |
]->[length $self->{kwd}] or |
| 2351 |
$self->{nc} == [ |
$self->{nc} == [ |
| 2352 |
undef, |
undef, |
| 2353 |
0x006F, # o |
0x006F, # o |
| 2355 |
0x0074, # t |
0x0074, # t |
| 2356 |
0x0079, # y |
0x0079, # y |
| 2357 |
0x0070, # p |
0x0070, # p |
| 2358 |
]->[length $self->{s_kwd}]) { |
]->[length $self->{kwd}]) { |
| 2359 |
|
|
| 2360 |
## Stay in the state. |
## Stay in the state. |
| 2361 |
$self->{s_kwd} .= chr $self->{nc}; |
$self->{kwd} .= chr $self->{nc}; |
| 2362 |
|
|
| 2363 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2364 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2371 |
} |
} |
| 2372 |
|
|
| 2373 |
redo A; |
redo A; |
| 2374 |
} elsif ((length $self->{s_kwd}) == 6 and |
} elsif ((length $self->{kwd}) == 6 and |
| 2375 |
($self->{nc} == 0x0045 or # E |
($self->{nc} == 0x0045 or # E |
| 2376 |
$self->{nc} == 0x0065)) { # e |
$self->{nc} == 0x0065)) { # e |
| 2377 |
|
if ($self->{is_xml} and |
| 2378 |
|
($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) { |
| 2379 |
|
|
| 2380 |
|
## XML5: case-sensitive. |
| 2381 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO |
| 2382 |
|
text => 'DOCTYPE', |
| 2383 |
|
line => $self->{line_prev}, |
| 2384 |
|
column => $self->{column_prev} - 5); |
| 2385 |
|
} else { |
| 2386 |
|
|
| 2387 |
|
} |
| 2388 |
$self->{state} = DOCTYPE_STATE; |
$self->{state} = DOCTYPE_STATE; |
| 2389 |
$self->{ct} = {type => DOCTYPE_TOKEN, |
$self->{ct} = {type => DOCTYPE_TOKEN, |
| 2390 |
quirks => 1, |
quirks => 1, |
| 2407 |
|
|
| 2408 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', |
| 2409 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2410 |
column => $self->{column_prev} - 1 - length $self->{s_kwd}); |
column => $self->{column_prev} - 1 - length $self->{kwd}); |
| 2411 |
$self->{state} = BOGUS_COMMENT_STATE; |
$self->{state} = BOGUS_COMMENT_STATE; |
| 2412 |
## Reconsume. |
## Reconsume. |
| 2413 |
$self->{ct} = {type => COMMENT_TOKEN, |
$self->{ct} = {type => COMMENT_TOKEN, |
| 2414 |
data => $self->{s_kwd}, |
data => $self->{kwd}, |
| 2415 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2416 |
column => $self->{column_prev} - 1 - length $self->{s_kwd}, |
column => $self->{column_prev} - 1 - length $self->{kwd}, |
| 2417 |
}; |
}; |
| 2418 |
redo A; |
redo A; |
| 2419 |
} |
} |
| 2424 |
'[CD' => 0x0041, # A |
'[CD' => 0x0041, # A |
| 2425 |
'[CDA' => 0x0054, # T |
'[CDA' => 0x0054, # T |
| 2426 |
'[CDAT' => 0x0041, # A |
'[CDAT' => 0x0041, # A |
| 2427 |
}->{$self->{s_kwd}}) { |
}->{$self->{kwd}}) { |
| 2428 |
|
|
| 2429 |
## Stay in the state. |
## Stay in the state. |
| 2430 |
$self->{s_kwd} .= chr $self->{nc}; |
$self->{kwd} .= chr $self->{nc}; |
| 2431 |
|
|
| 2432 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2433 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2440 |
} |
} |
| 2441 |
|
|
| 2442 |
redo A; |
redo A; |
| 2443 |
} elsif ($self->{s_kwd} eq '[CDATA' and |
} elsif ($self->{kwd} eq '[CDATA' and |
| 2444 |
$self->{nc} == 0x005B) { # [ |
$self->{nc} == 0x005B) { # [ |
| 2445 |
|
if ($self->{is_xml} and |
| 2446 |
|
not $self->{tainted} and |
| 2447 |
|
@{$self->{open_elements} or []} == 0) { |
| 2448 |
|
|
| 2449 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element', |
| 2450 |
|
line => $self->{line_prev}, |
| 2451 |
|
column => $self->{column_prev} - 7); |
| 2452 |
|
$self->{tainted} = 1; |
| 2453 |
|
} else { |
| 2454 |
|
|
| 2455 |
|
} |
| 2456 |
|
|
| 2457 |
$self->{ct} = {type => CHARACTER_TOKEN, |
$self->{ct} = {type => CHARACTER_TOKEN, |
| 2458 |
data => '', |
data => '', |
| 2459 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2475 |
|
|
| 2476 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', |
| 2477 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2478 |
column => $self->{column_prev} - 1 - length $self->{s_kwd}); |
column => $self->{column_prev} - 1 - length $self->{kwd}); |
| 2479 |
$self->{state} = BOGUS_COMMENT_STATE; |
$self->{state} = BOGUS_COMMENT_STATE; |
| 2480 |
## Reconsume. |
## Reconsume. |
| 2481 |
$self->{ct} = {type => COMMENT_TOKEN, |
$self->{ct} = {type => COMMENT_TOKEN, |
| 2482 |
data => $self->{s_kwd}, |
data => $self->{kwd}, |
| 2483 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2484 |
column => $self->{column_prev} - 1 - length $self->{s_kwd}, |
column => $self->{column_prev} - 1 - length $self->{kwd}, |
| 2485 |
}; |
}; |
| 2486 |
redo A; |
redo A; |
| 2487 |
} |
} |
| 2661 |
redo A; |
redo A; |
| 2662 |
} |
} |
| 2663 |
} elsif ($self->{state} == COMMENT_END_DASH_STATE) { |
} elsif ($self->{state} == COMMENT_END_DASH_STATE) { |
| 2664 |
|
## XML5: "comment dash state". |
| 2665 |
|
|
| 2666 |
if ($self->{nc} == 0x002D) { # - |
if ($self->{nc} == 0x002D) { # - |
| 2667 |
|
|
| 2668 |
$self->{state} = COMMENT_END_STATE; |
$self->{state} = COMMENT_END_STATE; |
| 2681 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
| 2682 |
|
|
| 2683 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); |
|
$self->{s_kwd} = ''; |
|
| 2684 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2685 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 2686 |
## reconsume |
## reconsume |
| 2727 |
redo A; |
redo A; |
| 2728 |
} elsif ($self->{nc} == 0x002D) { # - |
} elsif ($self->{nc} == 0x002D) { # - |
| 2729 |
|
|
| 2730 |
|
## XML5: Not a parse error. |
| 2731 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment', |
| 2732 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2733 |
column => $self->{column_prev}); |
column => $self->{column_prev}); |
| 2757 |
redo A; |
redo A; |
| 2758 |
} else { |
} else { |
| 2759 |
|
|
| 2760 |
|
## XML5: Not a parse error. |
| 2761 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment', |
| 2762 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2763 |
column => $self->{column_prev}); |
column => $self->{column_prev}); |
| 2794 |
redo A; |
redo A; |
| 2795 |
} else { |
} else { |
| 2796 |
|
|
| 2797 |
|
## XML5: Unless EOF, swith to the bogus comment state. |
| 2798 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name'); |
| 2799 |
$self->{state} = BEFORE_DOCTYPE_NAME_STATE; |
$self->{state} = BEFORE_DOCTYPE_NAME_STATE; |
| 2800 |
## reconsume |
## reconsume |
| 2801 |
redo A; |
redo A; |
| 2802 |
} |
} |
| 2803 |
} elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) { |
} elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) { |
| 2804 |
|
## XML5: "DOCTYPE root name before state". |
| 2805 |
|
|
| 2806 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 2807 |
|
|
| 2808 |
## Stay in the state |
## Stay in the state |
| 2820 |
redo A; |
redo A; |
| 2821 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
| 2822 |
|
|
| 2823 |
|
## XML5: No parse error. |
| 2824 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name'); |
| 2825 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 2826 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 2849 |
return ($self->{ct}); # DOCTYPE (quirks) |
return ($self->{ct}); # DOCTYPE (quirks) |
| 2850 |
|
|
| 2851 |
redo A; |
redo A; |
| 2852 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 2853 |
|
|
| 2854 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name'); |
| 2855 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2856 |
|
|
| 2857 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2858 |
|
$self->{line_prev} = $self->{line}; |
| 2859 |
|
$self->{column_prev} = $self->{column}; |
| 2860 |
|
$self->{column}++; |
| 2861 |
|
$self->{nc} |
| 2862 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 2863 |
|
} else { |
| 2864 |
|
$self->{set_nc}->($self); |
| 2865 |
|
} |
| 2866 |
|
|
| 2867 |
|
redo A; |
| 2868 |
} else { |
} else { |
| 2869 |
|
|
| 2870 |
$self->{ct}->{name} = chr $self->{nc}; |
$self->{ct}->{name} = chr $self->{nc}; |
| 2884 |
redo A; |
redo A; |
| 2885 |
} |
} |
| 2886 |
} elsif ($self->{state} == DOCTYPE_NAME_STATE) { |
} elsif ($self->{state} == DOCTYPE_NAME_STATE) { |
| 2887 |
## ISSUE: Redundant "First," in the spec. |
## XML5: "DOCTYPE root name state". |
| 2888 |
|
|
| 2889 |
|
## ISSUE: Redundant "First," in the spec. |
| 2890 |
|
|
| 2891 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 2892 |
|
|
| 2893 |
$self->{state} = AFTER_DOCTYPE_NAME_STATE; |
$self->{state} = AFTER_DOCTYPE_NAME_STATE; |
| 2933 |
return ($self->{ct}); # DOCTYPE |
return ($self->{ct}); # DOCTYPE |
| 2934 |
|
|
| 2935 |
redo A; |
redo A; |
| 2936 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 2937 |
|
|
| 2938 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 2939 |
|
|
| 2940 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2941 |
|
$self->{line_prev} = $self->{line}; |
| 2942 |
|
$self->{column_prev} = $self->{column}; |
| 2943 |
|
$self->{column}++; |
| 2944 |
|
$self->{nc} |
| 2945 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 2946 |
|
} else { |
| 2947 |
|
$self->{set_nc}->($self); |
| 2948 |
|
} |
| 2949 |
|
|
| 2950 |
|
redo A; |
| 2951 |
} else { |
} else { |
| 2952 |
|
|
| 2953 |
$self->{ct}->{name} |
$self->{ct}->{name} |
| 2967 |
redo A; |
redo A; |
| 2968 |
} |
} |
| 2969 |
} elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) { |
} elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) { |
| 2970 |
|
## XML5: Corresponding to XML5's "DOCTYPE root name after |
| 2971 |
|
## state", but implemented differently. |
| 2972 |
|
|
| 2973 |
if ($is_space->{$self->{nc}}) { |
if ($is_space->{$self->{nc}}) { |
| 2974 |
|
|
| 2975 |
## Stay in the state |
## Stay in the state |
| 3017 |
redo A; |
redo A; |
| 3018 |
} elsif ($self->{nc} == 0x0050 or # P |
} elsif ($self->{nc} == 0x0050 or # P |
| 3019 |
$self->{nc} == 0x0070) { # p |
$self->{nc} == 0x0070) { # p |
| 3020 |
|
|
| 3021 |
$self->{state} = PUBLIC_STATE; |
$self->{state} = PUBLIC_STATE; |
| 3022 |
$self->{s_kwd} = chr $self->{nc}; |
$self->{kwd} = chr $self->{nc}; |
| 3023 |
|
|
| 3024 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3025 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 3034 |
redo A; |
redo A; |
| 3035 |
} elsif ($self->{nc} == 0x0053 or # S |
} elsif ($self->{nc} == 0x0053 or # S |
| 3036 |
$self->{nc} == 0x0073) { # s |
$self->{nc} == 0x0073) { # s |
| 3037 |
|
|
| 3038 |
$self->{state} = SYSTEM_STATE; |
$self->{state} = SYSTEM_STATE; |
| 3039 |
$self->{s_kwd} = chr $self->{nc}; |
$self->{kwd} = chr $self->{nc}; |
| 3040 |
|
|
| 3041 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3042 |
|
$self->{line_prev} = $self->{line}; |
| 3043 |
|
$self->{column_prev} = $self->{column}; |
| 3044 |
|
$self->{column}++; |
| 3045 |
|
$self->{nc} |
| 3046 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 3047 |
|
} else { |
| 3048 |
|
$self->{set_nc}->($self); |
| 3049 |
|
} |
| 3050 |
|
|
| 3051 |
|
redo A; |
| 3052 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 3053 |
|
|
| 3054 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 3055 |
|
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE |
| 3056 |
|
|
| 3057 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3058 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 3092 |
0x0042, # B |
0x0042, # B |
| 3093 |
0x004C, # L |
0x004C, # L |
| 3094 |
0x0049, # I |
0x0049, # I |
| 3095 |
]->[length $self->{s_kwd}] or |
]->[length $self->{kwd}] or |
| 3096 |
$self->{nc} == [ |
$self->{nc} == [ |
| 3097 |
undef, |
undef, |
| 3098 |
0x0075, # u |
0x0075, # u |
| 3099 |
0x0062, # b |
0x0062, # b |
| 3100 |
0x006C, # l |
0x006C, # l |
| 3101 |
0x0069, # i |
0x0069, # i |
| 3102 |
]->[length $self->{s_kwd}]) { |
]->[length $self->{kwd}]) { |
| 3103 |
|
|
| 3104 |
## Stay in the state. |
## Stay in the state. |
| 3105 |
$self->{s_kwd} .= chr $self->{nc}; |
$self->{kwd} .= chr $self->{nc}; |
| 3106 |
|
|
| 3107 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3108 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 3115 |
} |
} |
| 3116 |
|
|
| 3117 |
redo A; |
redo A; |
| 3118 |
} elsif ((length $self->{s_kwd}) == 5 and |
} elsif ((length $self->{kwd}) == 5 and |
| 3119 |
($self->{nc} == 0x0043 or # C |
($self->{nc} == 0x0043 or # C |
| 3120 |
$self->{nc} == 0x0063)) { # c |
$self->{nc} == 0x0063)) { # c |
| 3121 |
|
if ($self->{is_xml} and |
| 3122 |
|
($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c |
| 3123 |
|
|
| 3124 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type |
| 3125 |
|
text => 'PUBLIC', |
| 3126 |
|
line => $self->{line_prev}, |
| 3127 |
|
column => $self->{column_prev} - 4); |
| 3128 |
|
} else { |
| 3129 |
|
|
| 3130 |
|
} |
| 3131 |
$self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE; |
$self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE; |
| 3132 |
|
|
| 3133 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3145 |
|
|
| 3146 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', |
| 3147 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 3148 |
column => $self->{column_prev} + 1 - length $self->{s_kwd}); |
column => $self->{column_prev} + 1 - length $self->{kwd}); |
| 3149 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
| 3150 |
|
|
| 3151 |
$self->{state} = BOGUS_DOCTYPE_STATE; |
$self->{state} = BOGUS_DOCTYPE_STATE; |
| 3160 |
0x0053, # S |
0x0053, # S |
| 3161 |
0x0054, # T |
0x0054, # T |
| 3162 |
0x0045, # E |
0x0045, # E |
| 3163 |
]->[length $self->{s_kwd}] or |
]->[length $self->{kwd}] or |
| 3164 |
$self->{nc} == [ |
$self->{nc} == [ |
| 3165 |
undef, |
undef, |
| 3166 |
0x0079, # y |
0x0079, # y |
| 3167 |
0x0073, # s |
0x0073, # s |
| 3168 |
0x0074, # t |
0x0074, # t |
| 3169 |
0x0065, # e |
0x0065, # e |
| 3170 |
]->[length $self->{s_kwd}]) { |
]->[length $self->{kwd}]) { |
| 3171 |
|
|
| 3172 |
## Stay in the state. |
## Stay in the state. |
| 3173 |
$self->{s_kwd} .= chr $self->{nc}; |
$self->{kwd} .= chr $self->{nc}; |
| 3174 |
|
|
| 3175 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3176 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 3183 |
} |
} |
| 3184 |
|
|
| 3185 |
redo A; |
redo A; |
| 3186 |
} elsif ((length $self->{s_kwd}) == 5 and |
} elsif ((length $self->{kwd}) == 5 and |
| 3187 |
($self->{nc} == 0x004D or # M |
($self->{nc} == 0x004D or # M |
| 3188 |
$self->{nc} == 0x006D)) { # m |
$self->{nc} == 0x006D)) { # m |
| 3189 |
|
if ($self->{is_xml} and |
| 3190 |
|
($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m |
| 3191 |
|
|
| 3192 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type |
| 3193 |
|
text => 'SYSTEM', |
| 3194 |
|
line => $self->{line_prev}, |
| 3195 |
|
column => $self->{column_prev} - 4); |
| 3196 |
|
} else { |
| 3197 |
|
|
| 3198 |
|
} |
| 3199 |
$self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE; |
$self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE; |
| 3200 |
|
|
| 3201 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3213 |
|
|
| 3214 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', |
| 3215 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 3216 |
column => $self->{column_prev} + 1 - length $self->{s_kwd}); |
column => $self->{column_prev} + 1 - length $self->{kwd}); |
| 3217 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
| 3218 |
|
|
| 3219 |
$self->{state} = BOGUS_DOCTYPE_STATE; |
$self->{state} = BOGUS_DOCTYPE_STATE; |
| 3302 |
return ($self->{ct}); # DOCTYPE |
return ($self->{ct}); # DOCTYPE |
| 3303 |
|
|
| 3304 |
redo A; |
redo A; |
| 3305 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 3306 |
|
|
| 3307 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal'); |
| 3308 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 3309 |
|
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE |
| 3310 |
|
|
| 3311 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3312 |
|
$self->{line_prev} = $self->{line}; |
| 3313 |
|
$self->{column_prev} = $self->{column}; |
| 3314 |
|
$self->{column}++; |
| 3315 |
|
$self->{nc} |
| 3316 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 3317 |
|
} else { |
| 3318 |
|
$self->{set_nc}->($self); |
| 3319 |
|
} |
| 3320 |
|
|
| 3321 |
|
redo A; |
| 3322 |
} else { |
} else { |
| 3323 |
|
|
| 3324 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC'); |
| 3529 |
|
|
| 3530 |
redo A; |
redo A; |
| 3531 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
| 3532 |
|
if ($self->{is_xml}) { |
| 3533 |
|
|
| 3534 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal'); |
| 3535 |
|
} else { |
| 3536 |
|
|
| 3537 |
|
} |
| 3538 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3539 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 3540 |
|
|
| 3564 |
return ($self->{ct}); # DOCTYPE |
return ($self->{ct}); # DOCTYPE |
| 3565 |
|
|
| 3566 |
redo A; |
redo A; |
| 3567 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 3568 |
|
|
| 3569 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal'); |
| 3570 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 3571 |
|
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE |
| 3572 |
|
|
| 3573 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3574 |
|
$self->{line_prev} = $self->{line}; |
| 3575 |
|
$self->{column_prev} = $self->{column}; |
| 3576 |
|
$self->{column}++; |
| 3577 |
|
$self->{nc} |
| 3578 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 3579 |
|
} else { |
| 3580 |
|
$self->{set_nc}->($self); |
| 3581 |
|
} |
| 3582 |
|
|
| 3583 |
|
redo A; |
| 3584 |
} else { |
} else { |
| 3585 |
|
|
| 3586 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal'); |
| 3681 |
return ($self->{ct}); # DOCTYPE |
return ($self->{ct}); # DOCTYPE |
| 3682 |
|
|
| 3683 |
redo A; |
redo A; |
| 3684 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 3685 |
|
|
| 3686 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal'); |
| 3687 |
|
|
| 3688 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 3689 |
|
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE |
| 3690 |
|
|
| 3691 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3692 |
|
$self->{line_prev} = $self->{line}; |
| 3693 |
|
$self->{column_prev} = $self->{column}; |
| 3694 |
|
$self->{column}++; |
| 3695 |
|
$self->{nc} |
| 3696 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 3697 |
|
} else { |
| 3698 |
|
$self->{set_nc}->($self); |
| 3699 |
|
} |
| 3700 |
|
|
| 3701 |
|
redo A; |
| 3702 |
} else { |
} else { |
| 3703 |
|
|
| 3704 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM'); |
| 3734 |
} |
} |
| 3735 |
|
|
| 3736 |
redo A; |
redo A; |
| 3737 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # > |
| 3738 |
|
|
| 3739 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); |
| 3740 |
|
|
| 3805 |
} |
} |
| 3806 |
|
|
| 3807 |
redo A; |
redo A; |
| 3808 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # > |
| 3809 |
|
|
| 3810 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); |
| 3811 |
|
|
| 3906 |
return ($self->{ct}); # DOCTYPE |
return ($self->{ct}); # DOCTYPE |
| 3907 |
|
|
| 3908 |
redo A; |
redo A; |
| 3909 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 3910 |
|
|
| 3911 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 3912 |
|
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE |
| 3913 |
|
|
| 3914 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3915 |
|
$self->{line_prev} = $self->{line}; |
| 3916 |
|
$self->{column_prev} = $self->{column}; |
| 3917 |
|
$self->{column}++; |
| 3918 |
|
$self->{nc} |
| 3919 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 3920 |
|
} else { |
| 3921 |
|
$self->{set_nc}->($self); |
| 3922 |
|
} |
| 3923 |
|
|
| 3924 |
|
redo A; |
| 3925 |
} else { |
} else { |
| 3926 |
|
|
| 3927 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal'); |
| 3961 |
return ($self->{ct}); # DOCTYPE |
return ($self->{ct}); # DOCTYPE |
| 3962 |
|
|
| 3963 |
redo A; |
redo A; |
| 3964 |
|
} elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [ |
| 3965 |
|
if ($self->{ct}->{has_internal_subset}) { # DOCTYPE |
| 3966 |
|
|
| 3967 |
|
## Stay in the state. |
| 3968 |
|
|
| 3969 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3970 |
|
$self->{line_prev} = $self->{line}; |
| 3971 |
|
$self->{column_prev} = $self->{column}; |
| 3972 |
|
$self->{column}++; |
| 3973 |
|
$self->{nc} |
| 3974 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 3975 |
|
} else { |
| 3976 |
|
$self->{set_nc}->($self); |
| 3977 |
|
} |
| 3978 |
|
|
| 3979 |
|
redo A; |
| 3980 |
|
} else { |
| 3981 |
|
|
| 3982 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; |
| 3983 |
|
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE |
| 3984 |
|
|
| 3985 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 3986 |
|
$self->{line_prev} = $self->{line}; |
| 3987 |
|
$self->{column_prev} = $self->{column}; |
| 3988 |
|
$self->{column}++; |
| 3989 |
|
$self->{nc} |
| 3990 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 3991 |
|
} else { |
| 3992 |
|
$self->{set_nc}->($self); |
| 3993 |
|
} |
| 3994 |
|
|
| 3995 |
|
redo A; |
| 3996 |
|
} |
| 3997 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
| 3998 |
|
|
| 3999 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 4006 |
} else { |
} else { |
| 4007 |
|
|
| 4008 |
my $s = ''; |
my $s = ''; |
| 4009 |
$self->{read_until}->($s, q[>], 0); |
$self->{read_until}->($s, q{>[}, 0); |
| 4010 |
|
|
| 4011 |
## Stay in the state |
## Stay in the state |
| 4012 |
|
|
| 4026 |
## NOTE: "CDATA section state" in the state is jointly implemented |
## NOTE: "CDATA section state" in the state is jointly implemented |
| 4027 |
## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|, |
## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|, |
| 4028 |
## and |CDATA_SECTION_MSE2_STATE|. |
## and |CDATA_SECTION_MSE2_STATE|. |
| 4029 |
|
|
| 4030 |
|
## XML5: "CDATA state". |
| 4031 |
|
|
| 4032 |
if ($self->{nc} == 0x005D) { # ] |
if ($self->{nc} == 0x005D) { # ] |
| 4033 |
|
|
| 4045 |
|
|
| 4046 |
redo A; |
redo A; |
| 4047 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
| 4048 |
|
if ($self->{is_xml}) { |
| 4049 |
|
|
| 4050 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type |
| 4051 |
|
} else { |
| 4052 |
|
|
| 4053 |
|
} |
| 4054 |
|
|
| 4055 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 4056 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 4057 |
|
## Reconsume. |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
|
|
$self->{line_prev} = $self->{line}; |
|
|
$self->{column_prev} = $self->{column}; |
|
|
$self->{column}++; |
|
|
$self->{nc} |
|
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
|
|
} else { |
|
|
$self->{set_nc}->($self); |
|
|
} |
|
|
|
|
| 4058 |
if (length $self->{ct}->{data}) { # character |
if (length $self->{ct}->{data}) { # character |
| 4059 |
|
|
| 4060 |
return ($self->{ct}); # character |
return ($self->{ct}); # character |
| 4087 |
|
|
| 4088 |
## ISSUE: "text tokens" in spec. |
## ISSUE: "text tokens" in spec. |
| 4089 |
} elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) { |
} elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) { |
| 4090 |
|
## XML5: "CDATA bracket state". |
| 4091 |
|
|
| 4092 |
if ($self->{nc} == 0x005D) { # ] |
if ($self->{nc} == 0x005D) { # ] |
| 4093 |
|
|
| 4094 |
$self->{state} = CDATA_SECTION_MSE2_STATE; |
$self->{state} = CDATA_SECTION_MSE2_STATE; |
| 4106 |
redo A; |
redo A; |
| 4107 |
} else { |
} else { |
| 4108 |
|
|
| 4109 |
|
## XML5: If EOF, "]" is not appended and changed to the data state. |
| 4110 |
$self->{ct}->{data} .= ']'; |
$self->{ct}->{data} .= ']'; |
| 4111 |
$self->{state} = CDATA_SECTION_STATE; |
$self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state. |
| 4112 |
## Reconsume. |
## Reconsume. |
| 4113 |
redo A; |
redo A; |
| 4114 |
} |
} |
| 4115 |
} elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) { |
} elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) { |
| 4116 |
|
## XML5: "CDATA end state". |
| 4117 |
|
|
| 4118 |
if ($self->{nc} == 0x003E) { # > |
if ($self->{nc} == 0x003E) { # > |
| 4119 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 4120 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 4157 |
|
|
| 4158 |
$self->{ct}->{data} .= ']]'; # character |
$self->{ct}->{data} .= ']]'; # character |
| 4159 |
$self->{state} = CDATA_SECTION_STATE; |
$self->{state} = CDATA_SECTION_STATE; |
| 4160 |
## Reconsume. |
## Reconsume. ## XML5: Emit. |
| 4161 |
redo A; |
redo A; |
| 4162 |
} |
} |
| 4163 |
} elsif ($self->{state} == ENTITY_STATE) { |
} elsif ($self->{state} == ENTITY_STATE) { |
| 4174 |
} elsif ($self->{nc} == 0x0023) { # # |
} elsif ($self->{nc} == 0x0023) { # # |
| 4175 |
|
|
| 4176 |
$self->{state} = ENTITY_HASH_STATE; |
$self->{state} = ENTITY_HASH_STATE; |
| 4177 |
$self->{s_kwd} = '#'; |
$self->{kwd} = '#'; |
| 4178 |
|
|
| 4179 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4180 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 4194 |
|
|
| 4195 |
require Whatpm::_NamedEntityList; |
require Whatpm::_NamedEntityList; |
| 4196 |
$self->{state} = ENTITY_NAME_STATE; |
$self->{state} = ENTITY_NAME_STATE; |
| 4197 |
$self->{s_kwd} = chr $self->{nc}; |
$self->{kwd} = chr $self->{nc}; |
| 4198 |
$self->{entity__value} = $self->{s_kwd}; |
$self->{entity__value} = $self->{kwd}; |
| 4199 |
$self->{entity__match} = 0; |
$self->{entity__match} = 0; |
| 4200 |
|
|
| 4201 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4245 |
$self->{nc} == 0x0058) { # X |
$self->{nc} == 0x0058) { # X |
| 4246 |
|
|
| 4247 |
$self->{state} = HEXREF_X_STATE; |
$self->{state} = HEXREF_X_STATE; |
| 4248 |
$self->{s_kwd} .= chr $self->{nc}; |
$self->{kwd} .= chr $self->{nc}; |
| 4249 |
|
|
| 4250 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4251 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 4262 |
$self->{nc} <= 0x0039) { # 0..9 |
$self->{nc} <= 0x0039) { # 0..9 |
| 4263 |
|
|
| 4264 |
$self->{state} = NCR_NUM_STATE; |
$self->{state} = NCR_NUM_STATE; |
| 4265 |
$self->{s_kwd} = $self->{nc} - 0x0030; |
$self->{kwd} = $self->{nc} - 0x0030; |
| 4266 |
|
|
| 4267 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4268 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 4308 |
if (0x0030 <= $self->{nc} and |
if (0x0030 <= $self->{nc} and |
| 4309 |
$self->{nc} <= 0x0039) { # 0..9 |
$self->{nc} <= 0x0039) { # 0..9 |
| 4310 |
|
|
| 4311 |
$self->{s_kwd} *= 10; |
$self->{kwd} *= 10; |
| 4312 |
$self->{s_kwd} += $self->{nc} - 0x0030; |
$self->{kwd} += $self->{nc} - 0x0030; |
| 4313 |
|
|
| 4314 |
## Stay in the state. |
## Stay in the state. |
| 4315 |
|
|
| 4345 |
# |
# |
| 4346 |
} |
} |
| 4347 |
|
|
| 4348 |
my $code = $self->{s_kwd}; |
my $code = $self->{kwd}; |
| 4349 |
my $l = $self->{line_prev}; |
my $l = $self->{line_prev}; |
| 4350 |
my $c = $self->{column_prev}; |
my $c = $self->{column_prev}; |
| 4351 |
if ($charref_map->{$code}) { |
if ($charref_map->{$code}) { |
| 4368 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 4369 |
## Reconsume. |
## Reconsume. |
| 4370 |
return ({type => CHARACTER_TOKEN, data => chr $code, |
return ({type => CHARACTER_TOKEN, data => chr $code, |
| 4371 |
|
has_reference => 1, |
| 4372 |
line => $l, column => $c, |
line => $l, column => $c, |
| 4373 |
}); |
}); |
| 4374 |
redo A; |
redo A; |
| 4388 |
# 0..9, A..F, a..f |
# 0..9, A..F, a..f |
| 4389 |
|
|
| 4390 |
$self->{state} = HEXREF_HEX_STATE; |
$self->{state} = HEXREF_HEX_STATE; |
| 4391 |
$self->{s_kwd} = 0; |
$self->{kwd} = 0; |
| 4392 |
## Reconsume. |
## Reconsume. |
| 4393 |
redo A; |
redo A; |
| 4394 |
} else { |
} else { |
| 4406 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 4407 |
## Reconsume. |
## Reconsume. |
| 4408 |
return ({type => CHARACTER_TOKEN, |
return ({type => CHARACTER_TOKEN, |
| 4409 |
data => '&' . $self->{s_kwd}, |
data => '&' . $self->{kwd}, |
| 4410 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 4411 |
column => $self->{column_prev} - length $self->{s_kwd}, |
column => $self->{column_prev} - length $self->{kwd}, |
| 4412 |
}); |
}); |
| 4413 |
redo A; |
redo A; |
| 4414 |
} else { |
} else { |
| 4415 |
|
|
| 4416 |
$self->{ca}->{value} .= '&' . $self->{s_kwd}; |
$self->{ca}->{value} .= '&' . $self->{kwd}; |
| 4417 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
| 4418 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 4419 |
## Reconsume. |
## Reconsume. |
| 4424 |
if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) { |
if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) { |
| 4425 |
# 0..9 |
# 0..9 |
| 4426 |
|
|
| 4427 |
$self->{s_kwd} *= 0x10; |
$self->{kwd} *= 0x10; |
| 4428 |
$self->{s_kwd} += $self->{nc} - 0x0030; |
$self->{kwd} += $self->{nc} - 0x0030; |
| 4429 |
## Stay in the state. |
## Stay in the state. |
| 4430 |
|
|
| 4431 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4442 |
} elsif (0x0061 <= $self->{nc} and |
} elsif (0x0061 <= $self->{nc} and |
| 4443 |
$self->{nc} <= 0x0066) { # a..f |
$self->{nc} <= 0x0066) { # a..f |
| 4444 |
|
|
| 4445 |
$self->{s_kwd} *= 0x10; |
$self->{kwd} *= 0x10; |
| 4446 |
$self->{s_kwd} += $self->{nc} - 0x0060 + 9; |
$self->{kwd} += $self->{nc} - 0x0060 + 9; |
| 4447 |
## Stay in the state. |
## Stay in the state. |
| 4448 |
|
|
| 4449 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4460 |
} elsif (0x0041 <= $self->{nc} and |
} elsif (0x0041 <= $self->{nc} and |
| 4461 |
$self->{nc} <= 0x0046) { # A..F |
$self->{nc} <= 0x0046) { # A..F |
| 4462 |
|
|
| 4463 |
$self->{s_kwd} *= 0x10; |
$self->{kwd} *= 0x10; |
| 4464 |
$self->{s_kwd} += $self->{nc} - 0x0040 + 9; |
$self->{kwd} += $self->{nc} - 0x0040 + 9; |
| 4465 |
## Stay in the state. |
## Stay in the state. |
| 4466 |
|
|
| 4467 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4498 |
# |
# |
| 4499 |
} |
} |
| 4500 |
|
|
| 4501 |
my $code = $self->{s_kwd}; |
my $code = $self->{kwd}; |
| 4502 |
my $l = $self->{line_prev}; |
my $l = $self->{line_prev}; |
| 4503 |
my $c = $self->{column_prev}; |
my $c = $self->{column_prev}; |
| 4504 |
if ($charref_map->{$code}) { |
if ($charref_map->{$code}) { |
| 4521 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 4522 |
## Reconsume. |
## Reconsume. |
| 4523 |
return ({type => CHARACTER_TOKEN, data => chr $code, |
return ({type => CHARACTER_TOKEN, data => chr $code, |
| 4524 |
|
has_reference => 1, |
| 4525 |
line => $l, column => $c, |
line => $l, column => $c, |
| 4526 |
}); |
}); |
| 4527 |
redo A; |
redo A; |
| 4535 |
redo A; |
redo A; |
| 4536 |
} |
} |
| 4537 |
} elsif ($self->{state} == ENTITY_NAME_STATE) { |
} elsif ($self->{state} == ENTITY_NAME_STATE) { |
| 4538 |
if (length $self->{s_kwd} < 30 and |
if (length $self->{kwd} < 30 and |
| 4539 |
## NOTE: Some number greater than the maximum length of entity name |
## NOTE: Some number greater than the maximum length of entity name |
| 4540 |
((0x0041 <= $self->{nc} and # a |
((0x0041 <= $self->{nc} and # a |
| 4541 |
$self->{nc} <= 0x005A) or # x |
$self->{nc} <= 0x005A) or # x |
| 4545 |
$self->{nc} <= 0x0039) or # 9 |
$self->{nc} <= 0x0039) or # 9 |
| 4546 |
$self->{nc} == 0x003B)) { # ; |
$self->{nc} == 0x003B)) { # ; |
| 4547 |
our $EntityChar; |
our $EntityChar; |
| 4548 |
$self->{s_kwd} .= chr $self->{nc}; |
$self->{kwd} .= chr $self->{nc}; |
| 4549 |
if (defined $EntityChar->{$self->{s_kwd}}) { |
if (defined $EntityChar->{$self->{kwd}}) { |
| 4550 |
if ($self->{nc} == 0x003B) { # ; |
if ($self->{nc} == 0x003B) { # ; |
| 4551 |
|
|
| 4552 |
$self->{entity__value} = $EntityChar->{$self->{s_kwd}}; |
$self->{entity__value} = $EntityChar->{$self->{kwd}}; |
| 4553 |
$self->{entity__match} = 1; |
$self->{entity__match} = 1; |
| 4554 |
|
|
| 4555 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4565 |
# |
# |
| 4566 |
} else { |
} else { |
| 4567 |
|
|
| 4568 |
$self->{entity__value} = $EntityChar->{$self->{s_kwd}}; |
$self->{entity__value} = $EntityChar->{$self->{kwd}}; |
| 4569 |
$self->{entity__match} = -1; |
$self->{entity__match} = -1; |
| 4570 |
## Stay in the state. |
## Stay in the state. |
| 4571 |
|
|
| 4613 |
if ($self->{prev_state} != DATA_STATE and # in attribute |
if ($self->{prev_state} != DATA_STATE and # in attribute |
| 4614 |
$self->{entity__match} < -1) { |
$self->{entity__match} < -1) { |
| 4615 |
|
|
| 4616 |
$data = '&' . $self->{s_kwd}; |
$data = '&' . $self->{kwd}; |
| 4617 |
# |
# |
| 4618 |
} else { |
} else { |
| 4619 |
|
|
| 4625 |
|
|
| 4626 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero', |
| 4627 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 4628 |
column => $self->{column_prev} - length $self->{s_kwd}); |
column => $self->{column_prev} - length $self->{kwd}); |
| 4629 |
$data = '&' . $self->{s_kwd}; |
$data = '&' . $self->{kwd}; |
| 4630 |
# |
# |
| 4631 |
} |
} |
| 4632 |
|
|
| 4647 |
## Reconsume. |
## Reconsume. |
| 4648 |
return ({type => CHARACTER_TOKEN, |
return ({type => CHARACTER_TOKEN, |
| 4649 |
data => $data, |
data => $data, |
| 4650 |
|
has_reference => $has_ref, |
| 4651 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 4652 |
column => $self->{column_prev} + 1 - length $self->{s_kwd}, |
column => $self->{column_prev} + 1 - length $self->{kwd}, |
| 4653 |
}); |
}); |
| 4654 |
redo A; |
redo A; |
| 4655 |
} else { |
} else { |
| 4661 |
## Reconsume. |
## Reconsume. |
| 4662 |
redo A; |
redo A; |
| 4663 |
} |
} |
| 4664 |
|
|
| 4665 |
|
## XML-only states |
| 4666 |
|
|
| 4667 |
|
} elsif ($self->{state} == PI_STATE) { |
| 4668 |
|
if ($is_space->{$self->{nc}} or |
| 4669 |
|
$self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else" |
| 4670 |
|
$self->{nc} == -1) { |
| 4671 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type |
| 4672 |
|
line => $self->{line_prev}, |
| 4673 |
|
column => $self->{column_prev} |
| 4674 |
|
- 1 * ($self->{nc} != -1)); |
| 4675 |
|
$self->{state} = BOGUS_COMMENT_STATE; |
| 4676 |
|
## Reconsume. |
| 4677 |
|
$self->{ct} = {type => COMMENT_TOKEN, |
| 4678 |
|
data => '?', |
| 4679 |
|
line => $self->{line_prev}, |
| 4680 |
|
column => $self->{column_prev} |
| 4681 |
|
- 1 * ($self->{nc} != -1), |
| 4682 |
|
}; |
| 4683 |
|
redo A; |
| 4684 |
|
} else { |
| 4685 |
|
$self->{ct} = {type => PI_TOKEN, |
| 4686 |
|
target => chr $self->{nc}, |
| 4687 |
|
data => '', |
| 4688 |
|
line => $self->{line_prev}, |
| 4689 |
|
column => $self->{column_prev} - 1, |
| 4690 |
|
}; |
| 4691 |
|
$self->{state} = PI_TARGET_STATE; |
| 4692 |
|
|
| 4693 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4694 |
|
$self->{line_prev} = $self->{line}; |
| 4695 |
|
$self->{column_prev} = $self->{column}; |
| 4696 |
|
$self->{column}++; |
| 4697 |
|
$self->{nc} |
| 4698 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4699 |
|
} else { |
| 4700 |
|
$self->{set_nc}->($self); |
| 4701 |
|
} |
| 4702 |
|
|
| 4703 |
|
redo A; |
| 4704 |
|
} |
| 4705 |
|
} elsif ($self->{state} == PI_TARGET_STATE) { |
| 4706 |
|
if ($is_space->{$self->{nc}}) { |
| 4707 |
|
$self->{state} = PI_TARGET_AFTER_STATE; |
| 4708 |
|
|
| 4709 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4710 |
|
$self->{line_prev} = $self->{line}; |
| 4711 |
|
$self->{column_prev} = $self->{column}; |
| 4712 |
|
$self->{column}++; |
| 4713 |
|
$self->{nc} |
| 4714 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4715 |
|
} else { |
| 4716 |
|
$self->{set_nc}->($self); |
| 4717 |
|
} |
| 4718 |
|
|
| 4719 |
|
redo A; |
| 4720 |
|
} elsif ($self->{nc} == -1) { |
| 4721 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type |
| 4722 |
|
$self->{state} = DATA_STATE; |
| 4723 |
|
$self->{s_kwd} = ''; |
| 4724 |
|
## Reconsume. |
| 4725 |
|
return ($self->{ct}); # pi |
| 4726 |
|
redo A; |
| 4727 |
|
} elsif ($self->{nc} == 0x003F) { # ? |
| 4728 |
|
$self->{state} = PI_AFTER_STATE; |
| 4729 |
|
|
| 4730 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4731 |
|
$self->{line_prev} = $self->{line}; |
| 4732 |
|
$self->{column_prev} = $self->{column}; |
| 4733 |
|
$self->{column}++; |
| 4734 |
|
$self->{nc} |
| 4735 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4736 |
|
} else { |
| 4737 |
|
$self->{set_nc}->($self); |
| 4738 |
|
} |
| 4739 |
|
|
| 4740 |
|
redo A; |
| 4741 |
|
} else { |
| 4742 |
|
## XML5: typo ("tag name" -> "target") |
| 4743 |
|
$self->{ct}->{target} .= chr $self->{nc}; # pi |
| 4744 |
|
|
| 4745 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4746 |
|
$self->{line_prev} = $self->{line}; |
| 4747 |
|
$self->{column_prev} = $self->{column}; |
| 4748 |
|
$self->{column}++; |
| 4749 |
|
$self->{nc} |
| 4750 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4751 |
|
} else { |
| 4752 |
|
$self->{set_nc}->($self); |
| 4753 |
|
} |
| 4754 |
|
|
| 4755 |
|
redo A; |
| 4756 |
|
} |
| 4757 |
|
} elsif ($self->{state} == PI_TARGET_AFTER_STATE) { |
| 4758 |
|
if ($is_space->{$self->{nc}}) { |
| 4759 |
|
## Stay in the state. |
| 4760 |
|
|
| 4761 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4762 |
|
$self->{line_prev} = $self->{line}; |
| 4763 |
|
$self->{column_prev} = $self->{column}; |
| 4764 |
|
$self->{column}++; |
| 4765 |
|
$self->{nc} |
| 4766 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4767 |
|
} else { |
| 4768 |
|
$self->{set_nc}->($self); |
| 4769 |
|
} |
| 4770 |
|
|
| 4771 |
|
redo A; |
| 4772 |
|
} else { |
| 4773 |
|
$self->{state} = PI_DATA_STATE; |
| 4774 |
|
## Reprocess. |
| 4775 |
|
redo A; |
| 4776 |
|
} |
| 4777 |
|
} elsif ($self->{state} == PI_DATA_STATE) { |
| 4778 |
|
if ($self->{nc} == 0x003F) { # ? |
| 4779 |
|
$self->{state} = PI_DATA_AFTER_STATE; |
| 4780 |
|
|
| 4781 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4782 |
|
$self->{line_prev} = $self->{line}; |
| 4783 |
|
$self->{column_prev} = $self->{column}; |
| 4784 |
|
$self->{column}++; |
| 4785 |
|
$self->{nc} |
| 4786 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4787 |
|
} else { |
| 4788 |
|
$self->{set_nc}->($self); |
| 4789 |
|
} |
| 4790 |
|
|
| 4791 |
|
redo A; |
| 4792 |
|
} elsif ($self->{nc} == -1) { |
| 4793 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type |
| 4794 |
|
$self->{state} = DATA_STATE; |
| 4795 |
|
$self->{s_kwd} = ''; |
| 4796 |
|
## Reprocess. |
| 4797 |
|
return ($self->{ct}); # pi |
| 4798 |
|
redo A; |
| 4799 |
|
} else { |
| 4800 |
|
$self->{ct}->{data} .= chr $self->{nc}; # pi |
| 4801 |
|
$self->{read_until}->($self->{ct}->{data}, q[?], |
| 4802 |
|
length $self->{ct}->{data}); |
| 4803 |
|
## Stay in the state. |
| 4804 |
|
|
| 4805 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4806 |
|
$self->{line_prev} = $self->{line}; |
| 4807 |
|
$self->{column_prev} = $self->{column}; |
| 4808 |
|
$self->{column}++; |
| 4809 |
|
$self->{nc} |
| 4810 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4811 |
|
} else { |
| 4812 |
|
$self->{set_nc}->($self); |
| 4813 |
|
} |
| 4814 |
|
|
| 4815 |
|
## Reprocess. |
| 4816 |
|
redo A; |
| 4817 |
|
} |
| 4818 |
|
} elsif ($self->{state} == PI_AFTER_STATE) { |
| 4819 |
|
if ($self->{nc} == 0x003E) { # > |
| 4820 |
|
$self->{state} = DATA_STATE; |
| 4821 |
|
$self->{s_kwd} = ''; |
| 4822 |
|
|
| 4823 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4824 |
|
$self->{line_prev} = $self->{line}; |
| 4825 |
|
$self->{column_prev} = $self->{column}; |
| 4826 |
|
$self->{column}++; |
| 4827 |
|
$self->{nc} |
| 4828 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4829 |
|
} else { |
| 4830 |
|
$self->{set_nc}->($self); |
| 4831 |
|
} |
| 4832 |
|
|
| 4833 |
|
return ($self->{ct}); # pi |
| 4834 |
|
redo A; |
| 4835 |
|
} elsif ($self->{nc} == 0x003F) { # ? |
| 4836 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type |
| 4837 |
|
line => $self->{line_prev}, |
| 4838 |
|
column => $self->{column_prev}); ## XML5: no error |
| 4839 |
|
$self->{ct}->{data} .= '?'; |
| 4840 |
|
$self->{state} = PI_DATA_AFTER_STATE; |
| 4841 |
|
|
| 4842 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4843 |
|
$self->{line_prev} = $self->{line}; |
| 4844 |
|
$self->{column_prev} = $self->{column}; |
| 4845 |
|
$self->{column}++; |
| 4846 |
|
$self->{nc} |
| 4847 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4848 |
|
} else { |
| 4849 |
|
$self->{set_nc}->($self); |
| 4850 |
|
} |
| 4851 |
|
|
| 4852 |
|
redo A; |
| 4853 |
|
} else { |
| 4854 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type |
| 4855 |
|
line => $self->{line_prev}, |
| 4856 |
|
column => $self->{column_prev} |
| 4857 |
|
+ 1 * ($self->{nc} == -1)); ## XML5: no error |
| 4858 |
|
$self->{ct}->{data} .= '?'; ## XML5: not appended |
| 4859 |
|
$self->{state} = PI_DATA_STATE; |
| 4860 |
|
## Reprocess. |
| 4861 |
|
redo A; |
| 4862 |
|
} |
| 4863 |
|
} elsif ($self->{state} == PI_DATA_AFTER_STATE) { |
| 4864 |
|
## XML5: Same as "pi after state" in XML5 |
| 4865 |
|
if ($self->{nc} == 0x003E) { # > |
| 4866 |
|
$self->{state} = DATA_STATE; |
| 4867 |
|
$self->{s_kwd} = ''; |
| 4868 |
|
|
| 4869 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4870 |
|
$self->{line_prev} = $self->{line}; |
| 4871 |
|
$self->{column_prev} = $self->{column}; |
| 4872 |
|
$self->{column}++; |
| 4873 |
|
$self->{nc} |
| 4874 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4875 |
|
} else { |
| 4876 |
|
$self->{set_nc}->($self); |
| 4877 |
|
} |
| 4878 |
|
|
| 4879 |
|
return ($self->{ct}); # pi |
| 4880 |
|
redo A; |
| 4881 |
|
} elsif ($self->{nc} == 0x003F) { # ? |
| 4882 |
|
$self->{ct}->{data} .= '?'; |
| 4883 |
|
## Stay in the state. |
| 4884 |
|
|
| 4885 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4886 |
|
$self->{line_prev} = $self->{line}; |
| 4887 |
|
$self->{column_prev} = $self->{column}; |
| 4888 |
|
$self->{column}++; |
| 4889 |
|
$self->{nc} |
| 4890 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4891 |
|
} else { |
| 4892 |
|
$self->{set_nc}->($self); |
| 4893 |
|
} |
| 4894 |
|
|
| 4895 |
|
redo A; |
| 4896 |
|
} else { |
| 4897 |
|
$self->{ct}->{data} .= '?'; ## XML5: not appended |
| 4898 |
|
$self->{state} = PI_DATA_STATE; |
| 4899 |
|
## Reprocess. |
| 4900 |
|
redo A; |
| 4901 |
|
} |
| 4902 |
|
|
| 4903 |
|
} elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) { |
| 4904 |
|
if ($self->{nc} == 0x003C) { # < |
| 4905 |
|
## TODO: |
| 4906 |
|
|
| 4907 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4908 |
|
$self->{line_prev} = $self->{line}; |
| 4909 |
|
$self->{column_prev} = $self->{column}; |
| 4910 |
|
$self->{column}++; |
| 4911 |
|
$self->{nc} |
| 4912 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4913 |
|
} else { |
| 4914 |
|
$self->{set_nc}->($self); |
| 4915 |
|
} |
| 4916 |
|
|
| 4917 |
|
redo A; |
| 4918 |
|
} elsif ($self->{nc} == 0x0025) { # % |
| 4919 |
|
## XML5: Not defined yet. |
| 4920 |
|
|
| 4921 |
|
## TODO: |
| 4922 |
|
|
| 4923 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4924 |
|
$self->{line_prev} = $self->{line}; |
| 4925 |
|
$self->{column_prev} = $self->{column}; |
| 4926 |
|
$self->{column}++; |
| 4927 |
|
$self->{nc} |
| 4928 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4929 |
|
} else { |
| 4930 |
|
$self->{set_nc}->($self); |
| 4931 |
|
} |
| 4932 |
|
|
| 4933 |
|
redo A; |
| 4934 |
|
} elsif ($self->{nc} == 0x005D) { # ] |
| 4935 |
|
$self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE; |
| 4936 |
|
|
| 4937 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4938 |
|
$self->{line_prev} = $self->{line}; |
| 4939 |
|
$self->{column_prev} = $self->{column}; |
| 4940 |
|
$self->{column}++; |
| 4941 |
|
$self->{nc} |
| 4942 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4943 |
|
} else { |
| 4944 |
|
$self->{set_nc}->($self); |
| 4945 |
|
} |
| 4946 |
|
|
| 4947 |
|
redo A; |
| 4948 |
|
} elsif ($is_space->{$self->{nc}}) { |
| 4949 |
|
## Stay in the state. |
| 4950 |
|
|
| 4951 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4952 |
|
$self->{line_prev} = $self->{line}; |
| 4953 |
|
$self->{column_prev} = $self->{column}; |
| 4954 |
|
$self->{column}++; |
| 4955 |
|
$self->{nc} |
| 4956 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4957 |
|
} else { |
| 4958 |
|
$self->{set_nc}->($self); |
| 4959 |
|
} |
| 4960 |
|
|
| 4961 |
|
redo A; |
| 4962 |
|
} elsif ($self->{nc} == -1) { |
| 4963 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type |
| 4964 |
|
$self->{state} = DATA_STATE; |
| 4965 |
|
$self->{s_kwd} = ''; |
| 4966 |
|
## Reconsume. |
| 4967 |
|
return ($self->{ct}); # DOCTYPE |
| 4968 |
|
redo A; |
| 4969 |
|
} else { |
| 4970 |
|
unless ($self->{internal_subset_tainted}) { |
| 4971 |
|
## XML5: No parse error. |
| 4972 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset'); |
| 4973 |
|
$self->{internal_subset_tainted} = 1; |
| 4974 |
|
} |
| 4975 |
|
## Stay in the state. |
| 4976 |
|
|
| 4977 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4978 |
|
$self->{line_prev} = $self->{line}; |
| 4979 |
|
$self->{column_prev} = $self->{column}; |
| 4980 |
|
$self->{column}++; |
| 4981 |
|
$self->{nc} |
| 4982 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 4983 |
|
} else { |
| 4984 |
|
$self->{set_nc}->($self); |
| 4985 |
|
} |
| 4986 |
|
|
| 4987 |
|
redo A; |
| 4988 |
|
} |
| 4989 |
|
} elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) { |
| 4990 |
|
if ($self->{nc} == 0x003E) { # > |
| 4991 |
|
$self->{state} = DATA_STATE; |
| 4992 |
|
$self->{s_kwd} = ''; |
| 4993 |
|
|
| 4994 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 4995 |
|
$self->{line_prev} = $self->{line}; |
| 4996 |
|
$self->{column_prev} = $self->{column}; |
| 4997 |
|
$self->{column}++; |
| 4998 |
|
$self->{nc} |
| 4999 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 5000 |
|
} else { |
| 5001 |
|
$self->{set_nc}->($self); |
| 5002 |
|
} |
| 5003 |
|
|
| 5004 |
|
return ($self->{ct}); # DOCTYPE |
| 5005 |
|
redo A; |
| 5006 |
|
} elsif ($self->{nc} == -1) { |
| 5007 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); |
| 5008 |
|
$self->{state} = DATA_STATE; |
| 5009 |
|
$self->{s_kwd} = ''; |
| 5010 |
|
## Reconsume. |
| 5011 |
|
return ($self->{ct}); # DOCTYPE |
| 5012 |
|
redo A; |
| 5013 |
|
} else { |
| 5014 |
|
## XML5: No parse error and stay in the state. |
| 5015 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type |
| 5016 |
|
|
| 5017 |
|
$self->{state} = BOGUS_DOCTYPE_STATE; |
| 5018 |
|
|
| 5019 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 5020 |
|
$self->{line_prev} = $self->{line}; |
| 5021 |
|
$self->{column_prev} = $self->{column}; |
| 5022 |
|
$self->{column}++; |
| 5023 |
|
$self->{nc} |
| 5024 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 5025 |
|
} else { |
| 5026 |
|
$self->{set_nc}->($self); |
| 5027 |
|
} |
| 5028 |
|
|
| 5029 |
|
redo A; |
| 5030 |
|
} |
| 5031 |
|
|
| 5032 |
} else { |
} else { |
| 5033 |
die "$0: $self->{state}: Unknown state"; |
die "$0: $self->{state}: Unknown state"; |
| 5034 |
} |
} |