| 507 |
return ($token); |
return ($token); |
| 508 |
redo A; |
redo A; |
| 509 |
} elsif ($self->{state} == TAG_OPEN_STATE) { |
} elsif ($self->{state} == TAG_OPEN_STATE) { |
| 510 |
|
## XML5: "tag state". |
| 511 |
|
|
| 512 |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
| 513 |
if ($self->{nc} == 0x002F) { # / |
if ($self->{nc} == 0x002F) { # / |
| 514 |
|
|
| 711 |
## NOTE: The "close tag open state" in the spec is implemented as |
## NOTE: The "close tag open state" in the spec is implemented as |
| 712 |
## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|. |
## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|. |
| 713 |
|
|
| 714 |
|
## XML5: "end tag state". |
| 715 |
|
|
| 716 |
my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</" |
my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</" |
| 717 |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
| 718 |
if (defined $self->{last_stag_name}) { |
if (defined $self->{last_stag_name}) { |
| 774 |
|
|
| 775 |
redo A; |
redo A; |
| 776 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
|
|
|
| 777 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag', |
| 778 |
line => $self->{line_prev}, ## "<" in "</>" |
line => $self->{line_prev}, ## "<" in "</>" |
| 779 |
column => $self->{column_prev} - 1); |
column => $self->{column_prev} - 1); |
| 780 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 781 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 782 |
|
if ($self->{is_xml}) { |
| 783 |
|
|
| 784 |
|
## XML5: No parse error. |
| 785 |
|
|
| 786 |
|
## NOTE: This parser raises a parse error, since it supports |
| 787 |
|
## XML1, not XML5. |
| 788 |
|
|
| 789 |
|
## NOTE: A short end tag token. |
| 790 |
|
my $ct = {type => END_TAG_TOKEN, |
| 791 |
|
tag_name => '', |
| 792 |
|
line => $self->{line_prev}, |
| 793 |
|
column => $self->{column_prev} - 1, |
| 794 |
|
}; |
| 795 |
|
|
| 796 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 797 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 798 |
$self->{column_prev} = $self->{column}; |
$self->{column_prev} = $self->{column}; |
| 803 |
$self->{set_nc}->($self); |
$self->{set_nc}->($self); |
| 804 |
} |
} |
| 805 |
|
|
| 806 |
|
return ($ct); |
| 807 |
|
} else { |
| 808 |
|
|
| 809 |
|
|
| 810 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 811 |
|
$self->{line_prev} = $self->{line}; |
| 812 |
|
$self->{column_prev} = $self->{column}; |
| 813 |
|
$self->{column}++; |
| 814 |
|
$self->{nc} |
| 815 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 816 |
|
} else { |
| 817 |
|
$self->{set_nc}->($self); |
| 818 |
|
} |
| 819 |
|
|
| 820 |
|
} |
| 821 |
redo A; |
redo A; |
| 822 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
| 823 |
|
|
| 831 |
}); |
}); |
| 832 |
|
|
| 833 |
redo A; |
redo A; |
| 834 |
} else { |
} elsif (not $self->{is_xml} or |
| 835 |
|
$is_space->{$self->{nc}}) { |
| 836 |
|
|
| 837 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag'); |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag', |
| 838 |
|
line => $self->{line_prev}, # "<" of "</" |
| 839 |
|
column => $self->{column_prev} - 1); |
| 840 |
$self->{state} = BOGUS_COMMENT_STATE; |
$self->{state} = BOGUS_COMMENT_STATE; |
| 841 |
$self->{ct} = {type => COMMENT_TOKEN, data => '', |
$self->{ct} = {type => COMMENT_TOKEN, data => '', |
| 842 |
line => $self->{line_prev}, # "<" of "</" |
line => $self->{line_prev}, # "<" of "</" |
| 849 |
## generated from the bogus end tag, as defined in the |
## generated from the bogus end tag, as defined in the |
| 850 |
## "bogus comment state" entry. |
## "bogus comment state" entry. |
| 851 |
redo A; |
redo A; |
| 852 |
|
} else { |
| 853 |
|
## XML5: "</:" is a parse error. |
| 854 |
|
|
| 855 |
|
$self->{ct} = {type => END_TAG_TOKEN, |
| 856 |
|
tag_name => chr ($self->{nc}), |
| 857 |
|
line => $l, column => $c}; |
| 858 |
|
$self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state". |
| 859 |
|
|
| 860 |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 861 |
|
$self->{line_prev} = $self->{line}; |
| 862 |
|
$self->{column_prev} = $self->{column}; |
| 863 |
|
$self->{column}++; |
| 864 |
|
$self->{nc} |
| 865 |
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
| 866 |
|
} else { |
| 867 |
|
$self->{set_nc}->($self); |
| 868 |
|
} |
| 869 |
|
|
| 870 |
|
redo A; |
| 871 |
} |
} |
| 872 |
} elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) { |
} elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) { |
| 873 |
my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1; |
my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1; |
| 2205 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2206 |
column => $self->{column_prev} - 2, |
column => $self->{column_prev} - 2, |
| 2207 |
}; |
}; |
| 2208 |
$self->{state} = COMMENT_START_STATE; |
$self->{state} = COMMENT_START_STATE; ## XML5: "comment state". |
| 2209 |
|
|
| 2210 |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
| 2211 |
$self->{line_prev} = $self->{line}; |
$self->{line_prev} = $self->{line}; |
| 2268 |
} elsif ((length $self->{s_kwd}) == 6 and |
} elsif ((length $self->{s_kwd}) == 6 and |
| 2269 |
($self->{nc} == 0x0045 or # E |
($self->{nc} == 0x0045 or # E |
| 2270 |
$self->{nc} == 0x0065)) { # e |
$self->{nc} == 0x0065)) { # e |
| 2271 |
|
if ($self->{s_kwd} ne 'DOCTYP') { |
| 2272 |
|
|
| 2273 |
|
## XML5: case-sensitive. |
| 2274 |
|
$self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO |
| 2275 |
|
text => 'DOCTYPE', |
| 2276 |
|
line => $self->{line_prev}, |
| 2277 |
|
column => $self->{column_prev} - 5); |
| 2278 |
|
} else { |
| 2279 |
|
|
| 2280 |
|
} |
| 2281 |
$self->{state} = DOCTYPE_STATE; |
$self->{state} = DOCTYPE_STATE; |
| 2282 |
$self->{ct} = {type => DOCTYPE_TOKEN, |
$self->{ct} = {type => DOCTYPE_TOKEN, |
| 2283 |
quirks => 1, |
quirks => 1, |
| 2554 |
redo A; |
redo A; |
| 2555 |
} |
} |
| 2556 |
} elsif ($self->{state} == COMMENT_END_DASH_STATE) { |
} elsif ($self->{state} == COMMENT_END_DASH_STATE) { |
| 2557 |
|
## XML5: "comment dash state". |
| 2558 |
|
|
| 2559 |
if ($self->{nc} == 0x002D) { # - |
if ($self->{nc} == 0x002D) { # - |
| 2560 |
|
|
| 2561 |
$self->{state} = COMMENT_END_STATE; |
$self->{state} = COMMENT_END_STATE; |
| 2621 |
redo A; |
redo A; |
| 2622 |
} elsif ($self->{nc} == 0x002D) { # - |
} elsif ($self->{nc} == 0x002D) { # - |
| 2623 |
|
|
| 2624 |
|
## XML5: Not a parse error. |
| 2625 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment', |
| 2626 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2627 |
column => $self->{column_prev}); |
column => $self->{column_prev}); |
| 2651 |
redo A; |
redo A; |
| 2652 |
} else { |
} else { |
| 2653 |
|
|
| 2654 |
|
## XML5: Not a parse error. |
| 2655 |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment', |
$self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment', |
| 2656 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
| 2657 |
column => $self->{column_prev}); |
column => $self->{column_prev}); |
| 3737 |
## NOTE: "CDATA section state" in the spec is jointly implemented |
## NOTE: "CDATA section state" in the spec is jointly implemented |
| 3738 |
## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|, |
## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|, |
| 3739 |
## and |CDATA_SECTION_MSE2_STATE|. |
## and |CDATA_SECTION_MSE2_STATE|. |
| 3740 |
|
|
| 3741 |
|
## XML5: "CDATA state". |
| 3742 |
|
|
| 3743 |
if ($self->{nc} == 0x005D) { # ] |
if ($self->{nc} == 0x005D) { # ] |
| 3744 |
|
|
| 3765 |
|
|
| 3766 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3767 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 3768 |
|
## Reconsume. |
|
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { |
|
|
$self->{line_prev} = $self->{line}; |
|
|
$self->{column_prev} = $self->{column}; |
|
|
$self->{column}++; |
|
|
$self->{nc} |
|
|
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); |
|
|
} else { |
|
|
$self->{set_nc}->($self); |
|
|
} |
|
|
|
|
| 3769 |
if (length $self->{ct}->{data}) { # character |
if (length $self->{ct}->{data}) { # character |
| 3770 |
|
|
| 3771 |
return ($self->{ct}); # character |
return ($self->{ct}); # character |
| 3798 |
|
|
| 3799 |
## ISSUE: "text tokens" in spec. |
## ISSUE: "text tokens" in spec. |
| 3800 |
} elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) { |
} elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) { |
| 3801 |
|
## XML5: "CDATA bracket state". |
| 3802 |
|
|
| 3803 |
if ($self->{nc} == 0x005D) { # ] |
if ($self->{nc} == 0x005D) { # ] |
| 3804 |
|
|
| 3805 |
$self->{state} = CDATA_SECTION_MSE2_STATE; |
$self->{state} = CDATA_SECTION_MSE2_STATE; |
| 3817 |
redo A; |
redo A; |
| 3818 |
} else { |
} else { |
| 3819 |
|
|
| 3820 |
|
## XML5: If EOF, "]" is not appended and changed to the data state. |
| 3821 |
$self->{ct}->{data} .= ']'; |
$self->{ct}->{data} .= ']'; |
| 3822 |
$self->{state} = CDATA_SECTION_STATE; |
$self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state. |
| 3823 |
## Reconsume. |
## Reconsume. |
| 3824 |
redo A; |
redo A; |
| 3825 |
} |
} |
| 3826 |
} elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) { |
} elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) { |
| 3827 |
|
## XML5: "CDATA end state". |
| 3828 |
|
|
| 3829 |
if ($self->{nc} == 0x003E) { # > |
if ($self->{nc} == 0x003E) { # > |
| 3830 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 3831 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
| 3868 |
|
|
| 3869 |
$self->{ct}->{data} .= ']]'; # character |
$self->{ct}->{data} .= ']]'; # character |
| 3870 |
$self->{state} = CDATA_SECTION_STATE; |
$self->{state} = CDATA_SECTION_STATE; |
| 3871 |
## Reconsume. |
## Reconsume. ## XML5: Emit. |
| 3872 |
redo A; |
redo A; |
| 3873 |
} |
} |
| 3874 |
} elsif ($self->{state} == ENTITY_STATE) { |
} elsif ($self->{state} == ENTITY_STATE) { |