114 |
## State constants, written as constant subs (empty prototype, literal
## return value).  Presumably these are tokenizer states like the PI_*
## constants compared against |$self->{state}| elsewhere in this file;
## their use is not visible in this excerpt.
## NOTE(review): each definition appears twice here because the old and
## new columns of the diff are identical for these rows.
sub ENTITY_NAME_STATE () { 49 } |
sub ENTITY_NAME_STATE () { 49 } |
115 |
sub PCDATA_STATE () { 50 } # "data state" in the spec |
sub PCDATA_STATE () { 50 } # "data state" in the spec |
116 |
|
|
117 |
|
## XML states |
118 |
|
## Processing instruction (PI) tokenizer states.  Each constant is compared
## against |$self->{state}| in the tokenizer's state dispatch.

## Entered from the tag open state on "<?" when |$self->{is_xml}| is true.
## A space, "?" or EOF here is a parse error ('bare pio') and switches to
## the bogus comment state; any other character starts a PI_TOKEN whose
## target begins with that character.
sub PI_STATE () { 51 } |
119 |
|
## Accumulates the PI target.  Space -> PI_TARGET_AFTER_STATE,
## "?" -> PI_AFTER_STATE, EOF -> parse error ('no pic') and the token is
## emitted.
sub PI_TARGET_STATE () { 52 } |
120 |
|
## Skips white space following the target; the first non-space character
## is reprocessed in PI_DATA_STATE.
sub PI_TARGET_AFTER_STATE () { 53 } |
121 |
|
## Accumulates the PI data.  "?" -> PI_DATA_AFTER_STATE, EOF -> parse
## error ('no pic') and the token is emitted.
sub PI_DATA_STATE () { 54 } |
122 |
|
## Reached when "?" immediately follows the target.  ">" emits the PI;
## anything else is a parse error ('no s after target') and "?" is
## appended to the data.
sub PI_AFTER_STATE () { 55 } |
123 |
|
## Reached when "?" is seen inside the PI data.  ">" emits the PI; another
## "?" appends "?" and stays in this state; anything else appends "?" and
## is reprocessed in PI_DATA_STATE.
sub PI_DATA_AFTER_STATE () { 56 } |
124 |
|
|
125 |
## Tree constructor state constants (see Whatpm::HTML for the full |
## Tree constructor state constants (see Whatpm::HTML for the full |
126 |
## list and descriptions) |
## list and descriptions) |
127 |
|
|
217 |
## ->{value} |
## ->{value} |
218 |
## ->{has_reference} == 1 or 0 |
## ->{has_reference} == 1 or 0 |
219 |
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN) |
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN) |
220 |
|
## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN) |
221 |
## NOTE: The "self-closing flag" is held as |$self->{self_closing}|. |
## NOTE: The "self-closing flag" is held as |$self->{self_closing}|. |
222 |
## |->{self_closing}| is used to save the value of |$self->{self_closing}| |
## |->{self_closing}| is used to save the value of |$self->{self_closing}| |
223 |
## while the token is pushed back to the stack. |
## while the token is pushed back to the stack. |
508 |
|
|
509 |
redo A; |
redo A; |
510 |
} elsif ($self->{nc} == 0x003F) { # ? |
} elsif ($self->{nc} == 0x003F) { # ? |
511 |
!!!cp (22); |
if ($self->{is_xml}) { |
512 |
!!!parse-error (type => 'pio', |
!!!cp (22.1); |
513 |
line => $self->{line_prev}, |
$self->{state} = PI_STATE; |
514 |
column => $self->{column_prev}); |
!!!next-input-character; |
515 |
$self->{state} = BOGUS_COMMENT_STATE; |
redo A; |
516 |
$self->{ct} = {type => COMMENT_TOKEN, data => '', |
} else { |
517 |
line => $self->{line_prev}, |
!!!cp (22); |
518 |
column => $self->{column_prev}, |
!!!parse-error (type => 'pio', |
519 |
}; |
line => $self->{line_prev}, |
520 |
## $self->{nc} is intentionally left as is |
column => $self->{column_prev}); |
521 |
redo A; |
$self->{state} = BOGUS_COMMENT_STATE; |
522 |
} else { |
$self->{ct} = {type => COMMENT_TOKEN, data => '', |
523 |
|
line => $self->{line_prev}, |
524 |
|
column => $self->{column_prev}, |
525 |
|
}; |
526 |
|
## $self->{nc} is intentionally left as is |
527 |
|
redo A; |
528 |
|
} |
529 |
|
} elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) { |
530 |
!!!cp (23); |
!!!cp (23); |
531 |
!!!parse-error (type => 'bare stago', |
!!!parse-error (type => 'bare stago', |
532 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
541 |
}); |
}); |
542 |
|
|
543 |
redo A; |
redo A; |
544 |
|
} else { |
545 |
|
## XML5: "<:" is a parse error. |
546 |
|
!!!cp (23.1); |
547 |
|
$self->{ct} = {type => START_TAG_TOKEN, |
548 |
|
tag_name => chr ($self->{nc}), |
549 |
|
line => $self->{line_prev}, |
550 |
|
column => $self->{column_prev}}; |
551 |
|
$self->{state} = TAG_NAME_STATE; |
552 |
|
!!!next-input-character; |
553 |
|
redo A; |
554 |
} |
} |
555 |
} else { |
} else { |
556 |
die "$0: $self->{content_model} in tag open"; |
die "$0: $self->{content_model} in tag open"; |
1583 |
redo A; |
redo A; |
1584 |
} elsif ($self->{s_kwd} eq '[CDATA' and |
} elsif ($self->{s_kwd} eq '[CDATA' and |
1585 |
$self->{nc} == 0x005B) { # [ |
$self->{nc} == 0x005B) { # [ |
1586 |
!!!cp (135.2); |
if ($self->{is_xml} and |
1587 |
|
not $self->{tainted} and |
1588 |
|
@{$self->{open_elements} or []} == 0) { |
1589 |
|
!!!cp (135.2); |
1590 |
|
!!!parse-error (type => 'cdata outside of root element', |
1591 |
|
line => $self->{line_prev}, |
1592 |
|
column => $self->{column_prev} - 7); |
1593 |
|
$self->{tainted} = 1; |
1594 |
|
} else { |
1595 |
|
!!!cp (135.21); |
1596 |
|
} |
1597 |
|
|
1598 |
$self->{ct} = {type => CHARACTER_TOKEN, |
$self->{ct} = {type => CHARACTER_TOKEN, |
1599 |
data => '', |
data => '', |
1600 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
2368 |
!!!next-input-character; |
!!!next-input-character; |
2369 |
redo A; |
redo A; |
2370 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
2371 |
|
if ($self->{is_xml}) { |
2372 |
|
!!!cp (221.11); |
2373 |
|
!!!parse-error (type => 'no mse'); ## TODO: type |
2374 |
|
} else { |
2375 |
|
!!!cp (221.12); |
2376 |
|
} |
2377 |
|
|
2378 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
2379 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
2380 |
!!!next-input-character; |
!!!next-input-character; |
2586 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
2587 |
## Reconsume. |
## Reconsume. |
2588 |
!!!emit ({type => CHARACTER_TOKEN, data => chr $code, |
!!!emit ({type => CHARACTER_TOKEN, data => chr $code, |
2589 |
|
has_reference => 1, |
2590 |
line => $l, column => $c, |
line => $l, column => $c, |
2591 |
}); |
}); |
2592 |
redo A; |
redo A; |
2699 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
2700 |
## Reconsume. |
## Reconsume. |
2701 |
!!!emit ({type => CHARACTER_TOKEN, data => chr $code, |
!!!emit ({type => CHARACTER_TOKEN, data => chr $code, |
2702 |
|
has_reference => 1, |
2703 |
line => $l, column => $c, |
line => $l, column => $c, |
2704 |
}); |
}); |
2705 |
redo A; |
redo A; |
2795 |
## Reconsume. |
## Reconsume. |
2796 |
!!!emit ({type => CHARACTER_TOKEN, |
!!!emit ({type => CHARACTER_TOKEN, |
2797 |
data => $data, |
data => $data, |
2798 |
|
has_reference => $has_ref, |
2799 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
2800 |
column => $self->{column_prev} + 1 - length $self->{s_kwd}, |
column => $self->{column_prev} + 1 - length $self->{s_kwd}, |
2801 |
}); |
}); |
2809 |
## Reconsume. |
## Reconsume. |
2810 |
redo A; |
redo A; |
2811 |
} |
} |
2812 |
|
|
2813 |
|
## XML-only states |
2814 |
|
|
2815 |
|
} elsif ($self->{state} == PI_STATE) { |
2816 |
|
if ($is_space->{$self->{nc}} or |
2817 |
|
$self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else" |
2818 |
|
$self->{nc} == -1) { |
2819 |
|
!!!parse-error (type => 'bare pio', ## TODO: type |
2820 |
|
line => $self->{line_prev}, |
2821 |
|
column => $self->{column_prev} |
2822 |
|
- 1 * ($self->{nc} != -1)); |
2823 |
|
$self->{state} = BOGUS_COMMENT_STATE; |
2824 |
|
## Reconsume. |
2825 |
|
$self->{ct} = {type => COMMENT_TOKEN, |
2826 |
|
data => '?', |
2827 |
|
line => $self->{line_prev}, |
2828 |
|
column => $self->{column_prev} |
2829 |
|
- 1 * ($self->{nc} != -1), |
2830 |
|
}; |
2831 |
|
redo A; |
2832 |
|
} else { |
2833 |
|
$self->{ct} = {type => PI_TOKEN, |
2834 |
|
target => chr $self->{nc}, |
2835 |
|
data => '', |
2836 |
|
line => $self->{line_prev}, |
2837 |
|
column => $self->{column_prev} - 1, |
2838 |
|
}; |
2839 |
|
$self->{state} = PI_TARGET_STATE; |
2840 |
|
!!!next-input-character; |
2841 |
|
redo A; |
2842 |
|
} |
2843 |
|
} elsif ($self->{state} == PI_TARGET_STATE) { |
2844 |
|
if ($is_space->{$self->{nc}}) { |
2845 |
|
$self->{state} = PI_TARGET_AFTER_STATE; |
2846 |
|
!!!next-input-character; |
2847 |
|
redo A; |
2848 |
|
} elsif ($self->{nc} == -1) { |
2849 |
|
!!!parse-error (type => 'no pic'); ## TODO: type |
2850 |
|
$self->{state} = DATA_STATE; |
2851 |
|
$self->{s_kwd} = ''; |
2852 |
|
## Reconsume. |
2853 |
|
!!!emit ($self->{ct}); # pi |
2854 |
|
redo A; |
2855 |
|
} elsif ($self->{nc} == 0x003F) { # ? |
2856 |
|
$self->{state} = PI_AFTER_STATE; |
2857 |
|
!!!next-input-character; |
2858 |
|
redo A; |
2859 |
|
} else { |
2860 |
|
## XML5: typo ("tag name" -> "target") |
2861 |
|
$self->{ct}->{target} .= chr $self->{nc}; # pi |
2862 |
|
!!!next-input-character; |
2863 |
|
redo A; |
2864 |
|
} |
2865 |
|
} elsif ($self->{state} == PI_TARGET_AFTER_STATE) { |
2866 |
|
if ($is_space->{$self->{nc}}) { |
2867 |
|
## Stay in the state. |
2868 |
|
!!!next-input-character; |
2869 |
|
redo A; |
2870 |
|
} else { |
2871 |
|
$self->{state} = PI_DATA_STATE; |
2872 |
|
## Reprocess. |
2873 |
|
redo A; |
2874 |
|
} |
2875 |
|
} elsif ($self->{state} == PI_DATA_STATE) { |
2876 |
|
if ($self->{nc} == 0x003F) { # ? |
2877 |
|
$self->{state} = PI_DATA_AFTER_STATE; |
2878 |
|
!!!next-input-character; |
2879 |
|
redo A; |
2880 |
|
} elsif ($self->{nc} == -1) { |
2881 |
|
!!!parse-error (type => 'no pic'); ## TODO: type |
2882 |
|
$self->{state} = DATA_STATE; |
2883 |
|
$self->{s_kwd} = ''; |
2884 |
|
## Reprocess. |
2885 |
|
!!!emit ($self->{ct}); # pi |
2886 |
|
redo A; |
2887 |
|
} else { |
2888 |
|
$self->{ct}->{data} .= chr $self->{nc}; # pi |
2889 |
|
$self->{read_until}->($self->{ct}->{data}, q[?], |
2890 |
|
length $self->{ct}->{data}); |
2891 |
|
## Stay in the state. |
2892 |
|
!!!next-input-character; |
2893 |
|
## Reprocess. |
2894 |
|
redo A; |
2895 |
|
} |
2896 |
|
} elsif ($self->{state} == PI_AFTER_STATE) { |
2897 |
|
if ($self->{nc} == 0x003E) { # > |
2898 |
|
$self->{state} = DATA_STATE; |
2899 |
|
$self->{s_kwd} = ''; |
2900 |
|
!!!next-input-character; |
2901 |
|
!!!emit ($self->{ct}); # pi |
2902 |
|
redo A; |
2903 |
|
} elsif ($self->{nc} == 0x003F) { # ? |
2904 |
|
!!!parse-error (type => 'no s after target', ## TODO: type |
2905 |
|
line => $self->{line_prev}, |
2906 |
|
column => $self->{column_prev}); ## XML5: no error |
2907 |
|
$self->{ct}->{data} .= '?'; |
2908 |
|
$self->{state} = PI_DATA_AFTER_STATE; |
2909 |
|
!!!next-input-character; |
2910 |
|
redo A; |
2911 |
|
} else { |
2912 |
|
!!!parse-error (type => 'no s after target', ## TODO: type |
2913 |
|
line => $self->{line_prev}, |
2914 |
|
column => $self->{column_prev} |
2915 |
|
+ 1 * ($self->{nc} == -1)); ## XML5: no error |
2916 |
|
$self->{ct}->{data} .= '?'; ## XML5: not appended |
2917 |
|
$self->{state} = PI_DATA_STATE; |
2918 |
|
## Reprocess. |
2919 |
|
redo A; |
2920 |
|
} |
2921 |
|
} elsif ($self->{state} == PI_DATA_AFTER_STATE) { |
2922 |
|
## XML5: Same as "pi after state" in XML5 |
2923 |
|
if ($self->{nc} == 0x003E) { # > |
2924 |
|
$self->{state} = DATA_STATE; |
2925 |
|
$self->{s_kwd} = ''; |
2926 |
|
!!!next-input-character; |
2927 |
|
!!!emit ($self->{ct}); # pi |
2928 |
|
redo A; |
2929 |
|
} elsif ($self->{nc} == 0x003F) { # ? |
2930 |
|
$self->{ct}->{data} .= '?'; |
2931 |
|
## Stay in the state. |
2932 |
|
!!!next-input-character; |
2933 |
|
redo A; |
2934 |
|
} else { |
2935 |
|
$self->{ct}->{data} .= '?'; ## XML5: not appended |
2936 |
|
$self->{state} = PI_DATA_STATE; |
2937 |
|
## Reprocess. |
2938 |
|
redo A; |
2939 |
|
} |
2940 |
|
|
2941 |
} else { |
} else { |
2942 |
die "$0: $self->{state}: Unknown state"; |
die "$0: $self->{state}: Unknown state"; |
2943 |
} |
} |