2 |
use strict; |
use strict; |
3 |
our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
4 |
|
|
5 |
|
## Compile-time Exporter setup for this package: loads Exporter, appends
## it to @ISA, and declares the token-type constant names as optional
## exports.  Nothing is exported by default; callers either request
## names individually (@EXPORT_OK) or import the whole set through the
## ':token' tag (%EXPORT_TAGS).  Both lists carry the same eight names.
BEGIN { |
6 |
|
require Exporter; |
7 |
|
push our @ISA, 'Exporter'; |
8 |
|
|
9 |
|
our @EXPORT_OK = qw( |
10 |
|
DOCTYPE_TOKEN |
11 |
|
COMMENT_TOKEN |
12 |
|
START_TAG_TOKEN |
13 |
|
END_TAG_TOKEN |
14 |
|
END_OF_FILE_TOKEN |
15 |
|
CHARACTER_TOKEN |
16 |
|
PI_TOKEN |
17 |
|
ABORT_TOKEN |
18 |
|
); |
19 |
|
|
20 |
|
our %EXPORT_TAGS = ( |
21 |
|
token => [qw( |
22 |
|
DOCTYPE_TOKEN |
23 |
|
COMMENT_TOKEN |
24 |
|
START_TAG_TOKEN |
25 |
|
END_TAG_TOKEN |
26 |
|
END_OF_FILE_TOKEN |
27 |
|
CHARACTER_TOKEN |
28 |
|
PI_TOKEN |
29 |
|
ABORT_TOKEN |
30 |
|
)], |
31 |
|
); |
32 |
|
} |
33 |
|
|
34 |
|
## Token types |
## Integer codes (1..8) identifying the kind of a token.  Each is a
## sub with an empty prototype and a constant body, so it can be used
## as a bareword constant.  These are the names listed in @EXPORT_OK
## and in the ':token' export tag of this package.
35 |
|
|
36 |
|
sub DOCTYPE_TOKEN () { 1 } |
37 |
|
sub COMMENT_TOKEN () { 2 } |
38 |
|
sub START_TAG_TOKEN () { 3 } |
39 |
|
sub END_TAG_TOKEN () { 4 } |
40 |
|
sub END_OF_FILE_TOKEN () { 5 } |
41 |
|
sub CHARACTER_TOKEN () { 6 } |
42 |
|
sub PI_TOKEN () { 7 } # XML5 |
43 |
|
sub ABORT_TOKEN () { 8 } # Not a token actually |
44 |
|
|
45 |
package Whatpm::HTML; |
package Whatpm::HTML; |
46 |
|
|
47 |
|
BEGIN { Whatpm::HTML::Tokenizer->import (':token') } |
48 |
|
|
49 |
## Content model flags |
## Content model flags |
50 |
|
|
51 |
sub CM_ENTITY () { 0b001 } # & markup in data |
sub CM_ENTITY () { 0b001 } # & markup in data |
114 |
sub ENTITY_NAME_STATE () { 49 } |
sub ENTITY_NAME_STATE () { 49 } |
115 |
sub PCDATA_STATE () { 50 } # "data state" in the spec |
sub PCDATA_STATE () { 50 } # "data state" in the spec |
116 |
|
|
117 |
## Token types |
## XML states |
118 |
|
sub PI_STATE () { 51 } |
119 |
sub DOCTYPE_TOKEN () { 1 } |
sub PI_TARGET_STATE () { 52 } |
120 |
sub COMMENT_TOKEN () { 2 } |
sub PI_TARGET_AFTER_STATE () { 53 } |
121 |
sub START_TAG_TOKEN () { 3 } |
sub PI_DATA_STATE () { 54 } |
122 |
sub END_TAG_TOKEN () { 4 } |
sub PI_AFTER_STATE () { 55 } |
123 |
sub END_OF_FILE_TOKEN () { 5 } |
sub PI_DATA_AFTER_STATE () { 56 } |
|
sub CHARACTER_TOKEN () { 6 } |
|
124 |
|
|
125 |
## Tree constructor state constants (see Whatpm::HTML for the full |
## Tree constructor state constants (see Whatpm::HTML for the full |
126 |
## list and descriptions) |
## list and descriptions) |
183 |
#$self->{level} |
#$self->{level} |
184 |
#$self->{set_nc} |
#$self->{set_nc} |
185 |
#$self->{parse_error} |
#$self->{parse_error} |
186 |
|
#$self->{is_xml} (if XML) |
187 |
|
|
188 |
$self->{state} = DATA_STATE; # MUST |
$self->{state} = DATA_STATE; # MUST |
189 |
#$self->{s_kwd}; # state keyword - initialized when used |
$self->{s_kwd} = ''; # state keyword |
190 |
#$self->{entity__value}; # initialized when used |
#$self->{entity__value}; # initialized when used |
191 |
#$self->{entity__match}; # initialized when used |
#$self->{entity__match}; # initialized when used |
192 |
$self->{content_model} = PCDATA_CONTENT_MODEL; # be |
$self->{content_model} = PCDATA_CONTENT_MODEL; # be |
217 |
## ->{value} |
## ->{value} |
218 |
## ->{has_reference} == 1 or 0 |
## ->{has_reference} == 1 or 0 |
219 |
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN) |
## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN) |
220 |
|
## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN) |
221 |
## NOTE: The "self-closing flag" is held as |$self->{self_closing}|. |
## NOTE: The "self-closing flag" is held as |$self->{self_closing}|. |
222 |
## |->{self_closing}| is used to save the value of |$self->{self_closing}| |
## |->{self_closing}| is used to save the value of |$self->{self_closing}| |
223 |
## while the token is pushed back to the stack. |
## while the token is pushed back to the stack. |
321 |
} |
} |
322 |
} elsif ($self->{nc} == 0x002D) { # - |
} elsif ($self->{nc} == 0x002D) { # - |
323 |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
324 |
$self->{s_kwd} .= '-'; |
if ($self->{s_kwd} eq '<!-') { |
|
|
|
|
if ($self->{s_kwd} eq '<!--') { |
|
325 |
!!!cp (3); |
!!!cp (3); |
326 |
$self->{escape} = 1; # unless $self->{escape}; |
$self->{escape} = 1; # unless $self->{escape}; |
327 |
$self->{s_kwd} = '--'; |
$self->{s_kwd} = '--'; |
328 |
# |
# |
329 |
} elsif ($self->{s_kwd} eq '---') { |
} elsif ($self->{s_kwd} eq '-') { |
330 |
!!!cp (4); |
!!!cp (4); |
331 |
$self->{s_kwd} = '--'; |
$self->{s_kwd} = '--'; |
332 |
# |
# |
333 |
|
} elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') { |
334 |
|
!!!cp (4.1); |
335 |
|
$self->{s_kwd} .= '-'; |
336 |
|
# |
337 |
} else { |
} else { |
338 |
!!!cp (5); |
!!!cp (5); |
339 |
|
$self->{s_kwd} = '-'; |
340 |
# |
# |
341 |
} |
} |
342 |
} |
} |
372 |
if ($self->{s_kwd} eq '--') { |
if ($self->{s_kwd} eq '--') { |
373 |
!!!cp (8); |
!!!cp (8); |
374 |
delete $self->{escape}; |
delete $self->{escape}; |
375 |
|
# |
376 |
} else { |
} else { |
377 |
!!!cp (9); |
!!!cp (9); |
378 |
|
# |
379 |
} |
} |
380 |
|
} elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') { |
381 |
|
!!!cp (9.1); |
382 |
|
!!!parse-error (type => 'unmatched mse', ## TODO: type |
383 |
|
line => $self->{line_prev}, |
384 |
|
column => $self->{column_prev} - 1); |
385 |
|
# |
386 |
} else { |
} else { |
387 |
!!!cp (10); |
!!!cp (10); |
388 |
|
# |
389 |
} |
} |
390 |
|
|
391 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
392 |
# |
# |
393 |
|
} elsif ($self->{nc} == 0x005D) { # ] |
394 |
|
if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') { |
395 |
|
!!!cp (10.1); |
396 |
|
$self->{s_kwd} .= ']'; |
397 |
|
} elsif ($self->{s_kwd} eq ']]') { |
398 |
|
!!!cp (10.2); |
399 |
|
# |
400 |
|
} else { |
401 |
|
!!!cp (10.3); |
402 |
|
$self->{s_kwd} = ''; |
403 |
|
} |
404 |
|
# |
405 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
406 |
!!!cp (11); |
!!!cp (11); |
407 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
419 |
data => chr $self->{nc}, |
data => chr $self->{nc}, |
420 |
line => $self->{line}, column => $self->{column}, |
line => $self->{line}, column => $self->{column}, |
421 |
}; |
}; |
422 |
if ($self->{read_until}->($token->{data}, q[-!<>&], |
if ($self->{read_until}->($token->{data}, q{-!<>&\]}, |
423 |
length $token->{data})) { |
length $token->{data})) { |
424 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
425 |
} |
} |
426 |
|
|
427 |
## Stay in the data state. |
## Stay in the data state. |
428 |
if ($self->{content_model} == PCDATA_CONTENT_MODEL) { |
if (not $self->{is_xml} and |
429 |
|
$self->{content_model} == PCDATA_CONTENT_MODEL) { |
430 |
!!!cp (13); |
!!!cp (13); |
431 |
$self->{state} = PCDATA_STATE; |
$self->{state} = PCDATA_STATE; |
432 |
} else { |
} else { |
454 |
|
|
455 |
## reconsume |
## reconsume |
456 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
457 |
|
$self->{s_kwd} = ''; |
458 |
!!!emit ({type => CHARACTER_TOKEN, data => '<', |
!!!emit ({type => CHARACTER_TOKEN, data => '<', |
459 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
460 |
column => $self->{column_prev}, |
column => $self->{column_prev}, |
476 |
!!!cp (19); |
!!!cp (19); |
477 |
$self->{ct} |
$self->{ct} |
478 |
= {type => START_TAG_TOKEN, |
= {type => START_TAG_TOKEN, |
479 |
tag_name => chr ($self->{nc} + 0x0020), |
tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)), |
480 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
481 |
column => $self->{column_prev}}; |
column => $self->{column_prev}}; |
482 |
$self->{state} = TAG_NAME_STATE; |
$self->{state} = TAG_NAME_STATE; |
498 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
499 |
column => $self->{column_prev}); |
column => $self->{column_prev}); |
500 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
501 |
|
$self->{s_kwd} = ''; |
502 |
!!!next-input-character; |
!!!next-input-character; |
503 |
|
|
504 |
!!!emit ({type => CHARACTER_TOKEN, data => '<>', |
!!!emit ({type => CHARACTER_TOKEN, data => '<>', |
508 |
|
|
509 |
redo A; |
redo A; |
510 |
} elsif ($self->{nc} == 0x003F) { # ? |
} elsif ($self->{nc} == 0x003F) { # ? |
511 |
!!!cp (22); |
if ($self->{is_xml}) { |
512 |
!!!parse-error (type => 'pio', |
!!!cp (22.1); |
513 |
line => $self->{line_prev}, |
$self->{state} = PI_STATE; |
514 |
column => $self->{column_prev}); |
!!!next-input-character; |
515 |
$self->{state} = BOGUS_COMMENT_STATE; |
redo A; |
516 |
$self->{ct} = {type => COMMENT_TOKEN, data => '', |
} else { |
517 |
line => $self->{line_prev}, |
!!!cp (22); |
518 |
column => $self->{column_prev}, |
!!!parse-error (type => 'pio', |
519 |
}; |
line => $self->{line_prev}, |
520 |
## $self->{nc} is intentionally left as is |
column => $self->{column_prev}); |
521 |
redo A; |
$self->{state} = BOGUS_COMMENT_STATE; |
522 |
|
$self->{ct} = {type => COMMENT_TOKEN, data => '', |
523 |
|
line => $self->{line_prev}, |
524 |
|
column => $self->{column_prev}, |
525 |
|
}; |
526 |
|
## $self->{nc} is intentionally left as is |
527 |
|
redo A; |
528 |
|
} |
529 |
} else { |
} else { |
530 |
!!!cp (23); |
!!!cp (23); |
531 |
!!!parse-error (type => 'bare stago', |
!!!parse-error (type => 'bare stago', |
532 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
533 |
column => $self->{column_prev}); |
column => $self->{column_prev}); |
534 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
535 |
|
$self->{s_kwd} = ''; |
536 |
## reconsume |
## reconsume |
537 |
|
|
538 |
!!!emit ({type => CHARACTER_TOKEN, data => '<', |
!!!emit ({type => CHARACTER_TOKEN, data => '<', |
561 |
## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>. |
## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>. |
562 |
!!!cp (28); |
!!!cp (28); |
563 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
564 |
|
$self->{s_kwd} = ''; |
565 |
## Reconsume. |
## Reconsume. |
566 |
!!!emit ({type => CHARACTER_TOKEN, data => '</', |
!!!emit ({type => CHARACTER_TOKEN, data => '</', |
567 |
line => $l, column => $c, |
line => $l, column => $c, |
575 |
!!!cp (29); |
!!!cp (29); |
576 |
$self->{ct} |
$self->{ct} |
577 |
= {type => END_TAG_TOKEN, |
= {type => END_TAG_TOKEN, |
578 |
tag_name => chr ($self->{nc} + 0x0020), |
tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)), |
579 |
line => $l, column => $c}; |
line => $l, column => $c}; |
580 |
$self->{state} = TAG_NAME_STATE; |
$self->{state} = TAG_NAME_STATE; |
581 |
!!!next-input-character; |
!!!next-input-character; |
595 |
line => $self->{line_prev}, ## "<" in "</>" |
line => $self->{line_prev}, ## "<" in "</>" |
596 |
column => $self->{column_prev} - 1); |
column => $self->{column_prev} - 1); |
597 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
598 |
|
$self->{s_kwd} = ''; |
599 |
!!!next-input-character; |
!!!next-input-character; |
600 |
redo A; |
redo A; |
601 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
602 |
!!!cp (32); |
!!!cp (32); |
603 |
!!!parse-error (type => 'bare etago'); |
!!!parse-error (type => 'bare etago'); |
604 |
|
$self->{s_kwd} = ''; |
605 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
606 |
# reconsume |
# reconsume |
607 |
|
|
641 |
} else { |
} else { |
642 |
!!!cp (25); |
!!!cp (25); |
643 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
644 |
|
$self->{s_kwd} = ''; |
645 |
## Reconsume. |
## Reconsume. |
646 |
!!!emit ({type => CHARACTER_TOKEN, |
!!!emit ({type => CHARACTER_TOKEN, |
647 |
data => '</' . $self->{s_kwd}, |
data => '</' . $self->{s_kwd}, |
660 |
!!!cp (26); |
!!!cp (26); |
661 |
## Reconsume. |
## Reconsume. |
662 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
663 |
|
$self->{s_kwd} = ''; |
664 |
!!!emit ({type => CHARACTER_TOKEN, |
!!!emit ({type => CHARACTER_TOKEN, |
665 |
data => '</' . $self->{s_kwd}, |
data => '</' . $self->{s_kwd}, |
666 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
702 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
703 |
} |
} |
704 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
705 |
|
$self->{s_kwd} = ''; |
706 |
!!!next-input-character; |
!!!next-input-character; |
707 |
|
|
708 |
!!!emit ($self->{ct}); # start tag or end tag |
!!!emit ($self->{ct}); # start tag or end tag |
711 |
} elsif (0x0041 <= $self->{nc} and |
} elsif (0x0041 <= $self->{nc} and |
712 |
$self->{nc} <= 0x005A) { # A..Z |
$self->{nc} <= 0x005A) { # A..Z |
713 |
!!!cp (38); |
!!!cp (38); |
714 |
$self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020); |
$self->{ct}->{tag_name} |
715 |
|
.= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)); |
716 |
# start tag or end tag |
# start tag or end tag |
717 |
## Stay in this state |
## Stay in this state |
718 |
!!!next-input-character; |
!!!next-input-character; |
735 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
736 |
} |
} |
737 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
738 |
|
$self->{s_kwd} = ''; |
739 |
# reconsume |
# reconsume |
740 |
|
|
741 |
!!!emit ($self->{ct}); # start tag or end tag |
!!!emit ($self->{ct}); # start tag or end tag |
776 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
777 |
} |
} |
778 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
779 |
|
$self->{s_kwd} = ''; |
780 |
!!!next-input-character; |
!!!next-input-character; |
781 |
|
|
782 |
!!!emit ($self->{ct}); # start tag or end tag |
!!!emit ($self->{ct}); # start tag or end tag |
786 |
$self->{nc} <= 0x005A) { # A..Z |
$self->{nc} <= 0x005A) { # A..Z |
787 |
!!!cp (49); |
!!!cp (49); |
788 |
$self->{ca} |
$self->{ca} |
789 |
= {name => chr ($self->{nc} + 0x0020), |
= {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)), |
790 |
value => '', |
value => '', |
791 |
line => $self->{line}, column => $self->{column}}; |
line => $self->{line}, column => $self->{column}}; |
792 |
$self->{state} = ATTRIBUTE_NAME_STATE; |
$self->{state} = ATTRIBUTE_NAME_STATE; |
814 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
815 |
} |
} |
816 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
817 |
|
$self->{s_kwd} = ''; |
818 |
# reconsume |
# reconsume |
819 |
|
|
820 |
!!!emit ($self->{ct}); # start tag or end tag |
!!!emit ($self->{ct}); # start tag or end tag |
880 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
881 |
} |
} |
882 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
883 |
|
$self->{s_kwd} = ''; |
884 |
!!!next-input-character; |
!!!next-input-character; |
885 |
|
|
886 |
!!!emit ($self->{ct}); # start tag or end tag |
!!!emit ($self->{ct}); # start tag or end tag |
889 |
} elsif (0x0041 <= $self->{nc} and |
} elsif (0x0041 <= $self->{nc} and |
890 |
$self->{nc} <= 0x005A) { # A..Z |
$self->{nc} <= 0x005A) { # A..Z |
891 |
!!!cp (63); |
!!!cp (63); |
892 |
$self->{ca}->{name} .= chr ($self->{nc} + 0x0020); |
$self->{ca}->{name} |
893 |
|
.= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)); |
894 |
## Stay in the state |
## Stay in the state |
895 |
!!!next-input-character; |
!!!next-input-character; |
896 |
redo A; |
redo A; |
919 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
920 |
} |
} |
921 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
922 |
|
$self->{s_kwd} = ''; |
923 |
# reconsume |
# reconsume |
924 |
|
|
925 |
!!!emit ($self->{ct}); # start tag or end tag |
!!!emit ($self->{ct}); # start tag or end tag |
966 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
967 |
} |
} |
968 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
969 |
|
$self->{s_kwd} = ''; |
970 |
!!!next-input-character; |
!!!next-input-character; |
971 |
|
|
972 |
!!!emit ($self->{ct}); # start tag or end tag |
!!!emit ($self->{ct}); # start tag or end tag |
976 |
$self->{nc} <= 0x005A) { # A..Z |
$self->{nc} <= 0x005A) { # A..Z |
977 |
!!!cp (76); |
!!!cp (76); |
978 |
$self->{ca} |
$self->{ca} |
979 |
= {name => chr ($self->{nc} + 0x0020), |
= {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)), |
980 |
value => '', |
value => '', |
981 |
line => $self->{line}, column => $self->{column}}; |
line => $self->{line}, column => $self->{column}}; |
982 |
$self->{state} = ATTRIBUTE_NAME_STATE; |
$self->{state} = ATTRIBUTE_NAME_STATE; |
1004 |
} else { |
} else { |
1005 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
1006 |
} |
} |
1007 |
|
$self->{s_kwd} = ''; |
1008 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1009 |
# reconsume |
# reconsume |
1010 |
|
|
1066 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
1067 |
} |
} |
1068 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1069 |
|
$self->{s_kwd} = ''; |
1070 |
!!!next-input-character; |
!!!next-input-character; |
1071 |
|
|
1072 |
!!!emit ($self->{ct}); # start tag or end tag |
!!!emit ($self->{ct}); # start tag or end tag |
1090 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
1091 |
} |
} |
1092 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1093 |
|
$self->{s_kwd} = ''; |
1094 |
## reconsume |
## reconsume |
1095 |
|
|
1096 |
!!!emit ($self->{ct}); # start tag or end tag |
!!!emit ($self->{ct}); # start tag or end tag |
1143 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
1144 |
} |
} |
1145 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1146 |
|
$self->{s_kwd} = ''; |
1147 |
## reconsume |
## reconsume |
1148 |
|
|
1149 |
!!!emit ($self->{ct}); # start tag or end tag |
!!!emit ($self->{ct}); # start tag or end tag |
1195 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
1196 |
} |
} |
1197 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1198 |
|
$self->{s_kwd} = ''; |
1199 |
## reconsume |
## reconsume |
1200 |
|
|
1201 |
!!!emit ($self->{ct}); # start tag or end tag |
!!!emit ($self->{ct}); # start tag or end tag |
1246 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
1247 |
} |
} |
1248 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1249 |
|
$self->{s_kwd} = ''; |
1250 |
!!!next-input-character; |
!!!next-input-character; |
1251 |
|
|
1252 |
!!!emit ($self->{ct}); # start tag or end tag |
!!!emit ($self->{ct}); # start tag or end tag |
1270 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
1271 |
} |
} |
1272 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1273 |
|
$self->{s_kwd} = ''; |
1274 |
## reconsume |
## reconsume |
1275 |
|
|
1276 |
!!!emit ($self->{ct}); # start tag or end tag |
!!!emit ($self->{ct}); # start tag or end tag |
1319 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
1320 |
} |
} |
1321 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1322 |
|
$self->{s_kwd} = ''; |
1323 |
!!!next-input-character; |
!!!next-input-character; |
1324 |
|
|
1325 |
!!!emit ($self->{ct}); # start tag or end tag |
!!!emit ($self->{ct}); # start tag or end tag |
1347 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
1348 |
} |
} |
1349 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1350 |
|
$self->{s_kwd} = ''; |
1351 |
## Reconsume. |
## Reconsume. |
1352 |
!!!emit ($self->{ct}); # start tag or end tag |
!!!emit ($self->{ct}); # start tag or end tag |
1353 |
redo A; |
redo A; |
1378 |
} |
} |
1379 |
|
|
1380 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1381 |
|
$self->{s_kwd} = ''; |
1382 |
!!!next-input-character; |
!!!next-input-character; |
1383 |
|
|
1384 |
!!!emit ($self->{ct}); # start tag or end tag |
!!!emit ($self->{ct}); # start tag or end tag |
1401 |
die "$0: $self->{ct}->{type}: Unknown token type"; |
die "$0: $self->{ct}->{type}: Unknown token type"; |
1402 |
} |
} |
1403 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1404 |
|
$self->{s_kwd} = ''; |
1405 |
## Reconsume. |
## Reconsume. |
1406 |
!!!emit ($self->{ct}); # start tag or end tag |
!!!emit ($self->{ct}); # start tag or end tag |
1407 |
redo A; |
redo A; |
1422 |
if ($self->{nc} == 0x003E) { # > |
if ($self->{nc} == 0x003E) { # > |
1423 |
!!!cp (124); |
!!!cp (124); |
1424 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1425 |
|
$self->{s_kwd} = ''; |
1426 |
!!!next-input-character; |
!!!next-input-character; |
1427 |
|
|
1428 |
!!!emit ($self->{ct}); # comment |
!!!emit ($self->{ct}); # comment |
1430 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
1431 |
!!!cp (125); |
!!!cp (125); |
1432 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1433 |
|
$self->{s_kwd} = ''; |
1434 |
## reconsume |
## reconsume |
1435 |
|
|
1436 |
!!!emit ($self->{ct}); # comment |
!!!emit ($self->{ct}); # comment |
1462 |
$self->{s_kwd} = chr $self->{nc}; |
$self->{s_kwd} = chr $self->{nc}; |
1463 |
!!!next-input-character; |
!!!next-input-character; |
1464 |
redo A; |
redo A; |
1465 |
} elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and |
} elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and |
1466 |
$self->{open_elements}->[-1]->[1] & FOREIGN_EL and |
$self->{open_elements}->[-1]->[1] & FOREIGN_EL) or |
1467 |
|
$self->{is_xml}) and |
1468 |
$self->{nc} == 0x005B) { # [ |
$self->{nc} == 0x005B) { # [ |
1469 |
!!!cp (135.4); |
!!!cp (135.4); |
1470 |
$self->{state} = MD_CDATA_STATE; |
$self->{state} = MD_CDATA_STATE; |
1573 |
redo A; |
redo A; |
1574 |
} elsif ($self->{s_kwd} eq '[CDATA' and |
} elsif ($self->{s_kwd} eq '[CDATA' and |
1575 |
$self->{nc} == 0x005B) { # [ |
$self->{nc} == 0x005B) { # [ |
1576 |
!!!cp (135.2); |
if ($self->{is_xml} and |
1577 |
|
not $self->{tainted} and |
1578 |
|
@{$self->{open_elements} or []} == 0) { |
1579 |
|
!!!cp (135.2); |
1580 |
|
!!!parse-error (type => 'cdata outside of root element', |
1581 |
|
line => $self->{line_prev}, |
1582 |
|
column => $self->{column_prev} - 7); |
1583 |
|
$self->{tainted} = 1; |
1584 |
|
} else { |
1585 |
|
!!!cp (135.21); |
1586 |
|
} |
1587 |
|
|
1588 |
$self->{ct} = {type => CHARACTER_TOKEN, |
$self->{ct} = {type => CHARACTER_TOKEN, |
1589 |
data => '', |
data => '', |
1590 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
1616 |
!!!cp (138); |
!!!cp (138); |
1617 |
!!!parse-error (type => 'bogus comment'); |
!!!parse-error (type => 'bogus comment'); |
1618 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1619 |
|
$self->{s_kwd} = ''; |
1620 |
!!!next-input-character; |
!!!next-input-character; |
1621 |
|
|
1622 |
!!!emit ($self->{ct}); # comment |
!!!emit ($self->{ct}); # comment |
1626 |
!!!cp (139); |
!!!cp (139); |
1627 |
!!!parse-error (type => 'unclosed comment'); |
!!!parse-error (type => 'unclosed comment'); |
1628 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1629 |
|
$self->{s_kwd} = ''; |
1630 |
## reconsume |
## reconsume |
1631 |
|
|
1632 |
!!!emit ($self->{ct}); # comment |
!!!emit ($self->{ct}); # comment |
1650 |
!!!cp (142); |
!!!cp (142); |
1651 |
!!!parse-error (type => 'bogus comment'); |
!!!parse-error (type => 'bogus comment'); |
1652 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1653 |
|
$self->{s_kwd} = ''; |
1654 |
!!!next-input-character; |
!!!next-input-character; |
1655 |
|
|
1656 |
!!!emit ($self->{ct}); # comment |
!!!emit ($self->{ct}); # comment |
1660 |
!!!cp (143); |
!!!cp (143); |
1661 |
!!!parse-error (type => 'unclosed comment'); |
!!!parse-error (type => 'unclosed comment'); |
1662 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1663 |
|
$self->{s_kwd} = ''; |
1664 |
## reconsume |
## reconsume |
1665 |
|
|
1666 |
!!!emit ($self->{ct}); # comment |
!!!emit ($self->{ct}); # comment |
1684 |
!!!cp (146); |
!!!cp (146); |
1685 |
!!!parse-error (type => 'unclosed comment'); |
!!!parse-error (type => 'unclosed comment'); |
1686 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1687 |
|
$self->{s_kwd} = ''; |
1688 |
## reconsume |
## reconsume |
1689 |
|
|
1690 |
!!!emit ($self->{ct}); # comment |
!!!emit ($self->{ct}); # comment |
1710 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
1711 |
!!!cp (149); |
!!!cp (149); |
1712 |
!!!parse-error (type => 'unclosed comment'); |
!!!parse-error (type => 'unclosed comment'); |
1713 |
|
$self->{s_kwd} = ''; |
1714 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1715 |
|
$self->{s_kwd} = ''; |
1716 |
## reconsume |
## reconsume |
1717 |
|
|
1718 |
!!!emit ($self->{ct}); # comment |
!!!emit ($self->{ct}); # comment |
1729 |
if ($self->{nc} == 0x003E) { # > |
if ($self->{nc} == 0x003E) { # > |
1730 |
!!!cp (151); |
!!!cp (151); |
1731 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1732 |
|
$self->{s_kwd} = ''; |
1733 |
!!!next-input-character; |
!!!next-input-character; |
1734 |
|
|
1735 |
!!!emit ($self->{ct}); # comment |
!!!emit ($self->{ct}); # comment |
1748 |
!!!cp (153); |
!!!cp (153); |
1749 |
!!!parse-error (type => 'unclosed comment'); |
!!!parse-error (type => 'unclosed comment'); |
1750 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1751 |
|
$self->{s_kwd} = ''; |
1752 |
## reconsume |
## reconsume |
1753 |
|
|
1754 |
!!!emit ($self->{ct}); # comment |
!!!emit ($self->{ct}); # comment |
1787 |
!!!cp (158); |
!!!cp (158); |
1788 |
!!!parse-error (type => 'no DOCTYPE name'); |
!!!parse-error (type => 'no DOCTYPE name'); |
1789 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1790 |
|
$self->{s_kwd} = ''; |
1791 |
!!!next-input-character; |
!!!next-input-character; |
1792 |
|
|
1793 |
!!!emit ($self->{ct}); # DOCTYPE (quirks) |
!!!emit ($self->{ct}); # DOCTYPE (quirks) |
1797 |
!!!cp (159); |
!!!cp (159); |
1798 |
!!!parse-error (type => 'no DOCTYPE name'); |
!!!parse-error (type => 'no DOCTYPE name'); |
1799 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1800 |
|
$self->{s_kwd} = ''; |
1801 |
## reconsume |
## reconsume |
1802 |
|
|
1803 |
!!!emit ($self->{ct}); # DOCTYPE (quirks) |
!!!emit ($self->{ct}); # DOCTYPE (quirks) |
1821 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
1822 |
!!!cp (162); |
!!!cp (162); |
1823 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1824 |
|
$self->{s_kwd} = ''; |
1825 |
!!!next-input-character; |
!!!next-input-character; |
1826 |
|
|
1827 |
!!!emit ($self->{ct}); # DOCTYPE |
!!!emit ($self->{ct}); # DOCTYPE |
1831 |
!!!cp (163); |
!!!cp (163); |
1832 |
!!!parse-error (type => 'unclosed DOCTYPE'); |
!!!parse-error (type => 'unclosed DOCTYPE'); |
1833 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1834 |
|
$self->{s_kwd} = ''; |
1835 |
## reconsume |
## reconsume |
1836 |
|
|
1837 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
1855 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
1856 |
!!!cp (166); |
!!!cp (166); |
1857 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1858 |
|
$self->{s_kwd} = ''; |
1859 |
!!!next-input-character; |
!!!next-input-character; |
1860 |
|
|
1861 |
!!!emit ($self->{ct}); # DOCTYPE |
!!!emit ($self->{ct}); # DOCTYPE |
1865 |
!!!cp (167); |
!!!cp (167); |
1866 |
!!!parse-error (type => 'unclosed DOCTYPE'); |
!!!parse-error (type => 'unclosed DOCTYPE'); |
1867 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1868 |
|
$self->{s_kwd} = ''; |
1869 |
## reconsume |
## reconsume |
1870 |
|
|
1871 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
1994 |
!!!parse-error (type => 'no PUBLIC literal'); |
!!!parse-error (type => 'no PUBLIC literal'); |
1995 |
|
|
1996 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1997 |
|
$self->{s_kwd} = ''; |
1998 |
!!!next-input-character; |
!!!next-input-character; |
1999 |
|
|
2000 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
2006 |
!!!parse-error (type => 'unclosed DOCTYPE'); |
!!!parse-error (type => 'unclosed DOCTYPE'); |
2007 |
|
|
2008 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
2009 |
|
$self->{s_kwd} = ''; |
2010 |
## reconsume |
## reconsume |
2011 |
|
|
2012 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
2033 |
!!!parse-error (type => 'unclosed PUBLIC literal'); |
!!!parse-error (type => 'unclosed PUBLIC literal'); |
2034 |
|
|
2035 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
2036 |
|
$self->{s_kwd} = ''; |
2037 |
!!!next-input-character; |
!!!next-input-character; |
2038 |
|
|
2039 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
2045 |
!!!parse-error (type => 'unclosed PUBLIC literal'); |
!!!parse-error (type => 'unclosed PUBLIC literal'); |
2046 |
|
|
2047 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
2048 |
|
$self->{s_kwd} = ''; |
2049 |
## reconsume |
## reconsume |
2050 |
|
|
2051 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
2074 |
!!!parse-error (type => 'unclosed PUBLIC literal'); |
!!!parse-error (type => 'unclosed PUBLIC literal'); |
2075 |
|
|
2076 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
2077 |
|
$self->{s_kwd} = ''; |
2078 |
!!!next-input-character; |
!!!next-input-character; |
2079 |
|
|
2080 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
2086 |
!!!parse-error (type => 'unclosed PUBLIC literal'); |
!!!parse-error (type => 'unclosed PUBLIC literal'); |
2087 |
|
|
2088 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
2089 |
|
$self->{s_kwd} = ''; |
2090 |
## reconsume |
## reconsume |
2091 |
|
|
2092 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
2125 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
2126 |
!!!cp (198); |
!!!cp (198); |
2127 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
2128 |
|
$self->{s_kwd} = ''; |
2129 |
!!!next-input-character; |
!!!next-input-character; |
2130 |
|
|
2131 |
!!!emit ($self->{ct}); # DOCTYPE |
!!!emit ($self->{ct}); # DOCTYPE |
2136 |
!!!parse-error (type => 'unclosed DOCTYPE'); |
!!!parse-error (type => 'unclosed DOCTYPE'); |
2137 |
|
|
2138 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
2139 |
|
$self->{s_kwd} = ''; |
2140 |
## reconsume |
## reconsume |
2141 |
|
|
2142 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
2174 |
!!!cp (204); |
!!!cp (204); |
2175 |
!!!parse-error (type => 'no SYSTEM literal'); |
!!!parse-error (type => 'no SYSTEM literal'); |
2176 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
2177 |
|
$self->{s_kwd} = ''; |
2178 |
!!!next-input-character; |
!!!next-input-character; |
2179 |
|
|
2180 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
2186 |
!!!parse-error (type => 'unclosed DOCTYPE'); |
!!!parse-error (type => 'unclosed DOCTYPE'); |
2187 |
|
|
2188 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
2189 |
|
$self->{s_kwd} = ''; |
2190 |
## reconsume |
## reconsume |
2191 |
|
|
2192 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
2213 |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
2214 |
|
|
2215 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
2216 |
|
$self->{s_kwd} = ''; |
2217 |
!!!next-input-character; |
!!!next-input-character; |
2218 |
|
|
2219 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
2225 |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
2226 |
|
|
2227 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
2228 |
|
$self->{s_kwd} = ''; |
2229 |
## reconsume |
## reconsume |
2230 |
|
|
2231 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
2254 |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
2255 |
|
|
2256 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
2257 |
|
$self->{s_kwd} = ''; |
2258 |
!!!next-input-character; |
!!!next-input-character; |
2259 |
|
|
2260 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
2266 |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
!!!parse-error (type => 'unclosed SYSTEM literal'); |
2267 |
|
|
2268 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
2269 |
|
$self->{s_kwd} = ''; |
2270 |
## reconsume |
## reconsume |
2271 |
|
|
2272 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
2293 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
2294 |
!!!cp (216); |
!!!cp (216); |
2295 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
2296 |
|
$self->{s_kwd} = ''; |
2297 |
!!!next-input-character; |
!!!next-input-character; |
2298 |
|
|
2299 |
!!!emit ($self->{ct}); # DOCTYPE |
!!!emit ($self->{ct}); # DOCTYPE |
2303 |
!!!cp (217); |
!!!cp (217); |
2304 |
!!!parse-error (type => 'unclosed DOCTYPE'); |
!!!parse-error (type => 'unclosed DOCTYPE'); |
2305 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
2306 |
|
$self->{s_kwd} = ''; |
2307 |
## reconsume |
## reconsume |
2308 |
|
|
2309 |
$self->{ct}->{quirks} = 1; |
$self->{ct}->{quirks} = 1; |
2323 |
if ($self->{nc} == 0x003E) { # > |
if ($self->{nc} == 0x003E) { # > |
2324 |
!!!cp (219); |
!!!cp (219); |
2325 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
2326 |
|
$self->{s_kwd} = ''; |
2327 |
!!!next-input-character; |
!!!next-input-character; |
2328 |
|
|
2329 |
!!!emit ($self->{ct}); # DOCTYPE |
!!!emit ($self->{ct}); # DOCTYPE |
2332 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
2333 |
!!!cp (220); |
!!!cp (220); |
2334 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
2335 |
|
$self->{s_kwd} = ''; |
2336 |
## reconsume |
## reconsume |
2337 |
|
|
2338 |
!!!emit ($self->{ct}); # DOCTYPE |
!!!emit ($self->{ct}); # DOCTYPE |
2358 |
!!!next-input-character; |
!!!next-input-character; |
2359 |
redo A; |
redo A; |
2360 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
2361 |
|
if ($self->{is_xml}) { |
2362 |
|
!!!cp (221.11); |
2363 |
|
!!!parse-error (type => 'no mse'); ## TODO: type |
2364 |
|
} else { |
2365 |
|
!!!cp (221.12); |
2366 |
|
} |
2367 |
|
|
2368 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
2369 |
|
$self->{s_kwd} = ''; |
2370 |
!!!next-input-character; |
!!!next-input-character; |
2371 |
if (length $self->{ct}->{data}) { # character |
if (length $self->{ct}->{data}) { # character |
2372 |
!!!cp (221.2); |
!!!cp (221.2); |
2405 |
} elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) { |
} elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) { |
2406 |
if ($self->{nc} == 0x003E) { # > |
if ($self->{nc} == 0x003E) { # > |
2407 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
2408 |
|
$self->{s_kwd} = ''; |
2409 |
!!!next-input-character; |
!!!next-input-character; |
2410 |
if (length $self->{ct}->{data}) { # character |
if (length $self->{ct}->{data}) { # character |
2411 |
!!!cp (221.7); |
!!!cp (221.7); |
2473 |
if ($self->{prev_state} == DATA_STATE) { |
if ($self->{prev_state} == DATA_STATE) { |
2474 |
!!!cp (997); |
!!!cp (997); |
2475 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
2476 |
|
$self->{s_kwd} = ''; |
2477 |
## Reconsume. |
## Reconsume. |
2478 |
!!!emit ({type => CHARACTER_TOKEN, data => '&', |
!!!emit ({type => CHARACTER_TOKEN, data => '&', |
2479 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
2484 |
!!!cp (996); |
!!!cp (996); |
2485 |
$self->{ca}->{value} .= '&'; |
$self->{ca}->{value} .= '&'; |
2486 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
2487 |
|
$self->{s_kwd} = ''; |
2488 |
## Reconsume. |
## Reconsume. |
2489 |
redo A; |
redo A; |
2490 |
} |
} |
2515 |
if ($self->{prev_state} == DATA_STATE) { |
if ($self->{prev_state} == DATA_STATE) { |
2516 |
!!!cp (1019); |
!!!cp (1019); |
2517 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
2518 |
|
$self->{s_kwd} = ''; |
2519 |
## Reconsume. |
## Reconsume. |
2520 |
!!!emit ({type => CHARACTER_TOKEN, |
!!!emit ({type => CHARACTER_TOKEN, |
2521 |
data => '&#', |
data => '&#', |
2527 |
!!!cp (993); |
!!!cp (993); |
2528 |
$self->{ca}->{value} .= '&#'; |
$self->{ca}->{value} .= '&#'; |
2529 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
2530 |
|
$self->{s_kwd} = ''; |
2531 |
## Reconsume. |
## Reconsume. |
2532 |
redo A; |
redo A; |
2533 |
} |
} |
2573 |
if ($self->{prev_state} == DATA_STATE) { |
if ($self->{prev_state} == DATA_STATE) { |
2574 |
!!!cp (992); |
!!!cp (992); |
2575 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
2576 |
|
$self->{s_kwd} = ''; |
2577 |
## Reconsume. |
## Reconsume. |
2578 |
!!!emit ({type => CHARACTER_TOKEN, data => chr $code, |
!!!emit ({type => CHARACTER_TOKEN, data => chr $code, |
2579 |
|
has_reference => 1, |
2580 |
line => $l, column => $c, |
line => $l, column => $c, |
2581 |
}); |
}); |
2582 |
redo A; |
redo A; |
2585 |
$self->{ca}->{value} .= chr $code; |
$self->{ca}->{value} .= chr $code; |
2586 |
$self->{ca}->{has_reference} = 1; |
$self->{ca}->{has_reference} = 1; |
2587 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
2588 |
|
$self->{s_kwd} = ''; |
2589 |
## Reconsume. |
## Reconsume. |
2590 |
redo A; |
redo A; |
2591 |
} |
} |
2611 |
if ($self->{prev_state} == DATA_STATE) { |
if ($self->{prev_state} == DATA_STATE) { |
2612 |
!!!cp (1005); |
!!!cp (1005); |
2613 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
2614 |
|
$self->{s_kwd} = ''; |
2615 |
## Reconsume. |
## Reconsume. |
2616 |
!!!emit ({type => CHARACTER_TOKEN, |
!!!emit ({type => CHARACTER_TOKEN, |
2617 |
data => '&' . $self->{s_kwd}, |
data => '&' . $self->{s_kwd}, |
2623 |
!!!cp (989); |
!!!cp (989); |
2624 |
$self->{ca}->{value} .= '&' . $self->{s_kwd}; |
$self->{ca}->{value} .= '&' . $self->{s_kwd}; |
2625 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
2626 |
|
$self->{s_kwd} = ''; |
2627 |
## Reconsume. |
## Reconsume. |
2628 |
redo A; |
redo A; |
2629 |
} |
} |
2686 |
if ($self->{prev_state} == DATA_STATE) { |
if ($self->{prev_state} == DATA_STATE) { |
2687 |
!!!cp (988); |
!!!cp (988); |
2688 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
2689 |
|
$self->{s_kwd} = ''; |
2690 |
## Reconsume. |
## Reconsume. |
2691 |
!!!emit ({type => CHARACTER_TOKEN, data => chr $code, |
!!!emit ({type => CHARACTER_TOKEN, data => chr $code, |
2692 |
|
has_reference => 1, |
2693 |
line => $l, column => $c, |
line => $l, column => $c, |
2694 |
}); |
}); |
2695 |
redo A; |
redo A; |
2698 |
$self->{ca}->{value} .= chr $code; |
$self->{ca}->{value} .= chr $code; |
2699 |
$self->{ca}->{has_reference} = 1; |
$self->{ca}->{has_reference} = 1; |
2700 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
2701 |
|
$self->{s_kwd} = ''; |
2702 |
## Reconsume. |
## Reconsume. |
2703 |
redo A; |
redo A; |
2704 |
} |
} |
2781 |
if ($self->{prev_state} == DATA_STATE) { |
if ($self->{prev_state} == DATA_STATE) { |
2782 |
!!!cp (986); |
!!!cp (986); |
2783 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
2784 |
|
$self->{s_kwd} = ''; |
2785 |
## Reconsume. |
## Reconsume. |
2786 |
!!!emit ({type => CHARACTER_TOKEN, |
!!!emit ({type => CHARACTER_TOKEN, |
2787 |
data => $data, |
data => $data, |
2788 |
|
has_reference => $has_ref, |
2789 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
2790 |
column => $self->{column_prev} + 1 - length $self->{s_kwd}, |
column => $self->{column_prev} + 1 - length $self->{s_kwd}, |
2791 |
}); |
}); |
2795 |
$self->{ca}->{value} .= $data; |
$self->{ca}->{value} .= $data; |
2796 |
$self->{ca}->{has_reference} = 1 if $has_ref; |
$self->{ca}->{has_reference} = 1 if $has_ref; |
2797 |
$self->{state} = $self->{prev_state}; |
$self->{state} = $self->{prev_state}; |
2798 |
|
$self->{s_kwd} = ''; |
2799 |
|
## Reconsume. |
2800 |
|
redo A; |
2801 |
|
} |
2802 |
|
|
2803 |
|
## XML-only states |
2804 |
|
|
2805 |
|
} elsif ($self->{state} == PI_STATE) { |
2806 |
|
if ($is_space->{$self->{nc}} or |
2807 |
|
$self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else" |
2808 |
|
$self->{nc} == -1) { |
2809 |
|
!!!parse-error (type => 'bare pio', ## TODO: type |
2810 |
|
line => $self->{line_prev}, |
2811 |
|
column => $self->{column_prev} |
2812 |
|
- 1 * ($self->{nc} != -1)); |
2813 |
|
$self->{state} = BOGUS_COMMENT_STATE; |
2814 |
|
## Reconsume. |
2815 |
|
$self->{ct} = {type => COMMENT_TOKEN, |
2816 |
|
data => '?', |
2817 |
|
line => $self->{line_prev}, |
2818 |
|
column => $self->{column_prev} |
2819 |
|
- 1 * ($self->{nc} != -1), |
2820 |
|
}; |
2821 |
|
redo A; |
2822 |
|
} else { |
2823 |
|
$self->{ct} = {type => PI_TOKEN, |
2824 |
|
target => chr $self->{nc}, |
2825 |
|
data => '', |
2826 |
|
line => $self->{line_prev}, |
2827 |
|
column => $self->{column_prev} - 1, |
2828 |
|
}; |
2829 |
|
$self->{state} = PI_TARGET_STATE; |
2830 |
|
!!!next-input-character; |
2831 |
|
redo A; |
2832 |
|
} |
2833 |
|
} elsif ($self->{state} == PI_TARGET_STATE) { |
2834 |
|
if ($is_space->{$self->{nc}}) { |
2835 |
|
$self->{state} = PI_TARGET_AFTER_STATE; |
2836 |
|
!!!next-input-character; |
2837 |
|
redo A; |
2838 |
|
} elsif ($self->{nc} == -1) { |
2839 |
|
!!!parse-error (type => 'no pic'); ## TODO: type |
2840 |
|
$self->{state} = DATA_STATE; |
2841 |
|
$self->{s_kwd} = ''; |
2842 |
## Reconsume. |
## Reconsume. |
2843 |
|
!!!emit ($self->{ct}); # pi |
2844 |
|
redo A; |
2845 |
|
} elsif ($self->{nc} == 0x003F) { # ? |
2846 |
|
$self->{state} = PI_AFTER_STATE; |
2847 |
|
!!!next-input-character; |
2848 |
|
redo A; |
2849 |
|
} else { |
2850 |
|
## XML5: typo ("tag name" -> "target") |
2851 |
|
$self->{ct}->{target} .= chr $self->{nc}; # pi |
2852 |
|
!!!next-input-character; |
2853 |
|
redo A; |
2854 |
|
} |
2855 |
|
} elsif ($self->{state} == PI_TARGET_AFTER_STATE) { |
2856 |
|
if ($is_space->{$self->{nc}}) { |
2857 |
|
## Stay in the state. |
2858 |
|
!!!next-input-character; |
2859 |
|
redo A; |
2860 |
|
} else { |
2861 |
|
$self->{state} = PI_DATA_STATE; |
2862 |
|
## Reprocess. |
2863 |
|
redo A; |
2864 |
|
} |
2865 |
|
} elsif ($self->{state} == PI_DATA_STATE) { |
2866 |
|
if ($self->{nc} == 0x003F) { # ? |
2867 |
|
$self->{state} = PI_DATA_AFTER_STATE; |
2868 |
|
!!!next-input-character; |
2869 |
|
redo A; |
2870 |
|
} elsif ($self->{nc} == -1) { |
2871 |
|
!!!parse-error (type => 'no pic'); ## TODO: type |
2872 |
|
$self->{state} = DATA_STATE; |
2873 |
|
$self->{s_kwd} = ''; |
2874 |
|
## Reprocess. |
2875 |
|
!!!emit ($self->{ct}); # pi |
2876 |
|
redo A; |
2877 |
|
} else { |
2878 |
|
$self->{ct}->{data} .= chr $self->{nc}; # pi |
2879 |
|
$self->{read_until}->($self->{ct}->{data}, q[?], |
2880 |
|
length $self->{ct}->{data}); |
2881 |
|
## Stay in the state. |
2882 |
|
!!!next-input-character; |
2883 |
|
## Reprocess. |
2884 |
redo A; |
redo A; |
2885 |
} |
} |
2886 |
|
} elsif ($self->{state} == PI_AFTER_STATE) { |
2887 |
|
if ($self->{nc} == 0x003E) { # > |
2888 |
|
$self->{state} = DATA_STATE; |
2889 |
|
$self->{s_kwd} = ''; |
2890 |
|
!!!next-input-character; |
2891 |
|
!!!emit ($self->{ct}); # pi |
2892 |
|
redo A; |
2893 |
|
} elsif ($self->{nc} == 0x003F) { # ? |
2894 |
|
!!!parse-error (type => 'no s after target', ## TODO: type |
2895 |
|
line => $self->{line_prev}, |
2896 |
|
column => $self->{column_prev}); ## XML5: no error |
2897 |
|
$self->{ct}->{data} .= '?'; |
2898 |
|
$self->{state} = PI_DATA_AFTER_STATE; |
2899 |
|
!!!next-input-character; |
2900 |
|
redo A; |
2901 |
|
} else { |
2902 |
|
!!!parse-error (type => 'no s after target', ## TODO: type |
2903 |
|
line => $self->{line_prev}, |
2904 |
|
column => $self->{column_prev} |
2905 |
|
+ 1 * ($self->{nc} == -1)); ## XML5: no error |
2906 |
|
$self->{ct}->{data} .= '?'; ## XML5: not appended |
2907 |
|
$self->{state} = PI_DATA_STATE; |
2908 |
|
## Reprocess. |
2909 |
|
redo A; |
2910 |
|
} |
2911 |
|
} elsif ($self->{state} == PI_DATA_AFTER_STATE) { |
2912 |
|
## XML5: Same as "pi after state" in XML5 |
2913 |
|
if ($self->{nc} == 0x003E) { # > |
2914 |
|
$self->{state} = DATA_STATE; |
2915 |
|
$self->{s_kwd} = ''; |
2916 |
|
!!!next-input-character; |
2917 |
|
!!!emit ($self->{ct}); # pi |
2918 |
|
redo A; |
2919 |
|
} elsif ($self->{nc} == 0x003F) { # ? |
2920 |
|
$self->{ct}->{data} .= '?'; |
2921 |
|
## Stay in the state. |
2922 |
|
!!!next-input-character; |
2923 |
|
redo A; |
2924 |
|
} else { |
2925 |
|
$self->{ct}->{data} .= '?'; ## XML5: not appended |
2926 |
|
$self->{state} = PI_DATA_STATE; |
2927 |
|
## Reprocess. |
2928 |
|
redo A; |
2929 |
|
} |
2930 |
|
|
2931 |
} else { |
} else { |
2932 |
die "$0: $self->{state}: Unknown state"; |
die "$0: $self->{state}: Unknown state"; |
2933 |
} |
} |