437 |
!!!emit ($token); |
!!!emit ($token); |
438 |
redo A; |
redo A; |
439 |
} elsif ($self->{state} == TAG_OPEN_STATE) { |
} elsif ($self->{state} == TAG_OPEN_STATE) { |
440 |
|
## XML5: "tag state". |
441 |
|
|
442 |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
443 |
if ($self->{nc} == 0x002F) { # / |
if ($self->{nc} == 0x002F) { # / |
444 |
!!!cp (15); |
!!!cp (15); |
528 |
## $self->{nc} is intentionally left as is |
## $self->{nc} is intentionally left as is |
529 |
redo A; |
redo A; |
530 |
} |
} |
531 |
} else { |
} elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) { |
532 |
!!!cp (23); |
!!!cp (23); |
533 |
!!!parse-error (type => 'bare stago', |
!!!parse-error (type => 'bare stago', |
534 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
543 |
}); |
}); |
544 |
|
|
545 |
redo A; |
redo A; |
546 |
|
} else { |
547 |
|
## XML5: "<:" is a parse error. |
548 |
|
!!!cp (23.1); |
549 |
|
$self->{ct} = {type => START_TAG_TOKEN, |
550 |
|
tag_name => chr ($self->{nc}), |
551 |
|
line => $self->{line_prev}, |
552 |
|
column => $self->{column_prev}}; |
553 |
|
$self->{state} = TAG_NAME_STATE; |
554 |
|
!!!next-input-character; |
555 |
|
redo A; |
556 |
} |
} |
557 |
} else { |
} else { |
558 |
die "$0: $self->{content_model} in tag open"; |
die "$0: $self->{content_model} in tag open"; |
561 |
## NOTE: The "close tag open state" in the spec is implemented as |
## NOTE: The "close tag open state" in the spec is implemented as |
562 |
## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|. |
## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|. |
563 |
|
|
564 |
|
## XML5: "end tag state". |
565 |
|
|
566 |
my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</" |
my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</" |
567 |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
568 |
if (defined $self->{last_stag_name}) { |
if (defined $self->{last_stag_name}) { |
604 |
!!!next-input-character; |
!!!next-input-character; |
605 |
redo A; |
redo A; |
606 |
} elsif ($self->{nc} == 0x003E) { # > |
} elsif ($self->{nc} == 0x003E) { # > |
|
!!!cp (31); |
|
607 |
!!!parse-error (type => 'empty end tag', |
!!!parse-error (type => 'empty end tag', |
608 |
line => $self->{line_prev}, ## "<" in "</>" |
line => $self->{line_prev}, ## "<" in "</>" |
609 |
column => $self->{column_prev} - 1); |
column => $self->{column_prev} - 1); |
610 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
611 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
612 |
!!!next-input-character; |
if ($self->{is_xml}) { |
613 |
|
!!!cp (31); |
614 |
|
## XML5: No parse error. |
615 |
|
|
616 |
|
## NOTE: This parser raises a parse error, since it supports |
617 |
|
## XML1, not XML5. |
618 |
|
|
619 |
|
## NOTE: A short end tag token. |
620 |
|
my $ct = {type => END_TAG_TOKEN, |
621 |
|
tag_name => '', |
622 |
|
line => $self->{line_prev}, |
623 |
|
column => $self->{column_prev} - 1, |
624 |
|
}; |
625 |
|
!!!next-input-character; |
626 |
|
!!!emit ($ct); |
627 |
|
} else { |
628 |
|
!!!cp (31.1); |
629 |
|
!!!next-input-character; |
630 |
|
} |
631 |
redo A; |
redo A; |
632 |
} elsif ($self->{nc} == -1) { |
} elsif ($self->{nc} == -1) { |
633 |
!!!cp (32); |
!!!cp (32); |
641 |
}); |
}); |
642 |
|
|
643 |
redo A; |
redo A; |
644 |
} else { |
} elsif (not $self->{is_xml} or |
645 |
|
$is_space->{$self->{nc}}) { |
646 |
!!!cp (33); |
!!!cp (33); |
647 |
!!!parse-error (type => 'bogus end tag'); |
!!!parse-error (type => 'bogus end tag', |
648 |
|
line => $self->{line_prev}, # "<" of "</" |
649 |
|
column => $self->{column_prev} - 1); |
650 |
$self->{state} = BOGUS_COMMENT_STATE; |
$self->{state} = BOGUS_COMMENT_STATE; |
651 |
$self->{ct} = {type => COMMENT_TOKEN, data => '', |
$self->{ct} = {type => COMMENT_TOKEN, data => '', |
652 |
line => $self->{line_prev}, # "<" of "</" |
line => $self->{line_prev}, # "<" of "</" |
659 |
## generated from the bogus end tag, as defined in the |
## generated from the bogus end tag, as defined in the |
660 |
## "bogus comment state" entry. |
## "bogus comment state" entry. |
661 |
redo A; |
redo A; |
662 |
|
} else { |
663 |
|
## XML5: "</:" is a parse error. |
664 |
|
!!!cp (30.1); |
665 |
|
$self->{ct} = {type => END_TAG_TOKEN, |
666 |
|
tag_name => chr ($self->{nc}), |
667 |
|
line => $l, column => $c}; |
668 |
|
$self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state". |
669 |
|
!!!next-input-character; |
670 |
|
redo A; |
671 |
} |
} |
672 |
} elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) { |
} elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) { |
673 |
my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1; |
my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1; |
1535 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
1536 |
column => $self->{column_prev} - 2, |
column => $self->{column_prev} - 2, |
1537 |
}; |
}; |
1538 |
$self->{state} = COMMENT_START_STATE; |
$self->{state} = COMMENT_START_STATE; ## XML5: "comment state". |
1539 |
!!!next-input-character; |
!!!next-input-character; |
1540 |
redo A; |
redo A; |
1541 |
} else { |
} else { |
1578 |
} elsif ((length $self->{s_kwd}) == 6 and |
} elsif ((length $self->{s_kwd}) == 6 and |
1579 |
($self->{nc} == 0x0045 or # E |
($self->{nc} == 0x0045 or # E |
1580 |
$self->{nc} == 0x0065)) { # e |
$self->{nc} == 0x0065)) { # e |
1581 |
!!!cp (129); |
if ($self->{s_kwd} ne 'DOCTYP') { |
1582 |
|
!!!cp (129); |
1583 |
|
## XML5: case-sensitive. |
1584 |
|
!!!parse-error (type => 'lowercase keyword', ## TODO |
1585 |
|
text => 'DOCTYPE', |
1586 |
|
line => $self->{line_prev}, |
1587 |
|
column => $self->{column_prev} - 5); |
1588 |
|
} else { |
1589 |
|
!!!cp (129.1); |
1590 |
|
} |
1591 |
$self->{state} = DOCTYPE_STATE; |
$self->{state} = DOCTYPE_STATE; |
1592 |
$self->{ct} = {type => DOCTYPE_TOKEN, |
$self->{ct} = {type => DOCTYPE_TOKEN, |
1593 |
quirks => 1, |
quirks => 1, |
1754 |
redo A; |
redo A; |
1755 |
} |
} |
1756 |
} elsif ($self->{state} == COMMENT_END_DASH_STATE) { |
} elsif ($self->{state} == COMMENT_END_DASH_STATE) { |
1757 |
|
## XML5: "comment dash state". |
1758 |
|
|
1759 |
if ($self->{nc} == 0x002D) { # - |
if ($self->{nc} == 0x002D) { # - |
1760 |
!!!cp (148); |
!!!cp (148); |
1761 |
$self->{state} = COMMENT_END_STATE; |
$self->{state} = COMMENT_END_STATE; |
1791 |
redo A; |
redo A; |
1792 |
} elsif ($self->{nc} == 0x002D) { # - |
} elsif ($self->{nc} == 0x002D) { # - |
1793 |
!!!cp (152); |
!!!cp (152); |
1794 |
|
## XML5: Not a parse error. |
1795 |
!!!parse-error (type => 'dash in comment', |
!!!parse-error (type => 'dash in comment', |
1796 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
1797 |
column => $self->{column_prev}); |
column => $self->{column_prev}); |
1811 |
redo A; |
redo A; |
1812 |
} else { |
} else { |
1813 |
!!!cp (154); |
!!!cp (154); |
1814 |
|
## XML5: Not a parse error. |
1815 |
!!!parse-error (type => 'dash in comment', |
!!!parse-error (type => 'dash in comment', |
1816 |
line => $self->{line_prev}, |
line => $self->{line_prev}, |
1817 |
column => $self->{column_prev}); |
column => $self->{column_prev}); |
2407 |
## NOTE: "CDATA section state" in the spec is jointly implemented |
## NOTE: "CDATA section state" in the spec is jointly implemented |
2408 |
## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|, |
## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|, |
2409 |
## and |CDATA_SECTION_MSE2_STATE|. |
## and |CDATA_SECTION_MSE2_STATE|. |
2410 |
|
|
2411 |
|
## XML5: "CDATA state". |
2412 |
|
|
2413 |
if ($self->{nc} == 0x005D) { # ] |
if ($self->{nc} == 0x005D) { # ] |
2414 |
!!!cp (221.1); |
!!!cp (221.1); |
2425 |
|
|
2426 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
2427 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
2428 |
!!!next-input-character; |
## Reconsume. |
2429 |
if (length $self->{ct}->{data}) { # character |
if (length $self->{ct}->{data}) { # character |
2430 |
!!!cp (221.2); |
!!!cp (221.2); |
2431 |
!!!emit ($self->{ct}); # character |
!!!emit ($self->{ct}); # character |
2448 |
|
|
2449 |
## ISSUE: "text tokens" in spec. |
## ISSUE: "text tokens" in spec. |
2450 |
} elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) { |
} elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) { |
2451 |
|
## XML5: "CDATA bracket state". |
2452 |
|
|
2453 |
if ($self->{nc} == 0x005D) { # ] |
if ($self->{nc} == 0x005D) { # ] |
2454 |
!!!cp (221.5); |
!!!cp (221.5); |
2455 |
$self->{state} = CDATA_SECTION_MSE2_STATE; |
$self->{state} = CDATA_SECTION_MSE2_STATE; |
2457 |
redo A; |
redo A; |
2458 |
} else { |
} else { |
2459 |
!!!cp (221.6); |
!!!cp (221.6); |
2460 |
|
## XML5: If EOF, "]" is not appended and the state is changed to the data state. |
2461 |
$self->{ct}->{data} .= ']'; |
$self->{ct}->{data} .= ']'; |
2462 |
$self->{state} = CDATA_SECTION_STATE; |
$self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state. |
2463 |
## Reconsume. |
## Reconsume. |
2464 |
redo A; |
redo A; |
2465 |
} |
} |
2466 |
} elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) { |
} elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) { |
2467 |
|
## XML5: "CDATA end state". |
2468 |
|
|
2469 |
if ($self->{nc} == 0x003E) { # > |
if ($self->{nc} == 0x003E) { # > |
2470 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
2471 |
$self->{s_kwd} = ''; |
$self->{s_kwd} = ''; |
2488 |
!!!cp (221.11); |
!!!cp (221.11); |
2489 |
$self->{ct}->{data} .= ']]'; # character |
$self->{ct}->{data} .= ']]'; # character |
2490 |
$self->{state} = CDATA_SECTION_STATE; |
$self->{state} = CDATA_SECTION_STATE; |
2491 |
## Reconsume. |
## Reconsume. ## XML5: Emit. |
2492 |
redo A; |
redo A; |
2493 |
} |
} |
2494 |
} elsif ($self->{state} == ENTITY_STATE) { |
} elsif ($self->{state} == ENTITY_STATE) { |