466 |
# Anything else |
# Anything else |
467 |
my $token = {type => CHARACTER_TOKEN, |
my $token = {type => CHARACTER_TOKEN, |
468 |
data => chr $self->{next_char}, |
data => chr $self->{next_char}, |
469 |
line => $self->{line}, column => $self->{column}}; |
#line => $self->{line}, column => $self->{column}, |
470 |
|
}; |
471 |
## Stay in the data state |
## Stay in the data state |
472 |
!!!next-input-character; |
!!!next-input-character; |
473 |
|
|
477 |
} elsif ($self->{state} == ENTITY_DATA_STATE) { |
} elsif ($self->{state} == ENTITY_DATA_STATE) { |
478 |
## (cannot happen in CDATA state) |
## (cannot happen in CDATA state) |
479 |
|
|
480 |
my ($l, $c) = ($self->{line_prev}, $self->{column_prev}); |
#my ($l, $c) = ($self->{line_prev}, $self->{column_prev}); |
481 |
|
|
482 |
my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1); |
my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1); |
483 |
|
|
487 |
unless (defined $token) { |
unless (defined $token) { |
488 |
!!!cp (13); |
!!!cp (13); |
489 |
!!!emit ({type => CHARACTER_TOKEN, data => '&', |
!!!emit ({type => CHARACTER_TOKEN, data => '&', |
490 |
line => $l, column => $c}); |
#line => $l, column => $c, |
491 |
|
}); |
492 |
} else { |
} else { |
493 |
!!!cp (14); |
!!!cp (14); |
494 |
!!!emit ($token); |
!!!emit ($token); |
508 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
509 |
|
|
510 |
!!!emit ({type => CHARACTER_TOKEN, data => '<', |
!!!emit ({type => CHARACTER_TOKEN, data => '<', |
511 |
line => $self->{line_prev}, |
#line => $self->{line_prev}, |
512 |
column => $self->{column_prev}}); |
#column => $self->{column_prev}, |
513 |
|
}); |
514 |
|
|
515 |
redo A; |
redo A; |
516 |
} |
} |
555 |
!!!next-input-character; |
!!!next-input-character; |
556 |
|
|
557 |
!!!emit ({type => CHARACTER_TOKEN, data => '<>', |
!!!emit ({type => CHARACTER_TOKEN, data => '<>', |
558 |
line => $self->{line_prev}, |
#line => $self->{line_prev}, |
559 |
column => $self->{column_prev}}); |
#column => $self->{column_prev}, |
560 |
|
}); |
561 |
|
|
562 |
redo A; |
redo A; |
563 |
} elsif ($self->{next_char} == 0x003F) { # ? |
} elsif ($self->{next_char} == 0x003F) { # ? |
567 |
column => $self->{column_prev}); |
column => $self->{column_prev}); |
568 |
$self->{state} = BOGUS_COMMENT_STATE; |
$self->{state} = BOGUS_COMMENT_STATE; |
569 |
$self->{current_token} = {type => COMMENT_TOKEN, data => '', |
$self->{current_token} = {type => COMMENT_TOKEN, data => '', |
570 |
line => $self->{line_prev}, |
#line => $self->{line_prev}, |
571 |
column => $self->{column_prev}}; |
#column => $self->{column_prev}, |
572 |
|
}; |
573 |
## $self->{next_char} is intentionally left as is |
## $self->{next_char} is intentionally left as is |
574 |
redo A; |
redo A; |
575 |
} else { |
} else { |
579 |
## reconsume |
## reconsume |
580 |
|
|
581 |
!!!emit ({type => CHARACTER_TOKEN, data => '<', |
!!!emit ({type => CHARACTER_TOKEN, data => '<', |
582 |
line => $self->{line_prev}, |
#line => $self->{line_prev}, |
583 |
column => $self->{column_prev}}); |
#column => $self->{column_prev}, |
584 |
|
}); |
585 |
|
|
586 |
redo A; |
redo A; |
587 |
} |
} |
610 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
611 |
|
|
612 |
!!!emit ({type => CHARACTER_TOKEN, data => '</', |
!!!emit ({type => CHARACTER_TOKEN, data => '</', |
613 |
line => $l, column => $c}); |
#line => $l, column => $c, |
614 |
|
}); |
615 |
|
|
616 |
redo A; |
redo A; |
617 |
} |
} |
631 |
!!!back-next-input-character (@next_char); |
!!!back-next-input-character (@next_char); |
632 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
633 |
!!!emit ({type => CHARACTER_TOKEN, data => '</', |
!!!emit ({type => CHARACTER_TOKEN, data => '</', |
634 |
line => $l, column => $c}); |
#line => $l, column => $c, |
635 |
|
}); |
636 |
redo A; |
redo A; |
637 |
} else { |
} else { |
638 |
!!!cp (27); |
!!!cp (27); |
646 |
# next-input-character is already done |
# next-input-character is already done |
647 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
648 |
!!!emit ({type => CHARACTER_TOKEN, data => '</', |
!!!emit ({type => CHARACTER_TOKEN, data => '</', |
649 |
line => $l, column => $c}); |
#line => $l, column => $c, |
650 |
|
}); |
651 |
redo A; |
redo A; |
652 |
} |
} |
653 |
} |
} |
686 |
# reconsume |
# reconsume |
687 |
|
|
688 |
!!!emit ({type => CHARACTER_TOKEN, data => '</', |
!!!emit ({type => CHARACTER_TOKEN, data => '</', |
689 |
line => $l, column => $c}); |
#line => $l, column => $c, |
690 |
|
}); |
691 |
|
|
692 |
redo A; |
redo A; |
693 |
} else { |
} else { |
695 |
!!!parse-error (type => 'bogus end tag'); |
!!!parse-error (type => 'bogus end tag'); |
696 |
$self->{state} = BOGUS_COMMENT_STATE; |
$self->{state} = BOGUS_COMMENT_STATE; |
697 |
$self->{current_token} = {type => COMMENT_TOKEN, data => '', |
$self->{current_token} = {type => COMMENT_TOKEN, data => '', |
698 |
line => $self->{line_prev}, # "<" of "</" |
#line => $self->{line_prev}, # "<" of "</" |
699 |
column => $self->{column_prev} - 1}; |
#column => $self->{column_prev} - 1, |
700 |
|
}; |
701 |
## $self->{next_char} is intentionally left as is |
## $self->{next_char} is intentionally left as is |
702 |
redo A; |
redo A; |
703 |
} |
} |
1436 |
} elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) { |
} elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) { |
1437 |
## (only happen if PCDATA state) |
## (only happen if PCDATA state) |
1438 |
|
|
1439 |
my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); |
#my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); |
1440 |
|
|
1441 |
my @next_char; |
my @next_char; |
1442 |
push @next_char, $self->{next_char}; |
push @next_char, $self->{next_char}; |
1447 |
if ($self->{next_char} == 0x002D) { # - |
if ($self->{next_char} == 0x002D) { # - |
1448 |
!!!cp (127); |
!!!cp (127); |
1449 |
$self->{current_token} = {type => COMMENT_TOKEN, data => '', |
$self->{current_token} = {type => COMMENT_TOKEN, data => '', |
1450 |
line => $l, column => $c}; |
#line => $l, column => $c, |
1451 |
|
}; |
1452 |
$self->{state} = COMMENT_START_STATE; |
$self->{state} = COMMENT_START_STATE; |
1453 |
!!!next-input-character; |
!!!next-input-character; |
1454 |
redo A; |
redo A; |
1486 |
$self->{state} = DOCTYPE_STATE; |
$self->{state} = DOCTYPE_STATE; |
1487 |
$self->{current_token} = {type => DOCTYPE_TOKEN, |
$self->{current_token} = {type => DOCTYPE_TOKEN, |
1488 |
quirks => 1, |
quirks => 1, |
1489 |
line => $l, column => $c}; |
#line => $l, column => $c, |
1490 |
|
}; |
1491 |
!!!next-input-character; |
!!!next-input-character; |
1492 |
redo A; |
redo A; |
1493 |
} else { |
} else { |
1517 |
!!!back-next-input-character (@next_char); |
!!!back-next-input-character (@next_char); |
1518 |
$self->{state} = BOGUS_COMMENT_STATE; |
$self->{state} = BOGUS_COMMENT_STATE; |
1519 |
$self->{current_token} = {type => COMMENT_TOKEN, data => '', |
$self->{current_token} = {type => COMMENT_TOKEN, data => '', |
1520 |
line => $l, column => $c}; |
#line => $l, column => $c, |
1521 |
|
}; |
1522 |
redo A; |
redo A; |
1523 |
|
|
1524 |
## ISSUE: typos in spec: chacacters, is is a parse error |
## ISSUE: typos in spec: chacacters, is is a parse error |
2316 |
} |
} |
2317 |
|
|
2318 |
return {type => CHARACTER_TOKEN, data => chr $code, |
return {type => CHARACTER_TOKEN, data => chr $code, |
2319 |
has_reference => 1, line => $l, column => $c}; |
has_reference => 1, |
2320 |
|
#line => $l, column => $c, |
2321 |
|
}; |
2322 |
} # X |
} # X |
2323 |
} elsif (0x0030 <= $self->{next_char} and |
} elsif (0x0030 <= $self->{next_char} and |
2324 |
$self->{next_char} <= 0x0039) { # 0..9 |
$self->{next_char} <= 0x0039) { # 0..9 |
2361 |
} |
} |
2362 |
|
|
2363 |
return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1, |
return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1, |
2364 |
line => $l, column => $c}; |
#line => $l, column => $c, |
2365 |
|
}; |
2366 |
} else { |
} else { |
2367 |
!!!cp (1019); |
!!!cp (1019); |
2368 |
!!!parse-error (type => 'bare nero', line => $l, column => $c); |
!!!parse-error (type => 'bare nero', line => $l, column => $c); |
2416 |
if ($match > 0) { |
if ($match > 0) { |
2417 |
!!!cp (1023); |
!!!cp (1023); |
2418 |
return {type => CHARACTER_TOKEN, data => $value, has_reference => 1, |
return {type => CHARACTER_TOKEN, data => $value, has_reference => 1, |
2419 |
line => $l, column => $c}; |
#line => $l, column => $c, |
2420 |
|
}; |
2421 |
} elsif ($match < 0) { |
} elsif ($match < 0) { |
2422 |
!!!parse-error (type => 'no refc', line => $l, column => $c); |
!!!parse-error (type => 'no refc', line => $l, column => $c); |
2423 |
if ($in_attr and $match < -1) { |
if ($in_attr and $match < -1) { |
2424 |
!!!cp (1024); |
!!!cp (1024); |
2425 |
return {type => CHARACTER_TOKEN, data => '&'.$entity_name, |
return {type => CHARACTER_TOKEN, data => '&'.$entity_name, |
2426 |
line => $l, column => $c}; |
#line => $l, column => $c, |
2427 |
|
}; |
2428 |
} else { |
} else { |
2429 |
!!!cp (1025); |
!!!cp (1025); |
2430 |
return {type => CHARACTER_TOKEN, data => $value, has_reference => 1, |
return {type => CHARACTER_TOKEN, data => $value, has_reference => 1, |
2431 |
line => $l, column => $c}; |
#line => $l, column => $c, |
2432 |
|
}; |
2433 |
} |
} |
2434 |
} else { |
} else { |
2435 |
!!!cp (1026); |
!!!cp (1026); |
2436 |
!!!parse-error (type => 'bare ero', line => $l, column => $c); |
!!!parse-error (type => 'bare ero', line => $l, column => $c); |
2437 |
## NOTE: "No characters are consumed" in the spec. |
## NOTE: "No characters are consumed" in the spec. |
2438 |
return {type => CHARACTER_TOKEN, data => '&'.$value, |
return {type => CHARACTER_TOKEN, data => '&'.$value, |
2439 |
line => $l, column => $c}; |
#line => $l, column => $c, |
2440 |
|
}; |
2441 |
} |
} |
2442 |
} else { |
} else { |
2443 |
!!!cp (1027); |
!!!cp (1027); |
2748 |
!!!cp ('t24'); |
!!!cp ('t24'); |
2749 |
$self->{application_cache_selection} |
$self->{application_cache_selection} |
2750 |
->($token->{attributes}->{manifest}->{value}); |
->($token->{attributes}->{manifest}->{value}); |
2751 |
## ISSUE: No relative reference resolution? |
## ISSUE: Spec is unclear on relative references. |
2752 |
|
## According to Hixie (#whatwg 2008-03-19), it should be |
2753 |
|
## resolved against the base URI of the document in HTML |
2754 |
|
## or xml:base of the element in XHTML. |
2755 |
} else { |
} else { |
2756 |
!!!cp ('t25'); |
!!!cp ('t25'); |
2757 |
$self->{application_cache_selection}->(undef); |
$self->{application_cache_selection}->(undef); |
5932 |
if ($prompt_attr) { |
if ($prompt_attr) { |
5933 |
!!!cp ('t390'); |
!!!cp ('t390'); |
5934 |
push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value}, |
push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value}, |
5935 |
line => $token->{line}, column => $token->{column}}; |
#line => $token->{line}, column => $token->{column}, |
5936 |
|
}; |
5937 |
} else { |
} else { |
5938 |
!!!cp ('t391'); |
!!!cp ('t391'); |
5939 |
push @tokens, {type => CHARACTER_TOKEN, |
push @tokens, {type => CHARACTER_TOKEN, |
5940 |
data => 'This is a searchable index. Insert your search keywords here: ', |
data => 'This is a searchable index. Insert your search keywords here: ', |
5941 |
line => $token->{line}, column => $token->{column}}; # SHOULD |
#line => $token->{line}, column => $token->{column}, |
5942 |
|
}; # SHOULD |
5943 |
## TODO: make this configurable |
## TODO: make this configurable |
5944 |
} |
} |
5945 |
push @tokens, |
push @tokens, |