| 177 |
if defined $self->{input_encoding}; |
if defined $self->{input_encoding}; |
| 178 |
|
|
| 179 |
my $i = 0; |
my $i = 0; |
| 180 |
my $line = 1; |
$self->{line_prev} = $self->{line} = 1; |
| 181 |
my $column = 0; |
$self->{column_prev} = $self->{column} = 0; |
| 182 |
$self->{set_next_char} = sub { |
$self->{set_next_char} = sub { |
| 183 |
my $self = shift; |
my $self = shift; |
| 184 |
|
|
| 187 |
|
|
| 188 |
$self->{next_char} = -1 and return if $i >= length $$s; |
$self->{next_char} = -1 and return if $i >= length $$s; |
| 189 |
$self->{next_char} = ord substr $$s, $i++, 1; |
$self->{next_char} = ord substr $$s, $i++, 1; |
| 190 |
$column++; |
|
| 191 |
|
($self->{line_prev}, $self->{column_prev}) |
| 192 |
|
= ($self->{line}, $self->{column}); |
| 193 |
|
$self->{column}++; |
| 194 |
|
|
| 195 |
if ($self->{next_char} == 0x000A) { # LF |
if ($self->{next_char} == 0x000A) { # LF |
| 196 |
$line++; |
$self->{line}++; |
| 197 |
$column = 0; |
$self->{column} = 0; |
| 198 |
} elsif ($self->{next_char} == 0x000D) { # CR |
} elsif ($self->{next_char} == 0x000D) { # CR |
| 199 |
$i++ if substr ($$s, $i, 1) eq "\x0A"; |
$i++ if substr ($$s, $i, 1) eq "\x0A"; |
| 200 |
$self->{next_char} = 0x000A; # LF # MUST |
$self->{next_char} = 0x000A; # LF # MUST |
| 201 |
$line++; |
$self->{line}++; |
| 202 |
$column = 0; |
$self->{column} = 0; |
| 203 |
} elsif ($self->{next_char} > 0x10FFFF) { |
} elsif ($self->{next_char} > 0x10FFFF) { |
| 204 |
$self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
$self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
| 205 |
} elsif ($self->{next_char} == 0x0000) { # NULL |
} elsif ($self->{next_char} == 0x0000) { # NULL |
| 212 |
|
|
| 213 |
my $onerror = $_[2] || sub { |
my $onerror = $_[2] || sub { |
| 214 |
my (%opt) = @_; |
my (%opt) = @_; |
| 215 |
warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n"; |
my $line = $opt{token} ? $opt{token}->{line} : $opt{line}; |
| 216 |
|
my $column = $opt{token} ? $opt{token}->{column} : $opt{column}; |
| 217 |
|
warn "Parse error ($opt{type}) at line $line column $column\n"; |
| 218 |
}; |
}; |
| 219 |
$self->{parse_error} = sub { |
$self->{parse_error} = sub { |
| 220 |
$onerror->(@_, line => $line, column => $column); |
$onerror->(line => $self->{line}, column => $self->{column}, @_); |
| 221 |
}; |
}; |
| 222 |
|
|
| 223 |
$self->_initialize_tokenizer; |
$self->_initialize_tokenizer; |
| 225 |
$self->_construct_tree; |
$self->_construct_tree; |
| 226 |
$self->_terminate_tree_constructor; |
$self->_terminate_tree_constructor; |
| 227 |
|
|
| 228 |
|
delete $self->{parse_error}; # remove loop |
| 229 |
|
|
| 230 |
return $self->{document}; |
return $self->{document}; |
| 231 |
} # parse_string |
} # parse_string |
| 232 |
|
|
| 456 |
# |
# |
| 457 |
} elsif ($self->{next_char} == -1) { |
} elsif ($self->{next_char} == -1) { |
| 458 |
!!!cp (11); |
!!!cp (11); |
| 459 |
!!!emit ({type => END_OF_FILE_TOKEN}); |
!!!emit ({type => END_OF_FILE_TOKEN, |
| 460 |
|
line => $self->{line}, column => $self->{column}}); |
| 461 |
last A; ## TODO: ok? |
last A; ## TODO: ok? |
| 462 |
} else { |
} else { |
| 463 |
!!!cp (12); |
!!!cp (12); |
| 464 |
} |
} |
| 465 |
# Anything else |
# Anything else |
| 466 |
my $token = {type => CHARACTER_TOKEN, |
my $token = {type => CHARACTER_TOKEN, |
| 467 |
data => chr $self->{next_char}}; |
data => chr $self->{next_char}, |
| 468 |
|
line => $self->{line}, column => $self->{column}}; |
| 469 |
## Stay in the data state |
## Stay in the data state |
| 470 |
!!!next-input-character; |
!!!next-input-character; |
| 471 |
|
|
| 474 |
redo A; |
redo A; |
| 475 |
} elsif ($self->{state} == ENTITY_DATA_STATE) { |
} elsif ($self->{state} == ENTITY_DATA_STATE) { |
| 476 |
## (cannot happen in CDATA state) |
## (cannot happen in CDATA state) |
| 477 |
|
|
| 478 |
|
my ($l, $c) = ($self->{line_prev}, $self->{column_prev}); |
| 479 |
|
|
| 480 |
my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1); |
my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1); |
| 481 |
|
|
| 484 |
|
|
| 485 |
unless (defined $token) { |
unless (defined $token) { |
| 486 |
!!!cp (13); |
!!!cp (13); |
| 487 |
!!!emit ({type => CHARACTER_TOKEN, data => '&'}); |
!!!emit ({type => CHARACTER_TOKEN, data => '&', |
| 488 |
|
line => $l, column => $c}); |
| 489 |
} else { |
} else { |
| 490 |
!!!cp (14); |
!!!cp (14); |
| 491 |
!!!emit ($token); |
!!!emit ($token); |
| 504 |
## reconsume |
## reconsume |
| 505 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 506 |
|
|
| 507 |
!!!emit ({type => CHARACTER_TOKEN, data => '<'}); |
!!!emit ({type => CHARACTER_TOKEN, data => '<', |
| 508 |
|
line => $self->{line_prev}, |
| 509 |
|
column => $self->{column_prev}}); |
| 510 |
|
|
| 511 |
redo A; |
redo A; |
| 512 |
} |
} |
| 526 |
!!!cp (19); |
!!!cp (19); |
| 527 |
$self->{current_token} |
$self->{current_token} |
| 528 |
= {type => START_TAG_TOKEN, |
= {type => START_TAG_TOKEN, |
| 529 |
tag_name => chr ($self->{next_char} + 0x0020)}; |
tag_name => chr ($self->{next_char} + 0x0020), |
| 530 |
|
line => $self->{line_prev}, |
| 531 |
|
column => $self->{column_prev}}; |
| 532 |
$self->{state} = TAG_NAME_STATE; |
$self->{state} = TAG_NAME_STATE; |
| 533 |
!!!next-input-character; |
!!!next-input-character; |
| 534 |
redo A; |
redo A; |
| 536 |
$self->{next_char} <= 0x007A) { # a..z |
$self->{next_char} <= 0x007A) { # a..z |
| 537 |
!!!cp (20); |
!!!cp (20); |
| 538 |
$self->{current_token} = {type => START_TAG_TOKEN, |
$self->{current_token} = {type => START_TAG_TOKEN, |
| 539 |
tag_name => chr ($self->{next_char})}; |
tag_name => chr ($self->{next_char}), |
| 540 |
|
line => $self->{line_prev}, |
| 541 |
|
column => $self->{column_prev}}; |
| 542 |
$self->{state} = TAG_NAME_STATE; |
$self->{state} = TAG_NAME_STATE; |
| 543 |
!!!next-input-character; |
!!!next-input-character; |
| 544 |
redo A; |
redo A; |
| 548 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 549 |
!!!next-input-character; |
!!!next-input-character; |
| 550 |
|
|
| 551 |
!!!emit ({type => CHARACTER_TOKEN, data => '<>'}); |
!!!emit ({type => CHARACTER_TOKEN, data => '<>', |
| 552 |
|
line => $self->{line_prev}, |
| 553 |
|
column => $self->{column_prev}}); |
| 554 |
|
|
| 555 |
redo A; |
redo A; |
| 556 |
} elsif ($self->{next_char} == 0x003F) { # ? |
} elsif ($self->{next_char} == 0x003F) { # ? |
| 557 |
!!!cp (22); |
!!!cp (22); |
| 558 |
!!!parse-error (type => 'pio'); |
!!!parse-error (type => 'pio'); |
| 559 |
$self->{state} = BOGUS_COMMENT_STATE; |
$self->{state} = BOGUS_COMMENT_STATE; |
| 560 |
|
$self->{current_token} = {type => COMMENT_TOKEN, data => '', |
| 561 |
|
line => $self->{line_prev}, |
| 562 |
|
column => $self->{column_prev}}; |
| 563 |
## $self->{next_char} is intentionally left as is |
## $self->{next_char} is intentionally left as is |
| 564 |
redo A; |
redo A; |
| 565 |
} else { |
} else { |
| 568 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 569 |
## reconsume |
## reconsume |
| 570 |
|
|
| 571 |
!!!emit ({type => CHARACTER_TOKEN, data => '<'}); |
!!!emit ({type => CHARACTER_TOKEN, data => '<', |
| 572 |
|
line => $self->{line_prev}, |
| 573 |
|
column => $self->{column_prev}}); |
| 574 |
|
|
| 575 |
redo A; |
redo A; |
| 576 |
} |
} |
| 578 |
die "$0: $self->{content_model} in tag open"; |
die "$0: $self->{content_model} in tag open"; |
| 579 |
} |
} |
| 580 |
} elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) { |
} elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) { |
| 581 |
|
my ($l, $c) = ($self->{line_prev}, $self->{column_prev}); |
| 582 |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
| 583 |
if (defined $self->{last_emitted_start_tag_name}) { |
if (defined $self->{last_emitted_start_tag_name}) { |
| 584 |
|
|
| 585 |
## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564> |
## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564> |
| 586 |
my @next_char; |
my @next_char; |
| 587 |
TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) { |
TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) { |
| 598 |
!!!back-next-input-character (@next_char); |
!!!back-next-input-character (@next_char); |
| 599 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 600 |
|
|
| 601 |
!!!emit ({type => CHARACTER_TOKEN, data => '</'}); |
!!!emit ({type => CHARACTER_TOKEN, data => '</', |
| 602 |
|
line => $l, column => $c}); |
| 603 |
|
|
| 604 |
redo A; |
redo A; |
| 605 |
} |
} |
| 618 |
$self->{next_char} = shift @next_char; # reconsume |
$self->{next_char} = shift @next_char; # reconsume |
| 619 |
!!!back-next-input-character (@next_char); |
!!!back-next-input-character (@next_char); |
| 620 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 621 |
!!!emit ({type => CHARACTER_TOKEN, data => '</'}); |
!!!emit ({type => CHARACTER_TOKEN, data => '</', |
| 622 |
|
line => $l, column => $c}); |
| 623 |
redo A; |
redo A; |
| 624 |
} else { |
} else { |
| 625 |
!!!cp (27); |
!!!cp (27); |
| 632 |
!!!cp (28); |
!!!cp (28); |
| 633 |
# next-input-character is already done |
# next-input-character is already done |
| 634 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 635 |
!!!emit ({type => CHARACTER_TOKEN, data => '</'}); |
!!!emit ({type => CHARACTER_TOKEN, data => '</', |
| 636 |
|
line => $l, column => $c}); |
| 637 |
redo A; |
redo A; |
| 638 |
} |
} |
| 639 |
} |
} |
| 641 |
if (0x0041 <= $self->{next_char} and |
if (0x0041 <= $self->{next_char} and |
| 642 |
$self->{next_char} <= 0x005A) { # A..Z |
$self->{next_char} <= 0x005A) { # A..Z |
| 643 |
!!!cp (29); |
!!!cp (29); |
| 644 |
$self->{current_token} = {type => END_TAG_TOKEN, |
$self->{current_token} |
| 645 |
tag_name => chr ($self->{next_char} + 0x0020)}; |
= {type => END_TAG_TOKEN, |
| 646 |
|
tag_name => chr ($self->{next_char} + 0x0020), |
| 647 |
|
line => $l, column => $c}; |
| 648 |
$self->{state} = TAG_NAME_STATE; |
$self->{state} = TAG_NAME_STATE; |
| 649 |
!!!next-input-character; |
!!!next-input-character; |
| 650 |
redo A; |
redo A; |
| 652 |
$self->{next_char} <= 0x007A) { # a..z |
$self->{next_char} <= 0x007A) { # a..z |
| 653 |
!!!cp (30); |
!!!cp (30); |
| 654 |
$self->{current_token} = {type => END_TAG_TOKEN, |
$self->{current_token} = {type => END_TAG_TOKEN, |
| 655 |
tag_name => chr ($self->{next_char})}; |
tag_name => chr ($self->{next_char}), |
| 656 |
|
line => $l, column => $c}; |
| 657 |
$self->{state} = TAG_NAME_STATE; |
$self->{state} = TAG_NAME_STATE; |
| 658 |
!!!next-input-character; |
!!!next-input-character; |
| 659 |
redo A; |
redo A; |
| 669 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 670 |
# reconsume |
# reconsume |
| 671 |
|
|
| 672 |
!!!emit ({type => CHARACTER_TOKEN, data => '</'}); |
!!!emit ({type => CHARACTER_TOKEN, data => '</', |
| 673 |
|
line => $l, column => $c}); |
| 674 |
|
|
| 675 |
redo A; |
redo A; |
| 676 |
} else { |
} else { |
| 677 |
!!!cp (33); |
!!!cp (33); |
| 678 |
!!!parse-error (type => 'bogus end tag'); |
!!!parse-error (type => 'bogus end tag'); |
| 679 |
$self->{state} = BOGUS_COMMENT_STATE; |
$self->{state} = BOGUS_COMMENT_STATE; |
| 680 |
|
$self->{current_token} = {type => COMMENT_TOKEN, data => '', |
| 681 |
|
line => $self->{line_prev}, # "<" of "</" |
| 682 |
|
column => $self->{column_prev} - 1}; |
| 683 |
## $self->{next_char} is intentionally left as is |
## $self->{next_char} is intentionally left as is |
| 684 |
redo A; |
redo A; |
| 685 |
} |
} |
| 1416 |
} elsif ($self->{state} == BOGUS_COMMENT_STATE) { |
} elsif ($self->{state} == BOGUS_COMMENT_STATE) { |
| 1417 |
## (only happen if PCDATA state) |
## (only happen if PCDATA state) |
| 1418 |
|
|
| 1419 |
my $token = {type => COMMENT_TOKEN, data => ''}; |
## NOTE: Set by the previous state |
| 1420 |
|
#my $token = {type => COMMENT_TOKEN, data => ''}; |
| 1421 |
|
|
| 1422 |
BC: { |
BC: { |
| 1423 |
if ($self->{next_char} == 0x003E) { # > |
if ($self->{next_char} == 0x003E) { # > |
| 1425 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1426 |
!!!next-input-character; |
!!!next-input-character; |
| 1427 |
|
|
| 1428 |
!!!emit ($token); |
!!!emit ($self->{current_token}); # comment |
| 1429 |
|
|
| 1430 |
redo A; |
redo A; |
| 1431 |
} elsif ($self->{next_char} == -1) { |
} elsif ($self->{next_char} == -1) { |
| 1433 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1434 |
## reconsume |
## reconsume |
| 1435 |
|
|
| 1436 |
!!!emit ($token); |
!!!emit ($self->{current_token}); # comment |
| 1437 |
|
|
| 1438 |
redo A; |
redo A; |
| 1439 |
} else { |
} else { |
| 1440 |
!!!cp (126); |
!!!cp (126); |
| 1441 |
$token->{data} .= chr ($self->{next_char}); |
$self->{current_token}->{data} .= chr ($self->{next_char}); # comment |
| 1442 |
!!!next-input-character; |
!!!next-input-character; |
| 1443 |
redo BC; |
redo BC; |
| 1444 |
} |
} |
| 1448 |
} elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) { |
} elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) { |
| 1449 |
## (only happen if PCDATA state) |
## (only happen if PCDATA state) |
| 1450 |
|
|
| 1451 |
|
my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); |
| 1452 |
|
|
| 1453 |
my @next_char; |
my @next_char; |
| 1454 |
push @next_char, $self->{next_char}; |
push @next_char, $self->{next_char}; |
| 1455 |
|
|
| 1458 |
push @next_char, $self->{next_char}; |
push @next_char, $self->{next_char}; |
| 1459 |
if ($self->{next_char} == 0x002D) { # - |
if ($self->{next_char} == 0x002D) { # - |
| 1460 |
!!!cp (127); |
!!!cp (127); |
| 1461 |
$self->{current_token} = {type => COMMENT_TOKEN, data => ''}; |
$self->{current_token} = {type => COMMENT_TOKEN, data => '', |
| 1462 |
|
line => $l, column => $c}; |
| 1463 |
$self->{state} = COMMENT_START_STATE; |
$self->{state} = COMMENT_START_STATE; |
| 1464 |
!!!next-input-character; |
!!!next-input-character; |
| 1465 |
redo A; |
redo A; |
| 1495 |
!!!cp (129); |
!!!cp (129); |
| 1496 |
## TODO: What a stupid code this is! |
## TODO: What a stupid code this is! |
| 1497 |
$self->{state} = DOCTYPE_STATE; |
$self->{state} = DOCTYPE_STATE; |
| 1498 |
|
$self->{current_token} = {type => DOCTYPE_TOKEN, |
| 1499 |
|
quirks => 1, |
| 1500 |
|
line => $l, column => $c}; |
| 1501 |
!!!next-input-character; |
!!!next-input-character; |
| 1502 |
redo A; |
redo A; |
| 1503 |
} else { |
} else { |
| 1526 |
$self->{next_char} = shift @next_char; |
$self->{next_char} = shift @next_char; |
| 1527 |
!!!back-next-input-character (@next_char); |
!!!back-next-input-character (@next_char); |
| 1528 |
$self->{state} = BOGUS_COMMENT_STATE; |
$self->{state} = BOGUS_COMMENT_STATE; |
| 1529 |
|
$self->{current_token} = {type => COMMENT_TOKEN, data => '', |
| 1530 |
|
line => $l, column => $c}; |
| 1531 |
redo A; |
redo A; |
| 1532 |
|
|
| 1533 |
## ISSUE: typos in spec: chacacters, is is a parse error |
## ISSUE: typos in spec: chacacters, is is a parse error |
| 1706 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1707 |
!!!next-input-character; |
!!!next-input-character; |
| 1708 |
|
|
| 1709 |
!!!emit ({type => DOCTYPE_TOKEN, quirks => 1}); |
!!!emit ($self->{current_token}); # DOCTYPE (quirks) |
| 1710 |
|
|
| 1711 |
redo A; |
redo A; |
| 1712 |
} elsif ($self->{next_char} == -1) { |
} elsif ($self->{next_char} == -1) { |
| 1715 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1716 |
## reconsume |
## reconsume |
| 1717 |
|
|
| 1718 |
!!!emit ({type => DOCTYPE_TOKEN, quirks => 1}); |
!!!emit ($self->{current_token}); # DOCTYPE (quirks) |
| 1719 |
|
|
| 1720 |
redo A; |
redo A; |
| 1721 |
} else { |
} else { |
| 1722 |
!!!cp (160); |
!!!cp (160); |
| 1723 |
$self->{current_token} |
$self->{current_token}->{name} = chr $self->{next_char}; |
| 1724 |
= {type => DOCTYPE_TOKEN, |
delete $self->{current_token}->{quirks}; |
|
name => chr ($self->{next_char}), |
|
|
#quirks => 0, |
|
|
}; |
|
| 1725 |
## ISSUE: "Set the token's name name to the" in the spec |
## ISSUE: "Set the token's name name to the" in the spec |
| 1726 |
$self->{state} = DOCTYPE_NAME_STATE; |
$self->{state} = DOCTYPE_NAME_STATE; |
| 1727 |
!!!next-input-character; |
!!!next-input-character; |
| 2248 |
sub _tokenize_attempt_to_consume_an_entity ($$$) { |
sub _tokenize_attempt_to_consume_an_entity ($$$) { |
| 2249 |
my ($self, $in_attr, $additional) = @_; |
my ($self, $in_attr, $additional) = @_; |
| 2250 |
|
|
| 2251 |
|
my ($l, $c) = ($self->{line_prev}, $self->{column_prev}); |
| 2252 |
|
|
| 2253 |
if ({ |
if ({ |
| 2254 |
0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF, |
0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF, |
| 2255 |
0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR |
0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR |
| 2290 |
redo X; |
redo X; |
| 2291 |
} elsif (not defined $code) { # no hexadecimal digit |
} elsif (not defined $code) { # no hexadecimal digit |
| 2292 |
!!!cp (1005); |
!!!cp (1005); |
| 2293 |
!!!parse-error (type => 'bare hcro'); |
!!!parse-error (type => 'bare hcro', line => $l, column => $c); |
| 2294 |
!!!back-next-input-character ($x_char, $self->{next_char}); |
!!!back-next-input-character ($x_char, $self->{next_char}); |
| 2295 |
$self->{next_char} = 0x0023; # # |
$self->{next_char} = 0x0023; # # |
| 2296 |
return undef; |
return undef; |
| 2299 |
!!!next-input-character; |
!!!next-input-character; |
| 2300 |
} else { |
} else { |
| 2301 |
!!!cp (1007); |
!!!cp (1007); |
| 2302 |
!!!parse-error (type => 'no refc'); |
!!!parse-error (type => 'no refc', line => $l, column => $c); |
| 2303 |
} |
} |
| 2304 |
|
|
| 2305 |
if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) { |
if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) { |
| 2306 |
!!!cp (1008); |
!!!cp (1008); |
| 2307 |
!!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code); |
!!!parse-error (type => (sprintf 'invalid character reference:U+%04X', $code), line => $l, column => $c); |
| 2308 |
$code = 0xFFFD; |
$code = 0xFFFD; |
| 2309 |
} elsif ($code > 0x10FFFF) { |
} elsif ($code > 0x10FFFF) { |
| 2310 |
!!!cp (1009); |
!!!cp (1009); |
| 2311 |
!!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code); |
!!!parse-error (type => (sprintf 'invalid character reference:U-%08X', $code), line => $l, column => $c); |
| 2312 |
$code = 0xFFFD; |
$code = 0xFFFD; |
| 2313 |
} elsif ($code == 0x000D) { |
} elsif ($code == 0x000D) { |
| 2314 |
!!!cp (1010); |
!!!cp (1010); |
| 2315 |
!!!parse-error (type => 'CR character reference'); |
!!!parse-error (type => 'CR character reference', line => $l, column => $c); |
| 2316 |
$code = 0x000A; |
$code = 0x000A; |
| 2317 |
} elsif (0x80 <= $code and $code <= 0x9F) { |
} elsif (0x80 <= $code and $code <= 0x9F) { |
| 2318 |
!!!cp (1011); |
!!!cp (1011); |
| 2319 |
!!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code); |
!!!parse-error (type => (sprintf 'C1 character reference:U+%04X', $code), line => $l, column => $c); |
| 2320 |
$code = $c1_entity_char->{$code}; |
$code = $c1_entity_char->{$code}; |
| 2321 |
} |
} |
| 2322 |
|
|
| 2323 |
return {type => CHARACTER_TOKEN, data => chr $code, |
return {type => CHARACTER_TOKEN, data => chr $code, |
| 2324 |
has_reference => 1}; |
has_reference => 1, line => $l, column => $c}; |
| 2325 |
} # X |
} # X |
| 2326 |
} elsif (0x0030 <= $self->{next_char} and |
} elsif (0x0030 <= $self->{next_char} and |
| 2327 |
$self->{next_char} <= 0x0039) { # 0..9 |
$self->{next_char} <= 0x0039) { # 0..9 |
| 2342 |
!!!next-input-character; |
!!!next-input-character; |
| 2343 |
} else { |
} else { |
| 2344 |
!!!cp (1014); |
!!!cp (1014); |
| 2345 |
!!!parse-error (type => 'no refc'); |
!!!parse-error (type => 'no refc', line => $l, column => $c); |
| 2346 |
} |
} |
| 2347 |
|
|
| 2348 |
if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) { |
if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) { |
| 2349 |
!!!cp (1015); |
!!!cp (1015); |
| 2350 |
!!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code); |
!!!parse-error (type => (sprintf 'invalid character reference:U+%04X', $code), line => $l, column => $c); |
| 2351 |
$code = 0xFFFD; |
$code = 0xFFFD; |
| 2352 |
} elsif ($code > 0x10FFFF) { |
} elsif ($code > 0x10FFFF) { |
| 2353 |
!!!cp (1016); |
!!!cp (1016); |
| 2354 |
!!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code); |
!!!parse-error (type => (sprintf 'invalid character reference:U-%08X', $code), line => $l, column => $c); |
| 2355 |
$code = 0xFFFD; |
$code = 0xFFFD; |
| 2356 |
} elsif ($code == 0x000D) { |
} elsif ($code == 0x000D) { |
| 2357 |
!!!cp (1017); |
!!!cp (1017); |
| 2358 |
!!!parse-error (type => 'CR character reference'); |
!!!parse-error (type => 'CR character reference', line => $l, column => $c); |
| 2359 |
$code = 0x000A; |
$code = 0x000A; |
| 2360 |
} elsif (0x80 <= $code and $code <= 0x9F) { |
} elsif (0x80 <= $code and $code <= 0x9F) { |
| 2361 |
!!!cp (1018); |
!!!cp (1018); |
| 2362 |
!!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code); |
!!!parse-error (type => (sprintf 'C1 character reference:U+%04X', $code), line => $l, column => $c); |
| 2363 |
$code = $c1_entity_char->{$code}; |
$code = $c1_entity_char->{$code}; |
| 2364 |
} |
} |
| 2365 |
|
|
| 2366 |
return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1}; |
return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1, |
| 2367 |
|
line => $l, column => $c}; |
| 2368 |
} else { |
} else { |
| 2369 |
!!!cp (1019); |
!!!cp (1019); |
| 2370 |
!!!parse-error (type => 'bare nero'); |
!!!parse-error (type => 'bare nero', line => $l, column => $c); |
| 2371 |
!!!back-next-input-character ($self->{next_char}); |
!!!back-next-input-character ($self->{next_char}); |
| 2372 |
$self->{next_char} = 0x0023; # # |
$self->{next_char} = 0x0023; # # |
| 2373 |
return undef; |
return undef; |
| 2417 |
|
|
| 2418 |
if ($match > 0) { |
if ($match > 0) { |
| 2419 |
!!!cp (1023); |
!!!cp (1023); |
| 2420 |
return {type => CHARACTER_TOKEN, data => $value, has_reference => 1}; |
return {type => CHARACTER_TOKEN, data => $value, has_reference => 1, |
| 2421 |
|
line => $l, column => $c}; |
| 2422 |
} elsif ($match < 0) { |
} elsif ($match < 0) { |
| 2423 |
!!!parse-error (type => 'no refc'); |
!!!parse-error (type => 'no refc', line => $l, column => $c); |
| 2424 |
if ($in_attr and $match < -1) { |
if ($in_attr and $match < -1) { |
| 2425 |
!!!cp (1024); |
!!!cp (1024); |
| 2426 |
return {type => CHARACTER_TOKEN, data => '&'.$entity_name}; |
return {type => CHARACTER_TOKEN, data => '&'.$entity_name, |
| 2427 |
|
line => $l, column => $c}; |
| 2428 |
} else { |
} else { |
| 2429 |
!!!cp (1025); |
!!!cp (1025); |
| 2430 |
return {type => CHARACTER_TOKEN, data => $value, has_reference => 1}; |
return {type => CHARACTER_TOKEN, data => $value, has_reference => 1, |
| 2431 |
|
line => $l, column => $c}; |
| 2432 |
} |
} |
| 2433 |
} else { |
} else { |
| 2434 |
!!!cp (1026); |
!!!cp (1026); |
| 2435 |
!!!parse-error (type => 'bare ero'); |
!!!parse-error (type => 'bare ero', line => $l, column => $c); |
| 2436 |
## NOTE: "No characters are consumed" in the spec. |
## NOTE: "No characters are consumed" in the spec. |
| 2437 |
return {type => CHARACTER_TOKEN, data => '&'.$value}; |
return {type => CHARACTER_TOKEN, data => '&'.$value, |
| 2438 |
|
line => $l, column => $c}; |
| 2439 |
} |
} |
| 2440 |
} else { |
} else { |
| 2441 |
!!!cp (1027); |
!!!cp (1027); |
| 2442 |
## no characters are consumed |
## no characters are consumed |
| 2443 |
!!!parse-error (type => 'bare ero'); |
!!!parse-error (type => 'bare ero', line => $l, column => $c); |
| 2444 |
return undef; |
return undef; |
| 2445 |
} |
} |
| 2446 |
} # _tokenize_attempt_to_consume_an_entity |
} # _tokenize_attempt_to_consume_an_entity |