177 |
if defined $self->{input_encoding}; |
if defined $self->{input_encoding}; |
178 |
|
|
179 |
my $i = 0; |
my $i = 0; |
180 |
my $line = 1; |
$self->{line_prev} = $self->{line} = 1; |
181 |
my $column = 0; |
$self->{column_prev} = $self->{column} = 0; |
182 |
$self->{set_next_char} = sub { |
$self->{set_next_char} = sub { |
183 |
my $self = shift; |
my $self = shift; |
184 |
|
|
187 |
|
|
188 |
$self->{next_char} = -1 and return if $i >= length $$s; |
$self->{next_char} = -1 and return if $i >= length $$s; |
189 |
$self->{next_char} = ord substr $$s, $i++, 1; |
$self->{next_char} = ord substr $$s, $i++, 1; |
190 |
$column++; |
|
191 |
|
($self->{line_prev}, $self->{column_prev}) |
192 |
|
= ($self->{line}, $self->{column}); |
193 |
|
$self->{column}++; |
194 |
|
|
195 |
if ($self->{next_char} == 0x000A) { # LF |
if ($self->{next_char} == 0x000A) { # LF |
196 |
$line++; |
$self->{line}++; |
197 |
$column = 0; |
$self->{column} = 0; |
198 |
} elsif ($self->{next_char} == 0x000D) { # CR |
} elsif ($self->{next_char} == 0x000D) { # CR |
199 |
$i++ if substr ($$s, $i, 1) eq "\x0A"; |
$i++ if substr ($$s, $i, 1) eq "\x0A"; |
200 |
$self->{next_char} = 0x000A; # LF # MUST |
$self->{next_char} = 0x000A; # LF # MUST |
201 |
$line++; |
$self->{line}++; |
202 |
$column = 0; |
$self->{column} = 0; |
203 |
} elsif ($self->{next_char} > 0x10FFFF) { |
} elsif ($self->{next_char} > 0x10FFFF) { |
204 |
$self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
$self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
205 |
} elsif ($self->{next_char} == 0x0000) { # NULL |
} elsif ($self->{next_char} == 0x0000) { # NULL |
212 |
|
|
213 |
my $onerror = $_[2] || sub { |
my $onerror = $_[2] || sub { |
214 |
my (%opt) = @_; |
my (%opt) = @_; |
215 |
warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n"; |
my $line = $opt{token} ? $opt{token}->{line} : $opt{line}; |
216 |
|
my $column = $opt{token} ? $opt{token}->{column} : $opt{column}; |
217 |
|
warn "Parse error ($opt{type}) at line $line column $column\n"; |
218 |
}; |
}; |
219 |
$self->{parse_error} = sub { |
$self->{parse_error} = sub { |
220 |
$onerror->(@_, line => $line, column => $column); |
$onerror->(line => $self->{line}, column => $self->{column}, @_); |
221 |
}; |
}; |
222 |
|
|
223 |
$self->_initialize_tokenizer; |
$self->_initialize_tokenizer; |
225 |
$self->_construct_tree; |
$self->_construct_tree; |
226 |
$self->_terminate_tree_constructor; |
$self->_terminate_tree_constructor; |
227 |
|
|
228 |
|
delete $self->{parse_error}; # remove loop |
229 |
|
|
230 |
return $self->{document}; |
return $self->{document}; |
231 |
} # parse_string |
} # parse_string |
232 |
|
|
456 |
# |
# |
457 |
} elsif ($self->{next_char} == -1) { |
} elsif ($self->{next_char} == -1) { |
458 |
!!!cp (11); |
!!!cp (11); |
459 |
!!!emit ({type => END_OF_FILE_TOKEN}); |
!!!emit ({type => END_OF_FILE_TOKEN, |
460 |
|
line => $self->{line}, column => $self->{column}}); |
461 |
last A; ## TODO: ok? |
last A; ## TODO: ok? |
462 |
} else { |
} else { |
463 |
!!!cp (12); |
!!!cp (12); |
464 |
} |
} |
465 |
# Anything else |
# Anything else |
466 |
my $token = {type => CHARACTER_TOKEN, |
my $token = {type => CHARACTER_TOKEN, |
467 |
data => chr $self->{next_char}}; |
data => chr $self->{next_char}, |
468 |
|
line => $self->{line}, column => $self->{column}}; |
469 |
## Stay in the data state |
## Stay in the data state |
470 |
!!!next-input-character; |
!!!next-input-character; |
471 |
|
|
474 |
redo A; |
redo A; |
475 |
} elsif ($self->{state} == ENTITY_DATA_STATE) { |
} elsif ($self->{state} == ENTITY_DATA_STATE) { |
476 |
## (cannot happen in CDATA state) |
## (cannot happen in CDATA state) |
477 |
|
|
478 |
|
my ($l, $c) = ($self->{line_prev}, $self->{column_prev}); |
479 |
|
|
480 |
my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1); |
my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1); |
481 |
|
|
484 |
|
|
485 |
unless (defined $token) { |
unless (defined $token) { |
486 |
!!!cp (13); |
!!!cp (13); |
487 |
!!!emit ({type => CHARACTER_TOKEN, data => '&'}); |
!!!emit ({type => CHARACTER_TOKEN, data => '&', |
488 |
|
line => $l, column => $c}); |
489 |
} else { |
} else { |
490 |
!!!cp (14); |
!!!cp (14); |
491 |
!!!emit ($token); |
!!!emit ($token); |
504 |
## reconsume |
## reconsume |
505 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
506 |
|
|
507 |
!!!emit ({type => CHARACTER_TOKEN, data => '<'}); |
!!!emit ({type => CHARACTER_TOKEN, data => '<', |
508 |
|
line => $self->{line_prev}, |
509 |
|
column => $self->{column_prev}}); |
510 |
|
|
511 |
redo A; |
redo A; |
512 |
} |
} |
526 |
!!!cp (19); |
!!!cp (19); |
527 |
$self->{current_token} |
$self->{current_token} |
528 |
= {type => START_TAG_TOKEN, |
= {type => START_TAG_TOKEN, |
529 |
tag_name => chr ($self->{next_char} + 0x0020)}; |
tag_name => chr ($self->{next_char} + 0x0020), |
530 |
|
line => $self->{line_prev}, |
531 |
|
column => $self->{column_prev}}; |
532 |
$self->{state} = TAG_NAME_STATE; |
$self->{state} = TAG_NAME_STATE; |
533 |
!!!next-input-character; |
!!!next-input-character; |
534 |
redo A; |
redo A; |
536 |
$self->{next_char} <= 0x007A) { # a..z |
$self->{next_char} <= 0x007A) { # a..z |
537 |
!!!cp (20); |
!!!cp (20); |
538 |
$self->{current_token} = {type => START_TAG_TOKEN, |
$self->{current_token} = {type => START_TAG_TOKEN, |
539 |
tag_name => chr ($self->{next_char})}; |
tag_name => chr ($self->{next_char}), |
540 |
|
line => $self->{line_prev}, |
541 |
|
column => $self->{column_prev}}; |
542 |
$self->{state} = TAG_NAME_STATE; |
$self->{state} = TAG_NAME_STATE; |
543 |
!!!next-input-character; |
!!!next-input-character; |
544 |
redo A; |
redo A; |
548 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
549 |
!!!next-input-character; |
!!!next-input-character; |
550 |
|
|
551 |
!!!emit ({type => CHARACTER_TOKEN, data => '<>'}); |
!!!emit ({type => CHARACTER_TOKEN, data => '<>', |
552 |
|
line => $self->{line_prev}, |
553 |
|
column => $self->{column_prev}}); |
554 |
|
|
555 |
redo A; |
redo A; |
556 |
} elsif ($self->{next_char} == 0x003F) { # ? |
} elsif ($self->{next_char} == 0x003F) { # ? |
557 |
!!!cp (22); |
!!!cp (22); |
558 |
!!!parse-error (type => 'pio'); |
!!!parse-error (type => 'pio'); |
559 |
$self->{state} = BOGUS_COMMENT_STATE; |
$self->{state} = BOGUS_COMMENT_STATE; |
560 |
|
$self->{current_token} = {type => COMMENT_TOKEN, data => '', |
561 |
|
line => $self->{line_prev}, |
562 |
|
column => $self->{column_prev}}; |
563 |
## $self->{next_char} is intentionally left as is |
## $self->{next_char} is intentionally left as is |
564 |
redo A; |
redo A; |
565 |
} else { |
} else { |
568 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
569 |
## reconsume |
## reconsume |
570 |
|
|
571 |
!!!emit ({type => CHARACTER_TOKEN, data => '<'}); |
!!!emit ({type => CHARACTER_TOKEN, data => '<', |
572 |
|
line => $self->{line_prev}, |
573 |
|
column => $self->{column_prev}}); |
574 |
|
|
575 |
redo A; |
redo A; |
576 |
} |
} |
578 |
die "$0: $self->{content_model} in tag open"; |
die "$0: $self->{content_model} in tag open"; |
579 |
} |
} |
580 |
} elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) { |
} elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) { |
581 |
|
my ($l, $c) = ($self->{line_prev}, $self->{column_prev}); |
582 |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA |
583 |
if (defined $self->{last_emitted_start_tag_name}) { |
if (defined $self->{last_emitted_start_tag_name}) { |
584 |
|
|
585 |
## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564> |
## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564> |
586 |
my @next_char; |
my @next_char; |
587 |
TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) { |
TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) { |
598 |
!!!back-next-input-character (@next_char); |
!!!back-next-input-character (@next_char); |
599 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
600 |
|
|
601 |
!!!emit ({type => CHARACTER_TOKEN, data => '</'}); |
!!!emit ({type => CHARACTER_TOKEN, data => '</', |
602 |
|
line => $l, column => $c}); |
603 |
|
|
604 |
redo A; |
redo A; |
605 |
} |
} |
618 |
$self->{next_char} = shift @next_char; # reconsume |
$self->{next_char} = shift @next_char; # reconsume |
619 |
!!!back-next-input-character (@next_char); |
!!!back-next-input-character (@next_char); |
620 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
621 |
!!!emit ({type => CHARACTER_TOKEN, data => '</'}); |
!!!emit ({type => CHARACTER_TOKEN, data => '</', |
622 |
|
line => $l, column => $c}); |
623 |
redo A; |
redo A; |
624 |
} else { |
} else { |
625 |
!!!cp (27); |
!!!cp (27); |
632 |
!!!cp (28); |
!!!cp (28); |
633 |
# next-input-character is already done |
# next-input-character is already done |
634 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
635 |
!!!emit ({type => CHARACTER_TOKEN, data => '</'}); |
!!!emit ({type => CHARACTER_TOKEN, data => '</', |
636 |
|
line => $l, column => $c}); |
637 |
redo A; |
redo A; |
638 |
} |
} |
639 |
} |
} |
641 |
if (0x0041 <= $self->{next_char} and |
if (0x0041 <= $self->{next_char} and |
642 |
$self->{next_char} <= 0x005A) { # A..Z |
$self->{next_char} <= 0x005A) { # A..Z |
643 |
!!!cp (29); |
!!!cp (29); |
644 |
$self->{current_token} = {type => END_TAG_TOKEN, |
$self->{current_token} |
645 |
tag_name => chr ($self->{next_char} + 0x0020)}; |
= {type => END_TAG_TOKEN, |
646 |
|
tag_name => chr ($self->{next_char} + 0x0020), |
647 |
|
line => $l, column => $c}; |
648 |
$self->{state} = TAG_NAME_STATE; |
$self->{state} = TAG_NAME_STATE; |
649 |
!!!next-input-character; |
!!!next-input-character; |
650 |
redo A; |
redo A; |
652 |
$self->{next_char} <= 0x007A) { # a..z |
$self->{next_char} <= 0x007A) { # a..z |
653 |
!!!cp (30); |
!!!cp (30); |
654 |
$self->{current_token} = {type => END_TAG_TOKEN, |
$self->{current_token} = {type => END_TAG_TOKEN, |
655 |
tag_name => chr ($self->{next_char})}; |
tag_name => chr ($self->{next_char}), |
656 |
|
line => $l, column => $c}; |
657 |
$self->{state} = TAG_NAME_STATE; |
$self->{state} = TAG_NAME_STATE; |
658 |
!!!next-input-character; |
!!!next-input-character; |
659 |
redo A; |
redo A; |
669 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
670 |
# reconsume |
# reconsume |
671 |
|
|
672 |
!!!emit ({type => CHARACTER_TOKEN, data => '</'}); |
!!!emit ({type => CHARACTER_TOKEN, data => '</', |
673 |
|
line => $l, column => $c}); |
674 |
|
|
675 |
redo A; |
redo A; |
676 |
} else { |
} else { |
677 |
!!!cp (33); |
!!!cp (33); |
678 |
!!!parse-error (type => 'bogus end tag'); |
!!!parse-error (type => 'bogus end tag'); |
679 |
$self->{state} = BOGUS_COMMENT_STATE; |
$self->{state} = BOGUS_COMMENT_STATE; |
680 |
|
$self->{current_token} = {type => COMMENT_TOKEN, data => '', |
681 |
|
line => $self->{line_prev}, # "<" of "</" |
682 |
|
column => $self->{column_prev} - 1}; |
683 |
## $self->{next_char} is intentionally left as is |
## $self->{next_char} is intentionally left as is |
684 |
redo A; |
redo A; |
685 |
} |
} |
1416 |
} elsif ($self->{state} == BOGUS_COMMENT_STATE) { |
} elsif ($self->{state} == BOGUS_COMMENT_STATE) { |
1417 |
## (only happen if PCDATA state) |
## (only happen if PCDATA state) |
1418 |
|
|
1419 |
my $token = {type => COMMENT_TOKEN, data => ''}; |
## NOTE: Set by the previous state |
1420 |
|
#my $token = {type => COMMENT_TOKEN, data => ''}; |
1421 |
|
|
1422 |
BC: { |
BC: { |
1423 |
if ($self->{next_char} == 0x003E) { # > |
if ($self->{next_char} == 0x003E) { # > |
1425 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1426 |
!!!next-input-character; |
!!!next-input-character; |
1427 |
|
|
1428 |
!!!emit ($token); |
!!!emit ($self->{current_token}); # comment |
1429 |
|
|
1430 |
redo A; |
redo A; |
1431 |
} elsif ($self->{next_char} == -1) { |
} elsif ($self->{next_char} == -1) { |
1433 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1434 |
## reconsume |
## reconsume |
1435 |
|
|
1436 |
!!!emit ($token); |
!!!emit ($self->{current_token}); # comment |
1437 |
|
|
1438 |
redo A; |
redo A; |
1439 |
} else { |
} else { |
1440 |
!!!cp (126); |
!!!cp (126); |
1441 |
$token->{data} .= chr ($self->{next_char}); |
$self->{current_token}->{data} .= chr ($self->{next_char}); # comment |
1442 |
!!!next-input-character; |
!!!next-input-character; |
1443 |
redo BC; |
redo BC; |
1444 |
} |
} |
1448 |
} elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) { |
} elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) { |
1449 |
## (only happen if PCDATA state) |
## (only happen if PCDATA state) |
1450 |
|
|
1451 |
|
my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); |
1452 |
|
|
1453 |
my @next_char; |
my @next_char; |
1454 |
push @next_char, $self->{next_char}; |
push @next_char, $self->{next_char}; |
1455 |
|
|
1458 |
push @next_char, $self->{next_char}; |
push @next_char, $self->{next_char}; |
1459 |
if ($self->{next_char} == 0x002D) { # - |
if ($self->{next_char} == 0x002D) { # - |
1460 |
!!!cp (127); |
!!!cp (127); |
1461 |
$self->{current_token} = {type => COMMENT_TOKEN, data => ''}; |
$self->{current_token} = {type => COMMENT_TOKEN, data => '', |
1462 |
|
line => $l, column => $c}; |
1463 |
$self->{state} = COMMENT_START_STATE; |
$self->{state} = COMMENT_START_STATE; |
1464 |
!!!next-input-character; |
!!!next-input-character; |
1465 |
redo A; |
redo A; |
1495 |
!!!cp (129); |
!!!cp (129); |
1496 |
## TODO: What a stupid code this is! |
## TODO: What a stupid code this is! |
1497 |
$self->{state} = DOCTYPE_STATE; |
$self->{state} = DOCTYPE_STATE; |
1498 |
|
$self->{current_token} = {type => DOCTYPE_TOKEN, |
1499 |
|
quirks => 1, |
1500 |
|
line => $l, column => $c}; |
1501 |
!!!next-input-character; |
!!!next-input-character; |
1502 |
redo A; |
redo A; |
1503 |
} else { |
} else { |
1526 |
$self->{next_char} = shift @next_char; |
$self->{next_char} = shift @next_char; |
1527 |
!!!back-next-input-character (@next_char); |
!!!back-next-input-character (@next_char); |
1528 |
$self->{state} = BOGUS_COMMENT_STATE; |
$self->{state} = BOGUS_COMMENT_STATE; |
1529 |
|
$self->{current_token} = {type => COMMENT_TOKEN, data => '', |
1530 |
|
line => $l, column => $c}; |
1531 |
redo A; |
redo A; |
1532 |
|
|
1533 |
## ISSUE: typos in spec: chacacters, is is a parse error |
## ISSUE: typos in spec: chacacters, is is a parse error |
1706 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1707 |
!!!next-input-character; |
!!!next-input-character; |
1708 |
|
|
1709 |
!!!emit ({type => DOCTYPE_TOKEN, quirks => 1}); |
!!!emit ($self->{current_token}); # DOCTYPE (quirks) |
1710 |
|
|
1711 |
redo A; |
redo A; |
1712 |
} elsif ($self->{next_char} == -1) { |
} elsif ($self->{next_char} == -1) { |
1715 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
1716 |
## reconsume |
## reconsume |
1717 |
|
|
1718 |
!!!emit ({type => DOCTYPE_TOKEN, quirks => 1}); |
!!!emit ($self->{current_token}); # DOCTYPE (quirks) |
1719 |
|
|
1720 |
redo A; |
redo A; |
1721 |
} else { |
} else { |
1722 |
!!!cp (160); |
!!!cp (160); |
1723 |
$self->{current_token} |
$self->{current_token}->{name} = chr $self->{next_char}; |
1724 |
= {type => DOCTYPE_TOKEN, |
delete $self->{current_token}->{quirks}; |
|
name => chr ($self->{next_char}), |
|
|
#quirks => 0, |
|
|
}; |
|
1725 |
## ISSUE: "Set the token's name name to the" in the spec |
## ISSUE: "Set the token's name name to the" in the spec |
1726 |
$self->{state} = DOCTYPE_NAME_STATE; |
$self->{state} = DOCTYPE_NAME_STATE; |
1727 |
!!!next-input-character; |
!!!next-input-character; |
2248 |
sub _tokenize_attempt_to_consume_an_entity ($$$) { |
sub _tokenize_attempt_to_consume_an_entity ($$$) { |
2249 |
my ($self, $in_attr, $additional) = @_; |
my ($self, $in_attr, $additional) = @_; |
2250 |
|
|
2251 |
|
my ($l, $c) = ($self->{line_prev}, $self->{column_prev}); |
2252 |
|
|
2253 |
if ({ |
if ({ |
2254 |
0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF, |
0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF, |
2255 |
0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR |
0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR |
2290 |
redo X; |
redo X; |
2291 |
} elsif (not defined $code) { # no hexadecimal digit |
} elsif (not defined $code) { # no hexadecimal digit |
2292 |
!!!cp (1005); |
!!!cp (1005); |
2293 |
!!!parse-error (type => 'bare hcro'); |
!!!parse-error (type => 'bare hcro', line => $l, column => $c); |
2294 |
!!!back-next-input-character ($x_char, $self->{next_char}); |
!!!back-next-input-character ($x_char, $self->{next_char}); |
2295 |
$self->{next_char} = 0x0023; # # |
$self->{next_char} = 0x0023; # # |
2296 |
return undef; |
return undef; |
2299 |
!!!next-input-character; |
!!!next-input-character; |
2300 |
} else { |
} else { |
2301 |
!!!cp (1007); |
!!!cp (1007); |
2302 |
!!!parse-error (type => 'no refc'); |
!!!parse-error (type => 'no refc', line => $l, column => $c); |
2303 |
} |
} |
2304 |
|
|
2305 |
if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) { |
if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) { |
2306 |
!!!cp (1008); |
!!!cp (1008); |
2307 |
!!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code); |
!!!parse-error (type => (sprintf 'invalid character reference:U+%04X', $code), line => $l, column => $c); |
2308 |
$code = 0xFFFD; |
$code = 0xFFFD; |
2309 |
} elsif ($code > 0x10FFFF) { |
} elsif ($code > 0x10FFFF) { |
2310 |
!!!cp (1009); |
!!!cp (1009); |
2311 |
!!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code); |
!!!parse-error (type => (sprintf 'invalid character reference:U-%08X', $code), line => $l, column => $c); |
2312 |
$code = 0xFFFD; |
$code = 0xFFFD; |
2313 |
} elsif ($code == 0x000D) { |
} elsif ($code == 0x000D) { |
2314 |
!!!cp (1010); |
!!!cp (1010); |
2315 |
!!!parse-error (type => 'CR character reference'); |
!!!parse-error (type => 'CR character reference', line => $l, column => $c); |
2316 |
$code = 0x000A; |
$code = 0x000A; |
2317 |
} elsif (0x80 <= $code and $code <= 0x9F) { |
} elsif (0x80 <= $code and $code <= 0x9F) { |
2318 |
!!!cp (1011); |
!!!cp (1011); |
2319 |
!!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code); |
!!!parse-error (type => (sprintf 'C1 character reference:U+%04X', $code), line => $l, column => $c); |
2320 |
$code = $c1_entity_char->{$code}; |
$code = $c1_entity_char->{$code}; |
2321 |
} |
} |
2322 |
|
|
2323 |
return {type => CHARACTER_TOKEN, data => chr $code, |
return {type => CHARACTER_TOKEN, data => chr $code, |
2324 |
has_reference => 1}; |
has_reference => 1, line => $l, column => $c}; |
2325 |
} # X |
} # X |
2326 |
} elsif (0x0030 <= $self->{next_char} and |
} elsif (0x0030 <= $self->{next_char} and |
2327 |
$self->{next_char} <= 0x0039) { # 0..9 |
$self->{next_char} <= 0x0039) { # 0..9 |
2342 |
!!!next-input-character; |
!!!next-input-character; |
2343 |
} else { |
} else { |
2344 |
!!!cp (1014); |
!!!cp (1014); |
2345 |
!!!parse-error (type => 'no refc'); |
!!!parse-error (type => 'no refc', line => $l, column => $c); |
2346 |
} |
} |
2347 |
|
|
2348 |
if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) { |
if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) { |
2349 |
!!!cp (1015); |
!!!cp (1015); |
2350 |
!!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code); |
!!!parse-error (type => (sprintf 'invalid character reference:U+%04X', $code), line => $l, column => $c); |
2351 |
$code = 0xFFFD; |
$code = 0xFFFD; |
2352 |
} elsif ($code > 0x10FFFF) { |
} elsif ($code > 0x10FFFF) { |
2353 |
!!!cp (1016); |
!!!cp (1016); |
2354 |
!!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code); |
!!!parse-error (type => (sprintf 'invalid character reference:U-%08X', $code), line => $l, column => $c); |
2355 |
$code = 0xFFFD; |
$code = 0xFFFD; |
2356 |
} elsif ($code == 0x000D) { |
} elsif ($code == 0x000D) { |
2357 |
!!!cp (1017); |
!!!cp (1017); |
2358 |
!!!parse-error (type => 'CR character reference'); |
!!!parse-error (type => 'CR character reference', line => $l, column => $c); |
2359 |
$code = 0x000A; |
$code = 0x000A; |
2360 |
} elsif (0x80 <= $code and $code <= 0x9F) { |
} elsif (0x80 <= $code and $code <= 0x9F) { |
2361 |
!!!cp (1018); |
!!!cp (1018); |
2362 |
!!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code); |
!!!parse-error (type => (sprintf 'C1 character reference:U+%04X', $code), line => $l, column => $c); |
2363 |
$code = $c1_entity_char->{$code}; |
$code = $c1_entity_char->{$code}; |
2364 |
} |
} |
2365 |
|
|
2366 |
return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1}; |
return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1, |
2367 |
|
line => $l, column => $c}; |
2368 |
} else { |
} else { |
2369 |
!!!cp (1019); |
!!!cp (1019); |
2370 |
!!!parse-error (type => 'bare nero'); |
!!!parse-error (type => 'bare nero', line => $l, column => $c); |
2371 |
!!!back-next-input-character ($self->{next_char}); |
!!!back-next-input-character ($self->{next_char}); |
2372 |
$self->{next_char} = 0x0023; # # |
$self->{next_char} = 0x0023; # # |
2373 |
return undef; |
return undef; |
2417 |
|
|
2418 |
if ($match > 0) { |
if ($match > 0) { |
2419 |
!!!cp (1023); |
!!!cp (1023); |
2420 |
return {type => CHARACTER_TOKEN, data => $value, has_reference => 1}; |
return {type => CHARACTER_TOKEN, data => $value, has_reference => 1, |
2421 |
|
line => $l, column => $c}; |
2422 |
} elsif ($match < 0) { |
} elsif ($match < 0) { |
2423 |
!!!parse-error (type => 'no refc'); |
!!!parse-error (type => 'no refc', line => $l, column => $c); |
2424 |
if ($in_attr and $match < -1) { |
if ($in_attr and $match < -1) { |
2425 |
!!!cp (1024); |
!!!cp (1024); |
2426 |
return {type => CHARACTER_TOKEN, data => '&'.$entity_name}; |
return {type => CHARACTER_TOKEN, data => '&'.$entity_name, |
2427 |
|
line => $l, column => $c}; |
2428 |
} else { |
} else { |
2429 |
!!!cp (1025); |
!!!cp (1025); |
2430 |
return {type => CHARACTER_TOKEN, data => $value, has_reference => 1}; |
return {type => CHARACTER_TOKEN, data => $value, has_reference => 1, |
2431 |
|
line => $l, column => $c}; |
2432 |
} |
} |
2433 |
} else { |
} else { |
2434 |
!!!cp (1026); |
!!!cp (1026); |
2435 |
!!!parse-error (type => 'bare ero'); |
!!!parse-error (type => 'bare ero', line => $l, column => $c); |
2436 |
## NOTE: "No characters are consumed" in the spec. |
## NOTE: "No characters are consumed" in the spec. |
2437 |
return {type => CHARACTER_TOKEN, data => '&'.$value}; |
return {type => CHARACTER_TOKEN, data => '&'.$value, |
2438 |
|
line => $l, column => $c}; |
2439 |
} |
} |
2440 |
} else { |
} else { |
2441 |
!!!cp (1027); |
!!!cp (1027); |
2442 |
## no characters are consumed |
## no characters are consumed |
2443 |
!!!parse-error (type => 'bare ero'); |
!!!parse-error (type => 'bare ero', line => $l, column => $c); |
2444 |
return undef; |
return undef; |
2445 |
} |
} |
2446 |
} # _tokenize_attempt_to_consume_an_entity |
} # _tokenize_attempt_to_consume_an_entity |