| 8 |
## doc.write (''); |
## doc.write (''); |
| 9 |
## alert (doc.compatMode); |
## alert (doc.compatMode); |
| 10 |
|
|
|
## TODO: Control characters and noncharacters are not allowed (HTML5 revision 1263) |
|
| 11 |
## TODO: 1252 parse error (revision 1264) |
## TODO: 1252 parse error (revision 1264) |
| 12 |
## TODO: 8859-11 = 874 (revision 1271) |
## TODO: 8859-11 = 874 (revision 1271) |
| 13 |
|
|
| 14 |
|
require IO::Handle; |
| 15 |
|
|
| 16 |
my $HTML_NS = q<http://www.w3.org/1999/xhtml>; |
my $HTML_NS = q<http://www.w3.org/1999/xhtml>; |
| 17 |
my $MML_NS = q<http://www.w3.org/1998/Math/MathML>; |
my $MML_NS = q<http://www.w3.org/1998/Math/MathML>; |
| 18 |
my $SVG_NS = q<http://www.w3.org/2000/svg>; |
my $SVG_NS = q<http://www.w3.org/2000/svg>; |
| 206 |
mtext => FOREIGN_FLOW_CONTENT_EL, |
mtext => FOREIGN_FLOW_CONTENT_EL, |
| 207 |
}, |
}, |
| 208 |
$SVG_NS => { |
$SVG_NS => { |
| 209 |
foreignobject => FOREIGN_FLOW_CONTENT_EL, ## TODO: case |
foreignObject => FOREIGN_FLOW_CONTENT_EL, |
| 210 |
desc => FOREIGN_FLOW_CONTENT_EL, |
desc => FOREIGN_FLOW_CONTENT_EL, |
| 211 |
title => FOREIGN_FLOW_CONTENT_EL, |
title => FOREIGN_FLOW_CONTENT_EL, |
| 212 |
}, |
}, |
| 213 |
## NOTE: In addition, FOREIGN_EL is set to non-HTML elements. |
## NOTE: In addition, FOREIGN_EL is set to non-HTML elements. |
| 214 |
}; |
}; |
| 215 |
|
|
| 216 |
|
my $svg_attr_name = { |
| 217 |
|
attributetype => 'attributeType', |
| 218 |
|
basefrequency => 'baseFrequency', |
| 219 |
|
baseprofile => 'baseProfile', |
| 220 |
|
calcmode => 'calcMode', |
| 221 |
|
clippathunits => 'clipPathUnits', |
| 222 |
|
contentscripttype => 'contentScriptType', |
| 223 |
|
contentstyletype => 'contentStyleType', |
| 224 |
|
diffuseconstant => 'diffuseConstant', |
| 225 |
|
edgemode => 'edgeMode', |
| 226 |
|
externalresourcesrequired => 'externalResourcesRequired', |
| 227 |
|
fecolormatrix => 'feColorMatrix', |
| 228 |
|
fecomposite => 'feComposite', |
| 229 |
|
fegaussianblur => 'feGaussianBlur', |
| 230 |
|
femorphology => 'feMorphology', |
| 231 |
|
fetile => 'feTile', |
| 232 |
|
filterres => 'filterRes', |
| 233 |
|
filterunits => 'filterUnits', |
| 234 |
|
glyphref => 'glyphRef', |
| 235 |
|
gradienttransform => 'gradientTransform', |
| 236 |
|
gradientunits => 'gradientUnits', |
| 237 |
|
kernelmatrix => 'kernelMatrix', |
| 238 |
|
kernelunitlength => 'kernelUnitLength', |
| 239 |
|
keypoints => 'keyPoints', |
| 240 |
|
keysplines => 'keySplines', |
| 241 |
|
keytimes => 'keyTimes', |
| 242 |
|
lengthadjust => 'lengthAdjust', |
| 243 |
|
limitingconeangle => 'limitingConeAngle', |
| 244 |
|
markerheight => 'markerHeight', |
| 245 |
|
markerunits => 'markerUnits', |
| 246 |
|
markerwidth => 'markerWidth', |
| 247 |
|
maskcontentunits => 'maskContentUnits', |
| 248 |
|
maskunits => 'maskUnits', |
| 249 |
|
numoctaves => 'numOctaves', |
| 250 |
|
pathlength => 'pathLength', |
| 251 |
|
patterncontentunits => 'patternContentUnits', |
| 252 |
|
patterntransform => 'patternTransform', |
| 253 |
|
patternunits => 'patternUnits', |
| 254 |
|
pointsatx => 'pointsAtX', |
| 255 |
|
pointsaty => 'pointsAtY', |
| 256 |
|
pointsatz => 'pointsAtZ', |
| 257 |
|
preservealpha => 'preserveAlpha', |
| 258 |
|
preserveaspectratio => 'preserveAspectRatio', |
| 259 |
|
primitiveunits => 'primitiveUnits', |
| 260 |
|
refx => 'refX', |
| 261 |
|
refy => 'refY', |
| 262 |
|
repeatcount => 'repeatCount', |
| 263 |
|
repeatdur => 'repeatDur', |
| 264 |
|
requiredextensions => 'requiredExtensions', |
| 265 |
|
specularconstant => 'specularConstant', |
| 266 |
|
specularexponent => 'specularExponent', |
| 267 |
|
spreadmethod => 'spreadMethod', |
| 268 |
|
startoffset => 'startOffset', |
| 269 |
|
stddeviation => 'stdDeviation', |
| 270 |
|
stitchtiles => 'stitchTiles', |
| 271 |
|
surfacescale => 'surfaceScale', |
| 272 |
|
systemlanguage => 'systemLanguage', |
| 273 |
|
tablevalues => 'tableValues', |
| 274 |
|
targetx => 'targetX', |
| 275 |
|
targety => 'targetY', |
| 276 |
|
textlength => 'textLength', |
| 277 |
|
viewbox => 'viewBox', |
| 278 |
|
viewtarget => 'viewTarget', |
| 279 |
|
xchannelselector => 'xChannelSelector', |
| 280 |
|
ychannelselector => 'yChannelSelector', |
| 281 |
|
zoomandpan => 'zoomAndPan', |
| 282 |
|
}; |
| 283 |
|
|
| 284 |
|
my $foreign_attr_xname = { |
| 285 |
|
'xlink:actuate' => [$XLINK_NS, ['xlink', 'actuate']], |
| 286 |
|
'xlink:arcrole' => [$XLINK_NS, ['xlink', 'arcrole']], |
| 287 |
|
'xlink:href' => [$XLINK_NS, ['xlink', 'href']], |
| 288 |
|
'xlink:role' => [$XLINK_NS, ['xlink', 'role']], |
| 289 |
|
'xlink:show' => [$XLINK_NS, ['xlink', 'show']], |
| 290 |
|
'xlink:title' => [$XLINK_NS, ['xlink', 'title']], |
| 291 |
|
'xlink:type' => [$XLINK_NS, ['xlink', 'type']], |
| 292 |
|
'xml:base' => [$XML_NS, ['xml', 'base']], |
| 293 |
|
'xml:lang' => [$XML_NS, ['xml', 'lang']], |
| 294 |
|
'xml:space' => [$XML_NS, ['xml', 'space']], |
| 295 |
|
'xmlns' => [$XMLNS_NS, [undef, 'xmlns']], |
| 296 |
|
'xmlns:xlink' => [$XMLNS_NS, ['xmlns', 'xlink']], |
| 297 |
|
}; |
| 298 |
|
|
| 299 |
|
## ISSUE: xmlns:xlink="non-xlink-ns" is not an error. |
| 300 |
|
|
| 301 |
my $c1_entity_char = { |
my $c1_entity_char = { |
| 302 |
0x80 => 0x20AC, |
0x80 => 0x20AC, |
| 303 |
0x81 => 0xFFFD, |
0x81 => 0xFFFD, |
| 334 |
}; # $c1_entity_char |
}; # $c1_entity_char |
| 335 |
|
|
| 336 |
sub parse_byte_string ($$$$;$) { |
sub parse_byte_string ($$$$;$) { |
| 337 |
|
my $self = shift; |
| 338 |
|
my $charset_name = shift; |
| 339 |
|
open my $input, '<', ref $_[0] ? $_[0] : \($_[0]); |
| 340 |
|
return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]); |
| 341 |
|
} # parse_byte_string |
| 342 |
|
|
| 343 |
|
sub parse_byte_stream ($$$$;$) { |
| 344 |
my $self = ref $_[0] ? shift : shift->new; |
my $self = ref $_[0] ? shift : shift->new; |
| 345 |
my $charset = shift; |
my $charset_name = shift; |
| 346 |
my $bytes_s = ref $_[0] ? $_[0] : \($_[0]); |
my $byte_stream = $_[0]; |
|
my $s; |
|
|
|
|
|
if (defined $charset) { |
|
|
require Encode; ## TODO: decode(utf8) don't delete BOM |
|
|
$s = \ (Encode::decode ($charset, $$bytes_s)); |
|
|
$self->{input_encoding} = lc $charset; ## TODO: normalize name |
|
|
$self->{confident} = 1; |
|
|
} else { |
|
|
## TODO: Implement HTML5 detection algorithm |
|
|
require Whatpm::Charset::UniversalCharDet; |
|
|
$charset = Whatpm::Charset::UniversalCharDet->detect_byte_string |
|
|
(substr ($$bytes_s, 0, 1024)); |
|
|
$charset ||= 'windows-1252'; |
|
|
$s = \ (Encode::decode ($charset, $$bytes_s)); |
|
|
$self->{input_encoding} = $charset; |
|
|
$self->{confident} = 0; |
|
|
} |
|
| 347 |
|
|
| 348 |
$self->{change_encoding} = sub { |
my $onerror = $_[2] || sub { |
| 349 |
my $self = shift; |
my (%opt) = @_; |
| 350 |
my $charset = lc shift; |
warn "Parse error ($opt{type})\n"; |
| 351 |
my $token = shift; |
}; |
| 352 |
## TODO: if $charset is supported |
$self->{parse_error} = $onerror; # updated later by parse_char_string |
| 353 |
## TODO: normalize charset name |
|
| 354 |
|
## HTML5 encoding sniffing algorithm |
| 355 |
|
require Message::Charset::Info; |
| 356 |
|
my $charset; |
| 357 |
|
my $buffer; |
| 358 |
|
my ($char_stream, $e_status); |
| 359 |
|
|
| 360 |
|
SNIFFING: { |
| 361 |
|
|
| 362 |
## "Change the encoding" algorithm: |
## Step 1 |
| 363 |
|
if (defined $charset_name) { |
| 364 |
|
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
| 365 |
|
|
| 366 |
## Step 1 |
## ISSUE: Unsupported encoding is not ignored according to the spec. |
| 367 |
if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8? |
($char_stream, $e_status) = $charset->get_decode_handle |
| 368 |
$charset = 'utf-8'; |
($byte_stream, allow_error_reporting => 1, |
| 369 |
|
allow_fallback => 1); |
| 370 |
|
if ($char_stream) { |
| 371 |
|
$self->{confident} = 1; |
| 372 |
|
last SNIFFING; |
| 373 |
|
} else { |
| 374 |
|
## TODO: unsupported error |
| 375 |
|
} |
| 376 |
} |
} |
| 377 |
|
|
| 378 |
## Step 2 |
## Step 2 |
| 379 |
if (defined $self->{input_encoding} and |
my $byte_buffer = ''; |
| 380 |
$self->{input_encoding} eq $charset) { |
for (1..1024) { |
| 381 |
|
my $char = $byte_stream->getc; |
| 382 |
|
last unless defined $char; |
| 383 |
|
$byte_buffer .= $char; |
| 384 |
|
} ## TODO: timeout |
| 385 |
|
|
| 386 |
|
## Step 3 |
| 387 |
|
if ($byte_buffer =~ /^\xFE\xFF/) { |
| 388 |
|
$charset = Message::Charset::Info->get_by_iana_name ('utf-16be'); |
| 389 |
|
($char_stream, $e_status) = $charset->get_decode_handle |
| 390 |
|
($byte_stream, allow_error_reporting => 1, |
| 391 |
|
allow_fallback => 1, byte_buffer => \$byte_buffer); |
| 392 |
$self->{confident} = 1; |
$self->{confident} = 1; |
| 393 |
return; |
last SNIFFING; |
| 394 |
|
} elsif ($byte_buffer =~ /^\xFF\xFE/) { |
| 395 |
|
$charset = Message::Charset::Info->get_by_iana_name ('utf-16le'); |
| 396 |
|
($char_stream, $e_status) = $charset->get_decode_handle |
| 397 |
|
($byte_stream, allow_error_reporting => 1, |
| 398 |
|
allow_fallback => 1, byte_buffer => \$byte_buffer); |
| 399 |
|
$self->{confident} = 1; |
| 400 |
|
last SNIFFING; |
| 401 |
|
} elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) { |
| 402 |
|
$charset = Message::Charset::Info->get_by_iana_name ('utf-8'); |
| 403 |
|
($char_stream, $e_status) = $charset->get_decode_handle |
| 404 |
|
($byte_stream, allow_error_reporting => 1, |
| 405 |
|
allow_fallback => 1, byte_buffer => \$byte_buffer); |
| 406 |
|
$self->{confident} = 1; |
| 407 |
|
last SNIFFING; |
| 408 |
} |
} |
| 409 |
|
|
| 410 |
!!!parse-error (type => 'charset label detected:'.$self->{input_encoding}. |
## Step 4 |
| 411 |
':'.$charset, level => 'w', token => $token); |
## TODO: <meta charset> |
| 412 |
|
|
| 413 |
## Step 3 |
## Step 5 |
| 414 |
# if (can) { |
## TODO: from history |
|
## change the encoding on the fly. |
|
|
#$self->{confident} = 1; |
|
|
#return; |
|
|
# } |
|
| 415 |
|
|
| 416 |
## Step 4 |
## Step 6 |
| 417 |
throw Whatpm::HTML::RestartParser (charset => $charset); |
require Whatpm::Charset::UniversalCharDet; |
| 418 |
|
$charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string |
| 419 |
|
($byte_buffer); |
| 420 |
|
if (defined $charset_name) { |
| 421 |
|
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
| 422 |
|
|
| 423 |
|
## ISSUE: Unsupported encoding is not ignored according to the spec. |
| 424 |
|
require Whatpm::Charset::DecodeHandle; |
| 425 |
|
$buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new |
| 426 |
|
($byte_stream); |
| 427 |
|
($char_stream, $e_status) = $charset->get_decode_handle |
| 428 |
|
($buffer, allow_error_reporting => 1, |
| 429 |
|
allow_fallback => 1, byte_buffer => \$byte_buffer); |
| 430 |
|
if ($char_stream) { |
| 431 |
|
$buffer->{buffer} = $byte_buffer; |
| 432 |
|
!!!parse-error (type => 'sniffing:chardet', ## TODO: type name |
| 433 |
|
value => $charset_name, |
| 434 |
|
level => $self->{info_level}, |
| 435 |
|
line => 1, column => 1); |
| 436 |
|
$self->{confident} = 0; |
| 437 |
|
last SNIFFING; |
| 438 |
|
} |
| 439 |
|
} |
| 440 |
|
|
| 441 |
|
## Step 7: default |
| 442 |
|
## TODO: Make this configurable. |
| 443 |
|
$charset = Message::Charset::Info->get_by_iana_name ('windows-1252'); |
| 444 |
|
## NOTE: We choose |windows-1252| here, since |utf-8| should be |
| 445 |
|
## detectable in the step 6. |
| 446 |
|
require Whatpm::Charset::DecodeHandle; |
| 447 |
|
$buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new |
| 448 |
|
($byte_stream); |
| 449 |
|
($char_stream, $e_status) |
| 450 |
|
= $charset->get_decode_handle ($buffer, |
| 451 |
|
allow_error_reporting => 1, |
| 452 |
|
allow_fallback => 1, |
| 453 |
|
byte_buffer => \$byte_buffer); |
| 454 |
|
$buffer->{buffer} = $byte_buffer; |
| 455 |
|
!!!parse-error (type => 'sniffing:default', ## TODO: type name |
| 456 |
|
value => 'windows-1252', |
| 457 |
|
level => $self->{info_level}, |
| 458 |
|
line => 1, column => 1); |
| 459 |
|
$self->{confident} = 0; |
| 460 |
|
} # SNIFFING |
| 461 |
|
|
| 462 |
|
$self->{input_encoding} = $charset->get_iana_name; |
| 463 |
|
if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { |
| 464 |
|
!!!parse-error (type => 'chardecode:fallback', ## TODO: type name |
| 465 |
|
value => $self->{input_encoding}, |
| 466 |
|
level => $self->{unsupported_level}, |
| 467 |
|
line => 1, column => 1); |
| 468 |
|
} elsif (not ($e_status & |
| 469 |
|
Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) { |
| 470 |
|
!!!parse-error (type => 'chardecode:no error', ## TODO: type name |
| 471 |
|
value => $self->{input_encoding}, |
| 472 |
|
level => $self->{unsupported_level}, |
| 473 |
|
line => 1, column => 1); |
| 474 |
|
} |
| 475 |
|
|
| 476 |
|
$self->{change_encoding} = sub { |
| 477 |
|
my $self = shift; |
| 478 |
|
$charset_name = shift; |
| 479 |
|
my $token = shift; |
| 480 |
|
|
| 481 |
|
$charset = Message::Charset::Info->get_by_iana_name ($charset_name); |
| 482 |
|
($char_stream, $e_status) = $charset->get_decode_handle |
| 483 |
|
($byte_stream, allow_error_reporting => 1, allow_fallback => 1, |
| 484 |
|
byte_buffer => \ $buffer->{buffer}); |
| 485 |
|
|
| 486 |
|
if ($char_stream) { # if supported |
| 487 |
|
## "Change the encoding" algorithm: |
| 488 |
|
|
| 489 |
|
## Step 1 |
| 490 |
|
if ($charset->{iana_names}->{'utf-16'}) { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8? |
| 491 |
|
$charset = Message::Charset::Info->get_by_iana_name ('utf-8'); |
| 492 |
|
($char_stream, $e_status) = $charset->get_decode_handle |
| 493 |
|
($byte_stream, |
| 494 |
|
byte_buffer => \ $buffer->{buffer}); |
| 495 |
|
} |
| 496 |
|
$charset_name = $charset->get_iana_name; |
| 497 |
|
|
| 498 |
|
## Step 2 |
| 499 |
|
if (defined $self->{input_encoding} and |
| 500 |
|
$self->{input_encoding} eq $charset_name) { |
| 501 |
|
!!!parse-error (type => 'charset label:matching', ## TODO: type |
| 502 |
|
value => $charset_name, |
| 503 |
|
level => $self->{info_level}); |
| 504 |
|
$self->{confident} = 1; |
| 505 |
|
return; |
| 506 |
|
} |
| 507 |
|
|
| 508 |
|
!!!parse-error (type => 'charset label detected:'.$self->{input_encoding}. |
| 509 |
|
':'.$charset_name, level => 'w', token => $token); |
| 510 |
|
|
| 511 |
|
## Step 3 |
| 512 |
|
# if (can) { |
| 513 |
|
## change the encoding on the fly. |
| 514 |
|
#$self->{confident} = 1; |
| 515 |
|
#return; |
| 516 |
|
# } |
| 517 |
|
|
| 518 |
|
## Step 4 |
| 519 |
|
throw Whatpm::HTML::RestartParser (); |
| 520 |
|
} |
| 521 |
}; # $self->{change_encoding} |
}; # $self->{change_encoding} |
| 522 |
|
|
| 523 |
|
my $char_onerror = sub { |
| 524 |
|
my (undef, $type, %opt) = @_; |
| 525 |
|
!!!parse-error (%opt, type => $type, |
| 526 |
|
line => $self->{line}, column => $self->{column} + 1); |
| 527 |
|
if ($opt{octets}) { |
| 528 |
|
${$opt{octets}} = "\x{FFFD}"; # relacement character |
| 529 |
|
} |
| 530 |
|
}; |
| 531 |
|
$char_stream->onerror ($char_onerror); |
| 532 |
|
|
| 533 |
my @args = @_; shift @args; # $s |
my @args = @_; shift @args; # $s |
| 534 |
my $return; |
my $return; |
| 535 |
try { |
try { |
| 536 |
$return = $self->parse_char_string ($s, @args); |
$return = $self->parse_char_stream ($char_stream, @args); |
| 537 |
} catch Whatpm::HTML::RestartParser with { |
} catch Whatpm::HTML::RestartParser with { |
| 538 |
my $charset = shift->{charset}; |
## NOTE: Invoked after {change_encoding}. |
| 539 |
$s = \ (Encode::decode ($charset, $$bytes_s)); |
|
| 540 |
$self->{input_encoding} = $charset; ## TODO: normalize |
$self->{input_encoding} = $charset->get_iana_name; |
| 541 |
|
if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { |
| 542 |
|
!!!parse-error (type => 'chardecode:fallback', ## TODO: type name |
| 543 |
|
value => $self->{input_encoding}, |
| 544 |
|
level => $self->{unsupported_level}, |
| 545 |
|
line => 1, column => 1); |
| 546 |
|
} elsif (not ($e_status & |
| 547 |
|
Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) { |
| 548 |
|
!!!parse-error (type => 'chardecode:no error', ## TODO: type name |
| 549 |
|
value => $self->{input_encoding}, |
| 550 |
|
level => $self->{unsupported_level}, |
| 551 |
|
line => 1, column => 1); |
| 552 |
|
} |
| 553 |
$self->{confident} = 1; |
$self->{confident} = 1; |
| 554 |
$return = $self->parse_char_string ($s, @args); |
$char_stream->onerror ($char_onerror); |
| 555 |
|
$return = $self->parse_char_stream ($char_stream, @args); |
| 556 |
}; |
}; |
| 557 |
return $return; |
return $return; |
| 558 |
} # parse_byte_string |
} # parse_byte_stream |
| 559 |
|
|
| 560 |
## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM |
## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM |
| 561 |
## and the HTML layer MUST ignore it. However, we do strip BOM in |
## and the HTML layer MUST ignore it. However, we do strip BOM in |
| 566 |
## such as |parse_byte_string| in this module, must ensure that it does |
## such as |parse_byte_string| in this module, must ensure that it does |
| 567 |
## strip the BOM and never strip any ZWNBSP. |
## strip the BOM and never strip any ZWNBSP. |
| 568 |
|
|
| 569 |
*parse_char_string = \&parse_string; |
sub parse_char_string ($$$;$) { |
| 570 |
|
my $self = shift; |
| 571 |
|
require utf8; |
| 572 |
|
my $s = ref $_[0] ? $_[0] : \($_[0]); |
| 573 |
|
open my $input, '<' . (utf8::is_utf8 ($$s) ? ':utf8' : ''), $s; |
| 574 |
|
return $self->parse_char_stream ($input, @_[1..$#_]); |
| 575 |
|
} # parse_char_string |
| 576 |
|
*parse_string = \&parse_char_string; |
| 577 |
|
|
| 578 |
sub parse_string ($$$;$) { |
sub parse_char_stream ($$$;$) { |
| 579 |
my $self = ref $_[0] ? shift : shift->new; |
my $self = ref $_[0] ? shift : shift->new; |
| 580 |
my $s = ref $_[0] ? $_[0] : \($_[0]); |
my $input = $_[0]; |
| 581 |
$self->{document} = $_[1]; |
$self->{document} = $_[1]; |
| 582 |
@{$self->{document}->child_nodes} = (); |
@{$self->{document}->child_nodes} = (); |
| 583 |
|
|
| 596 |
pop @{$self->{prev_char}}; |
pop @{$self->{prev_char}}; |
| 597 |
unshift @{$self->{prev_char}}, $self->{next_char}; |
unshift @{$self->{prev_char}}, $self->{next_char}; |
| 598 |
|
|
| 599 |
$self->{next_char} = -1 and return if $i >= length $$s; |
my $char; |
| 600 |
$self->{next_char} = ord substr $$s, $i++, 1; |
if (defined $self->{next_next_char}) { |
| 601 |
|
$char = $self->{next_next_char}; |
| 602 |
|
delete $self->{next_next_char}; |
| 603 |
|
} else { |
| 604 |
|
$char = $input->getc; |
| 605 |
|
} |
| 606 |
|
$self->{next_char} = -1 and return unless defined $char; |
| 607 |
|
$self->{next_char} = ord $char; |
| 608 |
|
|
| 609 |
($self->{line_prev}, $self->{column_prev}) |
($self->{line_prev}, $self->{column_prev}) |
| 610 |
= ($self->{line}, $self->{column}); |
= ($self->{line}, $self->{column}); |
| 611 |
$self->{column}++; |
$self->{column}++; |
| 612 |
|
|
| 613 |
if ($self->{next_char} == 0x000A) { # LF |
if ($self->{next_char} == 0x000A) { # LF |
| 614 |
|
!!!cp ('j1'); |
| 615 |
$self->{line}++; |
$self->{line}++; |
| 616 |
$self->{column} = 0; |
$self->{column} = 0; |
| 617 |
} elsif ($self->{next_char} == 0x000D) { # CR |
} elsif ($self->{next_char} == 0x000D) { # CR |
| 618 |
$i++ if substr ($$s, $i, 1) eq "\x0A"; |
!!!cp ('j2'); |
| 619 |
|
my $next = $input->getc; |
| 620 |
|
if (defined $next and $next ne "\x0A") { |
| 621 |
|
$self->{next_next_char} = $next; |
| 622 |
|
} |
| 623 |
$self->{next_char} = 0x000A; # LF # MUST |
$self->{next_char} = 0x000A; # LF # MUST |
| 624 |
$self->{line}++; |
$self->{line}++; |
| 625 |
$self->{column} = 0; |
$self->{column} = 0; |
| 626 |
} elsif ($self->{next_char} > 0x10FFFF) { |
} elsif ($self->{next_char} > 0x10FFFF) { |
| 627 |
|
!!!cp ('j3'); |
| 628 |
$self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
$self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
| 629 |
} elsif ($self->{next_char} == 0x0000) { # NULL |
} elsif ($self->{next_char} == 0x0000) { # NULL |
| 630 |
|
!!!cp ('j4'); |
| 631 |
!!!parse-error (type => 'NULL'); |
!!!parse-error (type => 'NULL'); |
| 632 |
$self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
$self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
| 633 |
|
} elsif ($self->{next_char} <= 0x0008 or |
| 634 |
|
(0x000E <= $self->{next_char} and $self->{next_char} <= 0x001F) or |
| 635 |
|
(0x007F <= $self->{next_char} and $self->{next_char} <= 0x009F) or |
| 636 |
|
(0xD800 <= $self->{next_char} and $self->{next_char} <= 0xDFFF) or |
| 637 |
|
(0xFDD0 <= $self->{next_char} and $self->{next_char} <= 0xFDDF) or |
| 638 |
|
{ |
| 639 |
|
0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1, |
| 640 |
|
0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1, |
| 641 |
|
0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1, |
| 642 |
|
0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1, |
| 643 |
|
0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1, |
| 644 |
|
0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1, |
| 645 |
|
0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1, |
| 646 |
|
0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1, |
| 647 |
|
0x10FFFE => 1, 0x10FFFF => 1, |
| 648 |
|
}->{$self->{next_char}}) { |
| 649 |
|
!!!cp ('j5'); |
| 650 |
|
!!!parse-error (type => 'control char', level => $self->{must_level}); |
| 651 |
|
## TODO: error type documentation |
| 652 |
} |
} |
| 653 |
}; |
}; |
| 654 |
$self->{prev_char} = [-1, -1, -1]; |
$self->{prev_char} = [-1, -1, -1]; |
| 672 |
delete $self->{parse_error}; # remove loop |
delete $self->{parse_error}; # remove loop |
| 673 |
|
|
| 674 |
return $self->{document}; |
return $self->{document}; |
| 675 |
} # parse_string |
} # parse_char_stream |
| 676 |
|
|
| 677 |
sub new ($) { |
sub new ($) { |
| 678 |
my $class = shift; |
my $class = shift; |
| 679 |
my $self = bless {}, $class; |
my $self = bless { |
| 680 |
|
must_level => 'm', |
| 681 |
|
should_level => 's', |
| 682 |
|
good_level => 'w', |
| 683 |
|
warn_level => 'w', |
| 684 |
|
info_level => 'i', |
| 685 |
|
unsupported_level => 'u', |
| 686 |
|
}, $class; |
| 687 |
$self->{set_next_char} = sub { |
$self->{set_next_char} = sub { |
| 688 |
$self->{next_char} = -1; |
$self->{next_char} = -1; |
| 689 |
}; |
}; |
| 746 |
sub BOGUS_DOCTYPE_STATE () { 32 } |
sub BOGUS_DOCTYPE_STATE () { 32 } |
| 747 |
sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 } |
sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 } |
| 748 |
sub SELF_CLOSING_START_TAG_STATE () { 34 } |
sub SELF_CLOSING_START_TAG_STATE () { 34 } |
| 749 |
|
sub CDATA_BLOCK_STATE () { 35 } |
| 750 |
|
|
| 751 |
sub DOCTYPE_TOKEN () { 1 } |
sub DOCTYPE_TOKEN () { 1 } |
| 752 |
sub COMMENT_TOKEN () { 2 } |
sub COMMENT_TOKEN () { 2 } |
| 1048 |
redo A; |
redo A; |
| 1049 |
} else { |
} else { |
| 1050 |
!!!cp (23); |
!!!cp (23); |
| 1051 |
!!!parse-error (type => 'bare stago'); |
!!!parse-error (type => 'bare stago', |
| 1052 |
|
line => $self->{line_prev}, |
| 1053 |
|
column => $self->{column_prev}); |
| 1054 |
$self->{state} = DATA_STATE; |
$self->{state} = DATA_STATE; |
| 1055 |
## reconsume |
## reconsume |
| 1056 |
|
|
| 1974 |
} else { |
} else { |
| 1975 |
!!!cp (135); |
!!!cp (135); |
| 1976 |
} |
} |
| 1977 |
|
} elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and |
| 1978 |
|
$self->{open_elements}->[-1]->[1] & FOREIGN_EL and |
| 1979 |
|
$self->{next_char} == 0x005B) { # [ |
| 1980 |
|
!!!next-input-character; |
| 1981 |
|
push @next_char, $self->{next_char}; |
| 1982 |
|
if ($self->{next_char} == 0x0043) { # C |
| 1983 |
|
!!!next-input-character; |
| 1984 |
|
push @next_char, $self->{next_char}; |
| 1985 |
|
if ($self->{next_char} == 0x0044) { # D |
| 1986 |
|
!!!next-input-character; |
| 1987 |
|
push @next_char, $self->{next_char}; |
| 1988 |
|
if ($self->{next_char} == 0x0041) { # A |
| 1989 |
|
!!!next-input-character; |
| 1990 |
|
push @next_char, $self->{next_char}; |
| 1991 |
|
if ($self->{next_char} == 0x0054) { # T |
| 1992 |
|
!!!next-input-character; |
| 1993 |
|
push @next_char, $self->{next_char}; |
| 1994 |
|
if ($self->{next_char} == 0x0041) { # A |
| 1995 |
|
!!!next-input-character; |
| 1996 |
|
push @next_char, $self->{next_char}; |
| 1997 |
|
if ($self->{next_char} == 0x005B) { # [ |
| 1998 |
|
!!!cp (135.1); |
| 1999 |
|
$self->{state} = CDATA_BLOCK_STATE; |
| 2000 |
|
!!!next-input-character; |
| 2001 |
|
redo A; |
| 2002 |
|
} else { |
| 2003 |
|
!!!cp (135.2); |
| 2004 |
|
} |
| 2005 |
|
} else { |
| 2006 |
|
!!!cp (135.3); |
| 2007 |
|
} |
| 2008 |
|
} else { |
| 2009 |
|
!!!cp (135.4); |
| 2010 |
|
} |
| 2011 |
|
} else { |
| 2012 |
|
!!!cp (135.5); |
| 2013 |
|
} |
| 2014 |
|
} else { |
| 2015 |
|
!!!cp (135.6); |
| 2016 |
|
} |
| 2017 |
|
} else { |
| 2018 |
|
!!!cp (135.7); |
| 2019 |
|
} |
| 2020 |
} else { |
} else { |
| 2021 |
!!!cp (136); |
!!!cp (136); |
| 2022 |
} |
} |
| 2741 |
!!!next-input-character; |
!!!next-input-character; |
| 2742 |
redo A; |
redo A; |
| 2743 |
} |
} |
| 2744 |
|
} elsif ($self->{state} == CDATA_BLOCK_STATE) { |
| 2745 |
|
my $s = ''; |
| 2746 |
|
|
| 2747 |
|
my ($l, $c) = ($self->{line}, $self->{column}); |
| 2748 |
|
|
| 2749 |
|
CS: while ($self->{next_char} != -1) { |
| 2750 |
|
if ($self->{next_char} == 0x005D) { # ] |
| 2751 |
|
!!!next-input-character; |
| 2752 |
|
if ($self->{next_char} == 0x005D) { # ] |
| 2753 |
|
!!!next-input-character; |
| 2754 |
|
MDC: { |
| 2755 |
|
if ($self->{next_char} == 0x003E) { # > |
| 2756 |
|
!!!cp (221.1); |
| 2757 |
|
!!!next-input-character; |
| 2758 |
|
last CS; |
| 2759 |
|
} elsif ($self->{next_char} == 0x005D) { # ] |
| 2760 |
|
!!!cp (221.2); |
| 2761 |
|
$s .= ']'; |
| 2762 |
|
!!!next-input-character; |
| 2763 |
|
redo MDC; |
| 2764 |
|
} else { |
| 2765 |
|
!!!cp (221.3); |
| 2766 |
|
$s .= ']]'; |
| 2767 |
|
# |
| 2768 |
|
} |
| 2769 |
|
} # MDC |
| 2770 |
|
} else { |
| 2771 |
|
!!!cp (221.4); |
| 2772 |
|
$s .= ']'; |
| 2773 |
|
# |
| 2774 |
|
} |
| 2775 |
|
} else { |
| 2776 |
|
!!!cp (221.5); |
| 2777 |
|
# |
| 2778 |
|
} |
| 2779 |
|
$s .= chr $self->{next_char}; |
| 2780 |
|
!!!next-input-character; |
| 2781 |
|
} # CS |
| 2782 |
|
|
| 2783 |
|
$self->{state} = DATA_STATE; |
| 2784 |
|
## next-input-character done or EOF, which is reconsumed. |
| 2785 |
|
|
| 2786 |
|
if (length $s) { |
| 2787 |
|
!!!cp (221.6); |
| 2788 |
|
!!!emit ({type => CHARACTER_TOKEN, data => $s, |
| 2789 |
|
line => $l, column => $c}); |
| 2790 |
|
} else { |
| 2791 |
|
!!!cp (221.7); |
| 2792 |
|
} |
| 2793 |
|
|
| 2794 |
|
redo A; |
| 2795 |
|
|
| 2796 |
|
## ISSUE: "text tokens" in spec. |
| 2797 |
|
## TODO: Streaming support |
| 2798 |
} else { |
} else { |
| 2799 |
die "$0: $self->{state}: Unknown state"; |
die "$0: $self->{state}: Unknown state"; |
| 2800 |
} |
} |
| 2945 |
require Whatpm::_NamedEntityList; |
require Whatpm::_NamedEntityList; |
| 2946 |
our $EntityChar; |
our $EntityChar; |
| 2947 |
|
|
| 2948 |
while (length $entity_name < 10 and |
while (length $entity_name < 30 and |
| 2949 |
## NOTE: Some number greater than the maximum length of entity name |
## NOTE: Some number greater than the maximum length of entity name |
| 2950 |
((0x0041 <= $self->{next_char} and # a |
((0x0041 <= $self->{next_char} and # a |
| 2951 |
$self->{next_char} <= 0x005A) or # x |
$self->{next_char} <= 0x005A) or # x |
| 3978 |
!!!next-token; |
!!!next-token; |
| 3979 |
next B; |
next B; |
| 3980 |
} elsif ($token->{type} == START_TAG_TOKEN) { |
} elsif ($token->{type} == START_TAG_TOKEN) { |
| 3981 |
if ($self->{open_elements}->[-1]->[1] & FOREIGN_FLOW_CONTENT_EL or |
if ((not {mglyph => 1, malignmark => 1}->{$token->{tag_name}} and |
| 3982 |
|
$self->{open_elements}->[-1]->[1] & FOREIGN_FLOW_CONTENT_EL) or |
| 3983 |
not ($self->{open_elements}->[-1]->[1] & FOREIGN_EL) or |
not ($self->{open_elements}->[-1]->[1] & FOREIGN_EL) or |
| 3984 |
($token->{tag_name} eq 'svg' and |
($token->{tag_name} eq 'svg' and |
| 3985 |
$self->{open_elements}->[-1]->[1] & MML_AXML_EL)) { |
$self->{open_elements}->[-1]->[1] & MML_AXML_EL)) { |
| 3987 |
!!!cp ('t87.2'); |
!!!cp ('t87.2'); |
| 3988 |
# |
# |
| 3989 |
} elsif ({ |
} elsif ({ |
| 3990 |
## TODO: |
b => 1, big => 1, blockquote => 1, body => 1, br => 1, |
| 3991 |
|
center => 1, code => 1, dd => 1, div => 1, dl => 1, em => 1, |
| 3992 |
|
embed => 1, font => 1, h1 => 1, h2 => 1, h3 => 1, ## No h4! |
| 3993 |
|
h5 => 1, h6 => 1, head => 1, hr => 1, i => 1, img => 1, |
| 3994 |
|
li => 1, menu => 1, meta => 1, nobr => 1, p => 1, pre => 1, |
| 3995 |
|
ruby => 1, s => 1, small => 1, span => 1, strong => 1, |
| 3996 |
|
sub => 1, sup => 1, table => 1, tt => 1, u => 1, ul => 1, |
| 3997 |
|
var => 1, |
| 3998 |
}->{$token->{tag_name}}) { |
}->{$token->{tag_name}}) { |
| 3999 |
!!!cp ('t87.2'); |
!!!cp ('t87.2'); |
| 4000 |
!!!parse-error (type => 'not closed', |
!!!parse-error (type => 'not closed', |
| 4005 |
pop @{$self->{open_elements}} |
pop @{$self->{open_elements}} |
| 4006 |
while $self->{open_elements}->[-1]->[1] & FOREIGN_EL; |
while $self->{open_elements}->[-1]->[1] & FOREIGN_EL; |
| 4007 |
|
|
| 4008 |
$self->{insertion_mode} &= ~ $self->{insertion_mode}; |
$self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM; |
| 4009 |
## Reprocess. |
## Reprocess. |
| 4010 |
next B; |
next B; |
| 4011 |
} else { |
} else { |
| 4012 |
## TODO: case fixup |
my $nsuri = $self->{open_elements}->[-1]->[0]->namespace_uri; |
| 4013 |
|
my $tag_name = $token->{tag_name}; |
| 4014 |
|
if ($nsuri eq $SVG_NS) { |
| 4015 |
|
$tag_name = { |
| 4016 |
|
altglyph => 'altGlyph', |
| 4017 |
|
altglyphdef => 'altGlyphDef', |
| 4018 |
|
altglyphitem => 'altGlyphItem', |
| 4019 |
|
animatecolor => 'animateColor', |
| 4020 |
|
animatemotion => 'animateMotion', |
| 4021 |
|
animatetransform => 'animateTransform', |
| 4022 |
|
clippath => 'clipPath', |
| 4023 |
|
feblend => 'feBlend', |
| 4024 |
|
fecolormatrix => 'feColorMatrix', |
| 4025 |
|
fecomponenttransfer => 'feComponentTransfer', |
| 4026 |
|
fecomposite => 'feComposite', |
| 4027 |
|
feconvolvematrix => 'feConvolveMatrix', |
| 4028 |
|
fediffuselighting => 'feDiffuseLighting', |
| 4029 |
|
fedisplacementmap => 'feDisplacementMap', |
| 4030 |
|
fedistantlight => 'feDistantLight', |
| 4031 |
|
feflood => 'feFlood', |
| 4032 |
|
fefunca => 'feFuncA', |
| 4033 |
|
fefuncb => 'feFuncB', |
| 4034 |
|
fefuncg => 'feFuncG', |
| 4035 |
|
fefuncr => 'feFuncR', |
| 4036 |
|
fegaussianblur => 'feGaussianBlur', |
| 4037 |
|
feimage => 'feImage', |
| 4038 |
|
femerge => 'feMerge', |
| 4039 |
|
femergenode => 'feMergeNode', |
| 4040 |
|
femorphology => 'feMorphology', |
| 4041 |
|
feoffset => 'feOffset', |
| 4042 |
|
fepointlight => 'fePointLight', |
| 4043 |
|
fespecularlighting => 'feSpecularLighting', |
| 4044 |
|
fespotlight => 'feSpotLight', |
| 4045 |
|
fetile => 'feTile', |
| 4046 |
|
feturbulence => 'feTurbulence', |
| 4047 |
|
foreignobject => 'foreignObject', |
| 4048 |
|
glyphref => 'glyphRef', |
| 4049 |
|
lineargradient => 'linearGradient', |
| 4050 |
|
radialgradient => 'radialGradient', |
| 4051 |
|
#solidcolor => 'solidColor', ## NOTE: Commented in spec (SVG1.2) |
| 4052 |
|
textpath => 'textPath', |
| 4053 |
|
}->{$tag_name} || $tag_name; |
| 4054 |
|
} |
| 4055 |
|
|
| 4056 |
!!!insert-element-f ($self->{open_elements}->[-1]->[0]->namespace_uri, $token); |
## "adjust SVG attributes" (SVG only) - done in insert-element-f |
| 4057 |
|
|
| 4058 |
|
## "adjust foreign attributes" - done in insert-element-f |
| 4059 |
|
|
| 4060 |
|
!!!insert-element-f ($nsuri, $tag_name, $token->{attributes}, $token); |
| 4061 |
|
|
| 4062 |
if ($self->{self_closing}) { |
if ($self->{self_closing}) { |
| 4063 |
pop @{$self->{open_elements}}; |
pop @{$self->{open_elements}}; |
| 4154 |
!!!next-token; |
!!!next-token; |
| 4155 |
next B; |
next B; |
| 4156 |
} elsif ($self->{insertion_mode} == AFTER_HEAD_IM) { |
} elsif ($self->{insertion_mode} == AFTER_HEAD_IM) { |
| 4157 |
!!!cp ('t94'); |
!!!cp ('t93.2'); |
| 4158 |
# |
!!!parse-error (type => 'after head:head', token => $token); ## TODO: error type |
| 4159 |
|
## Ignore the token |
| 4160 |
|
!!!nack ('t93.3'); |
| 4161 |
|
!!!next-token; |
| 4162 |
|
next B; |
| 4163 |
} else { |
} else { |
| 4164 |
!!!cp ('t95'); |
!!!cp ('t95'); |
| 4165 |
!!!parse-error (type => 'in head:head', token => $token); # or in head noscript |
!!!parse-error (type => 'in head:head', token => $token); # or in head noscript |
| 4242 |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
| 4243 |
|
|
| 4244 |
unless ($self->{confident}) { |
unless ($self->{confident}) { |
| 4245 |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
if ($token->{attributes}->{charset}) { |
| 4246 |
!!!cp ('t106'); |
!!!cp ('t106'); |
| 4247 |
|
## NOTE: Whether the encoding is supported or not is handled |
| 4248 |
|
## in the {change_encoding} callback. |
| 4249 |
$self->{change_encoding} |
$self->{change_encoding} |
| 4250 |
->($self, $token->{attributes}->{charset}->{value}, |
->($self, $token->{attributes}->{charset}->{value}, |
| 4251 |
$token); |
$token); |
| 4255 |
$token->{attributes}->{charset} |
$token->{attributes}->{charset} |
| 4256 |
->{has_reference}); |
->{has_reference}); |
| 4257 |
} elsif ($token->{attributes}->{content}) { |
} elsif ($token->{attributes}->{content}) { |
|
## ISSUE: The algorithm name in the spec was incorrect, so it was not linked to the definition. |
|
| 4258 |
if ($token->{attributes}->{content}->{value} |
if ($token->{attributes}->{content}->{value} |
| 4259 |
=~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt] |
=~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt] |
| 4260 |
[\x09-\x0D\x20]*= |
[\x09-\x0D\x20]*= |
| 4261 |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
| 4262 |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
| 4263 |
!!!cp ('t107'); |
!!!cp ('t107'); |
| 4264 |
|
## NOTE: Whether the encoding is supported or not is handled |
| 4265 |
|
## in the {change_encoding} callback. |
| 4266 |
$self->{change_encoding} |
$self->{change_encoding} |
| 4267 |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3, |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3, |
| 4268 |
$token); |
$token); |
| 4481 |
$self->{insertion_mode} = AFTER_HEAD_IM; |
$self->{insertion_mode} = AFTER_HEAD_IM; |
| 4482 |
!!!next-token; |
!!!next-token; |
| 4483 |
next B; |
next B; |
| 4484 |
|
} elsif ($self->{insertion_mode} == AFTER_HEAD_IM) { |
| 4485 |
|
!!!cp ('t134.1'); |
| 4486 |
|
!!!parse-error (type => 'unmatched end tag:head', token => $token); |
| 4487 |
|
## Ignore the token |
| 4488 |
|
!!!next-token; |
| 4489 |
|
next B; |
| 4490 |
} else { |
} else { |
| 4491 |
!!!cp ('t135'); |
die "$0: $self->{insertion_mode}: Unknown insertion mode"; |
|
# |
|
| 4492 |
} |
} |
| 4493 |
} elsif ($token->{tag_name} eq 'noscript') { |
} elsif ($token->{tag_name} eq 'noscript') { |
| 4494 |
if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { |
if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { |
| 4497 |
$self->{insertion_mode} = IN_HEAD_IM; |
$self->{insertion_mode} = IN_HEAD_IM; |
| 4498 |
!!!next-token; |
!!!next-token; |
| 4499 |
next B; |
next B; |
| 4500 |
} elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) { |
} elsif ($self->{insertion_mode} == BEFORE_HEAD_IM or |
| 4501 |
|
$self->{insertion_mode} == AFTER_HEAD_IM) { |
| 4502 |
!!!cp ('t137'); |
!!!cp ('t137'); |
| 4503 |
!!!parse-error (type => 'unmatched end tag:noscript', token => $token); |
!!!parse-error (type => 'unmatched end tag:noscript', token => $token); |
| 4504 |
## Ignore the token ## ISSUE: An issue in the spec. |
## Ignore the token ## ISSUE: An issue in the spec. |
| 4511 |
} elsif ({ |
} elsif ({ |
| 4512 |
body => 1, html => 1, |
body => 1, html => 1, |
| 4513 |
}->{$token->{tag_name}}) { |
}->{$token->{tag_name}}) { |
| 4514 |
if ($self->{insertion_mode} == BEFORE_HEAD_IM) { |
if ($self->{insertion_mode} == BEFORE_HEAD_IM or |
| 4515 |
!!!cp ('t139'); |
$self->{insertion_mode} == IN_HEAD_IM or |
| 4516 |
## As if <head> |
$self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { |
|
!!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token); |
|
|
$self->{open_elements}->[-1]->[0]->append_child ($self->{head_element}); |
|
|
push @{$self->{open_elements}}, |
|
|
[$self->{head_element}, $el_category->{head}]; |
|
|
|
|
|
$self->{insertion_mode} = IN_HEAD_IM; |
|
|
## Reprocess in the "in head" insertion mode... |
|
|
} elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { |
|
| 4517 |
!!!cp ('t140'); |
!!!cp ('t140'); |
| 4518 |
!!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token); |
!!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token); |
| 4519 |
## Ignore the token |
## Ignore the token |
| 4520 |
!!!next-token; |
!!!next-token; |
| 4521 |
next B; |
next B; |
| 4522 |
|
} elsif ($self->{insertion_mode} == AFTER_HEAD_IM) { |
| 4523 |
|
!!!cp ('t140.1'); |
| 4524 |
|
!!!parse-error (type => 'unmatched end tag:' . $token->{tag_name}, token => $token); |
| 4525 |
|
## Ignore the token |
| 4526 |
|
!!!next-token; |
| 4527 |
|
next B; |
| 4528 |
} else { |
} else { |
| 4529 |
!!!cp ('t141'); |
die "$0: $self->{insertion_mode}: Unknown insertion mode"; |
| 4530 |
} |
} |
| 4531 |
|
} elsif ($token->{tag_name} eq 'p') { |
| 4532 |
# |
!!!cp ('t142'); |
| 4533 |
} elsif ({ |
!!!parse-error (type => 'unmatched end tag:p', token => $token); |
| 4534 |
p => 1, br => 1, |
## Ignore the token |
| 4535 |
}->{$token->{tag_name}}) { |
!!!next-token; |
| 4536 |
|
next B; |
| 4537 |
|
} elsif ($token->{tag_name} eq 'br') { |
| 4538 |
if ($self->{insertion_mode} == BEFORE_HEAD_IM) { |
if ($self->{insertion_mode} == BEFORE_HEAD_IM) { |
| 4539 |
!!!cp ('t142'); |
!!!cp ('t142.2'); |
| 4540 |
## As if <head> |
## (before head) as if <head>, (in head) as if </head> |
| 4541 |
!!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token); |
!!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token); |
| 4542 |
$self->{open_elements}->[-1]->[0]->append_child ($self->{head_element}); |
$self->{open_elements}->[-1]->[0]->append_child ($self->{head_element}); |
| 4543 |
push @{$self->{open_elements}}, |
$self->{insertion_mode} = AFTER_HEAD_IM; |
| 4544 |
[$self->{head_element}, $el_category->{head}]; |
|
| 4545 |
|
## Reprocess in the "after head" insertion mode... |
| 4546 |
|
} elsif ($self->{insertion_mode} == IN_HEAD_IM) { |
| 4547 |
|
!!!cp ('t143.2'); |
| 4548 |
|
## As if </head> |
| 4549 |
|
pop @{$self->{open_elements}}; |
| 4550 |
|
$self->{insertion_mode} = AFTER_HEAD_IM; |
| 4551 |
|
|
| 4552 |
|
## Reprocess in the "after head" insertion mode... |
| 4553 |
|
} elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { |
| 4554 |
|
!!!cp ('t143.3'); |
| 4555 |
|
## ISSUE: Two parse errors for <head><noscript></br> |
| 4556 |
|
!!!parse-error (type => 'unmatched end tag:br', token => $token); |
| 4557 |
|
## As if </noscript> |
| 4558 |
|
pop @{$self->{open_elements}}; |
| 4559 |
$self->{insertion_mode} = IN_HEAD_IM; |
$self->{insertion_mode} = IN_HEAD_IM; |
| 4560 |
|
|
| 4561 |
## Reprocess in the "in head" insertion mode... |
## Reprocess in the "in head" insertion mode... |
| 4562 |
} else { |
## As if </head> |
| 4563 |
!!!cp ('t143'); |
pop @{$self->{open_elements}}; |
| 4564 |
} |
$self->{insertion_mode} = AFTER_HEAD_IM; |
| 4565 |
|
|
| 4566 |
# |
## Reprocess in the "after head" insertion mode... |
| 4567 |
} else { |
} elsif ($self->{insertion_mode} == AFTER_HEAD_IM) { |
| 4568 |
if ($self->{insertion_mode} == AFTER_HEAD_IM) { |
!!!cp ('t143.4'); |
|
!!!cp ('t144'); |
|
| 4569 |
# |
# |
| 4570 |
} else { |
} else { |
| 4571 |
!!!cp ('t145'); |
die "$0: $self->{insertion_mode}: Unknown insertion mode"; |
|
!!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token); |
|
|
## Ignore the token |
|
|
!!!next-token; |
|
|
next B; |
|
| 4572 |
} |
} |
| 4573 |
|
|
| 4574 |
|
## ISSUE: does not agree with IE7 - it doesn't ignore </br>. |
| 4575 |
|
!!!parse-error (type => 'unmatched end tag:br', token => $token); |
| 4576 |
|
## Ignore the token |
| 4577 |
|
!!!next-token; |
| 4578 |
|
next B; |
| 4579 |
|
} else { |
| 4580 |
|
!!!cp ('t145'); |
| 4581 |
|
!!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token); |
| 4582 |
|
## Ignore the token |
| 4583 |
|
!!!next-token; |
| 4584 |
|
next B; |
| 4585 |
} |
} |
| 4586 |
|
|
| 4587 |
if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { |
if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { |
| 6306 |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec. |
| 6307 |
|
|
| 6308 |
unless ($self->{confident}) { |
unless ($self->{confident}) { |
| 6309 |
if ($token->{attributes}->{charset}) { ## TODO: And if supported |
if ($token->{attributes}->{charset}) { |
| 6310 |
!!!cp ('t335'); |
!!!cp ('t335'); |
| 6311 |
|
## NOTE: Whether the encoding is supported or not is handled |
| 6312 |
|
## in the {change_encoding} callback. |
| 6313 |
$self->{change_encoding} |
$self->{change_encoding} |
| 6314 |
->($self, $token->{attributes}->{charset}->{value}, $token); |
->($self, $token->{attributes}->{charset}->{value}, $token); |
| 6315 |
|
|
| 6318 |
$token->{attributes}->{charset} |
$token->{attributes}->{charset} |
| 6319 |
->{has_reference}); |
->{has_reference}); |
| 6320 |
} elsif ($token->{attributes}->{content}) { |
} elsif ($token->{attributes}->{content}) { |
|
## ISSUE: The algorithm name in the spec was incorrect, so it was not linked to the definition. |
|
| 6321 |
if ($token->{attributes}->{content}->{value} |
if ($token->{attributes}->{content}->{value} |
| 6322 |
=~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt] |
=~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt] |
| 6323 |
[\x09-\x0D\x20]*= |
[\x09-\x0D\x20]*= |
| 6324 |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
[\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'| |
| 6325 |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) { |
| 6326 |
!!!cp ('t336'); |
!!!cp ('t336'); |
| 6327 |
|
## NOTE: Whether the encoding is supported or not is handled |
| 6328 |
|
## in the {change_encoding} callback. |
| 6329 |
$self->{change_encoding} |
$self->{change_encoding} |
| 6330 |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token); |
->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token); |
| 6331 |
$meta_el->[0]->get_attribute_node_ns (undef, 'content') |
$meta_el->[0]->get_attribute_node_ns (undef, 'content') |
| 6754 |
} elsif ($token->{tag_name} eq 'math' or |
} elsif ($token->{tag_name} eq 'math' or |
| 6755 |
$token->{tag_name} eq 'svg') { |
$token->{tag_name} eq 'svg') { |
| 6756 |
$reconstruct_active_formatting_elements->($insert_to_current); |
$reconstruct_active_formatting_elements->($insert_to_current); |
| 6757 |
|
|
| 6758 |
|
## "adjust SVG attributes" ('svg' only) - done in insert-element-f |
| 6759 |
|
|
| 6760 |
|
## "adjust foreign attributes" - done in insert-element-f |
| 6761 |
|
|
| 6762 |
!!!insert-element-f ($token->{tag_name} eq 'math' ? $MML_NS : $SVG_NS, $token); |
!!!insert-element-f ($token->{tag_name} eq 'math' ? $MML_NS : $SVG_NS, $token->{tag_name}, $token->{attributes}, $token); |
| 6763 |
|
|
| 6764 |
if ($self->{self_closing}) { |
if ($self->{self_closing}) { |
| 6765 |
pop @{$self->{open_elements}}; |
pop @{$self->{open_elements}}; |
| 7299 |
!!!cp ('i4'); |
!!!cp ('i4'); |
| 7300 |
!!!parse-error (type => 'NULL'); |
!!!parse-error (type => 'NULL'); |
| 7301 |
$self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
$self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
| 7302 |
|
} elsif ($self->{next_char} <= 0x0008 or |
| 7303 |
|
(0x000E <= $self->{next_char} and |
| 7304 |
|
$self->{next_char} <= 0x001F) or |
| 7305 |
|
(0x007F <= $self->{next_char} and |
| 7306 |
|
$self->{next_char} <= 0x009F) or |
| 7307 |
|
(0xD800 <= $self->{next_char} and |
| 7308 |
|
$self->{next_char} <= 0xDFFF) or |
| 7309 |
|
(0xFDD0 <= $self->{next_char} and |
| 7310 |
|
$self->{next_char} <= 0xFDDF) or |
| 7311 |
|
{ |
| 7312 |
|
0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1, |
| 7313 |
|
0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1, |
| 7314 |
|
0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1, |
| 7315 |
|
0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1, |
| 7316 |
|
0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1, |
| 7317 |
|
0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1, |
| 7318 |
|
0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1, |
| 7319 |
|
0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1, |
| 7320 |
|
0x10FFFE => 1, 0x10FFFF => 1, |
| 7321 |
|
}->{$self->{next_char}}) { |
| 7322 |
|
!!!cp ('i4.1'); |
| 7323 |
|
!!!parse-error (type => 'control char', level => $self->{must_level}); |
| 7324 |
|
## TODO: error type documentation |
| 7325 |
} |
} |
| 7326 |
}; |
}; |
| 7327 |
$p->{prev_char} = [-1, -1, -1]; |
$p->{prev_char} = [-1, -1, -1]; |