package What::HTML;
use strict;
our $VERSION=do{my @r=(q$Revision: 1.7 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
## This is a very, very early version of an HTML parser.
## Start tag names that may carry a trailing "/" directly before ">".
## The tokenizer consults this table when it sees "/" inside a start tag:
## if the next character is ">" and the tag name is listed here, the
## slash is treated as a "permitted slash".
my $permitted_slash_tag_name = {
  map { ($_ => 1) } qw(
    base link meta hr br img embed param area col input
  )
};
## Named character references: maps an entity name (written without the
## leading "&" and the trailing ";") to the character it stands for.
## A few names are listed in both lower and upper case (amp/AMP,
## copy/COPY, gt/GT, lt/LT, quot/QUOT, reg/REG).
## NOTE(review): presumably consulted by the entity-consuming part of
## the tokenizer, which is not visible in this section - confirm.
my $entity_char = {
  AElig => "\x{00C6}",
  Aacute => "\x{00C1}",
  Acirc => "\x{00C2}",
  Agrave => "\x{00C0}",
  Alpha => "\x{0391}",
  Aring => "\x{00C5}",
  Atilde => "\x{00C3}",
  Auml => "\x{00C4}",
  Beta => "\x{0392}",
  Ccedil => "\x{00C7}",
  Chi => "\x{03A7}",
  Dagger => "\x{2021}",
  Delta => "\x{0394}",
  ETH => "\x{00D0}",
  Eacute => "\x{00C9}",
  Ecirc => "\x{00CA}",
  Egrave => "\x{00C8}",
  Epsilon => "\x{0395}",
  Eta => "\x{0397}",
  Euml => "\x{00CB}",
  Gamma => "\x{0393}",
  Iacute => "\x{00CD}",
  Icirc => "\x{00CE}",
  Igrave => "\x{00CC}",
  Iota => "\x{0399}",
  Iuml => "\x{00CF}",
  Kappa => "\x{039A}",
  Lambda => "\x{039B}",
  Mu => "\x{039C}",
  Ntilde => "\x{00D1}",
  Nu => "\x{039D}",
  OElig => "\x{0152}",
  Oacute => "\x{00D3}",
  Ocirc => "\x{00D4}",
  Ograve => "\x{00D2}",
  Omega => "\x{03A9}",
  Omicron => "\x{039F}",
  Oslash => "\x{00D8}",
  Otilde => "\x{00D5}",
  Ouml => "\x{00D6}",
  Phi => "\x{03A6}",
  Pi => "\x{03A0}",
  Prime => "\x{2033}",
  Psi => "\x{03A8}",
  Rho => "\x{03A1}",
  Scaron => "\x{0160}",
  Sigma => "\x{03A3}",
  THORN => "\x{00DE}",
  Tau => "\x{03A4}",
  Theta => "\x{0398}",
  Uacute => "\x{00DA}",
  Ucirc => "\x{00DB}",
  Ugrave => "\x{00D9}",
  Upsilon => "\x{03A5}",
  Uuml => "\x{00DC}",
  Xi => "\x{039E}",
  Yacute => "\x{00DD}",
  Yuml => "\x{0178}",
  Zeta => "\x{0396}",
  aacute => "\x{00E1}",
  acirc => "\x{00E2}",
  acute => "\x{00B4}",
  aelig => "\x{00E6}",
  agrave => "\x{00E0}",
  alefsym => "\x{2135}",
  alpha => "\x{03B1}",
  amp => "\x{0026}",
  AMP => "\x{0026}",
  and => "\x{2227}",
  ang => "\x{2220}",
  apos => "\x{0027}",
  aring => "\x{00E5}",
  asymp => "\x{2248}",
  atilde => "\x{00E3}",
  auml => "\x{00E4}",
  bdquo => "\x{201E}",
  beta => "\x{03B2}",
  brvbar => "\x{00A6}",
  bull => "\x{2022}",
  cap => "\x{2229}",
  ccedil => "\x{00E7}",
  cedil => "\x{00B8}",
  cent => "\x{00A2}",
  chi => "\x{03C7}",
  circ => "\x{02C6}",
  clubs => "\x{2663}",
  cong => "\x{2245}",
  copy => "\x{00A9}",
  COPY => "\x{00A9}",
  crarr => "\x{21B5}",
  cup => "\x{222A}",
  curren => "\x{00A4}",
  dArr => "\x{21D3}",
  dagger => "\x{2020}",
  darr => "\x{2193}",
  deg => "\x{00B0}",
  delta => "\x{03B4}",
  diams => "\x{2666}",
  divide => "\x{00F7}",
  eacute => "\x{00E9}",
  ecirc => "\x{00EA}",
  egrave => "\x{00E8}",
  empty => "\x{2205}",
  emsp => "\x{2003}",
  ensp => "\x{2002}",
  epsilon => "\x{03B5}",
  equiv => "\x{2261}",
  eta => "\x{03B7}",
  eth => "\x{00F0}",
  euml => "\x{00EB}",
  euro => "\x{20AC}",
  exist => "\x{2203}",
  fnof => "\x{0192}",
  forall => "\x{2200}",
  frac12 => "\x{00BD}",
  frac14 => "\x{00BC}",
  frac34 => "\x{00BE}",
  frasl => "\x{2044}",
  gamma => "\x{03B3}",
  ge => "\x{2265}",
  gt => "\x{003E}",
  GT => "\x{003E}",
  hArr => "\x{21D4}",
  harr => "\x{2194}",
  hearts => "\x{2665}",
  hellip => "\x{2026}",
  iacute => "\x{00ED}",
  icirc => "\x{00EE}",
  iexcl => "\x{00A1}",
  igrave => "\x{00EC}",
  image => "\x{2111}",
  infin => "\x{221E}",
  int => "\x{222B}",
  iota => "\x{03B9}",
  iquest => "\x{00BF}",
  isin => "\x{2208}",
  iuml => "\x{00EF}",
  kappa => "\x{03BA}",
  lArr => "\x{21D0}",
  lambda => "\x{03BB}",
  lang => "\x{2329}",
  laquo => "\x{00AB}",
  larr => "\x{2190}",
  lceil => "\x{2308}",
  ldquo => "\x{201C}",
  le => "\x{2264}",
  lfloor => "\x{230A}",
  lowast => "\x{2217}",
  loz => "\x{25CA}",
  lrm => "\x{200E}",
  lsaquo => "\x{2039}",
  lsquo => "\x{2018}",
  lt => "\x{003C}",
  LT => "\x{003C}",
  macr => "\x{00AF}",
  mdash => "\x{2014}",
  micro => "\x{00B5}",
  middot => "\x{00B7}",
  minus => "\x{2212}",
  mu => "\x{03BC}",
  nabla => "\x{2207}",
  nbsp => "\x{00A0}",
  ndash => "\x{2013}",
  ne => "\x{2260}",
  ni => "\x{220B}",
  not => "\x{00AC}",
  notin => "\x{2209}",
  nsub => "\x{2284}",
  ntilde => "\x{00F1}",
  nu => "\x{03BD}",
  oacute => "\x{00F3}",
  ocirc => "\x{00F4}",
  oelig => "\x{0153}",
  ograve => "\x{00F2}",
  oline => "\x{203E}",
  omega => "\x{03C9}",
  omicron => "\x{03BF}",
  oplus => "\x{2295}",
  or => "\x{2228}",
  ordf => "\x{00AA}",
  ordm => "\x{00BA}",
  oslash => "\x{00F8}",
  otilde => "\x{00F5}",
  otimes => "\x{2297}",
  ouml => "\x{00F6}",
  para => "\x{00B6}",
  part => "\x{2202}",
  permil => "\x{2030}",
  perp => "\x{22A5}",
  phi => "\x{03C6}",
  pi => "\x{03C0}",
  piv => "\x{03D6}",
  plusmn => "\x{00B1}",
  pound => "\x{00A3}",
  prime => "\x{2032}",
  prod => "\x{220F}",
  prop => "\x{221D}",
  psi => "\x{03C8}",
  quot => "\x{0022}",
  QUOT => "\x{0022}",
  rArr => "\x{21D2}",
  radic => "\x{221A}",
  rang => "\x{232A}",
  raquo => "\x{00BB}",
  rarr => "\x{2192}",
  rceil => "\x{2309}",
  rdquo => "\x{201D}",
  real => "\x{211C}",
  reg => "\x{00AE}",
  REG => "\x{00AE}",
  rfloor => "\x{230B}",
  rho => "\x{03C1}",
  rlm => "\x{200F}",
  rsaquo => "\x{203A}",
  rsquo => "\x{2019}",
  sbquo => "\x{201A}",
  scaron => "\x{0161}",
  sdot => "\x{22C5}",
  sect => "\x{00A7}",
  shy => "\x{00AD}",
  sigma => "\x{03C3}",
  sigmaf => "\x{03C2}",
  sim => "\x{223C}",
  spades => "\x{2660}",
  sub => "\x{2282}",
  sube => "\x{2286}",
  sum => "\x{2211}",
  sup => "\x{2283}",
  sup1 => "\x{00B9}",
  sup2 => "\x{00B2}",
  sup3 => "\x{00B3}",
  supe => "\x{2287}",
  szlig => "\x{00DF}",
  tau => "\x{03C4}",
  there4 => "\x{2234}",
  theta => "\x{03B8}",
  thetasym => "\x{03D1}",
  thinsp => "\x{2009}",
  thorn => "\x{00FE}",
  tilde => "\x{02DC}",
  times => "\x{00D7}",
  trade => "\x{2122}",
  uArr => "\x{21D1}",
  uacute => "\x{00FA}",
  uarr => "\x{2191}",
  ucirc => "\x{00FB}",
  ugrave => "\x{00F9}",
  uml => "\x{00A8}",
  upsih => "\x{03D2}",
  upsilon => "\x{03C5}",
  uuml => "\x{00FC}",
  weierp => "\x{2118}",
  xi => "\x{03BE}",
  yacute => "\x{00FD}",
  yen => "\x{00A5}",
  yuml => "\x{00FF}",
  zeta => "\x{03B6}",
  zwj => "\x{200D}",
  zwnj => "\x{200C}",
};
## Set of element names belonging to the "special" category (each key
## maps to a true value).
## NOTE(review): presumably consulted by the tree construction code,
## which is not visible in this section - confirm.
my $special_category = {
  map { ($_ => 1) } qw(
    address area base basefont bgsound
    blockquote body br center col colgroup
    dd dir div dl dt embed fieldset
    form frame frameset h1 h2 h3
    h4 h5 h6 head hr iframe image
    img input isindex li link listing
    menu meta noembed noframes noscript
    ol optgroup option p param plaintext
    pre script select spacer style tbody
    textarea tfoot thead title tr ul wbr
  )
};
## Set of element names belonging to the "scoping" category (each key
## maps to a true value).
## NOTE(review): presumably consulted by the tree construction code,
## which is not visible in this section - confirm.
my $scoping_category = {
  map { ($_ => 1) } qw(
    button caption html marquee object
    table td th
  )
};
## Set of element names belonging to the "formatting" category (each
## key maps to a true value).  The element is |strike|; it used to be
## misspelled here as "strile", which no real tag name could match.
## NOTE(review): presumably consulted by the tree construction code,
## which is not visible in this section - confirm.
my $formatting_category = {
  map { ($_ => 1) } qw(
    a b big em font i nobr
    s small strike strong tt u
  )
};
# $phrasing_category: all other elements
## Constructor.  Returns a new, empty parser object blessed into the
## class it was invoked on.
##
## Two default callbacks are installed on the object:
##   ->{set_next_input_character}  Sets ->{next_input_character} to -1,
##       the value the tokenizer's data state treats as end-of-file.
##   ->{parse_error}  Does nothing (no handler body is visible in this
##       section of the file, so it is kept as a no-op).
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;

  ## The callback below is stored inside the object itself.  If it
  ## closed over a strong reference to the object, the two would form
  ## a reference cycle and the object could never be freed, so the
  ## closure only sees a weakened copy.
  require Scalar::Util;
  my $weak_self = $self;
  Scalar::Util::weaken ($weak_self);

  $self->{set_next_input_character} = sub {
    $weak_self->{next_input_character} = -1;
  };
  $self->{parse_error} = sub {
    ## Parse errors are ignored by default.
  };
  return $self;
} # new
## Implementations MUST act as if they used the state machine in the spec.
## Resets the tokenizer part of the parser object to its initial state:
## the state becomes 'data', the content model flag becomes 'PCDATA',
## the token / attribute bookkeeping slots are cleared, the character
## buffer ->{char} and the pending token queue ->{token} are emptied,
## and ->{next_input_character} is primed with the first character.
sub _initialize_tokenizer ($) {
  my $self = shift;
  $self->{state} = 'data'; # MUST
  $self->{content_model_flag} = 'PCDATA'; # be
  undef $self->{current_token}; # start tag, end tag, comment, or DOCTYPE
  undef $self->{current_attribute};
  undef $self->{last_emitted_start_tag_name};
  undef $self->{last_attribute_value_state};
  $self->{char} = [];
  # $self->{next_input_character}
  if (@{$self->{char}}) {
    $self->{next_input_character} = shift @{$self->{char}};
  } else {
    ## The buffer is empty, so ask the input callback for the next
    ## character.  The default callback installed by |new| sets
    ## ->{next_input_character} to -1 (end-of-file).
    ## NOTE(review): the original body of this branch is not visible
    ## in this file; the callback invocation is reconstructed - confirm.
    $self->{set_next_input_character}->($self);
  }
  $self->{token} = [];
} # _initialize_tokenizer
## A token has:
## ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
## 'character', or 'end-of-file'
## ->{name} (DOCTYPE, start tag (tagname), end tag (tagname))
## ISSUE: the spec need s/tagname/tag name/
## ->{error} == 1 or 0 (DOCTYPE)
## ->{attributes} isa HASH (start tag, end tag)
## ->{data} (comment, character)
## Macros
## Macros MUST be preceded by three EXCLAMATION MARKs.
## emit ($token)
## Emits the specified token.
## Emitted token MUST immediately be handled by the tree construction state.
## Before each step, UA MAY check to see if either one of the scripts in
## "list of scripts that will execute as soon as possible" or the first
## script in the "list of scripts that will execute asynchronously",
## has completed loading. If one has, then it MUST be executed
## and removed from the list.
sub _get_next_token ($) {
my $self = shift;
if (@{$self->{token}}) {
return shift @{$self->{token}};
A: {
if ($self->{state} eq 'data') {
if ($self->{next_input_character} == 0x0026) { # &
if ($self->{content_model_flag} eq 'PCDATA' or
$self->{content_model_flag} eq 'RCDATA') {
$self->{state} = 'entity data';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} else {
} elsif ($self->{next_input_character} == 0x003C) { # <
if ($self->{content_model_flag} ne 'PLAINTEXT') {
$self->{state} = 'tag open';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} else {
} elsif ($self->{next_input_character} == -1) {
return ({type => 'end-of-file'});
last A; ## TODO: ok?
# Anything else
my $token = {type => 'character',
data => chr $self->{next_input_character}};
## Stay in the data state
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
return ($token);
redo A;
} elsif ($self->{state} eq 'entity data') {
## (cannot happen in CDATA state)
my $token = $self->_tokenize_attempt_to_consume_an_entity;
$self->{state} = 'data';
# next-input-character is already done
unless (defined $token) {
return ({type => 'character', data => '&'});
} else {
return ($token);
redo A;
} elsif ($self->{state} eq 'tag open') {
if ($self->{content_model_flag} eq 'RCDATA' or
$self->{content_model_flag} eq 'CDATA') {
if ($self->{next_input_character} == 0x002F) { # /
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
$self->{state} = 'close tag open';
redo A;
} else {
## reconsume
$self->{state} = 'data';
return (type => 'character', data => {'/'});
redo A;
} elsif ($self->{content_model_flag} eq 'PCDATA') {
if ($self->{next_input_character} == 0x0021) { # !
$self->{state} = 'markup declaration open';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x002F) { # /
$self->{state} = 'close tag open';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif (0x0041 <= $self->{next_input_character} and
$self->{next_input_character} <= 0x005A) { # A..Z
= {type => 'start tag',
tag_name => chr ($self->{next_input_character} + 0x0020)};
$self->{state} = 'tag name';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif (0x0061 <= $self->{next_input_character} and
$self->{next_input_character} <= 0x007A) { # a..z
$self->{current_token} = {type => 'start tag',
tag_name => chr ($self->{next_input_character})};
$self->{state} = 'tag name';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x003E) { # >
$self->{state} = 'data';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
return ({type => 'character', data => '<>'});
redo A;
} elsif ($self->{next_input_character} == 0x003F) { # ?
$self->{state} = 'bogus comment';
## $self->{next_input_character} is intentionally left as is
redo A;
} else {
$self->{state} = 'data';
## reconsume
return ({type => 'character', data => '<'});
redo A;
} else {
die "$0: $self->{content_model_flag}: Unknown content model flag";
} elsif ($self->{state} eq 'close tag open') {
if ($self->{content_model_flag} eq 'RCDATA' or
$self->{content_model_flag} eq 'CDATA') {
my @next_char;
TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
push @next_char, $self->{next_input_character};
my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
} else {
$self->{next_input_character} = shift @next_char; # reconsume
unshift @{$self->{char}}, (@next_char);
$self->{state} = 'data';
return ({type => 'character', data => ''});
redo A;
push @next_char, $self->{next_input_character};
unless ($self->{next_input_character} == 0x0009 or # HT
$self->{next_input_character} == 0x000A or # LF
$self->{next_input_character} == 0x000B or # VT
$self->{next_input_character} == 0x000C or # FF
$self->{next_input_character} == 0x0020 or # SP
$self->{next_input_character} == 0x003E or # >
$self->{next_input_character} == 0x002F or # /
$self->{next_input_character} == 0x003C or # <
$self->{next_input_character} == -1) {
$self->{next_input_character} = shift @next_char; # reconsume
unshift @{$self->{char}}, (@next_char);
$self->{state} = 'data';
return ({type => 'character', data => ''});
redo A;
} else {
$self->{next_input_character} = shift @next_char;
unshift @{$self->{char}}, (@next_char);
# and consume...
if (0x0041 <= $self->{next_input_character} and
$self->{next_input_character} <= 0x005A) { # A..Z
$self->{current_token} = {type => 'end tag',
tag_name => chr ($self->{next_input_character} + 0x0020)};
$self->{state} = 'tag name';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif (0x0061 <= $self->{next_input_character} and
$self->{next_input_character} <= 0x007A) { # a..z
$self->{current_token} = {type => 'end tag',
tag_name => chr ($self->{next_input_character})};
$self->{state} = 'tag name';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x003E) { # >
$self->{state} = 'data';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == -1) {
$self->{state} = 'data';
# reconsume
return ({type => 'character', data => ''});
redo A;
} else {
$self->{state} = 'bogus comment';
## $self->{next_input_character} is intentionally left as is
redo A;
} elsif ($self->{state} eq 'tag name') {
if ($self->{next_input_character} == 0x0009 or # HT
$self->{next_input_character} == 0x000A or # LF
$self->{next_input_character} == 0x000B or # VT
$self->{next_input_character} == 0x000C or # FF
$self->{next_input_character} == 0x0020) { # SP
$self->{state} = 'before attribute name';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x003E) { # >
if ($self->{current_token}->{type} eq 'start tag') {
$self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
} elsif ($self->{current_token}->{type} eq 'end tag') {
$self->{content_model_flag} = 'PCDATA'; # MUST
if ($self->{current_token}->{attributes}) {
} else {
die "$0: $self->{current_token}->{type}: Unknown token type";
$self->{state} = 'data';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
return ($self->{current_token}); # start tag or end tag
undef $self->{current_token};
redo A;
} elsif (0x0041 <= $self->{next_input_character} and
$self->{next_input_character} <= 0x005A) { # A..Z
$self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
# start tag or end tag
## Stay in this state
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x003C or # <
$self->{next_input_character} == -1) {
if ($self->{current_token}->{type} eq 'start tag') {
$self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
} elsif ($self->{current_token}->{type} eq 'end tag') {
$self->{content_model_flag} = 'PCDATA'; # MUST
if ($self->{current_token}->{attributes}) {
} else {
die "$0: $self->{current_token}->{type}: Unknown token type";
$self->{state} = 'data';
# reconsume
return ($self->{current_token}); # start tag or end tag
undef $self->{current_token};
redo A;
} elsif ($self->{next_input_character} == 0x002F) { # /
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
if ($self->{next_input_character} == 0x003E and # >
$self->{current_token}->{type} eq 'start tag' and
$permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
# permitted slash
} else {
$self->{state} = 'before attribute name';
# next-input-character is already done
redo A;
} else {
$self->{current_token}->{tag_name} .= chr $self->{next_input_character};
# start tag or end tag
## Stay in the state
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{state} eq 'before attribute name') {
if ($self->{next_input_character} == 0x0009 or # HT
$self->{next_input_character} == 0x000A or # LF
$self->{next_input_character} == 0x000B or # VT
$self->{next_input_character} == 0x000C or # FF
$self->{next_input_character} == 0x0020) { # SP
## Stay in the state
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x003E) { # >
if ($self->{current_token}->{type} eq 'start tag') {
$self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
} elsif ($self->{current_token}->{type} eq 'end tag') {
$self->{content_model_flag} = 'PCDATA'; # MUST
if ($self->{current_token}->{attributes}) {
} else {
die "$0: $self->{current_token}->{type}: Unknown token type";
$self->{state} = 'data';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
return ($self->{current_token}); # start tag or end tag
undef $self->{current_token};
redo A;
} elsif (0x0041 <= $self->{next_input_character} and
$self->{next_input_character} <= 0x005A) { # A..Z
$self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
value => ''};
$self->{state} = 'attribute name';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x002F) { # /
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
if ($self->{next_input_character} == 0x003E and # >
$self->{current_token}->{type} eq 'start tag' and
$permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
# permitted slash
} else {
## Stay in the state
# next-input-character is already done
redo A;
} elsif ($self->{next_input_character} == 0x003C or # <
$self->{next_input_character} == -1) {
if ($self->{current_token}->{type} eq 'start tag') {
$self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
} elsif ($self->{current_token}->{type} eq 'end tag') {
$self->{content_model_flag} = 'PCDATA'; # MUST
if ($self->{current_token}->{attributes}) {
} else {
die "$0: $self->{current_token}->{type}: Unknown token type";
$self->{state} = 'data';
# reconsume
return ($self->{current_token}); # start tag or end tag
undef $self->{current_token};
redo A;
} else {
$self->{current_attribute} = {name => chr ($self->{next_input_character}),
value => ''};
$self->{state} = 'attribute name';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{state} eq 'attribute name') {
my $before_leave = sub {
if (exists $self->{current_token}->{attributes} # start tag or end tag
->{$self->{current_attribute}->{name}}) { # MUST
## Discard $self->{current_attribute} # MUST
} else {
= $self->{current_attribute};
}; # $before_leave
if ($self->{next_input_character} == 0x0009 or # HT
$self->{next_input_character} == 0x000A or # LF
$self->{next_input_character} == 0x000B or # VT
$self->{next_input_character} == 0x000C or # FF
$self->{next_input_character} == 0x0020) { # SP
$self->{state} = 'after attribute name';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x003D) { # =
$self->{state} = 'before attribute value';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x003E) { # >
if ($self->{current_token}->{type} eq 'start tag') {
$self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
} elsif ($self->{current_token}->{type} eq 'end tag') {
$self->{content_model_flag} = 'PCDATA'; # MUST
if ($self->{current_token}->{attributes}) {
} else {
die "$0: $self->{current_token}->{type}: Unknown token type";
$self->{state} = 'data';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
return ($self->{current_token}); # start tag or end tag
undef $self->{current_token};
redo A;
} elsif (0x0041 <= $self->{next_input_character} and
$self->{next_input_character} <= 0x005A) { # A..Z
$self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
## Stay in the state
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x002F) { # /
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
if ($self->{next_input_character} == 0x003E and # >
$self->{current_token}->{type} eq 'start tag' and
$permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
# permitted slash
} else {
$self->{state} = 'before attribute name';
# next-input-character is already done
redo A;
} elsif ($self->{next_input_character} == 0x003C or # <
$self->{next_input_character} == -1) {
if ($self->{current_token}->{type} eq 'start tag') {
$self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
} elsif ($self->{current_token}->{type} eq 'end tag') {
$self->{content_model_flag} = 'PCDATA'; # MUST
if ($self->{current_token}->{attributes}) {
} else {
die "$0: $self->{current_token}->{type}: Unknown token type";
$self->{state} = 'data';
# reconsume
return ($self->{current_token}); # start tag or end tag
undef $self->{current_token};
redo A;
} else {
$self->{current_attribute}->{name} .= chr ($self->{next_input_character});
## Stay in the state
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{state} eq 'after attribute name') {
if ($self->{next_input_character} == 0x0009 or # HT
$self->{next_input_character} == 0x000A or # LF
$self->{next_input_character} == 0x000B or # VT
$self->{next_input_character} == 0x000C or # FF
$self->{next_input_character} == 0x0020) { # SP
## Stay in the state
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x003D) { # =
$self->{state} = 'before attribute value';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x003E) { # >
if ($self->{current_token}->{type} eq 'start tag') {
$self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
} elsif ($self->{current_token}->{type} eq 'end tag') {
$self->{content_model_flag} = 'PCDATA'; # MUST
if ($self->{current_token}->{attributes}) {
} else {
die "$0: $self->{current_token}->{type}: Unknown token type";
$self->{state} = 'data';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
return ($self->{current_token}); # start tag or end tag
undef $self->{current_token};
redo A;
} elsif (0x0041 <= $self->{next_input_character} and
$self->{next_input_character} <= 0x005A) { # A..Z
$self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
value => ''};
$self->{state} = 'attribute name';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x002F) { # /
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
if ($self->{next_input_character} == 0x003E and # >
$self->{current_token}->{type} eq 'start tag' and
$permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
# permitted slash
} else {
$self->{state} = 'before attribute name';
# next-input-character is already done
redo A;
} elsif ($self->{next_input_character} == 0x003C or # <
$self->{next_input_character} == -1) {
if ($self->{current_token}->{type} eq 'start tag') {
$self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
} elsif ($self->{current_token}->{type} eq 'end tag') {
$self->{content_model_flag} = 'PCDATA'; # MUST
if ($self->{current_token}->{attributes}) {
} else {
die "$0: $self->{current_token}->{type}: Unknown token type";
$self->{state} = 'data';
# reconsume
return ($self->{current_token}); # start tag or end tag
undef $self->{current_token};
redo A;
} else {
$self->{current_attribute} = {name => chr ($self->{next_input_character}),
value => ''};
$self->{state} = 'attribute name';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{state} eq 'before attribute value') {
if ($self->{next_input_character} == 0x0009 or # HT
$self->{next_input_character} == 0x000A or # LF
$self->{next_input_character} == 0x000B or # VT
$self->{next_input_character} == 0x000C or # FF
$self->{next_input_character} == 0x0020) { # SP
## Stay in the state
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x0022) { # "
$self->{state} = 'attribute value (double-quoted)';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x0026) { # &
$self->{state} = 'attribute value (unquoted)';
## reconsume
redo A;
} elsif ($self->{next_input_character} == 0x0027) { # '
$self->{state} = 'attribute value (single-quoted)';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x003E) { # >
if ($self->{current_token}->{type} eq 'start tag') {
$self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
} elsif ($self->{current_token}->{type} eq 'end tag') {
$self->{content_model_flag} = 'PCDATA'; # MUST
if ($self->{current_token}->{attributes}) {
} else {
die "$0: $self->{current_token}->{type}: Unknown token type";
$self->{state} = 'data';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
return ($self->{current_token}); # start tag or end tag
undef $self->{current_token};
redo A;
} elsif ($self->{next_input_character} == 0x003C or # <
$self->{next_input_character} == -1) {
if ($self->{current_token}->{type} eq 'start tag') {
$self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
} elsif ($self->{current_token}->{type} eq 'end tag') {
$self->{content_model_flag} = 'PCDATA'; # MUST
if ($self->{current_token}->{attributes}) {
} else {
die "$0: $self->{current_token}->{type}: Unknown token type";
$self->{state} = 'data';
## reconsume
return ($self->{current_token}); # start tag or end tag
undef $self->{current_token};
redo A;
} else {
$self->{current_attribute}->{value} .= chr ($self->{next_input_character});
$self->{state} = 'attribute value (unquoted)';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{state} eq 'attribute value (double-quoted)') {
if ($self->{next_input_character} == 0x0022) { # "
$self->{state} = 'before attribute name';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x0026) { # &
$self->{last_attribute_value_state} = 'attribute value (double-quoted)';
$self->{state} = 'entity in attribute value';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == -1) {
if ($self->{current_token}->{type} eq 'start tag') {
$self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
} elsif ($self->{current_token}->{type} eq 'end tag') {
$self->{content_model_flag} = 'PCDATA'; # MUST
if ($self->{current_token}->{attributes}) {
} else {
die "$0: $self->{current_token}->{type}: Unknown token type";
$self->{state} = 'data';
## reconsume
return ($self->{current_token}); # start tag or end tag
undef $self->{current_token};
redo A;
} else {
$self->{current_attribute}->{value} .= chr ($self->{next_input_character});
## Stay in the state
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{state} eq 'attribute value (single-quoted)') {
if ($self->{next_input_character} == 0x0027) { # '
$self->{state} = 'before attribute name';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x0026) { # &
$self->{last_attribute_value_state} = 'attribute value (single-quoted)';
$self->{state} = 'entity in attribute value';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == -1) {
if ($self->{current_token}->{type} eq 'start tag') {
$self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
} elsif ($self->{current_token}->{type} eq 'end tag') {
$self->{content_model_flag} = 'PCDATA'; # MUST
if ($self->{current_token}->{attributes}) {
} else {
die "$0: $self->{current_token}->{type}: Unknown token type";
$self->{state} = 'data';
## reconsume
return ($self->{current_token}); # start tag or end tag
undef $self->{current_token};
redo A;
} else {
$self->{current_attribute}->{value} .= chr ($self->{next_input_character});
## Stay in the state
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{state} eq 'attribute value (unquoted)') {
if ($self->{next_input_character} == 0x0009 or # HT
$self->{next_input_character} == 0x000A or # LF
$self->{next_input_character} == 0x000B or # HT
$self->{next_input_character} == 0x000C or # FF
$self->{next_input_character} == 0x0020) { # SP
$self->{state} = 'before attribute name';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x0026) { # &
$self->{last_attribute_value_state} = 'attribute value (unquoted)';
$self->{state} = 'entity in attribute value';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x003E) { # >
if ($self->{current_token}->{type} eq 'start tag') {
$self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
} elsif ($self->{current_token}->{type} eq 'end tag') {
$self->{content_model_flag} = 'PCDATA'; # MUST
if ($self->{current_token}->{attributes}) {
} else {
die "$0: $self->{current_token}->{type}: Unknown token type";
$self->{state} = 'data';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
return ($self->{current_token}); # start tag or end tag
undef $self->{current_token};
redo A;
} elsif ($self->{next_input_character} == 0x003C or # <
$self->{next_input_character} == -1) {
if ($self->{current_token}->{type} eq 'start tag') {
$self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
} elsif ($self->{current_token}->{type} eq 'end tag') {
$self->{content_model_flag} = 'PCDATA'; # MUST
if ($self->{current_token}->{attributes}) {
} else {
die "$0: $self->{current_token}->{type}: Unknown token type";
$self->{state} = 'data';
## reconsume
return ($self->{current_token}); # start tag or end tag
undef $self->{current_token};
redo A;
} else {
$self->{current_attribute}->{value} .= chr ($self->{next_input_character});
## Stay in the state
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{state} eq 'entity in attribute value') {
my $token = $self->_tokenize_attempt_to_consume_an_entity;
unless (defined $token) {
$self->{current_attribute}->{value} .= '&';
} else {
$self->{current_attribute}->{value} .= $token->{data};
## ISSUE: spec says "append the returned character token to the current attribute's value"
$self->{state} = $self->{last_attribute_value_state};
# next-input-character is already done
redo A;
} elsif ($self->{state} eq 'bogus comment') {
## (only happen if PCDATA state)
my $token = {type => 'comment', data => ''};
BC: {
if ($self->{next_input_character} == 0x003E) { # >
$self->{state} = 'data';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
return ($token);
redo A;
} elsif ($self->{next_input_character} == -1) {
$self->{state} = 'data';
## reconsume
return ($token);
redo A;
} else {
$token->{data} .= chr ($self->{next_input_character});
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo BC;
} # BC
} elsif ($self->{state} eq 'markup declaration open') {
## (only happen if PCDATA state)
my @next_char;
push @next_char, $self->{next_input_character};
if ($self->{next_input_character} == 0x002D) { # -
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
push @next_char, $self->{next_input_character};
if ($self->{next_input_character} == 0x002D) { # -
$self->{current_token} = {type => 'comment', data => ''};
$self->{state} = 'comment';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x0044 or # D
$self->{next_input_character} == 0x0064) { # d
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
push @next_char, $self->{next_input_character};
if ($self->{next_input_character} == 0x004F or # O
$self->{next_input_character} == 0x006F) { # o
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
push @next_char, $self->{next_input_character};
if ($self->{next_input_character} == 0x0043 or # C
$self->{next_input_character} == 0x0063) { # c
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
push @next_char, $self->{next_input_character};
if ($self->{next_input_character} == 0x0054 or # T
$self->{next_input_character} == 0x0074) { # t
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
push @next_char, $self->{next_input_character};
if ($self->{next_input_character} == 0x0059 or # Y
$self->{next_input_character} == 0x0079) { # y
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
push @next_char, $self->{next_input_character};
if ($self->{next_input_character} == 0x0050 or # P
$self->{next_input_character} == 0x0070) { # p
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
push @next_char, $self->{next_input_character};
if ($self->{next_input_character} == 0x0045 or # E
$self->{next_input_character} == 0x0065) { # e
## ISSUE: What a stupid code this is!
$self->{state} = 'DOCTYPE';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
$self->{next_input_character} = shift @next_char;
unshift @{$self->{char}}, (@next_char);
$self->{state} = 'bogus comment';
redo A;
## ISSUE: typos in spec: chacacters, is is a parse error
## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
} elsif ($self->{state} eq 'comment') {
if ($self->{next_input_character} == 0x002D) { # -
$self->{state} = 'comment dash';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == -1) {
$self->{state} = 'data';
## reconsume
return ($self->{current_token}); # comment
undef $self->{current_token};
redo A;
} else {
$self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
## Stay in the state
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{state} eq 'comment dash') {
if ($self->{next_input_character} == 0x002D) { # -
$self->{state} = 'comment end';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == -1) {
$self->{state} = 'data';
## reconsume
return ($self->{current_token}); # comment
undef $self->{current_token};
redo A;
} else {
$self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
$self->{state} = 'comment';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{state} eq 'comment end') {
if ($self->{next_input_character} == 0x003E) { # >
$self->{state} = 'data';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
return ($self->{current_token}); # comment
undef $self->{current_token};
redo A;
} elsif ($self->{next_input_character} == 0x002D) { # -
$self->{current_token}->{data} .= '-'; # comment
## Stay in the state
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == -1) {
$self->{state} = 'data';
## reconsume
return ($self->{current_token}); # comment
undef $self->{current_token};
redo A;
} else {
$self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
$self->{state} = 'comment';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{state} eq 'DOCTYPE') {
if ($self->{next_input_character} == 0x0009 or # HT
$self->{next_input_character} == 0x000A or # LF
$self->{next_input_character} == 0x000B or # VT
$self->{next_input_character} == 0x000C or # FF
$self->{next_input_character} == 0x0020) { # SP
$self->{state} = 'before DOCTYPE name';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} else {
$self->{state} = 'before DOCTYPE name';
## reconsume
redo A;
} elsif ($self->{state} eq 'before DOCTYPE name') {
if ($self->{next_input_character} == 0x0009 or # HT
$self->{next_input_character} == 0x000A or # LF
$self->{next_input_character} == 0x000B or # VT
$self->{next_input_character} == 0x000C or # FF
$self->{next_input_character} == 0x0020) { # SP
## Stay in the state
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif (0x0061 <= $self->{next_input_character} and
$self->{next_input_character} <= 0x007A) { # a..z
$self->{current_token} = {type => 'DOCTYPE',
name => chr ($self->{next_input_character} - 0x0020),
error => 1};
$self->{state} = 'DOCTYPE name';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x003E) { # >
$self->{state} = 'data';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
return ({type => 'DOCTYPE', name => '', error => 1});
redo A;
} elsif ($self->{next_input_character} == -1) {
$self->{state} = 'data';
## reconsume
return ({type => 'DOCTYPE', name => '', error => 1});
redo A;
} else {
$self->{current_token} = {type => 'DOCTYPE',
name => chr ($self->{next_input_character}),
error => 1};
$self->{state} = 'DOCTYPE name';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{state} eq 'DOCTYPE name') {
if ($self->{next_input_character} == 0x0009 or # HT
$self->{next_input_character} == 0x000A or # LF
$self->{next_input_character} == 0x000B or # VT
$self->{next_input_character} == 0x000C or # FF
$self->{next_input_character} == 0x0020) { # SP
$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
$self->{state} = 'after DOCTYPE name';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x003E) { # >
$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
$self->{state} = 'data';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
return ($self->{current_token}); # DOCTYPE
undef $self->{current_token};
redo A;
} elsif (0x0061 <= $self->{next_input_character} and
$self->{next_input_character} <= 0x007A) { # a..z
$self->{current_token}->{name} .= chr ($self->{next_input_character} - 0x0020); # DOCTYPE
#$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
## Stay in the state
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == -1) {
$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
$self->{state} = 'data';
## reconsume
return ($self->{current_token});
undef $self->{current_token};
redo A;
} else {
.= chr ($self->{next_input_character}); # DOCTYPE
#$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
## Stay in the state
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{state} eq 'after DOCTYPE name') {
if ($self->{next_input_character} == 0x0009 or # HT
$self->{next_input_character} == 0x000A or # LF
$self->{next_input_character} == 0x000B or # VT
$self->{next_input_character} == 0x000C or # FF
$self->{next_input_character} == 0x0020) { # SP
## Stay in the state
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{next_input_character} == 0x003E) { # >
$self->{state} = 'data';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
return ($self->{current_token}); # DOCTYPE
undef $self->{current_token};
redo A;
} elsif ($self->{next_input_character} == -1) {
$self->{state} = 'data';
## reconsume
return ($self->{current_token}); # DOCTYPE
undef $self->{current_token};
redo A;
} else {
$self->{current_token}->{error} = 1; # DOCTYPE
$self->{state} = 'bogus DOCTYPE';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} elsif ($self->{state} eq 'bogus DOCTYPE') {
if ($self->{next_input_character} == 0x003E) { # >
$self->{state} = 'data';
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
return ($self->{current_token}); # DOCTYPE
undef $self->{current_token};
redo A;
} elsif ($self->{next_input_character} == -1) {
$self->{state} = 'data';
## reconsume
return ($self->{current_token}); # DOCTYPE
undef $self->{current_token};
redo A;
} else {
## Stay in the state
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
redo A;
} else {
die "$0: $self->{state}: Unknown state";
} # A
die "$0: _get_next_token: unexpected case";
} # _get_next_token
## _tokenize_attempt_to_consume_an_entity
##
## Tries to read a character reference starting at
## $self->{next_input_character}.  Three forms are recognised:
##   * "#x" / "#X" followed by hexadecimal digits,
##   * "#" followed by decimal digits,
##   * a name made of ASCII letters/digits that is looked up in $entity_char.
## Input is taken from $self->{next_input_character} and the pending
## code point queue @{$self->{char}}.
##
## Returns a hash reference {type => 'character', data => ...} when a
## reference is recognised, or undef otherwise.
##
## NOTE(review): in this listing several if/else blocks have no visible
## closing brace and some branches are empty, so the exact nesting cannot
## be read off the text -- confirm against the canonical source before
## changing any logic here.
sub _tokenize_attempt_to_consume_an_entity ($) {
my $self = shift;
## Numeric character reference: the current character is "#".
if ($self->{next_input_character} == 0x0023) { # #
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
## $num stays undefined until the first hexadecimal digit is seen; the
## "not defined $num" test below relies on that.
my $num;
if ($self->{next_input_character} == 0x0078 or # x
$self->{next_input_character} == 0x0058) { # X
## Hexadecimal form.  The bare block X is re-entered with "redo X" once
## per digit.
X: {
## Remember the character being stepped over so that it can be pushed
## back onto the queue if no hexadecimal digit follows.
my $x_char = $self->{next_input_character};
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
if (0x0030 <= $self->{next_input_character} and
$self->{next_input_character} <= 0x0039) { # 0..9
$num ||= 0;
$num *= 0x10;
$num += $self->{next_input_character} - 0x0030;
redo X;
} elsif (0x0061 <= $self->{next_input_character} and
$self->{next_input_character} <= 0x0066) { # a..f
## ISSUE: the spec says U+0078, which is apparently incorrect
$num ||= 0;
$num *= 0x10;
## 0x0061 ("a") - 0x0060 + 9 == 10, so a..f contribute 10..15.
$num += $self->{next_input_character} - 0x0060 + 9;
redo X;
} elsif (0x0041 <= $self->{next_input_character} and
$self->{next_input_character} <= 0x0046) { # A..F
## ISSUE: the spec says U+0058, which is apparently incorrect
$num ||= 0;
$num *= 0x10;
## 0x0041 ("A") - 0x0040 + 9 == 10, so A..F contribute 10..15.
$num += $self->{next_input_character} - 0x0040 + 9;
redo X;
} elsif (not defined $num) { # no hexadecimal digit
## Not a reference: make "#" the current character again, push the
## saved character back onto the queue and report failure.
$self->{next_input_character} = 0x0023; # #
unshift @{$self->{char}}, ($x_char);
return undef;
} elsif ($self->{next_input_character} == 0x003B) { # ;
## A terminating ";" is stepped over.
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
} else {
## TODO: check the definition for |a valid Unicode character|.
## 1114111 is 0x10FFFF.
## NOTE(review): no statement is visible inside this "if" in this
## listing -- confirm what is meant to happen for out-of-range values.
if ($num > 1114111 or $num == 0) {
## ISSUE: Why this is not an error?
return {type => 'character', data => chr $num};
} # X
} elsif (0x0030 <= $self->{next_input_character} and
$self->{next_input_character} <= 0x0039) { # 0..9
## Decimal form: the first digit seeds $code, the loop below accumulates
## any further digits.
my $code = $self->{next_input_character} - 0x0030;
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
while (0x0030 <= $self->{next_input_character} and
$self->{next_input_character} <= 0x0039) { # 0..9
$code *= 10;
$code += $self->{next_input_character} - 0x0030;
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
## A terminating ";" is stepped over.
if ($self->{next_input_character} == 0x003B) { # ;
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
} else {
## TODO: check the definition for |a valid Unicode character|.
## Same range test as in the hexadecimal branch (1114111 is 0x10FFFF).
if ($code > 1114111 or $code == 0) {
## ISSUE: Why this is not an error?
return {type => 'character', data => chr $code};
} else {
## "#" followed by neither "x"/"X" nor a digit: push the current
## character back, make "#" the current character again and report
## failure.
unshift @{$self->{char}}, ($self->{next_input_character});
$self->{next_input_character} = 0x0023; # #
return undef;
} elsif ((0x0041 <= $self->{next_input_character} and
$self->{next_input_character} <= 0x005A) or
(0x0061 <= $self->{next_input_character} and
$self->{next_input_character} <= 0x007A)) {
## Named reference: the current character is an ASCII letter (A..Z, a..z).
my $entity_name = chr $self->{next_input_character};
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
## $value holds the raw text read so far and is replaced by the table
## entry when $entity_name matches a key of $entity_char; $match records
## that a match was seen.
my $value = $entity_name;
my $match;
while (length $entity_name < 10 and
## NOTE: Some number greater than the maximum length of entity name
((0x0041 <= $self->{next_input_character} and
$self->{next_input_character} <= 0x005A) or
(0x0061 <= $self->{next_input_character} and
$self->{next_input_character} <= 0x007A) or
(0x0030 <= $self->{next_input_character} and
$self->{next_input_character} <= 0x0039))) {
$entity_name .= chr $self->{next_input_character};
if (defined $entity_char->{$entity_name}) {
$value = $entity_char->{$entity_name};
$match = 1;
} else {
$value .= chr $self->{next_input_character};
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
if ($match) {
## A terminating ";" is stepped over.
if ($self->{next_input_character} == 0x003B) { # ;
if (@{$self->{char}}) {
$self->{next_input_character} = shift @{$self->{char}};
} else {
} else {
return {type => 'character', data => $value};
} else {
## NOTE: No characters are consumed in the spec.
## The text read so far is queued at the front of @{$self->{token}} as
## a character token, and failure is reported to the caller.
unshift @{$self->{token}}, ({type => 'character', data => $value});
return undef;
} else {
## no characters are consumed
return undef;
} # _tokenize_attempt_to_consume_an_entity
sub _initialize_tree_constructor ($) {
  ## Set up the tree construction stage: load What::NanoDOM on demand,
  ## create a new What::NanoDOM::Document, store it in
  ## $self->{document} and call strict_error_checking (0) on it.
  my ($self) = @_;
  require What::NanoDOM;
  my $document = What::NanoDOM::Document->new;
  $self->{document} = $document;
  $document->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
} # _initialize_tree_constructor
sub _terminate_tree_constructor ($) {
  ## Finish the tree construction stage by calling
  ## strict_error_checking (1) on the document stored in
  ## $self->{document}.
  my ($self) = @_;
  my $document = $self->{document};
  $document->strict_error_checking (1);
  ## TODO: Turn mutation events on
} # _terminate_tree_constructor
## ISSUE: Should append_child (for example), when called from a script executed during the tree construction stage, fire mutation events?
sub _construct_tree ($) {
my ($self) = @_;
## The points at which an interactive UA renders $self->{document}
## available to the user, or begins accepting user input, are
## not defined.
## Append a character: collect it and all subsequent consecutive
## characters and insert one Text node whose data is concatenation
## of all those characters. # MUST
my $token;
$token = $self->_get_next_token;
my $phase = 'initial'; # MUST
my $open_elements = [];
my $active_formatting_elements = [];
my $head_element;
my $form_element;
my $insertion_mode = 'before head';
my $reconstruct_active_formatting_elements = sub { # MUST
## Step 1
return unless @$active_formatting_elements;
## Step 3
my $i = -1;
my $entry = $active_formatting_elements->[$i];
## Step 2
return if $entry->[0] eq '#marker';
for (@$open_elements) {
if ($entry->[0] eq $_->[0]) {
## Step 4
S4: {
last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
## Step 5
$entry = $active_formatting_elements->[$i];
## Step 6
if ($entry->[0] eq '#marker') {
} else {
my $in_open_elements;
OE: for (@$open_elements) {
if ($entry->[0] eq $_->[0]) {
$in_open_elements = 1;
last OE;
if ($in_open_elements) {
} else {
redo S4;
## Step 7
$entry = $active_formatting_elements->[$i];
} # S4
S7: {
## Step 8
my $clone = $entry->[0]->clone_node (0);
## Step 9
$open_elements->[-1]->[0]->append_child ($clone);
push @$open_elements, [$clone, $entry->[1]];
## Step 10
$active_formatting_elements->[$i] = $open_elements->[-1];
unless ($i == $#$active_formatting_elements) {
## Step 7'
$entry = $active_formatting_elements->[$i];
redo S7;
} # S7
}; # $reconstruct_active_formatting_elements
my $clear_up_to_marker = sub {
for (reverse 0..$#$active_formatting_elements) {
if ($active_formatting_elements->[$_]->[0] eq '#marker') {
splice @$active_formatting_elements, $_;
}; # $clear_up_to_marker
my $reset_insertion_mode = sub {
## Step 1
my $last;
## Step 2
my $i = -1;
my $node = $open_elements->[$i];
## Step 3
S3: {
$last = 1 if $open_elements->[0]->[0] eq $node->[0];
## TODO: the element whose inner_html is set is neither td nor th, then $node = the element
## Step 4..13
my $new_mode = {
select => 'in select',
td => 'in cell',
th => 'in cell',
tr => 'in row',
tbody => 'in table body',
thead => 'in table head',
tfoot => 'in table foot',
caption => 'in caption',
colgroup => 'in column group',
table => 'in table',
head => 'in body', # not in head!
body => 'in body',
frameset => 'in frameset',
$insertion_mode = $new_mode and return if defined $new_mode;
## Step 14
if ($node->[1] eq 'html') {
unless (defined $head_element) {
$insertion_mode = 'before head';
} else {
$insertion_mode = 'after head';
## Step 15
$insertion_mode = 'in body' and return if $last;
## Step 16
$node = $open_elements->[$i];
## Step 17
redo S3;
} # S3
}; # $reset_insertion_mode
my $style_start_tag = sub {
my $style_el;
$style_el = $self->{document}->create_element_ns
(q, [undef, 'style']);
## $insertion_mode eq 'in head' and ... (always true)
(($insertion_mode eq 'in head' and defined $head_element)
? $head_element : $open_elements->[-1]->[0])
->append_child ($style_el);
$self->{content_model_flag} = 'CDATA';
my $text = '';
$token = $self->_get_next_token;
while ($token->{type} eq 'character') {
$text .= $token->{data};
$token = $self->_get_next_token;
} # stop if non-character token or tokenizer stops tokenising
if (length $text) {
$style_el->manakai_append_text ($text);
$self->{content_model_flag} = 'PCDATA';
if ($token->{type} eq 'end tag' and $token->{tag_name} eq 'style') {
## Ignore the token
} else {
## ISSUE: And ignore?
$token = $self->_get_next_token;
}; # $style_start_tag
my $script_start_tag = sub {
my $script_el;
$script_el = $self->{document}->create_element_ns
(q, [undef, 'script']);
## TODO: mark as "parser-inserted"
$self->{content_model_flag} = 'CDATA';
my $text = '';
$token = $self->_get_next_token;
while ($token->{type} eq 'character') {
$text .= $token->{data};
$token = $self->_get_next_token;
} # stop if non-character token or tokenizer stops tokenising
if (length $text) {
$script_el->manakai_append_text ($text);
$self->{content_model_flag} = 'PCDATA';
if ($token->{type} eq 'end tag' and
$token->{tag_name} eq 'script') {
## Ignore the token
} else {
## ISSUE: And ignore?
## TODO: mark as "already executed"
## TODO: inner_html mode then mark as "already executed" and skip
if (1) {
## TODO: $old_insertion_point = current insertion point
## TODO: insertion point = just before the next input character
(($insertion_mode eq 'in head' and defined $head_element)
? $head_element : $open_elements->[-1]->[0])->append_child ($script_el);
## TODO: insertion point = $old_insertion_point (might be "undefined")
## TODO: if there is a script that will execute as soon as the parser resume, then...
$token = $self->_get_next_token;
}; # $script_start_tag
my $formatting_end_tag = sub {
my $tag_name = shift;
FET: {
## Step 1
my $formatting_element;
my $formatting_element_i_in_active;
AFE: for (reverse 0..$#$active_formatting_elements) {
if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
$formatting_element = $active_formatting_elements->[$_];
$formatting_element_i_in_active = $_;
last AFE;
} elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
last AFE;
} # AFE
unless (defined $formatting_element) {
## Ignore the token
$token = $self->_get_next_token;
## has an element in scope
my $in_scope = 1;
my $formatting_element_i_in_open;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[0] eq $formatting_element->[0]) {
if ($in_scope) {
$formatting_element_i_in_open = $_;
} else { # in open elements but not in scope
## Ignore the token
$token = $self->_get_next_token;
} elsif ({
table => 1, caption => 1, td => 1, th => 1,
button => 1, marquee => 1, object => 1, html => 1,
}->{$node->[1]}) {
$in_scope = 0;
unless (defined $formatting_element_i_in_open) {
pop @$active_formatting_elements; # $formatting_element
$token = $self->_get_next_token; ## TODO: ok?
if (not $open_elements->[-1]->[0] eq $formatting_element->[0]) {
## Step 2
my $furthest_block;
my $furthest_block_i_in_open;
OE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if (not $formatting_category->{$node->[1]} and
#not $phrasing_category->{$node->[1]} and
($special_category->{$node->[1]} or
$scoping_category->{$node->[1]})) {
$furthest_block = $node;
$furthest_block_i_in_open = $_;
} elsif ($node->[0] eq $formatting_element->[0]) {
last OE;
} # OE
## Step 3
unless (defined $furthest_block) { # MUST
splice @$open_elements, $formatting_element_i_in_open;
splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
$token = $self->_get_next_token;
## Step 4
my $common_ancestor_node = $open_elements->[$formatting_element_i_in_open - 1];
## Step 5
my $furthest_block_parent = $furthest_block->[0]->parent_node;
if (defined $furthest_block_parent) {
$furthest_block_parent->remove_child ($furthest_block->[0]);
## Step 6
my $bookmark_prev_el
= $active_formatting_elements->[$formatting_element_i_in_active - 1]
## Step 7
my $node = $furthest_block;
my $node_i_in_open = $furthest_block_i_in_open;
my $last_node = $furthest_block;
S7: {
## Step 1
$node = $open_elements->[$node_i_in_open];
## Step 2
my $node_i_in_active;
S7S2: {
for (reverse 0..$#$active_formatting_elements) {
if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
$node_i_in_active = $_;
last S7S2;
splice @$open_elements, $node_i_in_open, 1;
redo S7;
} # S7S2
## Step 3
last S7 if $node->[0] eq $formatting_element->[0];
## Step 4
if ($last_node->[0] eq $furthest_block->[0]) {
$bookmark_prev_el = $node->[0];
## Step 5
if ($node->[0]->has_child_nodes ()) {
my $clone = [$node->[0]->clone_node (0), $node->[1]];
$active_formatting_elements->[$node_i_in_active] = $clone;
$open_elements->[$node_i_in_open] = $clone;
$node = $clone;
## Step 6
$node->[0]->append_child ($last_node->[0]);
## Step 7
$last_node = $node;
## Step 8
redo S7;
} # S7
## Step 8
$common_ancestor_node->[0]->append_child ($last_node->[0]);
## Step 9
my $clone = [$formatting_element->[0]->clone_node (0),
## Step 10
my @cn = @{$furthest_block->[0]->child_nodes};
$clone->[0]->append_child ($_) for @cn;
## Step 11
$furthest_block->[0]->append_child ($clone->[0]);
## Step 12
my $i;
AFE: for (reverse 0..$#$active_formatting_elements) {
if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
splice @$active_formatting_elements, $_, 1;
$i-- and last AFE if defined $i;
} elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
$i = $_;
} # AFE
splice @$active_formatting_elements, $i + 1, 0, $clone;
## Step 13
undef $i;
OE: for (reverse 0..$#$open_elements) {
if ($open_elements->[$_]->[0] eq $formatting_element->[0]) {
splice @$open_elements, $_, 1;
$i-- and last OE if defined $i;
} elsif ($open_elements->[$_]->[0] eq $furthest_block->[0]) {
$i = $_;
} # OE
splice @$open_elements, $i + 1, 1, $clone;
## Step 14
redo FET;
} # FET
}; # $formatting_end_tag
my $in_body = sub {
my $insert = shift;
if ($token->{type} eq 'start tag') {
if ($token->{tag_name} eq 'script') {
} elsif ($token->{tag_name} eq 'style') {
} elsif ({
base => 1, link => 1, meta => 1, title => 1,
}->{$token->{tag_name}}) {
## NOTE: This is an "as if in head" code clone
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
if (defined $head_element) {
$head_element->append_child ($el);
} else {
## ISSUE: Issue on magical in the spec
$token = $self->_get_next_token;
} elsif ($token->{tag_name} eq 'body') {
if (@$open_elements == 1 or
$open_elements->[1]->[1] ne 'body') {
## Ignore the token
} else {
my $body_el = $open_elements->[1]->[0];
for my $attr_name (keys %{$token->{attributes}}) {
unless ($body_el->has_attribute_ns (undef, $attr_name)) {
(undef, [undef, $attr_name],
$token = $self->_get_next_token;
} elsif ({
address => 1, blockquote => 1, center => 1, dir => 1,
div => 1, dl => 1, fieldset => 1, listing => 1,
menu => 1, ol => 1, p => 1, ul => 1,
pre => 1,
}->{$token->{tag_name}}) {
## has a p element in scope
INSCOPE: for (reverse @$open_elements) {
if ($_->[1] eq 'p') {
unshift @{$self->{token}}, $token;
$token = {type => 'end tag', tag_name => 'p'};
} elsif ({
table => 1, caption => 1, td => 1, th => 1,
button => 1, marquee => 1, object => 1, html => 1,
}->{$_->[1]}) {
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
push @$open_elements, [$el, $token->{tag_name}];
if ($token->{tag_name} eq 'pre') {
$token = $self->_get_next_token;
if ($token->{type} eq 'character') {
$token->{data} =~ s/^\x0A//;
unless (length $token->{data}) {
$token = $self->_get_next_token;
} else {
$token = $self->_get_next_token;
} elsif ($token->{tag_name} eq 'form') {
if (defined $form_element) {
## Ignore the token
} else {
## has a p element in scope
INSCOPE: for (reverse @$open_elements) {
if ($_->[1] eq 'p') {
unshift @{$self->{token}}, $token;
$token = {type => 'end tag', tag_name => 'p'};
} elsif ({
table => 1, caption => 1, td => 1, th => 1,
button => 1, marquee => 1, object => 1, html => 1,
}->{$_->[1]}) {
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
push @$open_elements, [$el, $token->{tag_name}];
$form_element = $open_elements->[-1]->[0];
$token = $self->_get_next_token;
} elsif ($token->{tag_name} eq 'li') {
## has a p element in scope
INSCOPE: for (reverse @$open_elements) {
if ($_->[1] eq 'p') {
unshift @{$self->{token}}, $token;
$token = {type => 'end tag', tag_name => 'p'};
} elsif ({
table => 1, caption => 1, td => 1, th => 1,
button => 1, marquee => 1, object => 1, html => 1,
}->{$_->[1]}) {
## Step 1
my $i = -1;
my $node = $open_elements->[$i];
LI: {
## Step 2
if ($node->[1] eq 'li') {
splice @$open_elements, $i;
last LI;
## Step 3
if (not $formatting_category->{$node->[1]} and
#not $phrasing_category->{$node->[1]} and
($special_category->{$node->[1]} or
$scoping_category->{$node->[1]}) and
$node->[1] ne 'address' and $node->[1] ne 'div') {
last LI;
## Step 4
$node = $open_elements->[$i];
redo LI;
} # LI
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
push @$open_elements, [$el, $token->{tag_name}];
$token = $self->_get_next_token;
} elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
## has a p element in scope
INSCOPE: for (reverse @$open_elements) {
if ($_->[1] eq 'p') {
unshift @{$self->{token}}, $token;
$token = {type => 'end tag', tag_name => 'p'};
} elsif ({
table => 1, caption => 1, td => 1, th => 1,
button => 1, marquee => 1, object => 1, html => 1,
}->{$_->[1]}) {
## Step 1
my $i = -1;
my $node = $open_elements->[$i];
LI: {
## Step 2
if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
splice @$open_elements, $i;
last LI;
## Step 3
if (not $formatting_category->{$node->[1]} and
#not $phrasing_category->{$node->[1]} and
($special_category->{$node->[1]} or
$scoping_category->{$node->[1]}) and
$node->[1] ne 'address' and $node->[1] ne 'div') {
last LI;
## Step 4
$node = $open_elements->[$i];
redo LI;
} # LI
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
push @$open_elements, [$el, $token->{tag_name}];
$token = $self->_get_next_token;
} elsif ($token->{tag_name} eq 'plaintext') {
## has a p element in scope
INSCOPE: for (reverse @$open_elements) {
if ($_->[1] eq 'p') {
unshift @{$self->{token}}, $token;
$token = {type => 'end tag', tag_name => 'p'};
} elsif ({
table => 1, caption => 1, td => 1, th => 1,
button => 1, marquee => 1, object => 1, html => 1,
}->{$_->[1]}) {
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
push @$open_elements, [$el, $token->{tag_name}];
$self->{content_model_flag} = 'PLAINTEXT';
$token = $self->_get_next_token;
} elsif ({
h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
}->{$token->{tag_name}}) {
## has a p element in scope
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq 'p') {
unshift @{$self->{token}}, $token;
$token = {type => 'end tag', tag_name => 'p'};
} elsif ({
table => 1, caption => 1, td => 1, th => 1,
button => 1, marquee => 1, object => 1, html => 1,
}->{$node->[1]}) {
## has an element in scope
my $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ({
h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
}->{$node->[1]}) {
$i = $_;
} elsif ({
table => 1, caption => 1, td => 1, th => 1,
button => 1, marquee => 1, object => 1, html => 1,
}->{$node->[1]}) {
if (defined $i) {
splice @$open_elements, $i;
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
push @$open_elements, [$el, $token->{tag_name}];
$token = $self->_get_next_token;
} elsif ($token->{tag_name} eq 'a') {
AFE: for my $i (reverse 0..$#$active_formatting_elements) {
my $node = $active_formatting_elements->[$i];
if ($node->[1] eq 'a') {
unshift @{$self->{token}}, $token;
$token = {type => 'end tag', tag_name => 'a'};
splice @$active_formatting_elements, $i;
OE: for (reverse 0..$#$open_elements) {
if ($open_elements->[$_]->[0] eq $node->[0]) {
splice @$open_elements, $_;
last OE;
} # OE
last AFE;
} elsif ($node->[0] eq '#marker') {
last AFE;
} # AFE
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
push @$open_elements, [$el, $token->{tag_name}];
push @$active_formatting_elements, $open_elements->[-1];
$token = $self->_get_next_token;
} elsif ({
b => 1, big => 1, em => 1, font => 1, i => 1,
## NOTE(review): "strile" is not an HTML element name; it looks like a
## typo for "strike".  As written, a <strike> start tag would not match
## this table -- confirm before changing.
nobr => 1, s => 1, small => 1, strile => 1,
strong => 1, tt => 1, u => 1,
}->{$token->{tag_name}}) {
## Start tag of a formatting element: build the element, copy the
## token's attributes onto it, push it onto the stack of open elements,
## and record that same stack entry in the list of active formatting
## elements.
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
push @$open_elements, [$el, $token->{tag_name}];
## The very same [element, name] pair is shared by both lists.
push @$active_formatting_elements, $open_elements->[-1];
$token = $self->_get_next_token;
} elsif ($token->{tag_name} eq 'button') {
## has a button element in scope
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq 'button') {
unshift @{$self->{token}}, $token;
$token = {type => 'end tag', tag_name => 'button'};
} elsif ({
table => 1, caption => 1, td => 1, th => 1,
button => 1, marquee => 1, object => 1, html => 1,
}->{$node->[1]}) {
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
push @$open_elements, [$el, $token->{tag_name}];
push @$active_formatting_elements, ['#marker', ''];
$token = $self->_get_next_token;
} elsif ($token->{tag_name} eq 'marquee' or
$token->{tag_name} eq 'object') {
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
push @$open_elements, [$el, $token->{tag_name}];
push @$active_formatting_elements, ['#marker', ''];
$token = $self->_get_next_token;
} elsif ($token->{tag_name} eq 'xmp') {
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
push @$open_elements, [$el, $token->{tag_name}];
$self->{content_model_flag} = 'CDATA';
$token = $self->_get_next_token;
} elsif ($token->{tag_name} eq 'table') {
## has a p element in scope
INSCOPE: for (reverse @$open_elements) {
if ($_->[1] eq 'p') {
unshift @{$self->{token}}, $token;
$token = {type => 'end tag', tag_name => 'p'};
} elsif ({
table => 1, caption => 1, td => 1, th => 1,
button => 1, marquee => 1, object => 1, html => 1,
}->{$_->[1]}) {
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
push @$open_elements, [$el, $token->{tag_name}];
$insertion_mode = 'in table';
$token = $self->_get_next_token;
} elsif ({
area => 1, basefont => 1, bgsound => 1, br => 1,
embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
image => 1,
}->{$token->{tag_name}}) {
## Start tag of an element that is pushed onto the stack of open
## elements and popped again right away (see the push/pop pair below),
## so it never remains open.
if ($token->{tag_name} eq 'image') {
## Rewrite the token in place so that an "img" element is created.
$token->{tag_name} = 'img';
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
## Copy every attribute carried by the token onto the new element.
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
push @$open_elements, [$el, $token->{tag_name}];
pop @$open_elements;
$token = $self->_get_next_token;
} elsif ($token->{tag_name} eq 'hr') {
## has a p element in scope
INSCOPE: for (reverse @$open_elements) {
if ($_->[1] eq 'p') {
unshift @{$self->{token}}, $token;
$token = {type => 'end tag', tag_name => 'p'};
} elsif ({
table => 1, caption => 1, td => 1, th => 1,
button => 1, marquee => 1, object => 1, html => 1,
}->{$_->[1]}) {
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
push @$open_elements, [$el, $token->{tag_name}];
pop @$open_elements;
$token = $self->_get_next_token;
} elsif ($token->{tag_name} eq 'input') {
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
push @$open_elements, [$el, $token->{tag_name}];
## TODO: associate with $form_element if defined
pop @$open_elements;
$token = $self->_get_next_token;
} elsif ($token->{tag_name} eq 'isindex') {
## An isindex start tag is not turned into an element of its own.  It
## is either dropped (when a form element is already recorded) or
## replaced by a synthetic sequence of form/hr/p/label/input tokens.
if (defined $form_element) {
## Ignore the token
$token = $self->_get_next_token;
} else {
## $at is the token's own attribute hash (not a copy), so the "name"
## entry is overwritten in place before the hash is handed to the
## synthetic input start tag below.
my $at = $token->{attributes};
$at->{name} = {name => 'name', value => 'isindex'};
my @tokens = (
{type => 'start tag', tag_name => 'form'},
{type => 'start tag', tag_name => 'hr'},
{type => 'start tag', tag_name => 'p'},
{type => 'start tag', tag_name => 'label'},
{type => 'character',
data => 'This is a searchable index. Insert your search keywords here: '}, # SHOULD
## TODO: make this configurable
{type => 'start tag', tag_name => 'input', attributes => $at},
#{type => 'character', data => ''}, # SHOULD
{type => 'end tag', tag_name => 'label'},
{type => 'end tag', tag_name => 'p'},
{type => 'start tag', tag_name => 'hr'},
{type => 'end tag', tag_name => 'form'},
## The first synthetic token becomes the current token; the remaining
## ones are queued at the front of the pending-token list so that they
## are seen before any further input.
$token = shift @tokens;
unshift @{$self->{token}}, (@tokens);
} elsif ({
textarea => 1,
noembed => 1,
noframes => 1,
noscript => 0, ## TODO: 1 if scripting is enabled
}->{$token->{tag_name}}) {
my $tag_name = $token->{tag_name};
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
if ($token->{tag_name} eq 'textarea') {
## TODO: form_element if defined
$self->{content_model_flag} = 'RCDATA';
} else {
$self->{content_model_flag} = 'CDATA';
my $text = '';
$token = $self->_get_next_token;
while ($token->{type} eq 'character') {
$text .= $token->{data};
$token = $self->_get_next_token;
if (length $text) {
$el->manakai_append_text ($text);
$self->{content_model_flag} = 'PCDATA';
if ($token->{type} eq 'end tag' and
$token->{tag_name} eq $tag_name) {
## Ignore the token
} else {
## ISSUE: And ignore?
$token = $self->_get_next_token;
## NOTE(review): every sibling branch of this chain tests
## $token->{tag_name}; this one tests $token->{type} against 'select'.
## No token type named 'select' appears among the types checked in this
## file ('start tag', 'end tag', 'character', 'comment', 'DOCTYPE',
## 'end-of-file'), so this branch looks unreachable and a <select> start
## tag would fall through to the generic branch instead.  Presumably
## {tag_name} was intended -- confirm before changing.
} elsif ($token->{type} eq 'select') {
## Create the element, copy the token's attributes, push it onto the
## stack of open elements and switch to the 'in select' insertion mode.
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
push @$open_elements, [$el, $token->{tag_name}];
$insertion_mode = 'in select';
$token = $self->_get_next_token;
} elsif ({
caption => 1, col => 1, colgroup => 1, frame => 1,
frameset => 1, head => 1, option => 1, optgroup => 1,
tbody => 1, td => 1, tfoot => 1, th => 1,
thead => 1, tr => 1,
}->{$token->{tag_name}}) {
## Ignore the token
$token = $self->_get_next_token;
## ISSUE: An issue on HTML5 new elements in the spec.
} else {
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
push @$open_elements, [$el, $token->{tag_name}];
$token = $self->_get_next_token;
} elsif ($token->{type} eq 'end tag') {
if ($token->{tag_name} eq 'body') {
if (@$open_elements > 1 and $open_elements->[1]->[1] eq 'body') {
## ISSUE: There is an issue in the spec.
if ($open_elements->[-1]->[1] ne 'body') {
$insertion_mode = 'after body';
$token = $self->_get_next_token;
} else {
## Ignore the token
$token = $self->_get_next_token;
} elsif ($token->{tag_name} eq 'html') {
if (@$open_elements > 1 and $open_elements->[1]->[1] eq 'body') {
## ISSUE: There is an issue in the spec.
if ($open_elements->[-1]->[1] ne 'body') {
$insertion_mode = 'after body';
## reprocess
} else {
## Ignore the token
$token = $self->_get_next_token;
} elsif ({
address => 1, blockquote => 1, center => 1, dir => 1,
div => 1, dl => 1, fieldset => 1, listing => 1,
menu => 1, ol => 1, pre => 1, ul => 1,
form => 1,
p => 1,
dd => 1, dt => 1, li => 1,
button => 1, marquee => 1, object => 1,
}->{$token->{tag_name}}) {
## has an element in scope
my $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq $token->{tag_name}) {
## generate implied end tags
if ({
dd => ($token->{tag_name} ne 'dd'),
dt => ($token->{tag_name} ne 'dt'),
li => ($token->{tag_name} ne 'li'),
p => ($token->{tag_name} ne 'p'),
td => 1, th => 1, tr => 1,
}->{$open_elements->[-1]->[1]}) {
unshift @{$self->{token}}, $token;
$token = {type => 'end tag',
tag_name => $open_elements->[-1]->[1]}; # MUST
$i = $_;
last INSCOPE unless $token->{tag_name} eq 'p';
} elsif ({
table => 1, caption => 1, td => 1, th => 1,
button => 1, marquee => 1, object => 1, html => 1,
}->{$node->[1]}) {
if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
splice @$open_elements, $i if defined $i;
undef $form_element if $token->{tag_name} eq 'form';
if {
button => 1, marquee => 1, object => 1,
$token = $self->_get_next_token;
} elsif ({
h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
}->{$token->{tag_name}}) {
## has an element in scope
my $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ({
h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
}->{$node->[1]}) {
## generate implied end tags
if ({
dd => 1, dt => 1, li => 1, p => 1,
td => 1, th => 1, tr => 1,
}->{$open_elements->[-1]->[1]}) {
unshift @{$self->{token}}, $token;
$token = {type => 'end tag',
tag_name => $open_elements->[-1]->[1]}; # MUST
$i = $_;
} elsif ({
table => 1, caption => 1, td => 1, th => 1,
button => 1, marquee => 1, object => 1, html => 1,
}->{$node->[1]}) {
if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
splice @$open_elements, $i if defined $i;
$token = $self->_get_next_token;
} elsif ({
a => 1,
b => 1, big => 1, em => 1, font => 1, i => 1,
## NOTE(review): "strile" is not an HTML element name; it looks like a
## typo for "strike" -- confirm before changing.
nobr => 1, s => 1, small => 1, strile => 1,
strong => 1, tt => 1, u => 1,
}->{$token->{tag_name}}) {
## NOTE(review): no statements are visible in this branch: the end tag
## of a formatting element is neither processed nor is the next token
## fetched here.  Verify whether the handling is unimplemented or was
## lost from this copy of the file.
} elsif ({
caption => 1, col => 1, colgroup => 1, frame => 1,
frameset => 1, head => 1, option => 1, optgroup => 1,
tbody => 1, td => 1, tfoot => 1, th => 1,
thead => 1, tr => 1,
area => 1, basefont => 1, bgsound => 1, br => 1,
embed => 1, hr => 1, iframe => 1, image => 1,
img => 1, input => 1, isindex=> 1, noembed => 1,
noframes => 1, param => 1, select => 1, spacer => 1,
table => 1, textarea => 1, wbr => 1,
noscript => 0, ## TODO: if scripting is enabled
}->{$token->{tag_name}}) {
## Ignore the token
$token = $self->_get_next_token;
## ISSUE: Issue on HTML5 new elements in spec
} else {
## Step 1
my $node_i = -1;
my $node = $open_elements->[$node_i];
## Step 2
S2: {
if ($node->[1] eq $token->{tag_name}) {
## Step 1
## generate implied end tags
if ({
dd => 1, dt => 1, li => 1, p => 1,
td => 1, th => 1, tr => 1,
}->{$open_elements->[-1]->[1]}) {
unshift @{$self->{token}}, $token;
$token = {type => 'end tag',
tag_name => $open_elements->[-1]->[1]}; # MUST
## Step 2
if ($token->{tag_name} ne $open_elements->[-1]->[1]) {
## Step 3
splice @$open_elements, $node_i;
last S2;
} else {
## Step 3
if (not $formatting_category->{$node->[1]} and
#not $phrasing_category->{$node->[1]} and
($special_category->{$node->[1]} or
$scoping_category->{$node->[1]})) {
## Ignore the token
$token = $self->_get_next_token;
last S2;
## Step 4
$node = $open_elements->[$node_i];
## Step 5;
redo S2;
} # S2
}; # $in_body
B: {
if ($phase eq 'initial') {
if ($token->{type} eq 'DOCTYPE') {
if ($token->{error}) {
## ISSUE: Spec currently left this case undefined.
$self->{parse_error}-> ('missing DOCTYPE');
my $doctype = $self->{document}->create_document_type_definition
$self->{document}->append_child ($doctype);
$phase = 'root element';
$token = $self->_get_next_token;
redo B;
} elsif ({
comment => 1,
'start tag' => 1,
'end tag' => 1,
'end-of-file' => 1,
}->{$token->{type}}) {
## ISSUE: Spec currently left this case undefined.
$self->{parse_error}-> ('missing DOCTYPE');
$phase = 'root element';
## reprocess
redo B;
} elsif ($token->{type} eq 'character') {
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
$self->{document}->manakai_append_text ($1);
## ISSUE: DOM3 Core does not allow Document > Text
unless (length $token->{data}) {
## Stay in the phase
$token = $self->_get_next_token;
redo B;
## ISSUE: Spec currently left this case undefined.
$self->{parse_error}-> ('missing DOCTYPE');
$phase = 'root element';
## reprocess
redo B;
} else {
die "$0: $token->{type}: Unknown token";
} elsif ($phase eq 'root element') {
if ($token->{type} eq 'DOCTYPE') {
## Ignore the token
## Stay in the phase
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'comment') {
my $comment = $self->{document}->create_comment ($token->{data});
$self->{document}->append_child ($comment);
## Stay in the phase
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'character') {
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
$self->{document}->manakai_append_text ($1);
## ISSUE: DOM3 Core does not allow Document > Text
unless (length $token->{data}) {
## Stay in the phase
$token = $self->_get_next_token;
redo B;
} elsif ({
'start tag' => 1,
'end tag' => 1,
'end-of-file' => 1,
}->{$token->{type}}) {
## ISSUE: There is an issue in the spec
} else {
die "$0: $token->{type}: Unknown token";
my $root_element;
$root_element = $self->{document}->create_element_ns
(q, [undef, 'html']);
$self->{document}->append_child ($root_element);
$open_elements = [[$root_element, 'html']];
$phase = 'main';
## reprocess
redo B;
} elsif ($phase eq 'main') {
if ($token->{type} eq 'DOCTYPE') {
## Ignore the token
## Stay in the phase
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'start tag' and
$token->{tag_name} eq 'html') {
## TODO: unless it is the first start tag token, parse-error
my $top_el = $open_elements->[0]->[0];
for my $attr_name (keys %{$token->{attributes}}) {
unless ($top_el->has_attribute_ns (undef, $attr_name)) {
$top_el->set_attribute_ns (undef, [undef, $attr_name],
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'end-of-file') {
## Generate implied end tags
if ({
dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
}->{$open_elements->[-1]->[1]}) {
unshift @{$self->{token}}, $token;
$token = {type => 'end tag', tag_name => $open_elements->[-1]->[1]};
redo B;
if (@$open_elements > 2 or
(@$open_elements == 2 and $open_elements->[1]->[1] ne 'body')) {
} else {
## TODO: inner_html parser and @$open_elements > 1 and $open_elements->[1] ne 'body', then parse-error
## Stop parsing
last B;
## ISSUE: There is an issue in the spec.
} else {
if ($insertion_mode eq 'before head') {
if ($token->{type} eq 'character') {
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
$open_elements->[-1]->[0]->manakai_append_text ($1);
unless (length $token->{data}) {
$token = $self->_get_next_token;
redo B;
## As if
$head_element = $self->{document}->create_element_ns
(q, [undef, 'head']);
$open_elements->[-1]->[0]->append_child ($head_element);
push @$open_elements, [$head_element, 'head'];
$insertion_mode = 'in head';
## reprocess
redo B;
} elsif ($token->{type} eq 'comment') {
my $comment = $self->{document}->create_comment ($token->{data});
$open_elements->[-1]->[0]->append_child ($comment);
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'start tag') {
my $attr = $token->{tag_name} eq 'head' ? $token->{attributes} : {};
$head_element = $self->{document}->create_element_ns
(q, [undef, 'head']);
for my $attr_name (keys %{ $attr}) {
$head_element->set_attribute_ns (undef, [undef, $attr_name],
$attr ->{$attr_name}->{value});
$open_elements->[-1]->[0]->append_child ($head_element);
push @$open_elements, [$head_element, 'head'];
$insertion_mode = 'in head';
if ($token->{tag_name} eq 'head') {
$token = $self->_get_next_token;
#} elsif ({
# base => 1, link => 1, meta => 1,
# script => 1, style => 1, title => 1,
# }->{$token->{tag_name}}) {
# ## reprocess
} else {
## reprocess
redo B;
} elsif ($token->{type} eq 'end tag') {
if ($token->{tag_name} eq 'html') {
## As if
$head_element = $self->{document}->create_element_ns
(q, [undef, 'head']);
$open_elements->[-1]->[0]->append_child ($head_element);
push @$open_elements, [$head_element, 'head'];
$insertion_mode = 'in head';
## reprocess
redo B;
} else {
## Ignore the token
$token = $self->_get_next_token;
redo B;
} else {
die "$0: $token->{type}: Unknown type";
} elsif ($insertion_mode eq 'in head') {
if ($token->{type} eq 'character') {
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
$open_elements->[-1]->[0]->manakai_append_text ($1);
unless (length $token->{data}) {
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'comment') {
my $comment = $self->{document}->create_comment ($token->{data});
$open_elements->[-1]->[0]->append_child ($comment);
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'start tag') {
if ($token->{tag_name} eq 'title') {
my $title_el;
$title_el = $self->{document}->create_element_ns
(q, [undef, 'title']);
(defined $head_element ? $head_element : $open_elements->[-1]->[0])
->append_child ($title_el);
$self->{content_model_flag} = 'RCDATA';
my $text = '';
$token = $self->_get_next_token;
while ($token->{type} eq 'character') {
$text .= $token->{data};
$token = $self->_get_next_token;
if (length $text) {
$title_el->manakai_append_text ($text);
$self->{content_model_flag} = 'PCDATA';
if ($token->{type} eq 'end tag' and
$token->{tag_name} eq 'title') {
## Ignore the token
} else {
## ISSUE: And ignore?
$token = $self->_get_next_token;
redo B;
} elsif ($token->{tag_name} eq 'style') {
redo B;
} elsif ($token->{tag_name} eq 'script') {
redo B;
} elsif ({base => 1, link => 1, meta => 1}->{$token->{tag_name}}) {
## NOTE: There are "as if in head" code clones
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
(defined $head_element ? $head_element : $open_elements->[-1]->[0])
->append_child ($el);
## ISSUE: Issue on magical in the spec
$token = $self->_get_next_token;
redo B;
} elsif ($token->{tag_name} eq 'head') {
## Ignore the token
$token = $self->_get_next_token;
redo B;
} else {
} elsif ($token->{type} eq 'end tag') {
if ($token->{tag_name} eq 'head') {
if ($open_elements->[-1]->[1] eq 'head') {
pop @$open_elements;
} else {
$insertion_mode = 'after head';
$token = $self->_get_next_token;
redo B;
} elsif ($token->{tag_name} eq 'html') {
} else {
## Ignore the token
$token = $self->_get_next_token;
redo B;
} else {
if ($open_elements->[-1]->[1] eq 'head') {
## As if
pop @$open_elements;
$insertion_mode = 'after head';
## reprocess
redo B;
## ISSUE: An issue in the spec.
} elsif ($insertion_mode eq 'after head') {
if ($token->{type} eq 'character') {
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
$open_elements->[-1]->[0]->manakai_append_text ($1);
unless (length $token->{data}) {
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'comment') {
my $comment = $self->{document}->create_comment ($token->{data});
$open_elements->[-1]->[0]->append_child ($comment);
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'start tag') {
if ($token->{tag_name} eq 'body') {
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, 'body']);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
$open_elements->[-1]->[0]->append_child ($el);
push @$open_elements, [$el, 'body'];
$insertion_mode = 'in body';
$token = $self->_get_next_token;
redo B;
} elsif ($token->{tag_name} eq 'frameset') {
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, 'frameset']);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
$open_elements->[-1]->[0]->append_child ($el);
push @$open_elements, [$el, 'frameset'];
$insertion_mode = 'in frameset';
$token = $self->_get_next_token;
redo B;
} elsif ({
base => 1, link => 1, meta => 1,
script=> 1, style => 1, title => 1,
}->{$token->{tag_name}}) {
$insertion_mode = 'in head';
## reprocess
redo B;
} else {
} else {
## As if
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, 'body']);
$open_elements->[-1]->[0]->append_child ($el);
push @$open_elements, [$el, 'body'];
$insertion_mode = 'in body';
## reprocess
redo B;
} elsif ($insertion_mode eq 'in body') {
if ($token->{type} eq 'character') {
## NOTE: There is a code clone of "character in body".
$open_elements->[-1]->[0]->manakai_append_text ($token->{data});
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'comment') {
## NOTE: There is a code clone of "comment in body".
my $comment = $self->{document}->create_comment ($token->{data});
$open_elements->[-1]->[0]->append_child ($comment);
$token = $self->_get_next_token;
redo B;
} else {
$in_body->(sub {
$open_elements->[-1]->[0]->append_child (shift);
redo B;
} elsif ($insertion_mode eq 'in table') {
if ($token->{type} eq 'character') {
$open_elements->[-1]->[0]->manakai_append_text ($token->{data});
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'comment') {
my $comment = $self->{document}->create_comment ($token->{data});
$open_elements->[-1]->[0]->append_child ($comment);
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'start tag') {
if ({
caption => 1,
colgroup => 1,
tbody => 1, tfoot => 1, thead => 1,
}->{$token->{tag_name}}) {
## Clear back to table context
while ($open_elements->[-1]->[1] ne 'table' and
$open_elements->[-1]->[1] ne 'html') {
pop @$open_elements;
push @$active_formatting_elements, ['#marker', '']
if $token->{tag_name} eq 'caption';
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
$open_elements->[-1]->[0]->append_child ($el);
push @$open_elements, [$el, $token->{tag_name}];
$insertion_mode = {
caption => 'in caption',
colgroup => 'in column group',
tbody => 'in table body',
tfoot => 'in table body',
thead => 'in table body',
$token = $self->_get_next_token;
redo B;
} elsif ({
col => 1,
td => 1, th => 1, tr => 1,
}->{$token->{tag_name}}) {
## Clear back to table context
while ($open_elements->[-1]->[1] ne 'table' and
$open_elements->[-1]->[1] ne 'html') {
pop @$open_elements;
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name} eq 'col' ? 'colgroup' : 'tbody']);
$open_elements->[-1]->[0]->append_child ($el);
push @$open_elements, [$el, $token->{tag_name} eq 'col' ? 'colgroup' : 'tbody'];
$insertion_mode = $token->{tag_name} eq 'col'
? 'in column group' : 'in table body';
## reprocess
redo B;
} elsif ($token->{tag_name} eq 'table') {
## NOTE: There are code clones for this "table in table"
## As if
## have a table element in table scope
my $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq 'table') {
$i = $_;
} elsif ({
table => 1, html => 1,
}->{$node->[1]}) {
unless (defined $i) {
## Ignore tokens
$token = $self->_get_next_token;
redo B;
## generate implied end tags
if ({
dd => 1, dt => 1, li => 1, p => 1,
td => 1, th => 1, tr => 1,
}->{$open_elements->[-1]->[1]}) {
unshift @{$self->{token}}, $token; #
$token = {type => 'end tag', tag_name => 'table'};
unshift @{$self->{token}}, $token;
$token = {type => 'end tag',
tag_name => $open_elements->[-1]->[1]}; # MUST
redo B;
if ($open_elements->[-1]->[1] ne 'table') {
splice @$open_elements, $i;
## reprocess
redo B;
} else {
} elsif ($token->{type} eq 'end tag') {
if ($token->{tag_name} eq 'table') {
## have a table element in table scope
my $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq $token->{tag_name}) {
$i = $_;
} elsif ({
table => 1, html => 1,
}->{$node->[1]}) {
unless (defined $i) {
## Ignore the token
$token = $self->_get_next_token;
redo B;
## generate implied end tags
if ({
dd => 1, dt => 1, li => 1, p => 1,
td => 1, th => 1, tr => 1,
}->{$open_elements->[-1]->[1]}) {
unshift @{$self->{token}}, $token;
$token = {type => 'end tag',
tag_name => $open_elements->[-1]->[1]}; # MUST
redo B;
if ($open_elements->[-1]->[1] ne 'table') {
splice @$open_elements, $i;
$token = $self->_get_next_token;
redo B;
} elsif ({
body => 1, caption => 1, col => 1, colgroup => 1,
html => 1, tbody => 1, td => 1, tfoot => 1, th => 1,
thead => 1, tr => 1,
}->{$token->{tag_name}}) {
## Ignore the token
$token = $self->_get_next_token;
redo B;
} else {
} else {
## NOTE: There are code clones of "misc in table".
$in_body->(sub {
my $child = shift;
if ({
table => 1, tbody => 1, tfoot => 1,
thead => 1, tr => 1,
}->{$open_elements->[-1]->[1]}) {
my $foster_parent_element;
my $next_sibling;
OE: for (reverse 0..$#$open_elements) {
if ($open_elements->[$_]->[1] eq 'table') {
my $parent = $open_elements->[$_]->[0]->parent_node;
if (defined $parent and $parent->node_type == 1) {
$foster_parent_element = $parent;
$next_sibling = $open_elements->[$_]->[0];
} else {
= $open_elements->[$_ - 1]->[0];
last OE;
} # OE
$foster_parent_element = $open_elements->[0]->[0]
unless defined $foster_parent_element;
($child, $next_sibling);
} else {
$open_elements->[-1]->[0]->append_child ($child);
redo B;
} elsif ($insertion_mode eq 'in caption') {
if ($token->{type} eq 'character') {
## NOTE: This is a code clone of "character in body".
$open_elements->[-1]->[0]->manakai_append_text ($token->{data});
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'comment') {
## NOTE: This is a code clone of "comment in body".
my $comment = $self->{document}->create_comment ($token->{data});
$open_elements->[-1]->[0]->append_child ($comment);
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'start tag') {
if ({
caption => 1, col => 1, colgroup => 1, tbody => 1,
td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
}->{$token->{tag_name}}) {
## As if
## have a table element in table scope
my $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq 'caption') {
$i = $_;
} elsif ({
table => 1, html => 1,
}->{$node->[1]}) {
unless (defined $i) {
## Ignore the token
$token = $self->_get_next_token;
redo B;
## generate implied end tags
if ({
dd => 1, dt => 1, li => 1, p => 1,
td => 1, th => 1, tr => 1,
}->{$open_elements->[-1]->[1]}) {
unshift @{$self->{token}}, $token; # >
$token = {type => 'end tag', tag_name => 'caption'};
unshift @{$self->{token}}, $token;
$token = {type => 'end tag',
tag_name => $open_elements->[-1]->[1]}; # MUST
redo B;
if ($open_elements->[-1]->[1] ne 'caption') {
splice @$open_elements, $i;
$insertion_mode = 'in table';
## reprocess
redo B;
} else {
} elsif ($token->{type} eq 'end tag') {
if ($token->{tag_name} eq 'caption') {
## have a table element in table scope
my $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq $token->{tag_name}) {
$i = $_;
} elsif ({
table => 1, html => 1,
}->{$node->[1]}) {
unless (defined $i) {
## Ignore the token
$token = $self->_get_next_token;
redo B;
## generate implied end tags
if ({
dd => 1, dt => 1, li => 1, p => 1,
td => 1, th => 1, tr => 1,
}->{$open_elements->[-1]->[1]}) {
unshift @{$self->{token}}, $token;
$token = {type => 'end tag',
tag_name => $open_elements->[-1]->[1]}; # MUST
redo B;
if ($open_elements->[-1]->[1] ne 'caption') {
splice @$open_elements, $i;
$insertion_mode = 'in table';
$token = $self->_get_next_token;
redo B;
} elsif ($token->{tag_name} eq 'table') {
## As if
## have a table element in table scope
my $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq 'caption') {
$i = $_;
} elsif ({
table => 1, html => 1,
}->{$node->[1]}) {
unless (defined $i) {
## Ignore the token
$token = $self->_get_next_token;
redo B;
## generate implied end tags
if ({
dd => 1, dt => 1, li => 1, p => 1,
td => 1, th => 1, tr => 1,
}->{$open_elements->[-1]->[1]}) {
unshift @{$self->{token}}, $token; #
$token = {type => 'end tag', tag_name => 'caption'};
unshift @{$self->{token}}, $token;
$token = {type => 'end tag',
tag_name => $open_elements->[-1]->[1]}; # MUST
redo B;
if ($open_elements->[-1]->[1] ne 'caption') {
splice @$open_elements, $i;
$insertion_mode = 'in table';
## reprocess
redo B;
} elsif ({
body => 1, col => 1, colgroup => 1,
html => 1, tbody => 1, td => 1, tfoot => 1,
th => 1, thead => 1, tr => 1,
}->{$token->{tag_name}}) {
## Ignore the token
## NOTE(review): unlike the equivalent "Ignore the token" branch of the
## 'in table' insertion mode, $token is not replaced with
## $self->_get_next_token before the redo, so block B is re-entered
## with the same end tag.  If nothing else advances the token this
## repeats forever -- confirm and fetch the next token here.
redo B;
} else {
} else {
$in_body->(sub {
$open_elements->[-1]->[0]->append_child (shift);
redo B;
} elsif ($insertion_mode eq 'in column group') {
if ($token->{type} eq 'character') {
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
$open_elements->[-1]->[0]->manakai_append_text ($1);
unless (length $token->{data}) {
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'comment') {
my $comment = $self->{document}->create_comment ($token->{data});
$open_elements->[-1]->[0]->append_child ($comment);
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'start tag') {
if ($token->{tag_name} eq 'col') {
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
$open_elements->[-1]->[0]->append_child ($el);
push @$open_elements, [$el, $token->{tag_name}];
pop @$open_elements;
$token = $self->_get_next_token;
redo B;
} else {
} elsif ($token->{type} eq 'end tag') {
if ($token->{tag_name} eq 'colgroup') {
if ($open_elements->[-1]->[1] eq 'html') {
## Ignore the token
$token = $self->_get_next_token;
redo B;
} else {
pop @$open_elements; # colgroup
$insertion_mode = 'in table';
$token = $self->_get_next_token;
redo B;
} elsif ($token->{tag_name} eq 'col') {
## Ignore the token
$token = $self->_get_next_token;
redo B;
} else {
} else {
## As if
if ($open_elements->[-1]->[1] eq 'html') {
## Ignore the token
$token = $self->_get_next_token;
redo B;
} else {
pop @$open_elements; # colgroup
$insertion_mode = 'in table';
## reprocess
redo B;
} elsif ($insertion_mode eq 'in table body') {
if ($token->{type} eq 'character') {
## Copied from 'in table'
$open_elements->[-1]->[0]->manakai_append_text ($token->{data});
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'comment') {
## Copied from 'in table'
my $comment = $self->{document}->create_comment ($token->{data});
$open_elements->[-1]->[0]->append_child ($comment);
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'start tag') {
if ({
tr => 1,
th => 1, td => 1,
}->{$token->{tag_name}}) {
## Clear back to table body context
while (not {
tbody => 1, tfoot => 1, thead => 1, html => 1,
}->{$open_elements->[-1]->[1]}) {
pop @$open_elements;
$insertion_mode = 'in row';
if ($token->{tag_name} eq 'tr') {
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
$open_elements->[-1]->[0]->append_child ($el);
push @$open_elements, [$el, $token->{tag_name}];
$token = $self->_get_next_token;
} else {
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, 'tr']);
$open_elements->[-1]->[0]->append_child ($el);
push @$open_elements, [$el, 'tr'];
## reprocess
redo B;
} elsif ({
caption => 1, col => 1, colgroup => 1,
tbody => 1, tfoot => 1, thead => 1,
}->{$token->{tag_name}}) {
## have an element in table scope
my $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ({
tbody => 1, thead => 1, tfoot => 1,
}->{$node->[1]}) {
$i = $_;
} elsif ({
table => 1, html => 1,
}->{$node->[1]}) {
unless (defined $i) {
## Ignore the token
$token = $self->_get_next_token;
redo B;
## Clear back to table body context
while (not {
tbody => 1, tfoot => 1, thead => 1, html => 1,
}->{$open_elements->[-1]->[1]}) {
pop @$open_elements;
## As if <{current node}>
## have an element in table scope
## true by definition
## Clear back to table body context
## nop by definition
pop @$open_elements;
$insertion_mode = 'in table';
## reprocess
redo B;
} elsif ($token->{tag_name} eq 'table') {
## NOTE: This is a code clone of "table in table"
## As if
## have a table element in table scope
my $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq 'table') {
$i = $_;
} elsif ({
table => 1, html => 1,
}->{$node->[1]}) {
unless (defined $i) {
## Ignore tokens
$token = $self->_get_next_token;
redo B;
## generate implied end tags
if ({
dd => 1, dt => 1, li => 1, p => 1,
td => 1, th => 1, tr => 1,
}->{$open_elements->[-1]->[1]}) {
unshift @{$self->{token}}, $token; #
$token = {type => 'end tag', tag_name => 'table'};
unshift @{$self->{token}}, $token;
$token = {type => 'end tag',
tag_name => $open_elements->[-1]->[1]}; # MUST
redo B;
if ($open_elements->[-1]->[1] ne 'table') {
splice @$open_elements, $i;
## reprocess
redo B;
} else {
} elsif ($token->{type} eq 'end tag') {
if ({
tbody => 1, tfoot => 1, thead => 1,
}->{$token->{tag_name}}) {
## have an element in table scope
my $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq $token->{tag_name}) {
$i = $_;
} elsif ({
table => 1, html => 1,
}->{$node->[1]}) {
unless (defined $i) {
## Ignore the token
$token = $self->_get_next_token;
redo B;
## Clear back to table body context
while (not {
tbody => 1, tfoot => 1, thead => 1, html => 1,
}->{$open_elements->[-1]->[1]}) {
pop @$open_elements;
pop @$open_elements;
$insertion_mode = 'in table';
$token = $self->_get_next_token;
redo B;
} elsif ($token->{tag_name} eq 'table') {
## have an element in table scope
my $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ({
tbody => 1, thead => 1, tfoot => 1,
}->{$node->[1]}) {
$i = $_;
} elsif ({
table => 1, html => 1,
}->{$node->[1]}) {
unless (defined $i) {
## Ignore the token
$token = $self->_get_next_token;
redo B;
## Clear back to table body context
while (not {
tbody => 1, tfoot => 1, thead => 1, html => 1,
}->{$open_elements->[-1]->[1]}) {
pop @$open_elements;
## As if <{current node}>
## have an element in table scope
## true by definition
## Clear back to table body context
## nop by definition
pop @$open_elements;
$insertion_mode = 'in table';
## reprocess
redo B;
} elsif ({
body => 1, caption => 1, col => 1, colgroup => 1,
html => 1, td => 1, th => 1, tr => 1,
}->{$token->{tag_name}}) {
## Ignore the token
$token = $self->_get_next_token;
redo B;
} else {
} else {
## As if in table
## NOTE: This is a code clone of "misc in table".
$in_body->(sub {
my $child = shift;
if ({
table => 1, tbody => 1, tfoot => 1,
thead => 1, tr => 1,
}->{$open_elements->[-1]->[1]}) {
my $foster_parent_element;
my $next_sibling;
OE: for (reverse 0..$#$open_elements) {
if ($open_elements->[$_]->[1] eq 'table') {
my $parent = $open_elements->[$_]->[0]->parent_node;
if (defined $parent and $parent->node_type == 1) {
$foster_parent_element = $parent;
$next_sibling = $open_elements->[$_]->[0];
} else {
= $open_elements->[$_ - 1]->[0];
last OE;
} # OE
$foster_parent_element = $open_elements->[0]->[0]
unless defined $foster_parent_element;
($child, $next_sibling);
} else {
$open_elements->[-1]->[0]->append_child ($child);
redo B;
} elsif ($insertion_mode eq 'in row') {
if ($token->{type} eq 'character') {
## Copied from 'in table'
$open_elements->[-1]->[0]->manakai_append_text ($token->{data});
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'comment') {
## Copied from 'in table'
my $comment = $self->{document}->create_comment ($token->{data});
$open_elements->[-1]->[0]->append_child ($comment);
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'start tag') {
if ($token->{tag_name} eq 'th' or
$token->{tag_name} eq 'td') {
## Clear back to table row context
while (not {
tr => 1, html => 1,
}->{$open_elements->[-1]->[1]}) {
pop @$open_elements;
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
$open_elements->[-1]->[0]->append_child ($el);
push @$open_elements, [$el, $token->{tag_name}];
$insertion_mode = 'in cell';
push @$active_formatting_elements, ['#marker', ''];
$token = $self->_get_next_token;
redo B;
} elsif ({
caption => 1, col => 1, colgroup => 1,
tbody => 1, tfoot => 1, thead => 1, tr => 1,
}->{$token->{tag_name}}) {
## As if
## have an element in table scope
my $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq 'tr') {
$i = $_;
} elsif ({
table => 1, html => 1,
}->{$node->[1]}) {
unless (defined $i) {
## Ignore the token
$token = $self->_get_next_token;
redo B;
## Clear back to table row context
while (not {
tr => 1, html => 1,
}->{$open_elements->[-1]->[1]}) {
pop @$open_elements;
pop @$open_elements; # tr
$insertion_mode = 'in table body';
## reprocess
redo B;
} elsif ($token->{tag_name} eq 'table') {
## NOTE: This is a code clone of "table in table"
## As if
## have a table element in table scope
my $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq 'table') {
$i = $_;
} elsif ({
table => 1, html => 1,
}->{$node->[1]}) {
unless (defined $i) {
## Ignore tokens
$token = $self->_get_next_token;
redo B;
## generate implied end tags
if ({
dd => 1, dt => 1, li => 1, p => 1,
td => 1, th => 1, tr => 1,
}->{$open_elements->[-1]->[1]}) {
unshift @{$self->{token}}, $token; #
$token = {type => 'end tag', tag_name => 'table'};
unshift @{$self->{token}}, $token;
$token = {type => 'end tag',
tag_name => $open_elements->[-1]->[1]}; # MUST
redo B;
if ($open_elements->[-1]->[1] ne 'table') {
splice @$open_elements, $i;
## reprocess
redo B;
} else {
} elsif ($token->{type} eq 'end tag') {
if ($token->{tag_name} eq 'tr') {
## have an element in table scope
my $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq $token->{tag_name}) {
$i = $_;
} elsif ({
table => 1, html => 1,
}->{$node->[1]}) {
unless (defined $i) {
## Ignore the token
$token = $self->_get_next_token;
redo B;
## Clear back to table row context
while (not {
tr => 1, html => 1,
}->{$open_elements->[-1]->[1]}) {
pop @$open_elements;
pop @$open_elements; # tr
$insertion_mode = 'in table body';
$token = $self->_get_next_token;
redo B;
} elsif ($token->{tag_name} eq 'table') {
## As if
## have an element in table scope
my $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq 'tr') {
$i = $_;
} elsif ({
table => 1, html => 1,
}->{$node->[1]}) {
unless (defined $i) {
## Ignore the token
$token = $self->_get_next_token;
redo B;
## Clear back to table row context
while (not {
tr => 1, html => 1,
}->{$open_elements->[-1]->[1]}) {
pop @$open_elements;
pop @$open_elements; # tr
$insertion_mode = 'in table body';
## reprocess
redo B;
} elsif ({
tbody => 1, tfoot => 1, thead => 1,
}->{$token->{tag_name}}) {
## have an element in table scope
my $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq $token->{tag_name}) {
$i = $_;
} elsif ({
table => 1, html => 1,
}->{$node->[1]}) {
unless (defined $i) {
## Ignore the token
$token = $self->_get_next_token;
redo B;
## As if
## have an element in table scope
my $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq 'tr') {
$i = $_;
} elsif ({
table => 1, html => 1,
}->{$node->[1]}) {
unless (defined $i) {
## Ignore the token
$token = $self->_get_next_token;
redo B;
## Clear back to table row context
while (not {
tr => 1, html => 1,
}->{$open_elements->[-1]->[1]}) {
pop @$open_elements;
pop @$open_elements; # tr
$insertion_mode = 'in table body';
## reprocess
redo B;
} elsif ({
body => 1, caption => 1, col => 1,
colgroup => 1, html => 1, td => 1, th => 1,
}->{$token->{tag_name}}) {
## Ignore the token
$token = $self->_get_next_token;
redo B;
} else {
} else {
## As if in table
## NOTE: This is a code clone of "misc in table".
$in_body->(sub {
my $child = shift;
if ({
table => 1, tbody => 1, tfoot => 1,
thead => 1, tr => 1,
}->{$open_elements->[-1]->[1]}) {
my $foster_parent_element;
my $next_sibling;
OE: for (reverse 0..$#$open_elements) {
if ($open_elements->[$_]->[1] eq 'table') {
my $parent = $open_elements->[$_]->[0]->parent_node;
if (defined $parent and $parent->node_type == 1) {
$foster_parent_element = $parent;
$next_sibling = $open_elements->[$_]->[0];
} else {
= $open_elements->[$_ - 1]->[0];
last OE;
} # OE
$foster_parent_element = $open_elements->[0]->[0]
unless defined $foster_parent_element;
($child, $next_sibling);
} else {
$open_elements->[-1]->[0]->append_child ($child);
redo B;
} elsif ($insertion_mode eq 'in cell') {
if ($token->{type} eq 'character') {
## NOTE: This is a code clone of "character in body".
$open_elements->[-1]->[0]->manakai_append_text ($token->{data});
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'comment') {
## NOTE: This is a code clone of "comment in body".
my $comment = $self->{document}->create_comment ($token->{data});
$open_elements->[-1]->[0]->append_child ($comment);
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'start tag') {
if ({
caption => 1, col => 1, colgroup => 1,
tbody => 1, td => 1, tfoot => 1, th => 1,
thead => 1, tr => 1,
}->{$token->{tag_name}}) {
## have an element in table scope
my $tn;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq 'td' or $node->[1] eq 'th') {
$tn = $node->[1];
} elsif ({
table => 1, html => 1,
}->{$node->[1]}) {
unless (defined $tn) {
## Ignore the token
$token = $self->_get_next_token;
redo B;
## Close the cell
unshift @{$self->{token}}, $token; # >
$token = {type => 'end tag', tag_name => $tn};
redo B;
} else {
} elsif ($token->{type} eq 'end tag') {
if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
## have an element in table scope
my $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq $token->{tag_name}) {
$i = $_;
} elsif ({
table => 1, html => 1,
}->{$node->[1]}) {
unless (defined $i) {
## Ignore the token
$token = $self->_get_next_token;
redo B;
## generate implied end tags
if ({
dd => 1, dt => 1, li => 1, p => 1,
td => ($token->{tag_name} eq 'th'),
th => ($token->{tag_name} eq 'td'),
tr => 1,
}->{$open_elements->[-1]->[1]}) {
unshift @{$self->{token}}, $token;
$token = {type => 'end tag',
tag_name => $open_elements->[-1]->[1]}; # MUST
redo B;
if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
splice @$open_elements, $i;
$insertion_mode = 'in row';
$token = $self->_get_next_token;
redo B;
} elsif ({
body => 1, caption => 1, col => 1,
colgroup => 1, html => 1,
}->{$token->{tag_name}}) {
## Ignore the token
$token = $self->_get_next_token;
redo B;
} elsif ({
table => 1, tbody => 1, tfoot => 1,
thead => 1, tr => 1,
}->{$token->{tag_name}}) {
## have an element in table scope
my $i;
my $tn;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq $token->{tag_name}) {
$i = $_;
} elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
$tn = $node->[1];
## NOTE: There is exactly one |td| or |th| element
## in scope in the stack of open elements by definition.
} elsif ({
table => 1, html => 1,
}->{$node->[1]}) {
unless (defined $i) {
## Ignore the token
$token = $self->_get_next_token;
redo B;
## Close the cell
unshift @{$self->{token}}, $token; # ?>
$token = {type => 'end tag', tag_name => $tn};
redo B;
} else {
} else {
$in_body->(sub {
$open_elements->[-1]->[0]->append_child (shift);
redo B;
} elsif ($insertion_mode eq 'in select') {
if ($token->{type} eq 'character') {
$open_elements->[-1]->[0]->manakai_append_text ($token->{data});
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'comment') {
my $comment = $self->{document}->create_comment ($token->{data});
$open_elements->[-1]->[0]->append_child ($comment);
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'start tag') {
if ($token->{tag_name} eq 'option') {
if ($open_elements->[-1]->[1] eq 'option') {
## As if
pop @$open_elements;
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
$open_elements->[-1]->[0]->append_child ($el);
push @$open_elements, [$el, $token->{tag_name}];
$token = $self->_get_next_token;
redo B;
} elsif ($token->{tag_name} eq 'optgroup') {
if ($open_elements->[-1]->[1] eq 'option') {
## As if
pop @$open_elements;
if ($open_elements->[-1]->[1] eq 'optgroup') {
## As if
pop @$open_elements;
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
$open_elements->[-1]->[0]->append_child ($el);
push @$open_elements, [$el, $token->{tag_name}];
$token = $self->_get_next_token;
redo B;
} elsif ($token->{tag_name} eq 'select') {
## As if instead
## have an element in table scope
my $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq $token->{tag_name}) {
$i = $_;
} elsif ({
table => 1, html => 1,
}->{$node->[1]}) {
unless (defined $i) {
## Ignore the token
$token = $self->_get_next_token;
redo B;
splice @$open_elements, $i;
$token = $self->_get_next_token;
redo B;
} else {
} elsif ($token->{type} eq 'end tag') {
if ($token->{tag_name} eq 'optgroup') {
if ($open_elements->[-1]->[1] eq 'option' and
$open_elements->[-2]->[1] eq 'optgroup') {
## As if
splice @$open_elements, -2;
} elsif ($open_elements->[-1]->[1] eq 'optgroup') {
pop @$open_elements;
} else {
## Ignore the token
$token = $self->_get_next_token;
redo B;
} elsif ($token->{tag_name} eq 'option') {
if ($open_elements->[-1]->[1] eq 'option') {
pop @$open_elements;
} else {
## Ignore the token
$token = $self->_get_next_token;
redo B;
} elsif ($token->{tag_name} eq 'select') {
## have an element in table scope
my $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq $token->{tag_name}) {
$i = $_;
} elsif ({
table => 1, html => 1,
}->{$node->[1]}) {
unless (defined $i) {
## Ignore the token
$token = $self->_get_next_token;
redo B;
splice @$open_elements, $i;
$token = $self->_get_next_token;
redo B;
} elsif ({
caption => 1, table => 1, tbody => 1,
tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
}->{$token->{tag_name}}) {
## have an element in table scope
my $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq $token->{tag_name}) {
$i = $_;
} elsif ({
table => 1, html => 1,
}->{$node->[1]}) {
unless (defined $i) {
## Ignore the token
$token = $self->_get_next_token;
redo B;
## As if
## have an element in table scope
undef $i;
INSCOPE: for (reverse 0..$#$open_elements) {
my $node = $open_elements->[$_];
if ($node->[1] eq 'select') {
$i = $_;
} elsif ({
table => 1, html => 1,
}->{$node->[1]}) {
unless (defined $i) {
## Ignore the token
$token = $self->_get_next_token; ## TODO: ok?
redo B;
splice @$open_elements, $i;
## reprocess
redo B;
} else {
} else {
## Ignore the token
redo B;
} elsif ($insertion_mode eq 'after body') {
if ($token->{type} eq 'character') {
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
## As if in body
$open_elements->[-1]->[0]->manakai_append_text ($token->{data});
unless (length $token->{data}) {
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'comment') {
my $comment = $self->{document}->create_comment ($token->{data});
$open_elements->[0]->[0]->append_child ($comment);
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'end tag') {
if ($token->{tag_name} eq 'html') {
## TODO: if inner_html, parse-error, ignore the token; otherwise,
$phase = 'trailing end';
$token = $self->_get_next_token;
redo B;
} else {
} else {
$insertion_mode = 'in body';
## reprocess
redo B;
} elsif ($insertion_mode eq 'in frameset') {
if ($token->{type} eq 'character') {
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
$open_elements->[-1]->[0]->manakai_append_text ($token->{data});
unless (length $token->{data}) {
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'comment') {
my $comment = $self->{document}->create_comment ($token->{data});
$open_elements->[-1]->[0]->append_child ($comment);
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'start tag') {
if ($token->{tag_name} eq 'frameset') {
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
$open_elements->[-1]->[0]->append_child ($el);
push @$open_elements, [$el, $token->{tag_name}];
$token = $self->_get_next_token;
redo B;
} elsif ($token->{tag_name} eq 'frame') {
my $el;
$el = $self->{document}->create_element_ns
(q, [undef, $token->{tag_name}]);
for my $attr_name (keys %{ $token->{attributes}}) {
$el->set_attribute_ns (undef, [undef, $attr_name],
$token->{attributes} ->{$attr_name}->{value});
$open_elements->[-1]->[0]->append_child ($el);
push @$open_elements, [$el, $token->{tag_name}];
pop @$open_elements;
$token = $self->_get_next_token;
redo B;
} elsif ($token->{tag_name} eq 'noframes') {
$in_body->(sub {
$open_elements->[-1]->[0]->append_child (shift);
redo B;
} else {
} elsif ($token->{type} eq 'end tag') {
if ($token->{tag_name} eq 'frameset') {
if ($open_elements->[-1]->[1] eq 'html' and
@$open_elements == 1) {
## Ignore the token
$token = $self->_get_next_token;
} else {
pop @$open_elements;
$token = $self->_get_next_token;
## if not inner_html and
if ($open_elements->[-1]->[1] ne 'frameset') {
$insertion_mode = 'after frameset';
redo B;
} else {
} else {
## Ignore the token
$token = $self->_get_next_token;
redo B;
} elsif ($insertion_mode eq 'after frameset') {
if ($token->{type} eq 'character') {
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
$open_elements->[-1]->[0]->manakai_append_text ($token->{data});
unless (length $token->{data}) {
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'comment') {
my $comment = $self->{document}->create_comment ($token->{data});
$open_elements->[-1]->[0]->append_child ($comment);
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'start tag') {
if ($token->{tag_name} eq 'noframes') {
$in_body->(sub {
$open_elements->[-1]->[0]->append_child (shift);
redo B;
} else {
} elsif ($token->{type} eq 'end tag') {
if ($token->{tag_name} eq 'html') {
$phase = 'trailing end';
$token = $self->_get_next_token;
redo B;
} else {
} else {
## Ignore the token
$token = $self->_get_next_token;
redo B;
## ISSUE: An issue in spec there
} else {
die "$0: $insertion_mode: Unknown insertion mode";
} elsif ($phase eq 'trailing end') {
## states in the main stage is preserved yet # MUST
if ($token->{type} eq 'DOCTYPE') {
## Ignore the token
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'comment') {
my $comment = $self->{document}->create_comment ($token->{data});
$self->{document}->append_child ($comment);
$token = $self->_get_next_token;
redo B;
} elsif ($token->{type} eq 'character') {
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
## As if in the main phase.
## NOTE: The insertion mode in the main phase
## just before the phase has been changed to the trailing
## end phase is either "after body" or "after frameset".
if $phase eq 'main';
$open_elements->[-1]->[0]->manakai_append_text ($token->{data});
unless (length $token->{data}) {
$token = $self->_get_next_token;
redo B;
$phase = 'main';
## reprocess
redo B;
} elsif ($token->{type} eq 'start tag' or
$token->{type} eq 'end tag') {
$phase = 'main';
## reprocess
redo B;
} elsif ($token->{type} eq 'end-of-file') {
## Stop parsing
last B;
} else {
die "$0: $token->{type}: Unknown token";
} # B
## Stop parsing # MUST
## TODO: script stuffs
} # _construct_tree
## inner_html ($class, $node, $on_error)
##
## Serializes the children of |$node| as an HTML fragment string.
##
##   $class    - Invocant (unused; the sub is called as a class method).
##   $node     - Context node.  Its |child_nodes| are serialized; the
##               node itself is not.  Its ancestor chain (including
##               itself) is inspected to decide whether text is written
##               raw (CDATA-like context) or escaped.
##   $on_error - Optional code reference.  It is invoked with a child
##               node whose |node_type| this serializer does not handle;
##               such a node contributes nothing to the output.
##
## Returns a reference to the serialized string.
##
## Handled node types: 1 (element), 3 and 4 (text / CDATA section),
## 8 (comment), 10 (document type) and 5 (entity reference, which is
## replaced by its children).
sub inner_html ($$$) {
  my ($class, $node, $on_error) = @_;

  ## Elements whose text descendants are serialized without escaping.
  my %is_cdata_element = map { $_ => 1 } qw(
    style script xmp iframe noembed noframes noscript
  );

  ## Elements that are serialized as a start tag only (no children,
  ## no end tag).
  my %is_void_element = map { $_ => 1 } qw(
    area base basefont bgsound br col embed frame hr
    img input link meta param spacer wbr
  );

  ## Escapes the four markup-significant characters.  |&| has to be
  ## replaced first so that the entities produced by the later
  ## substitutions are not escaped a second time.
  my $escape = sub {
    my $text = shift;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
  };

  ## Step 1
  my $s = '';

  ## The fragment starts in a raw-text context if the context node or
  ## any of its ancestors is one of the CDATA-like elements.
  ## NOTE(review): the namespace is compared against the empty string;
  ## confirm that this matches the namespace elements are created in.
  my $in_cdata;
  my $parent = $node;
  while (defined $parent) {
    if ($parent->node_type == 1 and
        $parent->namespace_uri eq '' and
        $is_cdata_element{$parent->local_name}) { ## TODO: case thingy
      $in_cdata = 1;
    }
    $parent = $parent->parent_node;
  }

  ## Step 2
  ## |@node| is a work queue that mixes DOM nodes with plain strings.
  ## A plain string is either a pending end tag, which is appended
  ## verbatim, or the marker 'cdata-out', which ends a raw-text context
  ## opened by an element encountered during this walk.
  my @node = @{$node->child_nodes};
  C: while (@node) {
    my $child = shift @node;
    unless (ref $child) {
      if ($child eq 'cdata-out') {
        $in_cdata = 0;
      } else {
        $s .= $child; # end tag
      }
      next C;
    }

    my $nt = $child->node_type;
    if ($nt == 1) { # Element
      my $tag_name = lc $child->tag_name; ## ISSUE: Definition of "lowercase"
      $s .= '<' . $tag_name;
      ## ISSUE: Non-html elements

      my @attrs = @{$child->attributes}; # sort order MUST be stable
      for my $attr (@attrs) { # order is implementation dependent
        my $attr_name = lc $attr->name; ## ISSUE: Definition of "lowercase"
        $s .= ' ' . $attr_name . '="' . $escape->($attr->value) . '"';
      }
      $s .= '>';

      next C if $is_void_element{$tag_name};

      ## Only the outermost CDATA-like element schedules the marker
      ## that switches escaping back on once its end tag is written.
      if (not $in_cdata and $is_cdata_element{$tag_name}) {
        unshift @node, 'cdata-out';
        $in_cdata = 1;
      }

      ## Children first, then the end tag, ahead of everything that is
      ## already queued.
      unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';
    } elsif ($nt == 3 or $nt == 4) { # Text, CDATA section
      if ($in_cdata) {
        $s .= $child->data;
      } else {
        $s .= $escape->($child->data);
      }
    } elsif ($nt == 8) { # Comment
      $s .= '<!--' . $child->data . '-->';
    } elsif ($nt == 10) { # Document type
      $s .= '<!DOCTYPE ' . $child->name . '>';
    } elsif ($nt == 5) { # entrefs
      ## Expand the reference in place: its children go to the front
      ## of the queue so that they are serialized before the following
      ## siblings and before any pending end tag.
      unshift @node, @{$child->child_nodes};
    } else {
      ## Unsupported node type: nothing is emitted; report it when the
      ## caller supplied a handler.
      $on_error->($child) if ref $on_error eq 'CODE';
    }
  } # C

  ## Step 3
  return \$s;
} # inner_html
# $Date: 2007/04/30 14:12:02 $