/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.10 - (show annotations) (download) (as text)
Sat Jun 23 03:30:06 2007 UTC (17 years, 4 months ago) by wakaba
Branch: MAIN
Changes since 1.9: +37 -40 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	23 Jun 2007 03:16:30 -0000
	* tokenizer-test-1.test: Tests for C1 character
	references are added.

2007-06-23  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	23 Jun 2007 03:26:51 -0000
	* HTML.pm.src: An error message was incorrect.
	HTML5 revision 869 (C1 character references).

2007-06-23  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.9 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 ## This is an early version of an HTML parser.
6
## Tag names for which a start tag may end with "/>".  The tokenizer
## consults this table when it sees a "/" inside a start tag: if the
## next character is ">" and the tag name is listed here, no |nestc|
## parse error is reported.
my $permitted_slash_tag_name = {
  map { ($_ => 1) } qw(
    base link meta hr br img embed param area col input
  )
};
20
## Named character references: entity name => one-character string.
## Each word below is |name=HEX|, where HEX is the code point of the
## character the entity stands for.  Names are case-sensitive (both
## |amp| and |AMP| exist, as do |Prime| and |prime|).
my $entity_char = {
  map {
    my ($name, $code) = split /=/, $_;
    ($name => chr (hex $code));
  } qw(
    AElig=00C6 Aacute=00C1 Acirc=00C2 Agrave=00C0 Alpha=0391 Aring=00C5
    Atilde=00C3 Auml=00C4 Beta=0392 Ccedil=00C7 Chi=03A7 Dagger=2021
    Delta=0394 ETH=00D0 Eacute=00C9 Ecirc=00CA Egrave=00C8 Epsilon=0395
    Eta=0397 Euml=00CB Gamma=0393 Iacute=00CD Icirc=00CE Igrave=00CC
    Iota=0399 Iuml=00CF Kappa=039A Lambda=039B Mu=039C Ntilde=00D1
    Nu=039D OElig=0152 Oacute=00D3 Ocirc=00D4 Ograve=00D2 Omega=03A9
    Omicron=039F Oslash=00D8 Otilde=00D5 Ouml=00D6 Phi=03A6 Pi=03A0
    Prime=2033 Psi=03A8 Rho=03A1 Scaron=0160 Sigma=03A3 THORN=00DE
    Tau=03A4 Theta=0398 Uacute=00DA Ucirc=00DB Ugrave=00D9 Upsilon=03A5
    Uuml=00DC Xi=039E Yacute=00DD Yuml=0178 Zeta=0396

    aacute=00E1 acirc=00E2 acute=00B4 aelig=00E6 agrave=00E0 alefsym=2135
    alpha=03B1 amp=0026 AMP=0026 and=2227 ang=2220 apos=0027
    aring=00E5 asymp=2248 atilde=00E3 auml=00E4 bdquo=201E beta=03B2
    brvbar=00A6 bull=2022 cap=2229 ccedil=00E7 cedil=00B8 cent=00A2
    chi=03C7 circ=02C6 clubs=2663 cong=2245 copy=00A9 COPY=00A9
    crarr=21B5 cup=222A curren=00A4 dArr=21D3 dagger=2020 darr=2193
    deg=00B0 delta=03B4 diams=2666 divide=00F7 eacute=00E9 ecirc=00EA
    egrave=00E8 empty=2205 emsp=2003 ensp=2002 epsilon=03B5 equiv=2261
    eta=03B7 eth=00F0 euml=00EB euro=20AC exist=2203 fnof=0192
    forall=2200 frac12=00BD frac14=00BC frac34=00BE frasl=2044 gamma=03B3
    ge=2265 gt=003E GT=003E hArr=21D4 harr=2194 hearts=2665
    hellip=2026 iacute=00ED icirc=00EE iexcl=00A1 igrave=00EC image=2111
    infin=221E int=222B iota=03B9 iquest=00BF isin=2208 iuml=00EF
    kappa=03BA lArr=21D0 lambda=03BB lang=2329 laquo=00AB larr=2190
    lceil=2308 ldquo=201C le=2264 lfloor=230A lowast=2217 loz=25CA
    lrm=200E lsaquo=2039 lsquo=2018 lt=003C LT=003C macr=00AF
    mdash=2014 micro=00B5 middot=00B7 minus=2212 mu=03BC nabla=2207
    nbsp=00A0 ndash=2013 ne=2260 ni=220B not=00AC notin=2209
    nsub=2284 ntilde=00F1 nu=03BD oacute=00F3 ocirc=00F4 oelig=0153
    ograve=00F2 oline=203E omega=03C9 omicron=03BF oplus=2295 or=2228
    ordf=00AA ordm=00BA oslash=00F8 otilde=00F5 otimes=2297 ouml=00F6
    para=00B6 part=2202 permil=2030 perp=22A5 phi=03C6 pi=03C0
    piv=03D6 plusmn=00B1 pound=00A3 prime=2032 prod=220F prop=221D
    psi=03C8 quot=0022 QUOT=0022 rArr=21D2 radic=221A rang=232A
    raquo=00BB rarr=2192 rceil=2309 rdquo=201D real=211C reg=00AE
    REG=00AE rfloor=230B rho=03C1 rlm=200F rsaquo=203A rsquo=2019
    sbquo=201A scaron=0161 sdot=22C5 sect=00A7 shy=00AD sigma=03C3
    sigmaf=03C2 sim=223C spades=2660 sub=2282 sube=2286 sum=2211
    sup=2283 sup1=00B9 sup2=00B2 sup3=00B3 supe=2287 szlig=00DF
    tau=03C4 there4=2234 theta=03B8 thetasym=03D1 thinsp=2009 thorn=00FE
    tilde=02DC times=00D7 trade=2122 uArr=21D1 uacute=00FA uarr=2191
    ucirc=00FB ugrave=00F9 uml=00A8 upsih=03D2 upsilon=03C5 uuml=00FC
    weierp=2118 xi=03BE yacute=00FD yen=00A5 yuml=00FF zeta=03B6
    zwj=200D zwnj=200C
  )
}; # $entity_char
282
## Replacement code points for the code points 0x80..0x9F (the C1
## range): key and value are both numbers.  The values coincide with
## the Windows-1252 assignments for those bytes; positions with no
## assignment there map to U+FFFD REPLACEMENT CHARACTER.
my $c1_entity_char = do {
  my %replacement;
  @replacement{0x80 .. 0x9F} = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 80..87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 88..8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 90..97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 98..9F
  );
  \%replacement;
}; # $c1_entity_char
317
## Element names belonging to the "special" category.
## NOTE(review): the code that consults this table is not part of
## this section of the file.
my $special_category = {
  map { ($_ => 1) } qw(
    address area base basefont bgsound blockquote body br center col
    colgroup dd dir div dl dt embed fieldset form frame frameset
    h1 h2 h3 h4 h5 h6 head hr iframe image img input isindex li link
    listing menu meta noembed noframes noscript ol optgroup option p
    param plaintext pre script select spacer style tbody textarea
    tfoot thead title tr ul wbr
  )
};
## Element names belonging to the "scoping" category.
my $scoping_category = {
  map { ($_ => 1) } qw(button caption html marquee object table td th)
};
## Element names belonging to the "formatting" category.
## (|strike| was previously misspelled |strile|, so |strike| elements
## were never recognized as formatting elements.)
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
338 # $phrasing_category: all other elements
339
## Parses a string as HTML.
##
##   $class->parse_string ($string, $document, $onerror)
##
##   $string:   The text to parse.  It is read through a reference to
##              the caller's argument, not copied.
##   $document: Stored as |$self->{document}| and returned once the
##              tokenizer and tree constructor steps have run.
##   $onerror:  Optional code reference invoked for each parse error
##              with a list of name/value pairs (|type|, then |line|
##              and |column| of the current input position).  When
##              omitted, errors are reported with |warn|.
##
## A new parser object is created by |new| for every call.
sub parse_string ($$$;$) {
  my $self = shift->new;
  my $s = \$_[0];
  $self->{document} = $_[1];

  ## NOTE: |set_inner_html| copies most of this method's code
  ## NOTE(review): |set_inner_html| is not visible here; its copy of
  ## the CR handling below should presumably be kept in sync.

  my $i = 0;      # Index in |$$s| of the next unread character
  my $line = 1;   # Current line, reported to the error handler
  my $column = 0; # Current column, reported to the error handler

  ## Reads one character from the input, normalizes it, and stores its
  ## code point (or -1 at end of input) in |next_input_character|.
  $self->{set_next_input_character} = sub {
    my $self = shift;
    $self->{next_input_character} = -1 and return if $i >= length $$s;
    $self->{next_input_character} = ord substr $$s, $i++, 1;
    $column++;

    if ($self->{next_input_character} == 0x000A) { # LF
      $line++;
      $column = 0;
    } elsif ($self->{next_input_character} == 0x000D) { # CR
      ## CR LF is a single newline, so only a directly following LF is
      ## consumed here.  Any other character is left unread so that
      ## the next call applies the normalization and the line/column
      ## accounting of this subroutine to it (it used to be pushed
      ## raw onto |$self->{char}|, which this subroutine never reads).
      $i++ if $i < length $$s and ord (substr $$s, $i, 1) == 0x000A;
      $self->{next_input_character} = 0x000A; # LF # MUST
      $line++;
      $column = 0;
    } elsif ($self->{next_input_character} > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($self->{next_input_character} == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      ## TODO: test
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };

  ## Default error handler: print a warning with the error position.
  my $onerror = $_[2] || sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  ## The position is appended after the caller-supplied pairs.
  $self->{parse_error} = sub {
    $onerror->(@_, line => $line, column => $column);
  };

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  return $self->{document};
} # parse_string
397
## Creates a parser object with default callbacks:
##
##   set_next_input_character: Behaves as an empty input; it sets
##       |next_input_character| of the parser passed as its first
##       argument to -1 (end of input).
##   parse_error: Ignores the error.
##
## Returns the new object, blessed into the invocant class.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;
  ## The callback works on the parser it is given (the same calling
  ## convention as the callback installed by |parse_string|) instead
  ## of closing over $self.  A closure stored in $self that also
  ## captured $self would form a reference cycle, and the object
  ## would never be freed.
  $self->{set_next_input_character} = sub {
    my $parser = shift;
    $parser->{next_input_character} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  return $self;
} # new
409
410 ## Implementations MUST act as if state machine in the spec
411
## Resets the tokenizer fields of the parser and reads the first input
## character.
sub _initialize_tokenizer ($) {
  my ($self) = @_;

  ## Initial state and content model flag (MUST be 'data' / 'PCDATA').
  @{$self}{qw(state content_model_flag)} = ('data', 'PCDATA');

  ## No token (start tag, end tag, comment, or DOCTYPE) or attribute
  ## is under construction yet, and nothing has been emitted.
  $self->{$_} = undef for qw(
    current_token
    current_attribute
    last_emitted_start_tag_name
    last_attribute_value_state
  );

  ## The character buffer has to be empty before the first character
  ## is requested; |$self->{next_input_character}| is set by the macro.
  $self->{char} = [];
  !!!next-input-character;

  ## Tokens that are queued and not yet returned by |_get_next_token|.
  $self->{token} = [];
} # _initialize_tokenizer
425
426 ## A token has:
427 ## ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
428 ## 'character', or 'end-of-file'
429 ## ->{name} (DOCTYPE, start tag (tagname), end tag (tagname))
430 ## ISSUE: the spec need s/tagname/tag name/
431 ## ->{error} == 1 or 0 (DOCTYPE)
432 ## ->{attributes} isa HASH (start tag, end tag)
433 ## ->{data} (comment, character)
434
435 ## Macros
436 ## Macros MUST be preceded by three EXCLAMATION MARKs.
437 ## emit ($token)
438 ## Emits the specified token.
439
440 ## Emitted token MUST immediately be handled by the tree construction state.
441
442 ## Before each step, UA MAY check to see if either one of the scripts in
443 ## "list of scripts that will execute as soon as possible" or the first
444 ## script in the "list of scripts that will execute asynchronously",
445 ## has completed loading. If one has, then it MUST be executed
446 ## and removed from the list.
447
448 ## ISSUE: <http://html5.org/tools/web-apps-tracker?from=874&to=876>
449
450 sub _get_next_token ($) {
451 my $self = shift;
452 if (@{$self->{token}}) {
453 return shift @{$self->{token}};
454 }
455
456 A: {
457 if ($self->{state} eq 'data') {
458 if ($self->{next_input_character} == 0x0026) { # &
459 if ($self->{content_model_flag} eq 'PCDATA' or
460 $self->{content_model_flag} eq 'RCDATA') {
461 $self->{state} = 'entity data';
462 !!!next-input-character;
463 redo A;
464 } else {
465 #
466 }
467 } elsif ($self->{next_input_character} == 0x003C) { # <
468 if ($self->{content_model_flag} ne 'PLAINTEXT') {
469 $self->{state} = 'tag open';
470 !!!next-input-character;
471 redo A;
472 } else {
473 #
474 }
475 } elsif ($self->{next_input_character} == -1) {
476 !!!emit ({type => 'end-of-file'});
477 last A; ## TODO: ok?
478 }
479 # Anything else
480 my $token = {type => 'character',
481 data => chr $self->{next_input_character}};
482 ## Stay in the data state
483 !!!next-input-character;
484
485 !!!emit ($token);
486
487 redo A;
488 } elsif ($self->{state} eq 'entity data') {
489 ## (cannot happen in CDATA state)
490
491 my $token = $self->_tokenize_attempt_to_consume_an_entity;
492
493 $self->{state} = 'data';
494 # next-input-character is already done
495
496 unless (defined $token) {
497 !!!emit ({type => 'character', data => '&'});
498 } else {
499 !!!emit ($token);
500 }
501
502 redo A;
503 } elsif ($self->{state} eq 'tag open') {
504 if ($self->{content_model_flag} eq 'RCDATA' or
505 $self->{content_model_flag} eq 'CDATA') {
506 if ($self->{next_input_character} == 0x002F) { # /
507 !!!next-input-character;
508 $self->{state} = 'close tag open';
509 redo A;
510 } else {
511 ## reconsume
512 $self->{state} = 'data';
513
514 !!!emit ({type => 'character', data => '<'});
515
516 redo A;
517 }
518 } elsif ($self->{content_model_flag} eq 'PCDATA') {
519 if ($self->{next_input_character} == 0x0021) { # !
520 $self->{state} = 'markup declaration open';
521 !!!next-input-character;
522 redo A;
523 } elsif ($self->{next_input_character} == 0x002F) { # /
524 $self->{state} = 'close tag open';
525 !!!next-input-character;
526 redo A;
527 } elsif (0x0041 <= $self->{next_input_character} and
528 $self->{next_input_character} <= 0x005A) { # A..Z
529 $self->{current_token}
530 = {type => 'start tag',
531 tag_name => chr ($self->{next_input_character} + 0x0020)};
532 $self->{state} = 'tag name';
533 !!!next-input-character;
534 redo A;
535 } elsif (0x0061 <= $self->{next_input_character} and
536 $self->{next_input_character} <= 0x007A) { # a..z
537 $self->{current_token} = {type => 'start tag',
538 tag_name => chr ($self->{next_input_character})};
539 $self->{state} = 'tag name';
540 !!!next-input-character;
541 redo A;
542 } elsif ($self->{next_input_character} == 0x003E) { # >
543 !!!parse-error (type => 'empty start tag');
544 $self->{state} = 'data';
545 !!!next-input-character;
546
547 !!!emit ({type => 'character', data => '<>'});
548
549 redo A;
550 } elsif ($self->{next_input_character} == 0x003F) { # ?
551 !!!parse-error (type => 'pio');
552 $self->{state} = 'bogus comment';
553 ## $self->{next_input_character} is intentionally left as is
554 redo A;
555 } else {
556 !!!parse-error (type => 'bare stago');
557 $self->{state} = 'data';
558 ## reconsume
559
560 !!!emit ({type => 'character', data => '<'});
561
562 redo A;
563 }
564 } else {
565 die "$0: $self->{content_model_flag}: Unknown content model flag";
566 }
567 } elsif ($self->{state} eq 'close tag open') {
568 if ($self->{content_model_flag} eq 'RCDATA' or
569 $self->{content_model_flag} eq 'CDATA') {
570 my @next_char;
571 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
572 push @next_char, $self->{next_input_character};
573 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
574 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
575 if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
576 !!!next-input-character;
577 next TAGNAME;
578 } else {
579 !!!parse-error (type => 'unmatched end tag');
580 $self->{next_input_character} = shift @next_char; # reconsume
581 !!!back-next-input-character (@next_char);
582 $self->{state} = 'data';
583
584 !!!emit ({type => 'character', data => '</'});
585
586 redo A;
587 }
588 }
589 push @next_char, $self->{next_input_character};
590
591 unless ($self->{next_input_character} == 0x0009 or # HT
592 $self->{next_input_character} == 0x000A or # LF
593 $self->{next_input_character} == 0x000B or # VT
594 $self->{next_input_character} == 0x000C or # FF
595 $self->{next_input_character} == 0x0020 or # SP
596 $self->{next_input_character} == 0x003E or # >
597 $self->{next_input_character} == 0x002F or # /
598 $self->{next_input_character} == 0x003C or # <
599 $self->{next_input_character} == -1) {
600 !!!parse-error (type => 'unmatched end tag');
601 $self->{next_input_character} = shift @next_char; # reconsume
602 !!!back-next-input-character (@next_char);
603 $self->{state} = 'data';
604
605 !!!emit ({type => 'character', data => '</'});
606
607 redo A;
608 } else {
609 $self->{next_input_character} = shift @next_char;
610 !!!back-next-input-character (@next_char);
611 # and consume...
612 }
613 }
614
615 if (0x0041 <= $self->{next_input_character} and
616 $self->{next_input_character} <= 0x005A) { # A..Z
617 $self->{current_token} = {type => 'end tag',
618 tag_name => chr ($self->{next_input_character} + 0x0020)};
619 $self->{state} = 'tag name';
620 !!!next-input-character;
621 redo A;
622 } elsif (0x0061 <= $self->{next_input_character} and
623 $self->{next_input_character} <= 0x007A) { # a..z
624 $self->{current_token} = {type => 'end tag',
625 tag_name => chr ($self->{next_input_character})};
626 $self->{state} = 'tag name';
627 !!!next-input-character;
628 redo A;
629 } elsif ($self->{next_input_character} == 0x003E) { # >
630 !!!parse-error (type => 'empty end tag');
631 $self->{state} = 'data';
632 !!!next-input-character;
633 redo A;
634 } elsif ($self->{next_input_character} == -1) {
635 !!!parse-error (type => 'bare etago');
636 $self->{state} = 'data';
637 # reconsume
638
639 !!!emit ({type => 'character', data => '</'});
640
641 redo A;
642 } else {
643 !!!parse-error (type => 'bogus end tag');
644 $self->{state} = 'bogus comment';
645 ## $self->{next_input_character} is intentionally left as is
646 redo A;
647 }
648 } elsif ($self->{state} eq 'tag name') {
649 if ($self->{next_input_character} == 0x0009 or # HT
650 $self->{next_input_character} == 0x000A or # LF
651 $self->{next_input_character} == 0x000B or # VT
652 $self->{next_input_character} == 0x000C or # FF
653 $self->{next_input_character} == 0x0020) { # SP
654 $self->{state} = 'before attribute name';
655 !!!next-input-character;
656 redo A;
657 } elsif ($self->{next_input_character} == 0x003E) { # >
658 if ($self->{current_token}->{type} eq 'start tag') {
659 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
660 } elsif ($self->{current_token}->{type} eq 'end tag') {
661 $self->{content_model_flag} = 'PCDATA'; # MUST
662 if ($self->{current_token}->{attributes}) {
663 !!!parse-error (type => 'end tag attribute');
664 }
665 } else {
666 die "$0: $self->{current_token}->{type}: Unknown token type";
667 }
668 $self->{state} = 'data';
669 !!!next-input-character;
670
671 !!!emit ($self->{current_token}); # start tag or end tag
672 undef $self->{current_token};
673
674 redo A;
675 } elsif (0x0041 <= $self->{next_input_character} and
676 $self->{next_input_character} <= 0x005A) { # A..Z
677 $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
678 # start tag or end tag
679 ## Stay in this state
680 !!!next-input-character;
681 redo A;
682 } elsif ($self->{next_input_character} == 0x003C or # <
683 $self->{next_input_character} == -1) {
684 !!!parse-error (type => 'unclosed tag');
685 if ($self->{current_token}->{type} eq 'start tag') {
686 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
687 } elsif ($self->{current_token}->{type} eq 'end tag') {
688 $self->{content_model_flag} = 'PCDATA'; # MUST
689 if ($self->{current_token}->{attributes}) {
690 !!!parse-error (type => 'end tag attribute');
691 }
692 } else {
693 die "$0: $self->{current_token}->{type}: Unknown token type";
694 }
695 $self->{state} = 'data';
696 # reconsume
697
698 !!!emit ($self->{current_token}); # start tag or end tag
699 undef $self->{current_token};
700
701 redo A;
702 } elsif ($self->{next_input_character} == 0x002F) { # /
703 !!!next-input-character;
704 if ($self->{next_input_character} == 0x003E and # >
705 $self->{current_token}->{type} eq 'start tag' and
706 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
707 # permitted slash
708 #
709 } else {
710 !!!parse-error (type => 'nestc');
711 }
712 $self->{state} = 'before attribute name';
713 # next-input-character is already done
714 redo A;
715 } else {
716 $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
717 # start tag or end tag
718 ## Stay in the state
719 !!!next-input-character;
720 redo A;
721 }
722 } elsif ($self->{state} eq 'before attribute name') {
723 if ($self->{next_input_character} == 0x0009 or # HT
724 $self->{next_input_character} == 0x000A or # LF
725 $self->{next_input_character} == 0x000B or # VT
726 $self->{next_input_character} == 0x000C or # FF
727 $self->{next_input_character} == 0x0020) { # SP
728 ## Stay in the state
729 !!!next-input-character;
730 redo A;
731 } elsif ($self->{next_input_character} == 0x003E) { # >
732 if ($self->{current_token}->{type} eq 'start tag') {
733 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
734 } elsif ($self->{current_token}->{type} eq 'end tag') {
735 $self->{content_model_flag} = 'PCDATA'; # MUST
736 if ($self->{current_token}->{attributes}) {
737 !!!parse-error (type => 'end tag attribute');
738 }
739 } else {
740 die "$0: $self->{current_token}->{type}: Unknown token type";
741 }
742 $self->{state} = 'data';
743 !!!next-input-character;
744
745 !!!emit ($self->{current_token}); # start tag or end tag
746 undef $self->{current_token};
747
748 redo A;
749 } elsif (0x0041 <= $self->{next_input_character} and
750 $self->{next_input_character} <= 0x005A) { # A..Z
751 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
752 value => ''};
753 $self->{state} = 'attribute name';
754 !!!next-input-character;
755 redo A;
756 } elsif ($self->{next_input_character} == 0x002F) { # /
757 !!!next-input-character;
758 if ($self->{next_input_character} == 0x003E and # >
759 $self->{current_token}->{type} eq 'start tag' and
760 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
761 # permitted slash
762 #
763 } else {
764 !!!parse-error (type => 'nestc');
765 }
766 ## Stay in the state
767 # next-input-character is already done
768 redo A;
769 } elsif ($self->{next_input_character} == 0x003C or # <
770 $self->{next_input_character} == -1) {
771 !!!parse-error (type => 'unclosed tag');
772 if ($self->{current_token}->{type} eq 'start tag') {
773 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
774 } elsif ($self->{current_token}->{type} eq 'end tag') {
775 $self->{content_model_flag} = 'PCDATA'; # MUST
776 if ($self->{current_token}->{attributes}) {
777 !!!parse-error (type => 'end tag attribute');
778 }
779 } else {
780 die "$0: $self->{current_token}->{type}: Unknown token type";
781 }
782 $self->{state} = 'data';
783 # reconsume
784
785 !!!emit ($self->{current_token}); # start tag or end tag
786 undef $self->{current_token};
787
788 redo A;
789 } else {
790 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
791 value => ''};
792 $self->{state} = 'attribute name';
793 !!!next-input-character;
794 redo A;
795 }
796 } elsif ($self->{state} eq 'attribute name') {
797 my $before_leave = sub {
798 if (exists $self->{current_token}->{attributes} # start tag or end tag
799 ->{$self->{current_attribute}->{name}}) { # MUST
800 !!!parse-error (type => 'dupulicate attribute');
801 ## Discard $self->{current_attribute} # MUST
802 } else {
803 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
804 = $self->{current_attribute};
805 }
806 }; # $before_leave
807
808 if ($self->{next_input_character} == 0x0009 or # HT
809 $self->{next_input_character} == 0x000A or # LF
810 $self->{next_input_character} == 0x000B or # VT
811 $self->{next_input_character} == 0x000C or # FF
812 $self->{next_input_character} == 0x0020) { # SP
813 $before_leave->();
814 $self->{state} = 'after attribute name';
815 !!!next-input-character;
816 redo A;
817 } elsif ($self->{next_input_character} == 0x003D) { # =
818 $before_leave->();
819 $self->{state} = 'before attribute value';
820 !!!next-input-character;
821 redo A;
822 } elsif ($self->{next_input_character} == 0x003E) { # >
823 $before_leave->();
824 if ($self->{current_token}->{type} eq 'start tag') {
825 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
826 } elsif ($self->{current_token}->{type} eq 'end tag') {
827 $self->{content_model_flag} = 'PCDATA'; # MUST
828 if ($self->{current_token}->{attributes}) {
829 !!!parse-error (type => 'end tag attribute');
830 }
831 } else {
832 die "$0: $self->{current_token}->{type}: Unknown token type";
833 }
834 $self->{state} = 'data';
835 !!!next-input-character;
836
837 !!!emit ($self->{current_token}); # start tag or end tag
838 undef $self->{current_token};
839
840 redo A;
841 } elsif (0x0041 <= $self->{next_input_character} and
842 $self->{next_input_character} <= 0x005A) { # A..Z
843 $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
844 ## Stay in the state
845 !!!next-input-character;
846 redo A;
847 } elsif ($self->{next_input_character} == 0x002F) { # /
848 $before_leave->();
849 !!!next-input-character;
850 if ($self->{next_input_character} == 0x003E and # >
851 $self->{current_token}->{type} eq 'start tag' and
852 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
853 # permitted slash
854 #
855 } else {
856 !!!parse-error (type => 'nestc');
857 }
858 $self->{state} = 'before attribute name';
859 # next-input-character is already done
860 redo A;
861 } elsif ($self->{next_input_character} == 0x003C or # <
862 $self->{next_input_character} == -1) {
863 !!!parse-error (type => 'unclosed tag');
864 $before_leave->();
865 if ($self->{current_token}->{type} eq 'start tag') {
866 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
867 } elsif ($self->{current_token}->{type} eq 'end tag') {
868 $self->{content_model_flag} = 'PCDATA'; # MUST
869 if ($self->{current_token}->{attributes}) {
870 !!!parse-error (type => 'end tag attribute');
871 }
872 } else {
873 die "$0: $self->{current_token}->{type}: Unknown token type";
874 }
875 $self->{state} = 'data';
876 # reconsume
877
878 !!!emit ($self->{current_token}); # start tag or end tag
879 undef $self->{current_token};
880
881 redo A;
882 } else {
883 $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
884 ## Stay in the state
885 !!!next-input-character;
886 redo A;
887 }
888 } elsif ($self->{state} eq 'after attribute name') {
889 if ($self->{next_input_character} == 0x0009 or # HT
890 $self->{next_input_character} == 0x000A or # LF
891 $self->{next_input_character} == 0x000B or # VT
892 $self->{next_input_character} == 0x000C or # FF
893 $self->{next_input_character} == 0x0020) { # SP
894 ## Stay in the state
895 !!!next-input-character;
896 redo A;
897 } elsif ($self->{next_input_character} == 0x003D) { # =
898 $self->{state} = 'before attribute value';
899 !!!next-input-character;
900 redo A;
901 } elsif ($self->{next_input_character} == 0x003E) { # >
902 if ($self->{current_token}->{type} eq 'start tag') {
903 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
904 } elsif ($self->{current_token}->{type} eq 'end tag') {
905 $self->{content_model_flag} = 'PCDATA'; # MUST
906 if ($self->{current_token}->{attributes}) {
907 !!!parse-error (type => 'end tag attribute');
908 }
909 } else {
910 die "$0: $self->{current_token}->{type}: Unknown token type";
911 }
912 $self->{state} = 'data';
913 !!!next-input-character;
914
915 !!!emit ($self->{current_token}); # start tag or end tag
916 undef $self->{current_token};
917
918 redo A;
919 } elsif (0x0041 <= $self->{next_input_character} and
920 $self->{next_input_character} <= 0x005A) { # A..Z
921 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
922 value => ''};
923 $self->{state} = 'attribute name';
924 !!!next-input-character;
925 redo A;
926 } elsif ($self->{next_input_character} == 0x002F) { # /
927 !!!next-input-character;
928 if ($self->{next_input_character} == 0x003E and # >
929 $self->{current_token}->{type} eq 'start tag' and
930 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
931 # permitted slash
932 #
933 } else {
934 !!!parse-error (type => 'nestc');
935 }
936 $self->{state} = 'before attribute name';
937 # next-input-character is already done
938 redo A;
939 } elsif ($self->{next_input_character} == 0x003C or # <
940 $self->{next_input_character} == -1) {
941 !!!parse-error (type => 'unclosed tag');
942 if ($self->{current_token}->{type} eq 'start tag') {
943 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
944 } elsif ($self->{current_token}->{type} eq 'end tag') {
945 $self->{content_model_flag} = 'PCDATA'; # MUST
946 if ($self->{current_token}->{attributes}) {
947 !!!parse-error (type => 'end tag attribute');
948 }
949 } else {
950 die "$0: $self->{current_token}->{type}: Unknown token type";
951 }
952 $self->{state} = 'data';
953 # reconsume
954
955 !!!emit ($self->{current_token}); # start tag or end tag
956 undef $self->{current_token};
957
958 redo A;
959 } else {
960 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
961 value => ''};
962 $self->{state} = 'attribute name';
963 !!!next-input-character;
964 redo A;
965 }
966 } elsif ($self->{state} eq 'before attribute value') {
967 if ($self->{next_input_character} == 0x0009 or # HT
968 $self->{next_input_character} == 0x000A or # LF
969 $self->{next_input_character} == 0x000B or # VT
970 $self->{next_input_character} == 0x000C or # FF
971 $self->{next_input_character} == 0x0020) { # SP
972 ## Stay in the state
973 !!!next-input-character;
974 redo A;
975 } elsif ($self->{next_input_character} == 0x0022) { # "
976 $self->{state} = 'attribute value (double-quoted)';
977 !!!next-input-character;
978 redo A;
979 } elsif ($self->{next_input_character} == 0x0026) { # &
980 $self->{state} = 'attribute value (unquoted)';
981 ## reconsume
982 redo A;
983 } elsif ($self->{next_input_character} == 0x0027) { # '
984 $self->{state} = 'attribute value (single-quoted)';
985 !!!next-input-character;
986 redo A;
987 } elsif ($self->{next_input_character} == 0x003E) { # >
988 if ($self->{current_token}->{type} eq 'start tag') {
989 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
990 } elsif ($self->{current_token}->{type} eq 'end tag') {
991 $self->{content_model_flag} = 'PCDATA'; # MUST
992 if ($self->{current_token}->{attributes}) {
993 !!!parse-error (type => 'end tag attribute');
994 }
995 } else {
996 die "$0: $self->{current_token}->{type}: Unknown token type";
997 }
998 $self->{state} = 'data';
999 !!!next-input-character;
1000
1001 !!!emit ($self->{current_token}); # start tag or end tag
1002 undef $self->{current_token};
1003
1004 redo A;
1005 } elsif ($self->{next_input_character} == 0x003C or # <
1006 $self->{next_input_character} == -1) {
1007 !!!parse-error (type => 'unclosed tag');
1008 if ($self->{current_token}->{type} eq 'start tag') {
1009 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1010 } elsif ($self->{current_token}->{type} eq 'end tag') {
1011 $self->{content_model_flag} = 'PCDATA'; # MUST
1012 if ($self->{current_token}->{attributes}) {
1013 !!!parse-error (type => 'end tag attribute');
1014 }
1015 } else {
1016 die "$0: $self->{current_token}->{type}: Unknown token type";
1017 }
1018 $self->{state} = 'data';
1019 ## reconsume
1020
1021 !!!emit ($self->{current_token}); # start tag or end tag
1022 undef $self->{current_token};
1023
1024 redo A;
1025 } else {
1026 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1027 $self->{state} = 'attribute value (unquoted)';
1028 !!!next-input-character;
1029 redo A;
1030 }
1031 } elsif ($self->{state} eq 'attribute value (double-quoted)') {
1032 if ($self->{next_input_character} == 0x0022) { # "
1033 $self->{state} = 'before attribute name';
1034 !!!next-input-character;
1035 redo A;
1036 } elsif ($self->{next_input_character} == 0x0026) { # &
1037 $self->{last_attribute_value_state} = 'attribute value (double-quoted)';
1038 $self->{state} = 'entity in attribute value';
1039 !!!next-input-character;
1040 redo A;
1041 } elsif ($self->{next_input_character} == -1) {
1042 !!!parse-error (type => 'unclosed attribute value');
1043 if ($self->{current_token}->{type} eq 'start tag') {
1044 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1045 } elsif ($self->{current_token}->{type} eq 'end tag') {
1046 $self->{content_model_flag} = 'PCDATA'; # MUST
1047 if ($self->{current_token}->{attributes}) {
1048 !!!parse-error (type => 'end tag attribute');
1049 }
1050 } else {
1051 die "$0: $self->{current_token}->{type}: Unknown token type";
1052 }
1053 $self->{state} = 'data';
1054 ## reconsume
1055
1056 !!!emit ($self->{current_token}); # start tag or end tag
1057 undef $self->{current_token};
1058
1059 redo A;
1060 } else {
1061 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1062 ## Stay in the state
1063 !!!next-input-character;
1064 redo A;
1065 }
1066 } elsif ($self->{state} eq 'attribute value (single-quoted)') {
1067 if ($self->{next_input_character} == 0x0027) { # '
1068 $self->{state} = 'before attribute name';
1069 !!!next-input-character;
1070 redo A;
1071 } elsif ($self->{next_input_character} == 0x0026) { # &
1072 $self->{last_attribute_value_state} = 'attribute value (single-quoted)';
1073 $self->{state} = 'entity in attribute value';
1074 !!!next-input-character;
1075 redo A;
1076 } elsif ($self->{next_input_character} == -1) {
1077 !!!parse-error (type => 'unclosed attribute value');
1078 if ($self->{current_token}->{type} eq 'start tag') {
1079 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1080 } elsif ($self->{current_token}->{type} eq 'end tag') {
1081 $self->{content_model_flag} = 'PCDATA'; # MUST
1082 if ($self->{current_token}->{attributes}) {
1083 !!!parse-error (type => 'end tag attribute');
1084 }
1085 } else {
1086 die "$0: $self->{current_token}->{type}: Unknown token type";
1087 }
1088 $self->{state} = 'data';
1089 ## reconsume
1090
1091 !!!emit ($self->{current_token}); # start tag or end tag
1092 undef $self->{current_token};
1093
1094 redo A;
1095 } else {
1096 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1097 ## Stay in the state
1098 !!!next-input-character;
1099 redo A;
1100 }
1101 } elsif ($self->{state} eq 'attribute value (unquoted)') {
1102 if ($self->{next_input_character} == 0x0009 or # HT
1103 $self->{next_input_character} == 0x000A or # LF
1104 $self->{next_input_character} == 0x000B or # VT
1105 $self->{next_input_character} == 0x000C or # FF
1106 $self->{next_input_character} == 0x0020) { # SP
1107 $self->{state} = 'before attribute name';
1108 !!!next-input-character;
1109 redo A;
1110 } elsif ($self->{next_input_character} == 0x0026) { # &
1111 $self->{last_attribute_value_state} = 'attribute value (unquoted)';
1112 $self->{state} = 'entity in attribute value';
1113 !!!next-input-character;
1114 redo A;
1115 } elsif ($self->{next_input_character} == 0x003E) { # >
1116 if ($self->{current_token}->{type} eq 'start tag') {
1117 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1118 } elsif ($self->{current_token}->{type} eq 'end tag') {
1119 $self->{content_model_flag} = 'PCDATA'; # MUST
1120 if ($self->{current_token}->{attributes}) {
1121 !!!parse-error (type => 'end tag attribute');
1122 }
1123 } else {
1124 die "$0: $self->{current_token}->{type}: Unknown token type";
1125 }
1126 $self->{state} = 'data';
1127 !!!next-input-character;
1128
1129 !!!emit ($self->{current_token}); # start tag or end tag
1130 undef $self->{current_token};
1131
1132 redo A;
1133 } elsif ($self->{next_input_character} == 0x003C or # <
1134 $self->{next_input_character} == -1) {
1135 !!!parse-error (type => 'unclosed tag');
1136 if ($self->{current_token}->{type} eq 'start tag') {
1137 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1138 } elsif ($self->{current_token}->{type} eq 'end tag') {
1139 $self->{content_model_flag} = 'PCDATA'; # MUST
1140 if ($self->{current_token}->{attributes}) {
1141 !!!parse-error (type => 'end tag attribute');
1142 }
1143 } else {
1144 die "$0: $self->{current_token}->{type}: Unknown token type";
1145 }
1146 $self->{state} = 'data';
1147 ## reconsume
1148
1149 !!!emit ($self->{current_token}); # start tag or end tag
1150 undef $self->{current_token};
1151
1152 redo A;
1153 } else {
1154 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1155 ## Stay in the state
1156 !!!next-input-character;
1157 redo A;
1158 }
1159 } elsif ($self->{state} eq 'entity in attribute value') {
1160 my $token = $self->_tokenize_attempt_to_consume_an_entity;
1161
1162 unless (defined $token) {
1163 $self->{current_attribute}->{value} .= '&';
1164 } else {
1165 $self->{current_attribute}->{value} .= $token->{data};
1166 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1167 }
1168
1169 $self->{state} = $self->{last_attribute_value_state};
1170 # next-input-character is already done
1171 redo A;
1172 } elsif ($self->{state} eq 'bogus comment') {
1173 ## (only happen if PCDATA state)
1174
1175 my $token = {type => 'comment', data => ''};
1176
1177 BC: {
1178 if ($self->{next_input_character} == 0x003E) { # >
1179 $self->{state} = 'data';
1180 !!!next-input-character;
1181
1182 !!!emit ($token);
1183
1184 redo A;
1185 } elsif ($self->{next_input_character} == -1) {
1186 $self->{state} = 'data';
1187 ## reconsume
1188
1189 !!!emit ($token);
1190
1191 redo A;
1192 } else {
1193 $token->{data} .= chr ($self->{next_input_character});
1194 !!!next-input-character;
1195 redo BC;
1196 }
1197 } # BC
1198 } elsif ($self->{state} eq 'markup declaration open') {
1199 ## (only happen if PCDATA state)
1200
1201 my @next_char;
1202 push @next_char, $self->{next_input_character};
1203
1204 if ($self->{next_input_character} == 0x002D) { # -
1205 !!!next-input-character;
1206 push @next_char, $self->{next_input_character};
1207 if ($self->{next_input_character} == 0x002D) { # -
1208 $self->{current_token} = {type => 'comment', data => ''};
1209 $self->{state} = 'comment';
1210 !!!next-input-character;
1211 redo A;
1212 }
1213 } elsif ($self->{next_input_character} == 0x0044 or # D
1214 $self->{next_input_character} == 0x0064) { # d
1215 !!!next-input-character;
1216 push @next_char, $self->{next_input_character};
1217 if ($self->{next_input_character} == 0x004F or # O
1218 $self->{next_input_character} == 0x006F) { # o
1219 !!!next-input-character;
1220 push @next_char, $self->{next_input_character};
1221 if ($self->{next_input_character} == 0x0043 or # C
1222 $self->{next_input_character} == 0x0063) { # c
1223 !!!next-input-character;
1224 push @next_char, $self->{next_input_character};
1225 if ($self->{next_input_character} == 0x0054 or # T
1226 $self->{next_input_character} == 0x0074) { # t
1227 !!!next-input-character;
1228 push @next_char, $self->{next_input_character};
1229 if ($self->{next_input_character} == 0x0059 or # Y
1230 $self->{next_input_character} == 0x0079) { # y
1231 !!!next-input-character;
1232 push @next_char, $self->{next_input_character};
1233 if ($self->{next_input_character} == 0x0050 or # P
1234 $self->{next_input_character} == 0x0070) { # p
1235 !!!next-input-character;
1236 push @next_char, $self->{next_input_character};
1237 if ($self->{next_input_character} == 0x0045 or # E
1238 $self->{next_input_character} == 0x0065) { # e
1239 ## ISSUE: What a stupid code this is!
1240 $self->{state} = 'DOCTYPE';
1241 !!!next-input-character;
1242 redo A;
1243 }
1244 }
1245 }
1246 }
1247 }
1248 }
1249 }
1250
1251 !!!parse-error (type => 'bogus comment open');
1252 $self->{next_input_character} = shift @next_char;
1253 !!!back-next-input-character (@next_char);
1254 $self->{state} = 'bogus comment';
1255 redo A;
1256
1257 ## ISSUE: typos in spec: chacacters, is is a parse error
1258 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1259 } elsif ($self->{state} eq 'comment') {
1260 if ($self->{next_input_character} == 0x002D) { # -
1261 $self->{state} = 'comment dash';
1262 !!!next-input-character;
1263 redo A;
1264 } elsif ($self->{next_input_character} == -1) {
1265 !!!parse-error (type => 'unclosed comment');
1266 $self->{state} = 'data';
1267 ## reconsume
1268
1269 !!!emit ($self->{current_token}); # comment
1270 undef $self->{current_token};
1271
1272 redo A;
1273 } else {
1274 $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1275 ## Stay in the state
1276 !!!next-input-character;
1277 redo A;
1278 }
1279 } elsif ($self->{state} eq 'comment dash') {
1280 if ($self->{next_input_character} == 0x002D) { # -
1281 $self->{state} = 'comment end';
1282 !!!next-input-character;
1283 redo A;
1284 } elsif ($self->{next_input_character} == -1) {
1285 !!!parse-error (type => 'unclosed comment');
1286 $self->{state} = 'data';
1287 ## reconsume
1288
1289 !!!emit ($self->{current_token}); # comment
1290 undef $self->{current_token};
1291
1292 redo A;
1293 } else {
1294 $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1295 $self->{state} = 'comment';
1296 !!!next-input-character;
1297 redo A;
1298 }
1299 } elsif ($self->{state} eq 'comment end') {
1300 if ($self->{next_input_character} == 0x003E) { # >
1301 $self->{state} = 'data';
1302 !!!next-input-character;
1303
1304 !!!emit ($self->{current_token}); # comment
1305 undef $self->{current_token};
1306
1307 redo A;
1308 } elsif ($self->{next_input_character} == 0x002D) { # -
1309 !!!parse-error (type => 'dash in comment');
1310 $self->{current_token}->{data} .= '-'; # comment
1311 ## Stay in the state
1312 !!!next-input-character;
1313 redo A;
1314 } elsif ($self->{next_input_character} == -1) {
1315 !!!parse-error (type => 'unclosed comment');
1316 $self->{state} = 'data';
1317 ## reconsume
1318
1319 !!!emit ($self->{current_token}); # comment
1320 undef $self->{current_token};
1321
1322 redo A;
1323 } else {
1324 !!!parse-error (type => 'dash in comment');
1325 $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1326 $self->{state} = 'comment';
1327 !!!next-input-character;
1328 redo A;
1329 }
1330 } elsif ($self->{state} eq 'DOCTYPE') {
1331 if ($self->{next_input_character} == 0x0009 or # HT
1332 $self->{next_input_character} == 0x000A or # LF
1333 $self->{next_input_character} == 0x000B or # VT
1334 $self->{next_input_character} == 0x000C or # FF
1335 $self->{next_input_character} == 0x0020) { # SP
1336 $self->{state} = 'before DOCTYPE name';
1337 !!!next-input-character;
1338 redo A;
1339 } else {
1340 !!!parse-error (type => 'no space before DOCTYPE name');
1341 $self->{state} = 'before DOCTYPE name';
1342 ## reconsume
1343 redo A;
1344 }
1345 } elsif ($self->{state} eq 'before DOCTYPE name') {
1346 if ($self->{next_input_character} == 0x0009 or # HT
1347 $self->{next_input_character} == 0x000A or # LF
1348 $self->{next_input_character} == 0x000B or # VT
1349 $self->{next_input_character} == 0x000C or # FF
1350 $self->{next_input_character} == 0x0020) { # SP
1351 ## Stay in the state
1352 !!!next-input-character;
1353 redo A;
1354 } elsif (0x0061 <= $self->{next_input_character} and
1355 $self->{next_input_character} <= 0x007A) { # a..z
1356 ## ISSUE: "Set the token's name name to the" in the spec
1357 $self->{current_token} = {type => 'DOCTYPE',
1358 name => chr ($self->{next_input_character} - 0x0020),
1359 error => 1};
1360 $self->{state} = 'DOCTYPE name';
1361 !!!next-input-character;
1362 redo A;
1363 } elsif ($self->{next_input_character} == 0x003E) { # >
1364 !!!parse-error (type => 'no DOCTYPE name');
1365 $self->{state} = 'data';
1366 !!!next-input-character;
1367
1368 !!!emit ({type => 'DOCTYPE', name => '', error => 1});
1369
1370 redo A;
1371 } elsif ($self->{next_input_character} == -1) {
1372 !!!parse-error (type => 'no DOCTYPE name');
1373 $self->{state} = 'data';
1374 ## reconsume
1375
1376 !!!emit ({type => 'DOCTYPE', name => '', error => 1});
1377
1378 redo A;
1379 } else {
1380 $self->{current_token} = {type => 'DOCTYPE',
1381 name => chr ($self->{next_input_character}),
1382 error => 1};
1383 ## ISSUE: "Set the token's name name to the" in the spec
1384 $self->{state} = 'DOCTYPE name';
1385 !!!next-input-character;
1386 redo A;
1387 }
1388 } elsif ($self->{state} eq 'DOCTYPE name') {
1389 if ($self->{next_input_character} == 0x0009 or # HT
1390 $self->{next_input_character} == 0x000A or # LF
1391 $self->{next_input_character} == 0x000B or # VT
1392 $self->{next_input_character} == 0x000C or # FF
1393 $self->{next_input_character} == 0x0020) { # SP
1394 $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1395 $self->{state} = 'after DOCTYPE name';
1396 !!!next-input-character;
1397 redo A;
1398 } elsif ($self->{next_input_character} == 0x003E) { # >
1399 $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1400 $self->{state} = 'data';
1401 !!!next-input-character;
1402
1403 !!!emit ($self->{current_token}); # DOCTYPE
1404 undef $self->{current_token};
1405
1406 redo A;
1407 } elsif (0x0061 <= $self->{next_input_character} and
1408 $self->{next_input_character} <= 0x007A) { # a..z
1409 $self->{current_token}->{name} .= chr ($self->{next_input_character} - 0x0020); # DOCTYPE
1410 #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
1411 ## Stay in the state
1412 !!!next-input-character;
1413 redo A;
1414 } elsif ($self->{next_input_character} == -1) {
1415 !!!parse-error (type => 'unclosed DOCTYPE');
1416 $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1417 $self->{state} = 'data';
1418 ## reconsume
1419
1420 !!!emit ($self->{current_token});
1421 undef $self->{current_token};
1422
1423 redo A;
1424 } else {
1425 $self->{current_token}->{name}
1426 .= chr ($self->{next_input_character}); # DOCTYPE
1427 #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
1428 ## Stay in the state
1429 !!!next-input-character;
1430 redo A;
1431 }
1432 } elsif ($self->{state} eq 'after DOCTYPE name') {
1433 if ($self->{next_input_character} == 0x0009 or # HT
1434 $self->{next_input_character} == 0x000A or # LF
1435 $self->{next_input_character} == 0x000B or # VT
1436 $self->{next_input_character} == 0x000C or # FF
1437 $self->{next_input_character} == 0x0020) { # SP
1438 ## Stay in the state
1439 !!!next-input-character;
1440 redo A;
1441 } elsif ($self->{next_input_character} == 0x003E) { # >
1442 $self->{state} = 'data';
1443 !!!next-input-character;
1444
1445 !!!emit ($self->{current_token}); # DOCTYPE
1446 undef $self->{current_token};
1447
1448 redo A;
1449 } elsif ($self->{next_input_character} == -1) {
1450 !!!parse-error (type => 'unclosed DOCTYPE');
1451 $self->{state} = 'data';
1452 ## reconsume
1453
1454 !!!emit ($self->{current_token}); # DOCTYPE
1455 undef $self->{current_token};
1456
1457 redo A;
1458 } else {
1459 !!!parse-error (type => 'string after DOCTYPE name');
1460 $self->{current_token}->{error} = 1; # DOCTYPE
1461 $self->{state} = 'bogus DOCTYPE';
1462 !!!next-input-character;
1463 redo A;
1464 }
1465 } elsif ($self->{state} eq 'bogus DOCTYPE') {
1466 if ($self->{next_input_character} == 0x003E) { # >
1467 $self->{state} = 'data';
1468 !!!next-input-character;
1469
1470 !!!emit ($self->{current_token}); # DOCTYPE
1471 undef $self->{current_token};
1472
1473 redo A;
1474 } elsif ($self->{next_input_character} == -1) {
1475 !!!parse-error (type => 'unclosed DOCTYPE');
1476 $self->{state} = 'data';
1477 ## reconsume
1478
1479 !!!emit ($self->{current_token}); # DOCTYPE
1480 undef $self->{current_token};
1481
1482 redo A;
1483 } else {
1484 ## Stay in the state
1485 !!!next-input-character;
1486 redo A;
1487 }
1488 } else {
1489 die "$0: $self->{state}: Unknown state";
1490 }
1491 } # A
1492
1493 die "$0: _get_next_token: unexpected case";
1494 } # _get_next_token
1495
## Attempt to consume an entity (a character reference) from the input.
##
## The caller is expected to have consumed the "&" already: on entry
## |$self->{next_input_character}| is the character that follows it.
##
## Returns a character token (a hash reference whose |type| is
## 'character' and whose |data| is the replacement text) when a
## reference is recognized.  Returns |undef| when nothing can be
## recognized; in that case a parse error has been reported, whatever
## was read ahead has been handed back (as input characters, or as a
## pushed-back character token for names), and the "&" itself is left
## for the caller to handle.
sub _tokenize_attempt_to_consume_an_entity ($) {
  my $self = shift;

  if ($self->{next_input_character} == 0x0023) { # #
    !!!next-input-character;
    if ($self->{next_input_character} == 0x0078 or # x
        $self->{next_input_character} == 0x0058) { # X
      ## Hexadecimal numeric character reference
      my $num;
      X: {
        ## NOTE: Holds the |x| or |X| during the first pass only.  Its
        ## single user, the "bare hcro" branch, is unreachable in later
        ## passes because |$num| is defined by then.
        my $x_char = $self->{next_input_character};
        !!!next-input-character;
        if (0x0030 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x0039) { # 0..9
          $num ||= 0;
          $num *= 0x10;
          $num += $self->{next_input_character} - 0x0030;
          redo X;
        } elsif (0x0061 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x0066) { # a..f
          ## ISSUE: the spec says U+0078, which is apparently incorrect
          $num ||= 0;
          $num *= 0x10;
          $num += $self->{next_input_character} - 0x0060 + 9;
          redo X;
        } elsif (0x0041 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x0046) { # A..F
          ## ISSUE: the spec says U+0058, which is apparently incorrect
          $num ||= 0;
          $num *= 0x10;
          $num += $self->{next_input_character} - 0x0040 + 9;
          redo X;
        } elsif (not defined $num) { # no hexadecimal digit
          !!!parse-error (type => 'bare hcro');
          ## Hand back everything read after the "#": the |x| and the
          ## character following it.  The latter has to be saved before
          ## the current character is replaced by "#", or it is lost.
          !!!back-next-input-character ($x_char, $self->{next_input_character});
          $self->{next_input_character} = 0x0023; # #
          return undef;
        } elsif ($self->{next_input_character} == 0x003B) { # ;
          !!!next-input-character;
        } else {
          !!!parse-error (type => 'no refc');
        }

        ## TODO: check the definition for |a valid Unicode character|.
        ## <http://lists.whatwg.org/pipermail/whatwg-whatwg.org/2006-December/thread.html#8189>
        if ($num > 1114111 or $num == 0) {
          $num = 0xFFFD; # REPLACEMENT CHARACTER
          ## ISSUE: Why this is not an error?
        } elsif (0x80 <= $num and $num <= 0x9F) {
          ## C1 range: report it and substitute via the file-level table
          !!!parse-error (type => sprintf 'c1 entity:U+%04X', $num);
          $num = $c1_entity_char->{$num};
        }

        return {type => 'character', data => chr $num};
      } # X
    } elsif (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
      ## Decimal numeric character reference
      my $code = $self->{next_input_character} - 0x0030;
      !!!next-input-character;

      while (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
        $code *= 10;
        $code += $self->{next_input_character} - 0x0030;

        !!!next-input-character;
      }

      if ($self->{next_input_character} == 0x003B) { # ;
        !!!next-input-character;
      } else {
        !!!parse-error (type => 'no refc');
      }

      ## TODO: check the definition for |a valid Unicode character|.
      if ($code > 1114111 or $code == 0) {
        $code = 0xFFFD; # REPLACEMENT CHARACTER
        ## ISSUE: Why this is not an error?
      } elsif (0x80 <= $code and $code <= 0x9F) {
        ## C1 range: report it and substitute via the file-level table
        !!!parse-error (type => sprintf 'c1 entity:U+%04X', $code);
        $code = $c1_entity_char->{$code};
      }

      return {type => 'character', data => chr $code};
    } else {
      ## "&#" followed by neither |x|/|X| nor a digit: hand back the
      ## current character and make "#" the current character again.
      !!!parse-error (type => 'bare nero');
      !!!back-next-input-character ($self->{next_input_character});
      $self->{next_input_character} = 0x0023; # #
      return undef;
    }
  } elsif ((0x0041 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x005A) or
           (0x0061 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x007A)) {
    ## Named reference: read alphanumerics, remembering the longest
    ## name found in the entity table.
    my $entity_name = chr $self->{next_input_character};
    !!!next-input-character;

    ## Replacement text so far; until a name matches this is simply
    ## the text that has been read.
    my $value = $entity_name;
    my $match;
    ## True when characters have been read past the most recent match
    my $trailing;

    while (length $entity_name < 10 and
           ## NOTE: Some number greater than the maximum length of entity name
           ((0x0041 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x005A) or
            (0x0061 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x007A) or
            (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039))) {
      $entity_name .= chr $self->{next_input_character};
      if (defined $entity_char->{$entity_name}) {
        $value = $entity_char->{$entity_name};
        $match = 1;
        $trailing = 0;
      } else {
        ## Not (yet) a known name: keep the character as literal text
        ## after whatever has been matched so far.
        $value .= chr $self->{next_input_character};
        $trailing = 1;
      }
      !!!next-input-character;
    }

    if ($match) {
      ## A ";" terminates the reference only when it directly follows
      ## the matched name.  If literal characters were read after the
      ## match, the reference was unterminated and the ";" is ordinary
      ## text that must stay in the input.
      if (not $trailing and
          $self->{next_input_character} == 0x003B) { # ;
        !!!next-input-character;
      } else {
        !!!parse-error (type => 'no refc');
      }

      return {type => 'character', data => $value};
    } else {
      !!!parse-error (type => 'bare ero');
      ## NOTE: No characters are consumed in the spec.
      !!!back-token ({type => 'character', data => $value});
      return undef;
    }
  } else {
    ## no characters are consumed
    !!!parse-error (type => 'bare ero');
    return undef;
  }
} # _tokenize_attempt_to_consume_an_entity
1633
## Prepare the document for the tree construction stage by switching
## its strict error checking off.
##
## NOTE: |$self->{document}| MUST be set before this method is called.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  ## TODO: Mark the Document as an HTML document # MUST
  return $document->strict_error_checking (0);
} # _initialize_tree_constructor
1642
## Finish the tree construction stage by switching the document's
## strict error checking back on.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  ## TODO: Turn mutation events on
  return $document->strict_error_checking (1);
} # _terminate_tree_constructor
1648
1649 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
1650
1651 { # tree construction stage
1652 my $token;
1653
## Entry point of the tree construction stage: fetch the first token,
## reset the per-parse state and run the three phases in order.
sub _construct_tree ($) {
  my $self = shift;

  ## NOTE: It is not defined when an interactive UA makes
  ## |$self->{document}| available to the user, nor when it begins to
  ## accept user input.

  ## NOTE: "Append a character" means: collect it together with all
  ## directly following characters and insert a single Text node whose
  ## data is the concatenation of those characters. # MUST

  !!!next-token;

  ## Start from a clean state
  $self->{open_elements} = [];
  $self->{insertion_mode} = 'before head';
  $self->{$_} = undef for qw(form_element head_element inner_html_node);

  ## Phases, in order
  $self->_tree_construction_initial; # MUST
  $self->_tree_construction_root_element;
  $self->_tree_construction_main;
} # _construct_tree
1677
## Initial phase of tree construction: handle whatever precedes the
## root element.
##
## Leading white space is appended to the document as text.  A DOCTYPE
## token is turned into a document type node and consumed.  Any other
## token is reported as a missing DOCTYPE and left as the current
## token, to be reprocessed by the next phase.
sub _tree_construction_initial ($) {
  my ($self) = @_;

  TOKEN: while (1) {
    my $type = $token->{type};

    if ($type eq 'DOCTYPE') {
      if ($token->{error}) {
        ## ISSUE: Spec currently left this case undefined.
        !!!parse-error (type => 'bogus DOCTYPE');
      }
      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name});
      $self->{document}->append_child ($doctype);
      ## Next phase: root element
      !!!next-token;
      return;
    }

    if ($type eq 'character') {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
        $self->{document}->manakai_append_text ($1);
        ## ISSUE: DOM3 Core does not allow Document > Text
        if (not length $token->{data}) {
          ## Nothing but white space: stay in the phase
          !!!next-token;
          next TOKEN;
        }
      }
      ## ISSUE: Spec currently left this case undefined.
      !!!parse-error (type => 'missing DOCTYPE');
      ## Next phase: root element (reprocess the token)
      return;
    }

    if ($type eq 'comment' or
        $type eq 'start tag' or
        $type eq 'end tag' or
        $type eq 'end-of-file') {
      ## ISSUE: Spec currently left this case undefined.
      !!!parse-error (type => 'missing DOCTYPE');
      ## Next phase: root element (reprocess the token)
      return;
    }

    die "$0: $token->{type}: Unknown token";
  } # TOKEN
} # _tree_construction_initial
1726
## Root element phase of tree construction.
##
## Stray DOCTYPE tokens are reported and dropped, comments and leading
## white space are appended to the document.  At the first other token
## an |html| element is created, appended to the document and pushed
## onto the stack of open elements; that token is left as the current
## token, to be reprocessed by the main phase.
sub _tree_construction_root_element ($) {
  my ($self) = @_;

  TOKEN: while (1) {
    my $type = $token->{type};

    if ($type eq 'DOCTYPE') {
      !!!parse-error (type => 'in html:#DOCTYPE');
      ## Ignore the token and stay in the phase
      !!!next-token;
      next TOKEN;
    }

    if ($type eq 'comment') {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the phase
      !!!next-token;
      next TOKEN;
    }

    if ($type eq 'character') {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
        $self->{document}->manakai_append_text ($1);
        ## ISSUE: DOM3 Core does not allow Document > Text
        if (not length $token->{data}) {
          ## Nothing but white space: stay in the phase
          !!!next-token;
          next TOKEN;
        }
      }
      last TOKEN;
    }

    ## ISSUE: There is an issue in the spec
    last TOKEN if $type eq 'start tag' or
                  $type eq 'end tag' or
                  $type eq 'end-of-file';

    die "$0: $token->{type}: Unknown token";
  } # TOKEN

  ## Create the root element; the current token is reprocessed by the
  ## main phase.
  my $root_element; !!!create-element ($root_element, 'html');
  $self->{document}->append_child ($root_element);
  push @{$self->{open_elements}}, [$root_element, 'html'];
  return;
} # _tree_construction_root_element
1773
## Reset the insertion mode appropriately.
##
## Walks the stack of open elements (|$self->{open_elements}|, a list
## of |[node, local name]| pairs) from the current node towards the
## root and sets |$self->{insertion_mode}| according to the first
## element that determines it.  Falls back to 'in body' once the first
## element of the stack has been examined without a result.
##
## When |$self->{inner_html_node}| is defined (the innerHTML case) and
## it is neither a |td| nor a |th| element, it stands in for the first
## element of the stack.  It replaces that element only: elements
## deeper in the stack are still examined as they are.
##
## Returns nothing.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Local name => insertion mode (Steps 4..13)
  my %mode_for = (
    select => 'in select',
    td => 'in cell',
    th => 'in cell',
    tr => 'in row',
    tbody => 'in table body',
    thead => 'in table head',
    tfoot => 'in table foot',
    caption => 'in caption',
    colgroup => 'in column group',
    table => 'in table',
    head => 'in body', # not in head!
    body => 'in body',
    frameset => 'in frameset',
  );

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      ## The context element is substituted for the first element of
      ## the stack only, never for the elements examined before it.
      if (defined $self->{inner_html_node}) {
        unless ($self->{inner_html_node}->[1] eq 'td' or
                $self->{inner_html_node}->[1] eq 'th') {
          $node = $self->{inner_html_node};
        }
      }
    }

    ## Step 4..13
    my $new_mode = $mode_for{$node->[1]};
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 14
    if ($node->[1] eq 'html') {
      unless (defined $self->{head_element}) {
        $self->{insertion_mode} = 'before head';
      } else {
        $self->{insertion_mode} = 'after head';
      }
      return;
    }

    ## Step 15
    if ($last) {
      $self->{insertion_mode} = 'in body';
      return;
    }

    ## Step 16
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 17
    redo S3;
  } # S3
} # _reset_insertion_mode
1835
1836 sub _tree_construction_main ($) {
1837 my $self = shift;
1838
1839 my $phase = 'main';
1840
1841 my $active_formatting_elements = [];
1842
1843 my $reconstruct_active_formatting_elements = sub { # MUST
1844 my $insert = shift;
1845
1846 ## Step 1
1847 return unless @$active_formatting_elements;
1848
1849 ## Step 3
1850 my $i = -1;
1851 my $entry = $active_formatting_elements->[$i];
1852
1853 ## Step 2
1854 return if $entry->[0] eq '#marker';
1855 for (@{$self->{open_elements}}) {
1856 if ($entry->[0] eq $_->[0]) {
1857 return;
1858 }
1859 }
1860
1861 S4: {
1862 ## Step 4
1863 last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
1864
1865 ## Step 5
1866 $i--;
1867 $entry = $active_formatting_elements->[$i];
1868
1869 ## Step 6
1870 if ($entry->[0] eq '#marker') {
1871 #
1872 } else {
1873 my $in_open_elements;
1874 OE: for (@{$self->{open_elements}}) {
1875 if ($entry->[0] eq $_->[0]) {
1876 $in_open_elements = 1;
1877 last OE;
1878 }
1879 }
1880 if ($in_open_elements) {
1881 #
1882 } else {
1883 redo S4;
1884 }
1885 }
1886
1887 ## Step 7
1888 $i++;
1889 $entry = $active_formatting_elements->[$i];
1890 } # S4
1891
1892 S7: {
1893 ## Step 8
1894 my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
1895
1896 ## Step 9
1897 $insert->($clone->[0]);
1898 push @{$self->{open_elements}}, $clone;
1899
1900 ## Step 10
1901 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
1902
1903 ## Step 11
1904 unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
1905 ## Step 7'
1906 $i++;
1907 $entry = $active_formatting_elements->[$i];
1908
1909 redo S7;
1910 }
1911 } # S7
1912 }; # $reconstruct_active_formatting_elements
1913
1914 my $clear_up_to_marker = sub {
1915 for (reverse 0..$#$active_formatting_elements) {
1916 if ($active_formatting_elements->[$_]->[0] eq '#marker') {
1917 splice @$active_formatting_elements, $_;
1918 return;
1919 }
1920 }
1921 }; # $clear_up_to_marker
1922
1923 my $style_start_tag = sub {
1924 my $style_el; !!!create-element ($style_el, 'style', $token->{attributes});
1925 ## $self->{insertion_mode} eq 'in head' and ... (always true)
1926 (($self->{insertion_mode} eq 'in head' and defined $self->{head_element})
1927 ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
1928 ->append_child ($style_el);
1929 $self->{content_model_flag} = 'CDATA';
1930
1931 my $text = '';
1932 !!!next-token;
1933 while ($token->{type} eq 'character') {
1934 $text .= $token->{data};
1935 !!!next-token;
1936 } # stop if non-character token or tokenizer stops tokenising
1937 if (length $text) {
1938 $style_el->manakai_append_text ($text);
1939 }
1940
1941 $self->{content_model_flag} = 'PCDATA';
1942
1943 if ($token->{type} eq 'end tag' and $token->{tag_name} eq 'style') {
1944 ## Ignore the token
1945 } else {
1946 !!!parse-error (type => 'in CDATA:#'.$token->{type});
1947 ## ISSUE: And ignore?
1948 }
1949 !!!next-token;
1950 }; # $style_start_tag
1951
## Process a |script| start tag (the current |$token|).
##
## Creates the element, collects the following character tokens as its
## text under the CDATA content model, reports a parse error if the run
## is not terminated by a |script| end tag, and - unless the parser is
## working on behalf of |inner_html_node| - appends the element to the
## |head| element or to the current node.  Script execution itself is
## not implemented (see the TODO notes).
my $script_start_tag = sub {
  my $script_el;
  !!!create-element ($script_el, 'script', $token->{attributes});
  ## TODO: mark as "parser-inserted"

  $self->{content_model_flag} = 'CDATA';

  ## Gather consecutive character tokens; the loop stops at the first
  ## non-character token or when the tokenizer stops tokenising.
  my @chunks;
  !!!next-token;
  while ($token->{type} eq 'character') {
    push @chunks, $token->{data};
    !!!next-token;
  }
  my $content = join '', @chunks;
  $script_el->manakai_append_text ($content) if length $content;

  $self->{content_model_flag} = 'PCDATA';

  unless ($token->{type} eq 'end tag' and $token->{tag_name} eq 'script') {
    !!!parse-error (type => 'in CDATA:#'.$token->{type});
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  }

  if (defined $self->{inner_html_node}) {
    ## TODO: mark as "already executed"
    ## NOTE: The element is not appended anywhere in this case.
  } else {
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    my $parent_el;
    if ($self->{insertion_mode} eq 'in head' and
        defined $self->{head_element}) {
      $parent_el = $self->{head_element};
    } else {
      $parent_el = $self->{open_elements}->[-1]->[0];
    }
    $parent_el->append_child ($script_el);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  }

  ## The terminating token is skipped whether or not it matched.
  !!!next-token;
}; # $script_start_tag
1996
## Process an end tag for a formatting element (the "adoption agency"
## steps; the "Step N" comments follow the numbering used by the
## original author).
##
## Argument:
##   $tag_name - tag name of the end tag being processed.
##
## The sub rewrites |$active_formatting_elements|, the stack
## |$self->{open_elements}| and the DOM tree, and invokes
## |!!!next-token| before every |return|.  Entries of both lists are
## two-member array references: [node (or the string '#marker'),
## tag name].
##
## Fixes relative to the previous revision:
##   - When the formatting element is not on the stack of open
##     elements, *that* entry is removed from the list of active
##     formatting elements (previously the last entry was popped,
##     whatever it was).
##   - The bookmark no longer wraps around to the last list entry when
##     the formatting element is the first entry of the list.
##   - The bookmark follows a node that gets replaced by its clone.
##   - Only the first bookmark match (seen from the end of the list) is
##     used, so an earlier '#marker' entry cannot override it.
##   - The clone of the formatting element is *inserted* after the
##     furthest block in the stack of open elements; previously the
##     entry following the furthest block was overwritten and lost.
my $formatting_end_tag = sub {
  my $tag_name = shift;

  FET: {
    ## Step 1
    ## Find the last active formatting entry with this tag name that
    ## comes after the last marker.
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!parse-error (type => 'unmatched end tag:'.$tag_name);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ({
                table => 1, caption => 1, td => 1, th => 1,
                button => 1, marquee => 1, object => 1, html => 1,
               }->{$node->[1]}) {
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
      ## Remove the formatting element itself; it is not necessarily
      ## the last entry of the list.
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
    }

    ## Step 2
    ## The furthest block is the topmost qualifying node below the
    ## formatting element: the scan runs bottom-up and keeps
    ## overwriting, so the last assignment wins.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not $formatting_category->{$node->[1]} and
          #not $phrasing_category->{$node->[1]} and
          ($special_category->{$node->[1]} or
           $scoping_category->{$node->[1]})) {
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      ## Simple case: pop the stack up to and including the formatting
      ## element and drop its entry from the list.
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## The bookmark is stored as "the entry after which the clone is
    ## to be inserted"; |undef| means "at the head of the list".
    my $bookmark_prev_el = $formatting_element_i_in_active > 0
        ? $active_formatting_elements->[$formatting_element_i_in_active - 1]
              ->[0]
        : undef;

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        ## Not an active formatting element: drop it from the stack
        ## and continue with the entry above it.
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      if ($node->[0]->has_child_nodes ()) {
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        ## The clone took over node's list slot, so the bookmark has to
        ## refer to the clone to be found again in step 12.
        if (defined $bookmark_prev_el and $bookmark_prev_el eq $node->[0]) {
          $bookmark_prev_el = $clone->[0];
        }
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    $common_ancestor_node->[0]->append_child ($last_node->[0]);

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## Copy the child list first: the nodes are moved while iterating.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## Remove the formatting element from the list and insert the clone
    ## at the bookmark.  |$i| is the index of the bookmark entry.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$active_formatting_elements, $_, 1;
        if (defined $i) {
          ## The bookmark entry was below the removed entry and has
          ## therefore moved up by one.
          $i--;
          last AFE;
        }
      } elsif (not defined $i and
               defined $bookmark_prev_el and
               $active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, (defined $i ? $i + 1 : 0), 0, $clone;

    ## Step 13
    ## Remove the formatting element from the stack and insert the
    ## clone immediately below the furthest block.
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        splice @{$self->{open_elements}}, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        $i = $_;
      }
    } # OE
    ## Length 0: insert without discarding the entry that follows the
    ## furthest block.
    splice @{$self->{open_elements}}, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
2179
## Append the given node as the last child of the current node, i.e.
## the node held by the last entry of |$self->{open_elements}|.
my $insert_to_current = sub {
  my $child = shift;
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($child);
}; # $insert_to_current
2183
## Insert the given node, applying foster parenting when the current
## node is a table-structure element.
##
## If the current node is |table|, |tbody|, |tfoot|, |thead| or |tr|,
## the node is placed relative to the last |table| on the stack of
## open elements: just before that table when the table's parent has
## node type 1, otherwise appended to the stack entry above the table.
## When the stack holds no |table| the first stack entry is used.
## For any other current node the child is simply appended to it.
my $insert_to_foster = sub {
  my $child = shift;
  my $stack = $self->{open_elements};

  my $needs_foster_parent = {
    table => 1, tbody => 1, tfoot => 1,
    thead => 1, tr => 1,
  }->{$stack->[-1]->[1]};
  unless ($needs_foster_parent) {
    return $stack->[-1]->[0]->append_child ($child);
  }

  # MUST
  my $foster_parent_element;
  my $next_sibling;
  for my $pos (reverse 0..$#$stack) {
    next unless $stack->[$pos]->[1] eq 'table';

    my $table_el = $stack->[$pos]->[0];
    my $parent = $table_el->parent_node;
    if (defined $parent and $parent->node_type == 1) {
      $foster_parent_element = $parent;
      $next_sibling = $table_el;
    } else {
      $foster_parent_element = $stack->[$pos - 1]->[0];
    }
    last;
  }
  unless (defined $foster_parent_element) {
    $foster_parent_element = $stack->[0]->[0];
  }

  ## |$next_sibling| stays undefined unless the table's parent was
  ## chosen above.
  return $foster_parent_element->insert_before ($child, $next_sibling);
}; # $insert_to_foster
2214
## Process the current |$token| according to the "in body" rules.
##
## Argument:
##   $insert - code reference invoked as |$insert->($node)| by the
##             branches that create an element themselves and have to
##             place it in the tree.
##
## Only start tag and end tag tokens are handled; for any other token
## type the sub falls through without doing anything.  Every branch
## ends with |return|.  A branch either invokes |!!!next-token| or
## replaces |$token| with a synthesized token and returns without
## advancing, so that the caller reprocesses it.
##
## Fixes relative to the previous revision:
##   - The tag name |strike| was misspelled as |strile| in both
##     formatting element tables, so |strike| tags were never handled
##     as formatting elements.
##   - The "not closed" error for an |html| end tag named the second
##     stack entry (always |body| in that branch) instead of the
##     current node, unlike the parallel |body| end tag branch.
##   - The RCDATA/CDATA error type for |textarea|, |iframe|, ... was
##     chosen by looking at the *terminating* token; it now uses the
##     saved tag name of the element being parsed.
my $in_body = sub {
  my $insert = shift;
  if ($token->{type} eq 'start tag') {
    if ($token->{tag_name} eq 'script') {
      $script_start_tag->();
      return;
    } elsif ($token->{tag_name} eq 'style') {
      $style_start_tag->();
      return;
    } elsif ({
              base => 1, link => 1, meta => 1,
             }->{$token->{tag_name}}) {
      !!!parse-error (type => 'in body:'.$token->{tag_name});
      ## NOTE: This is an "as if in head" code clone
      my $el;
      !!!create-element ($el, $token->{tag_name}, $token->{attributes});
      if (defined $self->{head_element}) {
        $self->{head_element}->append_child ($el);
      } else {
        $insert->($el);
      }

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'title') {
      !!!parse-error (type => 'in body:title');
      ## NOTE: There is an "as if in head" code clone
      my $title_el;
      !!!create-element ($title_el, 'title', $token->{attributes});
      (defined $self->{head_element} ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
        ->append_child ($title_el);
      $self->{content_model_flag} = 'RCDATA';

      ## Collect the run of character tokens as the title text.
      my $text = '';
      !!!next-token;
      while ($token->{type} eq 'character') {
        $text .= $token->{data};
        !!!next-token;
      }
      if (length $text) {
        $title_el->manakai_append_text ($text);
      }

      $self->{content_model_flag} = 'PCDATA';

      if ($token->{type} eq 'end tag' and
          $token->{tag_name} eq 'title') {
        ## Ignore the token
      } else {
        !!!parse-error (type => 'in RCDATA:#'.$token->{type});
        ## ISSUE: And ignore?
      }
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'body') {
      !!!parse-error (type => 'in body:body');

      if (@{$self->{open_elements}} == 1 or
          $self->{open_elements}->[1]->[1] ne 'body') {
        ## Ignore the token
      } else {
        ## Copy only those attributes the existing |body| element does
        ## not have yet.
        my $body_el = $self->{open_elements}->[1]->[0];
        for my $attr_name (keys %{$token->{attributes}}) {
          unless ($body_el->has_attribute_ns (undef, $attr_name)) {
            $body_el->set_attribute_ns
              (undef, [undef, $attr_name],
               $token->{attributes}->{$attr_name}->{value});
          }
        }
      }
      !!!next-token;
      return;
    } elsif ({
              address => 1, blockquote => 1, center => 1, dir => 1,
              div => 1, dl => 1, fieldset => 1, listing => 1,
              menu => 1, ol => 1, p => 1, ul => 1,
              pre => 1,
             }->{$token->{tag_name}}) {
      ## has a p element in scope
      INSCOPE: for (reverse @{$self->{open_elements}}) {
        if ($_->[1] eq 'p') {
          ## Act as if </p> had been seen; the start tag is pushed
          ## back to be processed afterwards.
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      if ($token->{tag_name} eq 'pre') {
        ## A newline immediately following <pre> is dropped.
        !!!next-token;
        if ($token->{type} eq 'character') {
          $token->{data} =~ s/^\x0A//;
          unless (length $token->{data}) {
            !!!next-token;
          }
        }
      } else {
        !!!next-token;
      }
      return;
    } elsif ($token->{tag_name} eq 'form') {
      if (defined $self->{form_element}) {
        !!!parse-error (type => 'in form:form');
        ## Ignore the token
        !!!next-token;
        return;
      } else {
        ## has a p element in scope
        INSCOPE: for (reverse @{$self->{open_elements}}) {
          if ($_->[1] eq 'p') {
            !!!back-token;
            $token = {type => 'end tag', tag_name => 'p'};
            return;
          } elsif ({
                    table => 1, caption => 1, td => 1, th => 1,
                    button => 1, marquee => 1, object => 1, html => 1,
                   }->{$_->[1]}) {
            last INSCOPE;
          }
        } # INSCOPE

        !!!insert-element-t ($token->{tag_name}, $token->{attributes});
        $self->{form_element} = $self->{open_elements}->[-1]->[0];
        !!!next-token;
        return;
      }
    } elsif ($token->{tag_name} eq 'li') {
      ## has a p element in scope
      INSCOPE: for (reverse @{$self->{open_elements}}) {
        if ($_->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      ## Step 1
      ## |$i| is a negative index counted from the end of the stack.
      my $i = -1;
      my $node = $self->{open_elements}->[$i];
      LI: {
        ## Step 2
        if ($node->[1] eq 'li') {
          if ($i != -1) {
            !!!parse-error (type => 'end tag missing:'.
                            $self->{open_elements}->[-1]->[1]);
            ## TODO: test
          }
          splice @{$self->{open_elements}}, $i;
          last LI;
        }

        ## Step 3
        if (not $formatting_category->{$node->[1]} and
            #not $phrasing_category->{$node->[1]} and
            ($special_category->{$node->[1]} or
             $scoping_category->{$node->[1]}) and
            $node->[1] ne 'address' and $node->[1] ne 'div') {
          last LI;
        }

        ## Step 4
        $i--;
        $node = $self->{open_elements}->[$i];
        redo LI;
      } # LI

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
      ## has a p element in scope
      INSCOPE: for (reverse @{$self->{open_elements}}) {
        if ($_->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      ## Step 1
      ## |$i| is a negative index counted from the end of the stack.
      my $i = -1;
      my $node = $self->{open_elements}->[$i];
      LI: {
        ## Step 2
        if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
          if ($i != -1) {
            !!!parse-error (type => 'end tag missing:'.
                            $self->{open_elements}->[-1]->[1]);
            ## TODO: test
          }
          splice @{$self->{open_elements}}, $i;
          last LI;
        }

        ## Step 3
        if (not $formatting_category->{$node->[1]} and
            #not $phrasing_category->{$node->[1]} and
            ($special_category->{$node->[1]} or
             $scoping_category->{$node->[1]}) and
            $node->[1] ne 'address' and $node->[1] ne 'div') {
          last LI;
        }

        ## Step 4
        $i--;
        $node = $self->{open_elements}->[$i];
        redo LI;
      } # LI

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'plaintext') {
      ## has a p element in scope
      INSCOPE: for (reverse @{$self->{open_elements}}) {
        if ($_->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $self->{content_model_flag} = 'PLAINTEXT';

      !!!next-token;
      return;
    } elsif ({
              h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
             }->{$token->{tag_name}}) {
      ## has a p element in scope
      INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
        my $node = $self->{open_elements}->[$_];
        if ($node->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      ## has an element in scope
      ## |$i| becomes the stack index of an open heading, if any.
      my $i;
      INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
        my $node = $self->{open_elements}->[$_];
        if ({
             h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
            }->{$node->[1]}) {
          $i = $_;
          last INSCOPE;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      if (defined $i) {
        !!!parse-error (type => 'in hn:hn');
        splice @{$self->{open_elements}}, $i;
      }

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'a') {
      AFE: for my $i (reverse 0..$#$active_formatting_elements) {
        my $node = $active_formatting_elements->[$i];
        if ($node->[1] eq 'a') {
          !!!parse-error (type => 'in a:a');

          ## Act as if </a> had been seen: the start tag is pushed
          ## back and the end tag handler is run on a synthesized
          ## token.
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'a'};
          $formatting_end_tag->($token->{tag_name});

          ## Make sure the old |a| element is gone from both lists.
          AFE2: for (reverse 0..$#$active_formatting_elements) {
            if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
              splice @$active_formatting_elements, $_, 1;
              last AFE2;
            }
          } # AFE2
          OE: for (reverse 0..$#{$self->{open_elements}}) {
            if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
              splice @{$self->{open_elements}}, $_, 1;
              last OE;
            }
          } # OE
          last AFE;
        } elsif ($node->[0] eq '#marker') {
          last AFE;
        }
      } # AFE

      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, $self->{open_elements}->[-1];

      !!!next-token;
      return;
    } elsif ({
              b => 1, big => 1, em => 1, font => 1, i => 1,
              nobr => 1, s => 1, small => 1, strike => 1,
              strong => 1, tt => 1, u => 1,
             }->{$token->{tag_name}}) {
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, $self->{open_elements}->[-1];

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'button') {
      ## has a button element in scope
      INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
        my $node = $self->{open_elements}->[$_];
        if ($node->[1] eq 'button') {
          !!!parse-error (type => 'in button:button');
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'button'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, ['#marker', ''];

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'marquee' or
             $token->{tag_name} eq 'object') {
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, ['#marker', ''];

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'xmp') {
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $self->{content_model_flag} = 'CDATA';

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'table') {
      ## has a p element in scope
      INSCOPE: for (reverse @{$self->{open_elements}}) {
        if ($_->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $self->{insertion_mode} = 'in table';

      !!!next-token;
      return;
    } elsif ({
              area => 1, basefont => 1, bgsound => 1, br => 1,
              embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
              image => 1,
             }->{$token->{tag_name}}) {
      if ($token->{tag_name} eq 'image') {
        !!!parse-error (type => 'image');
        $token->{tag_name} = 'img';
      }

      $reconstruct_active_formatting_elements->($insert_to_current);

      ## The element is taken off the stack again right away.
      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      pop @{$self->{open_elements}};

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'hr') {
      ## has a p element in scope
      INSCOPE: for (reverse @{$self->{open_elements}}) {
        if ($_->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      pop @{$self->{open_elements}};

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'input') {
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      ## TODO: associate with $self->{form_element} if defined
      pop @{$self->{open_elements}};

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'isindex') {
      !!!parse-error (type => 'isindex');

      if (defined $self->{form_element}) {
        ## Ignore the token
        !!!next-token;
        return;
      } else {
        ## Replace the token by an equivalent sequence of tokens; the
        ## first one becomes the current token and the remaining ones
        ## are handed to |!!!back-token|.
        my $at = $token->{attributes};
        $at->{name} = {name => 'name', value => 'isindex'};
        my @tokens = (
          {type => 'start tag', tag_name => 'form'},
          {type => 'start tag', tag_name => 'hr'},
          {type => 'start tag', tag_name => 'p'},
          {type => 'start tag', tag_name => 'label'},
          {type => 'character',
           data => 'This is a searchable index. Insert your search keywords here: '}, # SHOULD
          ## TODO: make this configurable
          {type => 'start tag', tag_name => 'input', attributes => $at},
          #{type => 'character', data => ''}, # SHOULD
          {type => 'end tag', tag_name => 'label'},
          {type => 'end tag', tag_name => 'p'},
          {type => 'start tag', tag_name => 'hr'},
          {type => 'end tag', tag_name => 'form'},
        );
        $token = shift @tokens;
        !!!back-token (@tokens);
        return;
      }
    } elsif ({
              textarea => 1,
              iframe => 1,
              noembed => 1,
              noframes => 1,
              noscript => 0, ## TODO: 1 if scripting is enabled
             }->{$token->{tag_name}}) {
      ## |$tag_name| keeps the element's name, because |$token| is
      ## replaced while the content is read.
      my $tag_name = $token->{tag_name};
      my $el;
      !!!create-element ($el, $token->{tag_name}, $token->{attributes});

      if ($token->{tag_name} eq 'textarea') {
        ## TODO: $self->{form_element} if defined
        $self->{content_model_flag} = 'RCDATA';
      } else {
        $self->{content_model_flag} = 'CDATA';
      }

      $insert->($el);

      my $text = '';
      if ($token->{tag_name} eq 'textarea') {
        ## A newline immediately following <textarea> is dropped.
        !!!next-token;
        if ($token->{type} eq 'character') {
          $token->{data} =~ s/^\x0A//;
          unless (length $token->{data}) {
            !!!next-token;
          }
        }
      } else {
        !!!next-token;
      }
      while ($token->{type} eq 'character') {
        $text .= $token->{data};
        !!!next-token;
      }
      if (length $text) {
        $el->manakai_append_text ($text);
      }

      $self->{content_model_flag} = 'PCDATA';

      if ($token->{type} eq 'end tag' and
          $token->{tag_name} eq $tag_name) {
        ## Ignore the token
      } else {
        ## Use the saved name: |$token| is now the terminating token,
        ## which may not even carry a tag name.
        if ($tag_name eq 'textarea') {
          !!!parse-error (type => 'in RCDATA:#'.$token->{type});
        } else {
          !!!parse-error (type => 'in CDATA:#'.$token->{type});
        }
        ## ISSUE: And ignore?
      }
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'select') {
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $self->{insertion_mode} = 'in select';
      !!!next-token;
      return;
    } elsif ({
              caption => 1, col => 1, colgroup => 1, frame => 1,
              frameset => 1, head => 1, option => 1, optgroup => 1,
              tbody => 1, td => 1, tfoot => 1, th => 1,
              thead => 1, tr => 1,
             }->{$token->{tag_name}}) {
      !!!parse-error (type => 'in body:'.$token->{tag_name});
      ## Ignore the token
      !!!next-token;
      return;

      ## ISSUE: An issue on HTML5 new elements in the spec.
    } else {
      ## Any other start tag
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      !!!next-token;
      return;
    }
  } elsif ($token->{type} eq 'end tag') {
    if ($token->{tag_name} eq 'body') {
      if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
        ## ISSUE: There is an issue in the spec.
        if ($self->{open_elements}->[-1]->[1] ne 'body') {
          !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
        }
        $self->{insertion_mode} = 'after body';
        !!!next-token;
        return;
      } else {
        !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
        ## Ignore the token
        !!!next-token;
        return;
      }
    } elsif ($token->{tag_name} eq 'html') {
      if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
        ## ISSUE: There is an issue in the spec.
        if ($self->{open_elements}->[-1]->[1] ne 'body') {
          ## Report the current node, i.e. the element that is
          ## actually left open.
          !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
        }
        $self->{insertion_mode} = 'after body';
        ## reprocess
        return;
      } else {
        !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
        ## Ignore the token
        !!!next-token;
        return;
      }
    } elsif ({
              address => 1, blockquote => 1, center => 1, dir => 1,
              div => 1, dl => 1, fieldset => 1, listing => 1,
              menu => 1, ol => 1, pre => 1, ul => 1,
              form => 1,
              p => 1,
              dd => 1, dt => 1, li => 1,
              button => 1, marquee => 1, object => 1,
             }->{$token->{tag_name}}) {
      ## has an element in scope
      my $i;
      INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
        my $node = $self->{open_elements}->[$_];
        if ($node->[1] eq $token->{tag_name}) {
          ## generate implied end tags
          ## (except for an element with the same name as the token)
          if ({
               dd => ($token->{tag_name} ne 'dd'),
               dt => ($token->{tag_name} ne 'dt'),
               li => ($token->{tag_name} ne 'li'),
               p => ($token->{tag_name} ne 'p'),
               td => 1, th => 1, tr => 1,
              }->{$self->{open_elements}->[-1]->[1]}) {
            !!!back-token;
            $token = {type => 'end tag',
                      tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
            return;
          }
          $i = $_;
          last INSCOPE unless $token->{tag_name} eq 'p';
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
        !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
      }

      ## Pop the stack up to and including the matched element.
      splice @{$self->{open_elements}}, $i if defined $i;
      undef $self->{form_element} if $token->{tag_name} eq 'form';
      $clear_up_to_marker->()
          if {
            button => 1, marquee => 1, object => 1,
          }->{$token->{tag_name}};
      !!!next-token;
      return;
    } elsif ({
              h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
             }->{$token->{tag_name}}) {
      ## has an element in scope
      my $i;
      INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
        my $node = $self->{open_elements}->[$_];
        if ({
             h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
            }->{$node->[1]}) {
          ## generate implied end tags
          if ({
               dd => 1, dt => 1, li => 1, p => 1,
               td => 1, th => 1, tr => 1,
              }->{$self->{open_elements}->[-1]->[1]}) {
            !!!back-token;
            $token = {type => 'end tag',
                      tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
            return;
          }
          $i = $_;
          last INSCOPE;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
        !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
      }

      ## Pop the stack up to and including the matched heading.
      splice @{$self->{open_elements}}, $i if defined $i;
      !!!next-token;
      return;
    } elsif ({
              a => 1,
              b => 1, big => 1, em => 1, font => 1, i => 1,
              nobr => 1, s => 1, small => 1, strike => 1,
              strong => 1, tt => 1, u => 1,
             }->{$token->{tag_name}}) {
      $formatting_end_tag->($token->{tag_name});
      ## TODO: <http://html5.org/tools/web-apps-tracker?from=883&to=884>
      return;
    } elsif ({
              caption => 1, col => 1, colgroup => 1, frame => 1,
              frameset => 1, head => 1, option => 1, optgroup => 1,
              tbody => 1, td => 1, tfoot => 1, th => 1,
              thead => 1, tr => 1,
              area => 1, basefont => 1, bgsound => 1, br => 1,
              embed => 1, hr => 1, iframe => 1, image => 1,
              img => 1, input => 1, isindex => 1, noembed => 1,
              noframes => 1, param => 1, select => 1, spacer => 1,
              table => 1, textarea => 1, wbr => 1,
              noscript => 0, ## TODO: if scripting is enabled
             }->{$token->{tag_name}}) {
      !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
      ## Ignore the token
      !!!next-token;
      return;

      ## ISSUE: Issue on HTML5 new elements in spec

    } else {
      ## Any other end tag

      ## Step 1
      ## |$node_i| is a negative index counted from the end of the
      ## stack.
      my $node_i = -1;
      my $node = $self->{open_elements}->[$node_i];

      ## Step 2
      S2: {
        if ($node->[1] eq $token->{tag_name}) {
          ## Step 1
          ## generate implied end tags
          if ({
               dd => 1, dt => 1, li => 1, p => 1,
               td => 1, th => 1, tr => 1,
              }->{$self->{open_elements}->[-1]->[1]}) {
            !!!back-token;
            $token = {type => 'end tag',
                      tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
            return;
          }

          ## Step 2
          if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
            !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
          }

          ## Step 3
          splice @{$self->{open_elements}}, $node_i;

          !!!next-token;
          last S2;
        } else {
          ## Step 3
          if (not $formatting_category->{$node->[1]} and
              #not $phrasing_category->{$node->[1]} and
              ($special_category->{$node->[1]} or
               $scoping_category->{$node->[1]})) {
            !!!parse-error (type => 'not closed:'.$node->[1]);
            ## Ignore the token
            !!!next-token;
            last S2;
          }
        }

        ## Step 4
        $node_i--;
        $node = $self->{open_elements}->[$node_i];

        ## Step 5;
        redo S2;
      } # S2
      return;
    }
  }
}; # $in_body
2974
2975 B: {
2976 if ($phase eq 'main') {
2977 if ($token->{type} eq 'DOCTYPE') {
2978 !!!parse-error (type => 'in html:#DOCTYPE');
2979 ## Ignore the token
2980 ## Stay in the phase
2981 !!!next-token;
2982 redo B;
2983 } elsif ($token->{type} eq 'start tag' and
2984 $token->{tag_name} eq 'html') {
2985 ## TODO: unless it is the first start tag token, parse-error
2986 my $top_el = $self->{open_elements}->[0]->[0];
2987 for my $attr_name (keys %{$token->{attributes}}) {
2988 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
2989 $top_el->set_attribute_ns
2990 (undef, [undef, $attr_name],
2991 $token->{attributes}->{$attr_name}->{value});
2992 }
2993 }
2994 !!!next-token;
2995 redo B;
2996 } elsif ($token->{type} eq 'end-of-file') {
2997 ## Generate implied end tags
2998 if ({
2999 dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
3000 }->{$self->{open_elements}->[-1]->[1]}) {
3001 !!!back-token;
3002 $token = {type => 'end tag', tag_name => $self->{open_elements}->[-1]->[1]};
3003 redo B;
3004 }
3005
3006 if (@{$self->{open_elements}} > 2 or
3007 (@{$self->{open_elements}} == 2 and $self->{open_elements}->[1]->[1] ne 'body')) {
3008 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3009 } elsif (defined $self->{inner_html_node} and
3010 @{$self->{open_elements}} > 1 and
3011 $self->{open_elements}->[1]->[1] ne 'body') {
3012 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3013 }
3014
3015 ## Stop parsing
3016 last B;
3017
3018 ## ISSUE: There is an issue in the spec.
3019 } else {
3020 if ($self->{insertion_mode} eq 'before head') {
3021 if ($token->{type} eq 'character') {
3022 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3023 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3024 unless (length $token->{data}) {
3025 !!!next-token;
3026 redo B;
3027 }
3028 }
3029 ## As if <head>
3030 !!!create-element ($self->{head_element}, 'head');
3031 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3032 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3033 $self->{insertion_mode} = 'in head';
3034 ## reprocess
3035 redo B;
3036 } elsif ($token->{type} eq 'comment') {
3037 my $comment = $self->{document}->create_comment ($token->{data});
3038 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3039 !!!next-token;
3040 redo B;
3041 } elsif ($token->{type} eq 'start tag') {
3042 my $attr = $token->{tag_name} eq 'head' ? $token->{attributes} : {};
3043 !!!create-element ($self->{head_element}, 'head', $attr);
3044 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3045 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3046 $self->{insertion_mode} = 'in head';
3047 if ($token->{tag_name} eq 'head') {
3048 !!!next-token;
3049 #} elsif ({
3050 # base => 1, link => 1, meta => 1,
3051 # script => 1, style => 1, title => 1,
3052 # }->{$token->{tag_name}}) {
3053 # ## reprocess
3054 } else {
3055 ## reprocess
3056 }
3057 redo B;
3058 } elsif ($token->{type} eq 'end tag') {
3059 if ($token->{tag_name} eq 'html') {
3060 ## As if <head>
3061 !!!create-element ($self->{head_element}, 'head');
3062 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3063 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3064 $self->{insertion_mode} = 'in head';
3065 ## reprocess
3066 redo B;
3067 } else {
3068 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3069 ## Ignore the token
3070 !!!next-token;
3071 redo B;
3072 }
3073 } else {
3074 die "$0: $token->{type}: Unknown type";
3075 }
3076 } elsif ($self->{insertion_mode} eq 'in head') {
3077 if ($token->{type} eq 'character') {
3078 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3079 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3080 unless (length $token->{data}) {
3081 !!!next-token;
3082 redo B;
3083 }
3084 }
3085
3086 #
3087 } elsif ($token->{type} eq 'comment') {
3088 my $comment = $self->{document}->create_comment ($token->{data});
3089 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3090 !!!next-token;
3091 redo B;
3092 } elsif ($token->{type} eq 'start tag') {
3093 if ($token->{tag_name} eq 'title') {
3094 ## NOTE: There is an "as if in head" code clone
3095 my $title_el;
3096 !!!create-element ($title_el, 'title', $token->{attributes});
3097 (defined $self->{head_element} ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
3098 ->append_child ($title_el);
3099 $self->{content_model_flag} = 'RCDATA';
3100
3101 my $text = '';
3102 !!!next-token;
3103 while ($token->{type} eq 'character') {
3104 $text .= $token->{data};
3105 !!!next-token;
3106 }
3107 if (length $text) {
3108 $title_el->manakai_append_text ($text);
3109 }
3110
3111 $self->{content_model_flag} = 'PCDATA';
3112
3113 if ($token->{type} eq 'end tag' and
3114 $token->{tag_name} eq 'title') {
3115 ## Ignore the token
3116 } else {
3117 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
3118 ## ISSUE: And ignore?
3119 }
3120 !!!next-token;
3121 redo B;
3122 } elsif ($token->{tag_name} eq 'style') {
3123 $style_start_tag->();
3124 redo B;
3125 } elsif ($token->{tag_name} eq 'script') {
3126 $script_start_tag->();
3127 redo B;
3128 } elsif ({base => 1, link => 1, meta => 1}->{$token->{tag_name}}) {
3129 ## NOTE: There are "as if in head" code clones
3130 my $el;
3131 !!!create-element ($el, $token->{tag_name}, $token->{attributes});
3132 (defined $self->{head_element} ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
3133 ->append_child ($el);
3134
3135 !!!next-token;
3136 redo B;
3137 } elsif ($token->{tag_name} eq 'head') {
3138 !!!parse-error (type => 'in head:head');
3139 ## Ignore the token
3140 !!!next-token;
3141 redo B;
3142 } else {
3143 #
3144 }
3145 } elsif ($token->{type} eq 'end tag') {
3146 if ($token->{tag_name} eq 'head') {
3147 if ($self->{open_elements}->[-1]->[1] eq 'head') {
3148 pop @{$self->{open_elements}};
3149 } else {
3150 !!!parse-error (type => 'unmatched end tag:head');
3151 }
3152 $self->{insertion_mode} = 'after head';
3153 !!!next-token;
3154 redo B;
3155 } elsif ($token->{tag_name} eq 'html') {
3156 #
3157 } else {
3158 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3159 ## Ignore the token
3160 !!!next-token;
3161 redo B;
3162 }
3163 } else {
3164 #
3165 }
3166
3167 if ($self->{open_elements}->[-1]->[1] eq 'head') {
3168 ## As if </head>
3169 pop @{$self->{open_elements}};
3170 }
3171 $self->{insertion_mode} = 'after head';
3172 ## reprocess
3173 redo B;
3174
3175 ## ISSUE: An issue in the spec.
3176 } elsif ($self->{insertion_mode} eq 'after head') {
3177 if ($token->{type} eq 'character') {
3178 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3179 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3180 unless (length $token->{data}) {
3181 !!!next-token;
3182 redo B;
3183 }
3184 }
3185
3186 #
3187 } elsif ($token->{type} eq 'comment') {
3188 my $comment = $self->{document}->create_comment ($token->{data});
3189 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3190 !!!next-token;
3191 redo B;
3192 } elsif ($token->{type} eq 'start tag') {
3193 if ($token->{tag_name} eq 'body') {
3194 !!!insert-element ('body', $token->{attributes});
3195 $self->{insertion_mode} = 'in body';
3196 !!!next-token;
3197 redo B;
3198 } elsif ($token->{tag_name} eq 'frameset') {
3199 !!!insert-element ('frameset', $token->{attributes});
3200 $self->{insertion_mode} = 'in frameset';
3201 !!!next-token;
3202 redo B;
3203 } elsif ({
3204 base => 1, link => 1, meta => 1,
3205 script => 1, style => 1, title => 1,
3206 }->{$token->{tag_name}}) {
3207 !!!parse-error (type => 'after head:'.$token->{tag_name});
3208 $self->{insertion_mode} = 'in head';
3209 ## reprocess
3210 redo B;
3211 } else {
3212 #
3213 }
3214 } else {
3215 #
3216 }
3217
3218 ## As if <body>
3219 !!!insert-element ('body');
3220 $self->{insertion_mode} = 'in body';
3221 ## reprocess
3222 redo B;
3223 } elsif ($self->{insertion_mode} eq 'in body') {
3224 if ($token->{type} eq 'character') {
3225 ## NOTE: There is a code clone of "character in body".
3226 $reconstruct_active_formatting_elements->($insert_to_current);
3227
3228 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3229
3230 !!!next-token;
3231 redo B;
3232 } elsif ($token->{type} eq 'comment') {
3233 ## NOTE: There is a code clone of "comment in body".
3234 my $comment = $self->{document}->create_comment ($token->{data});
3235 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3236 !!!next-token;
3237 redo B;
3238 } else {
3239 $in_body->($insert_to_current);
3240 redo B;
3241 }
3242 } elsif ($self->{insertion_mode} eq 'in table') {
3243 if ($token->{type} eq 'character') {
3244 ## NOTE: There are "character in table" code clones.
3245 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3246 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3247
3248 unless (length $token->{data}) {
3249 !!!next-token;
3250 redo B;
3251 }
3252 }
3253
3254 !!!parse-error (type => 'in table:#character');
3255
3256 ## As if in body, but insert into foster parent element
3257 ## ISSUE: Spec says that "whenever a node would be inserted
3258 ## into the current node" while characters might not
3259 ## result in a new Text node.
3260 $reconstruct_active_formatting_elements->($insert_to_foster);
3261
3262 if ({
3263 table => 1, tbody => 1, tfoot => 1,
3264 thead => 1, tr => 1,
3265 }->{$self->{open_elements}->[-1]->[1]}) {
3266 # MUST
3267 my $foster_parent_element;
3268 my $next_sibling;
3269 my $prev_sibling;
3270 OE: for (reverse 0..$#{$self->{open_elements}}) {
3271 if ($self->{open_elements}->[$_]->[1] eq 'table') {
3272 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3273 if (defined $parent and $parent->node_type == 1) {
3274 $foster_parent_element = $parent;
3275 $next_sibling = $self->{open_elements}->[$_]->[0];
3276 $prev_sibling = $next_sibling->previous_sibling;
3277 } else {
3278 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
3279 $prev_sibling = $foster_parent_element->last_child;
3280 }
3281 last OE;
3282 }
3283 } # OE
3284 $foster_parent_element = $self->{open_elements}->[0]->[0] and
3285 $prev_sibling = $foster_parent_element->last_child
3286 unless defined $foster_parent_element;
3287 if (defined $prev_sibling and
3288 $prev_sibling->node_type == 3) {
3289 $prev_sibling->manakai_append_text ($token->{data});
3290 } else {
3291 $foster_parent_element->insert_before
3292 ($self->{document}->create_text_node ($token->{data}),
3293 $next_sibling);
3294 }
3295 } else {
3296 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3297 }
3298
3299 !!!next-token;
3300 redo B;
3301 } elsif ($token->{type} eq 'comment') {
3302 my $comment = $self->{document}->create_comment ($token->{data});
3303 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3304 !!!next-token;
3305 redo B;
3306 } elsif ($token->{type} eq 'start tag') {
3307 if ({
3308 caption => 1,
3309 colgroup => 1,
3310 tbody => 1, tfoot => 1, thead => 1,
3311 }->{$token->{tag_name}}) {
3312 ## Clear back to table context
3313 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3314 $self->{open_elements}->[-1]->[1] ne 'html') {
3315 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3316 pop @{$self->{open_elements}};
3317 }
3318
3319 push @$active_formatting_elements, ['#marker', '']
3320 if $token->{tag_name} eq 'caption';
3321
3322 !!!insert-element ($token->{tag_name}, $token->{attributes});
3323 $self->{insertion_mode} = {
3324 caption => 'in caption',
3325 colgroup => 'in column group',
3326 tbody => 'in table body',
3327 tfoot => 'in table body',
3328 thead => 'in table body',
3329 }->{$token->{tag_name}};
3330 !!!next-token;
3331 redo B;
3332 } elsif ({
3333 col => 1,
3334 td => 1, th => 1, tr => 1,
3335 }->{$token->{tag_name}}) {
3336 ## Clear back to table context
3337 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3338 $self->{open_elements}->[-1]->[1] ne 'html') {
3339 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3340 pop @{$self->{open_elements}};
3341 }
3342
3343 !!!insert-element ($token->{tag_name} eq 'col' ? 'colgroup' : 'tbody');
3344 $self->{insertion_mode} = $token->{tag_name} eq 'col'
3345 ? 'in column group' : 'in table body';
3346 ## reprocess
3347 redo B;
3348 } elsif ($token->{tag_name} eq 'table') {
3349 ## NOTE: There are code clones for this "table in table"
3350 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3351
3352 ## As if </table>
3353 ## have a table element in table scope
3354 my $i;
3355 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3356 my $node = $self->{open_elements}->[$_];
3357 if ($node->[1] eq 'table') {
3358 $i = $_;
3359 last INSCOPE;
3360 } elsif ({
3361 table => 1, html => 1,
3362 }->{$node->[1]}) {
3363 last INSCOPE;
3364 }
3365 } # INSCOPE
3366 unless (defined $i) {
3367 !!!parse-error (type => 'unmatched end tag:table');
3368 ## Ignore tokens </table><table>
3369 !!!next-token;
3370 redo B;
3371 }
3372
3373 ## generate implied end tags
3374 if ({
3375 dd => 1, dt => 1, li => 1, p => 1,
3376 td => 1, th => 1, tr => 1,
3377 }->{$self->{open_elements}->[-1]->[1]}) {
3378 !!!back-token; # <table>
3379 $token = {type => 'end tag', tag_name => 'table'};
3380 !!!back-token;
3381 $token = {type => 'end tag',
3382 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3383 redo B;
3384 }
3385
3386 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3387 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3388 }
3389
3390 splice @{$self->{open_elements}}, $i;
3391
3392 $self->_reset_insertion_mode;
3393
3394 ## reprocess
3395 redo B;
3396 } else {
3397 #
3398 }
3399 } elsif ($token->{type} eq 'end tag') {
3400 if ($token->{tag_name} eq 'table') {
3401 ## have a table element in table scope
3402 my $i;
3403 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3404 my $node = $self->{open_elements}->[$_];
3405 if ($node->[1] eq $token->{tag_name}) {
3406 $i = $_;
3407 last INSCOPE;
3408 } elsif ({
3409 table => 1, html => 1,
3410 }->{$node->[1]}) {
3411 last INSCOPE;
3412 }
3413 } # INSCOPE
3414 unless (defined $i) {
3415 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3416 ## Ignore the token
3417 !!!next-token;
3418 redo B;
3419 }
3420
3421 ## generate implied end tags
3422 if ({
3423 dd => 1, dt => 1, li => 1, p => 1,
3424 td => 1, th => 1, tr => 1,
3425 }->{$self->{open_elements}->[-1]->[1]}) {
3426 !!!back-token;
3427 $token = {type => 'end tag',
3428 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3429 redo B;
3430 }
3431
3432 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3433 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3434 }
3435
3436 splice @{$self->{open_elements}}, $i;
3437
3438 $self->_reset_insertion_mode;
3439
3440 !!!next-token;
3441 redo B;
3442 } elsif ({
3443 body => 1, caption => 1, col => 1, colgroup => 1,
3444 html => 1, tbody => 1, td => 1, tfoot => 1, th => 1,
3445 thead => 1, tr => 1,
3446 }->{$token->{tag_name}}) {
3447 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3448 ## Ignore the token
3449 !!!next-token;
3450 redo B;
3451 } else {
3452 #
3453 }
3454 } else {
3455 #
3456 }
3457
3458 !!!parse-error (type => 'in table:'.$token->{tag_name});
3459 $in_body->($insert_to_foster);
3460 redo B;
3461 } elsif ($self->{insertion_mode} eq 'in caption') {
3462 if ($token->{type} eq 'character') {
3463 ## NOTE: This is a code clone of "character in body".
3464 $reconstruct_active_formatting_elements->($insert_to_current);
3465
3466 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3467
3468 !!!next-token;
3469 redo B;
3470 } elsif ($token->{type} eq 'comment') {
3471 ## NOTE: This is a code clone of "comment in body".
3472 my $comment = $self->{document}->create_comment ($token->{data});
3473 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3474 !!!next-token;
3475 redo B;
3476 } elsif ($token->{type} eq 'start tag') {
3477 if ({
3478 caption => 1, col => 1, colgroup => 1, tbody => 1,
3479 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3480 }->{$token->{tag_name}}) {
3481 !!!parse-error (type => 'not closed:caption');
3482
3483 ## As if </caption>
3484 ## have a table element in table scope
3485 my $i;
3486 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3487 my $node = $self->{open_elements}->[$_];
3488 if ($node->[1] eq 'caption') {
3489 $i = $_;
3490 last INSCOPE;
3491 } elsif ({
3492 table => 1, html => 1,
3493 }->{$node->[1]}) {
3494 last INSCOPE;
3495 }
3496 } # INSCOPE
3497 unless (defined $i) {
3498 !!!parse-error (type => 'unmatched end tag:caption');
3499 ## Ignore the token
3500 !!!next-token;
3501 redo B;
3502 }
3503
3504 ## generate implied end tags
3505 if ({
3506 dd => 1, dt => 1, li => 1, p => 1,
3507 td => 1, th => 1, tr => 1,
3508 }->{$self->{open_elements}->[-1]->[1]}) {
3509 !!!back-token; # <?>
3510 $token = {type => 'end tag', tag_name => 'caption'};
3511 !!!back-token;
3512 $token = {type => 'end tag',
3513 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3514 redo B;
3515 }
3516
3517 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3518 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3519 }
3520
3521 splice @{$self->{open_elements}}, $i;
3522
3523 $clear_up_to_marker->();
3524
3525 $self->{insertion_mode} = 'in table';
3526
3527 ## reprocess
3528 redo B;
3529 } else {
3530 #
3531 }
3532 } elsif ($token->{type} eq 'end tag') {
3533 if ($token->{tag_name} eq 'caption') {
3534 ## have a table element in table scope
3535 my $i;
3536 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3537 my $node = $self->{open_elements}->[$_];
3538 if ($node->[1] eq $token->{tag_name}) {
3539 $i = $_;
3540 last INSCOPE;
3541 } elsif ({
3542 table => 1, html => 1,
3543 }->{$node->[1]}) {
3544 last INSCOPE;
3545 }
3546 } # INSCOPE
3547 unless (defined $i) {
3548 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3549 ## Ignore the token
3550 !!!next-token;
3551 redo B;
3552 }
3553
3554 ## generate implied end tags
3555 if ({
3556 dd => 1, dt => 1, li => 1, p => 1,
3557 td => 1, th => 1, tr => 1,
3558 }->{$self->{open_elements}->[-1]->[1]}) {
3559 !!!back-token;
3560 $token = {type => 'end tag',
3561 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3562 redo B;
3563 }
3564
3565 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3566 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3567 }
3568
3569 splice @{$self->{open_elements}}, $i;
3570
3571 $clear_up_to_marker->();
3572
3573 $self->{insertion_mode} = 'in table';
3574
3575 !!!next-token;
3576 redo B;
3577 } elsif ($token->{tag_name} eq 'table') {
3578 !!!parse-error (type => 'not closed:caption');
3579
3580 ## As if </caption>
3581 ## have a table element in table scope
3582 my $i;
3583 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3584 my $node = $self->{open_elements}->[$_];
3585 if ($node->[1] eq 'caption') {
3586 $i = $_;
3587 last INSCOPE;
3588 } elsif ({
3589 table => 1, html => 1,
3590 }->{$node->[1]}) {
3591 last INSCOPE;
3592 }
3593 } # INSCOPE
3594 unless (defined $i) {
3595 !!!parse-error (type => 'unmatched end tag:caption');
3596 ## Ignore the token
3597 !!!next-token;
3598 redo B;
3599 }
3600
3601 ## generate implied end tags
3602 if ({
3603 dd => 1, dt => 1, li => 1, p => 1,
3604 td => 1, th => 1, tr => 1,
3605 }->{$self->{open_elements}->[-1]->[1]}) {
3606 !!!back-token; # </table>
3607 $token = {type => 'end tag', tag_name => 'caption'};
3608 !!!back-token;
3609 $token = {type => 'end tag',
3610 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3611 redo B;
3612 }
3613
3614 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3615 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3616 }
3617
3618 splice @{$self->{open_elements}}, $i;
3619
3620 $clear_up_to_marker->();
3621
3622 $self->{insertion_mode} = 'in table';
3623
3624 ## reprocess
3625 redo B;
3626 } elsif ({
3627 body => 1, col => 1, colgroup => 1,
3628 html => 1, tbody => 1, td => 1, tfoot => 1,
3629 th => 1, thead => 1, tr => 1,
3630 }->{$token->{tag_name}}) {
3631 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3632 !!!next-token; ## Ignore the token: it must be consumed here, otherwise `redo B' reprocesses the same end tag in this mode forever
3633 redo B;
3634 } else {
3635 #
3636 }
3637 } else {
3638 #
3639 }
3640
3641 $in_body->($insert_to_current);
3642 redo B;
3643 } elsif ($self->{insertion_mode} eq 'in column group') {
3644 if ($token->{type} eq 'character') {
3645 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3646 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3647 unless (length $token->{data}) {
3648 !!!next-token;
3649 redo B;
3650 }
3651 }
3652
3653 #
3654 } elsif ($token->{type} eq 'comment') {
3655 my $comment = $self->{document}->create_comment ($token->{data});
3656 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3657 !!!next-token;
3658 redo B;
3659 } elsif ($token->{type} eq 'start tag') {
3660 if ($token->{tag_name} eq 'col') {
3661 !!!insert-element ($token->{tag_name}, $token->{attributes});
3662 pop @{$self->{open_elements}};
3663 !!!next-token;
3664 redo B;
3665 } else {
3666 #
3667 }
3668 } elsif ($token->{type} eq 'end tag') {
3669 if ($token->{tag_name} eq 'colgroup') {
3670 if ($self->{open_elements}->[-1]->[1] eq 'html') {
3671 !!!parse-error (type => 'unmatched end tag:colgroup');
3672 ## Ignore the token
3673 !!!next-token;
3674 redo B;
3675 } else {
3676 pop @{$self->{open_elements}}; # colgroup
3677 $self->{insertion_mode} = 'in table';
3678 !!!next-token;
3679 redo B;
3680 }
3681 } elsif ($token->{tag_name} eq 'col') {
3682 !!!parse-error (type => 'unmatched end tag:col');
3683 ## Ignore the token
3684 !!!next-token;
3685 redo B;
3686 } else {
3687 #
3688 }
3689 } else {
3690 #
3691 }
3692
3693 ## As if </colgroup>
3694 if ($self->{open_elements}->[-1]->[1] eq 'html') {
3695 !!!parse-error (type => 'unmatched end tag:colgroup');
3696 ## Ignore the token
3697 !!!next-token;
3698 redo B;
3699 } else {
3700 pop @{$self->{open_elements}}; # colgroup
3701 $self->{insertion_mode} = 'in table';
3702 ## reprocess
3703 redo B;
3704 }
3705 } elsif ($self->{insertion_mode} eq 'in table body') {
3706 if ($token->{type} eq 'character') {
3707 ## NOTE: This is a "character in table" code clone.
3708 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3709 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3710
3711 unless (length $token->{data}) {
3712 !!!next-token;
3713 redo B;
3714 }
3715 }
3716
3717 !!!parse-error (type => 'in table:#character');
3718
3719 ## As if in body, but insert into foster parent element
3720 ## ISSUE: Spec says that "whenever a node would be inserted
3721 ## into the current node" while characters might not
3722 ## result in a new Text node.
3723 $reconstruct_active_formatting_elements->($insert_to_foster);
3724
3725 if ({
3726 table => 1, tbody => 1, tfoot => 1,
3727 thead => 1, tr => 1,
3728 }->{$self->{open_elements}->[-1]->[1]}) {
3729 # MUST
3730 my $foster_parent_element;
3731 my $next_sibling;
3732 my $prev_sibling;
3733 OE: for (reverse 0..$#{$self->{open_elements}}) {
3734 if ($self->{open_elements}->[$_]->[1] eq 'table') {
3735 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3736 if (defined $parent and $parent->node_type == 1) {
3737 $foster_parent_element = $parent;
3738 $next_sibling = $self->{open_elements}->[$_]->[0];
3739 $prev_sibling = $next_sibling->previous_sibling;
3740 } else {
3741 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
3742 $prev_sibling = $foster_parent_element->last_child;
3743 }
3744 last OE;
3745 }
3746 } # OE
3747 $foster_parent_element = $self->{open_elements}->[0]->[0] and
3748 $prev_sibling = $foster_parent_element->last_child
3749 unless defined $foster_parent_element;
3750 if (defined $prev_sibling and
3751 $prev_sibling->node_type == 3) {
3752 $prev_sibling->manakai_append_text ($token->{data});
3753 } else {
3754 $foster_parent_element->insert_before
3755 ($self->{document}->create_text_node ($token->{data}),
3756 $next_sibling);
3757 }
3758 } else {
3759 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3760 }
3761
3762 !!!next-token;
3763 redo B;
3764 } elsif ($token->{type} eq 'comment') {
3765 ## Copied from 'in table'
3766 my $comment = $self->{document}->create_comment ($token->{data});
3767 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3768 !!!next-token;
3769 redo B;
3770 } elsif ($token->{type} eq 'start tag') {
3771 if ({
3772 tr => 1,
3773 th => 1, td => 1,
3774 }->{$token->{tag_name}}) {
3775 unless ($token->{tag_name} eq 'tr') {
3776 !!!parse-error (type => 'missing start tag:tr');
3777 }
3778
3779 ## Clear back to table body context
3780 while (not {
3781 tbody => 1, tfoot => 1, thead => 1, html => 1,
3782 }->{$self->{open_elements}->[-1]->[1]}) {
3783 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3784 pop @{$self->{open_elements}};
3785 }
3786
3787 $self->{insertion_mode} = 'in row';
3788 if ($token->{tag_name} eq 'tr') {
3789 !!!insert-element ($token->{tag_name}, $token->{attributes});
3790 !!!next-token;
3791 } else {
3792 !!!insert-element ('tr');
3793 ## reprocess
3794 }
3795 redo B;
3796 } elsif ({
3797 caption => 1, col => 1, colgroup => 1,
3798 tbody => 1, tfoot => 1, thead => 1,
3799 }->{$token->{tag_name}}) {
3800 ## have an element in table scope
3801 my $i;
3802 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3803 my $node = $self->{open_elements}->[$_];
3804 if ({
3805 tbody => 1, thead => 1, tfoot => 1,
3806 }->{$node->[1]}) {
3807 $i = $_;
3808 last INSCOPE;
3809 } elsif ({
3810 table => 1, html => 1,
3811 }->{$node->[1]}) {
3812 last INSCOPE;
3813 }
3814 } # INSCOPE
3815 unless (defined $i) {
3816 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3817 ## Ignore the token
3818 !!!next-token;
3819 redo B;
3820 }
3821
3822 ## Clear back to table body context
3823 while (not {
3824 tbody => 1, tfoot => 1, thead => 1, html => 1,
3825 }->{$self->{open_elements}->[-1]->[1]}) {
3826 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3827 pop @{$self->{open_elements}};
3828 }
3829
3830 ## As if <{current node}>
3831 ## have an element in table scope
3832 ## true by definition
3833
3834 ## Clear back to table body context
3835 ## nop by definition
3836
3837 pop @{$self->{open_elements}};
3838 $self->{insertion_mode} = 'in table';
3839 ## reprocess
3840 redo B;
3841 } elsif ($token->{tag_name} eq 'table') {
3842 ## NOTE: This is a code clone of "table in table"
3843 !!!parse-error (type => 'not closed:table');
3844
3845 ## As if </table>
3846 ## have a table element in table scope
3847 my $i;
3848 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3849 my $node = $self->{open_elements}->[$_];
3850 if ($node->[1] eq 'table') {
3851 $i = $_;
3852 last INSCOPE;
3853 } elsif ({
3854 table => 1, html => 1,
3855 }->{$node->[1]}) {
3856 last INSCOPE;
3857 }
3858 } # INSCOPE
3859 unless (defined $i) {
3860 !!!parse-error (type => 'unmatched end tag:table');
3861 ## Ignore tokens </table><table>
3862 !!!next-token;
3863 redo B;
3864 }
3865
3866 ## generate implied end tags
3867 if ({
3868 dd => 1, dt => 1, li => 1, p => 1,
3869 td => 1, th => 1, tr => 1,
3870 }->{$self->{open_elements}->[-1]->[1]}) {
3871 !!!back-token; # <table>
3872 $token = {type => 'end tag', tag_name => 'table'};
3873 !!!back-token;
3874 $token = {type => 'end tag',
3875 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3876 redo B;
3877 }
3878
3879 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3880 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3881 }
3882
3883 splice @{$self->{open_elements}}, $i;
3884
3885 $self->_reset_insertion_mode;
3886
3887 ## reprocess
3888 redo B;
3889 } else {
3890 #
3891 }
3892 } elsif ($token->{type} eq 'end tag') {
3893 if ({
3894 tbody => 1, tfoot => 1, thead => 1,
3895 }->{$token->{tag_name}}) {
3896 ## have an element in table scope
3897 my $i;
3898 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3899 my $node = $self->{open_elements}->[$_];
3900 if ($node->[1] eq $token->{tag_name}) {
3901 $i = $_;
3902 last INSCOPE;
3903 } elsif ({
3904 table => 1, html => 1,
3905 }->{$node->[1]}) {
3906 last INSCOPE;
3907 }
3908 } # INSCOPE
3909 unless (defined $i) {
3910 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3911 ## Ignore the token
3912 !!!next-token;
3913 redo B;
3914 }
3915
3916 ## Clear back to table body context
3917 while (not {
3918 tbody => 1, tfoot => 1, thead => 1, html => 1,
3919 }->{$self->{open_elements}->[-1]->[1]}) {
3920 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3921 pop @{$self->{open_elements}};
3922 }
3923
3924 pop @{$self->{open_elements}};
3925 $self->{insertion_mode} = 'in table';
3926 !!!next-token;
3927 redo B;
3928 } elsif ($token->{tag_name} eq 'table') {
3929 ## have an element in table scope
3930 my $i;
3931 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3932 my $node = $self->{open_elements}->[$_];
3933 if ({
3934 tbody => 1, thead => 1, tfoot => 1,
3935 }->{$node->[1]}) {
3936 $i = $_;
3937 last INSCOPE;
3938 } elsif ({
3939 table => 1, html => 1,
3940 }->{$node->[1]}) {
3941 last INSCOPE;
3942 }
3943 } # INSCOPE
3944 unless (defined $i) {
3945 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3946 ## Ignore the token
3947 !!!next-token;
3948 redo B;
3949 }
3950
3951 ## Clear back to table body context
3952 while (not {
3953 tbody => 1, tfoot => 1, thead => 1, html => 1,
3954 }->{$self->{open_elements}->[-1]->[1]}) {
3955 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3956 pop @{$self->{open_elements}};
3957 }
3958
3959 ## As if <{current node}>
3960 ## have an element in table scope
3961 ## true by definition
3962
3963 ## Clear back to table body context
3964 ## nop by definition
3965
3966 pop @{$self->{open_elements}};
3967 $self->{insertion_mode} = 'in table';
3968 ## reprocess
3969 redo B;
3970 } elsif ({
3971 body => 1, caption => 1, col => 1, colgroup => 1,
3972 html => 1, td => 1, th => 1, tr => 1,
3973 }->{$token->{tag_name}}) {
3974 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3975 ## Ignore the token
3976 !!!next-token;
3977 redo B;
3978 } else {
3979 #
3980 }
3981 } else {
3982 #
3983 }
3984
3985 ## As if in table
3986 !!!parse-error (type => 'in table:'.$token->{tag_name});
3987 $in_body->($insert_to_foster);
3988 redo B;
3989 } elsif ($self->{insertion_mode} eq 'in row') {
3990 if ($token->{type} eq 'character') {
3991 ## NOTE: This is a "character in table" code clone.
3992 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3993 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3994
3995 unless (length $token->{data}) {
3996 !!!next-token;
3997 redo B;
3998 }
3999 }
4000
4001 !!!parse-error (type => 'in table:#character');
4002
4003 ## As if in body, but insert into foster parent element
4004 ## ISSUE: Spec says that "whenever a node would be inserted
4005 ## into the current node" while characters might not
4006 ## result in a new Text node.
4007 $reconstruct_active_formatting_elements->($insert_to_foster);
4008
4009 if ({
4010 table => 1, tbody => 1, tfoot => 1,
4011 thead => 1, tr => 1,
4012 }->{$self->{open_elements}->[-1]->[1]}) {
4013 # MUST
4014 my $foster_parent_element;
4015 my $next_sibling;
4016 my $prev_sibling;
4017 OE: for (reverse 0..$#{$self->{open_elements}}) {
4018 if ($self->{open_elements}->[$_]->[1] eq 'table') {
4019 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4020 if (defined $parent and $parent->node_type == 1) {
4021 $foster_parent_element = $parent;
4022 $next_sibling = $self->{open_elements}->[$_]->[0];
4023 $prev_sibling = $next_sibling->previous_sibling;
4024 } else {
4025 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
4026 $prev_sibling = $foster_parent_element->last_child;
4027 }
4028 last OE;
4029 }
4030 } # OE
4031 $foster_parent_element = $self->{open_elements}->[0]->[0] and
4032 $prev_sibling = $foster_parent_element->last_child
4033 unless defined $foster_parent_element;
4034 if (defined $prev_sibling and
4035 $prev_sibling->node_type == 3) {
4036 $prev_sibling->manakai_append_text ($token->{data});
4037 } else {
4038 $foster_parent_element->insert_before
4039 ($self->{document}->create_text_node ($token->{data}),
4040 $next_sibling);
4041 }
4042 } else {
4043 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4044 }
4045
4046 !!!next-token;
4047 redo B;
4048 } elsif ($token->{type} eq 'comment') {
4049 ## Copied from 'in table'
4050 my $comment = $self->{document}->create_comment ($token->{data});
4051 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4052 !!!next-token;
4053 redo B;
4054 } elsif ($token->{type} eq 'start tag') {
4055 if ($token->{tag_name} eq 'th' or
4056 $token->{tag_name} eq 'td') {
4057 ## Clear back to table row context
4058 while (not {
4059 tr => 1, html => 1,
4060 }->{$self->{open_elements}->[-1]->[1]}) {
4061 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4062 pop @{$self->{open_elements}};
4063 }
4064
4065 !!!insert-element ($token->{tag_name}, $token->{attributes});
4066 $self->{insertion_mode} = 'in cell';
4067
4068 push @$active_formatting_elements, ['#marker', ''];
4069
4070 !!!next-token;
4071 redo B;
4072 } elsif ({
4073 caption => 1, col => 1, colgroup => 1,
4074 tbody => 1, tfoot => 1, thead => 1, tr => 1,
4075 }->{$token->{tag_name}}) {
4076 ## As if </tr>
4077 ## have an element in table scope
4078 my $i;
4079 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4080 my $node = $self->{open_elements}->[$_];
4081 if ($node->[1] eq 'tr') {
4082 $i = $_;
4083 last INSCOPE;
4084 } elsif ({
4085 table => 1, html => 1,
4086 }->{$node->[1]}) {
4087 last INSCOPE;
4088 }
4089 } # INSCOPE
4090 unless (defined $i) {
4091 !!!parse-error (type => 'unmacthed end tag:'.$token->{tag_name});
4092 ## Ignore the token
4093 !!!next-token;
4094 redo B;
4095 }
4096
4097 ## Clear back to table row context
4098 while (not {
4099 tr => 1, html => 1,
4100 }->{$self->{open_elements}->[-1]->[1]}) {
4101 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4102 pop @{$self->{open_elements}};
4103 }
4104
4105 pop @{$self->{open_elements}}; # tr
4106 $self->{insertion_mode} = 'in table body';
4107 ## reprocess
4108 redo B;
4109 } elsif ($token->{tag_name} eq 'table') {
4110 ## NOTE: This is a code clone of "table in table"
4111 !!!parse-error (type => 'not closed:table');
4112
4113 ## As if </table>
4114 ## have a table element in table scope
4115 my $i;
4116 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4117 my $node = $self->{open_elements}->[$_];
4118 if ($node->[1] eq 'table') {
4119 $i = $_;
4120 last INSCOPE;
4121 } elsif ({
4122 table => 1, html => 1,
4123 }->{$node->[1]}) {
4124 last INSCOPE;
4125 }
4126 } # INSCOPE
4127 unless (defined $i) {
4128 !!!parse-error (type => 'unmatched end tag:table');
4129 ## Ignore tokens </table><table>
4130 !!!next-token;
4131 redo B;
4132 }
4133
4134 ## generate implied end tags
4135 if ({
4136 dd => 1, dt => 1, li => 1, p => 1,
4137 td => 1, th => 1, tr => 1,
4138 }->{$self->{open_elements}->[-1]->[1]}) {
4139 !!!back-token; # <table>
4140 $token = {type => 'end tag', tag_name => 'table'};
4141 !!!back-token;
4142 $token = {type => 'end tag',
4143 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4144 redo B;
4145 }
4146
4147 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4148 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4149 }
4150
4151 splice @{$self->{open_elements}}, $i;
4152
4153 $self->_reset_insertion_mode;
4154
4155 ## reprocess
4156 redo B;
4157 } else {
4158 #
4159 }
4160 } elsif ($token->{type} eq 'end tag') {
4161 if ($token->{tag_name} eq 'tr') {
4162 ## have an element in table scope
4163 my $i;
4164 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4165 my $node = $self->{open_elements}->[$_];
4166 if ($node->[1] eq $token->{tag_name}) {
4167 $i = $_;
4168 last INSCOPE;
4169 } elsif ({
4170 table => 1, html => 1,
4171 }->{$node->[1]}) {
4172 last INSCOPE;
4173 }
4174 } # INSCOPE
4175 unless (defined $i) {
4176 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4177 ## Ignore the token
4178 !!!next-token;
4179 redo B;
4180 }
4181
4182 ## Clear back to table row context
4183 while (not {
4184 tr => 1, html => 1,
4185 }->{$self->{open_elements}->[-1]->[1]}) {
4186 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4187 pop @{$self->{open_elements}};
4188 }
4189
4190 pop @{$self->{open_elements}}; # tr
4191 $self->{insertion_mode} = 'in table body';
4192 !!!next-token;
4193 redo B;
4194 } elsif ($token->{tag_name} eq 'table') {
4195 ## As if </tr>
4196 ## have an element in table scope
4197 my $i;
4198 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4199 my $node = $self->{open_elements}->[$_];
4200 if ($node->[1] eq 'tr') {
4201 $i = $_;
4202 last INSCOPE;
4203 } elsif ({
4204 table => 1, html => 1,
4205 }->{$node->[1]}) {
4206 last INSCOPE;
4207 }
4208 } # INSCOPE
4209 unless (defined $i) {
4210 !!!parse-error (type => 'unmatched end tag:'.$token->{type});
4211 ## Ignore the token
4212 !!!next-token;
4213 redo B;
4214 }
4215
4216 ## Clear back to table row context
4217 while (not {
4218 tr => 1, html => 1,
4219 }->{$self->{open_elements}->[-1]->[1]}) {
4220 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4221 pop @{$self->{open_elements}};
4222 }
4223
4224 pop @{$self->{open_elements}}; # tr
4225 $self->{insertion_mode} = 'in table body';
4226 ## reprocess
4227 redo B;
4228 } elsif ({
4229 tbody => 1, tfoot => 1, thead => 1,
4230 }->{$token->{tag_name}}) {
4231 ## have an element in table scope
4232 my $i;
4233 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4234 my $node = $self->{open_elements}->[$_];
4235 if ($node->[1] eq $token->{tag_name}) {
4236 $i = $_;
4237 last INSCOPE;
4238 } elsif ({
4239 table => 1, html => 1,
4240 }->{$node->[1]}) {
4241 last INSCOPE;
4242 }
4243 } # INSCOPE
4244 unless (defined $i) {
4245 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4246 ## Ignore the token
4247 !!!next-token;
4248 redo B;
4249 }
4250
4251 ## As if </tr>
4252 ## have an element in table scope
4253 my $i;
4254 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4255 my $node = $self->{open_elements}->[$_];
4256 if ($node->[1] eq 'tr') {
4257 $i = $_;
4258 last INSCOPE;
4259 } elsif ({
4260 table => 1, html => 1,
4261 }->{$node->[1]}) {
4262 last INSCOPE;
4263 }
4264 } # INSCOPE
4265 unless (defined $i) {
4266 !!!parse-error (type => 'unmatched end tag:tr');
4267 ## Ignore the token
4268 !!!next-token;
4269 redo B;
4270 }
4271
4272 ## Clear back to table row context
4273 while (not {
4274 tr => 1, html => 1,
4275 }->{$self->{open_elements}->[-1]->[1]}) {
4276 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4277 pop @{$self->{open_elements}};
4278 }
4279
4280 pop @{$self->{open_elements}}; # tr
4281 $self->{insertion_mode} = 'in table body';
4282 ## reprocess
4283 redo B;
4284 } elsif ({
4285 body => 1, caption => 1, col => 1,
4286 colgroup => 1, html => 1, td => 1, th => 1,
4287 }->{$token->{tag_name}}) {
4288 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4289 ## Ignore the token
4290 !!!next-token;
4291 redo B;
4292 } else {
4293 #
4294 }
4295 } else {
4296 #
4297 }
4298
4299 ## As if in table
4300 !!!parse-error (type => 'in table:'.$token->{tag_name});
4301 $in_body->($insert_to_foster);
4302 redo B;
4303 } elsif ($self->{insertion_mode} eq 'in cell') {
4304 if ($token->{type} eq 'character') {
4305 ## NOTE: This is a code clone of "character in body".
4306 $reconstruct_active_formatting_elements->($insert_to_current);
4307
4308 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4309
4310 !!!next-token;
4311 redo B;
4312 } elsif ($token->{type} eq 'comment') {
4313 ## NOTE: This is a code clone of "comment in body".
4314 my $comment = $self->{document}->create_comment ($token->{data});
4315 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4316 !!!next-token;
4317 redo B;
4318 } elsif ($token->{type} eq 'start tag') {
4319 if ({
4320 caption => 1, col => 1, colgroup => 1,
4321 tbody => 1, td => 1, tfoot => 1, th => 1,
4322 thead => 1, tr => 1,
4323 }->{$token->{tag_name}}) {
4324 ## have an element in table scope
4325 my $tn;
4326 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4327 my $node = $self->{open_elements}->[$_];
4328 if ($node->[1] eq 'td' or $node->[1] eq 'th') {
4329 $tn = $node->[1];
4330 last INSCOPE;
4331 } elsif ({
4332 table => 1, html => 1,
4333 }->{$node->[1]}) {
4334 last INSCOPE;
4335 }
4336 } # INSCOPE
4337 unless (defined $tn) {
4338 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4339 ## Ignore the token
4340 !!!next-token;
4341 redo B;
4342 }
4343
4344 ## Close the cell
4345 !!!back-token; # <?>
4346 $token = {type => 'end tag', tag_name => $tn};
4347 redo B;
4348 } else {
4349 #
4350 }
4351 } elsif ($token->{type} eq 'end tag') {
4352 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
4353 ## have an element in table scope
4354 my $i;
4355 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4356 my $node = $self->{open_elements}->[$_];
4357 if ($node->[1] eq $token->{tag_name}) {
4358 $i = $_;
4359 last INSCOPE;
4360 } elsif ({
4361 table => 1, html => 1,
4362 }->{$node->[1]}) {
4363 last INSCOPE;
4364 }
4365 } # INSCOPE
4366 unless (defined $i) {
4367 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4368 ## Ignore the token
4369 !!!next-token;
4370 redo B;
4371 }
4372
4373 ## generate implied end tags
4374 if ({
4375 dd => 1, dt => 1, li => 1, p => 1,
4376 td => ($token->{tag_name} eq 'th'),
4377 th => ($token->{tag_name} eq 'td'),
4378 tr => 1,
4379 }->{$self->{open_elements}->[-1]->[1]}) {
4380 !!!back-token;
4381 $token = {type => 'end tag',
4382 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4383 redo B;
4384 }
4385
4386 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
4387 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4388 }
4389
4390 splice @{$self->{open_elements}}, $i;
4391
4392 $clear_up_to_marker->();
4393
4394 $self->{insertion_mode} = 'in row';
4395
4396 !!!next-token;
4397 redo B;
4398 } elsif ({
4399 body => 1, caption => 1, col => 1,
4400 colgroup => 1, html => 1,
4401 }->{$token->{tag_name}}) {
4402 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4403 ## Ignore the token
4404 !!!next-token;
4405 redo B;
4406 } elsif ({
4407 table => 1, tbody => 1, tfoot => 1,
4408 thead => 1, tr => 1,
4409 }->{$token->{tag_name}}) {
4410 ## have an element in table scope
4411 my $i;
4412 my $tn;
4413 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4414 my $node = $self->{open_elements}->[$_];
4415 if ($node->[1] eq $token->{tag_name}) {
4416 $i = $_;
4417 last INSCOPE;
4418 } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
4419 $tn = $node->[1];
4420 ## NOTE: There is exactly one |td| or |th| element
4421 ## in scope in the stack of open elements by definition.
4422 } elsif ({
4423 table => 1, html => 1,
4424 }->{$node->[1]}) {
4425 last INSCOPE;
4426 }
4427 } # INSCOPE
4428 unless (defined $i) {
4429 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4430 ## Ignore the token
4431 !!!next-token;
4432 redo B;
4433 }
4434
4435 ## Close the cell
4436 !!!back-token; # </?>
4437 $token = {type => 'end tag', tag_name => $tn};
4438 redo B;
4439 } else {
4440 #
4441 }
4442 } else {
4443 #
4444 }
4445
4446 $in_body->($insert_to_current);
4447 redo B;
4448 } elsif ($self->{insertion_mode} eq 'in select') {
4449 if ($token->{type} eq 'character') {
4450 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4451 !!!next-token;
4452 redo B;
4453 } elsif ($token->{type} eq 'comment') {
4454 my $comment = $self->{document}->create_comment ($token->{data});
4455 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4456 !!!next-token;
4457 redo B;
4458 } elsif ($token->{type} eq 'start tag') {
4459 if ($token->{tag_name} eq 'option') {
4460 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4461 ## As if </option>
4462 pop @{$self->{open_elements}};
4463 }
4464
4465 !!!insert-element ($token->{tag_name}, $token->{attributes});
4466 !!!next-token;
4467 redo B;
4468 } elsif ($token->{tag_name} eq 'optgroup') {
4469 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4470 ## As if </option>
4471 pop @{$self->{open_elements}};
4472 }
4473
4474 if ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4475 ## As if </optgroup>
4476 pop @{$self->{open_elements}};
4477 }
4478
4479 !!!insert-element ($token->{tag_name}, $token->{attributes});
4480 !!!next-token;
4481 redo B;
4482 } elsif ($token->{tag_name} eq 'select') {
4483 !!!parse-error (type => 'not closed:select');
4484 ## As if </select> instead
4485 ## have an element in table scope
4486 my $i;
4487 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4488 my $node = $self->{open_elements}->[$_];
4489 if ($node->[1] eq $token->{tag_name}) {
4490 $i = $_;
4491 last INSCOPE;
4492 } elsif ({
4493 table => 1, html => 1,
4494 }->{$node->[1]}) {
4495 last INSCOPE;
4496 }
4497 } # INSCOPE
4498 unless (defined $i) {
4499 !!!parse-error (type => 'unmatched end tag:select');
4500 ## Ignore the token
4501 !!!next-token;
4502 redo B;
4503 }
4504
4505 splice @{$self->{open_elements}}, $i;
4506
4507 $self->_reset_insertion_mode;
4508
4509 !!!next-token;
4510 redo B;
4511 } else {
4512 #
4513 }
4514 } elsif ($token->{type} eq 'end tag') {
4515 if ($token->{tag_name} eq 'optgroup') {
4516 if ($self->{open_elements}->[-1]->[1] eq 'option' and
4517 $self->{open_elements}->[-2]->[1] eq 'optgroup') {
4518 ## As if </option>
4519 splice @{$self->{open_elements}}, -2;
4520 } elsif ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4521 pop @{$self->{open_elements}};
4522 } else {
4523 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4524 ## Ignore the token
4525 }
4526 !!!next-token;
4527 redo B;
4528 } elsif ($token->{tag_name} eq 'option') {
4529 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4530 pop @{$self->{open_elements}};
4531 } else {
4532 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4533 ## Ignore the token
4534 }
4535 !!!next-token;
4536 redo B;
4537 } elsif ($token->{tag_name} eq 'select') {
4538 ## have an element in table scope
4539 my $i;
4540 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4541 my $node = $self->{open_elements}->[$_];
4542 if ($node->[1] eq $token->{tag_name}) {
4543 $i = $_;
4544 last INSCOPE;
4545 } elsif ({
4546 table => 1, html => 1,
4547 }->{$node->[1]}) {
4548 last INSCOPE;
4549 }
4550 } # INSCOPE
4551 unless (defined $i) {
4552 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4553 ## Ignore the token
4554 !!!next-token;
4555 redo B;
4556 }
4557
4558 splice @{$self->{open_elements}}, $i;
4559
4560 $self->_reset_insertion_mode;
4561
4562 !!!next-token;
4563 redo B;
4564 } elsif ({
4565 caption => 1, table => 1, tbody => 1,
4566 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
4567 }->{$token->{tag_name}}) {
4568 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4569
4570 ## have an element in table scope
4571 my $i;
4572 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4573 my $node = $self->{open_elements}->[$_];
4574 if ($node->[1] eq $token->{tag_name}) {
4575 $i = $_;
4576 last INSCOPE;
4577 } elsif ({
4578 table => 1, html => 1,
4579 }->{$node->[1]}) {
4580 last INSCOPE;
4581 }
4582 } # INSCOPE
4583 unless (defined $i) {
4584 ## Ignore the token
4585 !!!next-token;
4586 redo B;
4587 }
4588
4589 ## As if </select>
4590 ## have an element in table scope
4591 undef $i;
4592 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4593 my $node = $self->{open_elements}->[$_];
4594 if ($node->[1] eq 'select') {
4595 $i = $_;
4596 last INSCOPE;
4597 } elsif ({
4598 table => 1, html => 1,
4599 }->{$node->[1]}) {
4600 last INSCOPE;
4601 }
4602 } # INSCOPE
4603 unless (defined $i) {
4604 !!!parse-error (type => 'unmatched end tag:select');
4605 ## Ignore the </select> token
4606 !!!next-token; ## TODO: ok?
4607 redo B;
4608 }
4609
4610 splice @{$self->{open_elements}}, $i;
4611
4612 $self->_reset_insertion_mode;
4613
4614 ## reprocess
4615 redo B;
4616 } else {
4617 #
4618 }
4619 } else {
4620 #
4621 }
4622
4623 !!!parse-error (type => 'in select:'.$token->{tag_name});
4624 ## Ignore the token
4625 !!!next-token;
4626 redo B;
4627 } elsif ($self->{insertion_mode} eq 'after body') {
4628 if ($token->{type} eq 'character') {
4629 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4630 ## As if in body
4631 $reconstruct_active_formatting_elements->($insert_to_current);
4632
4633 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4634
4635 unless (length $token->{data}) {
4636 !!!next-token;
4637 redo B;
4638 }
4639 }
4640
4641 #
4642 !!!parse-error (type => 'after body:#'.$token->{type});
4643 } elsif ($token->{type} eq 'comment') {
4644 my $comment = $self->{document}->create_comment ($token->{data});
4645 $self->{open_elements}->[0]->[0]->append_child ($comment);
4646 !!!next-token;
4647 redo B;
4648 } elsif ($token->{type} eq 'start tag') {
4649 !!!parse-error (type => 'after body:'.$token->{tag_name});
4650 #
4651 } elsif ($token->{type} eq 'end tag') {
4652 if ($token->{tag_name} eq 'html') {
4653 if (defined $self->{inner_html_node}) {
4654 !!!parse-error (type => 'unmatched end tag:html');
4655 ## Ignore the token
4656 !!!next-token;
4657 redo B;
4658 } else {
4659 $phase = 'trailing end';
4660 !!!next-token;
4661 redo B;
4662 }
4663 } else {
4664 !!!parse-error (type => 'after body:/'.$token->{tag_name});
4665 }
4666 } else {
4667 !!!parse-error (type => 'after body:#'.$token->{type});
4668 }
4669
4670 $self->{insertion_mode} = 'in body';
4671 ## reprocess
4672 redo B;
4673 } elsif ($self->{insertion_mode} eq 'in frameset') {
4674 if ($token->{type} eq 'character') {
4675 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4676 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4677
4678 unless (length $token->{data}) {
4679 !!!next-token;
4680 redo B;
4681 }
4682 }
4683
4684 #
4685 } elsif ($token->{type} eq 'comment') {
4686 my $comment = $self->{document}->create_comment ($token->{data});
4687 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4688 !!!next-token;
4689 redo B;
4690 } elsif ($token->{type} eq 'start tag') {
4691 if ($token->{tag_name} eq 'frameset') {
4692 !!!insert-element ($token->{tag_name}, $token->{attributes});
4693 !!!next-token;
4694 redo B;
4695 } elsif ($token->{tag_name} eq 'frame') {
4696 !!!insert-element ($token->{tag_name}, $token->{attributes});
4697 pop @{$self->{open_elements}};
4698 !!!next-token;
4699 redo B;
4700 } elsif ($token->{tag_name} eq 'noframes') {
4701 $in_body->($insert_to_current);
4702 redo B;
4703 } else {
4704 #
4705 }
4706 } elsif ($token->{type} eq 'end tag') {
4707 if ($token->{tag_name} eq 'frameset') {
4708 if ($self->{open_elements}->[-1]->[1] eq 'html' and
4709 @{$self->{open_elements}} == 1) {
4710 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4711 ## Ignore the token
4712 !!!next-token;
4713 } else {
4714 pop @{$self->{open_elements}};
4715 !!!next-token;
4716 }
4717
4718 ## if not inner_html and
4719 if ($self->{open_elements}->[-1]->[1] ne 'frameset') {
4720 $self->{insertion_mode} = 'after frameset';
4721 }
4722 redo B;
4723 } else {
4724 #
4725 }
4726 } else {
4727 #
4728 }
4729
4730 if (defined $token->{tag_name}) {
4731 !!!parse-error (type => 'in frameset:'.$token->{tag_name});
4732 } else {
4733 !!!parse-error (type => 'in frameset:#'.$token->{type});
4734 }
4735 ## Ignore the token
4736 !!!next-token;
4737 redo B;
4738 } elsif ($self->{insertion_mode} eq 'after frameset') {
4739 if ($token->{type} eq 'character') {
4740 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4741 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4742
4743 unless (length $token->{data}) {
4744 !!!next-token;
4745 redo B;
4746 }
4747 }
4748
4749 #
4750 } elsif ($token->{type} eq 'comment') {
4751 my $comment = $self->{document}->create_comment ($token->{data});
4752 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4753 !!!next-token;
4754 redo B;
4755 } elsif ($token->{type} eq 'start tag') {
4756 if ($token->{tag_name} eq 'noframes') {
4757 $in_body->($insert_to_current);
4758 redo B;
4759 } else {
4760 #
4761 }
4762 } elsif ($token->{type} eq 'end tag') {
4763 if ($token->{tag_name} eq 'html') {
4764 $phase = 'trailing end';
4765 !!!next-token;
4766 redo B;
4767 } else {
4768 #
4769 }
4770 } else {
4771 #
4772 }
4773
4774 if (defined $token->{tag_name}) {
4775 !!!parse-error (type => 'after frameset:'.$token->{tag_name});
4776 } else {
4777 !!!parse-error (type => 'after frameset:#'.$token->{type});
4778 }
4779 ## Ignore the token
4780 !!!next-token;
4781 redo B;
4782
4783 ## ISSUE: An issue in spec there
4784 } else {
4785 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4786 }
4787 }
4788 } elsif ($phase eq 'trailing end') {
4789 ## states in the main stage is preserved yet # MUST
4790
4791 if ($token->{type} eq 'DOCTYPE') {
4792 !!!parse-error (type => 'after html:#DOCTYPE');
4793 ## Ignore the token
4794 !!!next-token;
4795 redo B;
4796 } elsif ($token->{type} eq 'comment') {
4797 my $comment = $self->{document}->create_comment ($token->{data});
4798 $self->{document}->append_child ($comment);
4799 !!!next-token;
4800 redo B;
4801 } elsif ($token->{type} eq 'character') {
4802 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4803 my $data = $1;
4804 ## As if in the main phase.
4805 ## NOTE: The insertion mode in the main phase
4806 ## just before the phase has been changed to the trailing
4807 ## end phase is either "after body" or "after frameset".
4808 $reconstruct_active_formatting_elements->($insert_to_current)
4809 if $phase eq 'main';
4810
4811 $self->{open_elements}->[-1]->[0]->manakai_append_text ($data);
4812
4813 unless (length $token->{data}) {
4814 !!!next-token;
4815 redo B;
4816 }
4817 }
4818
4819 !!!parse-error (type => 'after html:#character');
4820 $phase = 'main';
4821 ## reprocess
4822 redo B;
4823 } elsif ($token->{type} eq 'start tag' or
4824 $token->{type} eq 'end tag') {
4825 !!!parse-error (type => 'after html:'.$token->{tag_name});
4826 $phase = 'main';
4827 ## reprocess
4828 redo B;
4829 } elsif ($token->{type} eq 'end-of-file') {
4830 ## Stop parsing
4831 last B;
4832 } else {
4833 die "$0: $token->{type}: Unknown token";
4834 }
4835 }
4836 } # B
4837
4838 ## Stop parsing # MUST
4839
4840 ## TODO: script stuffs
4841 } # _tree_construct_main
4842
## Implements the |innerHTML| setter.
##
##   $class->set_inner_html ($node, $string, $onerror);
##
## |$node| must be a Document (node type 9) or an Element (node type 1);
## any other node type makes this method |die|.  |$string| is the markup
## to parse.  |$onerror| is an optional code reference that receives
## parse errors as a list of name/value pairs; in the element case a
## default handler that |warn|s is used when it is false.
sub set_inner_html ($$$) {
  my $class = shift;
  my $node = shift;
  my $s = \$_[0];        # reference to the caller's string (no copy)
  my $onerror = $_[1];

  my $nt = $node->node_type;
  die "$0: |set_inner_html| is not defined for node of type $nt"
      unless $nt == 9 or $nt == 1;

  if ($nt == 9) {
    ## --- Document --- # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## Snapshot the child list before removing anything from it.
    my @old_children = @{$node->child_nodes};
    $node->remove_child ($_) for @old_children;

    ## Step 3, 4, 5 # MUST
    return $class->parse_string ($$s => $node, $onerror);
  }

  ## --- Element ---
  ## TODO: If non-html element

  ## NOTE: Most of this code is copied from |parse_string|

  ## Step 1 # MUST
  ## The fragment is parsed into a freshly created document.
  my $fragment_doc = $node->owner_document->implementation->create_document;
  ## TODO: Mark as HTML document
  my $parser = $class->new;
  $parser->{document} = $fragment_doc;

  ## Step 9 # MUST
  ## Input stream state, shared by the two closures below.
  my $pos = 0;
  my $line = 1;
  my $column = 0;
  $parser->{set_next_input_character} = sub {
    my $self = shift;

    ## End of input is reported as -1.
    if ($pos >= length $$s) {
      $self->{next_input_character} = -1;
      return;
    }

    $self->{next_input_character} = ord substr $$s, $pos++, 1;
    $column++;

    my $c = $self->{next_input_character};
    if ($c == 0x000A) { # LF
      $line++;
      $column = 0;
    } elsif ($c == 0x000D) { # CR
      ## A CR is always reported as LF.  An LF directly after it is
      ## swallowed; any other following character is pushed onto
      ## |$self->{char}| (presumably the pending-character queue that
      ## the tokenizer reads first -- confirm against the tokenizer).
      if ($pos < length $$s) {
        my $following = ord substr $$s, $pos++, 1;
        push @{$self->{char}}, $following unless $following == 0x000A;
      }
      $self->{next_input_character} = 0x000A; # LF # MUST
      $line++;
      $column = 0;
    } elsif ($c > 0x10FFFF or $c == 0x0000) {
      ## Out-of-range code points and NULL
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };

  ## Parse errors are forwarded with the current position appended.
  my $report = $onerror || sub {
    my %opt = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  $parser->{parse_error} = sub {
    $report->(@_, line => $line, column => $column);
  };

  $parser->_initialize_tokenizer;
  $parser->_initialize_tree_constructor;

  ## Step 2
  ## The initial content model flag depends on the context element.
  my %content_model_of = (
    title => 'RCDATA',
    textarea => 'RCDATA',
    style => 'CDATA',
    script => 'CDATA',
    xmp => 'CDATA',
    iframe => 'CDATA',
    noembed => 'CDATA',
    noframes => 'CDATA',
    noscript => 'CDATA',
    plaintext => 'PLAINTEXT',
  );
  my $node_ln = $node->local_name;
  $parser->{content_model_flag} = $content_model_of{$node_ln} || 'PCDATA';
  ## ISSUE: What is "the name of the element"? local name?

  $parser->{inner_html_node} = [$node, $node_ln];

  ## Step 4
  my $root = $fragment_doc->create_element_ns
      ('http://www.w3.org/1999/xhtml', [undef, 'html']);

  ## Step 5 # MUST
  $fragment_doc->append_child ($root);

  ## Step 6 # MUST
  push @{$parser->{open_elements}}, [$root, 'html'];

  undef $parser->{head_element};

  ## Step 7 # MUST
  $parser->_reset_insertion_mode;

  ## Step 8 # MUST
  ## The nearest XHTML |form| among the context node and its ancestors
  ## becomes the parser's form element.
  my $ancestor = $node;
  ANCESTOR: while (defined $ancestor) {
    if ($ancestor->node_type == 1) {
      my $ns = $ancestor->namespace_uri;
      if (defined $ns and $ns eq 'http://www.w3.org/1999/xhtml' and
          $ancestor->local_name eq 'form') { ## TODO: case?
        $parser->{form_element} = $ancestor;
        last ANCESTOR;
      }
    }
    $ancestor = $ancestor->parent_node;
  } # ANCESTOR

  ## Step 3 # MUST
  ## Step 10 # MUST
  {
    ## The macro below refers to |$self|.
    my $self = $parser;
    !!!next-token;
  }
  $parser->_tree_construction_main;

  ## Step 11 # MUST
  my @old_children = @{$node->child_nodes};
  $node->remove_child ($_) for @old_children;
  ## ISSUE: mutation events? read-only?

  ## Step 12 # MUST
  ## Move the parsed nodes from under the synthetic root into |$node|.
  my @parsed_children = @{$root->child_nodes};
  $node->append_child ($_) for @parsed_children;
  ## ISSUE: adopt_node? mutation events?

  return $parser->_terminate_tree_constructor;
} # set_inner_html
4996
4997 } # tree construction stage
4998
## Implements the |innerHTML| getter: serializes the children of |$node|
## (not |$node| itself) as HTML markup.
##
##   my $string_ref = $class->get_inner_html ($node, $on_error);
##
## The invocant is ignored.  |$on_error| is an optional code reference; it
## is called with any descendant whose node type this serializer does not
## handle.  Returns a reference to the serialized string.
sub get_inner_html ($$$) {
  my (undef, $node, $on_error) = @_;

  ## Elements whose text content is emitted without escaping.
  my %is_cdata_element = (
    style => 1, script => 1, xmp => 1, iframe => 1,
    noembed => 1, noframes => 1, noscript => 1,
  );

  ## Elements serialized as a start tag only (no content, no end tag).
  my %is_void_element = (
    area => 1, base => 1, basefont => 1, bgsound => 1,
    br => 1, col => 1, embed => 1, frame => 1, hr => 1,
    img => 1, input => 1, link => 1, meta => 1, param => 1,
    spacer => 1, wbr => 1,
  );

  ## Escaping shared by attribute values and ordinary text.
  my $escape = sub {
    my $text = shift;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
  };

  ## Step 1
  my $s = '';

  ## If |$node| itself or one of its ancestors is an XHTML CDATA-like
  ## element, text is emitted verbatim from the start.
  my $in_cdata;
  my $ancestor = $node;
  ANCESTOR: while (defined $ancestor) {
    if ($ancestor->node_type == 1) {
      ## The namespace URI can be undefined; test before comparing.
      my $nsuri = $ancestor->namespace_uri;
      if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml' and
          $is_cdata_element{$ancestor->local_name}) { ## TODO: case thingy
        $in_cdata = 1;
        last ANCESTOR; # one match is enough
      }
    }
    $ancestor = $ancestor->parent_node;
  } # ANCESTOR

  ## Step 2
  ## |@node| is a work queue processed from the front.  Besides node
  ## objects it holds plain strings: the sentinel |cdata-out|, which ends
  ## verbatim mode, and pending end tags to be appended as they are.
  my @node = @{$node->child_nodes};
  C: while (@node) {
    my $child = shift @node;
    unless (ref $child) {
      if ($child eq 'cdata-out') {
        $in_cdata = 0;
      } else {
        $s .= $child; # end tag
      }
      next C;
    }

    my $nt = $child->node_type;
    if ($nt == 1) { # Element
      my $tag_name = lc $child->tag_name; ## ISSUE: Definition of "lowercase"
      $s .= '<' . $tag_name;

      ## ISSUE: Non-html elements

      my @attrs = @{$child->attributes}; # sort order MUST be stable
      for my $attr (@attrs) { # order is implementation dependent
        my $attr_name = lc $attr->name; ## ISSUE: Definition of "lowercase"
        my $attr_value = $attr->value;
        $s .= ' ' . $attr_name . '="' . $escape->($attr_value) . '"';
      }
      $s .= '>';

      next C if $is_void_element{$tag_name};

      ## Entering the outermost CDATA-like element: switch to verbatim
      ## mode and arrange to leave it after this element's end tag.
      if (not $in_cdata and $is_cdata_element{$tag_name}) {
        unshift @node, 'cdata-out';
        $in_cdata = 1;
      }

      ## Queue front becomes: children, end tag, (|cdata-out|,) rest.
      unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';
    } elsif ($nt == 3 or $nt == 4) { # Text, CDATASection
      my $value = $child->data;
      $s .= $in_cdata ? $value : $escape->($value);
    } elsif ($nt == 8) { # Comment
      $s .= '<!--' . $child->data . '-->';
    } elsif ($nt == 10) { # DocumentType
      $s .= '<!DOCTYPE ' . $child->name . '>';
    } elsif ($nt == 5) { # entrefs
      ## Expand the reference in place.  The children go to the FRONT of
      ## the queue so that they are emitted in document order, before the
      ## enclosing element's pending end tag and |cdata-out| sentinel.
      unshift @node, @{$child->child_nodes};
    } else {
      $on_error->($child) if defined $on_error;
    }
    ## ISSUE: This code does not support PIs.
  } # C

  ## Step 3
  return \$s;
} # get_inner_html
5095
5096 1;
5097 # $Date: 2007/06/23 02:41:51 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24