/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.15 - (show annotations) (download) (as text)
Sat Jun 23 06:48:24 2007 UTC (17 years, 4 months ago) by wakaba
Branch: MAIN
Changes since 1.14: +4 -23 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/ChangeLog	23 Jun 2007 06:48:21 -0000
	* HTML.pm.src: Parse errors immediately after U+000D
	were ignored and U+000D immediately following another
	U+000D was not converted to U+000A.

2007-06-23  Wakaba  <wakaba@suika.fam.cx>

package Whatpm::HTML;
use strict;

## Module version, taken from the CVS |Revision| keyword below: the
## first number is printed as is, every following number as two digits.
our $VERSION = do {
  my @number = (q$Revision: 1.14 $ =~ /\d+/g);
  my $format = '%d.' . ('%02d' x $#number);
  sprintf $format, @number;
};
4
5 ## This is an early version of an HTML parser.
6
## Start tag names for which a "/" directly before the closing ">"
## (as in |<br/>|) is accepted without a |nestc| parse error.
my $permitted_slash_tag_name = {
  map { $_ => 1 } qw(
    base link meta hr br img embed param area col input
  )
};
20
## Entity name => the character it stands for (a one-character string).
## Names are case-sensitive.  Each word of the table below is
## |Name=HHHH| where HHHH is the hexadecimal code point of the character.
## Presumably consulted when the tokenizer consumes an entity; the
## consumer is not part of this table.
my $entity_char = {
  map {
    my ($name, $code_point) = split /=/;
    ($name => chr hex $code_point);
  } qw(
    AElig=00C6 Aacute=00C1 Acirc=00C2 Agrave=00C0
    Alpha=0391 Aring=00C5 Atilde=00C3 Auml=00C4
    Beta=0392 Ccedil=00C7 Chi=03A7 Dagger=2021
    Delta=0394 ETH=00D0 Eacute=00C9 Ecirc=00CA
    Egrave=00C8 Epsilon=0395 Eta=0397 Euml=00CB
    Gamma=0393 Iacute=00CD Icirc=00CE Igrave=00CC
    Iota=0399 Iuml=00CF Kappa=039A Lambda=039B
    Mu=039C Ntilde=00D1 Nu=039D OElig=0152
    Oacute=00D3 Ocirc=00D4 Ograve=00D2 Omega=03A9
    Omicron=039F Oslash=00D8 Otilde=00D5 Ouml=00D6
    Phi=03A6 Pi=03A0 Prime=2033 Psi=03A8
    Rho=03A1 Scaron=0160 Sigma=03A3 THORN=00DE
    Tau=03A4 Theta=0398 Uacute=00DA Ucirc=00DB
    Ugrave=00D9 Upsilon=03A5 Uuml=00DC Xi=039E
    Yacute=00DD Yuml=0178 Zeta=0396
    aacute=00E1 acirc=00E2 acute=00B4 aelig=00E6
    agrave=00E0 alefsym=2135 alpha=03B1 amp=0026
    AMP=0026 and=2227 ang=2220 apos=0027
    aring=00E5 asymp=2248 atilde=00E3 auml=00E4
    bdquo=201E beta=03B2 brvbar=00A6 bull=2022
    cap=2229 ccedil=00E7 cedil=00B8 cent=00A2
    chi=03C7 circ=02C6 clubs=2663 cong=2245
    copy=00A9 COPY=00A9 crarr=21B5 cup=222A
    curren=00A4 dArr=21D3 dagger=2020 darr=2193
    deg=00B0 delta=03B4 diams=2666 divide=00F7
    eacute=00E9 ecirc=00EA egrave=00E8 empty=2205
    emsp=2003 ensp=2002 epsilon=03B5 equiv=2261
    eta=03B7 eth=00F0 euml=00EB euro=20AC
    exist=2203 fnof=0192 forall=2200 frac12=00BD
    frac14=00BC frac34=00BE frasl=2044 gamma=03B3
    ge=2265 gt=003E GT=003E hArr=21D4
    harr=2194 hearts=2665 hellip=2026 iacute=00ED
    icirc=00EE iexcl=00A1 igrave=00EC image=2111
    infin=221E int=222B iota=03B9 iquest=00BF
    isin=2208 iuml=00EF kappa=03BA lArr=21D0
    lambda=03BB lang=2329 laquo=00AB larr=2190
    lceil=2308 ldquo=201C le=2264 lfloor=230A
    lowast=2217 loz=25CA lrm=200E lsaquo=2039
    lsquo=2018 lt=003C LT=003C macr=00AF
    mdash=2014 micro=00B5 middot=00B7 minus=2212
    mu=03BC nabla=2207 nbsp=00A0 ndash=2013
    ne=2260 ni=220B not=00AC notin=2209
    nsub=2284 ntilde=00F1 nu=03BD oacute=00F3
    ocirc=00F4 oelig=0153 ograve=00F2 oline=203E
    omega=03C9 omicron=03BF oplus=2295 or=2228
    ordf=00AA ordm=00BA oslash=00F8 otilde=00F5
    otimes=2297 ouml=00F6 para=00B6 part=2202
    permil=2030 perp=22A5 phi=03C6 pi=03C0
    piv=03D6 plusmn=00B1 pound=00A3 prime=2032
    prod=220F prop=221D psi=03C8 quot=0022
    QUOT=0022 rArr=21D2 radic=221A rang=232A
    raquo=00BB rarr=2192 rceil=2309 rdquo=201D
    real=211C reg=00AE REG=00AE rfloor=230B
    rho=03C1 rlm=200F rsaquo=203A rsquo=2019
    sbquo=201A scaron=0161 sdot=22C5 sect=00A7
    shy=00AD sigma=03C3 sigmaf=03C2 sim=223C
    spades=2660 sub=2282 sube=2286 sum=2211
    sup=2283 sup1=00B9 sup2=00B2 sup3=00B3
    supe=2287 szlig=00DF tau=03C4 there4=2234
    theta=03B8 thetasym=03D1 thinsp=2009 thorn=00FE
    tilde=02DC times=00D7 trade=2122 uArr=21D1
    uacute=00FA uarr=2191 ucirc=00FB ugrave=00F9
    uml=00A8 upsih=03D2 upsilon=03C5 uuml=00FC
    weierp=2118 xi=03BE yacute=00FD yen=00A5
    yuml=00FF zeta=03B6 zwj=200D zwnj=200C
  )
}; # $entity_char
282
## Replacement table for the numbers 0x80..0x9F: the key is a number in
## that range, the value is the code point used in its place.  Five of
## them (0x81, 0x8D, 0x8F, 0x90 and 0x9D) map to U+FFFD.
my $c1_entity_char = do {
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
  );
  my %table;
  @table{0x80 .. 0x9F} = @replacement;
  \%table;
}; # $c1_entity_char
317
## Names of the elements in the "special" category.
my $special_category = {
  map { $_ => 1 } qw(
    address area base basefont bgsound blockquote body br center col
    colgroup dd dir div dl dt embed fieldset form frame frameset
    h1 h2 h3 h4 h5 h6 head hr iframe image img input isindex li link
    listing menu meta noembed noframes noscript ol optgroup option p
    param plaintext pre script select spacer style tbody textarea
    tfoot thead title tr ul wbr
  )
};
## Names of the elements in the "scoping" category.
my $scoping_category = {
  map { $_ => 1 } qw(button caption html marquee object table td th)
};
## Names of the elements in the "formatting" category:
## a, b, big, em, font, i, nobr, s, small, strike, strong, tt and u.
## The key must be spelled |strike|; a misspelled key silently drops the
## |strike| element from this category.
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
338 # $phrasing_category: all other elements
339
## parse_string ($class, $string, $document[, $onerror])
##
## Creates a new parser object, makes it read |$string| one character
## at a time, runs the tokenizer and tree constructor steps below and
## returns |$document|.
##
## |$onerror| is an optional code reference.  It is called for each
## parse error with the pairs given where the error is raised (such as
## |type|) followed by |line| and |column|.  Without it, parse errors
## are reported through |warn|.
sub parse_string ($$$;$) {
  my $self = shift->new;
  my $source = \$_[0]; # reference to the caller's string
  $self->{document} = $_[1];

  ## NOTE: |set_inner_html| copies most of this method's code

  ## Reading position; shared with the closures below
  my ($offset, $line, $column) = (0, 1, 0);

  $self->{set_next_input_character} = sub {
    my $self = shift;

    ## Remember the last three characters, most recent first
    pop @{$self->{prev_input_character}};
    unshift @{$self->{prev_input_character}}, $self->{next_input_character};

    if ($offset >= length $$source) {
      $self->{next_input_character} = -1; # end of input
      return;
    }

    my $code = ord substr $$source, $offset++, 1;
    $self->{next_input_character} = $code;
    $column++;

    if ($code == 0x000A or $code == 0x000D) { # LF or CR
      if ($code == 0x000D) {
        ## CR, or CR followed by LF, is handed out as a single LF
        $offset++ if substr ($$source, $offset, 1) eq "\x0A";
        $self->{next_input_character} = 0x000A; # LF # MUST
      }
      $line++;
      $column = 0;
    } elsif ($code > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($code == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $self->{prev_input_character} = [-1, -1, -1];
  $self->{next_input_character} = -1;

  ## Default error handler: print a warning with the position
  my $onerror = $_[2];
  $onerror ||= sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  ## Every reported error gets the current position appended
  $self->{parse_error} = sub {
    $onerror->(@_, line => $line, column => $column);
  };

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  return $self->{document};
} # parse_string
393
## new ($class)
##
## Returns a new parser object blessed into |$class|, with two default
## callbacks installed:
##   |set_next_input_character| - sets the next input character to -1
##   |parse_error|              - does nothing
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;

  ## The callback lives inside the object, so it must only hold a weak
  ## reference to the object.  A strong one would form a reference
  ## cycle and the object would never be freed.
  require Scalar::Util;
  my $weak_self = $self;
  Scalar::Util::weaken ($weak_self);
  $self->{set_next_input_character} = sub {
    $weak_self->{next_input_character} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  return $self;
} # new
405
406 ## Implementations MUST act as if they used the state machine in the spec
407
## _initialize_tokenizer ($self)
##
## Puts the tokenizer fields of the parser object into their starting
## values and fetches the first input character.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## Starting state and content model flag (MUST)
  $self->{state} = 'data';
  $self->{content_model_flag} = 'PCDATA';

  ## Nothing is being built yet.  |current_token| will later hold a
  ## start tag, end tag, comment, or DOCTYPE.
  $self->{$_} = undef for qw(
    current_token
    current_attribute
    last_emitted_start_tag_name
    last_attribute_value_state
  );

  $self->{char} = [];
  ## Sets $self->{next_input_character}
  !!!next-input-character;
  $self->{token} = [];
} # _initialize_tokenizer
421
422 ## A token has:
423 ## ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
424 ## 'character', or 'end-of-file'
425 ## ->{name} (DOCTYPE, start tag (tagname), end tag (tagname))
426 ## ISSUE: the spec needs s/tagname/tag name/
427 ## ->{error} == 1 or 0 (DOCTYPE)
428 ## ->{attributes} isa HASH (start tag, end tag)
429 ## ->{data} (comment, character)
430
431 ## Macros
432 ## Macros MUST be preceded by three EXCLAMATION MARKs.
433 ## emit ($token)
434 ## Emits the specified token.
435
436 ## Emitted token MUST immediately be handled by the tree construction state.
437
438 ## Before each step, UA MAY check to see if either one of the scripts in
439 ## "list of scripts that will execute as soon as possible" or the first
440 ## script in the "list of scripts that will execute asynchronously",
441 ## has completed loading. If one has, then it MUST be executed
442 ## and removed from the list.
443
444 sub _get_next_token ($) {
445 my $self = shift;
446 if (@{$self->{token}}) {
447 return shift @{$self->{token}};
448 }
449
450 A: {
451 if ($self->{state} eq 'data') {
452 if ($self->{next_input_character} == 0x0026) { # &
453 if ($self->{content_model_flag} eq 'PCDATA' or
454 $self->{content_model_flag} eq 'RCDATA') {
455 $self->{state} = 'entity data';
456 !!!next-input-character;
457 redo A;
458 } else {
459 #
460 }
461 } elsif ($self->{next_input_character} == 0x002D) { # -
462 if ($self->{content_model_flag} eq 'RCDATA' or
463 $self->{content_model_flag} eq 'CDATA') {
464 unless ($self->{escape}) {
465 if ($self->{prev_input_character}->[0] == 0x002D and # -
466 $self->{prev_input_character}->[1] == 0x0021 and # !
467 $self->{prev_input_character}->[2] == 0x003C) { # <
468 $self->{escape} = 1;
469 }
470 }
471 }
472
473 #
474 } elsif ($self->{next_input_character} == 0x003C) { # <
475 if ($self->{content_model_flag} eq 'PCDATA' or
476 (($self->{content_model_flag} eq 'CDATA' or
477 $self->{content_model_flag} eq 'RCDATA') and
478 not $self->{escape})) {
479 $self->{state} = 'tag open';
480 !!!next-input-character;
481 redo A;
482 } else {
483 #
484 }
485 } elsif ($self->{next_input_character} == 0x003E) { # >
486 if ($self->{escape} and
487 ($self->{content_model_flag} eq 'RCDATA' or
488 $self->{content_model_flag} eq 'CDATA')) {
489 if ($self->{prev_input_character}->[0] == 0x002D and # -
490 $self->{prev_input_character}->[1] == 0x002D) { # -
491 delete $self->{escape};
492 }
493 }
494
495 #
496 } elsif ($self->{next_input_character} == -1) {
497 !!!emit ({type => 'end-of-file'});
498 last A; ## TODO: ok?
499 }
500 # Anything else
501 my $token = {type => 'character',
502 data => chr $self->{next_input_character}};
503 ## Stay in the data state
504 !!!next-input-character;
505
506 !!!emit ($token);
507
508 redo A;
509 } elsif ($self->{state} eq 'entity data') {
510 ## (cannot happen in CDATA state)
511
512 my $token = $self->_tokenize_attempt_to_consume_an_entity;
513
514 $self->{state} = 'data';
515 # next-input-character is already done
516
517 unless (defined $token) {
518 !!!emit ({type => 'character', data => '&'});
519 } else {
520 !!!emit ($token);
521 }
522
523 redo A;
524 } elsif ($self->{state} eq 'tag open') {
525 if ($self->{content_model_flag} eq 'RCDATA' or
526 $self->{content_model_flag} eq 'CDATA') {
527 if ($self->{next_input_character} == 0x002F) { # /
528 !!!next-input-character;
529 $self->{state} = 'close tag open';
530 redo A;
531 } else {
532 ## reconsume
533 $self->{state} = 'data';
534
535 !!!emit ({type => 'character', data => '<'});
536
537 redo A;
538 }
539 } elsif ($self->{content_model_flag} eq 'PCDATA') {
540 if ($self->{next_input_character} == 0x0021) { # !
541 $self->{state} = 'markup declaration open';
542 !!!next-input-character;
543 redo A;
544 } elsif ($self->{next_input_character} == 0x002F) { # /
545 $self->{state} = 'close tag open';
546 !!!next-input-character;
547 redo A;
548 } elsif (0x0041 <= $self->{next_input_character} and
549 $self->{next_input_character} <= 0x005A) { # A..Z
550 $self->{current_token}
551 = {type => 'start tag',
552 tag_name => chr ($self->{next_input_character} + 0x0020)};
553 $self->{state} = 'tag name';
554 !!!next-input-character;
555 redo A;
556 } elsif (0x0061 <= $self->{next_input_character} and
557 $self->{next_input_character} <= 0x007A) { # a..z
558 $self->{current_token} = {type => 'start tag',
559 tag_name => chr ($self->{next_input_character})};
560 $self->{state} = 'tag name';
561 !!!next-input-character;
562 redo A;
563 } elsif ($self->{next_input_character} == 0x003E) { # >
564 !!!parse-error (type => 'empty start tag');
565 $self->{state} = 'data';
566 !!!next-input-character;
567
568 !!!emit ({type => 'character', data => '<>'});
569
570 redo A;
571 } elsif ($self->{next_input_character} == 0x003F) { # ?
572 !!!parse-error (type => 'pio');
573 $self->{state} = 'bogus comment';
574 ## $self->{next_input_character} is intentionally left as is
575 redo A;
576 } else {
577 !!!parse-error (type => 'bare stago');
578 $self->{state} = 'data';
579 ## reconsume
580
581 !!!emit ({type => 'character', data => '<'});
582
583 redo A;
584 }
585 } else {
586 die "$0: $self->{content_model_flag}: Unknown content model flag";
587 }
588 } elsif ($self->{state} eq 'close tag open') {
589 if ($self->{content_model_flag} eq 'RCDATA' or
590 $self->{content_model_flag} eq 'CDATA') {
591 my @next_char;
592 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
593 push @next_char, $self->{next_input_character};
594 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
595 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
596 if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
597 !!!next-input-character;
598 next TAGNAME;
599 } else {
600 !!!parse-error (type => 'unmatched end tag');
601 $self->{next_input_character} = shift @next_char; # reconsume
602 !!!back-next-input-character (@next_char);
603 $self->{state} = 'data';
604
605 !!!emit ({type => 'character', data => '</'});
606
607 redo A;
608 }
609 }
610 push @next_char, $self->{next_input_character};
611
612 unless ($self->{next_input_character} == 0x0009 or # HT
613 $self->{next_input_character} == 0x000A or # LF
614 $self->{next_input_character} == 0x000B or # VT
615 $self->{next_input_character} == 0x000C or # FF
616 $self->{next_input_character} == 0x0020 or # SP
617 $self->{next_input_character} == 0x003E or # >
618 $self->{next_input_character} == 0x002F or # /
619 $self->{next_input_character} == 0x003C or # <
620 $self->{next_input_character} == -1) {
621 !!!parse-error (type => 'unmatched end tag');
622 $self->{next_input_character} = shift @next_char; # reconsume
623 !!!back-next-input-character (@next_char);
624 $self->{state} = 'data';
625
626 !!!emit ({type => 'character', data => '</'});
627
628 redo A;
629 } else {
630 $self->{next_input_character} = shift @next_char;
631 !!!back-next-input-character (@next_char);
632 # and consume...
633 }
634 }
635
636 if (0x0041 <= $self->{next_input_character} and
637 $self->{next_input_character} <= 0x005A) { # A..Z
638 $self->{current_token} = {type => 'end tag',
639 tag_name => chr ($self->{next_input_character} + 0x0020)};
640 $self->{state} = 'tag name';
641 !!!next-input-character;
642 redo A;
643 } elsif (0x0061 <= $self->{next_input_character} and
644 $self->{next_input_character} <= 0x007A) { # a..z
645 $self->{current_token} = {type => 'end tag',
646 tag_name => chr ($self->{next_input_character})};
647 $self->{state} = 'tag name';
648 !!!next-input-character;
649 redo A;
650 } elsif ($self->{next_input_character} == 0x003E) { # >
651 !!!parse-error (type => 'empty end tag');
652 $self->{state} = 'data';
653 !!!next-input-character;
654 redo A;
655 } elsif ($self->{next_input_character} == -1) {
656 !!!parse-error (type => 'bare etago');
657 $self->{state} = 'data';
658 # reconsume
659
660 !!!emit ({type => 'character', data => '</'});
661
662 redo A;
663 } else {
664 !!!parse-error (type => 'bogus end tag');
665 $self->{state} = 'bogus comment';
666 ## $self->{next_input_character} is intentionally left as is
667 redo A;
668 }
669 } elsif ($self->{state} eq 'tag name') {
670 if ($self->{next_input_character} == 0x0009 or # HT
671 $self->{next_input_character} == 0x000A or # LF
672 $self->{next_input_character} == 0x000B or # VT
673 $self->{next_input_character} == 0x000C or # FF
674 $self->{next_input_character} == 0x0020) { # SP
675 $self->{state} = 'before attribute name';
676 !!!next-input-character;
677 redo A;
678 } elsif ($self->{next_input_character} == 0x003E) { # >
679 if ($self->{current_token}->{type} eq 'start tag') {
680 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
681 } elsif ($self->{current_token}->{type} eq 'end tag') {
682 $self->{content_model_flag} = 'PCDATA'; # MUST
683 if ($self->{current_token}->{attributes}) {
684 !!!parse-error (type => 'end tag attribute');
685 }
686 } else {
687 die "$0: $self->{current_token}->{type}: Unknown token type";
688 }
689 $self->{state} = 'data';
690 !!!next-input-character;
691
692 !!!emit ($self->{current_token}); # start tag or end tag
693 undef $self->{current_token};
694
695 redo A;
696 } elsif (0x0041 <= $self->{next_input_character} and
697 $self->{next_input_character} <= 0x005A) { # A..Z
698 $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
699 # start tag or end tag
700 ## Stay in this state
701 !!!next-input-character;
702 redo A;
703 } elsif ($self->{next_input_character} == 0x003C or # <
704 $self->{next_input_character} == -1) {
705 !!!parse-error (type => 'unclosed tag');
706 if ($self->{current_token}->{type} eq 'start tag') {
707 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
708 } elsif ($self->{current_token}->{type} eq 'end tag') {
709 $self->{content_model_flag} = 'PCDATA'; # MUST
710 if ($self->{current_token}->{attributes}) {
711 !!!parse-error (type => 'end tag attribute');
712 }
713 } else {
714 die "$0: $self->{current_token}->{type}: Unknown token type";
715 }
716 $self->{state} = 'data';
717 # reconsume
718
719 !!!emit ($self->{current_token}); # start tag or end tag
720 undef $self->{current_token};
721
722 redo A;
723 } elsif ($self->{next_input_character} == 0x002F) { # /
724 !!!next-input-character;
725 if ($self->{next_input_character} == 0x003E and # >
726 $self->{current_token}->{type} eq 'start tag' and
727 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
728 # permitted slash
729 #
730 } else {
731 !!!parse-error (type => 'nestc');
732 }
733 $self->{state} = 'before attribute name';
734 # next-input-character is already done
735 redo A;
736 } else {
737 $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
738 # start tag or end tag
739 ## Stay in the state
740 !!!next-input-character;
741 redo A;
742 }
743 } elsif ($self->{state} eq 'before attribute name') {
744 if ($self->{next_input_character} == 0x0009 or # HT
745 $self->{next_input_character} == 0x000A or # LF
746 $self->{next_input_character} == 0x000B or # VT
747 $self->{next_input_character} == 0x000C or # FF
748 $self->{next_input_character} == 0x0020) { # SP
749 ## Stay in the state
750 !!!next-input-character;
751 redo A;
752 } elsif ($self->{next_input_character} == 0x003E) { # >
753 if ($self->{current_token}->{type} eq 'start tag') {
754 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
755 } elsif ($self->{current_token}->{type} eq 'end tag') {
756 $self->{content_model_flag} = 'PCDATA'; # MUST
757 if ($self->{current_token}->{attributes}) {
758 !!!parse-error (type => 'end tag attribute');
759 }
760 } else {
761 die "$0: $self->{current_token}->{type}: Unknown token type";
762 }
763 $self->{state} = 'data';
764 !!!next-input-character;
765
766 !!!emit ($self->{current_token}); # start tag or end tag
767 undef $self->{current_token};
768
769 redo A;
770 } elsif (0x0041 <= $self->{next_input_character} and
771 $self->{next_input_character} <= 0x005A) { # A..Z
772 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
773 value => ''};
774 $self->{state} = 'attribute name';
775 !!!next-input-character;
776 redo A;
777 } elsif ($self->{next_input_character} == 0x002F) { # /
778 !!!next-input-character;
779 if ($self->{next_input_character} == 0x003E and # >
780 $self->{current_token}->{type} eq 'start tag' and
781 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
782 # permitted slash
783 #
784 } else {
785 !!!parse-error (type => 'nestc');
786 }
787 ## Stay in the state
788 # next-input-character is already done
789 redo A;
790 } elsif ($self->{next_input_character} == 0x003C or # <
791 $self->{next_input_character} == -1) {
792 !!!parse-error (type => 'unclosed tag');
793 if ($self->{current_token}->{type} eq 'start tag') {
794 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
795 } elsif ($self->{current_token}->{type} eq 'end tag') {
796 $self->{content_model_flag} = 'PCDATA'; # MUST
797 if ($self->{current_token}->{attributes}) {
798 !!!parse-error (type => 'end tag attribute');
799 }
800 } else {
801 die "$0: $self->{current_token}->{type}: Unknown token type";
802 }
803 $self->{state} = 'data';
804 # reconsume
805
806 !!!emit ($self->{current_token}); # start tag or end tag
807 undef $self->{current_token};
808
809 redo A;
810 } else {
811 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
812 value => ''};
813 $self->{state} = 'attribute name';
814 !!!next-input-character;
815 redo A;
816 }
817 } elsif ($self->{state} eq 'attribute name') {
818 my $before_leave = sub {
819 if (exists $self->{current_token}->{attributes} # start tag or end tag
820 ->{$self->{current_attribute}->{name}}) { # MUST
821 !!!parse-error (type => 'dupulicate attribute');
822 ## Discard $self->{current_attribute} # MUST
823 } else {
824 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
825 = $self->{current_attribute};
826 }
827 }; # $before_leave
828
829 if ($self->{next_input_character} == 0x0009 or # HT
830 $self->{next_input_character} == 0x000A or # LF
831 $self->{next_input_character} == 0x000B or # VT
832 $self->{next_input_character} == 0x000C or # FF
833 $self->{next_input_character} == 0x0020) { # SP
834 $before_leave->();
835 $self->{state} = 'after attribute name';
836 !!!next-input-character;
837 redo A;
838 } elsif ($self->{next_input_character} == 0x003D) { # =
839 $before_leave->();
840 $self->{state} = 'before attribute value';
841 !!!next-input-character;
842 redo A;
843 } elsif ($self->{next_input_character} == 0x003E) { # >
844 $before_leave->();
845 if ($self->{current_token}->{type} eq 'start tag') {
846 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
847 } elsif ($self->{current_token}->{type} eq 'end tag') {
848 $self->{content_model_flag} = 'PCDATA'; # MUST
849 if ($self->{current_token}->{attributes}) {
850 !!!parse-error (type => 'end tag attribute');
851 }
852 } else {
853 die "$0: $self->{current_token}->{type}: Unknown token type";
854 }
855 $self->{state} = 'data';
856 !!!next-input-character;
857
858 !!!emit ($self->{current_token}); # start tag or end tag
859 undef $self->{current_token};
860
861 redo A;
862 } elsif (0x0041 <= $self->{next_input_character} and
863 $self->{next_input_character} <= 0x005A) { # A..Z
864 $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
865 ## Stay in the state
866 !!!next-input-character;
867 redo A;
868 } elsif ($self->{next_input_character} == 0x002F) { # /
869 $before_leave->();
870 !!!next-input-character;
871 if ($self->{next_input_character} == 0x003E and # >
872 $self->{current_token}->{type} eq 'start tag' and
873 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
874 # permitted slash
875 #
876 } else {
877 !!!parse-error (type => 'nestc');
878 }
879 $self->{state} = 'before attribute name';
880 # next-input-character is already done
881 redo A;
882 } elsif ($self->{next_input_character} == 0x003C or # <
883 $self->{next_input_character} == -1) {
884 !!!parse-error (type => 'unclosed tag');
885 $before_leave->();
886 if ($self->{current_token}->{type} eq 'start tag') {
887 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
888 } elsif ($self->{current_token}->{type} eq 'end tag') {
889 $self->{content_model_flag} = 'PCDATA'; # MUST
890 if ($self->{current_token}->{attributes}) {
891 !!!parse-error (type => 'end tag attribute');
892 }
893 } else {
894 die "$0: $self->{current_token}->{type}: Unknown token type";
895 }
896 $self->{state} = 'data';
897 # reconsume
898
899 !!!emit ($self->{current_token}); # start tag or end tag
900 undef $self->{current_token};
901
902 redo A;
903 } else {
904 $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
905 ## Stay in the state
906 !!!next-input-character;
907 redo A;
908 }
909 } elsif ($self->{state} eq 'after attribute name') {
910 if ($self->{next_input_character} == 0x0009 or # HT
911 $self->{next_input_character} == 0x000A or # LF
912 $self->{next_input_character} == 0x000B or # VT
913 $self->{next_input_character} == 0x000C or # FF
914 $self->{next_input_character} == 0x0020) { # SP
915 ## Stay in the state
916 !!!next-input-character;
917 redo A;
918 } elsif ($self->{next_input_character} == 0x003D) { # =
919 $self->{state} = 'before attribute value';
920 !!!next-input-character;
921 redo A;
922 } elsif ($self->{next_input_character} == 0x003E) { # >
923 if ($self->{current_token}->{type} eq 'start tag') {
924 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
925 } elsif ($self->{current_token}->{type} eq 'end tag') {
926 $self->{content_model_flag} = 'PCDATA'; # MUST
927 if ($self->{current_token}->{attributes}) {
928 !!!parse-error (type => 'end tag attribute');
929 }
930 } else {
931 die "$0: $self->{current_token}->{type}: Unknown token type";
932 }
933 $self->{state} = 'data';
934 !!!next-input-character;
935
936 !!!emit ($self->{current_token}); # start tag or end tag
937 undef $self->{current_token};
938
939 redo A;
940 } elsif (0x0041 <= $self->{next_input_character} and
941 $self->{next_input_character} <= 0x005A) { # A..Z
942 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
943 value => ''};
944 $self->{state} = 'attribute name';
945 !!!next-input-character;
946 redo A;
947 } elsif ($self->{next_input_character} == 0x002F) { # /
948 !!!next-input-character;
949 if ($self->{next_input_character} == 0x003E and # >
950 $self->{current_token}->{type} eq 'start tag' and
951 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
952 # permitted slash
953 #
954 } else {
955 !!!parse-error (type => 'nestc');
956 }
957 $self->{state} = 'before attribute name';
958 # next-input-character is already done
959 redo A;
960 } elsif ($self->{next_input_character} == 0x003C or # <
961 $self->{next_input_character} == -1) {
962 !!!parse-error (type => 'unclosed tag');
963 if ($self->{current_token}->{type} eq 'start tag') {
964 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
965 } elsif ($self->{current_token}->{type} eq 'end tag') {
966 $self->{content_model_flag} = 'PCDATA'; # MUST
967 if ($self->{current_token}->{attributes}) {
968 !!!parse-error (type => 'end tag attribute');
969 }
970 } else {
971 die "$0: $self->{current_token}->{type}: Unknown token type";
972 }
973 $self->{state} = 'data';
974 # reconsume
975
976 !!!emit ($self->{current_token}); # start tag or end tag
977 undef $self->{current_token};
978
979 redo A;
980 } else {
981 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
982 value => ''};
983 $self->{state} = 'attribute name';
984 !!!next-input-character;
985 redo A;
986 }
987 } elsif ($self->{state} eq 'before attribute value') {
988 if ($self->{next_input_character} == 0x0009 or # HT
989 $self->{next_input_character} == 0x000A or # LF
990 $self->{next_input_character} == 0x000B or # VT
991 $self->{next_input_character} == 0x000C or # FF
992 $self->{next_input_character} == 0x0020) { # SP
993 ## Stay in the state
994 !!!next-input-character;
995 redo A;
996 } elsif ($self->{next_input_character} == 0x0022) { # "
997 $self->{state} = 'attribute value (double-quoted)';
998 !!!next-input-character;
999 redo A;
1000 } elsif ($self->{next_input_character} == 0x0026) { # &
1001 $self->{state} = 'attribute value (unquoted)';
1002 ## reconsume
1003 redo A;
1004 } elsif ($self->{next_input_character} == 0x0027) { # '
1005 $self->{state} = 'attribute value (single-quoted)';
1006 !!!next-input-character;
1007 redo A;
1008 } elsif ($self->{next_input_character} == 0x003E) { # >
1009 if ($self->{current_token}->{type} eq 'start tag') {
1010 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1011 } elsif ($self->{current_token}->{type} eq 'end tag') {
1012 $self->{content_model_flag} = 'PCDATA'; # MUST
1013 if ($self->{current_token}->{attributes}) {
1014 !!!parse-error (type => 'end tag attribute');
1015 }
1016 } else {
1017 die "$0: $self->{current_token}->{type}: Unknown token type";
1018 }
1019 $self->{state} = 'data';
1020 !!!next-input-character;
1021
1022 !!!emit ($self->{current_token}); # start tag or end tag
1023 undef $self->{current_token};
1024
1025 redo A;
1026 } elsif ($self->{next_input_character} == 0x003C or # <
1027 $self->{next_input_character} == -1) {
1028 !!!parse-error (type => 'unclosed tag');
1029 if ($self->{current_token}->{type} eq 'start tag') {
1030 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1031 } elsif ($self->{current_token}->{type} eq 'end tag') {
1032 $self->{content_model_flag} = 'PCDATA'; # MUST
1033 if ($self->{current_token}->{attributes}) {
1034 !!!parse-error (type => 'end tag attribute');
1035 }
1036 } else {
1037 die "$0: $self->{current_token}->{type}: Unknown token type";
1038 }
1039 $self->{state} = 'data';
1040 ## reconsume
1041
1042 !!!emit ($self->{current_token}); # start tag or end tag
1043 undef $self->{current_token};
1044
1045 redo A;
1046 } else {
1047 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1048 $self->{state} = 'attribute value (unquoted)';
1049 !!!next-input-character;
1050 redo A;
1051 }
1052 } elsif ($self->{state} eq 'attribute value (double-quoted)') {
1053 if ($self->{next_input_character} == 0x0022) { # "
1054 $self->{state} = 'before attribute name';
1055 !!!next-input-character;
1056 redo A;
1057 } elsif ($self->{next_input_character} == 0x0026) { # &
1058 $self->{last_attribute_value_state} = 'attribute value (double-quoted)';
1059 $self->{state} = 'entity in attribute value';
1060 !!!next-input-character;
1061 redo A;
1062 } elsif ($self->{next_input_character} == -1) {
1063 !!!parse-error (type => 'unclosed attribute value');
1064 if ($self->{current_token}->{type} eq 'start tag') {
1065 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1066 } elsif ($self->{current_token}->{type} eq 'end tag') {
1067 $self->{content_model_flag} = 'PCDATA'; # MUST
1068 if ($self->{current_token}->{attributes}) {
1069 !!!parse-error (type => 'end tag attribute');
1070 }
1071 } else {
1072 die "$0: $self->{current_token}->{type}: Unknown token type";
1073 }
1074 $self->{state} = 'data';
1075 ## reconsume
1076
1077 !!!emit ($self->{current_token}); # start tag or end tag
1078 undef $self->{current_token};
1079
1080 redo A;
1081 } else {
1082 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1083 ## Stay in the state
1084 !!!next-input-character;
1085 redo A;
1086 }
1087 } elsif ($self->{state} eq 'attribute value (single-quoted)') {
1088 if ($self->{next_input_character} == 0x0027) { # '
1089 $self->{state} = 'before attribute name';
1090 !!!next-input-character;
1091 redo A;
1092 } elsif ($self->{next_input_character} == 0x0026) { # &
1093 $self->{last_attribute_value_state} = 'attribute value (single-quoted)';
1094 $self->{state} = 'entity in attribute value';
1095 !!!next-input-character;
1096 redo A;
1097 } elsif ($self->{next_input_character} == -1) {
1098 !!!parse-error (type => 'unclosed attribute value');
1099 if ($self->{current_token}->{type} eq 'start tag') {
1100 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1101 } elsif ($self->{current_token}->{type} eq 'end tag') {
1102 $self->{content_model_flag} = 'PCDATA'; # MUST
1103 if ($self->{current_token}->{attributes}) {
1104 !!!parse-error (type => 'end tag attribute');
1105 }
1106 } else {
1107 die "$0: $self->{current_token}->{type}: Unknown token type";
1108 }
1109 $self->{state} = 'data';
1110 ## reconsume
1111
1112 !!!emit ($self->{current_token}); # start tag or end tag
1113 undef $self->{current_token};
1114
1115 redo A;
1116 } else {
1117 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1118 ## Stay in the state
1119 !!!next-input-character;
1120 redo A;
1121 }
1122 } elsif ($self->{state} eq 'attribute value (unquoted)') {
1123 if ($self->{next_input_character} == 0x0009 or # HT
1124 $self->{next_input_character} == 0x000A or # LF
1125 $self->{next_input_character} == 0x000B or # VT
1126 $self->{next_input_character} == 0x000C or # FF
1127 $self->{next_input_character} == 0x0020) { # SP
1128 $self->{state} = 'before attribute name';
1129 !!!next-input-character;
1130 redo A;
1131 } elsif ($self->{next_input_character} == 0x0026) { # &
1132 $self->{last_attribute_value_state} = 'attribute value (unquoted)';
1133 $self->{state} = 'entity in attribute value';
1134 !!!next-input-character;
1135 redo A;
1136 } elsif ($self->{next_input_character} == 0x003E) { # >
1137 if ($self->{current_token}->{type} eq 'start tag') {
1138 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1139 } elsif ($self->{current_token}->{type} eq 'end tag') {
1140 $self->{content_model_flag} = 'PCDATA'; # MUST
1141 if ($self->{current_token}->{attributes}) {
1142 !!!parse-error (type => 'end tag attribute');
1143 }
1144 } else {
1145 die "$0: $self->{current_token}->{type}: Unknown token type";
1146 }
1147 $self->{state} = 'data';
1148 !!!next-input-character;
1149
1150 !!!emit ($self->{current_token}); # start tag or end tag
1151 undef $self->{current_token};
1152
1153 redo A;
1154 } elsif ($self->{next_input_character} == 0x003C or # <
1155 $self->{next_input_character} == -1) {
1156 !!!parse-error (type => 'unclosed tag');
1157 if ($self->{current_token}->{type} eq 'start tag') {
1158 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1159 } elsif ($self->{current_token}->{type} eq 'end tag') {
1160 $self->{content_model_flag} = 'PCDATA'; # MUST
1161 if ($self->{current_token}->{attributes}) {
1162 !!!parse-error (type => 'end tag attribute');
1163 }
1164 } else {
1165 die "$0: $self->{current_token}->{type}: Unknown token type";
1166 }
1167 $self->{state} = 'data';
1168 ## reconsume
1169
1170 !!!emit ($self->{current_token}); # start tag or end tag
1171 undef $self->{current_token};
1172
1173 redo A;
1174 } else {
1175 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1176 ## Stay in the state
1177 !!!next-input-character;
1178 redo A;
1179 }
1180 } elsif ($self->{state} eq 'entity in attribute value') {
1181 my $token = $self->_tokenize_attempt_to_consume_an_entity;
1182
1183 unless (defined $token) {
1184 $self->{current_attribute}->{value} .= '&';
1185 } else {
1186 $self->{current_attribute}->{value} .= $token->{data};
1187 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1188 }
1189
1190 $self->{state} = $self->{last_attribute_value_state};
1191 # next-input-character is already done
1192 redo A;
1193 } elsif ($self->{state} eq 'bogus comment') {
1194 ## (only happen if PCDATA state)
1195
1196 my $token = {type => 'comment', data => ''};
1197
1198 BC: {
1199 if ($self->{next_input_character} == 0x003E) { # >
1200 $self->{state} = 'data';
1201 !!!next-input-character;
1202
1203 !!!emit ($token);
1204
1205 redo A;
1206 } elsif ($self->{next_input_character} == -1) {
1207 $self->{state} = 'data';
1208 ## reconsume
1209
1210 !!!emit ($token);
1211
1212 redo A;
1213 } else {
1214 $token->{data} .= chr ($self->{next_input_character});
1215 !!!next-input-character;
1216 redo BC;
1217 }
1218 } # BC
1219 } elsif ($self->{state} eq 'markup declaration open') {
1220 ## (only happen if PCDATA state)
1221
1222 my @next_char;
1223 push @next_char, $self->{next_input_character};
1224
1225 if ($self->{next_input_character} == 0x002D) { # -
1226 !!!next-input-character;
1227 push @next_char, $self->{next_input_character};
1228 if ($self->{next_input_character} == 0x002D) { # -
1229 $self->{current_token} = {type => 'comment', data => ''};
1230 $self->{state} = 'comment';
1231 !!!next-input-character;
1232 redo A;
1233 }
1234 } elsif ($self->{next_input_character} == 0x0044 or # D
1235 $self->{next_input_character} == 0x0064) { # d
1236 !!!next-input-character;
1237 push @next_char, $self->{next_input_character};
1238 if ($self->{next_input_character} == 0x004F or # O
1239 $self->{next_input_character} == 0x006F) { # o
1240 !!!next-input-character;
1241 push @next_char, $self->{next_input_character};
1242 if ($self->{next_input_character} == 0x0043 or # C
1243 $self->{next_input_character} == 0x0063) { # c
1244 !!!next-input-character;
1245 push @next_char, $self->{next_input_character};
1246 if ($self->{next_input_character} == 0x0054 or # T
1247 $self->{next_input_character} == 0x0074) { # t
1248 !!!next-input-character;
1249 push @next_char, $self->{next_input_character};
1250 if ($self->{next_input_character} == 0x0059 or # Y
1251 $self->{next_input_character} == 0x0079) { # y
1252 !!!next-input-character;
1253 push @next_char, $self->{next_input_character};
1254 if ($self->{next_input_character} == 0x0050 or # P
1255 $self->{next_input_character} == 0x0070) { # p
1256 !!!next-input-character;
1257 push @next_char, $self->{next_input_character};
1258 if ($self->{next_input_character} == 0x0045 or # E
1259 $self->{next_input_character} == 0x0065) { # e
1260 ## ISSUE: What a stupid code this is!
1261 $self->{state} = 'DOCTYPE';
1262 !!!next-input-character;
1263 redo A;
1264 }
1265 }
1266 }
1267 }
1268 }
1269 }
1270 }
1271
1272 !!!parse-error (type => 'bogus comment open');
1273 $self->{next_input_character} = shift @next_char;
1274 !!!back-next-input-character (@next_char);
1275 $self->{state} = 'bogus comment';
1276 redo A;
1277
1278 ## ISSUE: typos in spec: chacacters, is is a parse error
1279 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1280 } elsif ($self->{state} eq 'comment') {
1281 if ($self->{next_input_character} == 0x002D) { # -
1282 $self->{state} = 'comment dash';
1283 !!!next-input-character;
1284 redo A;
1285 } elsif ($self->{next_input_character} == -1) {
1286 !!!parse-error (type => 'unclosed comment');
1287 $self->{state} = 'data';
1288 ## reconsume
1289
1290 !!!emit ($self->{current_token}); # comment
1291 undef $self->{current_token};
1292
1293 redo A;
1294 } else {
1295 $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1296 ## Stay in the state
1297 !!!next-input-character;
1298 redo A;
1299 }
1300 } elsif ($self->{state} eq 'comment dash') {
1301 if ($self->{next_input_character} == 0x002D) { # -
1302 $self->{state} = 'comment end';
1303 !!!next-input-character;
1304 redo A;
1305 } elsif ($self->{next_input_character} == -1) {
1306 !!!parse-error (type => 'unclosed comment');
1307 $self->{state} = 'data';
1308 ## reconsume
1309
1310 !!!emit ($self->{current_token}); # comment
1311 undef $self->{current_token};
1312
1313 redo A;
1314 } else {
1315 $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1316 $self->{state} = 'comment';
1317 !!!next-input-character;
1318 redo A;
1319 }
1320 } elsif ($self->{state} eq 'comment end') {
1321 if ($self->{next_input_character} == 0x003E) { # >
1322 $self->{state} = 'data';
1323 !!!next-input-character;
1324
1325 !!!emit ($self->{current_token}); # comment
1326 undef $self->{current_token};
1327
1328 redo A;
1329 } elsif ($self->{next_input_character} == 0x002D) { # -
1330 !!!parse-error (type => 'dash in comment');
1331 $self->{current_token}->{data} .= '-'; # comment
1332 ## Stay in the state
1333 !!!next-input-character;
1334 redo A;
1335 } elsif ($self->{next_input_character} == -1) {
1336 !!!parse-error (type => 'unclosed comment');
1337 $self->{state} = 'data';
1338 ## reconsume
1339
1340 !!!emit ($self->{current_token}); # comment
1341 undef $self->{current_token};
1342
1343 redo A;
1344 } else {
1345 !!!parse-error (type => 'dash in comment');
1346 $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1347 $self->{state} = 'comment';
1348 !!!next-input-character;
1349 redo A;
1350 }
1351 } elsif ($self->{state} eq 'DOCTYPE') {
1352 if ($self->{next_input_character} == 0x0009 or # HT
1353 $self->{next_input_character} == 0x000A or # LF
1354 $self->{next_input_character} == 0x000B or # VT
1355 $self->{next_input_character} == 0x000C or # FF
1356 $self->{next_input_character} == 0x0020) { # SP
1357 $self->{state} = 'before DOCTYPE name';
1358 !!!next-input-character;
1359 redo A;
1360 } else {
1361 !!!parse-error (type => 'no space before DOCTYPE name');
1362 $self->{state} = 'before DOCTYPE name';
1363 ## reconsume
1364 redo A;
1365 }
1366 } elsif ($self->{state} eq 'before DOCTYPE name') {
1367 if ($self->{next_input_character} == 0x0009 or # HT
1368 $self->{next_input_character} == 0x000A or # LF
1369 $self->{next_input_character} == 0x000B or # VT
1370 $self->{next_input_character} == 0x000C or # FF
1371 $self->{next_input_character} == 0x0020) { # SP
1372 ## Stay in the state
1373 !!!next-input-character;
1374 redo A;
1375 } elsif (0x0061 <= $self->{next_input_character} and
1376 $self->{next_input_character} <= 0x007A) { # a..z
1377 ## ISSUE: "Set the token's name name to the" in the spec
1378 $self->{current_token} = {type => 'DOCTYPE',
1379 name => chr ($self->{next_input_character} - 0x0020),
1380 error => 1};
1381 $self->{state} = 'DOCTYPE name';
1382 !!!next-input-character;
1383 redo A;
1384 } elsif ($self->{next_input_character} == 0x003E) { # >
1385 !!!parse-error (type => 'no DOCTYPE name');
1386 $self->{state} = 'data';
1387 !!!next-input-character;
1388
1389 !!!emit ({type => 'DOCTYPE', name => '', error => 1});
1390
1391 redo A;
1392 } elsif ($self->{next_input_character} == -1) {
1393 !!!parse-error (type => 'no DOCTYPE name');
1394 $self->{state} = 'data';
1395 ## reconsume
1396
1397 !!!emit ({type => 'DOCTYPE', name => '', error => 1});
1398
1399 redo A;
1400 } else {
1401 $self->{current_token} = {type => 'DOCTYPE',
1402 name => chr ($self->{next_input_character}),
1403 error => 1};
1404 ## ISSUE: "Set the token's name name to the" in the spec
1405 $self->{state} = 'DOCTYPE name';
1406 !!!next-input-character;
1407 redo A;
1408 }
1409 } elsif ($self->{state} eq 'DOCTYPE name') {
1410 if ($self->{next_input_character} == 0x0009 or # HT
1411 $self->{next_input_character} == 0x000A or # LF
1412 $self->{next_input_character} == 0x000B or # VT
1413 $self->{next_input_character} == 0x000C or # FF
1414 $self->{next_input_character} == 0x0020) { # SP
1415 $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1416 $self->{state} = 'after DOCTYPE name';
1417 !!!next-input-character;
1418 redo A;
1419 } elsif ($self->{next_input_character} == 0x003E) { # >
1420 $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1421 $self->{state} = 'data';
1422 !!!next-input-character;
1423
1424 !!!emit ($self->{current_token}); # DOCTYPE
1425 undef $self->{current_token};
1426
1427 redo A;
1428 } elsif (0x0061 <= $self->{next_input_character} and
1429 $self->{next_input_character} <= 0x007A) { # a..z
1430 $self->{current_token}->{name} .= chr ($self->{next_input_character} - 0x0020); # DOCTYPE
1431 #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
1432 ## Stay in the state
1433 !!!next-input-character;
1434 redo A;
1435 } elsif ($self->{next_input_character} == -1) {
1436 !!!parse-error (type => 'unclosed DOCTYPE');
1437 $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1438 $self->{state} = 'data';
1439 ## reconsume
1440
1441 !!!emit ($self->{current_token});
1442 undef $self->{current_token};
1443
1444 redo A;
1445 } else {
1446 $self->{current_token}->{name}
1447 .= chr ($self->{next_input_character}); # DOCTYPE
1448 #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
1449 ## Stay in the state
1450 !!!next-input-character;
1451 redo A;
1452 }
1453 } elsif ($self->{state} eq 'after DOCTYPE name') {
1454 if ($self->{next_input_character} == 0x0009 or # HT
1455 $self->{next_input_character} == 0x000A or # LF
1456 $self->{next_input_character} == 0x000B or # VT
1457 $self->{next_input_character} == 0x000C or # FF
1458 $self->{next_input_character} == 0x0020) { # SP
1459 ## Stay in the state
1460 !!!next-input-character;
1461 redo A;
1462 } elsif ($self->{next_input_character} == 0x003E) { # >
1463 $self->{state} = 'data';
1464 !!!next-input-character;
1465
1466 !!!emit ($self->{current_token}); # DOCTYPE
1467 undef $self->{current_token};
1468
1469 redo A;
1470 } elsif ($self->{next_input_character} == -1) {
1471 !!!parse-error (type => 'unclosed DOCTYPE');
1472 $self->{state} = 'data';
1473 ## reconsume
1474
1475 !!!emit ($self->{current_token}); # DOCTYPE
1476 undef $self->{current_token};
1477
1478 redo A;
1479 } else {
1480 !!!parse-error (type => 'string after DOCTYPE name');
1481 $self->{current_token}->{error} = 1; # DOCTYPE
1482 $self->{state} = 'bogus DOCTYPE';
1483 !!!next-input-character;
1484 redo A;
1485 }
1486 } elsif ($self->{state} eq 'bogus DOCTYPE') {
1487 if ($self->{next_input_character} == 0x003E) { # >
1488 $self->{state} = 'data';
1489 !!!next-input-character;
1490
1491 !!!emit ($self->{current_token}); # DOCTYPE
1492 undef $self->{current_token};
1493
1494 redo A;
1495 } elsif ($self->{next_input_character} == -1) {
1496 !!!parse-error (type => 'unclosed DOCTYPE');
1497 $self->{state} = 'data';
1498 ## reconsume
1499
1500 !!!emit ($self->{current_token}); # DOCTYPE
1501 undef $self->{current_token};
1502
1503 redo A;
1504 } else {
1505 ## Stay in the state
1506 !!!next-input-character;
1507 redo A;
1508 }
1509 } else {
1510 die "$0: $self->{state}: Unknown state";
1511 }
1512 } # A
1513
1514 die "$0: _get_next_token: unexpected case";
1515 } # _get_next_token
1516
## Attempt to consume an entity (a character reference) from the input.
##
## On entry |$self->{next_input_character}| is the character that
## follows the "&"; the caller has already consumed the "&" itself.
##
## Returns a character token, |{type => 'character', data => $string}|,
## when a reference was recognized, and |undef| otherwise.  When |undef|
## is returned the caller is responsible for the "&"; the characters
## examined here are given back (as input characters for the numeric
## forms, as a character token for an unmatched name).
##
## Parse errors reported: 'bare hcro', 'bare nero', 'bare ero',
## 'no refc', 'refc' and 'c1 entity:U+XXXX'.
sub _tokenize_attempt_to_consume_an_entity ($) {
  my $self = shift;

  if ($self->{next_input_character} == 0x0023) { # #
    !!!next-input-character;
    if ($self->{next_input_character} == 0x0078 or # x
        $self->{next_input_character} == 0x0058) { # X
      ## Hexadecimal numeric character reference
      my $num;
      X: {
        ## On the first pass this is the "x" or "X" itself.  It is only
        ## used by the "no hexadecimal digit" branch below, which can
        ## only be taken on the first pass.
        my $x_char = $self->{next_input_character};
        !!!next-input-character;
        if (0x0030 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x0039) { # 0..9
          $num ||= 0;
          $num *= 0x10;
          $num += $self->{next_input_character} - 0x0030;
          redo X;
        } elsif (0x0061 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x0066) { # a..f
          ## ISSUE: the spec says U+0078, which is apparently incorrect
          $num ||= 0;
          $num *= 0x10;
          $num += $self->{next_input_character} - 0x0060 + 9;
          redo X;
        } elsif (0x0041 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x0046) { # A..F
          ## ISSUE: the spec says U+0058, which is apparently incorrect
          $num ||= 0;
          $num *= 0x10;
          $num += $self->{next_input_character} - 0x0040 + 9;
          redo X;
        } elsif (not defined $num) { # no hexadecimal digit
          !!!parse-error (type => 'bare hcro');
          ## Give back the "x"/"X" *and* the character read after it
          ## before "#" becomes the next input character.  Pushing back
          ## only the "x" would silently drop the character that
          ## follows "&#x" from the input.
          !!!back-next-input-character ($x_char, $self->{next_input_character});
          $self->{next_input_character} = 0x0023; # #
          return undef;
        } elsif ($self->{next_input_character} == 0x003B) { # ;
          !!!next-input-character;
        } else {
          !!!parse-error (type => 'no refc');
        }

        ## TODO: check the definition for |a valid Unicode character|.
        ## <http://lists.whatwg.org/pipermail/whatwg-whatwg.org/2006-December/thread.html#8189>
        if ($num > 1114111 or $num == 0) {
          $num = 0xFFFD; # REPLACEMENT CHARACTER
          ## ISSUE: Why this is not an error?
        } elsif (0x80 <= $num and $num <= 0x9F) {
          !!!parse-error (type => sprintf 'c1 entity:U+%04X', $num);
          $num = $c1_entity_char->{$num};
        }

        return {type => 'character', data => chr $num};
      } # X
    } elsif (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
      ## Decimal numeric character reference
      my $code = $self->{next_input_character} - 0x0030;
      !!!next-input-character;

      while (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
        $code *= 10;
        $code += $self->{next_input_character} - 0x0030;

        !!!next-input-character;
      }

      if ($self->{next_input_character} == 0x003B) { # ;
        !!!next-input-character;
      } else {
        !!!parse-error (type => 'no refc');
      }

      ## TODO: check the definition for |a valid Unicode character|.
      if ($code > 1114111 or $code == 0) {
        $code = 0xFFFD; # REPLACEMENT CHARACTER
        ## ISSUE: Why this is not an error?
      } elsif (0x80 <= $code and $code <= 0x9F) {
        !!!parse-error (type => sprintf 'c1 entity:U+%04X', $code);
        $code = $c1_entity_char->{$code};
      }

      return {type => 'character', data => chr $code};
    } else {
      ## "&#" followed by neither "x"/"X" nor a digit: give the
      ## character back and make "#" the next input character again.
      !!!parse-error (type => 'bare nero');
      !!!back-next-input-character ($self->{next_input_character});
      $self->{next_input_character} = 0x0023; # #
      return undef;
    }
  } elsif ((0x0041 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x005A) or
           (0x0061 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x007A)) {
    ## Named reference: starts with A..Z or a..z
    my $entity_name = chr $self->{next_input_character};
    !!!next-input-character;

    ## |$value| is the replacement text of the longest name matched so
    ## far followed by any characters consumed after that match; with
    ## no match it is simply the characters consumed.
    my $value = $entity_name;
    my $match;

    while (length $entity_name < 10 and
           ## NOTE: Some number greater than the maximum length of entity name
           ((0x0041 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x005A) or
            (0x0061 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x007A) or
            (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039))) {
      $entity_name .= chr $self->{next_input_character};
      if (defined $entity_char->{$entity_name}) {
        $value = $entity_char->{$entity_name};
        $match = 1;
      } else {
        $value .= chr $self->{next_input_character};
      }
      !!!next-input-character;
    }

    if ($match) {
      if ($self->{next_input_character} == 0x003B) { # ;
        !!!next-input-character;
      } else {
        ## NOTE: The numeric branches above report the same condition
        ## (missing ";") as 'no refc'; the type is kept as is here.
        !!!parse-error (type => 'refc');
      }

      return {type => 'character', data => $value};
    } else {
      !!!parse-error (type => 'bare ero');
      ## NOTE: No characters are consumed in the spec.
      !!!back-token ({type => 'character', data => $value});
      return undef;
    }
  } else {
    ## no characters are consumed
    !!!parse-error (type => 'bare ero');
    return undef;
  }
} # _tokenize_attempt_to_consume_an_entity
1654
## Prepares the document for the tree construction stage by switching
## off its strict error checking.
## NOTE: |$self->{document}| MUST be set before this method is called.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};
  ## TODO: Mark the Document as an HTML document # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  ## TODO: Turn mutation events off # MUST
  $document->strict_error_checking (0);
} # _initialize_tree_constructor
1663
## Ends the tree construction stage by switching the document's strict
## error checking back on.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};
  ## TODO: Turn mutation events on
  $document->strict_error_checking (1);
} # _terminate_tree_constructor
1669
1670 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
1671
1672 { # tree construction stage
1673 my $token;
1674
## Entry point of the tree construction stage: invokes the |next-token|
## macro once, resets the tree construction state kept on |$self| and
## then runs the initial, root element and main phases in that order.
sub _construct_tree ($) {
  my $self = shift;

  ## NOTE: It is not defined when an interactive UA makes
  ## |$self->{document}| available to the user, nor when it begins
  ## to accept user input.

  ## NOTE: "Append a character" means: gather the character together
  ## with all consecutive characters that follow it and insert a single
  ## Text node whose data is their concatenation. # MUST

  !!!next-token;

  ## Initial state of the tree constructor
  $self->{open_elements} = [];
  $self->{insertion_mode} = 'before head';
  $self->{$_} = undef for qw(form_element head_element inner_html_node);

  $self->_tree_construction_initial; # MUST
  $self->_tree_construction_root_element;
  $self->_tree_construction_main;
} # _construct_tree
1698
## Initial phase of tree construction.
##
## A DOCTYPE token becomes a document type node appended to the
## document (with a 'bogus DOCTYPE' parse error if the token is flagged
## as erroneous) and the next token is fetched.  Leading white space
## characters are appended to the document as text.  Any other token
## is a 'missing DOCTYPE' parse error and is left in |$token| to be
## reprocessed by the next phase.
sub _tree_construction_initial ($) {
  my ($self) = @_;

  ## Token types that end this phase without being consumed
  my %is_reprocessed = map { $_ => 1 }
      ('comment', 'start tag', 'end tag', 'end-of-file');

  B: {
    my $type = $token->{type};

    if ($type eq 'DOCTYPE') {
      if ($token->{error}) {
        ## ISSUE: Spec currently left this case undefined.
        !!!parse-error (type => 'bogus DOCTYPE');
      }
      my $doctype_node = $self->{document}->create_document_type_definition
          ($token->{name});
      $self->{document}->append_child ($doctype_node);
      ## Go to the root element phase with a fresh token
      !!!next-token;
      return;
    }

    if ($type eq 'character') {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
        $self->{document}->manakai_append_text ($1);
        ## ISSUE: DOM3 Core does not allow Document > Text
        if (not length $token->{data}) {
          ## Only white space: stay in the phase
          !!!next-token;
          redo B;
        }
      }
      ## Something other than white space remains (or there was none)
    } elsif (not $is_reprocessed{$type}) {
      die "$0: $token->{type}: Unknown token";
    }

    ## ISSUE: Spec currently left this case undefined.
    !!!parse-error (type => 'missing DOCTYPE');
    ## Go to the root element phase and reprocess the token
    return;
  } # B
} # _tree_construction_initial
1747
## Root element phase of tree construction.
##
## DOCTYPE tokens are ignored with a parse error, comments are appended
## to the document and leading white space characters are appended to
## the document as text; each of these stays in the phase.  Any other
## token creates the |html| root element, appends it to the document
## and pushes it onto the stack of open elements; the token is left in
## |$token| to be reprocessed by the main phase.
sub _tree_construction_root_element ($) {
  my ($self) = @_;
  my $document = $self->{document};

  B: {
    my $type = $token->{type};

    if ($type eq 'DOCTYPE' or $type eq 'comment') {
      if ($type eq 'DOCTYPE') {
        !!!parse-error (type => 'in html:#DOCTYPE');
        ## Ignore the token
      } else {
        my $comment_node = $document->create_comment ($token->{data});
        $document->append_child ($comment_node);
      }
      ## Stay in the phase
      !!!next-token;
      redo B;
    }

    if ($type eq 'character') {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
        $document->manakai_append_text ($1);
        ## ISSUE: DOM3 Core does not allow Document > Text
        if (not length $token->{data}) {
          ## Only white space: stay in the phase
          !!!next-token;
          redo B;
        }
      }
    } elsif (not ($type eq 'start tag' or
                  $type eq 'end tag' or
                  $type eq 'end-of-file')) {
      die "$0: $token->{type}: Unknown token";
    }
    ## ISSUE: There is an issue in the spec

    my $root_element;
    !!!create-element ($root_element, 'html');
    $document->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, 'html'];
    ## Go to the main phase and reprocess the token
    return;
  } # B
} # _tree_construction_root_element
1794
## Resets |$self->{insertion_mode}| from the stack of open elements
## (the "reset the insertion mode appropriately" algorithm).
##
## |$self->{open_elements}| is a list of |[$node, $tag_name]| pairs.
## It is walked from the current (last) entry towards the first one and
## the insertion mode is taken from the first entry whose tag name
## determines it.  When the first entry of the stack is reached and
## |$self->{inner_html_node}| is defined (innerHTML case), that pair
## is examined in its place unless it is a |td| or |th|.  If nothing
## determines the mode, 'in body' is used.
##
## Reads |$self->{head_element}| to choose between 'before head' and
## 'after head' for |html|.  Returns nothing.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      ## The innerHTML context element replaces |$node| only once the
      ## bottom of the stack has been reached; entries above it must
      ## be examined as they are.
      if (defined $self->{inner_html_node}) {
        if ($self->{inner_html_node}->[1] eq 'td' or
            $self->{inner_html_node}->[1] eq 'th') {
          #
        } else {
          $node = $self->{inner_html_node};
        }
      }
    }

    ## Step 4..13
    my $new_mode = {
      select => 'in select',
      td => 'in cell',
      th => 'in cell',
      tr => 'in row',
      ## |tbody|, |thead| and |tfoot| all share "in table body"
      tbody => 'in table body',
      thead => 'in table body',
      tfoot => 'in table body',
      caption => 'in caption',
      colgroup => 'in column group',
      table => 'in table',
      head => 'in body', # not in head!
      body => 'in body',
      frameset => 'in frameset',
    }->{$node->[1]};
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 14
    if ($node->[1] eq 'html') {
      unless (defined $self->{head_element}) {
        $self->{insertion_mode} = 'before head';
      } else {
        $self->{insertion_mode} = 'after head';
      }
      return;
    }

    ## Step 15
    if ($last) {
      $self->{insertion_mode} = 'in body';
      return;
    }

    ## Step 16
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 17
    redo S3;
  } # S3
} # _reset_insertion_mode
1856
1857 sub _tree_construction_main ($) {
1858 my $self = shift;
1859
1860 my $phase = 'main';
1861
1862 my $active_formatting_elements = [];
1863
1864 my $reconstruct_active_formatting_elements = sub { # MUST
1865 my $insert = shift;
1866
1867 ## Step 1
1868 return unless @$active_formatting_elements;
1869
1870 ## Step 3
1871 my $i = -1;
1872 my $entry = $active_formatting_elements->[$i];
1873
1874 ## Step 2
1875 return if $entry->[0] eq '#marker';
1876 for (@{$self->{open_elements}}) {
1877 if ($entry->[0] eq $_->[0]) {
1878 return;
1879 }
1880 }
1881
1882 S4: {
1883 ## Step 4
1884 last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
1885
1886 ## Step 5
1887 $i--;
1888 $entry = $active_formatting_elements->[$i];
1889
1890 ## Step 6
1891 if ($entry->[0] eq '#marker') {
1892 #
1893 } else {
1894 my $in_open_elements;
1895 OE: for (@{$self->{open_elements}}) {
1896 if ($entry->[0] eq $_->[0]) {
1897 $in_open_elements = 1;
1898 last OE;
1899 }
1900 }
1901 if ($in_open_elements) {
1902 #
1903 } else {
1904 redo S4;
1905 }
1906 }
1907
1908 ## Step 7
1909 $i++;
1910 $entry = $active_formatting_elements->[$i];
1911 } # S4
1912
1913 S7: {
1914 ## Step 8
1915 my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
1916
1917 ## Step 9
1918 $insert->($clone->[0]);
1919 push @{$self->{open_elements}}, $clone;
1920
1921 ## Step 10
1922 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
1923
1924 ## Step 11
1925 unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
1926 ## Step 7'
1927 $i++;
1928 $entry = $active_formatting_elements->[$i];
1929
1930 redo S7;
1931 }
1932 } # S7
1933 }; # $reconstruct_active_formatting_elements
1934
1935 my $clear_up_to_marker = sub {
1936 for (reverse 0..$#$active_formatting_elements) {
1937 if ($active_formatting_elements->[$_]->[0] eq '#marker') {
1938 splice @$active_formatting_elements, $_;
1939 return;
1940 }
1941 }
1942 }; # $clear_up_to_marker
1943
## Handle a |style| start tag: create the element, attach it to the
## |head| element (when in the "in head" insertion mode and a head
## element exists) or otherwise to the current node, then collect the
## following character tokens as the element's text in CDATA mode.
my $style_start_tag = sub {
  my $style_el;
  !!!create-element ($style_el, 'style', $token->{attributes});

  ## $self->{insertion_mode} eq 'in head' and ... (always true)
  my $parent;
  if ($self->{insertion_mode} eq 'in head' and
      defined $self->{head_element}) {
    $parent = $self->{head_element};
  } else {
    $parent = $self->{open_elements}->[-1]->[0];
  }
  $parent->append_child ($style_el);

  $self->{content_model_flag} = 'CDATA';
  delete $self->{escape}; # MUST

  ## Gather the run of character tokens that forms the element content.
  my @chunks;
  !!!next-token;
  while ($token->{type} eq 'character') {
    push @chunks, $token->{data};
    !!!next-token;
  } # stop if non-character token or tokenizer stops tokenising
  my $text = join '', @chunks;
  $style_el->manakai_append_text ($text) if length $text;

  $self->{content_model_flag} = 'PCDATA';

  ## The matching end tag is silently consumed; anything else is an error.
  unless ($token->{type} eq 'end tag' and $token->{tag_name} eq 'style') {
    !!!parse-error (type => 'in CDATA:#'.$token->{type});
    ## ISSUE: And ignore?
  }
  !!!next-token;
}; # $style_start_tag
1973
## Handle a |script| start tag: create the element, collect the
## following character tokens as its text in CDATA mode and, unless
## the parser is running for innerHTML ($self->{inner_html_node} is
## defined), append the element to the |head| element or to the
## current node.  Script execution itself is not implemented (TODOs).
my $script_start_tag = sub {
  my $script_el;
  !!!create-element ($script_el, 'script', $token->{attributes});
  ## TODO: mark as "parser-inserted"

  $self->{content_model_flag} = 'CDATA';
  delete $self->{escape}; # MUST

  ## Gather the run of character tokens that forms the script text.
  my @chunks;
  !!!next-token;
  while ($token->{type} eq 'character') {
    push @chunks, $token->{data};
    !!!next-token;
  } # stop if non-character token or tokenizer stops tokenising
  my $text = join '', @chunks;
  $script_el->manakai_append_text ($text) if length $text;

  $self->{content_model_flag} = 'PCDATA';

  ## The matching end tag is silently consumed; anything else is an error.
  unless ($token->{type} eq 'end tag' and
          $token->{tag_name} eq 'script') {
    !!!parse-error (type => 'in CDATA:#'.$token->{type});
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  }

  unless (defined $self->{inner_html_node}) {
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    my $parent;
    if ($self->{insertion_mode} eq 'in head' and
        defined $self->{head_element}) {
      $parent = $self->{head_element};
    } else {
      $parent = $self->{open_elements}->[-1]->[0];
    }
    $parent->append_child ($script_el);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  }
  ## (innerHTML case) TODO: mark as "already executed"

  !!!next-token;
}; # $script_start_tag
2019
## Handle an end tag for a formatting element (called for |a|, |b|,
## |i|, ... end tags and for the implied |</a>| before a nested |<a>|).
##
## Argument: $tag_name - the tag name of the end tag being processed.
##
## The block FET is repeated (Step 14) until the formatting element
## can simply be popped (Step 3) or the end tag turns out to be
## unmatched.  Every exit path consumes the current token with
## |!!!next-token| before returning.
##
## Entries of @$active_formatting_elements and of
## @{$self->{open_elements}} are pairs [node, tag name]; a marker in
## the active list is ['#marker', ''].
my $formatting_end_tag = sub {
  my $tag_name = shift;

  FET: {
    ## Step 1
    ## Find the last entry named $tag_name that follows the last marker.
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!parse-error (type => 'unmatched end tag:'.$tag_name);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ({
                table => 1, caption => 1, td => 1, th => 1,
                button => 1, marquee => 1, object => 1, html => 1,
               }->{$node->[1]}) {
        ## A scoping element was passed; keep looking only to tell
        ## "not in scope" apart from "not in open elements at all".
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
      ## Remove the formatting element itself from the list.  It is
      ## not necessarily the last entry, so |pop| would be wrong here.
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
    }

    ## Step 2
    ## Scanning from the current node down to the formatting element,
    ## the last special/scoping non-formatting element seen (i.e. the
    ## one nearest to the formatting element) is the furthest block.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not $formatting_category->{$node->[1]} and
          #not $phrasing_category->{$node->[1]} and
          ($special_category->{$node->[1]} or
           $scoping_category->{$node->[1]})) {
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      ## Simple case: pop up to and including the formatting element.
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## The bookmark is remembered as the node of the entry preceding
    ## the formatting element in the active list.
    ## NOTE(review): when the formatting element is the first entry,
    ## index -1 selects the last entry instead - confirm intended.
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
            ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      ## If $node is not in the active list, drop it from the stack of
      ## open elements and restart with the next node up the stack.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      if ($node->[0]->has_child_nodes ()) {
        ## Replace $node by a shallow clone in both lists.
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    $common_ancestor_node->[0]->append_child ($last_node->[0]);

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## Copy the child list first: moving children mutates it.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## Remove the formatting element from the active list and insert
    ## the clone just after the bookmark entry.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$active_formatting_elements, $_, 1;
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    ## Same for the stack of open elements, relative to the furthest
    ## block.
    ## NOTE(review): this splice replaces one element (length 1)
    ## rather than inserting (length 0) - confirm against the spec.
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        splice @{$self->{open_elements}}, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        $i = $_;
      }
    } # OE
    splice @{$self->{open_elements}}, $i + 1, 1, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
2202
## Insertion callback: append the given node as the last child of the
## current node (the last entry of the stack of open elements).
my $insert_to_current = sub {
  my $current_node = $self->{open_elements}->[-1]->[0];
  my $child = shift;
  $current_node->append_child ($child);
}; # $insert_to_current
2206
## Insertion callback implementing foster parenting.  When the current
## node is a table-structure element (|table|, |tbody|, |tfoot|,
## |thead| or |tr|), the node is inserted relative to the innermost
## open |table| instead of into the current node; otherwise it is
## simply appended to the current node.
my $insert_to_foster = sub {
  my $child = shift;
  my $open = $self->{open_elements};

  ## Ordinary case: the current node accepts the child directly.
  unless ({
           table => 1, tbody => 1, tfoot => 1,
           thead => 1, tr => 1,
          }->{$open->[-1]->[1]}) {
    return $open->[-1]->[0]->append_child ($child);
  }

  # MUST
  my $foster_parent_element;
  my $next_sibling;
  OE: for my $i (reverse 0..$#$open) {
    next OE unless $open->[$i]->[1] eq 'table';

    my $table_el = $open->[$i]->[0];
    my $parent = $table_el->parent_node;
    if (defined $parent and $parent->node_type == 1) {
      ## The table has an element parent: insert just before the table.
      $foster_parent_element = $parent;
      $next_sibling = $table_el;
    } else {
      ## Otherwise use the element preceding the table in the stack.
      $foster_parent_element = $open->[$i - 1]->[0];
    }
    last OE;
  } # OE

  ## No |table| in the stack: fall back to the first (root) entry.
  $foster_parent_element = $open->[0]->[0]
      unless defined $foster_parent_element;

  return $foster_parent_element->insert_before ($child, $next_sibling);
}; # $insert_to_foster
2237
## Process the current token ($token) according to the rules of the
## "in body" insertion mode.  Only start tag and end tag tokens are
## handled here; other token types fall through without any action.
##
## Argument: $insert - code reference called with a node to insert it
## into the tree (e.g. $insert_to_current or $insert_to_foster).
## NOTE(review): the |!!!insert-element-t| macro presumably uses
## $insert as well - confirm against the macro definition.
##
## Each branch either consumes the token (|!!!next-token|) or replaces
## it by another token to be reprocessed (|!!!back-token| followed by
## an assignment to $token), and then returns to the caller.
my $in_body = sub {
  my $insert = shift;
  if ($token->{type} eq 'start tag') {
    if ($token->{tag_name} eq 'script') {
      $script_start_tag->();
      return;
    } elsif ($token->{tag_name} eq 'style') {
      $style_start_tag->();
      return;
    } elsif ({
              base => 1, link => 1, meta => 1,
             }->{$token->{tag_name}}) {
      !!!parse-error (type => 'in body:'.$token->{tag_name});
      ## NOTE: This is an "as if in head" code clone
      my $el;
      !!!create-element ($el, $token->{tag_name}, $token->{attributes});
      if (defined $self->{head_element}) {
        $self->{head_element}->append_child ($el);
      } else {
        $insert->($el);
      }

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'title') {
      !!!parse-error (type => 'in body:title');
      ## NOTE: There is an "as if in head" code clone
      my $title_el;
      !!!create-element ($title_el, 'title', $token->{attributes});
      (defined $self->{head_element} ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
        ->append_child ($title_el);
      $self->{content_model_flag} = 'RCDATA';
      delete $self->{escape}; # MUST

      my $text = '';
      !!!next-token;
      while ($token->{type} eq 'character') {
        $text .= $token->{data};
        !!!next-token;
      }
      if (length $text) {
        $title_el->manakai_append_text ($text);
      }

      $self->{content_model_flag} = 'PCDATA';

      if ($token->{type} eq 'end tag' and
          $token->{tag_name} eq 'title') {
        ## Ignore the token
      } else {
        !!!parse-error (type => 'in RCDATA:#'.$token->{type});
        ## ISSUE: And ignore?
      }
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'body') {
      !!!parse-error (type => 'in body:body');

      if (@{$self->{open_elements}} == 1 or
          $self->{open_elements}->[1]->[1] ne 'body') {
        ## Ignore the token
      } else {
        ## Merge attributes not yet present into the existing |body|.
        my $body_el = $self->{open_elements}->[1]->[0];
        for my $attr_name (keys %{$token->{attributes}}) {
          unless ($body_el->has_attribute_ns (undef, $attr_name)) {
            $body_el->set_attribute_ns
              (undef, [undef, $attr_name],
               $token->{attributes}->{$attr_name}->{value});
          }
        }
      }
      !!!next-token;
      return;
    } elsif ({
              address => 1, blockquote => 1, center => 1, dir => 1,
              div => 1, dl => 1, fieldset => 1, listing => 1,
              menu => 1, ol => 1, p => 1, ul => 1,
              pre => 1,
             }->{$token->{tag_name}}) {
      ## has a p element in scope
      INSCOPE: for (reverse @{$self->{open_elements}}) {
        if ($_->[1] eq 'p') {
          ## Act as if |</p>| had been seen, then reprocess this token.
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      if ($token->{tag_name} eq 'pre') {
        !!!next-token;
        if ($token->{type} eq 'character') {
          ## A newline immediately after |<pre>| is dropped.
          $token->{data} =~ s/^\x0A//;
          unless (length $token->{data}) {
            !!!next-token;
          }
        }
      } else {
        !!!next-token;
      }
      return;
    } elsif ($token->{tag_name} eq 'form') {
      if (defined $self->{form_element}) {
        !!!parse-error (type => 'in form:form');
        ## Ignore the token
        !!!next-token;
        return;
      } else {
        ## has a p element in scope
        INSCOPE: for (reverse @{$self->{open_elements}}) {
          if ($_->[1] eq 'p') {
            !!!back-token;
            $token = {type => 'end tag', tag_name => 'p'};
            return;
          } elsif ({
                    table => 1, caption => 1, td => 1, th => 1,
                    button => 1, marquee => 1, object => 1, html => 1,
                   }->{$_->[1]}) {
            last INSCOPE;
          }
        } # INSCOPE

        !!!insert-element-t ($token->{tag_name}, $token->{attributes});
        $self->{form_element} = $self->{open_elements}->[-1]->[0];
        !!!next-token;
        return;
      }
    } elsif ($token->{tag_name} eq 'li') {
      ## has a p element in scope
      INSCOPE: for (reverse @{$self->{open_elements}}) {
        if ($_->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      ## Step 1
      my $i = -1;
      my $node = $self->{open_elements}->[$i];
      LI: {
        ## Step 2
        if ($node->[1] eq 'li') {
          if ($i != -1) {
            !!!parse-error (type => 'end tag missing:'.
                            $self->{open_elements}->[-1]->[1]);
            ## TODO: test
          }
          splice @{$self->{open_elements}}, $i;
          last LI;
        }

        ## Step 3
        if (not $formatting_category->{$node->[1]} and
            #not $phrasing_category->{$node->[1]} and
            ($special_category->{$node->[1]} or
             $scoping_category->{$node->[1]}) and
            $node->[1] ne 'address' and $node->[1] ne 'div') {
          last LI;
        }

        ## Step 4
        $i--;
        $node = $self->{open_elements}->[$i];
        redo LI;
      } # LI

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
      ## has a p element in scope
      INSCOPE: for (reverse @{$self->{open_elements}}) {
        if ($_->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      ## Step 1
      my $i = -1;
      my $node = $self->{open_elements}->[$i];
      LI: {
        ## Step 2
        if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
          if ($i != -1) {
            !!!parse-error (type => 'end tag missing:'.
                            $self->{open_elements}->[-1]->[1]);
            ## TODO: test
          }
          splice @{$self->{open_elements}}, $i;
          last LI;
        }

        ## Step 3
        if (not $formatting_category->{$node->[1]} and
            #not $phrasing_category->{$node->[1]} and
            ($special_category->{$node->[1]} or
             $scoping_category->{$node->[1]}) and
            $node->[1] ne 'address' and $node->[1] ne 'div') {
          last LI;
        }

        ## Step 4
        $i--;
        $node = $self->{open_elements}->[$i];
        redo LI;
      } # LI

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'plaintext') {
      ## has a p element in scope
      INSCOPE: for (reverse @{$self->{open_elements}}) {
        if ($_->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $self->{content_model_flag} = 'PLAINTEXT';

      !!!next-token;
      return;
    } elsif ({
              h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
             }->{$token->{tag_name}}) {
      ## has a p element in scope
      INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
        my $node = $self->{open_elements}->[$_];
        if ($node->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      ## has an element in scope
      my $i;
      INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
        my $node = $self->{open_elements}->[$_];
        if ({
             h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
            }->{$node->[1]}) {
          $i = $_;
          last INSCOPE;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      if (defined $i) {
        ## A heading is already open: close it before opening this one.
        !!!parse-error (type => 'in hn:hn');
        splice @{$self->{open_elements}}, $i;
      }

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'a') {
      AFE: for my $i (reverse 0..$#$active_formatting_elements) {
        my $node = $active_formatting_elements->[$i];
        if ($node->[1] eq 'a') {
          !!!parse-error (type => 'in a:a');

          ## Act as if |</a>| had been seen; the start tag is pushed
          ## back and becomes the current token again afterwards.
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'a'};
          $formatting_end_tag->($token->{tag_name});

          ## Make sure the old |a| is gone from both lists.
          AFE2: for (reverse 0..$#$active_formatting_elements) {
            if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
              splice @$active_formatting_elements, $_, 1;
              last AFE2;
            }
          } # AFE2
          OE: for (reverse 0..$#{$self->{open_elements}}) {
            if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
              splice @{$self->{open_elements}}, $_, 1;
              last OE;
            }
          } # OE
          last AFE;
        } elsif ($node->[0] eq '#marker') {
          last AFE;
        }
      } # AFE

      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, $self->{open_elements}->[-1];

      !!!next-token;
      return;
    } elsif ({
              b => 1, big => 1, em => 1, font => 1, i => 1,
              nobr => 1, s => 1, small => 1, strike => 1,
              strong => 1, tt => 1, u => 1,
             }->{$token->{tag_name}}) {
      ## Formatting elements.  This list must stay in sync with the
      ## list in the end tag branch below, which removes the entries
      ## pushed here from the list of active formatting elements.
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, $self->{open_elements}->[-1];

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'button') {
      ## has a button element in scope
      INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
        my $node = $self->{open_elements}->[$_];
        if ($node->[1] eq 'button') {
          !!!parse-error (type => 'in button:button');
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'button'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, ['#marker', ''];

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'marquee' or
             $token->{tag_name} eq 'object') {
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, ['#marker', ''];

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'xmp') {
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $self->{content_model_flag} = 'CDATA';
      delete $self->{escape}; # MUST

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'table') {
      ## has a p element in scope
      INSCOPE: for (reverse @{$self->{open_elements}}) {
        if ($_->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $self->{insertion_mode} = 'in table';

      !!!next-token;
      return;
    } elsif ({
              area => 1, basefont => 1, bgsound => 1, br => 1,
              embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
              image => 1,
             }->{$token->{tag_name}}) {
      if ($token->{tag_name} eq 'image') {
        !!!parse-error (type => 'image');
        $token->{tag_name} = 'img';
      }

      $reconstruct_active_formatting_elements->($insert_to_current);

      ## Void element: insert it and pop it right away.
      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      pop @{$self->{open_elements}};

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'hr') {
      ## has a p element in scope
      INSCOPE: for (reverse @{$self->{open_elements}}) {
        if ($_->[1] eq 'p') {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$_->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      pop @{$self->{open_elements}};

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'input') {
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      ## TODO: associate with $self->{form_element} if defined
      pop @{$self->{open_elements}};

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'isindex') {
      !!!parse-error (type => 'isindex');

      if (defined $self->{form_element}) {
        ## Ignore the token
        !!!next-token;
        return;
      } else {
        ## Replace |isindex| by an equivalent token sequence
        ## (form, hr, p, label, prompt text, input, ...).
        my $at = $token->{attributes};
        $at->{name} = {name => 'name', value => 'isindex'};
        my @tokens = (
                      {type => 'start tag', tag_name => 'form'},
                      {type => 'start tag', tag_name => 'hr'},
                      {type => 'start tag', tag_name => 'p'},
                      {type => 'start tag', tag_name => 'label'},
                      {type => 'character',
                       data => 'This is a searchable index. Insert your search keywords here: '}, # SHOULD
                      ## TODO: make this configurable
                      {type => 'start tag', tag_name => 'input', attributes => $at},
                      #{type => 'character', data => ''}, # SHOULD
                      {type => 'end tag', tag_name => 'label'},
                      {type => 'end tag', tag_name => 'p'},
                      {type => 'start tag', tag_name => 'hr'},
                      {type => 'end tag', tag_name => 'form'},
                     );
        $token = shift @tokens;
        !!!back-token (@tokens);
        return;
      }
    } elsif ({
              textarea => 1,
              iframe => 1,
              noembed => 1,
              noframes => 1,
              noscript => 0, ## TODO: 1 if scripting is enabled
             }->{$token->{tag_name}}) {
      ## Remember the start tag name: $token is replaced by the
      ## following tokens while the element content is collected.
      my $tag_name = $token->{tag_name};
      my $el;
      !!!create-element ($el, $token->{tag_name}, $token->{attributes});

      if ($token->{tag_name} eq 'textarea') {
        ## TODO: $self->{form_element} if defined
        $self->{content_model_flag} = 'RCDATA';
      } else {
        $self->{content_model_flag} = 'CDATA';
      }
      delete $self->{escape}; # MUST

      $insert->($el);

      my $text = '';
      if ($token->{tag_name} eq 'textarea') {
        !!!next-token;
        if ($token->{type} eq 'character') {
          ## A newline immediately after |<textarea>| is dropped.
          $token->{data} =~ s/^\x0A//;
          unless (length $token->{data}) {
            !!!next-token;
          }
        }
      } else {
        !!!next-token;
      }
      while ($token->{type} eq 'character') {
        $text .= $token->{data};
        !!!next-token;
      }
      if (length $text) {
        $el->manakai_append_text ($text);
      }

      $self->{content_model_flag} = 'PCDATA';

      if ($token->{type} eq 'end tag' and
          $token->{tag_name} eq $tag_name) {
        ## Ignore the token
      } else {
        ## Choose the message by the element being parsed ($tag_name),
        ## not by the unexpected token that terminated its content.
        if ($tag_name eq 'textarea') {
          !!!parse-error (type => 'in RCDATA:#'.$token->{type});
        } else {
          !!!parse-error (type => 'in CDATA:#'.$token->{type});
        }
        ## ISSUE: And ignore?
      }
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'select') {
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $self->{insertion_mode} = 'in select';
      !!!next-token;
      return;
    } elsif ({
              caption => 1, col => 1, colgroup => 1, frame => 1,
              frameset => 1, head => 1, option => 1, optgroup => 1,
              tbody => 1, td => 1, tfoot => 1, th => 1,
              thead => 1, tr => 1,
             }->{$token->{tag_name}}) {
      !!!parse-error (type => 'in body:'.$token->{tag_name});
      ## Ignore the token
      !!!next-token;
      return;

      ## ISSUE: An issue on HTML5 new elements in the spec.
    } else {
      ## Any other start tag: an ordinary element.
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      !!!next-token;
      return;
    }
  } elsif ($token->{type} eq 'end tag') {
    if ($token->{tag_name} eq 'body') {
      if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
        ## ISSUE: There is an issue in the spec.
        if ($self->{open_elements}->[-1]->[1] ne 'body') {
          !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
        }
        $self->{insertion_mode} = 'after body';
        !!!next-token;
        return;
      } else {
        !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
        ## Ignore the token
        !!!next-token;
        return;
      }
    } elsif ($token->{tag_name} eq 'html') {
      if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
        ## ISSUE: There is an issue in the spec.
        if ($self->{open_elements}->[-1]->[1] ne 'body') {
          ## Report the current node (the element left open), as the
          ## |</body>| branch does; entry [1] is always |body| here.
          !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
        }
        $self->{insertion_mode} = 'after body';
        ## reprocess
        return;
      } else {
        !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
        ## Ignore the token
        !!!next-token;
        return;
      }
    } elsif ({
              address => 1, blockquote => 1, center => 1, dir => 1,
              div => 1, dl => 1, fieldset => 1, listing => 1,
              menu => 1, ol => 1, pre => 1, ul => 1,
              p => 1,
              dd => 1, dt => 1, li => 1,
              button => 1, marquee => 1, object => 1,
             }->{$token->{tag_name}}) {
      ## has an element in scope
      my $i;
      INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
        my $node = $self->{open_elements}->[$_];
        if ($node->[1] eq $token->{tag_name}) {
          ## generate implied end tags
          ## (except for an element with the same name as the token)
          if ({
               dd => ($token->{tag_name} ne 'dd'),
               dt => ($token->{tag_name} ne 'dt'),
               li => ($token->{tag_name} ne 'li'),
               p => ($token->{tag_name} ne 'p'),
               td => 1, th => 1, tr => 1,
              }->{$self->{open_elements}->[-1]->[1]}) {
            !!!back-token;
            $token = {type => 'end tag',
                      tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
            return;
          }
          $i = $_;
          last INSCOPE unless $token->{tag_name} eq 'p';
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
        !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
      }

      splice @{$self->{open_elements}}, $i if defined $i;
      ## NOTE(review): the list is cleared even when no matching
      ## element was in scope ($i undefined) - confirm against the spec.
      $clear_up_to_marker->()
        if {
          button => 1, marquee => 1, object => 1,
        }->{$token->{tag_name}};
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'form') {
      ## has an element in scope
      INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
        my $node = $self->{open_elements}->[$_];
        if ($node->[1] eq $token->{tag_name}) {
          ## generate implied end tags
          if ({
               dd => 1, dt => 1, li => 1, p => 1,
               td => 1, th => 1, tr => 1,
              }->{$self->{open_elements}->[-1]->[1]}) {
            !!!back-token;
            $token = {type => 'end tag',
                      tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
            return;
          }
          last INSCOPE;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      if ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
        pop @{$self->{open_elements}};
      } else {
        !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
      }

      undef $self->{form_element};
      !!!next-token;
      return;
    } elsif ({
              h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
             }->{$token->{tag_name}}) {
      ## has an element in scope
      my $i;
      INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
        my $node = $self->{open_elements}->[$_];
        if ({
             h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
            }->{$node->[1]}) {
          ## generate implied end tags
          if ({
               dd => 1, dt => 1, li => 1, p => 1,
               td => 1, th => 1, tr => 1,
              }->{$self->{open_elements}->[-1]->[1]}) {
            !!!back-token;
            $token = {type => 'end tag',
                      tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
            return;
          }
          $i = $_;
          last INSCOPE;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
        !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
      }

      splice @{$self->{open_elements}}, $i if defined $i;
      !!!next-token;
      return;
    } elsif ({
              a => 1,
              b => 1, big => 1, em => 1, font => 1, i => 1,
              nobr => 1, s => 1, small => 1, strike => 1,
              strong => 1, tt => 1, u => 1,
             }->{$token->{tag_name}}) {
      ## Formatting elements (same list as in the start tag branch
      ## above, plus |a|).
      $formatting_end_tag->($token->{tag_name});
      ## TODO: <http://html5.org/tools/web-apps-tracker?from=883&to=884>
      return;
    } elsif ({
              caption => 1, col => 1, colgroup => 1, frame => 1,
              frameset => 1, head => 1, option => 1, optgroup => 1,
              tbody => 1, td => 1, tfoot => 1, th => 1,
              thead => 1, tr => 1,
              area => 1, basefont => 1, bgsound => 1, br => 1,
              embed => 1, hr => 1, iframe => 1, image => 1,
              img => 1, input => 1, isindex => 1, noembed => 1,
              noframes => 1, param => 1, select => 1, spacer => 1,
              table => 1, textarea => 1, wbr => 1,
              noscript => 0, ## TODO: if scripting is enabled
             }->{$token->{tag_name}}) {
      !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
      ## Ignore the token
      !!!next-token;
      return;

      ## ISSUE: Issue on HTML5 new elements in spec

    } else {
      ## Any other end tag: walk up the stack of open elements looking
      ## for an element with the same name.

      ## Step 1
      my $node_i = -1;
      my $node = $self->{open_elements}->[$node_i];

      ## Step 2
      S2: {
        if ($node->[1] eq $token->{tag_name}) {
          ## Step 1
          ## generate implied end tags
          if ({
               dd => 1, dt => 1, li => 1, p => 1,
               td => 1, th => 1, tr => 1,
              }->{$self->{open_elements}->[-1]->[1]}) {
            !!!back-token;
            $token = {type => 'end tag',
                      tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
            return;
          }

          ## Step 2
          if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
            !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
          }

          ## Step 3
          splice @{$self->{open_elements}}, $node_i;

          !!!next-token;
          last S2;
        } else {
          ## Step 3
          if (not $formatting_category->{$node->[1]} and
              #not $phrasing_category->{$node->[1]} and
              ($special_category->{$node->[1]} or
               $scoping_category->{$node->[1]})) {
            !!!parse-error (type => 'not closed:'.$node->[1]);
            ## Ignore the token
            !!!next-token;
            last S2;
          }
        }

        ## Step 4
        $node_i--;
        $node = $self->{open_elements}->[$node_i];

        ## Step 5;
        redo S2;
      } # S2
      return;
    }
  }
}; # $in_body
3031
3032 B: {
3033 if ($phase eq 'main') {
3034 if ($token->{type} eq 'DOCTYPE') {
3035 !!!parse-error (type => 'in html:#DOCTYPE');
3036 ## Ignore the token
3037 ## Stay in the phase
3038 !!!next-token;
3039 redo B;
3040 } elsif ($token->{type} eq 'start tag' and
3041 $token->{tag_name} eq 'html') {
3042 ## TODO: unless it is the first start tag token, parse-error
3043 my $top_el = $self->{open_elements}->[0]->[0];
3044 for my $attr_name (keys %{$token->{attributes}}) {
3045 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
3046 $top_el->set_attribute_ns
3047 (undef, [undef, $attr_name],
3048 $token->{attributes}->{$attr_name}->{value});
3049 }
3050 }
3051 !!!next-token;
3052 redo B;
3053 } elsif ($token->{type} eq 'end-of-file') {
3054 ## Generate implied end tags
3055 if ({
3056 dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
3057 }->{$self->{open_elements}->[-1]->[1]}) {
3058 !!!back-token;
3059 $token = {type => 'end tag', tag_name => $self->{open_elements}->[-1]->[1]};
3060 redo B;
3061 }
3062
3063 if (@{$self->{open_elements}} > 2 or
3064 (@{$self->{open_elements}} == 2 and $self->{open_elements}->[1]->[1] ne 'body')) {
3065 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3066 } elsif (defined $self->{inner_html_node} and
3067 @{$self->{open_elements}} > 1 and
3068 $self->{open_elements}->[1]->[1] ne 'body') {
3069 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3070 }
3071
3072 ## Stop parsing
3073 last B;
3074
3075 ## ISSUE: There is an issue in the spec.
3076 } else {
3077 if ($self->{insertion_mode} eq 'before head') {
3078 if ($token->{type} eq 'character') {
3079 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3080 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3081 unless (length $token->{data}) {
3082 !!!next-token;
3083 redo B;
3084 }
3085 }
3086 ## As if <head>
3087 !!!create-element ($self->{head_element}, 'head');
3088 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3089 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3090 $self->{insertion_mode} = 'in head';
3091 ## reprocess
3092 redo B;
3093 } elsif ($token->{type} eq 'comment') {
3094 my $comment = $self->{document}->create_comment ($token->{data});
3095 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3096 !!!next-token;
3097 redo B;
3098 } elsif ($token->{type} eq 'start tag') {
3099 my $attr = $token->{tag_name} eq 'head' ? $token->{attributes} : {};
3100 !!!create-element ($self->{head_element}, 'head', $attr);
3101 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3102 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3103 $self->{insertion_mode} = 'in head';
3104 if ($token->{tag_name} eq 'head') {
3105 !!!next-token;
3106 #} elsif ({
3107 # base => 1, link => 1, meta => 1,
3108 # script => 1, style => 1, title => 1,
3109 # }->{$token->{tag_name}}) {
3110 # ## reprocess
3111 } else {
3112 ## reprocess
3113 }
3114 redo B;
3115 } elsif ($token->{type} eq 'end tag') {
3116 if ($token->{tag_name} eq 'html') {
3117 ## As if <head>
3118 !!!create-element ($self->{head_element}, 'head');
3119 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3120 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3121 $self->{insertion_mode} = 'in head';
3122 ## reprocess
3123 redo B;
3124 } else {
3125 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3126 ## Ignore the token
3127 !!!next-token;
3128 redo B;
3129 }
3130 } else {
3131 die "$0: $token->{type}: Unknown type";
3132 }
3133 } elsif ($self->{insertion_mode} eq 'in head') {
3134 if ($token->{type} eq 'character') {
3135 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3136 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3137 unless (length $token->{data}) {
3138 !!!next-token;
3139 redo B;
3140 }
3141 }
3142
3143 #
3144 } elsif ($token->{type} eq 'comment') {
3145 my $comment = $self->{document}->create_comment ($token->{data});
3146 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3147 !!!next-token;
3148 redo B;
3149 } elsif ($token->{type} eq 'start tag') {
3150 if ($token->{tag_name} eq 'title') {
3151 ## NOTE: There is an "as if in head" code clone
3152 my $title_el;
3153 !!!create-element ($title_el, 'title', $token->{attributes});
3154 (defined $self->{head_element} ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
3155 ->append_child ($title_el);
3156 $self->{content_model_flag} = 'RCDATA';
3157 delete $self->{escape}; # MUST
3158
3159 my $text = '';
3160 !!!next-token;
3161 while ($token->{type} eq 'character') {
3162 $text .= $token->{data};
3163 !!!next-token;
3164 }
3165 if (length $text) {
3166 $title_el->manakai_append_text ($text);
3167 }
3168
3169 $self->{content_model_flag} = 'PCDATA';
3170
3171 if ($token->{type} eq 'end tag' and
3172 $token->{tag_name} eq 'title') {
3173 ## Ignore the token
3174 } else {
3175 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
3176 ## ISSUE: And ignore?
3177 }
3178 !!!next-token;
3179 redo B;
3180 } elsif ($token->{tag_name} eq 'style') {
3181 $style_start_tag->();
3182 redo B;
3183 } elsif ($token->{tag_name} eq 'script') {
3184 $script_start_tag->();
3185 redo B;
3186 } elsif ({base => 1, link => 1, meta => 1}->{$token->{tag_name}}) {
3187 ## NOTE: There are "as if in head" code clones
3188 my $el;
3189 !!!create-element ($el, $token->{tag_name}, $token->{attributes});
3190 (defined $self->{head_element} ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
3191 ->append_child ($el);
3192
3193 !!!next-token;
3194 redo B;
3195 } elsif ($token->{tag_name} eq 'head') {
3196 !!!parse-error (type => 'in head:head');
3197 ## Ignore the token
3198 !!!next-token;
3199 redo B;
3200 } else {
3201 #
3202 }
3203 } elsif ($token->{type} eq 'end tag') {
3204 if ($token->{tag_name} eq 'head') {
3205 if ($self->{open_elements}->[-1]->[1] eq 'head') {
3206 pop @{$self->{open_elements}};
3207 } else {
3208 !!!parse-error (type => 'unmatched end tag:head');
3209 }
3210 $self->{insertion_mode} = 'after head';
3211 !!!next-token;
3212 redo B;
3213 } elsif ($token->{tag_name} eq 'html') {
3214 #
3215 } else {
3216 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3217 ## Ignore the token
3218 !!!next-token;
3219 redo B;
3220 }
3221 } else {
3222 #
3223 }
3224
3225 if ($self->{open_elements}->[-1]->[1] eq 'head') {
3226 ## As if </head>
3227 pop @{$self->{open_elements}};
3228 }
3229 $self->{insertion_mode} = 'after head';
3230 ## reprocess
3231 redo B;
3232
3233 ## ISSUE: An issue in the spec.
3234 } elsif ($self->{insertion_mode} eq 'after head') {
3235 if ($token->{type} eq 'character') {
3236 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3237 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3238 unless (length $token->{data}) {
3239 !!!next-token;
3240 redo B;
3241 }
3242 }
3243
3244 #
3245 } elsif ($token->{type} eq 'comment') {
3246 my $comment = $self->{document}->create_comment ($token->{data});
3247 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3248 !!!next-token;
3249 redo B;
3250 } elsif ($token->{type} eq 'start tag') {
3251 if ($token->{tag_name} eq 'body') {
3252 !!!insert-element ('body', $token->{attributes});
3253 $self->{insertion_mode} = 'in body';
3254 !!!next-token;
3255 redo B;
3256 } elsif ($token->{tag_name} eq 'frameset') {
3257 !!!insert-element ('frameset', $token->{attributes});
3258 $self->{insertion_mode} = 'in frameset';
3259 !!!next-token;
3260 redo B;
3261 } elsif ({
3262 base => 1, link => 1, meta => 1,
3263 script => 1, style => 1, title => 1,
3264 }->{$token->{tag_name}}) {
3265 !!!parse-error (type => 'after head:'.$token->{tag_name});
3266 $self->{insertion_mode} = 'in head';
3267 ## reprocess
3268 redo B;
3269 } else {
3270 #
3271 }
3272 } else {
3273 #
3274 }
3275
3276 ## As if <body>
3277 !!!insert-element ('body');
3278 $self->{insertion_mode} = 'in body';
3279 ## reprocess
3280 redo B;
3281 } elsif ($self->{insertion_mode} eq 'in body') {
3282 if ($token->{type} eq 'character') {
3283 ## NOTE: There is a code clone of "character in body".
3284 $reconstruct_active_formatting_elements->($insert_to_current);
3285
3286 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3287
3288 !!!next-token;
3289 redo B;
3290 } elsif ($token->{type} eq 'comment') {
3291 ## NOTE: There is a code clone of "comment in body".
3292 my $comment = $self->{document}->create_comment ($token->{data});
3293 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3294 !!!next-token;
3295 redo B;
3296 } else {
3297 $in_body->($insert_to_current);
3298 redo B;
3299 }
3300 } elsif ($self->{insertion_mode} eq 'in table') {
3301 if ($token->{type} eq 'character') {
3302 ## NOTE: There are "character in table" code clones.
3303 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3304 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3305
3306 unless (length $token->{data}) {
3307 !!!next-token;
3308 redo B;
3309 }
3310 }
3311
3312 !!!parse-error (type => 'in table:#character');
3313
3314 ## As if in body, but insert into foster parent element
3315 ## ISSUE: Spec says that "whenever a node would be inserted
3316 ## into the current node" while characters might not be
3317 ## result in a new Text node.
3318 $reconstruct_active_formatting_elements->($insert_to_foster);
3319
3320 if ({
3321 table => 1, tbody => 1, tfoot => 1,
3322 thead => 1, tr => 1,
3323 }->{$self->{open_elements}->[-1]->[1]}) {
3324 # MUST
3325 my $foster_parent_element;
3326 my $next_sibling;
3327 my $prev_sibling;
3328 OE: for (reverse 0..$#{$self->{open_elements}}) {
3329 if ($self->{open_elements}->[$_]->[1] eq 'table') {
3330 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3331 if (defined $parent and $parent->node_type == 1) {
3332 $foster_parent_element = $parent;
3333 $next_sibling = $self->{open_elements}->[$_]->[0];
3334 $prev_sibling = $next_sibling->previous_sibling;
3335 } else {
3336 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
3337 $prev_sibling = $foster_parent_element->last_child;
3338 }
3339 last OE;
3340 }
3341 } # OE
3342 $foster_parent_element = $self->{open_elements}->[0]->[0] and
3343 $prev_sibling = $foster_parent_element->last_child
3344 unless defined $foster_parent_element;
3345 if (defined $prev_sibling and
3346 $prev_sibling->node_type == 3) {
3347 $prev_sibling->manakai_append_text ($token->{data});
3348 } else {
3349 $foster_parent_element->insert_before
3350 ($self->{document}->create_text_node ($token->{data}),
3351 $next_sibling);
3352 }
3353 } else {
3354 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3355 }
3356
3357 !!!next-token;
3358 redo B;
3359 } elsif ($token->{type} eq 'comment') {
3360 my $comment = $self->{document}->create_comment ($token->{data});
3361 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3362 !!!next-token;
3363 redo B;
3364 } elsif ($token->{type} eq 'start tag') {
3365 if ({
3366 caption => 1,
3367 colgroup => 1,
3368 tbody => 1, tfoot => 1, thead => 1,
3369 }->{$token->{tag_name}}) {
3370 ## Clear back to table context
3371 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3372 $self->{open_elements}->[-1]->[1] ne 'html') {
3373 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3374 pop @{$self->{open_elements}};
3375 }
3376
3377 push @$active_formatting_elements, ['#marker', '']
3378 if $token->{tag_name} eq 'caption';
3379
3380 !!!insert-element ($token->{tag_name}, $token->{attributes});
3381 $self->{insertion_mode} = {
3382 caption => 'in caption',
3383 colgroup => 'in column group',
3384 tbody => 'in table body',
3385 tfoot => 'in table body',
3386 thead => 'in table body',
3387 }->{$token->{tag_name}};
3388 !!!next-token;
3389 redo B;
3390 } elsif ({
3391 col => 1,
3392 td => 1, th => 1, tr => 1,
3393 }->{$token->{tag_name}}) {
3394 ## Clear back to table context
3395 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3396 $self->{open_elements}->[-1]->[1] ne 'html') {
3397 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3398 pop @{$self->{open_elements}};
3399 }
3400
3401 !!!insert-element ($token->{tag_name} eq 'col' ? 'colgroup' : 'tbody');
3402 $self->{insertion_mode} = $token->{tag_name} eq 'col'
3403 ? 'in column group' : 'in table body';
3404 ## reprocess
3405 redo B;
3406 } elsif ($token->{tag_name} eq 'table') {
3407 ## NOTE: There are code clones for this "table in table"
3408 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3409
3410 ## As if </table>
3411 ## have a table element in table scope
3412 my $i;
3413 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3414 my $node = $self->{open_elements}->[$_];
3415 if ($node->[1] eq 'table') {
3416 $i = $_;
3417 last INSCOPE;
3418 } elsif ({
3419 table => 1, html => 1,
3420 }->{$node->[1]}) {
3421 last INSCOPE;
3422 }
3423 } # INSCOPE
3424 unless (defined $i) {
3425 !!!parse-error (type => 'unmatched end tag:table');
3426 ## Ignore tokens </table><table>
3427 !!!next-token;
3428 redo B;
3429 }
3430
3431 ## generate implied end tags
3432 if ({
3433 dd => 1, dt => 1, li => 1, p => 1,
3434 td => 1, th => 1, tr => 1,
3435 }->{$self->{open_elements}->[-1]->[1]}) {
3436 !!!back-token; # <table>
3437 $token = {type => 'end tag', tag_name => 'table'};
3438 !!!back-token;
3439 $token = {type => 'end tag',
3440 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3441 redo B;
3442 }
3443
3444 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3445 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3446 }
3447
3448 splice @{$self->{open_elements}}, $i;
3449
3450 $self->_reset_insertion_mode;
3451
3452 ## reprocess
3453 redo B;
3454 } else {
3455 #
3456 }
3457 } elsif ($token->{type} eq 'end tag') {
3458 if ($token->{tag_name} eq 'table') {
3459 ## have a table element in table scope
3460 my $i;
3461 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3462 my $node = $self->{open_elements}->[$_];
3463 if ($node->[1] eq $token->{tag_name}) {
3464 $i = $_;
3465 last INSCOPE;
3466 } elsif ({
3467 table => 1, html => 1,
3468 }->{$node->[1]}) {
3469 last INSCOPE;
3470 }
3471 } # INSCOPE
3472 unless (defined $i) {
3473 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3474 ## Ignore the token
3475 !!!next-token;
3476 redo B;
3477 }
3478
3479 ## generate implied end tags
3480 if ({
3481 dd => 1, dt => 1, li => 1, p => 1,
3482 td => 1, th => 1, tr => 1,
3483 }->{$self->{open_elements}->[-1]->[1]}) {
3484 !!!back-token;
3485 $token = {type => 'end tag',
3486 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3487 redo B;
3488 }
3489
3490 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3491 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3492 }
3493
3494 splice @{$self->{open_elements}}, $i;
3495
3496 $self->_reset_insertion_mode;
3497
3498 !!!next-token;
3499 redo B;
3500 } elsif ({
3501 body => 1, caption => 1, col => 1, colgroup => 1,
3502 html => 1, tbody => 1, td => 1, tfoot => 1, th => 1,
3503 thead => 1, tr => 1,
3504 }->{$token->{tag_name}}) {
3505 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3506 ## Ignore the token
3507 !!!next-token;
3508 redo B;
3509 } else {
3510 #
3511 }
3512 } else {
3513 #
3514 }
3515
3516 !!!parse-error (type => 'in table:'.$token->{tag_name});
3517 $in_body->($insert_to_foster);
3518 redo B;
3519 } elsif ($self->{insertion_mode} eq 'in caption') {
3520 if ($token->{type} eq 'character') {
3521 ## NOTE: This is a code clone of "character in body".
3522 $reconstruct_active_formatting_elements->($insert_to_current);
3523
3524 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3525
3526 !!!next-token;
3527 redo B;
3528 } elsif ($token->{type} eq 'comment') {
3529 ## NOTE: This is a code clone of "comment in body".
3530 my $comment = $self->{document}->create_comment ($token->{data});
3531 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3532 !!!next-token;
3533 redo B;
3534 } elsif ($token->{type} eq 'start tag') {
3535 if ({
3536 caption => 1, col => 1, colgroup => 1, tbody => 1,
3537 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3538 }->{$token->{tag_name}}) {
3539 !!!parse-error (type => 'not closed:caption');
3540
3541 ## As if </caption>
3542 ## have a table element in table scope
3543 my $i;
3544 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3545 my $node = $self->{open_elements}->[$_];
3546 if ($node->[1] eq 'caption') {
3547 $i = $_;
3548 last INSCOPE;
3549 } elsif ({
3550 table => 1, html => 1,
3551 }->{$node->[1]}) {
3552 last INSCOPE;
3553 }
3554 } # INSCOPE
3555 unless (defined $i) {
3556 !!!parse-error (type => 'unmatched end tag:caption');
3557 ## Ignore the token
3558 !!!next-token;
3559 redo B;
3560 }
3561
3562 ## generate implied end tags
3563 if ({
3564 dd => 1, dt => 1, li => 1, p => 1,
3565 td => 1, th => 1, tr => 1,
3566 }->{$self->{open_elements}->[-1]->[1]}) {
3567 !!!back-token; # <?>
3568 $token = {type => 'end tag', tag_name => 'caption'};
3569 !!!back-token;
3570 $token = {type => 'end tag',
3571 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3572 redo B;
3573 }
3574
3575 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3576 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3577 }
3578
3579 splice @{$self->{open_elements}}, $i;
3580
3581 $clear_up_to_marker->();
3582
3583 $self->{insertion_mode} = 'in table';
3584
3585 ## reprocess
3586 redo B;
3587 } else {
3588 #
3589 }
3590 } elsif ($token->{type} eq 'end tag') {
3591 if ($token->{tag_name} eq 'caption') {
3592 ## have a table element in table scope
3593 my $i;
3594 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3595 my $node = $self->{open_elements}->[$_];
3596 if ($node->[1] eq $token->{tag_name}) {
3597 $i = $_;
3598 last INSCOPE;
3599 } elsif ({
3600 table => 1, html => 1,
3601 }->{$node->[1]}) {
3602 last INSCOPE;
3603 }
3604 } # INSCOPE
3605 unless (defined $i) {
3606 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3607 ## Ignore the token
3608 !!!next-token;
3609 redo B;
3610 }
3611
3612 ## generate implied end tags
3613 if ({
3614 dd => 1, dt => 1, li => 1, p => 1,
3615 td => 1, th => 1, tr => 1,
3616 }->{$self->{open_elements}->[-1]->[1]}) {
3617 !!!back-token;
3618 $token = {type => 'end tag',
3619 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3620 redo B;
3621 }
3622
3623 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3624 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3625 }
3626
3627 splice @{$self->{open_elements}}, $i;
3628
3629 $clear_up_to_marker->();
3630
3631 $self->{insertion_mode} = 'in table';
3632
3633 !!!next-token;
3634 redo B;
3635 } elsif ($token->{tag_name} eq 'table') {
3636 !!!parse-error (type => 'not closed:caption');
3637
3638 ## As if </caption>
3639 ## have a table element in table scope
3640 my $i;
3641 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3642 my $node = $self->{open_elements}->[$_];
3643 if ($node->[1] eq 'caption') {
3644 $i = $_;
3645 last INSCOPE;
3646 } elsif ({
3647 table => 1, html => 1,
3648 }->{$node->[1]}) {
3649 last INSCOPE;
3650 }
3651 } # INSCOPE
3652 unless (defined $i) {
3653 !!!parse-error (type => 'unmatched end tag:caption');
3654 ## Ignore the token
3655 !!!next-token;
3656 redo B;
3657 }
3658
3659 ## generate implied end tags
3660 if ({
3661 dd => 1, dt => 1, li => 1, p => 1,
3662 td => 1, th => 1, tr => 1,
3663 }->{$self->{open_elements}->[-1]->[1]}) {
3664 !!!back-token; # </table>
3665 $token = {type => 'end tag', tag_name => 'caption'};
3666 !!!back-token;
3667 $token = {type => 'end tag',
3668 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3669 redo B;
3670 }
3671
3672 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3673 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3674 }
3675
3676 splice @{$self->{open_elements}}, $i;
3677
3678 $clear_up_to_marker->();
3679
3680 $self->{insertion_mode} = 'in table';
3681
3682 ## reprocess
3683 redo B;
3684 } elsif ({
3685 body => 1, col => 1, colgroup => 1,
3686 html => 1, tbody => 1, td => 1, tfoot => 1,
3687 th => 1, thead => 1, tr => 1,
3688 }->{$token->{tag_name}}) {
3689 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3690 !!!next-token; ## Ignore the token: it must be consumed before `redo B', otherwise the same end tag is reprocessed forever
3691 redo B;
3692 } else {
3693 #
3694 }
3695 } else {
3696 #
3697 }
3698
3699 $in_body->($insert_to_current);
3700 redo B;
3701 } elsif ($self->{insertion_mode} eq 'in column group') {
3702 if ($token->{type} eq 'character') {
3703 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3704 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3705 unless (length $token->{data}) {
3706 !!!next-token;
3707 redo B;
3708 }
3709 }
3710
3711 #
3712 } elsif ($token->{type} eq 'comment') {
3713 my $comment = $self->{document}->create_comment ($token->{data});
3714 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3715 !!!next-token;
3716 redo B;
3717 } elsif ($token->{type} eq 'start tag') {
3718 if ($token->{tag_name} eq 'col') {
3719 !!!insert-element ($token->{tag_name}, $token->{attributes});
3720 pop @{$self->{open_elements}};
3721 !!!next-token;
3722 redo B;
3723 } else {
3724 #
3725 }
3726 } elsif ($token->{type} eq 'end tag') {
3727 if ($token->{tag_name} eq 'colgroup') {
3728 if ($self->{open_elements}->[-1]->[1] eq 'html') {
3729 !!!parse-error (type => 'unmatched end tag:colgroup');
3730 ## Ignore the token
3731 !!!next-token;
3732 redo B;
3733 } else {
3734 pop @{$self->{open_elements}}; # colgroup
3735 $self->{insertion_mode} = 'in table';
3736 !!!next-token;
3737 redo B;
3738 }
3739 } elsif ($token->{tag_name} eq 'col') {
3740 !!!parse-error (type => 'unmatched end tag:col');
3741 ## Ignore the token
3742 !!!next-token;
3743 redo B;
3744 } else {
3745 #
3746 }
3747 } else {
3748 #
3749 }
3750
3751 ## As if </colgroup>
3752 if ($self->{open_elements}->[-1]->[1] eq 'html') {
3753 !!!parse-error (type => 'unmatched end tag:colgroup');
3754 ## Ignore the token
3755 !!!next-token;
3756 redo B;
3757 } else {
3758 pop @{$self->{open_elements}}; # colgroup
3759 $self->{insertion_mode} = 'in table';
3760 ## reprocess
3761 redo B;
3762 }
3763 } elsif ($self->{insertion_mode} eq 'in table body') {
3764 if ($token->{type} eq 'character') {
3765 ## NOTE: This is a "character in table" code clone.
3766 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3767 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3768
3769 unless (length $token->{data}) {
3770 !!!next-token;
3771 redo B;
3772 }
3773 }
3774
3775 !!!parse-error (type => 'in table:#character');
3776
3777 ## As if in body, but insert into foster parent element
3778 ## ISSUE: Spec says that "whenever a node would be inserted
3779 ## into the current node" while characters might not be
3780 ## result in a new Text node.
3781 $reconstruct_active_formatting_elements->($insert_to_foster);
3782
3783 if ({
3784 table => 1, tbody => 1, tfoot => 1,
3785 thead => 1, tr => 1,
3786 }->{$self->{open_elements}->[-1]->[1]}) {
3787 # MUST
3788 my $foster_parent_element;
3789 my $next_sibling;
3790 my $prev_sibling;
3791 OE: for (reverse 0..$#{$self->{open_elements}}) {
3792 if ($self->{open_elements}->[$_]->[1] eq 'table') {
3793 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3794 if (defined $parent and $parent->node_type == 1) {
3795 $foster_parent_element = $parent;
3796 $next_sibling = $self->{open_elements}->[$_]->[0];
3797 $prev_sibling = $next_sibling->previous_sibling;
3798 } else {
3799 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
3800 $prev_sibling = $foster_parent_element->last_child;
3801 }
3802 last OE;
3803 }
3804 } # OE
3805 $foster_parent_element = $self->{open_elements}->[0]->[0] and
3806 $prev_sibling = $foster_parent_element->last_child
3807 unless defined $foster_parent_element;
3808 if (defined $prev_sibling and
3809 $prev_sibling->node_type == 3) {
3810 $prev_sibling->manakai_append_text ($token->{data});
3811 } else {
3812 $foster_parent_element->insert_before
3813 ($self->{document}->create_text_node ($token->{data}),
3814 $next_sibling);
3815 }
3816 } else {
3817 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3818 }
3819
3820 !!!next-token;
3821 redo B;
3822 } elsif ($token->{type} eq 'comment') {
3823 ## Copied from 'in table'
3824 my $comment = $self->{document}->create_comment ($token->{data});
3825 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3826 !!!next-token;
3827 redo B;
3828 } elsif ($token->{type} eq 'start tag') {
3829 if ({
3830 tr => 1,
3831 th => 1, td => 1,
3832 }->{$token->{tag_name}}) {
3833 unless ($token->{tag_name} eq 'tr') {
3834 !!!parse-error (type => 'missing start tag:tr');
3835 }
3836
3837 ## Clear back to table body context
3838 while (not {
3839 tbody => 1, tfoot => 1, thead => 1, html => 1,
3840 }->{$self->{open_elements}->[-1]->[1]}) {
3841 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3842 pop @{$self->{open_elements}};
3843 }
3844
3845 $self->{insertion_mode} = 'in row';
3846 if ($token->{tag_name} eq 'tr') {
3847 !!!insert-element ($token->{tag_name}, $token->{attributes});
3848 !!!next-token;
3849 } else {
3850 !!!insert-element ('tr');
3851 ## reprocess
3852 }
3853 redo B;
3854 } elsif ({
3855 caption => 1, col => 1, colgroup => 1,
3856 tbody => 1, tfoot => 1, thead => 1,
3857 }->{$token->{tag_name}}) {
3858 ## have an element in table scope
3859 my $i;
3860 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3861 my $node = $self->{open_elements}->[$_];
3862 if ({
3863 tbody => 1, thead => 1, tfoot => 1,
3864 }->{$node->[1]}) {
3865 $i = $_;
3866 last INSCOPE;
3867 } elsif ({
3868 table => 1, html => 1,
3869 }->{$node->[1]}) {
3870 last INSCOPE;
3871 }
3872 } # INSCOPE
3873 unless (defined $i) {
3874 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3875 ## Ignore the token
3876 !!!next-token;
3877 redo B;
3878 }
3879
3880 ## Clear back to table body context
3881 while (not {
3882 tbody => 1, tfoot => 1, thead => 1, html => 1,
3883 }->{$self->{open_elements}->[-1]->[1]}) {
3884 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3885 pop @{$self->{open_elements}};
3886 }
3887
3888 ## As if <{current node}>
3889 ## have an element in table scope
3890 ## true by definition
3891
3892 ## Clear back to table body context
3893 ## nop by definition
3894
3895 pop @{$self->{open_elements}};
3896 $self->{insertion_mode} = 'in table';
3897 ## reprocess
3898 redo B;
3899 } elsif ($token->{tag_name} eq 'table') {
3900 ## NOTE: This is a code clone of "table in table"
3901 !!!parse-error (type => 'not closed:table');
3902
3903 ## As if </table>
3904 ## have a table element in table scope
3905 my $i;
3906 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3907 my $node = $self->{open_elements}->[$_];
3908 if ($node->[1] eq 'table') {
3909 $i = $_;
3910 last INSCOPE;
3911 } elsif ({
3912 table => 1, html => 1,
3913 }->{$node->[1]}) {
3914 last INSCOPE;
3915 }
3916 } # INSCOPE
3917 unless (defined $i) {
3918 !!!parse-error (type => 'unmatched end tag:table');
3919 ## Ignore tokens </table><table>
3920 !!!next-token;
3921 redo B;
3922 }
3923
3924 ## generate implied end tags
3925 if ({
3926 dd => 1, dt => 1, li => 1, p => 1,
3927 td => 1, th => 1, tr => 1,
3928 }->{$self->{open_elements}->[-1]->[1]}) {
3929 !!!back-token; # <table>
3930 $token = {type => 'end tag', tag_name => 'table'};
3931 !!!back-token;
3932 $token = {type => 'end tag',
3933 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3934 redo B;
3935 }
3936
3937 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3938 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3939 }
3940
3941 splice @{$self->{open_elements}}, $i;
3942
3943 $self->_reset_insertion_mode;
3944
3945 ## reprocess
3946 redo B;
3947 } else {
3948 #
3949 }
3950 } elsif ($token->{type} eq 'end tag') {
3951 if ({
3952 tbody => 1, tfoot => 1, thead => 1,
3953 }->{$token->{tag_name}}) {
3954 ## have an element in table scope
3955 my $i;
3956 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3957 my $node = $self->{open_elements}->[$_];
3958 if ($node->[1] eq $token->{tag_name}) {
3959 $i = $_;
3960 last INSCOPE;
3961 } elsif ({
3962 table => 1, html => 1,
3963 }->{$node->[1]}) {
3964 last INSCOPE;
3965 }
3966 } # INSCOPE
3967 unless (defined $i) {
3968 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3969 ## Ignore the token
3970 !!!next-token;
3971 redo B;
3972 }
3973
3974 ## Clear back to table body context
3975 while (not {
3976 tbody => 1, tfoot => 1, thead => 1, html => 1,
3977 }->{$self->{open_elements}->[-1]->[1]}) {
3978 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3979 pop @{$self->{open_elements}};
3980 }
3981
3982 pop @{$self->{open_elements}};
3983 $self->{insertion_mode} = 'in table';
3984 !!!next-token;
3985 redo B;
3986 } elsif ($token->{tag_name} eq 'table') {
3987 ## have an element in table scope
3988 my $i;
3989 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3990 my $node = $self->{open_elements}->[$_];
3991 if ({
3992 tbody => 1, thead => 1, tfoot => 1,
3993 }->{$node->[1]}) {
3994 $i = $_;
3995 last INSCOPE;
3996 } elsif ({
3997 table => 1, html => 1,
3998 }->{$node->[1]}) {
3999 last INSCOPE;
4000 }
4001 } # INSCOPE
4002 unless (defined $i) {
4003 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4004 ## Ignore the token
4005 !!!next-token;
4006 redo B;
4007 }
4008
4009 ## Clear back to table body context
4010 while (not {
4011 tbody => 1, tfoot => 1, thead => 1, html => 1,
4012 }->{$self->{open_elements}->[-1]->[1]}) {
4013 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4014 pop @{$self->{open_elements}};
4015 }
4016
4017 ## As if <{current node}>
4018 ## have an element in table scope
4019 ## true by definition
4020
4021 ## Clear back to table body context
4022 ## nop by definition
4023
4024 pop @{$self->{open_elements}};
4025 $self->{insertion_mode} = 'in table';
4026 ## reprocess
4027 redo B;
4028 } elsif ({
4029 body => 1, caption => 1, col => 1, colgroup => 1,
4030 html => 1, td => 1, th => 1, tr => 1,
4031 }->{$token->{tag_name}}) {
4032 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4033 ## Ignore the token
4034 !!!next-token;
4035 redo B;
4036 } else {
4037 #
4038 }
4039 } else {
4040 #
4041 }
4042
4043 ## As if in table
4044 !!!parse-error (type => 'in table:'.$token->{tag_name});
4045 $in_body->($insert_to_foster);
4046 redo B;
4047 } elsif ($self->{insertion_mode} eq 'in row') {
4048 if ($token->{type} eq 'character') {
4049 ## NOTE: This is a "character in table" code clone.
4050 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4051 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4052
4053 unless (length $token->{data}) {
4054 !!!next-token;
4055 redo B;
4056 }
4057 }
4058
4059 !!!parse-error (type => 'in table:#character');
4060
4061 ## As if in body, but insert into foster parent element
4062 ## ISSUE: Spec says that "whenever a node would be inserted
4063 ## into the current node" while characters might not be
4064 ## result in a new Text node.
4065 $reconstruct_active_formatting_elements->($insert_to_foster);
4066
4067 if ({
4068 table => 1, tbody => 1, tfoot => 1,
4069 thead => 1, tr => 1,
4070 }->{$self->{open_elements}->[-1]->[1]}) {
4071 # MUST
4072 my $foster_parent_element;
4073 my $next_sibling;
4074 my $prev_sibling;
4075 OE: for (reverse 0..$#{$self->{open_elements}}) {
4076 if ($self->{open_elements}->[$_]->[1] eq 'table') {
4077 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4078 if (defined $parent and $parent->node_type == 1) {
4079 $foster_parent_element = $parent;
4080 $next_sibling = $self->{open_elements}->[$_]->[0];
4081 $prev_sibling = $next_sibling->previous_sibling;
4082 } else {
4083 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
4084 $prev_sibling = $foster_parent_element->last_child;
4085 }
4086 last OE;
4087 }
4088 } # OE
4089 $foster_parent_element = $self->{open_elements}->[0]->[0] and
4090 $prev_sibling = $foster_parent_element->last_child
4091 unless defined $foster_parent_element;
4092 if (defined $prev_sibling and
4093 $prev_sibling->node_type == 3) {
4094 $prev_sibling->manakai_append_text ($token->{data});
4095 } else {
4096 $foster_parent_element->insert_before
4097 ($self->{document}->create_text_node ($token->{data}),
4098 $next_sibling);
4099 }
4100 } else {
4101 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4102 }
4103
4104 !!!next-token;
4105 redo B;
4106 } elsif ($token->{type} eq 'comment') {
4107 ## Copied from 'in table'
4108 my $comment = $self->{document}->create_comment ($token->{data});
4109 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4110 !!!next-token;
4111 redo B;
4112 } elsif ($token->{type} eq 'start tag') {
4113 if ($token->{tag_name} eq 'th' or
4114 $token->{tag_name} eq 'td') {
4115 ## Clear back to table row context
4116 while (not {
4117 tr => 1, html => 1,
4118 }->{$self->{open_elements}->[-1]->[1]}) {
4119 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4120 pop @{$self->{open_elements}};
4121 }
4122
4123 !!!insert-element ($token->{tag_name}, $token->{attributes});
4124 $self->{insertion_mode} = 'in cell';
4125
4126 push @$active_formatting_elements, ['#marker', ''];
4127
4128 !!!next-token;
4129 redo B;
4130 } elsif ({
4131 caption => 1, col => 1, colgroup => 1,
4132 tbody => 1, tfoot => 1, thead => 1, tr => 1,
4133 }->{$token->{tag_name}}) {
4134 ## As if </tr>
4135 ## have an element in table scope
4136 my $i;
4137 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4138 my $node = $self->{open_elements}->[$_];
4139 if ($node->[1] eq 'tr') {
4140 $i = $_;
4141 last INSCOPE;
4142 } elsif ({
4143 table => 1, html => 1,
4144 }->{$node->[1]}) {
4145 last INSCOPE;
4146 }
4147 } # INSCOPE
4148 unless (defined $i) {
4149 !!!parse-error (type => 'unmacthed end tag:'.$token->{tag_name});
4150 ## Ignore the token
4151 !!!next-token;
4152 redo B;
4153 }
4154
4155 ## Clear back to table row context
4156 while (not {
4157 tr => 1, html => 1,
4158 }->{$self->{open_elements}->[-1]->[1]}) {
4159 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4160 pop @{$self->{open_elements}};
4161 }
4162
4163 pop @{$self->{open_elements}}; # tr
4164 $self->{insertion_mode} = 'in table body';
4165 ## reprocess
4166 redo B;
4167 } elsif ($token->{tag_name} eq 'table') {
4168 ## NOTE: This is a code clone of "table in table"
4169 !!!parse-error (type => 'not closed:table');
4170
4171 ## As if </table>
4172 ## have a table element in table scope
4173 my $i;
4174 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4175 my $node = $self->{open_elements}->[$_];
4176 if ($node->[1] eq 'table') {
4177 $i = $_;
4178 last INSCOPE;
4179 } elsif ({
4180 table => 1, html => 1,
4181 }->{$node->[1]}) {
4182 last INSCOPE;
4183 }
4184 } # INSCOPE
4185 unless (defined $i) {
4186 !!!parse-error (type => 'unmatched end tag:table');
4187 ## Ignore tokens </table><table>
4188 !!!next-token;
4189 redo B;
4190 }
4191
4192 ## generate implied end tags
4193 if ({
4194 dd => 1, dt => 1, li => 1, p => 1,
4195 td => 1, th => 1, tr => 1,
4196 }->{$self->{open_elements}->[-1]->[1]}) {
4197 !!!back-token; # <table>
4198 $token = {type => 'end tag', tag_name => 'table'};
4199 !!!back-token;
4200 $token = {type => 'end tag',
4201 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4202 redo B;
4203 }
4204
4205 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4206 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4207 }
4208
4209 splice @{$self->{open_elements}}, $i;
4210
4211 $self->_reset_insertion_mode;
4212
4213 ## reprocess
4214 redo B;
4215 } else {
4216 #
4217 }
4218 } elsif ($token->{type} eq 'end tag') {
4219 if ($token->{tag_name} eq 'tr') {
4220 ## have an element in table scope
4221 my $i;
4222 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4223 my $node = $self->{open_elements}->[$_];
4224 if ($node->[1] eq $token->{tag_name}) {
4225 $i = $_;
4226 last INSCOPE;
4227 } elsif ({
4228 table => 1, html => 1,
4229 }->{$node->[1]}) {
4230 last INSCOPE;
4231 }
4232 } # INSCOPE
4233 unless (defined $i) {
4234 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4235 ## Ignore the token
4236 !!!next-token;
4237 redo B;
4238 }
4239
4240 ## Clear back to table row context
4241 while (not {
4242 tr => 1, html => 1,
4243 }->{$self->{open_elements}->[-1]->[1]}) {
4244 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4245 pop @{$self->{open_elements}};
4246 }
4247
4248 pop @{$self->{open_elements}}; # tr
4249 $self->{insertion_mode} = 'in table body';
4250 !!!next-token;
4251 redo B;
4252 } elsif ($token->{tag_name} eq 'table') {
4253 ## As if </tr>
4254 ## have an element in table scope
4255 my $i;
4256 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4257 my $node = $self->{open_elements}->[$_];
4258 if ($node->[1] eq 'tr') {
4259 $i = $_;
4260 last INSCOPE;
4261 } elsif ({
4262 table => 1, html => 1,
4263 }->{$node->[1]}) {
4264 last INSCOPE;
4265 }
4266 } # INSCOPE
4267 unless (defined $i) {
4268 !!!parse-error (type => 'unmatched end tag:'.$token->{type});
4269 ## Ignore the token
4270 !!!next-token;
4271 redo B;
4272 }
4273
4274 ## Clear back to table row context
4275 while (not {
4276 tr => 1, html => 1,
4277 }->{$self->{open_elements}->[-1]->[1]}) {
4278 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4279 pop @{$self->{open_elements}};
4280 }
4281
4282 pop @{$self->{open_elements}}; # tr
4283 $self->{insertion_mode} = 'in table body';
4284 ## reprocess
4285 redo B;
4286 } elsif ({
4287 tbody => 1, tfoot => 1, thead => 1,
4288 }->{$token->{tag_name}}) {
4289 ## have an element in table scope
4290 my $i;
4291 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4292 my $node = $self->{open_elements}->[$_];
4293 if ($node->[1] eq $token->{tag_name}) {
4294 $i = $_;
4295 last INSCOPE;
4296 } elsif ({
4297 table => 1, html => 1,
4298 }->{$node->[1]}) {
4299 last INSCOPE;
4300 }
4301 } # INSCOPE
4302 unless (defined $i) {
4303 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4304 ## Ignore the token
4305 !!!next-token;
4306 redo B;
4307 }
4308
4309 ## As if </tr>
4310 ## have an element in table scope
4311 my $i;
4312 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4313 my $node = $self->{open_elements}->[$_];
4314 if ($node->[1] eq 'tr') {
4315 $i = $_;
4316 last INSCOPE;
4317 } elsif ({
4318 table => 1, html => 1,
4319 }->{$node->[1]}) {
4320 last INSCOPE;
4321 }
4322 } # INSCOPE
4323 unless (defined $i) {
4324 !!!parse-error (type => 'unmatched end tag:tr');
4325 ## Ignore the token
4326 !!!next-token;
4327 redo B;
4328 }
4329
4330 ## Clear back to table row context
4331 while (not {
4332 tr => 1, html => 1,
4333 }->{$self->{open_elements}->[-1]->[1]}) {
4334 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4335 pop @{$self->{open_elements}};
4336 }
4337
4338 pop @{$self->{open_elements}}; # tr
4339 $self->{insertion_mode} = 'in table body';
4340 ## reprocess
4341 redo B;
4342 } elsif ({
4343 body => 1, caption => 1, col => 1,
4344 colgroup => 1, html => 1, td => 1, th => 1,
4345 }->{$token->{tag_name}}) {
4346 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4347 ## Ignore the token
4348 !!!next-token;
4349 redo B;
4350 } else {
4351 #
4352 }
4353 } else {
4354 #
4355 }
4356
4357 ## As if in table
4358 !!!parse-error (type => 'in table:'.$token->{tag_name});
4359 $in_body->($insert_to_foster);
4360 redo B;
4361 } elsif ($self->{insertion_mode} eq 'in cell') {
4362 if ($token->{type} eq 'character') {
4363 ## NOTE: This is a code clone of "character in body".
4364 $reconstruct_active_formatting_elements->($insert_to_current);
4365
4366 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4367
4368 !!!next-token;
4369 redo B;
4370 } elsif ($token->{type} eq 'comment') {
4371 ## NOTE: This is a code clone of "comment in body".
4372 my $comment = $self->{document}->create_comment ($token->{data});
4373 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4374 !!!next-token;
4375 redo B;
4376 } elsif ($token->{type} eq 'start tag') {
4377 if ({
4378 caption => 1, col => 1, colgroup => 1,
4379 tbody => 1, td => 1, tfoot => 1, th => 1,
4380 thead => 1, tr => 1,
4381 }->{$token->{tag_name}}) {
4382 ## have an element in table scope
4383 my $tn;
4384 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4385 my $node = $self->{open_elements}->[$_];
4386 if ($node->[1] eq 'td' or $node->[1] eq 'th') {
4387 $tn = $node->[1];
4388 last INSCOPE;
4389 } elsif ({
4390 table => 1, html => 1,
4391 }->{$node->[1]}) {
4392 last INSCOPE;
4393 }
4394 } # INSCOPE
4395 unless (defined $tn) {
4396 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4397 ## Ignore the token
4398 !!!next-token;
4399 redo B;
4400 }
4401
4402 ## Close the cell
4403 !!!back-token; # <?>
4404 $token = {type => 'end tag', tag_name => $tn};
4405 redo B;
4406 } else {
4407 #
4408 }
4409 } elsif ($token->{type} eq 'end tag') {
4410 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
4411 ## have an element in table scope
4412 my $i;
4413 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4414 my $node = $self->{open_elements}->[$_];
4415 if ($node->[1] eq $token->{tag_name}) {
4416 $i = $_;
4417 last INSCOPE;
4418 } elsif ({
4419 table => 1, html => 1,
4420 }->{$node->[1]}) {
4421 last INSCOPE;
4422 }
4423 } # INSCOPE
4424 unless (defined $i) {
4425 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4426 ## Ignore the token
4427 !!!next-token;
4428 redo B;
4429 }
4430
4431 ## generate implied end tags
4432 if ({
4433 dd => 1, dt => 1, li => 1, p => 1,
4434 td => ($token->{tag_name} eq 'th'),
4435 th => ($token->{tag_name} eq 'td'),
4436 tr => 1,
4437 }->{$self->{open_elements}->[-1]->[1]}) {
4438 !!!back-token;
4439 $token = {type => 'end tag',
4440 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4441 redo B;
4442 }
4443
4444 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
4445 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4446 }
4447
4448 splice @{$self->{open_elements}}, $i;
4449
4450 $clear_up_to_marker->();
4451
4452 $self->{insertion_mode} = 'in row';
4453
4454 !!!next-token;
4455 redo B;
4456 } elsif ({
4457 body => 1, caption => 1, col => 1,
4458 colgroup => 1, html => 1,
4459 }->{$token->{tag_name}}) {
4460 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4461 ## Ignore the token
4462 !!!next-token;
4463 redo B;
4464 } elsif ({
4465 table => 1, tbody => 1, tfoot => 1,
4466 thead => 1, tr => 1,
4467 }->{$token->{tag_name}}) {
4468 ## have an element in table scope
4469 my $i;
4470 my $tn;
4471 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4472 my $node = $self->{open_elements}->[$_];
4473 if ($node->[1] eq $token->{tag_name}) {
4474 $i = $_;
4475 last INSCOPE;
4476 } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
4477 $tn = $node->[1];
4478 ## NOTE: There is exactly one |td| or |th| element
4479 ## in scope in the stack of open elements by definition.
4480 } elsif ({
4481 table => 1, html => 1,
4482 }->{$node->[1]}) {
4483 last INSCOPE;
4484 }
4485 } # INSCOPE
4486 unless (defined $i) {
4487 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4488 ## Ignore the token
4489 !!!next-token;
4490 redo B;
4491 }
4492
4493 ## Close the cell
4494 !!!back-token; # </?>
4495 $token = {type => 'end tag', tag_name => $tn};
4496 redo B;
4497 } else {
4498 #
4499 }
4500 } else {
4501 #
4502 }
4503
4504 $in_body->($insert_to_current);
4505 redo B;
4506 } elsif ($self->{insertion_mode} eq 'in select') {
4507 if ($token->{type} eq 'character') {
4508 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4509 !!!next-token;
4510 redo B;
4511 } elsif ($token->{type} eq 'comment') {
4512 my $comment = $self->{document}->create_comment ($token->{data});
4513 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4514 !!!next-token;
4515 redo B;
4516 } elsif ($token->{type} eq 'start tag') {
4517 if ($token->{tag_name} eq 'option') {
4518 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4519 ## As if </option>
4520 pop @{$self->{open_elements}};
4521 }
4522
4523 !!!insert-element ($token->{tag_name}, $token->{attributes});
4524 !!!next-token;
4525 redo B;
4526 } elsif ($token->{tag_name} eq 'optgroup') {
4527 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4528 ## As if </option>
4529 pop @{$self->{open_elements}};
4530 }
4531
4532 if ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4533 ## As if </optgroup>
4534 pop @{$self->{open_elements}};
4535 }
4536
4537 !!!insert-element ($token->{tag_name}, $token->{attributes});
4538 !!!next-token;
4539 redo B;
4540 } elsif ($token->{tag_name} eq 'select') {
4541 !!!parse-error (type => 'not closed:select');
4542 ## As if </select> instead
4543 ## have an element in table scope
4544 my $i;
4545 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4546 my $node = $self->{open_elements}->[$_];
4547 if ($node->[1] eq $token->{tag_name}) {
4548 $i = $_;
4549 last INSCOPE;
4550 } elsif ({
4551 table => 1, html => 1,
4552 }->{$node->[1]}) {
4553 last INSCOPE;
4554 }
4555 } # INSCOPE
4556 unless (defined $i) {
4557 !!!parse-error (type => 'unmatched end tag:select');
4558 ## Ignore the token
4559 !!!next-token;
4560 redo B;
4561 }
4562
4563 splice @{$self->{open_elements}}, $i;
4564
4565 $self->_reset_insertion_mode;
4566
4567 !!!next-token;
4568 redo B;
4569 } else {
4570 #
4571 }
4572 } elsif ($token->{type} eq 'end tag') {
4573 if ($token->{tag_name} eq 'optgroup') {
4574 if ($self->{open_elements}->[-1]->[1] eq 'option' and
4575 $self->{open_elements}->[-2]->[1] eq 'optgroup') {
4576 ## As if </option>
4577 splice @{$self->{open_elements}}, -2;
4578 } elsif ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4579 pop @{$self->{open_elements}};
4580 } else {
4581 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4582 ## Ignore the token
4583 }
4584 !!!next-token;
4585 redo B;
4586 } elsif ($token->{tag_name} eq 'option') {
4587 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4588 pop @{$self->{open_elements}};
4589 } else {
4590 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4591 ## Ignore the token
4592 }
4593 !!!next-token;
4594 redo B;
4595 } elsif ($token->{tag_name} eq 'select') {
4596 ## have an element in table scope
4597 my $i;
4598 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4599 my $node = $self->{open_elements}->[$_];
4600 if ($node->[1] eq $token->{tag_name}) {
4601 $i = $_;
4602 last INSCOPE;
4603 } elsif ({
4604 table => 1, html => 1,
4605 }->{$node->[1]}) {
4606 last INSCOPE;
4607 }
4608 } # INSCOPE
4609 unless (defined $i) {
4610 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4611 ## Ignore the token
4612 !!!next-token;
4613 redo B;
4614 }
4615
4616 splice @{$self->{open_elements}}, $i;
4617
4618 $self->_reset_insertion_mode;
4619
4620 !!!next-token;
4621 redo B;
4622 } elsif ({
4623 caption => 1, table => 1, tbody => 1,
4624 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
4625 }->{$token->{tag_name}}) {
4626 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4627
4628 ## have an element in table scope
4629 my $i;
4630 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4631 my $node = $self->{open_elements}->[$_];
4632 if ($node->[1] eq $token->{tag_name}) {
4633 $i = $_;
4634 last INSCOPE;
4635 } elsif ({
4636 table => 1, html => 1,
4637 }->{$node->[1]}) {
4638 last INSCOPE;
4639 }
4640 } # INSCOPE
4641 unless (defined $i) {
4642 ## Ignore the token
4643 !!!next-token;
4644 redo B;
4645 }
4646
4647 ## As if </select>
4648 ## have an element in table scope
4649 undef $i;
4650 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4651 my $node = $self->{open_elements}->[$_];
4652 if ($node->[1] eq 'select') {
4653 $i = $_;
4654 last INSCOPE;
4655 } elsif ({
4656 table => 1, html => 1,
4657 }->{$node->[1]}) {
4658 last INSCOPE;
4659 }
4660 } # INSCOPE
4661 unless (defined $i) {
4662 !!!parse-error (type => 'unmatched end tag:select');
4663 ## Ignore the </select> token
4664 !!!next-token; ## TODO: ok?
4665 redo B;
4666 }
4667
4668 splice @{$self->{open_elements}}, $i;
4669
4670 $self->_reset_insertion_mode;
4671
4672 ## reprocess
4673 redo B;
4674 } else {
4675 #
4676 }
4677 } else {
4678 #
4679 }
4680
4681 !!!parse-error (type => 'in select:'.$token->{tag_name});
4682 ## Ignore the token
4683 !!!next-token;
4684 redo B;
4685 } elsif ($self->{insertion_mode} eq 'after body') {
4686 if ($token->{type} eq 'character') {
4687 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4688 ## As if in body
4689 $reconstruct_active_formatting_elements->($insert_to_current);
4690
4691 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4692
4693 unless (length $token->{data}) {
4694 !!!next-token;
4695 redo B;
4696 }
4697 }
4698
4699 #
4700 !!!parse-error (type => 'after body:#'.$token->{type});
4701 } elsif ($token->{type} eq 'comment') {
4702 my $comment = $self->{document}->create_comment ($token->{data});
4703 $self->{open_elements}->[0]->[0]->append_child ($comment);
4704 !!!next-token;
4705 redo B;
4706 } elsif ($token->{type} eq 'start tag') {
4707 !!!parse-error (type => 'after body:'.$token->{tag_name});
4708 #
4709 } elsif ($token->{type} eq 'end tag') {
4710 if ($token->{tag_name} eq 'html') {
4711 if (defined $self->{inner_html_node}) {
4712 !!!parse-error (type => 'unmatched end tag:html');
4713 ## Ignore the token
4714 !!!next-token;
4715 redo B;
4716 } else {
4717 $phase = 'trailing end';
4718 !!!next-token;
4719 redo B;
4720 }
4721 } else {
4722 !!!parse-error (type => 'after body:/'.$token->{tag_name});
4723 }
4724 } else {
4725 !!!parse-error (type => 'after body:#'.$token->{type});
4726 }
4727
4728 $self->{insertion_mode} = 'in body';
4729 ## reprocess
4730 redo B;
4731 } elsif ($self->{insertion_mode} eq 'in frameset') {
4732 if ($token->{type} eq 'character') {
4733 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4734 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4735
4736 unless (length $token->{data}) {
4737 !!!next-token;
4738 redo B;
4739 }
4740 }
4741
4742 #
4743 } elsif ($token->{type} eq 'comment') {
4744 my $comment = $self->{document}->create_comment ($token->{data});
4745 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4746 !!!next-token;
4747 redo B;
4748 } elsif ($token->{type} eq 'start tag') {
4749 if ($token->{tag_name} eq 'frameset') {
4750 !!!insert-element ($token->{tag_name}, $token->{attributes});
4751 !!!next-token;
4752 redo B;
4753 } elsif ($token->{tag_name} eq 'frame') {
4754 !!!insert-element ($token->{tag_name}, $token->{attributes});
4755 pop @{$self->{open_elements}};
4756 !!!next-token;
4757 redo B;
4758 } elsif ($token->{tag_name} eq 'noframes') {
4759 $in_body->($insert_to_current);
4760 redo B;
4761 } else {
4762 #
4763 }
4764 } elsif ($token->{type} eq 'end tag') {
4765 if ($token->{tag_name} eq 'frameset') {
4766 if ($self->{open_elements}->[-1]->[1] eq 'html' and
4767 @{$self->{open_elements}} == 1) {
4768 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4769 ## Ignore the token
4770 !!!next-token;
4771 } else {
4772 pop @{$self->{open_elements}};
4773 !!!next-token;
4774 }
4775
4776 ## if not inner_html and
4777 if ($self->{open_elements}->[-1]->[1] ne 'frameset') {
4778 $self->{insertion_mode} = 'after frameset';
4779 }
4780 redo B;
4781 } else {
4782 #
4783 }
4784 } else {
4785 #
4786 }
4787
4788 if (defined $token->{tag_name}) {
4789 !!!parse-error (type => 'in frameset:'.$token->{tag_name});
4790 } else {
4791 !!!parse-error (type => 'in frameset:#'.$token->{type});
4792 }
4793 ## Ignore the token
4794 !!!next-token;
4795 redo B;
4796 } elsif ($self->{insertion_mode} eq 'after frameset') {
4797 if ($token->{type} eq 'character') {
4798 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4799 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4800
4801 unless (length $token->{data}) {
4802 !!!next-token;
4803 redo B;
4804 }
4805 }
4806
4807 #
4808 } elsif ($token->{type} eq 'comment') {
4809 my $comment = $self->{document}->create_comment ($token->{data});
4810 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4811 !!!next-token;
4812 redo B;
4813 } elsif ($token->{type} eq 'start tag') {
4814 if ($token->{tag_name} eq 'noframes') {
4815 $in_body->($insert_to_current);
4816 redo B;
4817 } else {
4818 #
4819 }
4820 } elsif ($token->{type} eq 'end tag') {
4821 if ($token->{tag_name} eq 'html') {
4822 $phase = 'trailing end';
4823 !!!next-token;
4824 redo B;
4825 } else {
4826 #
4827 }
4828 } else {
4829 #
4830 }
4831
4832 if (defined $token->{tag_name}) {
4833 !!!parse-error (type => 'after frameset:'.$token->{tag_name});
4834 } else {
4835 !!!parse-error (type => 'after frameset:#'.$token->{type});
4836 }
4837 ## Ignore the token
4838 !!!next-token;
4839 redo B;
4840
4841 ## ISSUE: An issue in spec there
4842 } else {
4843 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4844 }
4845 }
4846 } elsif ($phase eq 'trailing end') {
4847 ## states in the main stage is preserved yet # MUST
4848
4849 if ($token->{type} eq 'DOCTYPE') {
4850 !!!parse-error (type => 'after html:#DOCTYPE');
4851 ## Ignore the token
4852 !!!next-token;
4853 redo B;
4854 } elsif ($token->{type} eq 'comment') {
4855 my $comment = $self->{document}->create_comment ($token->{data});
4856 $self->{document}->append_child ($comment);
4857 !!!next-token;
4858 redo B;
4859 } elsif ($token->{type} eq 'character') {
4860 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4861 my $data = $1;
4862 ## As if in the main phase.
4863 ## NOTE: The insertion mode in the main phase
4864 ## just before the phase has been changed to the trailing
4865 ## end phase is either "after body" or "after frameset".
4866 $reconstruct_active_formatting_elements->($insert_to_current)
4867 if $phase eq 'main';
4868
4869 $self->{open_elements}->[-1]->[0]->manakai_append_text ($data);
4870
4871 unless (length $token->{data}) {
4872 !!!next-token;
4873 redo B;
4874 }
4875 }
4876
4877 !!!parse-error (type => 'after html:#character');
4878 $phase = 'main';
4879 ## reprocess
4880 redo B;
4881 } elsif ($token->{type} eq 'start tag' or
4882 $token->{type} eq 'end tag') {
4883 !!!parse-error (type => 'after html:'.$token->{tag_name});
4884 $phase = 'main';
4885 ## reprocess
4886 redo B;
4887 } elsif ($token->{type} eq 'end-of-file') {
4888 ## Stop parsing
4889 last B;
4890 } else {
4891 die "$0: $token->{type}: Unknown token";
4892 }
4893 }
4894 } # B
4895
4896 ## Stop parsing # MUST
4897
4898 ## TODO: script stuffs
4899 } # _tree_construct_main
4900
## Replace the content of |$node| by the result of parsing a string as
## HTML (the |innerHTML| setter).
##
## Arguments, in call order:
##   class    - The parser class; |new| and |parse_string| are invoked
##              on it.
##   $node    - A Document (node type 9) or an Element (node type 1).
##   string   - The markup to parse.  It is read through a reference to
##              the caller's argument rather than copied.
##   onerror  - Optional error handler code reference.  When it is
##              omitted for an element, parse errors are reported by
##              |warn|.
##
## Dies when |$node| has any other node type.
sub set_inner_html ($$$) {
  my ($class, $node) = (shift, shift);
  my $s = \$_[0];
  my $onerror = $_[1];

  my $nt = $node->node_type;

  if ($nt == 9) { # Document
    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## Empty the document.  The child list is copied first so that
    ## removal does not disturb the iteration.
    my @old_children = @{$node->child_nodes};
    $node->remove_child ($_) for @old_children;

    ## Step 3, 4, 5 # MUST
    ## Parse the string straight into the (now empty) document.
    return $class->parse_string ($$s => $node, $onerror);
  }

  die "$0: |set_inner_html| is not defined for node of type $nt"
      unless $nt == 1;

  ## From here on |$node| is an element.
  ## TODO: If non-html element

  ## NOTE: Most of this code is copied from |parse_string|

  ## Step 1 # MUST
  ## A fresh document receives the parsed fragment; a fresh parser
  ## instance builds it.
  my $this_doc = $node->owner_document;
  my $doc = $this_doc->implementation->create_document;
  ## TODO: Mark as HTML document
  my $p = $class->new;
  $p->{document} = $doc;

  ## Step 9 # MUST
  ## Input stream over |$$s|: |$pos| is the read offset, |$line| and
  ## |$column| track the position handed to the error handler.
  my $pos = 0;
  my $line = 1;
  my $column = 0;
  $p->{set_next_input_character} = sub {
    my $self = shift;

    ## Slide the window of previously read characters by one.
    pop @{$self->{prev_input_character}};
    unshift @{$self->{prev_input_character}}, $self->{next_input_character};

    ## -1 stands for the end of the input.
    if ($pos >= length $$s) {
      $self->{next_input_character} = -1;
      return;
    }

    $self->{next_input_character} = ord substr $$s, $pos++, 1;
    $column++;

    my $c = $self->{next_input_character};
    if ($c == 0x000A or $c == 0x000D) { # LF or CR
      if ($c == 0x000D) {
        ## CR LF and a lone CR are both delivered as a single LF.
        $pos++ if substr ($$s, $pos, 1) eq "\x0A";
        $self->{next_input_character} = 0x000A; # LF # MUST
      }
      $line++;
      $column = 0;
    } elsif ($c > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($c == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $p->{prev_input_character} = [-1, -1, -1];
  $p->{next_input_character} = -1;

  ## Errors go to the caller's handler if there is one, or to |warn|
  ## otherwise; either way the current line and column are appended.
  my $ponerror = $onerror || sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  $p->{parse_error} = sub {
    $ponerror->(@_, line => $line, column => $column);
  };

  $p->_initialize_tokenizer;
  $p->_initialize_tree_constructor;

  ## Step 2
  ## The tokenizer's initial content model depends on the context
  ## element; anything not listed is parsed as PCDATA.
  my %content_model_of = (
    (map { $_ => 'RCDATA' } qw(title textarea)),
    (map { $_ => 'CDATA' } qw(
      style script xmp iframe noembed noframes noscript
    )),
    plaintext => 'PLAINTEXT',
  );
  my $node_ln = $node->local_name;
  $p->{content_model_flag} = $content_model_of{$node_ln} || 'PCDATA';
  ## ISSUE: What is "the name of the element"? local name?

  $p->{inner_html_node} = [$node, $node_ln];

  ## Step 4
  my $root = $doc->create_element_ns
      ('http://www.w3.org/1999/xhtml', [undef, 'html']);

  ## Step 5 # MUST
  $doc->append_child ($root);

  ## Step 6 # MUST
  push @{$p->{open_elements}}, [$root, 'html'];

  undef $p->{head_element};

  ## Step 7 # MUST
  $p->_reset_insertion_mode;

  ## Step 8 # MUST
  ## The form element pointer is the nearest XHTML |form| among the
  ## context element and its ancestors, if there is one.
  ANCESTOR: for (my $anc = $node; defined $anc; $anc = $anc->parent_node) {
    next ANCESTOR unless $anc->node_type == 1;
    my $nsuri = $anc->namespace_uri;
    next ANCESTOR
        unless (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml');
    next ANCESTOR unless $anc->local_name eq 'form'; ## TODO: case?
    $p->{form_element} = $anc;
    last ANCESTOR;
  } # ANCESTOR

  ## Step 3 # MUST
  ## Step 10 # MUST
  ## The token macro works on |$self|, so alias the parser under that
  ## name just for the first token.
  {
    my $self = $p;
    !!!next-token;
  }
  $p->_tree_construction_main;

  ## Step 11 # MUST
  ## Drop the element's current children ...
  my @old_children = @{$node->child_nodes};
  $node->remove_child ($_) for @old_children;
  ## ISSUE: mutation events? read-only?

  ## Step 12 # MUST
  ## ... and move the parsed nodes over from the temporary root.
  my @new_children = @{$root->child_nodes};
  for my $new_child (@new_children) {
    $this_doc->adopt_node ($new_child);
    $node->append_child ($new_child);
  }
  ## ISSUE: mutation events?

  return $p->_terminate_tree_constructor;
} # set_inner_html
5054
5055 } # tree construction stage
5056
## Serialize the children of |$node| as an HTML fragment (the
## |innerHTML| getter).
##
## Arguments, in call order:
##   invocant   - Ignored.
##   $node      - The node whose children are serialized.
##   $on_error  - Optional code reference.  It is called with each
##                node whose type cannot be serialized.
##
## Returns a reference to the resulting string.
sub get_inner_html ($$$) {
  my (undef, $node, $on_error) = @_;

  ## Elements whose text content is written out without escaping.
  my %cdata_element = map { $_ => 1 } qw(
    style script xmp iframe noembed noframes noscript
  );

  ## Elements that are serialized as a start tag only: no children and
  ## no end tag.
  my %void_element = map { $_ => 1 } qw(
    area base basefont bgsound br col embed frame hr
    img input link meta param spacer wbr
  );

  ## Escaping shared by attribute values and ordinary text.  |&| has
  ## to be replaced first, or the other replacements would be escaped
  ## a second time.
  my $escape = sub {
    my $value = shift;
    $value =~ s/&/&amp;/g;
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;
    $value =~ s/"/&quot;/g;
    return $value;
  };

  ## Step 1
  my $s = '';

  ## Text is emitted verbatim from the start if |$node| itself or one
  ## of its ancestors is an HTML CDATA element.
  my $in_cdata;
  for (my $anc = $node; defined $anc; $anc = $anc->parent_node) {
    next unless $anc->node_type == 1;
    ## The namespace URI might be undefined; test that before
    ## comparing it as a string.
    my $nsuri = $anc->namespace_uri;
    next unless (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml');
    $in_cdata = 1 if $cdata_element{$anc->local_name}; ## TODO: case thingy
  }

  ## Step 2
  ## |@node| is the work list, kept in document order.  Apart from node
  ## objects it holds plain strings: a pending end tag, or the marker
  ## |cdata-out| that ends the verbatim mode entered for a CDATA
  ## element.
  my @node = @{$node->child_nodes};
  C: while (@node) {
    my $child = shift @node;
    unless (ref $child) {
      if ($child eq 'cdata-out') {
        $in_cdata = 0;
      } else {
        $s .= $child; # end tag
      }
      next C;
    }

    my $nt = $child->node_type;
    if ($nt == 1) { # Element
      my $tag_name = lc $child->tag_name; ## ISSUE: Definition of "lowercase"
      $s .= '<' . $tag_name;

      ## ISSUE: Non-html elements

      my @attrs = @{$child->attributes}; # sort order MUST be stable
      for my $attr (@attrs) { # order is implementation dependent
        my $attr_name = lc $attr->name; ## ISSUE: Definition of "lowercase"
        $s .= ' ' . $attr_name . '="' . $escape->($attr->value) . '"';
      }
      $s .= '>';

      next C if $void_element{$tag_name};

      ## Only the outermost CDATA element schedules the marker, so that
      ## a nested one cannot end the verbatim mode early.
      if (not $in_cdata and $cdata_element{$tag_name}) {
        unshift @node, 'cdata-out';
        $in_cdata = 1;
      }

      ## Children first, then the end tag, ahead of everything that was
      ## already waiting.
      unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';
    } elsif ($nt == 3 or $nt == 4) { # Text or CDATA section
      $s .= $in_cdata ? $child->data : $escape->($child->data);
    } elsif ($nt == 8) { # Comment
      $s .= '<!--' . $child->data . '-->';
    } elsif ($nt == 10) { # Document type
      $s .= '<!DOCTYPE ' . $child->name . '>';
    } elsif ($nt == 5) { # entrefs
      ## The content of an entity reference takes the place of the
      ## reference itself, so it goes to the front of the work list.
      ## Appending it at the end would emit it after the following
      ## siblings and after pending end tags.
      unshift @node, @{$child->child_nodes};
    } else {
      $on_error->($child) if defined $on_error;
    }
    ## ISSUE: This code does not support PIs.
  } # C

  ## Step 3
  return \$s;
} # get_inner_html
5153
5154 1;
5155 # $Date: 2007/06/23 06:38:12 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24