/[suikacvs]/markup/html/whatpm/What/HTML.pm.src
Suika

Contents of /markup/html/whatpm/What/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (show annotations) (download) (as text)
Tue May 1 10:37:35 2007 UTC (17 years, 7 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.8: +2 -2 lines
File MIME type: application/x-wais-source
FILE REMOVED
Renamed

1 package What::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.8 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 ## This is an early version of an HTML parser.
6
## Start tags that may legitimately be written with a trailing "/"
## (e.g. <br/>) without the tokenizer reporting a parse error.
## Keys are lowercase tag names; every value is 1.
my $permitted_slash_tag_name = +{
  map { $_ => 1 } qw(
    base link meta hr br img embed param area col input
  )
};
20
## Named character reference table: entity name (case-sensitive, without
## the leading "&" and trailing ";") => the single character it denotes.
## The table is written as code points and converted to characters once,
## when the module is loaded.
my $entity_char = do {
  my %code_point_of = (
    AElig => 0x00C6,
    Aacute => 0x00C1,
    Acirc => 0x00C2,
    Agrave => 0x00C0,
    Alpha => 0x0391,
    Aring => 0x00C5,
    Atilde => 0x00C3,
    Auml => 0x00C4,
    Beta => 0x0392,
    Ccedil => 0x00C7,
    Chi => 0x03A7,
    Dagger => 0x2021,
    Delta => 0x0394,
    ETH => 0x00D0,
    Eacute => 0x00C9,
    Ecirc => 0x00CA,
    Egrave => 0x00C8,
    Epsilon => 0x0395,
    Eta => 0x0397,
    Euml => 0x00CB,
    Gamma => 0x0393,
    Iacute => 0x00CD,
    Icirc => 0x00CE,
    Igrave => 0x00CC,
    Iota => 0x0399,
    Iuml => 0x00CF,
    Kappa => 0x039A,
    Lambda => 0x039B,
    Mu => 0x039C,
    Ntilde => 0x00D1,
    Nu => 0x039D,
    OElig => 0x0152,
    Oacute => 0x00D3,
    Ocirc => 0x00D4,
    Ograve => 0x00D2,
    Omega => 0x03A9,
    Omicron => 0x039F,
    Oslash => 0x00D8,
    Otilde => 0x00D5,
    Ouml => 0x00D6,
    Phi => 0x03A6,
    Pi => 0x03A0,
    Prime => 0x2033,
    Psi => 0x03A8,
    Rho => 0x03A1,
    Scaron => 0x0160,
    Sigma => 0x03A3,
    THORN => 0x00DE,
    Tau => 0x03A4,
    Theta => 0x0398,
    Uacute => 0x00DA,
    Ucirc => 0x00DB,
    Ugrave => 0x00D9,
    Upsilon => 0x03A5,
    Uuml => 0x00DC,
    Xi => 0x039E,
    Yacute => 0x00DD,
    Yuml => 0x0178,
    Zeta => 0x0396,
    aacute => 0x00E1,
    acirc => 0x00E2,
    acute => 0x00B4,
    aelig => 0x00E6,
    agrave => 0x00E0,
    alefsym => 0x2135,
    alpha => 0x03B1,
    amp => 0x0026,
    AMP => 0x0026,
    and => 0x2227,
    ang => 0x2220,
    apos => 0x0027,
    aring => 0x00E5,
    asymp => 0x2248,
    atilde => 0x00E3,
    auml => 0x00E4,
    bdquo => 0x201E,
    beta => 0x03B2,
    brvbar => 0x00A6,
    bull => 0x2022,
    cap => 0x2229,
    ccedil => 0x00E7,
    cedil => 0x00B8,
    cent => 0x00A2,
    chi => 0x03C7,
    circ => 0x02C6,
    clubs => 0x2663,
    cong => 0x2245,
    copy => 0x00A9,
    COPY => 0x00A9,
    crarr => 0x21B5,
    cup => 0x222A,
    curren => 0x00A4,
    dArr => 0x21D3,
    dagger => 0x2020,
    darr => 0x2193,
    deg => 0x00B0,
    delta => 0x03B4,
    diams => 0x2666,
    divide => 0x00F7,
    eacute => 0x00E9,
    ecirc => 0x00EA,
    egrave => 0x00E8,
    empty => 0x2205,
    emsp => 0x2003,
    ensp => 0x2002,
    epsilon => 0x03B5,
    equiv => 0x2261,
    eta => 0x03B7,
    eth => 0x00F0,
    euml => 0x00EB,
    euro => 0x20AC,
    exist => 0x2203,
    fnof => 0x0192,
    forall => 0x2200,
    frac12 => 0x00BD,
    frac14 => 0x00BC,
    frac34 => 0x00BE,
    frasl => 0x2044,
    gamma => 0x03B3,
    ge => 0x2265,
    gt => 0x003E,
    GT => 0x003E,
    hArr => 0x21D4,
    harr => 0x2194,
    hearts => 0x2665,
    hellip => 0x2026,
    iacute => 0x00ED,
    icirc => 0x00EE,
    iexcl => 0x00A1,
    igrave => 0x00EC,
    image => 0x2111,
    infin => 0x221E,
    int => 0x222B,
    iota => 0x03B9,
    iquest => 0x00BF,
    isin => 0x2208,
    iuml => 0x00EF,
    kappa => 0x03BA,
    lArr => 0x21D0,
    lambda => 0x03BB,
    lang => 0x2329,
    laquo => 0x00AB,
    larr => 0x2190,
    lceil => 0x2308,
    ldquo => 0x201C,
    le => 0x2264,
    lfloor => 0x230A,
    lowast => 0x2217,
    loz => 0x25CA,
    lrm => 0x200E,
    lsaquo => 0x2039,
    lsquo => 0x2018,
    lt => 0x003C,
    LT => 0x003C,
    macr => 0x00AF,
    mdash => 0x2014,
    micro => 0x00B5,
    middot => 0x00B7,
    minus => 0x2212,
    mu => 0x03BC,
    nabla => 0x2207,
    nbsp => 0x00A0,
    ndash => 0x2013,
    ne => 0x2260,
    ni => 0x220B,
    not => 0x00AC,
    notin => 0x2209,
    nsub => 0x2284,
    ntilde => 0x00F1,
    nu => 0x03BD,
    oacute => 0x00F3,
    ocirc => 0x00F4,
    oelig => 0x0153,
    ograve => 0x00F2,
    oline => 0x203E,
    omega => 0x03C9,
    omicron => 0x03BF,
    oplus => 0x2295,
    or => 0x2228,
    ordf => 0x00AA,
    ordm => 0x00BA,
    oslash => 0x00F8,
    otilde => 0x00F5,
    otimes => 0x2297,
    ouml => 0x00F6,
    para => 0x00B6,
    part => 0x2202,
    permil => 0x2030,
    perp => 0x22A5,
    phi => 0x03C6,
    pi => 0x03C0,
    piv => 0x03D6,
    plusmn => 0x00B1,
    pound => 0x00A3,
    prime => 0x2032,
    prod => 0x220F,
    prop => 0x221D,
    psi => 0x03C8,
    quot => 0x0022,
    QUOT => 0x0022,
    rArr => 0x21D2,
    radic => 0x221A,
    rang => 0x232A,
    raquo => 0x00BB,
    rarr => 0x2192,
    rceil => 0x2309,
    rdquo => 0x201D,
    real => 0x211C,
    reg => 0x00AE,
    REG => 0x00AE,
    rfloor => 0x230B,
    rho => 0x03C1,
    rlm => 0x200F,
    rsaquo => 0x203A,
    rsquo => 0x2019,
    sbquo => 0x201A,
    scaron => 0x0161,
    sdot => 0x22C5,
    sect => 0x00A7,
    shy => 0x00AD,
    sigma => 0x03C3,
    sigmaf => 0x03C2,
    sim => 0x223C,
    spades => 0x2660,
    sub => 0x2282,
    sube => 0x2286,
    sum => 0x2211,
    sup => 0x2283,
    sup1 => 0x00B9,
    sup2 => 0x00B2,
    sup3 => 0x00B3,
    supe => 0x2287,
    szlig => 0x00DF,
    tau => 0x03C4,
    there4 => 0x2234,
    theta => 0x03B8,
    thetasym => 0x03D1,
    thinsp => 0x2009,
    thorn => 0x00FE,
    tilde => 0x02DC,
    times => 0x00D7,
    trade => 0x2122,
    uArr => 0x21D1,
    uacute => 0x00FA,
    uarr => 0x2191,
    ucirc => 0x00FB,
    ugrave => 0x00F9,
    uml => 0x00A8,
    upsih => 0x03D2,
    upsilon => 0x03C5,
    uuml => 0x00FC,
    weierp => 0x2118,
    xi => 0x03BE,
    yacute => 0x00FD,
    yen => 0x00A5,
    yuml => 0x00FF,
    zeta => 0x03B6,
    zwj => 0x200D,
    zwnj => 0x200C,
  );
  +{ map { $_ => chr $code_point_of{$_} } keys %code_point_of };
};
282
## Element names belonging to the "special" category.
## Keys are lowercase element names; every value is 1.
my $special_category = +{
  map { $_ => 1 } qw(
    address area base basefont bgsound blockquote body br center col
    colgroup dd dir div dl dt embed fieldset form frame frameset
    h1 h2 h3 h4 h5 h6 head hr iframe image img input isindex li link
    listing menu meta noembed noframes noscript ol optgroup option p
    param plaintext pre script select spacer style tbody textarea
    tfoot thead title tr ul wbr
  )
};
## Element names belonging to the "scoping" category.
## Keys are lowercase element names; every value is 1.
my $scoping_category = +{
  map { $_ => 1 } qw(button caption html marquee object table td th)
};
## Element names belonging to the "formatting" category.
## Keys are lowercase element names; every value is 1.
## Note: the entry is |strike| (the element name); it was previously
## misspelled |strile|, so <strike> was never recognised as formatting.
my $formatting_category = +{
  map { $_ => 1 } qw(
    a b big em font i nobr s small strike strong tt u
  )
};
303 # $phrasing_category: all other elements
304
## Parses a string as HTML and builds the result into a document object.
##
## Called as a method; a fresh parser object is always created through
## |new| on the invocant, so no state is shared between calls.
##
## Arguments (after the invocant):
##   $_[0]  the string to parse (accessed through a reference, not copied)
##   $_[1]  the document object; stored in $self->{document} and returned
##   $_[2]  optional parse-error callback; when false, a default that
##          |warn|s with the current character offset is installed
##
## Returns the document object given as $_[1].
sub parse_string ($$$;$) {
  my $self = shift->new;
  my $s = \$_[0];
  $self->{document} = $_[1];

  ## Offset of the next unread character of $$s.
  my $i = 0;

  ## Input-stream callback: stores the next input character (as a code
  ## point, or -1 at end of input) in $self->{next_input_character}.
  $self->{set_next_input_character} = sub {
    my $self = shift;
    if ($i >= length ($$s)) {
      $self->{next_input_character} = -1; # end of input
      return;
    }
    $self->{next_input_character} = ord substr $$s, $i++, 1;

    if ($self->{next_input_character} == 0x000D) { # CR
      ## CR LF and a lone CR are both reported as a single LF.  The
      ## following character is only peeked at: it is skipped when it
      ## is the LF of a CR LF pair, and otherwise left in the input so
      ## that it goes through the normalization below on the next call
      ## (e.g. a second CR, or a NULL, directly after a CR).
      $i++ if $i < length ($$s) and ord (substr $$s, $i, 1) == 0x000A;
      $self->{next_input_character} = 0x000A; # LF # MUST
    } elsif ($self->{next_input_character} > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($self->{next_input_character} == 0x0000) { # NULL
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };

  $self->{parse_error} = $_[2] || sub {
    warn "Parse error at character $i\n"; ## TODO: Report (line, column) pair
  };

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  return $self->{document};
} # parse_string
347
## Creates a new parser object blessed into the invocant class.
##
## Two default callbacks are installed:
##   set_next_input_character  reports end of input (-1) immediately
##   parse_error               ignores the error
## |parse_string| replaces both of them.
##
## The input callback receives the parser as its first argument, as the
## callback installed by |parse_string| does.  It deliberately does not
## close over $self: doing so would create a reference cycle
## (object -> closure -> object) and the parser would never be freed.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;
  $self->{set_next_input_character} = sub {
    my $parser = shift;
    $parser->{next_input_character} = -1; # end of input
  };
  $self->{parse_error} = sub {
    #
  };
  return $self;
} # new
359
360 ## Implementations MUST act as if they used the state machine described in the spec.
361
## Resets the tokenizer part of the parser object to its initial
## condition and loads the first input character.
##
## Fields set here: |state| ('data'), |content_model_flag| ('PCDATA'),
## |current_token|, |current_attribute|, |last_emitted_start_tag_name|,
## |last_attribute_value_state| (all undef), |char| and |token| (empty
## arrays), and |next_input_character| (through the macro).
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## Initial state and content model flag. # MUST
  @$self{qw(state content_model_flag)} = ('data', 'PCDATA');

  ## Nothing is being built yet.  |current_token| is a start tag, end
  ## tag, comment, or DOCTYPE token once the tokenizer is running.
  $self->{$_} = undef for qw(
    current_token current_attribute
    last_emitted_start_tag_name last_attribute_value_state
  );

  ## |char| is emptied before the first character is requested, and
  ## |token| is reset afterwards, as in the original ordering.
  $self->{char} = [];
  # $self->{next_input_character}
  !!!next-input-character;
  $self->{token} = [];
} # _initialize_tokenizer
375
376 ## A token has:
377 ## ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
378 ## 'character', or 'end-of-file'
379 ## ->{name} (DOCTYPE); ->{tag_name} (start tag, end tag)
380 ## ISSUE: the spec need s/tagname/tag name/
381 ## ->{error} == 1 or 0 (DOCTYPE)
382 ## ->{attributes} isa HASH (start tag, end tag)
383 ## ->{data} (comment, character)
384
385 ## Macros
386 ## Macros MUST be preceded by three EXCLAMATION MARKs.
387 ## emit ($token)
388 ## Emits the specified token.
389
390 ## Emitted token MUST immediately be handled by the tree construction state.
391
392 ## Before each step, UA MAY check to see if either one of the scripts in
393 ## "list of scripts that will execute as soon as possible" or the first
394 ## script in the "list of scripts that will execute asynchronously",
395 ## has completed loading. If one has, then it MUST be executed
396 ## and removed from the list.
397
398 sub _get_next_token ($) {
399 my $self = shift;
400 if (@{$self->{token}}) {
401 return shift @{$self->{token}};
402 }
403
404 A: {
405 if ($self->{state} eq 'data') {
406 if ($self->{next_input_character} == 0x0026) { # &
407 if ($self->{content_model_flag} eq 'PCDATA' or
408 $self->{content_model_flag} eq 'RCDATA') {
409 $self->{state} = 'entity data';
410 !!!next-input-character;
411 redo A;
412 } else {
413 #
414 }
415 } elsif ($self->{next_input_character} == 0x003C) { # <
416 if ($self->{content_model_flag} ne 'PLAINTEXT') {
417 $self->{state} = 'tag open';
418 !!!next-input-character;
419 redo A;
420 } else {
421 #
422 }
423 } elsif ($self->{next_input_character} == -1) {
424 !!!emit ({type => 'end-of-file'});
425 last A; ## TODO: ok?
426 }
427 # Anything else
428 my $token = {type => 'character',
429 data => chr $self->{next_input_character}};
430 ## Stay in the data state
431 !!!next-input-character;
432
433 !!!emit ($token);
434
435 redo A;
436 } elsif ($self->{state} eq 'entity data') {
437 ## (cannot happen in CDATA state)
438
439 my $token = $self->_tokenize_attempt_to_consume_an_entity;
440
441 $self->{state} = 'data';
442 # next-input-character is already done
443
444 unless (defined $token) {
445 !!!emit ({type => 'character', data => '&'});
446 } else {
447 !!!emit ($token);
448 }
449
450 redo A;
451 } elsif ($self->{state} eq 'tag open') {
452 if ($self->{content_model_flag} eq 'RCDATA' or
453 $self->{content_model_flag} eq 'CDATA') {
454 if ($self->{next_input_character} == 0x002F) { # /
455 !!!next-input-character;
456 $self->{state} = 'close tag open';
457 redo A;
458 } else {
459 ## reconsume
460 $self->{state} = 'data';
461
462 !!!emit ({type => 'character', data => '<'});
463
464 redo A;
465 }
466 } elsif ($self->{content_model_flag} eq 'PCDATA') {
467 if ($self->{next_input_character} == 0x0021) { # !
468 $self->{state} = 'markup declaration open';
469 !!!next-input-character;
470 redo A;
471 } elsif ($self->{next_input_character} == 0x002F) { # /
472 $self->{state} = 'close tag open';
473 !!!next-input-character;
474 redo A;
475 } elsif (0x0041 <= $self->{next_input_character} and
476 $self->{next_input_character} <= 0x005A) { # A..Z
477 $self->{current_token}
478 = {type => 'start tag',
479 tag_name => chr ($self->{next_input_character} + 0x0020)};
480 $self->{state} = 'tag name';
481 !!!next-input-character;
482 redo A;
483 } elsif (0x0061 <= $self->{next_input_character} and
484 $self->{next_input_character} <= 0x007A) { # a..z
485 $self->{current_token} = {type => 'start tag',
486 tag_name => chr ($self->{next_input_character})};
487 $self->{state} = 'tag name';
488 !!!next-input-character;
489 redo A;
490 } elsif ($self->{next_input_character} == 0x003E) { # >
491 !!!parse-error;
492 $self->{state} = 'data';
493 !!!next-input-character;
494
495 !!!emit ({type => 'character', data => '<>'});
496
497 redo A;
498 } elsif ($self->{next_input_character} == 0x003F) { # ?
499 !!!parse-error;
500 $self->{state} = 'bogus comment';
501 ## $self->{next_input_character} is intentionally left as is
502 redo A;
503 } else {
504 !!!parse-error;
505 $self->{state} = 'data';
506 ## reconsume
507
508 !!!emit ({type => 'character', data => '<'});
509
510 redo A;
511 }
512 } else {
513 die "$0: $self->{content_model_flag}: Unknown content model flag";
514 }
515 } elsif ($self->{state} eq 'close tag open') {
516 if ($self->{content_model_flag} eq 'RCDATA' or
517 $self->{content_model_flag} eq 'CDATA') {
518 my @next_char;
519 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
520 push @next_char, $self->{next_input_character};
521 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
522 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
523 if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
524 !!!next-input-character;
525 next TAGNAME;
526 } else {
527 !!!parse-error;
528 $self->{next_input_character} = shift @next_char; # reconsume
529 !!!back-next-input-character (@next_char);
530 $self->{state} = 'data';
531
532 !!!emit ({type => 'character', data => '</'});
533
534 redo A;
535 }
536 }
537 push @next_char, $self->{next_input_character};
538
539 unless ($self->{next_input_character} == 0x0009 or # HT
540 $self->{next_input_character} == 0x000A or # LF
541 $self->{next_input_character} == 0x000B or # VT
542 $self->{next_input_character} == 0x000C or # FF
543 $self->{next_input_character} == 0x0020 or # SP
544 $self->{next_input_character} == 0x003E or # >
545 $self->{next_input_character} == 0x002F or # /
546 $self->{next_input_character} == 0x003C or # <
547 $self->{next_input_character} == -1) {
548 !!!parse-error;
549 $self->{next_input_character} = shift @next_char; # reconsume
550 !!!back-next-input-character (@next_char);
551 $self->{state} = 'data';
552
553 !!!emit ({type => 'character', data => '</'});
554
555 redo A;
556 } else {
557 $self->{next_input_character} = shift @next_char;
558 !!!back-next-input-character (@next_char);
559 # and consume...
560 }
561 }
562
563 if (0x0041 <= $self->{next_input_character} and
564 $self->{next_input_character} <= 0x005A) { # A..Z
565 $self->{current_token} = {type => 'end tag',
566 tag_name => chr ($self->{next_input_character} + 0x0020)};
567 $self->{state} = 'tag name';
568 !!!next-input-character;
569 redo A;
570 } elsif (0x0061 <= $self->{next_input_character} and
571 $self->{next_input_character} <= 0x007A) { # a..z
572 $self->{current_token} = {type => 'end tag',
573 tag_name => chr ($self->{next_input_character})};
574 $self->{state} = 'tag name';
575 !!!next-input-character;
576 redo A;
577 } elsif ($self->{next_input_character} == 0x003E) { # >
578 !!!parse-error;
579 $self->{state} = 'data';
580 !!!next-input-character;
581 redo A;
582 } elsif ($self->{next_input_character} == -1) {
583 !!!parse-error;
584 $self->{state} = 'data';
585 # reconsume
586
587 !!!emit ({type => 'character', data => '</'});
588
589 redo A;
590 } else {
591 !!!parse-error;
592 $self->{state} = 'bogus comment';
593 ## $self->{next_input_character} is intentionally left as is
594 redo A;
595 }
596 } elsif ($self->{state} eq 'tag name') {
597 if ($self->{next_input_character} == 0x0009 or # HT
598 $self->{next_input_character} == 0x000A or # LF
599 $self->{next_input_character} == 0x000B or # VT
600 $self->{next_input_character} == 0x000C or # FF
601 $self->{next_input_character} == 0x0020) { # SP
602 $self->{state} = 'before attribute name';
603 !!!next-input-character;
604 redo A;
605 } elsif ($self->{next_input_character} == 0x003E) { # >
606 if ($self->{current_token}->{type} eq 'start tag') {
607 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
608 } elsif ($self->{current_token}->{type} eq 'end tag') {
609 $self->{content_model_flag} = 'PCDATA'; # MUST
610 if ($self->{current_token}->{attributes}) {
611 !!!parse-error;
612 }
613 } else {
614 die "$0: $self->{current_token}->{type}: Unknown token type";
615 }
616 $self->{state} = 'data';
617 !!!next-input-character;
618
619 !!!emit ($self->{current_token}); # start tag or end tag
620 undef $self->{current_token};
621
622 redo A;
623 } elsif (0x0041 <= $self->{next_input_character} and
624 $self->{next_input_character} <= 0x005A) { # A..Z
625 $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
626 # start tag or end tag
627 ## Stay in this state
628 !!!next-input-character;
629 redo A;
630 } elsif ($self->{next_input_character} == 0x003C or # <
631 $self->{next_input_character} == -1) {
632 !!!parse-error;
633 if ($self->{current_token}->{type} eq 'start tag') {
634 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
635 } elsif ($self->{current_token}->{type} eq 'end tag') {
636 $self->{content_model_flag} = 'PCDATA'; # MUST
637 if ($self->{current_token}->{attributes}) {
638 !!!parse-error;
639 }
640 } else {
641 die "$0: $self->{current_token}->{type}: Unknown token type";
642 }
643 $self->{state} = 'data';
644 # reconsume
645
646 !!!emit ($self->{current_token}); # start tag or end tag
647 undef $self->{current_token};
648
649 redo A;
650 } elsif ($self->{next_input_character} == 0x002F) { # /
651 !!!next-input-character;
652 if ($self->{next_input_character} == 0x003E and # >
653 $self->{current_token}->{type} eq 'start tag' and
654 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
655 # permitted slash
656 #
657 } else {
658 !!!parse-error;
659 }
660 $self->{state} = 'before attribute name';
661 # next-input-character is already done
662 redo A;
663 } else {
664 $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
665 # start tag or end tag
666 ## Stay in the state
667 !!!next-input-character;
668 redo A;
669 }
670 } elsif ($self->{state} eq 'before attribute name') {
671 if ($self->{next_input_character} == 0x0009 or # HT
672 $self->{next_input_character} == 0x000A or # LF
673 $self->{next_input_character} == 0x000B or # VT
674 $self->{next_input_character} == 0x000C or # FF
675 $self->{next_input_character} == 0x0020) { # SP
676 ## Stay in the state
677 !!!next-input-character;
678 redo A;
679 } elsif ($self->{next_input_character} == 0x003E) { # >
680 if ($self->{current_token}->{type} eq 'start tag') {
681 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
682 } elsif ($self->{current_token}->{type} eq 'end tag') {
683 $self->{content_model_flag} = 'PCDATA'; # MUST
684 if ($self->{current_token}->{attributes}) {
685 !!!parse-error;
686 }
687 } else {
688 die "$0: $self->{current_token}->{type}: Unknown token type";
689 }
690 $self->{state} = 'data';
691 !!!next-input-character;
692
693 !!!emit ($self->{current_token}); # start tag or end tag
694 undef $self->{current_token};
695
696 redo A;
697 } elsif (0x0041 <= $self->{next_input_character} and
698 $self->{next_input_character} <= 0x005A) { # A..Z
699 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
700 value => ''};
701 $self->{state} = 'attribute name';
702 !!!next-input-character;
703 redo A;
704 } elsif ($self->{next_input_character} == 0x002F) { # /
705 !!!next-input-character;
706 if ($self->{next_input_character} == 0x003E and # >
707 $self->{current_token}->{type} eq 'start tag' and
708 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
709 # permitted slash
710 #
711 } else {
712 !!!parse-error;
713 }
714 ## Stay in the state
715 # next-input-character is already done
716 redo A;
717 } elsif ($self->{next_input_character} == 0x003C or # <
718 $self->{next_input_character} == -1) {
719 !!!parse-error;
720 if ($self->{current_token}->{type} eq 'start tag') {
721 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
722 } elsif ($self->{current_token}->{type} eq 'end tag') {
723 $self->{content_model_flag} = 'PCDATA'; # MUST
724 if ($self->{current_token}->{attributes}) {
725 !!!parse-error;
726 }
727 } else {
728 die "$0: $self->{current_token}->{type}: Unknown token type";
729 }
730 $self->{state} = 'data';
731 # reconsume
732
733 !!!emit ($self->{current_token}); # start tag or end tag
734 undef $self->{current_token};
735
736 redo A;
737 } else {
738 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
739 value => ''};
740 $self->{state} = 'attribute name';
741 !!!next-input-character;
742 redo A;
743 }
744 } elsif ($self->{state} eq 'attribute name') {
745 my $before_leave = sub {
746 if (exists $self->{current_token}->{attributes} # start tag or end tag
747 ->{$self->{current_attribute}->{name}}) { # MUST
748 !!!parse-error;
749 ## Discard $self->{current_attribute} # MUST
750 } else {
751 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
752 = $self->{current_attribute};
753 }
754 }; # $before_leave
755
756 if ($self->{next_input_character} == 0x0009 or # HT
757 $self->{next_input_character} == 0x000A or # LF
758 $self->{next_input_character} == 0x000B or # VT
759 $self->{next_input_character} == 0x000C or # FF
760 $self->{next_input_character} == 0x0020) { # SP
761 $before_leave->();
762 $self->{state} = 'after attribute name';
763 !!!next-input-character;
764 redo A;
765 } elsif ($self->{next_input_character} == 0x003D) { # =
766 $before_leave->();
767 $self->{state} = 'before attribute value';
768 !!!next-input-character;
769 redo A;
770 } elsif ($self->{next_input_character} == 0x003E) { # >
771 $before_leave->();
772 if ($self->{current_token}->{type} eq 'start tag') {
773 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
774 } elsif ($self->{current_token}->{type} eq 'end tag') {
775 $self->{content_model_flag} = 'PCDATA'; # MUST
776 if ($self->{current_token}->{attributes}) {
777 !!!parse-error;
778 }
779 } else {
780 die "$0: $self->{current_token}->{type}: Unknown token type";
781 }
782 $self->{state} = 'data';
783 !!!next-input-character;
784
785 !!!emit ($self->{current_token}); # start tag or end tag
786 undef $self->{current_token};
787
788 redo A;
789 } elsif (0x0041 <= $self->{next_input_character} and
790 $self->{next_input_character} <= 0x005A) { # A..Z
791 $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
792 ## Stay in the state
793 !!!next-input-character;
794 redo A;
795 } elsif ($self->{next_input_character} == 0x002F) { # /
796 $before_leave->();
797 !!!next-input-character;
798 if ($self->{next_input_character} == 0x003E and # >
799 $self->{current_token}->{type} eq 'start tag' and
800 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
801 # permitted slash
802 #
803 } else {
804 !!!parse-error;
805 }
806 $self->{state} = 'before attribute name';
807 # next-input-character is already done
808 redo A;
809 } elsif ($self->{next_input_character} == 0x003C or # <
810 $self->{next_input_character} == -1) {
811 !!!parse-error;
812 $before_leave->();
813 if ($self->{current_token}->{type} eq 'start tag') {
814 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
815 } elsif ($self->{current_token}->{type} eq 'end tag') {
816 $self->{content_model_flag} = 'PCDATA'; # MUST
817 if ($self->{current_token}->{attributes}) {
818 !!!parse-error;
819 }
820 } else {
821 die "$0: $self->{current_token}->{type}: Unknown token type";
822 }
823 $self->{state} = 'data';
824 # reconsume
825
826 !!!emit ($self->{current_token}); # start tag or end tag
827 undef $self->{current_token};
828
829 redo A;
830 } else {
831 $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
832 ## Stay in the state
833 !!!next-input-character;
834 redo A;
835 }
836 } elsif ($self->{state} eq 'after attribute name') {
837 if ($self->{next_input_character} == 0x0009 or # HT
838 $self->{next_input_character} == 0x000A or # LF
839 $self->{next_input_character} == 0x000B or # VT
840 $self->{next_input_character} == 0x000C or # FF
841 $self->{next_input_character} == 0x0020) { # SP
842 ## Stay in the state
843 !!!next-input-character;
844 redo A;
845 } elsif ($self->{next_input_character} == 0x003D) { # =
846 $self->{state} = 'before attribute value';
847 !!!next-input-character;
848 redo A;
849 } elsif ($self->{next_input_character} == 0x003E) { # >
850 if ($self->{current_token}->{type} eq 'start tag') {
851 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
852 } elsif ($self->{current_token}->{type} eq 'end tag') {
853 $self->{content_model_flag} = 'PCDATA'; # MUST
854 if ($self->{current_token}->{attributes}) {
855 !!!parse-error;
856 }
857 } else {
858 die "$0: $self->{current_token}->{type}: Unknown token type";
859 }
860 $self->{state} = 'data';
861 !!!next-input-character;
862
863 !!!emit ($self->{current_token}); # start tag or end tag
864 undef $self->{current_token};
865
866 redo A;
867 } elsif (0x0041 <= $self->{next_input_character} and
868 $self->{next_input_character} <= 0x005A) { # A..Z
869 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
870 value => ''};
871 $self->{state} = 'attribute name';
872 !!!next-input-character;
873 redo A;
874 } elsif ($self->{next_input_character} == 0x002F) { # /
875 !!!next-input-character;
876 if ($self->{next_input_character} == 0x003E and # >
877 $self->{current_token}->{type} eq 'start tag' and
878 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
879 # permitted slash
880 #
881 } else {
882 !!!parse-error;
883 }
884 $self->{state} = 'before attribute name';
885 # next-input-character is already done
886 redo A;
887 } elsif ($self->{next_input_character} == 0x003C or # <
888 $self->{next_input_character} == -1) {
889 !!!parse-error;
890 if ($self->{current_token}->{type} eq 'start tag') {
891 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
892 } elsif ($self->{current_token}->{type} eq 'end tag') {
893 $self->{content_model_flag} = 'PCDATA'; # MUST
894 if ($self->{current_token}->{attributes}) {
895 !!!parse-error;
896 }
897 } else {
898 die "$0: $self->{current_token}->{type}: Unknown token type";
899 }
900 $self->{state} = 'data';
901 # reconsume
902
903 !!!emit ($self->{current_token}); # start tag or end tag
904 undef $self->{current_token};
905
906 redo A;
907 } else {
908 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
909 value => ''};
910 $self->{state} = 'attribute name';
911 !!!next-input-character;
912 redo A;
913 }
914 } elsif ($self->{state} eq 'before attribute value') {
915 if ($self->{next_input_character} == 0x0009 or # HT
916 $self->{next_input_character} == 0x000A or # LF
917 $self->{next_input_character} == 0x000B or # VT
918 $self->{next_input_character} == 0x000C or # FF
919 $self->{next_input_character} == 0x0020) { # SP
920 ## Stay in the state
921 !!!next-input-character;
922 redo A;
923 } elsif ($self->{next_input_character} == 0x0022) { # "
924 $self->{state} = 'attribute value (double-quoted)';
925 !!!next-input-character;
926 redo A;
927 } elsif ($self->{next_input_character} == 0x0026) { # &
928 $self->{state} = 'attribute value (unquoted)';
929 ## reconsume
930 redo A;
931 } elsif ($self->{next_input_character} == 0x0027) { # '
932 $self->{state} = 'attribute value (single-quoted)';
933 !!!next-input-character;
934 redo A;
935 } elsif ($self->{next_input_character} == 0x003E) { # >
936 if ($self->{current_token}->{type} eq 'start tag') {
937 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
938 } elsif ($self->{current_token}->{type} eq 'end tag') {
939 $self->{content_model_flag} = 'PCDATA'; # MUST
940 if ($self->{current_token}->{attributes}) {
941 !!!parse-error;
942 }
943 } else {
944 die "$0: $self->{current_token}->{type}: Unknown token type";
945 }
946 $self->{state} = 'data';
947 !!!next-input-character;
948
949 !!!emit ($self->{current_token}); # start tag or end tag
950 undef $self->{current_token};
951
952 redo A;
953 } elsif ($self->{next_input_character} == 0x003C or # <
954 $self->{next_input_character} == -1) {
955 !!!parse-error;
956 if ($self->{current_token}->{type} eq 'start tag') {
957 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
958 } elsif ($self->{current_token}->{type} eq 'end tag') {
959 $self->{content_model_flag} = 'PCDATA'; # MUST
960 if ($self->{current_token}->{attributes}) {
961 !!!parse-error;
962 }
963 } else {
964 die "$0: $self->{current_token}->{type}: Unknown token type";
965 }
966 $self->{state} = 'data';
967 ## reconsume
968
969 !!!emit ($self->{current_token}); # start tag or end tag
970 undef $self->{current_token};
971
972 redo A;
973 } else {
974 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
975 $self->{state} = 'attribute value (unquoted)';
976 !!!next-input-character;
977 redo A;
978 }
979 } elsif ($self->{state} eq 'attribute value (double-quoted)') {
980 if ($self->{next_input_character} == 0x0022) { # "
981 $self->{state} = 'before attribute name';
982 !!!next-input-character;
983 redo A;
984 } elsif ($self->{next_input_character} == 0x0026) { # &
985 $self->{last_attribute_value_state} = 'attribute value (double-quoted)';
986 $self->{state} = 'entity in attribute value';
987 !!!next-input-character;
988 redo A;
989 } elsif ($self->{next_input_character} == -1) {
990 !!!parse-error;
991 if ($self->{current_token}->{type} eq 'start tag') {
992 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
993 } elsif ($self->{current_token}->{type} eq 'end tag') {
994 $self->{content_model_flag} = 'PCDATA'; # MUST
995 if ($self->{current_token}->{attributes}) {
996 !!!parse-error;
997 }
998 } else {
999 die "$0: $self->{current_token}->{type}: Unknown token type";
1000 }
1001 $self->{state} = 'data';
1002 ## reconsume
1003
1004 !!!emit ($self->{current_token}); # start tag or end tag
1005 undef $self->{current_token};
1006
1007 redo A;
1008 } else {
1009 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1010 ## Stay in the state
1011 !!!next-input-character;
1012 redo A;
1013 }
1014 } elsif ($self->{state} eq 'attribute value (single-quoted)') {
1015 if ($self->{next_input_character} == 0x0027) { # '
1016 $self->{state} = 'before attribute name';
1017 !!!next-input-character;
1018 redo A;
1019 } elsif ($self->{next_input_character} == 0x0026) { # &
1020 $self->{last_attribute_value_state} = 'attribute value (single-quoted)';
1021 $self->{state} = 'entity in attribute value';
1022 !!!next-input-character;
1023 redo A;
1024 } elsif ($self->{next_input_character} == -1) {
1025 !!!parse-error;
1026 if ($self->{current_token}->{type} eq 'start tag') {
1027 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1028 } elsif ($self->{current_token}->{type} eq 'end tag') {
1029 $self->{content_model_flag} = 'PCDATA'; # MUST
1030 if ($self->{current_token}->{attributes}) {
1031 !!!parse-error;
1032 }
1033 } else {
1034 die "$0: $self->{current_token}->{type}: Unknown token type";
1035 }
1036 $self->{state} = 'data';
1037 ## reconsume
1038
1039 !!!emit ($self->{current_token}); # start tag or end tag
1040 undef $self->{current_token};
1041
1042 redo A;
1043 } else {
1044 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1045 ## Stay in the state
1046 !!!next-input-character;
1047 redo A;
1048 }
1049 } elsif ($self->{state} eq 'attribute value (unquoted)') {
1050 if ($self->{next_input_character} == 0x0009 or # HT
1051 $self->{next_input_character} == 0x000A or # LF
1052 $self->{next_input_character} == 0x000B or # VT
1053 $self->{next_input_character} == 0x000C or # FF
1054 $self->{next_input_character} == 0x0020) { # SP
1055 $self->{state} = 'before attribute name';
1056 !!!next-input-character;
1057 redo A;
1058 } elsif ($self->{next_input_character} == 0x0026) { # &
1059 $self->{last_attribute_value_state} = 'attribute value (unquoted)';
1060 $self->{state} = 'entity in attribute value';
1061 !!!next-input-character;
1062 redo A;
1063 } elsif ($self->{next_input_character} == 0x003E) { # >
1064 if ($self->{current_token}->{type} eq 'start tag') {
1065 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1066 } elsif ($self->{current_token}->{type} eq 'end tag') {
1067 $self->{content_model_flag} = 'PCDATA'; # MUST
1068 if ($self->{current_token}->{attributes}) {
1069 !!!parse-error;
1070 }
1071 } else {
1072 die "$0: $self->{current_token}->{type}: Unknown token type";
1073 }
1074 $self->{state} = 'data';
1075 !!!next-input-character;
1076
1077 !!!emit ($self->{current_token}); # start tag or end tag
1078 undef $self->{current_token};
1079
1080 redo A;
1081 } elsif ($self->{next_input_character} == 0x003C or # <
1082 $self->{next_input_character} == -1) {
1083 !!!parse-error;
1084 if ($self->{current_token}->{type} eq 'start tag') {
1085 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1086 } elsif ($self->{current_token}->{type} eq 'end tag') {
1087 $self->{content_model_flag} = 'PCDATA'; # MUST
1088 if ($self->{current_token}->{attributes}) {
1089 !!!parse-error;
1090 }
1091 } else {
1092 die "$0: $self->{current_token}->{type}: Unknown token type";
1093 }
1094 $self->{state} = 'data';
1095 ## reconsume
1096
1097 !!!emit ($self->{current_token}); # start tag or end tag
1098 undef $self->{current_token};
1099
1100 redo A;
1101 } else {
1102 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1103 ## Stay in the state
1104 !!!next-input-character;
1105 redo A;
1106 }
1107 } elsif ($self->{state} eq 'entity in attribute value') {
1108 my $token = $self->_tokenize_attempt_to_consume_an_entity;
1109
1110 unless (defined $token) {
1111 $self->{current_attribute}->{value} .= '&';
1112 } else {
1113 $self->{current_attribute}->{value} .= $token->{data};
1114 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1115 }
1116
1117 $self->{state} = $self->{last_attribute_value_state};
1118 # next-input-character is already done
1119 redo A;
1120 } elsif ($self->{state} eq 'bogus comment') {
1121 ## (only happen if PCDATA state)
1122
1123 my $token = {type => 'comment', data => ''};
1124
1125 BC: {
1126 if ($self->{next_input_character} == 0x003E) { # >
1127 $self->{state} = 'data';
1128 !!!next-input-character;
1129
1130 !!!emit ($token);
1131
1132 redo A;
1133 } elsif ($self->{next_input_character} == -1) {
1134 $self->{state} = 'data';
1135 ## reconsume
1136
1137 !!!emit ($token);
1138
1139 redo A;
1140 } else {
1141 $token->{data} .= chr ($self->{next_input_character});
1142 !!!next-input-character;
1143 redo BC;
1144 }
1145 } # BC
1146 } elsif ($self->{state} eq 'markup declaration open') {
1147 ## (only happen if PCDATA state)
1148
1149 my @next_char;
1150 push @next_char, $self->{next_input_character};
1151
1152 if ($self->{next_input_character} == 0x002D) { # -
1153 !!!next-input-character;
1154 push @next_char, $self->{next_input_character};
1155 if ($self->{next_input_character} == 0x002D) { # -
1156 $self->{current_token} = {type => 'comment', data => ''};
1157 $self->{state} = 'comment';
1158 !!!next-input-character;
1159 redo A;
1160 }
1161 } elsif ($self->{next_input_character} == 0x0044 or # D
1162 $self->{next_input_character} == 0x0064) { # d
1163 !!!next-input-character;
1164 push @next_char, $self->{next_input_character};
1165 if ($self->{next_input_character} == 0x004F or # O
1166 $self->{next_input_character} == 0x006F) { # o
1167 !!!next-input-character;
1168 push @next_char, $self->{next_input_character};
1169 if ($self->{next_input_character} == 0x0043 or # C
1170 $self->{next_input_character} == 0x0063) { # c
1171 !!!next-input-character;
1172 push @next_char, $self->{next_input_character};
1173 if ($self->{next_input_character} == 0x0054 or # T
1174 $self->{next_input_character} == 0x0074) { # t
1175 !!!next-input-character;
1176 push @next_char, $self->{next_input_character};
1177 if ($self->{next_input_character} == 0x0059 or # Y
1178 $self->{next_input_character} == 0x0079) { # y
1179 !!!next-input-character;
1180 push @next_char, $self->{next_input_character};
1181 if ($self->{next_input_character} == 0x0050 or # P
1182 $self->{next_input_character} == 0x0070) { # p
1183 !!!next-input-character;
1184 push @next_char, $self->{next_input_character};
1185 if ($self->{next_input_character} == 0x0045 or # E
1186 $self->{next_input_character} == 0x0065) { # e
1187 ## ISSUE: What a stupid code this is!
1188 $self->{state} = 'DOCTYPE';
1189 !!!next-input-character;
1190 redo A;
1191 }
1192 }
1193 }
1194 }
1195 }
1196 }
1197 }
1198
1199 !!!parse-error;
1200 $self->{next_input_character} = shift @next_char;
1201 !!!back-next-input-character (@next_char);
1202 $self->{state} = 'bogus comment';
1203 redo A;
1204
1205 ## ISSUE: typos in spec: chacacters, is is a parse error
1206 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1207 } elsif ($self->{state} eq 'comment') {
1208 if ($self->{next_input_character} == 0x002D) { # -
1209 $self->{state} = 'comment dash';
1210 !!!next-input-character;
1211 redo A;
1212 } elsif ($self->{next_input_character} == -1) {
1213 !!!parse-error;
1214 $self->{state} = 'data';
1215 ## reconsume
1216
1217 !!!emit ($self->{current_token}); # comment
1218 undef $self->{current_token};
1219
1220 redo A;
1221 } else {
1222 $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1223 ## Stay in the state
1224 !!!next-input-character;
1225 redo A;
1226 }
1227 } elsif ($self->{state} eq 'comment dash') {
1228 if ($self->{next_input_character} == 0x002D) { # -
1229 $self->{state} = 'comment end';
1230 !!!next-input-character;
1231 redo A;
1232 } elsif ($self->{next_input_character} == -1) {
1233 !!!parse-error;
1234 $self->{state} = 'data';
1235 ## reconsume
1236
1237 !!!emit ($self->{current_token}); # comment
1238 undef $self->{current_token};
1239
1240 redo A;
1241 } else {
1242 $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1243 $self->{state} = 'comment';
1244 !!!next-input-character;
1245 redo A;
1246 }
1247 } elsif ($self->{state} eq 'comment end') {
1248 if ($self->{next_input_character} == 0x003E) { # >
1249 $self->{state} = 'data';
1250 !!!next-input-character;
1251
1252 !!!emit ($self->{current_token}); # comment
1253 undef $self->{current_token};
1254
1255 redo A;
1256 } elsif ($self->{next_input_character} == 0x002D) { # -
1257 !!!parse-error;
1258 $self->{current_token}->{data} .= '-'; # comment
1259 ## Stay in the state
1260 !!!next-input-character;
1261 redo A;
1262 } elsif ($self->{next_input_character} == -1) {
1263 !!!parse-error;
1264 $self->{state} = 'data';
1265 ## reconsume
1266
1267 !!!emit ($self->{current_token}); # comment
1268 undef $self->{current_token};
1269
1270 redo A;
1271 } else {
1272 !!!parse-error;
1273 $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1274 $self->{state} = 'comment';
1275 !!!next-input-character;
1276 redo A;
1277 }
1278 } elsif ($self->{state} eq 'DOCTYPE') {
1279 if ($self->{next_input_character} == 0x0009 or # HT
1280 $self->{next_input_character} == 0x000A or # LF
1281 $self->{next_input_character} == 0x000B or # VT
1282 $self->{next_input_character} == 0x000C or # FF
1283 $self->{next_input_character} == 0x0020) { # SP
1284 $self->{state} = 'before DOCTYPE name';
1285 !!!next-input-character;
1286 redo A;
1287 } else {
1288 !!!parse-error;
1289 $self->{state} = 'before DOCTYPE name';
1290 ## reconsume
1291 redo A;
1292 }
1293 } elsif ($self->{state} eq 'before DOCTYPE name') {
1294 if ($self->{next_input_character} == 0x0009 or # HT
1295 $self->{next_input_character} == 0x000A or # LF
1296 $self->{next_input_character} == 0x000B or # VT
1297 $self->{next_input_character} == 0x000C or # FF
1298 $self->{next_input_character} == 0x0020) { # SP
1299 ## Stay in the state
1300 !!!next-input-character;
1301 redo A;
1302 } elsif (0x0061 <= $self->{next_input_character} and
1303 $self->{next_input_character} <= 0x007A) { # a..z
1304 $self->{current_token} = {type => 'DOCTYPE',
1305 name => chr ($self->{next_input_character} - 0x0020),
1306 error => 1};
1307 $self->{state} = 'DOCTYPE name';
1308 !!!next-input-character;
1309 redo A;
1310 } elsif ($self->{next_input_character} == 0x003E) { # >
1311 !!!parse-error;
1312 $self->{state} = 'data';
1313 !!!next-input-character;
1314
1315 !!!emit ({type => 'DOCTYPE', name => '', error => 1});
1316
1317 redo A;
1318 } elsif ($self->{next_input_character} == -1) {
1319 !!!parse-error;
1320 $self->{state} = 'data';
1321 ## reconsume
1322
1323 !!!emit ({type => 'DOCTYPE', name => '', error => 1});
1324
1325 redo A;
1326 } else {
1327 $self->{current_token} = {type => 'DOCTYPE',
1328 name => chr ($self->{next_input_character}),
1329 error => 1};
1330 $self->{state} = 'DOCTYPE name';
1331 !!!next-input-character;
1332 redo A;
1333 }
1334 } elsif ($self->{state} eq 'DOCTYPE name') {
1335 if ($self->{next_input_character} == 0x0009 or # HT
1336 $self->{next_input_character} == 0x000A or # LF
1337 $self->{next_input_character} == 0x000B or # VT
1338 $self->{next_input_character} == 0x000C or # FF
1339 $self->{next_input_character} == 0x0020) { # SP
1340 $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1341 $self->{state} = 'after DOCTYPE name';
1342 !!!next-input-character;
1343 redo A;
1344 } elsif ($self->{next_input_character} == 0x003E) { # >
1345 $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1346 $self->{state} = 'data';
1347 !!!next-input-character;
1348
1349 !!!emit ($self->{current_token}); # DOCTYPE
1350 undef $self->{current_token};
1351
1352 redo A;
1353 } elsif (0x0061 <= $self->{next_input_character} and
1354 $self->{next_input_character} <= 0x007A) { # a..z
1355 $self->{current_token}->{name} .= chr ($self->{next_input_character} - 0x0020); # DOCTYPE
1356 #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
1357 ## Stay in the state
1358 !!!next-input-character;
1359 redo A;
1360 } elsif ($self->{next_input_character} == -1) {
1361 !!!parse-error;
1362 $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1363 $self->{state} = 'data';
1364 ## reconsume
1365
1366 !!!emit ($self->{current_token});
1367 undef $self->{current_token};
1368
1369 redo A;
1370 } else {
1371 $self->{current_token}->{name}
1372 .= chr ($self->{next_input_character}); # DOCTYPE
1373 #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
1374 ## Stay in the state
1375 !!!next-input-character;
1376 redo A;
1377 }
1378 } elsif ($self->{state} eq 'after DOCTYPE name') {
1379 if ($self->{next_input_character} == 0x0009 or # HT
1380 $self->{next_input_character} == 0x000A or # LF
1381 $self->{next_input_character} == 0x000B or # VT
1382 $self->{next_input_character} == 0x000C or # FF
1383 $self->{next_input_character} == 0x0020) { # SP
1384 ## Stay in the state
1385 !!!next-input-character;
1386 redo A;
1387 } elsif ($self->{next_input_character} == 0x003E) { # >
1388 $self->{state} = 'data';
1389 !!!next-input-character;
1390
1391 !!!emit ($self->{current_token}); # DOCTYPE
1392 undef $self->{current_token};
1393
1394 redo A;
1395 } elsif ($self->{next_input_character} == -1) {
1396 !!!parse-error;
1397 $self->{state} = 'data';
1398 ## reconsume
1399
1400 !!!emit ($self->{current_token}); # DOCTYPE
1401 undef $self->{current_token};
1402
1403 redo A;
1404 } else {
1405 !!!parse-error;
1406 $self->{current_token}->{error} = 1; # DOCTYPE
1407 $self->{state} = 'bogus DOCTYPE';
1408 !!!next-input-character;
1409 redo A;
1410 }
1411 } elsif ($self->{state} eq 'bogus DOCTYPE') {
1412 if ($self->{next_input_character} == 0x003E) { # >
1413 $self->{state} = 'data';
1414 !!!next-input-character;
1415
1416 !!!emit ($self->{current_token}); # DOCTYPE
1417 undef $self->{current_token};
1418
1419 redo A;
1420 } elsif ($self->{next_input_character} == -1) {
1421 !!!parse-error;
1422 $self->{state} = 'data';
1423 ## reconsume
1424
1425 !!!emit ($self->{current_token}); # DOCTYPE
1426 undef $self->{current_token};
1427
1428 redo A;
1429 } else {
1430 ## Stay in the state
1431 !!!next-input-character;
1432 redo A;
1433 }
1434 } else {
1435 die "$0: $self->{state}: Unknown state";
1436 }
1437 } # A
1438
1439 die "$0: _get_next_token: unexpected case";
1440 } # _get_next_token
1441
## Attempt to consume an entity (character reference).
##
## On entry, $self->{next_input_character} holds the character that
## follows the "&".  Three forms are recognized:
##
##   - "#x" / "#X" followed by hexadecimal digits,
##   - "#" followed by decimal digits,
##   - an alphanumeric name looked up in $entity_char.
##
## Returns a hash reference {type => 'character', data => ...} when a
## reference was recognized, or undef otherwise.  A missing trailing
## ";" is reported as a parse error but the reference is still returned.
## A code point of zero or above 1114111 (0x10FFFF) is replaced by
## U+FFFD.
##
## When a numeric reference has no digits, the characters read so far
## are pushed back and $self->{next_input_character} is reset to "#",
## so that the caller sees the input exactly as it was after the "&".
sub _tokenize_attempt_to_consume_an_entity ($) {
  my $self = shift;

  if ($self->{next_input_character} == 0x0023) { # #
    !!!next-input-character;
    my $num;
    if ($self->{next_input_character} == 0x0078 or # x
        $self->{next_input_character} == 0x0058) { # X
      X: {
        ## On the first pass this is the "x" or "X"; on later passes
        ## (after |redo X|) it is the hexadecimal digit just handled.
        ## It is only used while $num is still undefined, i.e. on the
        ## first pass.
        my $x_char = $self->{next_input_character};
        !!!next-input-character;
        if (0x0030 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x0039) { # 0..9
          $num ||= 0;
          $num *= 0x10;
          $num += $self->{next_input_character} - 0x0030;
          redo X;
        } elsif (0x0061 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x0066) { # a..f
          ## ISSUE: the spec says U+0078, which is apparently incorrect
          $num ||= 0;
          $num *= 0x10;
          $num += $self->{next_input_character} - 0x0060 + 9;
          redo X;
        } elsif (0x0041 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x0046) { # A..F
          ## ISSUE: the spec says U+0058, which is apparently incorrect
          $num ||= 0;
          $num *= 0x10;
          $num += $self->{next_input_character} - 0x0040 + 9;
          redo X;
        } elsif (not defined $num) { # no hexadecimal digit
          !!!parse-error;
          ## Three characters have been read at this point: "#", the
          ## "x"/"X" and the character after it.  Push back the latter
          ## two (in input order) before restoring "#" as the next
          ## input character; otherwise the character following the
          ## "x" would be silently dropped from the input.
          !!!back-next-input-character ($x_char, $self->{next_input_character});
          $self->{next_input_character} = 0x0023; # #
          return undef;
        } elsif ($self->{next_input_character} == 0x003B) { # ;
          !!!next-input-character;
        } else {
          !!!parse-error;
        }

        ## TODO: check the definition for |a valid Unicode character|.
        if ($num > 1114111 or $num == 0) {
          $num = 0xFFFD; # REPLACEMENT CHARACTER
          ## ISSUE: Why this is not an error?
        }

        return {type => 'character', data => chr $num};
      } # X
    } elsif (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
      my $code = $self->{next_input_character} - 0x0030;
      !!!next-input-character;

      while (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
        $code *= 10;
        $code += $self->{next_input_character} - 0x0030;

        !!!next-input-character;
      }

      if ($self->{next_input_character} == 0x003B) { # ;
        !!!next-input-character;
      } else {
        !!!parse-error;
      }

      ## TODO: check the definition for |a valid Unicode character|.
      if ($code > 1114111 or $code == 0) {
        $code = 0xFFFD; # REPLACEMENT CHARACTER
        ## ISSUE: Why this is not an error?
      }

      return {type => 'character', data => chr $code};
    } else {
      ## "#" not followed by a digit or "x"/"X": restore the input.
      !!!parse-error;
      !!!back-next-input-character ($self->{next_input_character});
      $self->{next_input_character} = 0x0023; # #
      return undef;
    }
  } elsif ((0x0041 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x005A) or
           (0x0061 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x007A)) {
    my $entity_name = chr $self->{next_input_character};
    !!!next-input-character;

    ## $value is what will be returned as character data: the
    ## replacement text of the most recent match in $entity_char,
    ## followed by any characters consumed after that match.
    my $value = $entity_name;
    my $match;

    while (length $entity_name < 10 and
           ## NOTE: Some number greater than the maximum length of entity name
           ((0x0041 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x005A) or
            (0x0061 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x007A) or
            (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039))) {
      $entity_name .= chr $self->{next_input_character};
      if (defined $entity_char->{$entity_name}) {
        $value = $entity_char->{$entity_name};
        $match = 1;
      } else {
        $value .= chr $self->{next_input_character};
      }
      !!!next-input-character;
    }

    if ($match) {
      if ($self->{next_input_character} == 0x003B) { # ;
        !!!next-input-character;
      } else {
        !!!parse-error;
      }

      return {type => 'character', data => $value};
    } else {
      !!!parse-error;
      ## NOTE: No characters are consumed in the spec.
      ## The name characters already read are handed back as a
      ## character token rather than as input characters.
      !!!back-token ({type => 'character', data => $value});
      return undef;
    }
  } else {
    ## no characters are consumed
    !!!parse-error;
    return undef;
  }
} # _tokenize_attempt_to_consume_an_entity
1572
## Prepare the document for the tree construction stage by turning
## strict error checking off on $self->{document}.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  ## NOTE: $self->{document} MUST be specified before this method is called
  my $document = $self->{document};
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  ## TODO: Mark the Document as an HTML document # MUST
  $document->strict_error_checking (0);
} # _initialize_tree_constructor
1581
## Finish the tree construction stage by turning strict error checking
## back on for $self->{document}.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};
  ## TODO: Turn mutation events on
  $document->strict_error_checking (1);
} # _terminate_tree_constructor
1587
1588 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
1589
1590 sub _construct_tree ($) {
1591 my ($self) = @_;
1592
1593 ## When an interactive UA renders the $self->{document} available
1594 ## to the user, or when it begins accepting user input, is
1595 ## not defined.
1596
1597 ## Append a character: collect it and all subsequent consecutive
1598 ## characters and insert one Text node whose data is concatenation
1599 ## of all those characters. # MUST
1600
1601 my $token;
1602 !!!next-token;
1603
1604 my $phase = 'initial'; # MUST
1605
1606 my $open_elements = [];
1607 my $active_formatting_elements = [];
1608 my $head_element;
1609 my $form_element;
1610 my $insertion_mode = 'before head';
1611
## Reconstruct the active formatting elements.
##
## Entries of @$active_formatting_elements and @$open_elements are
## array references [node, tag name]; a marker entry has '#marker' as
## its first item.  Takes one argument, a code reference that is called
## with each newly cloned node to insert it into the tree.
my $reconstruct_active_formatting_elements = sub { # MUST
  my $insert = shift;

  ## Step 1: nothing to reconstruct when the list is empty.
  return unless @$active_formatting_elements;

  ## Step 3: start from the last entry (negative index, counted from
  ## the end of the list).
  my $pos = -1;
  my $entry = $active_formatting_elements->[$pos];

  ## Step 2: nothing to do if the last entry is a marker or is still
  ## in the stack of open elements.
  return if $entry->[0] eq '#marker';
  return if grep { $entry->[0] eq $_->[0] } @$open_elements;

  REWIND: while (1) {
    ## Step 4: no entry before this one.
    last REWIND
        if $active_formatting_elements->[0]->[0] eq $entry->[0];

    ## Step 5: step back to the previous entry.
    $pos--;
    $entry = $active_formatting_elements->[$pos];

    ## Step 6: keep rewinding while the entry is neither a marker nor
    ## an element that is still open.
    my $stop_here = $entry->[0] eq '#marker';
    unless ($stop_here) {
      $stop_here = 1 if grep { $entry->[0] eq $_->[0] } @$open_elements;
    }
    next REWIND unless $stop_here;

    ## Step 7: advance to the first entry that has to be recreated.
    $pos++;
    $entry = $active_formatting_elements->[$pos];
    last REWIND;
  } # REWIND

  CREATE: while (1) {
    ## Step 8: shallow clone of the entry's node.
    my $clone = [$entry->[0]->clone_node (0), $entry->[1]];

    ## Step 9: insert the clone and push it onto the open elements.
    $insert->($clone->[0]);
    push @$open_elements, $clone;

    ## Step 10: the clone replaces the entry in the list.
    $active_formatting_elements->[$pos] = $open_elements->[-1];

    ## Step 11: finished once the last entry has been recreated.
    last CREATE
        if $clone->[0] eq $active_formatting_elements->[-1]->[0];

    ## Step 7': advance to the next entry.
    $pos++;
    $entry = $active_formatting_elements->[$pos];
  } # CREATE
}; # $reconstruct_active_formatting_elements
1682
## Clear the list of active formatting elements up to the last marker:
## the last '#marker' entry and everything after it are removed.  The
## list is left untouched when it contains no marker.
my $clear_up_to_marker = sub {
  my $index = $#$active_formatting_elements;
  while ($index >= 0) {
    if ($active_formatting_elements->[$index]->[0] eq '#marker') {
      ## Two-argument splice: drop the marker and all later entries.
      splice @$active_formatting_elements, $index;
      return;
    }
    $index--;
  }
}; # $clear_up_to_marker
1691
## Reset the insertion mode appropriately.
##
## Walks the stack of open elements from the current node upwards and
## sets $insertion_mode according to the first element whose tag name
## selects a mode.  An "html" element selects 'before head' or
## 'after head' depending on whether $head_element is defined.  If the
## first entry of the stack is reached without a decision, the mode
## becomes 'in body'.
my $reset_insertion_mode = sub {
  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $open_elements->[$i];

  ## Step 3
  S3: {
    $last = 1 if $open_elements->[0]->[0] eq $node->[0];
    ## TODO: the element whose inner_html is set is neither td nor th, then $node = the element

    ## Step 4..13
    ## NOTE: The HTML parsing algorithm has a single table section
    ## insertion mode, "in table body", for tbody, thead and tfoot
    ## alike, so all three map to it.
    my $new_mode = {
      select => 'in select',
      td => 'in cell',
      th => 'in cell',
      tr => 'in row',
      tbody => 'in table body',
      thead => 'in table body',
      tfoot => 'in table body',
      caption => 'in caption',
      colgroup => 'in column group',
      table => 'in table',
      head => 'in body', # not in head!
      body => 'in body',
      frameset => 'in frameset',
    }->{$node->[1]};
    ## Every mode name is a true string, so the |and return| always
    ## fires once a mode has been found.
    $insertion_mode = $new_mode and return if defined $new_mode;

    ## Step 14
    if ($node->[1] eq 'html') {
      unless (defined $head_element) {
        $insertion_mode = 'before head';
      } else {
        $insertion_mode = 'after head';
      }
      return;
    }

    ## Step 15
    $insertion_mode = 'in body' and return if $last;

    ## Step 16
    $i--;
    $node = $open_elements->[$i];

    ## Step 17
    redo S3;
  } # S3
}; # $reset_insertion_mode
1744
## Process a "style" start tag held in $token.
##
## Creates the style element with the attributes of the start tag,
## appends it to $head_element (when the insertion mode is 'in head'
## and a head element exists) or otherwise to the current node, then
## collects the following character tokens with the content model flag
## set to CDATA and stores them as the element's text.  A token other
## than a "style" end tag after the text is a parse error.  In either
## case the next token is fetched before returning.
my $style_start_tag = sub {
  ## The attributes of the start tag are passed on to the new element,
  ## as is done for script, title, base, link and meta elements.
  my $style_el;
  !!!create-element ($style_el, 'style', $token->{attributes});
  ## $insertion_mode eq 'in head' and ... (always true)
  (($insertion_mode eq 'in head' and defined $head_element)
   ? $head_element : $open_elements->[-1]->[0])
      ->append_child ($style_el);
  $self->{content_model_flag} = 'CDATA';

  my $text = '';
  !!!next-token;
  while ($token->{type} eq 'character') {
    $text .= $token->{data};
    !!!next-token;
  } # stop if non-character token or tokenizer stops tokenising
  if (length $text) {
    $style_el->manakai_append_text ($text);
  }

  $self->{content_model_flag} = 'PCDATA';

  if ($token->{type} eq 'end tag' and $token->{tag_name} eq 'style') {
    ## Ignore the token
  } else {
    !!!parse-error;
    ## ISSUE: And ignore?
  }
  !!!next-token;
}; # $style_start_tag
1773
## Process a "script" start tag held in $token.
##
## Creates the script element from the start tag's attributes, gathers
## the following character tokens (content model flag CDATA) as the
## element's text, reports a parse error unless the text is followed by
## a "script" end tag, and finally appends the element to $head_element
## (insertion mode 'in head' with a head element) or to the current
## node.  The next token is fetched before returning.
my $script_start_tag = sub {
  my $script_el;
  !!!create-element ($script_el, 'script', $token->{attributes});
  ## TODO: mark as "parser-inserted"

  $self->{content_model_flag} = 'CDATA';

  ## Gather the data of all consecutive character tokens.
  my @chunks;
  !!!next-token;
  until ($token->{type} ne 'character') {
    push @chunks, $token->{data};
    !!!next-token;
  } # stop if non-character token or tokenizer stops tokenising
  my $script_text = join '', @chunks;
  $script_el->manakai_append_text ($script_text) if length $script_text;

  $self->{content_model_flag} = 'PCDATA';

  my $is_script_end_tag = ($token->{type} eq 'end tag' and
                           $token->{tag_name} eq 'script');
  unless ($is_script_end_tag) {
    !!!parse-error;
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  }
  ## Otherwise: ignore the token

  ## TODO: inner_html mode then mark as "already executed" and skip
  if (1) {
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    my $use_head = ($insertion_mode eq 'in head' and defined $head_element);
    my $parent = $use_head ? $head_element : $open_elements->[-1]->[0];
    $parent->append_child ($script_el);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  }

  !!!next-token;
}; # $script_start_tag
1817
## Process an end tag of a formatting element (the "adoption agency"
## steps of the HTML parsing algorithm).
##
## Takes the tag name of the end tag.  Entries of
## @$active_formatting_elements and @$open_elements are array
## references [node, tag name]; markers have '#marker' as first item.
## $formatting_category, $special_category and $scoping_category are
## tag name lookup tables defined elsewhere in this file.
##
## The whole procedure is repeated (|redo FET|) until one of the
## early-return branches is taken; each of those fetches the next
## token before returning.
my $formatting_end_tag = sub {
  my $tag_name = shift;

  FET: {
    ## Step 1
    ## Find the last entry with the given tag name that comes after
    ## the last marker in the list of active formatting elements.
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!parse-error;
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#$open_elements) {
      my $node = $open_elements->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!parse-error;
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ({
                table => 1, caption => 1, td => 1, th => 1,
                button => 1, marquee => 1, object => 1, html => 1,
               }->{$node->[1]}) {
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!parse-error;
      ## The formatting element is not in the stack of open elements:
      ## remove exactly that entry from the list.  It is not
      ## necessarily the last entry, so |pop| would remove the wrong
      ## one.
      splice @$active_formatting_elements,
          $formatting_element_i_in_active, 1; # $formatting_element
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $open_elements->[-1]->[0] eq $formatting_element->[0]) {
      !!!parse-error;
    }

    ## Step 2
    ## The loop runs from the current node up to the formatting
    ## element, so the last assignment is the qualifying node nearest
    ## to the formatting element.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#$open_elements) {
      my $node = $open_elements->[$_];
      if (not $formatting_category->{$node->[1]} and
          #not $phrasing_category->{$node->[1]} and
          ($special_category->{$node->[1]} or
           $scoping_category->{$node->[1]})) {
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      ## Pop the formatting element and everything below it.
      splice @$open_elements, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $open_elements->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## The bookmark is remembered as the node of the entry preceding
    ## the formatting element.
    ## NOTE(review): when the formatting element is the first entry,
    ## the index is -1 and this picks the last entry instead - confirm
    ## whether that case needs separate handling.
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
          ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $open_elements->[$node_i_in_open];

      ## Step 2
      ## A node that is not in the list of active formatting elements
      ## is removed from the stack of open elements and skipped.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @$open_elements, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      ## A node that already has children is replaced by a shallow
      ## clone in both lists.
      if ($node->[0]->has_child_nodes ()) {
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $open_elements->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    $common_ancestor_node->[0]->append_child ($last_node->[0]);

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## Copy the child list first, since moving children changes it.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## Remove the formatting element from the list of active
    ## formatting elements and insert the clone after the bookmark.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$active_formatting_elements, $_, 1;
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    ## Remove the formatting element from the stack of open elements
    ## and insert the clone immediately after the furthest block.  The
    ## clone is inserted (splice length 0, as in Step 12), so that an
    ## element already following the furthest block in the stack is
    ## kept rather than overwritten.
    undef $i;
    OE: for (reverse 0..$#$open_elements) {
      if ($open_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$open_elements, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($open_elements->[$_]->[0] eq $furthest_block->[0]) {
        $i = $_;
      }
    } # OE
    splice @$open_elements, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
2000
## Append the given node as the last child of the current node, i.e.
## the node of the last entry in the stack of open elements.
my $insert_to_current = sub {
  my ($child) = @_;
  $open_elements->[-1]->[0]->append_child ($child);
}; # $insert_to_current
2004
## Insert the given node, foster parenting it when the current node is
## a table, tbody, tfoot, thead or tr element; otherwise the node is
## simply appended to the current node.
##
## For foster parenting the last "table" entry in the stack of open
## elements is located.  If its node has a parent whose node_type is 1,
## the child is inserted into that parent just before the table node;
## otherwise it is appended to the node of the entry preceding the
## table in the stack.  Without any table entry, the node of the first
## entry of the stack is used.
my $insert_to_foster = sub {
  my $child = shift;
  my %in_table_context = map { $_ => 1 } qw(table tbody tfoot thead tr);

  unless ($in_table_context{$open_elements->[-1]->[1]}) {
    return $open_elements->[-1]->[0]->append_child ($child);
  }

  # MUST
  my ($foster_parent_element, $next_sibling);
  my $index = $#$open_elements;
  while ($index >= 0) {
    my $candidate = $open_elements->[$index];
    if ($candidate->[1] eq 'table') {
      my $parent = $candidate->[0]->parent_node;
      if (defined $parent and $parent->node_type == 1) {
        ($foster_parent_element, $next_sibling)
            = ($parent, $candidate->[0]);
      } else {
        $foster_parent_element = $open_elements->[$index - 1]->[0];
      }
      last;
    }
    $index--;
  }
  $foster_parent_element = $open_elements->[0]->[0]
      unless defined $foster_parent_element;
  ## $next_sibling stays undefined unless the table's parent is used.
  $foster_parent_element->insert_before ($child, $next_sibling);
}; # $insert_to_foster
2035
## Returns true if the stack of open elements "has a p element in
## scope": walking from the current node towards the root, a |p|
## entry is met before any scope-limiting element (|table|,
## |caption|, |td|, |th|, |button|, |marquee|, |object|, |html|).
my $has_p_element_in_scope = sub {
  for my $node (reverse @$open_elements) {
    if ($node->[1] eq 'p') {
      return 1;
    } elsif ({
              table => 1, caption => 1, td => 1, th => 1,
              button => 1, marquee => 1, object => 1, html => 1,
             }->{$node->[1]}) {
      return 0;
    }
  }
  return 0;
}; # $has_p_element_in_scope

## Processes the current token ($token) according to the rules of the
## "in body" insertion mode for start tag and end tag tokens.  Tokens
## of any other type are left untouched.
##
##   $insert - code reference invoked with a newly created element
##             when this code has to place the element itself (the
##             "as if in head" elements and textarea/noembed/noframes).
##             Callers in this file pass |$insert_to_current|.
##
## The function always returns without a meaningful value; the caller
## is expected to re-enter its dispatch loop afterwards.  On return,
## $token is either the next token to process or a synthesized end
## tag token that must be processed before the original one (which
## has been pushed back with |!!!back-token|).
##
## NOTE(review): lines starting with |!!!| are not Perl; they are
## presumably macros expanded when this |.pm.src| file is converted
## into the real module.
my $in_body = sub {
  my $insert = shift;
  if ($token->{type} eq 'start tag') {
    if ($token->{tag_name} eq 'script') {
      $script_start_tag->();
      return;
    } elsif ($token->{tag_name} eq 'style') {
      $style_start_tag->();
      return;
    } elsif ({
              base => 1, link => 1, meta => 1,
             }->{$token->{tag_name}}) {
      !!!parse-error ($token->{tag_name}.' in body');
      ## NOTE: This is an "as if in head" code clone
      my $el;
      !!!create-element ($el, $token->{tag_name}, $token->{attributes});
      if (defined $head_element) {
        $head_element->append_child ($el);
      } else {
        $insert->($el);
      }

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'title') {
      !!!parse-error ('title in body');
      ## NOTE: There is an "as if in head" code clone
      my $title_el;
      !!!create-element ($title_el, 'title', $token->{attributes});
      (defined $head_element ? $head_element : $open_elements->[-1]->[0])
          ->append_child ($title_el);
      $self->{content_model_flag} = 'RCDATA';

      ## Collect the text content of the element.
      my $text = '';
      !!!next-token;
      while ($token->{type} eq 'character') {
        $text .= $token->{data};
        !!!next-token;
      }
      if (length $text) {
        $title_el->manakai_append_text ($text);
      }

      $self->{content_model_flag} = 'PCDATA';

      if ($token->{type} eq 'end tag' and
          $token->{tag_name} eq 'title') {
        ## Ignore the token
      } else {
        !!!parse-error;
        ## ISSUE: And ignore?
      }
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'body') {
      !!!parse-error;

      if (@$open_elements == 1 or
          $open_elements->[1]->[1] ne 'body') {
        ## Ignore the token
      } else {
        ## Copy only those attributes the existing |body| element
        ## does not have yet.
        my $body_el = $open_elements->[1]->[0];
        for my $attr_name (keys %{$token->{attributes}}) {
          unless ($body_el->has_attribute_ns (undef, $attr_name)) {
            $body_el->set_attribute_ns
                (undef, [undef, $attr_name],
                 $token->{attributes}->{$attr_name}->{value});
          }
        }
      }
      !!!next-token;
      return;
    } elsif ({
              address => 1, blockquote => 1, center => 1, dir => 1,
              div => 1, dl => 1, fieldset => 1, listing => 1,
              menu => 1, ol => 1, p => 1, ul => 1,
              pre => 1,
             }->{$token->{tag_name}}) {
      ## has a p element in scope
      if ($has_p_element_in_scope->()) {
        !!!back-token;
        $token = {type => 'end tag', tag_name => 'p'};
        return;
      }

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      if ($token->{tag_name} eq 'pre') {
        ## A newline immediately following <pre> is dropped.
        !!!next-token;
        if ($token->{type} eq 'character') {
          $token->{data} =~ s/^\x0A//;
          unless (length $token->{data}) {
            !!!next-token;
          }
        }
      } else {
        !!!next-token;
      }
      return;
    } elsif ($token->{tag_name} eq 'form') {
      if (defined $form_element) {
        !!!parse-error;
        ## Ignore the token
        ## NOTE: The token has to be consumed here; otherwise the
        ## caller would reprocess the same start tag forever.
        !!!next-token;
        return;
      } else {
        ## has a p element in scope
        if ($has_p_element_in_scope->()) {
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'p'};
          return;
        }

        !!!insert-element-t ($token->{tag_name}, $token->{attributes});
        $form_element = $open_elements->[-1]->[0];
        !!!next-token;
        return;
      }
    } elsif ($token->{tag_name} eq 'li') {
      ## has a p element in scope
      if ($has_p_element_in_scope->()) {
        !!!back-token;
        $token = {type => 'end tag', tag_name => 'p'};
        return;
      }

      ## Step 1
      my $i = -1;
      my $node = $open_elements->[$i];
      LI: {
        ## Step 2
        if ($node->[1] eq 'li') {
          ## Pop everything from the open |li| to the current node.
          splice @$open_elements, $i;
          last LI;
        }

        ## Step 3
        if (not $formatting_category->{$node->[1]} and
            #not $phrasing_category->{$node->[1]} and
            ($special_category->{$node->[1]} or
             $scoping_category->{$node->[1]}) and
            $node->[1] ne 'address' and $node->[1] ne 'div') {
          last LI;
        }

        ## Step 4
        $i--;
        $node = $open_elements->[$i];
        redo LI;
      } # LI

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
      ## has a p element in scope
      if ($has_p_element_in_scope->()) {
        !!!back-token;
        $token = {type => 'end tag', tag_name => 'p'};
        return;
      }

      ## Step 1
      my $i = -1;
      my $node = $open_elements->[$i];
      LI: {
        ## Step 2
        if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
          ## Pop everything from the open |dt|/|dd| to the current node.
          splice @$open_elements, $i;
          last LI;
        }

        ## Step 3
        if (not $formatting_category->{$node->[1]} and
            #not $phrasing_category->{$node->[1]} and
            ($special_category->{$node->[1]} or
             $scoping_category->{$node->[1]}) and
            $node->[1] ne 'address' and $node->[1] ne 'div') {
          last LI;
        }

        ## Step 4
        $i--;
        $node = $open_elements->[$i];
        redo LI;
      } # LI

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'plaintext') {
      ## has a p element in scope
      if ($has_p_element_in_scope->()) {
        !!!back-token;
        $token = {type => 'end tag', tag_name => 'p'};
        return;
      }

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $self->{content_model_flag} = 'PLAINTEXT';

      !!!next-token;
      return;
    } elsif ({
              h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
             }->{$token->{tag_name}}) {
      ## has a p element in scope
      if ($has_p_element_in_scope->()) {
        !!!back-token;
        $token = {type => 'end tag', tag_name => 'p'};
        return;
      }

      ## has an element in scope
      my $i;
      INSCOPE: for (reverse 0..$#$open_elements) {
        my $node = $open_elements->[$_];
        if ({
             h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
            }->{$node->[1]}) {
          $i = $_;
          last INSCOPE;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      ## A heading inside an open heading: close the open one first.
      if (defined $i) {
        !!!parse-error;
        splice @$open_elements, $i;
      }

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'a') {
      ## Look for an |a| in the list of active formatting elements
      ## after the last marker.
      AFE: for my $i (reverse 0..$#$active_formatting_elements) {
        my $node = $active_formatting_elements->[$i];
        if ($node->[1] eq 'a') {
          !!!parse-error ('a in a');

          !!!back-token;
          $token = {type => 'end tag', tag_name => 'a'};
          $formatting_end_tag->($token->{tag_name});

          ## Remove the old |a| from both lists if it is still there.
          AFE2: for (reverse 0..$#$active_formatting_elements) {
            if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
              splice @$active_formatting_elements, $_, 1;
              last AFE2;
            }
          } # AFE2
          OE: for (reverse 0..$#$open_elements) {
            if ($open_elements->[$_]->[0] eq $node->[0]) {
              splice @$open_elements, $_, 1;
              last OE;
            }
          } # OE
          last AFE;
        } elsif ($node->[0] eq '#marker') {
          last AFE;
        }
      } # AFE

      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, $open_elements->[-1];

      !!!next-token;
      return;
    } elsif ({
              b => 1, big => 1, em => 1, font => 1, i => 1,
              nobr => 1, s => 1, small => 1, strike => 1,
              strong => 1, tt => 1, u => 1,
             }->{$token->{tag_name}}) {
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, $open_elements->[-1];

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'button') {
      ## has a button element in scope
      INSCOPE: for (reverse 0..$#$open_elements) {
        my $node = $open_elements->[$_];
        if ($node->[1] eq 'button') {
          !!!parse-error;
          !!!back-token;
          $token = {type => 'end tag', tag_name => 'button'};
          return;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, ['#marker', ''];

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'marquee' or
             $token->{tag_name} eq 'object') {
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      push @$active_formatting_elements, ['#marker', ''];

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'xmp') {
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $self->{content_model_flag} = 'CDATA';

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'table') {
      ## has a p element in scope
      if ($has_p_element_in_scope->()) {
        !!!back-token;
        $token = {type => 'end tag', tag_name => 'p'};
        return;
      }

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $insertion_mode = 'in table';

      !!!next-token;
      return;
    } elsif ({
              area => 1, basefont => 1, bgsound => 1, br => 1,
              embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
              image => 1,
             }->{$token->{tag_name}}) {
      if ($token->{tag_name} eq 'image') {
        !!!parse-error;
        $token->{tag_name} = 'img';
      }

      $reconstruct_active_formatting_elements->($insert_to_current);

      ## The element is inserted and immediately popped again.
      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      pop @$open_elements;

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'hr') {
      ## has a p element in scope
      if ($has_p_element_in_scope->()) {
        !!!back-token;
        $token = {type => 'end tag', tag_name => 'p'};
        return;
      }

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      pop @$open_elements;

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'input') {
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});
      ## TODO: associate with $form_element if defined
      pop @$open_elements;

      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'isindex') {
      !!!parse-error;

      if (defined $form_element) {
        ## Ignore the token
        !!!next-token;
        return;
      } else {
        ## Replace the token by an equivalent sequence of tokens:
        ## the first one becomes the current token, the rest are
        ## pushed back to be processed afterwards.
        my $at = $token->{attributes};
        $at->{name} = {name => 'name', value => 'isindex'};
        my @tokens = (
          {type => 'start tag', tag_name => 'form'},
          {type => 'start tag', tag_name => 'hr'},
          {type => 'start tag', tag_name => 'p'},
          {type => 'start tag', tag_name => 'label'},
          {type => 'character',
           data => 'This is a searchable index. Insert your search keywords here: '}, # SHOULD
          ## TODO: make this configurable
          {type => 'start tag', tag_name => 'input', attributes => $at},
          #{type => 'character', data => ''}, # SHOULD
          {type => 'end tag', tag_name => 'label'},
          {type => 'end tag', tag_name => 'p'},
          {type => 'start tag', tag_name => 'hr'},
          {type => 'end tag', tag_name => 'form'},
        );
        $token = shift @tokens;
        !!!back-token (@tokens);
        return;
      }
    } elsif ({
              textarea => 1,
              noembed => 1,
              noframes => 1,
              noscript => 0, ## TODO: 1 if scripting is enabled
             }->{$token->{tag_name}}) {
      my $tag_name = $token->{tag_name};
      my $el;
      !!!create-element ($el, $token->{tag_name}, $token->{attributes});

      if ($token->{tag_name} eq 'textarea') {
        ## TODO: form_element if defined
        $self->{content_model_flag} = 'RCDATA';
      } else {
        $self->{content_model_flag} = 'CDATA';
      }

      $insert->($el);

      ## Collect the text content of the element.
      my $text = '';
      !!!next-token;
      while ($token->{type} eq 'character') {
        $text .= $token->{data};
        !!!next-token;
      }
      if (length $text) {
        $el->manakai_append_text ($text);
      }

      $self->{content_model_flag} = 'PCDATA';

      if ($token->{type} eq 'end tag' and
          $token->{tag_name} eq $tag_name) {
        ## Ignore the token
      } else {
        !!!parse-error;
        ## ISSUE: And ignore?
      }
      !!!next-token;
      return;
    } elsif ($token->{tag_name} eq 'select') {
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      $insertion_mode = 'in select';
      !!!next-token;
      return;
    } elsif ({
              caption => 1, col => 1, colgroup => 1, frame => 1,
              frameset => 1, head => 1, option => 1, optgroup => 1,
              tbody => 1, td => 1, tfoot => 1, th => 1,
              thead => 1, tr => 1,
             }->{$token->{tag_name}}) {
      !!!parse-error;
      ## Ignore the token
      !!!next-token;
      return;

      ## ISSUE: An issue on HTML5 new elements in the spec.
    } else {
      $reconstruct_active_formatting_elements->($insert_to_current);

      !!!insert-element-t ($token->{tag_name}, $token->{attributes});

      !!!next-token;
      return;
    }
  } elsif ($token->{type} eq 'end tag') {
    if ($token->{tag_name} eq 'body') {
      if (@$open_elements > 1 and $open_elements->[1]->[1] eq 'body') {
        ## ISSUE: There is an issue in the spec.
        if ($open_elements->[-1]->[1] ne 'body') {
          !!!parse-error;
        }
        $insertion_mode = 'after body';
        !!!next-token;
        return;
      } else {
        !!!parse-error;
        ## Ignore the token
        !!!next-token;
        return;
      }
    } elsif ($token->{tag_name} eq 'html') {
      if (@$open_elements > 1 and $open_elements->[1]->[1] eq 'body') {
        ## ISSUE: There is an issue in the spec.
        if ($open_elements->[-1]->[1] ne 'body') {
          !!!parse-error;
        }
        $insertion_mode = 'after body';
        ## reprocess
        return;
      } else {
        !!!parse-error;
        ## Ignore the token
        !!!next-token;
        return;
      }
    } elsif ({
              address => 1, blockquote => 1, center => 1, dir => 1,
              div => 1, dl => 1, fieldset => 1, listing => 1,
              menu => 1, ol => 1, pre => 1, ul => 1,
              form => 1,
              p => 1,
              dd => 1, dt => 1, li => 1,
              button => 1, marquee => 1, object => 1,
             }->{$token->{tag_name}}) {
      ## has an element in scope
      my $i;
      INSCOPE: for (reverse 0..$#$open_elements) {
        my $node = $open_elements->[$_];
        if ($node->[1] eq $token->{tag_name}) {
          ## generate implied end tags
          ## (an element of the same kind as the end tag itself is
          ## not closed implicitly)
          if ({
               dd => ($token->{tag_name} ne 'dd'),
               dt => ($token->{tag_name} ne 'dt'),
               li => ($token->{tag_name} ne 'li'),
               p => ($token->{tag_name} ne 'p'),
               td => 1, th => 1, tr => 1,
              }->{$open_elements->[-1]->[1]}) {
            !!!back-token;
            $token = {type => 'end tag',
                      tag_name => $open_elements->[-1]->[1]}; # MUST
            return;
          }
          $i = $_;
          last INSCOPE unless $token->{tag_name} eq 'p';
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
        !!!parse-error;
      }

      splice @$open_elements, $i if defined $i;
      undef $form_element if $token->{tag_name} eq 'form';
      $clear_up_to_marker->()
          if {
            button => 1, marquee => 1, object => 1,
          }->{$token->{tag_name}};
      !!!next-token;
      return;
    } elsif ({
              h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
             }->{$token->{tag_name}}) {
      ## has an element in scope
      my $i;
      INSCOPE: for (reverse 0..$#$open_elements) {
        my $node = $open_elements->[$_];
        if ({
             h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
            }->{$node->[1]}) {
          ## generate implied end tags
          if ({
               dd => 1, dt => 1, li => 1, p => 1,
               td => 1, th => 1, tr => 1,
              }->{$open_elements->[-1]->[1]}) {
            !!!back-token;
            $token = {type => 'end tag',
                      tag_name => $open_elements->[-1]->[1]}; # MUST
            return;
          }
          $i = $_;
          last INSCOPE;
        } elsif ({
                  table => 1, caption => 1, td => 1, th => 1,
                  button => 1, marquee => 1, object => 1, html => 1,
                 }->{$node->[1]}) {
          last INSCOPE;
        }
      } # INSCOPE

      if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
        !!!parse-error;
      }

      splice @$open_elements, $i if defined $i;
      !!!next-token;
      return;
    } elsif ({
              a => 1,
              b => 1, big => 1, em => 1, font => 1, i => 1,
              nobr => 1, s => 1, small => 1, strike => 1,
              strong => 1, tt => 1, u => 1,
             }->{$token->{tag_name}}) {
      $formatting_end_tag->($token->{tag_name});
      return;
    } elsif ({
              caption => 1, col => 1, colgroup => 1, frame => 1,
              frameset => 1, head => 1, option => 1, optgroup => 1,
              tbody => 1, td => 1, tfoot => 1, th => 1,
              thead => 1, tr => 1,
              area => 1, basefont => 1, bgsound => 1, br => 1,
              embed => 1, hr => 1, iframe => 1, image => 1,
              img => 1, input => 1, isindex=> 1, noembed => 1,
              noframes => 1, param => 1, select => 1, spacer => 1,
              table => 1, textarea => 1, wbr => 1,
              noscript => 0, ## TODO: if scripting is enabled
             }->{$token->{tag_name}}) {
      !!!parse-error;
      ## Ignore the token
      !!!next-token;
      return;

      ## ISSUE: Issue on HTML5 new elements in spec

    } else {
      ## Any other end tag: walk up the stack of open elements
      ## looking for an element with the same name.

      ## Step 1
      my $node_i = -1;
      my $node = $open_elements->[$node_i];

      ## Step 2
      S2: {
        if ($node->[1] eq $token->{tag_name}) {
          ## Step 1
          ## generate implied end tags
          if ({
               dd => 1, dt => 1, li => 1, p => 1,
               td => 1, th => 1, tr => 1,
              }->{$open_elements->[-1]->[1]}) {
            !!!back-token;
            $token = {type => 'end tag',
                      tag_name => $open_elements->[-1]->[1]}; # MUST
            return;
          }

          ## Step 2
          if ($token->{tag_name} ne $open_elements->[-1]->[1]) {
            !!!parse-error;
          }

          ## Step 3
          splice @$open_elements, $node_i;

          ## NOTE: The end tag has been handled; consume it so that
          ## it is not processed a second time by the caller.
          !!!next-token;
          last S2;
        } else {
          ## Step 3
          if (not $formatting_category->{$node->[1]} and
              #not $phrasing_category->{$node->[1]} and
              ($special_category->{$node->[1]} or
               $scoping_category->{$node->[1]})) {
            !!!parse-error;
            ## Ignore the token
            !!!next-token;
            last S2;
          }
        }

        ## Step 4
        $node_i--;
        $node = $open_elements->[$node_i];

        ## Step 5;
        redo S2;
      } # S2
    }
  }
}; # $in_body
2764
2765 B: {
2766 if ($phase eq 'initial') {
2767 if ($token->{type} eq 'DOCTYPE') {
2768 if ($token->{error}) {
2769 ## ISSUE: Spec currently left this case undefined.
2770 !!!parse-error ('missing DOCTYPE');
2771 }
2772 my $doctype = $self->{document}->create_document_type_definition
2773 ($token->{name});
2774 $self->{document}->append_child ($doctype);
2775 $phase = 'root element';
2776 !!!next-token;
2777 redo B;
2778 } elsif ({
2779 comment => 1,
2780 'start tag' => 1,
2781 'end tag' => 1,
2782 'end-of-file' => 1,
2783 }->{$token->{type}}) {
2784 ## ISSUE: Spec currently left this case undefined.
2785 !!!parse-error ('missing DOCTYPE');
2786 $phase = 'root element';
2787 ## reprocess
2788 redo B;
2789 } elsif ($token->{type} eq 'character') {
2790 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2791 $self->{document}->manakai_append_text ($1);
2792 ## ISSUE: DOM3 Core does not allow Document > Text
2793 unless (length $token->{data}) {
2794 ## Stay in the phase
2795 !!!next-token;
2796 redo B;
2797 }
2798 }
2799 ## ISSUE: Spec currently left this case undefined.
2800 !!!parse-error ('missing DOCTYPE');
2801 $phase = 'root element';
2802 ## reprocess
2803 redo B;
2804 } else {
2805 die "$0: $token->{type}: Unknown token";
2806 }
2807 } elsif ($phase eq 'root element') {
2808 if ($token->{type} eq 'DOCTYPE') {
2809 !!!parse-error;
2810 ## Ignore the token
2811 ## Stay in the phase
2812 !!!next-token;
2813 redo B;
2814 } elsif ($token->{type} eq 'comment') {
2815 my $comment = $self->{document}->create_comment ($token->{data});
2816 $self->{document}->append_child ($comment);
2817 ## Stay in the phase
2818 !!!next-token;
2819 redo B;
2820 } elsif ($token->{type} eq 'character') {
2821 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2822 $self->{document}->manakai_append_text ($1);
2823 ## ISSUE: DOM3 Core does not allow Document > Text
2824 unless (length $token->{data}) {
2825 ## Stay in the phase
2826 !!!next-token;
2827 redo B;
2828 }
2829 }
2830 #
2831 } elsif ({
2832 'start tag' => 1,
2833 'end tag' => 1,
2834 'end-of-file' => 1,
2835 }->{$token->{type}}) {
2836 ## ISSUE: There is an issue in the spec
2837 #
2838 } else {
2839 die "$0: $token->{type}: Unknown token";
2840 }
2841 my $root_element; !!!create-element ($root_element, 'html');
2842 $self->{document}->append_child ($root_element);
2843 $open_elements = [[$root_element, 'html']];
2844 $phase = 'main';
2845 ## reprocess
2846 redo B;
2847 } elsif ($phase eq 'main') {
2848 if ($token->{type} eq 'DOCTYPE') {
2849 !!!parse-error;
2850 ## Ignore the token
2851 ## Stay in the phase
2852 !!!next-token;
2853 redo B;
2854 } elsif ($token->{type} eq 'start tag' and
2855 $token->{tag_name} eq 'html') {
2856 ## TODO: unless it is the first start tag token, parse-error
2857 my $top_el = $open_elements->[0]->[0];
2858 for my $attr_name (keys %{$token->{attributes}}) {
2859 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
2860 $top_el->set_attribute_ns
2861 (undef, [undef, $attr_name],
2862 $token->{attributes}->{$attr_name}->{value});
2863 }
2864 }
2865 !!!next-token;
2866 redo B;
2867 } elsif ($token->{type} eq 'end-of-file') {
2868 ## Generate implied end tags
2869 if ({
2870 dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
2871 }->{$open_elements->[-1]->[1]}) {
2872 !!!back-token;
2873 $token = {type => 'end tag', tag_name => $open_elements->[-1]->[1]};
2874 redo B;
2875 }
2876
2877 if (@$open_elements > 2 or
2878 (@$open_elements == 2 and $open_elements->[1]->[1] ne 'body')) {
2879 !!!parse-error;
2880 } else {
2881 ## TODO: inner_html parser and @$open_elements > 1 and $open_elements->[1] ne 'body', then parse-error
2882 }
2883
2884 ## Stop parsing
2885 last B;
2886
2887 ## ISSUE: There is an issue in the spec.
2888 } else {
2889 if ($insertion_mode eq 'before head') {
2890 if ($token->{type} eq 'character') {
2891 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2892 $open_elements->[-1]->[0]->manakai_append_text ($1);
2893 unless (length $token->{data}) {
2894 !!!next-token;
2895 redo B;
2896 }
2897 }
2898 ## As if <head>
2899 !!!create-element ($head_element, 'head');
2900 $open_elements->[-1]->[0]->append_child ($head_element);
2901 push @$open_elements, [$head_element, 'head'];
2902 $insertion_mode = 'in head';
2903 ## reprocess
2904 redo B;
2905 } elsif ($token->{type} eq 'comment') {
2906 my $comment = $self->{document}->create_comment ($token->{data});
2907 $open_elements->[-1]->[0]->append_child ($comment);
2908 !!!next-token;
2909 redo B;
2910 } elsif ($token->{type} eq 'start tag') {
2911 my $attr = $token->{tag_name} eq 'head' ? $token->{attributes} : {};
2912 !!!create-element ($head_element, 'head', $attr);
2913 $open_elements->[-1]->[0]->append_child ($head_element);
2914 push @$open_elements, [$head_element, 'head'];
2915 $insertion_mode = 'in head';
2916 if ($token->{tag_name} eq 'head') {
2917 !!!next-token;
2918 #} elsif ({
2919 # base => 1, link => 1, meta => 1,
2920 # script => 1, style => 1, title => 1,
2921 # }->{$token->{tag_name}}) {
2922 # ## reprocess
2923 } else {
2924 ## reprocess
2925 }
2926 redo B;
2927 } elsif ($token->{type} eq 'end tag') {
2928 if ($token->{tag_name} eq 'html') {
2929 ## As if <head>
2930 !!!create-element ($head_element, 'head');
2931 $open_elements->[-1]->[0]->append_child ($head_element);
2932 push @$open_elements, [$head_element, 'head'];
2933 $insertion_mode = 'in head';
2934 ## reprocess
2935 redo B;
2936 } else {
2937 !!!parse-error;
2938 ## Ignore the token
2939 !!!next-token;
2940 redo B;
2941 }
2942 } else {
2943 die "$0: $token->{type}: Unknown type";
2944 }
2945 } elsif ($insertion_mode eq 'in head') {
2946 if ($token->{type} eq 'character') {
2947 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2948 $open_elements->[-1]->[0]->manakai_append_text ($1);
2949 unless (length $token->{data}) {
2950 !!!next-token;
2951 redo B;
2952 }
2953 }
2954
2955 #
2956 } elsif ($token->{type} eq 'comment') {
2957 my $comment = $self->{document}->create_comment ($token->{data});
2958 $open_elements->[-1]->[0]->append_child ($comment);
2959 !!!next-token;
2960 redo B;
2961 } elsif ($token->{type} eq 'start tag') {
2962 if ($token->{tag_name} eq 'title') {
2963 ## NOTE: There is an "as if in head" code clone
2964 my $title_el;
2965 !!!create-element ($title_el, 'title', $token->{attributes});
2966 (defined $head_element ? $head_element : $open_elements->[-1]->[0])
2967 ->append_child ($title_el);
2968 $self->{content_model_flag} = 'RCDATA';
2969
2970 my $text = '';
2971 !!!next-token;
2972 while ($token->{type} eq 'character') {
2973 $text .= $token->{data};
2974 !!!next-token;
2975 }
2976 if (length $text) {
2977 $title_el->manakai_append_text ($text);
2978 }
2979
2980 $self->{content_model_flag} = 'PCDATA';
2981
2982 if ($token->{type} eq 'end tag' and
2983 $token->{tag_name} eq 'title') {
2984 ## Ignore the token
2985 } else {
2986 !!!parse-error;
2987 ## ISSUE: And ignore?
2988 }
2989 !!!next-token;
2990 redo B;
2991 } elsif ($token->{tag_name} eq 'style') {
2992 $style_start_tag->();
2993 redo B;
2994 } elsif ($token->{tag_name} eq 'script') {
2995 $script_start_tag->();
2996 redo B;
2997 } elsif ({base => 1, link => 1, meta => 1}->{$token->{tag_name}}) {
2998 ## NOTE: There are "as if in head" code clones
2999 my $el;
3000 !!!create-element ($el, $token->{tag_name}, $token->{attributes});
3001 (defined $head_element ? $head_element : $open_elements->[-1]->[0])
3002 ->append_child ($el);
3003
3004 !!!next-token;
3005 redo B;
3006 } elsif ($token->{tag_name} eq 'head') {
3007 !!!parse-error;
3008 ## Ignore the token
3009 !!!next-token;
3010 redo B;
3011 } else {
3012 #
3013 }
3014 } elsif ($token->{type} eq 'end tag') {
3015 if ($token->{tag_name} eq 'head') {
3016 if ($open_elements->[-1]->[1] eq 'head') {
3017 pop @$open_elements;
3018 } else {
3019 !!!parse-error;
3020 }
3021 $insertion_mode = 'after head';
3022 !!!next-token;
3023 redo B;
3024 } elsif ($token->{tag_name} eq 'html') {
3025 #
3026 } else {
3027 !!!parse-error;
3028 ## Ignore the token
3029 !!!next-token;
3030 redo B;
3031 }
3032 } else {
3033 #
3034 }
3035
3036 if ($open_elements->[-1]->[1] eq 'head') {
3037 ## As if </head>
3038 pop @$open_elements;
3039 }
3040 $insertion_mode = 'after head';
3041 ## reprocess
3042 redo B;
3043
3044 ## ISSUE: An issue in the spec.
3045 } elsif ($insertion_mode eq 'after head') {
3046 if ($token->{type} eq 'character') {
3047 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3048 $open_elements->[-1]->[0]->manakai_append_text ($1);
3049 unless (length $token->{data}) {
3050 !!!next-token;
3051 redo B;
3052 }
3053 }
3054
3055 #
3056 } elsif ($token->{type} eq 'comment') {
3057 my $comment = $self->{document}->create_comment ($token->{data});
3058 $open_elements->[-1]->[0]->append_child ($comment);
3059 !!!next-token;
3060 redo B;
3061 } elsif ($token->{type} eq 'start tag') {
3062 if ($token->{tag_name} eq 'body') {
3063 !!!insert-element ('body', $token->{attributes});
3064 $insertion_mode = 'in body';
3065 !!!next-token;
3066 redo B;
3067 } elsif ($token->{tag_name} eq 'frameset') {
3068 !!!insert-element ('frameset', $token->{attributes});
3069 $insertion_mode = 'in frameset';
3070 !!!next-token;
3071 redo B;
3072 } elsif ({
3073 base => 1, link => 1, meta => 1,
3074 script=> 1, style => 1, title => 1,
3075 }->{$token->{tag_name}}) {
3076 !!!parse-error;
3077 $insertion_mode = 'in head';
3078 ## reprocess
3079 redo B;
3080 } else {
3081 #
3082 }
3083 } else {
3084 #
3085 }
3086
3087 ## As if <body>
3088 !!!insert-element ('body');
3089 $insertion_mode = 'in body';
3090 ## reprocess
3091 redo B;
3092 } elsif ($insertion_mode eq 'in body') {
3093 if ($token->{type} eq 'character') {
3094 ## NOTE: There is a code clone of "character in body".
3095 $reconstruct_active_formatting_elements->($insert_to_current);
3096
3097 $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
3098
3099 !!!next-token;
3100 redo B;
3101 } elsif ($token->{type} eq 'comment') {
3102 ## NOTE: There is a code clone of "comment in body".
3103 my $comment = $self->{document}->create_comment ($token->{data});
3104 $open_elements->[-1]->[0]->append_child ($comment);
3105 !!!next-token;
3106 redo B;
3107 } else {
3108 $in_body->($insert_to_current);
3109 redo B;
3110 }
3111 } elsif ($insertion_mode eq 'in table') {
3112 if ($token->{type} eq 'character') {
3113 ## NOTE: There are "character in table" code clones.
3114 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3115 $open_elements->[-1]->[0]->manakai_append_text ($1);
3116
3117 unless (length $token->{data}) {
3118 !!!next-token;
3119 redo B;
3120 }
3121 }
3122
3123 ## As if in body, but insert into foster parent element
3124 ## ISSUE: Spec says that "whenever a node would be inserted
3125 ## into the current node" while characters might not be
3126 ## result in a new Text node.
3127 $reconstruct_active_formatting_elements->($insert_to_foster);
3128
3129 if ({
3130 table => 1, tbody => 1, tfoot => 1,
3131 thead => 1, tr => 1,
3132 }->{$open_elements->[-1]->[1]}) {
3133 # MUST
3134 my $foster_parent_element;
3135 my $next_sibling;
3136 my $prev_sibling;
3137 OE: for (reverse 0..$#$open_elements) {
3138 if ($open_elements->[$_]->[1] eq 'table') {
3139 my $parent = $open_elements->[$_]->[0]->parent_node;
3140 if (defined $parent and $parent->node_type == 1) {
3141 $foster_parent_element = $parent;
3142 $next_sibling = $open_elements->[$_]->[0];
3143 $prev_sibling = $next_sibling->previous_sibling;
3144 } else {
3145 $foster_parent_element = $open_elements->[$_ - 1]->[0];
3146 $prev_sibling = $foster_parent_element->last_child;
3147 }
3148 last OE;
3149 }
3150 } # OE
3151 $foster_parent_element = $open_elements->[0]->[0] and
3152 $prev_sibling = $foster_parent_element->last_child
3153 unless defined $foster_parent_element;
3154 if (defined $prev_sibling and
3155 $prev_sibling->node_type == 3) {
3156 $prev_sibling->manakai_append_text ($token->{data});
3157 } else {
3158 $foster_parent_element->insert_before
3159 ($self->{document}->create_text_node ($token->{data}),
3160 $next_sibling);
3161 }
3162 } else {
3163 $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
3164 }
3165
3166 !!!next-token;
3167 redo B;
3168 } elsif ($token->{type} eq 'comment') {
3169 my $comment = $self->{document}->create_comment ($token->{data});
3170 $open_elements->[-1]->[0]->append_child ($comment);
3171 !!!next-token;
3172 redo B;
3173 } elsif ($token->{type} eq 'start tag') {
3174 if ({
3175 caption => 1,
3176 colgroup => 1,
3177 tbody => 1, tfoot => 1, thead => 1,
3178 }->{$token->{tag_name}}) {
3179 ## Clear back to table context
3180 while ($open_elements->[-1]->[1] ne 'table' and
3181 $open_elements->[-1]->[1] ne 'html') {
3182 !!!parse-error;
3183 pop @$open_elements;
3184 }
3185
3186 push @$active_formatting_elements, ['#marker', '']
3187 if $token->{tag_name} eq 'caption';
3188
3189 !!!insert-element ($token->{tag_name}, $token->{attributes});
3190 $insertion_mode = {
3191 caption => 'in caption',
3192 colgroup => 'in column group',
3193 tbody => 'in table body',
3194 tfoot => 'in table body',
3195 thead => 'in table body',
3196 }->{$token->{tag_name}};
3197 !!!next-token;
3198 redo B;
3199 } elsif ({
3200 col => 1,
3201 td => 1, th => 1, tr => 1,
3202 }->{$token->{tag_name}}) {
3203 ## Clear back to table context
3204 while ($open_elements->[-1]->[1] ne 'table' and
3205 $open_elements->[-1]->[1] ne 'html') {
3206 !!!parse-error;
3207 pop @$open_elements;
3208 }
3209
3210 !!!insert-element ($token->{tag_name} eq 'col' ? 'colgroup' : 'tbody');
3211 $insertion_mode = $token->{tag_name} eq 'col'
3212 ? 'in column group' : 'in table body';
3213 ## reprocess
3214 redo B;
3215 } elsif ($token->{tag_name} eq 'table') {
3216 ## NOTE: There are code clones for this "table in table"
3217 !!!parse-error;
3218
3219 ## As if </table>
3220 ## have a table element in table scope
3221 my $i;
3222 INSCOPE: for (reverse 0..$#$open_elements) {
3223 my $node = $open_elements->[$_];
3224 if ($node->[1] eq 'table') {
3225 $i = $_;
3226 last INSCOPE;
3227 } elsif ({
3228 table => 1, html => 1,
3229 }->{$node->[1]}) {
3230 last INSCOPE;
3231 }
3232 } # INSCOPE
3233 unless (defined $i) {
3234 !!!parse-error;
3235 ## Ignore tokens </table><table>
3236 !!!next-token;
3237 redo B;
3238 }
3239
3240 ## generate implied end tags
3241 if ({
3242 dd => 1, dt => 1, li => 1, p => 1,
3243 td => 1, th => 1, tr => 1,
3244 }->{$open_elements->[-1]->[1]}) {
3245 !!!back-token; # <table>
3246 $token = {type => 'end tag', tag_name => 'table'};
3247 !!!back-token;
3248 $token = {type => 'end tag',
3249 tag_name => $open_elements->[-1]->[1]}; # MUST
3250 redo B;
3251 }
3252
3253 if ($open_elements->[-1]->[1] ne 'table') {
3254 !!!parse-error;
3255 }
3256
3257 splice @$open_elements, $i;
3258
3259 $reset_insertion_mode->();
3260
3261 ## reprocess
3262 redo B;
3263 } else {
3264 #
3265 }
3266 } elsif ($token->{type} eq 'end tag') {
3267 if ($token->{tag_name} eq 'table') {
3268 ## have a table element in table scope
3269 my $i;
3270 INSCOPE: for (reverse 0..$#$open_elements) {
3271 my $node = $open_elements->[$_];
3272 if ($node->[1] eq $token->{tag_name}) {
3273 $i = $_;
3274 last INSCOPE;
3275 } elsif ({
3276 table => 1, html => 1,
3277 }->{$node->[1]}) {
3278 last INSCOPE;
3279 }
3280 } # INSCOPE
3281 unless (defined $i) {
3282 !!!parse-error;
3283 ## Ignore the token
3284 !!!next-token;
3285 redo B;
3286 }
3287
3288 ## generate implied end tags
3289 if ({
3290 dd => 1, dt => 1, li => 1, p => 1,
3291 td => 1, th => 1, tr => 1,
3292 }->{$open_elements->[-1]->[1]}) {
3293 !!!back-token;
3294 $token = {type => 'end tag',
3295 tag_name => $open_elements->[-1]->[1]}; # MUST
3296 redo B;
3297 }
3298
3299 if ($open_elements->[-1]->[1] ne 'table') {
3300 !!!parse-error;
3301 }
3302
3303 splice @$open_elements, $i;
3304
3305 $reset_insertion_mode->();
3306
3307 !!!next-token;
3308 redo B;
3309 } elsif ({
3310 body => 1, caption => 1, col => 1, colgroup => 1,
3311 html => 1, tbody => 1, td => 1, tfoot => 1, th => 1,
3312 thead => 1, tr => 1,
3313 }->{$token->{tag_name}}) {
3314 !!!parse-error;
3315 ## Ignore the token
3316 !!!next-token;
3317 redo B;
3318 } else {
3319 #
3320 }
3321 } else {
3322 #
3323 }
3324
3325 !!!parse-error;
3326 $in_body->($insert_to_foster);
3327 redo B;
3328 } elsif ($insertion_mode eq 'in caption') {
3329 if ($token->{type} eq 'character') {
3330 ## NOTE: This is a code clone of "character in body".
3331 $reconstruct_active_formatting_elements->($insert_to_current);
3332
3333 $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
3334
3335 !!!next-token;
3336 redo B;
3337 } elsif ($token->{type} eq 'comment') {
3338 ## NOTE: This is a code clone of "comment in body".
3339 my $comment = $self->{document}->create_comment ($token->{data});
3340 $open_elements->[-1]->[0]->append_child ($comment);
3341 !!!next-token;
3342 redo B;
3343 } elsif ($token->{type} eq 'start tag') {
3344 if ({
3345 caption => 1, col => 1, colgroup => 1, tbody => 1,
3346 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3347 }->{$token->{tag_name}}) {
3348 !!!parse-error;
3349
3350 ## As if </caption>
3351 ## have a caption element in table scope
3352 my $i;
3353 INSCOPE: for (reverse 0..$#$open_elements) {
3354 my $node = $open_elements->[$_];
3355 if ($node->[1] eq 'caption') {
3356 $i = $_;
3357 last INSCOPE;
3358 } elsif ({
3359 table => 1, html => 1,
3360 }->{$node->[1]}) {
3361 last INSCOPE;
3362 }
3363 } # INSCOPE
3364 unless (defined $i) {
3365 !!!parse-error;
3366 ## Ignore the token
3367 !!!next-token;
3368 redo B;
3369 }
3370
3371 ## generate implied end tags
3372 if ({
3373 dd => 1, dt => 1, li => 1, p => 1,
3374 td => 1, th => 1, tr => 1,
3375 }->{$open_elements->[-1]->[1]}) {
3376 !!!back-token; # <?>
3377 $token = {type => 'end tag', tag_name => 'caption'};
3378 !!!back-token;
3379 $token = {type => 'end tag',
3380 tag_name => $open_elements->[-1]->[1]}; # MUST
3381 redo B;
3382 }
3383
3384 if ($open_elements->[-1]->[1] ne 'caption') {
3385 !!!parse-error;
3386 }
3387
3388 splice @$open_elements, $i;
3389
3390 $clear_up_to_marker->();
3391
3392 $insertion_mode = 'in table';
3393
3394 ## reprocess
3395 redo B;
3396 } else {
3397 #
3398 }
3399 } elsif ($token->{type} eq 'end tag') {
3400 if ($token->{tag_name} eq 'caption') {
3401 ## have a caption element in table scope
3402 my $i;
3403 INSCOPE: for (reverse 0..$#$open_elements) {
3404 my $node = $open_elements->[$_];
3405 if ($node->[1] eq $token->{tag_name}) {
3406 $i = $_;
3407 last INSCOPE;
3408 } elsif ({
3409 table => 1, html => 1,
3410 }->{$node->[1]}) {
3411 last INSCOPE;
3412 }
3413 } # INSCOPE
3414 unless (defined $i) {
3415 !!!parse-error;
3416 ## Ignore the token
3417 !!!next-token;
3418 redo B;
3419 }
3420
3421 ## generate implied end tags
3422 if ({
3423 dd => 1, dt => 1, li => 1, p => 1,
3424 td => 1, th => 1, tr => 1,
3425 }->{$open_elements->[-1]->[1]}) {
3426 !!!back-token;
3427 $token = {type => 'end tag',
3428 tag_name => $open_elements->[-1]->[1]}; # MUST
3429 redo B;
3430 }
3431
3432 if ($open_elements->[-1]->[1] ne 'caption') {
3433 !!!parse-error;
3434 }
3435
3436 splice @$open_elements, $i;
3437
3438 $clear_up_to_marker->();
3439
3440 $insertion_mode = 'in table';
3441
3442 !!!next-token;
3443 redo B;
3444 } elsif ($token->{tag_name} eq 'table') {
3445 !!!parse-error;
3446
3447 ## As if </caption>
3448 ## have a caption element in table scope
3449 my $i;
3450 INSCOPE: for (reverse 0..$#$open_elements) {
3451 my $node = $open_elements->[$_];
3452 if ($node->[1] eq 'caption') {
3453 $i = $_;
3454 last INSCOPE;
3455 } elsif ({
3456 table => 1, html => 1,
3457 }->{$node->[1]}) {
3458 last INSCOPE;
3459 }
3460 } # INSCOPE
3461 unless (defined $i) {
3462 !!!parse-error;
3463 ## Ignore the token
3464 !!!next-token;
3465 redo B;
3466 }
3467
3468 ## generate implied end tags
3469 if ({
3470 dd => 1, dt => 1, li => 1, p => 1,
3471 td => 1, th => 1, tr => 1,
3472 }->{$open_elements->[-1]->[1]}) {
3473 !!!back-token; # </table>
3474 $token = {type => 'end tag', tag_name => 'caption'};
3475 !!!back-token;
3476 $token = {type => 'end tag',
3477 tag_name => $open_elements->[-1]->[1]}; # MUST
3478 redo B;
3479 }
3480
3481 if ($open_elements->[-1]->[1] ne 'caption') {
3482 !!!parse-error;
3483 }
3484
3485 splice @$open_elements, $i;
3486
3487 $clear_up_to_marker->();
3488
3489 $insertion_mode = 'in table';
3490
3491 ## reprocess
3492 redo B;
3493 } elsif ({
3494 body => 1, col => 1, colgroup => 1,
3495 html => 1, tbody => 1, td => 1, tfoot => 1,
3496 th => 1, thead => 1, tr => 1,
3497 }->{$token->{tag_name}}) {
3498 !!!parse-error;
3499 ## Ignore the token.  The token has to be consumed before the loop
3499 ## is redone; otherwise |redo B| would dispatch this very same end
3499 ## tag in the same insertion mode again and never terminate.
3499 !!!next-token;
3500 redo B;
3501 } else {
3502 #
3503 }
3504 } else {
3505 #
3506 }
3507
3508 $in_body->($insert_to_current);
3509 redo B;
3510 } elsif ($insertion_mode eq 'in column group') {
3511 if ($token->{type} eq 'character') {
3512 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3513 $open_elements->[-1]->[0]->manakai_append_text ($1);
3514 unless (length $token->{data}) {
3515 !!!next-token;
3516 redo B;
3517 }
3518 }
3519
3520 #
3521 } elsif ($token->{type} eq 'comment') {
3522 my $comment = $self->{document}->create_comment ($token->{data});
3523 $open_elements->[-1]->[0]->append_child ($comment);
3524 !!!next-token;
3525 redo B;
3526 } elsif ($token->{type} eq 'start tag') {
3527 if ($token->{tag_name} eq 'col') {
3528 !!!insert-element ($token->{tag_name}, $token->{attributes});
3529 pop @$open_elements;
3530 !!!next-token;
3531 redo B;
3532 } else {
3533 #
3534 }
3535 } elsif ($token->{type} eq 'end tag') {
3536 if ($token->{tag_name} eq 'colgroup') {
3537 if ($open_elements->[-1]->[1] eq 'html') {
3538 !!!parse-error;
3539 ## Ignore the token
3540 !!!next-token;
3541 redo B;
3542 } else {
3543 pop @$open_elements; # colgroup
3544 $insertion_mode = 'in table';
3545 !!!next-token;
3546 redo B;
3547 }
3548 } elsif ($token->{tag_name} eq 'col') {
3549 !!!parse-error;
3550 ## Ignore the token
3551 !!!next-token;
3552 redo B;
3553 } else {
3554 #
3555 }
3556 } else {
3557 #
3558 }
3559
3560 ## As if </colgroup>
3561 if ($open_elements->[-1]->[1] eq 'html') {
3562 !!!parse-error;
3563 ## Ignore the token
3564 !!!next-token;
3565 redo B;
3566 } else {
3567 pop @$open_elements; # colgroup
3568 $insertion_mode = 'in table';
3569 ## reprocess
3570 redo B;
3571 }
3572 } elsif ($insertion_mode eq 'in table body') {
3573 if ($token->{type} eq 'character') {
3574 ## NOTE: This is a "character in table" code clone.
3575 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3576 $open_elements->[-1]->[0]->manakai_append_text ($1);
3577
3578 unless (length $token->{data}) {
3579 !!!next-token;
3580 redo B;
3581 }
3582 }
3583
3584 ## As if in body, but insert into foster parent element
3585 ## ISSUE: Spec says that "whenever a node would be inserted
3586 ## into the current node" while characters might not
3587 ## result in a new Text node.
3588 $reconstruct_active_formatting_elements->($insert_to_foster);
3589
3590 if ({
3591 table => 1, tbody => 1, tfoot => 1,
3592 thead => 1, tr => 1,
3593 }->{$open_elements->[-1]->[1]}) {
3594 # MUST
3595 my $foster_parent_element;
3596 my $next_sibling;
3597 my $prev_sibling;
3598 OE: for (reverse 0..$#$open_elements) {
3599 if ($open_elements->[$_]->[1] eq 'table') {
3600 my $parent = $open_elements->[$_]->[0]->parent_node;
3601 if (defined $parent and $parent->node_type == 1) {
3602 $foster_parent_element = $parent;
3603 $next_sibling = $open_elements->[$_]->[0];
3604 $prev_sibling = $next_sibling->previous_sibling;
3605 } else {
3606 $foster_parent_element = $open_elements->[$_ - 1]->[0];
3607 $prev_sibling = $foster_parent_element->last_child;
3608 }
3609 last OE;
3610 }
3611 } # OE
3612 $foster_parent_element = $open_elements->[0]->[0] and
3613 $prev_sibling = $foster_parent_element->last_child
3614 unless defined $foster_parent_element;
3615 if (defined $prev_sibling and
3616 $prev_sibling->node_type == 3) {
3617 $prev_sibling->manakai_append_text ($token->{data});
3618 } else {
3619 $foster_parent_element->insert_before
3620 ($self->{document}->create_text_node ($token->{data}),
3621 $next_sibling);
3622 }
3623 } else {
3624 $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
3625 }
3626
3627 !!!next-token;
3628 redo B;
3629 } elsif ($token->{type} eq 'comment') {
3630 ## Copied from 'in table'
3631 my $comment = $self->{document}->create_comment ($token->{data});
3632 $open_elements->[-1]->[0]->append_child ($comment);
3633 !!!next-token;
3634 redo B;
3635 } elsif ($token->{type} eq 'start tag') {
3636 if ({
3637 tr => 1,
3638 th => 1, td => 1,
3639 }->{$token->{tag_name}}) {
3640 ## Clear back to table body context
3641 while (not {
3642 tbody => 1, tfoot => 1, thead => 1, html => 1,
3643 }->{$open_elements->[-1]->[1]}) {
3644 !!!parse-error;
3645 pop @$open_elements;
3646 }
3647
3648 $insertion_mode = 'in row';
3649 if ($token->{tag_name} eq 'tr') {
3650 !!!insert-element ($token->{tag_name}, $token->{attributes});
3651 !!!next-token;
3652 } else {
3653 !!!insert-element ('tr');
3654 ## reprocess
3655 }
3656 redo B;
3657 } elsif ({
3658 caption => 1, col => 1, colgroup => 1,
3659 tbody => 1, tfoot => 1, thead => 1,
3660 }->{$token->{tag_name}}) {
3661 ## have an element in table scope
3662 my $i;
3663 INSCOPE: for (reverse 0..$#$open_elements) {
3664 my $node = $open_elements->[$_];
3665 if ({
3666 tbody => 1, thead => 1, tfoot => 1,
3667 }->{$node->[1]}) {
3668 $i = $_;
3669 last INSCOPE;
3670 } elsif ({
3671 table => 1, html => 1,
3672 }->{$node->[1]}) {
3673 last INSCOPE;
3674 }
3675 } # INSCOPE
3676 unless (defined $i) {
3677 !!!parse-error;
3678 ## Ignore the token
3679 !!!next-token;
3680 redo B;
3681 }
3682
3683 ## Clear back to table body context
3684 while (not {
3685 tbody => 1, tfoot => 1, thead => 1, html => 1,
3686 }->{$open_elements->[-1]->[1]}) {
3687 !!!parse-error;
3688 pop @$open_elements;
3689 }
3690
3691 ## As if <{current node}>
3692 ## have an element in table scope
3693 ## true by definition
3694
3695 ## Clear back to table body context
3696 ## nop by definition
3697
3698 pop @$open_elements;
3699 $insertion_mode = 'in table';
3700 ## reprocess
3701 redo B;
3702 } elsif ($token->{tag_name} eq 'table') {
3703 ## NOTE: This is a code clone of "table in table"
3704 !!!parse-error;
3705
3706 ## As if </table>
3707 ## have a table element in table scope
3708 my $i;
3709 INSCOPE: for (reverse 0..$#$open_elements) {
3710 my $node = $open_elements->[$_];
3711 if ($node->[1] eq 'table') {
3712 $i = $_;
3713 last INSCOPE;
3714 } elsif ({
3715 table => 1, html => 1,
3716 }->{$node->[1]}) {
3717 last INSCOPE;
3718 }
3719 } # INSCOPE
3720 unless (defined $i) {
3721 !!!parse-error;
3722 ## Ignore tokens </table><table>
3723 !!!next-token;
3724 redo B;
3725 }
3726
3727 ## generate implied end tags
3728 if ({
3729 dd => 1, dt => 1, li => 1, p => 1,
3730 td => 1, th => 1, tr => 1,
3731 }->{$open_elements->[-1]->[1]}) {
3732 !!!back-token; # <table>
3733 $token = {type => 'end tag', tag_name => 'table'};
3734 !!!back-token;
3735 $token = {type => 'end tag',
3736 tag_name => $open_elements->[-1]->[1]}; # MUST
3737 redo B;
3738 }
3739
3740 if ($open_elements->[-1]->[1] ne 'table') {
3741 !!!parse-error;
3742 }
3743
3744 splice @$open_elements, $i;
3745
3746 $reset_insertion_mode->();
3747
3748 ## reprocess
3749 redo B;
3750 } else {
3751 #
3752 }
3753 } elsif ($token->{type} eq 'end tag') {
3754 if ({
3755 tbody => 1, tfoot => 1, thead => 1,
3756 }->{$token->{tag_name}}) {
3757 ## have an element in table scope
3758 my $i;
3759 INSCOPE: for (reverse 0..$#$open_elements) {
3760 my $node = $open_elements->[$_];
3761 if ($node->[1] eq $token->{tag_name}) {
3762 $i = $_;
3763 last INSCOPE;
3764 } elsif ({
3765 table => 1, html => 1,
3766 }->{$node->[1]}) {
3767 last INSCOPE;
3768 }
3769 } # INSCOPE
3770 unless (defined $i) {
3771 !!!parse-error;
3772 ## Ignore the token
3773 !!!next-token;
3774 redo B;
3775 }
3776
3777 ## Clear back to table body context
3778 while (not {
3779 tbody => 1, tfoot => 1, thead => 1, html => 1,
3780 }->{$open_elements->[-1]->[1]}) {
3781 !!!parse-error;
3782 pop @$open_elements;
3783 }
3784
3785 pop @$open_elements;
3786 $insertion_mode = 'in table';
3787 !!!next-token;
3788 redo B;
3789 } elsif ($token->{tag_name} eq 'table') {
3790 ## have an element in table scope
3791 my $i;
3792 INSCOPE: for (reverse 0..$#$open_elements) {
3793 my $node = $open_elements->[$_];
3794 if ({
3795 tbody => 1, thead => 1, tfoot => 1,
3796 }->{$node->[1]}) {
3797 $i = $_;
3798 last INSCOPE;
3799 } elsif ({
3800 table => 1, html => 1,
3801 }->{$node->[1]}) {
3802 last INSCOPE;
3803 }
3804 } # INSCOPE
3805 unless (defined $i) {
3806 !!!parse-error;
3807 ## Ignore the token
3808 !!!next-token;
3809 redo B;
3810 }
3811
3812 ## Clear back to table body context
3813 while (not {
3814 tbody => 1, tfoot => 1, thead => 1, html => 1,
3815 }->{$open_elements->[-1]->[1]}) {
3816 !!!parse-error;
3817 pop @$open_elements;
3818 }
3819
3820 ## As if <{current node}>
3821 ## have an element in table scope
3822 ## true by definition
3823
3824 ## Clear back to table body context
3825 ## nop by definition
3826
3827 pop @$open_elements;
3828 $insertion_mode = 'in table';
3829 ## reprocess
3830 redo B;
3831 } elsif ({
3832 body => 1, caption => 1, col => 1, colgroup => 1,
3833 html => 1, td => 1, th => 1, tr => 1,
3834 }->{$token->{tag_name}}) {
3835 !!!parse-error;
3836 ## Ignore the token
3837 !!!next-token;
3838 redo B;
3839 } else {
3840 #
3841 }
3842 } else {
3843 #
3844 }
3845
3846 ## As if in table
3847 !!!parse-error;
3848 $in_body->($insert_to_foster);
3849 redo B;
3850 } elsif ($insertion_mode eq 'in row') {
3851 if ($token->{type} eq 'character') {
3852 ## NOTE: This is a "character in table" code clone.
3853 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3854 $open_elements->[-1]->[0]->manakai_append_text ($1);
3855
3856 unless (length $token->{data}) {
3857 !!!next-token;
3858 redo B;
3859 }
3860 }
3861
3862 ## As if in body, but insert into foster parent element
3863 ## ISSUE: Spec says that "whenever a node would be inserted
3864 ## into the current node" while characters might not
3865 ## result in a new Text node.
3866 $reconstruct_active_formatting_elements->($insert_to_foster);
3867
3868 if ({
3869 table => 1, tbody => 1, tfoot => 1,
3870 thead => 1, tr => 1,
3871 }->{$open_elements->[-1]->[1]}) {
3872 # MUST
3873 my $foster_parent_element;
3874 my $next_sibling;
3875 my $prev_sibling;
3876 OE: for (reverse 0..$#$open_elements) {
3877 if ($open_elements->[$_]->[1] eq 'table') {
3878 my $parent = $open_elements->[$_]->[0]->parent_node;
3879 if (defined $parent and $parent->node_type == 1) {
3880 $foster_parent_element = $parent;
3881 $next_sibling = $open_elements->[$_]->[0];
3882 $prev_sibling = $next_sibling->previous_sibling;
3883 } else {
3884 $foster_parent_element = $open_elements->[$_ - 1]->[0];
3885 $prev_sibling = $foster_parent_element->last_child;
3886 }
3887 last OE;
3888 }
3889 } # OE
3890 $foster_parent_element = $open_elements->[0]->[0] and
3891 $prev_sibling = $foster_parent_element->last_child
3892 unless defined $foster_parent_element;
3893 if (defined $prev_sibling and
3894 $prev_sibling->node_type == 3) {
3895 $prev_sibling->manakai_append_text ($token->{data});
3896 } else {
3897 $foster_parent_element->insert_before
3898 ($self->{document}->create_text_node ($token->{data}),
3899 $next_sibling);
3900 }
3901 } else {
3902 $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
3903 }
3904
3905 !!!next-token;
3906 redo B;
3907 } elsif ($token->{type} eq 'comment') {
3908 ## Copied from 'in table'
3909 my $comment = $self->{document}->create_comment ($token->{data});
3910 $open_elements->[-1]->[0]->append_child ($comment);
3911 !!!next-token;
3912 redo B;
3913 } elsif ($token->{type} eq 'start tag') {
3914 if ($token->{tag_name} eq 'th' or
3915 $token->{tag_name} eq 'td') {
3916 ## Clear back to table row context
3917 while (not {
3918 tr => 1, html => 1,
3919 }->{$open_elements->[-1]->[1]}) {
3920 !!!parse-error;
3921 pop @$open_elements;
3922 }
3923
3924 !!!insert-element ($token->{tag_name}, $token->{attributes});
3925 $insertion_mode = 'in cell';
3926
3927 push @$active_formatting_elements, ['#marker', ''];
3928
3929 !!!next-token;
3930 redo B;
3931 } elsif ({
3932 caption => 1, col => 1, colgroup => 1,
3933 tbody => 1, tfoot => 1, thead => 1, tr => 1,
3934 }->{$token->{tag_name}}) {
3935 ## As if </tr>
3936 ## have an element in table scope
3937 my $i;
3938 INSCOPE: for (reverse 0..$#$open_elements) {
3939 my $node = $open_elements->[$_];
3940 if ($node->[1] eq 'tr') {
3941 $i = $_;
3942 last INSCOPE;
3943 } elsif ({
3944 table => 1, html => 1,
3945 }->{$node->[1]}) {
3946 last INSCOPE;
3947 }
3948 } # INSCOPE
3949 unless (defined $i) {
3950 !!!parse-error;
3951 ## Ignore the token
3952 !!!next-token;
3953 redo B;
3954 }
3955
3956 ## Clear back to table row context
3957 while (not {
3958 tr => 1, html => 1,
3959 }->{$open_elements->[-1]->[1]}) {
3960 !!!parse-error;
3961 pop @$open_elements;
3962 }
3963
3964 pop @$open_elements; # tr
3965 $insertion_mode = 'in table body';
3966 ## reprocess
3967 redo B;
3968 } elsif ($token->{tag_name} eq 'table') {
3969 ## NOTE: This is a code clone of "table in table"
3970 !!!parse-error;
3971
3972 ## As if </table>
3973 ## have a table element in table scope
3974 my $i;
3975 INSCOPE: for (reverse 0..$#$open_elements) {
3976 my $node = $open_elements->[$_];
3977 if ($node->[1] eq 'table') {
3978 $i = $_;
3979 last INSCOPE;
3980 } elsif ({
3981 table => 1, html => 1,
3982 }->{$node->[1]}) {
3983 last INSCOPE;
3984 }
3985 } # INSCOPE
3986 unless (defined $i) {
3987 !!!parse-error;
3988 ## Ignore tokens </table><table>
3989 !!!next-token;
3990 redo B;
3991 }
3992
3993 ## generate implied end tags
3994 if ({
3995 dd => 1, dt => 1, li => 1, p => 1,
3996 td => 1, th => 1, tr => 1,
3997 }->{$open_elements->[-1]->[1]}) {
3998 !!!back-token; # <table>
3999 $token = {type => 'end tag', tag_name => 'table'};
4000 !!!back-token;
4001 $token = {type => 'end tag',
4002 tag_name => $open_elements->[-1]->[1]}; # MUST
4003 redo B;
4004 }
4005
4006 if ($open_elements->[-1]->[1] ne 'table') {
4007 !!!parse-error;
4008 }
4009
4010 splice @$open_elements, $i;
4011
4012 $reset_insertion_mode->();
4013
4014 ## reprocess
4015 redo B;
4016 } else {
4017 #
4018 }
4019 } elsif ($token->{type} eq 'end tag') {
4020 if ($token->{tag_name} eq 'tr') {
4021 ## have an element in table scope
4022 my $i;
4023 INSCOPE: for (reverse 0..$#$open_elements) {
4024 my $node = $open_elements->[$_];
4025 if ($node->[1] eq $token->{tag_name}) {
4026 $i = $_;
4027 last INSCOPE;
4028 } elsif ({
4029 table => 1, html => 1,
4030 }->{$node->[1]}) {
4031 last INSCOPE;
4032 }
4033 } # INSCOPE
4034 unless (defined $i) {
4035 !!!parse-error;
4036 ## Ignore the token
4037 !!!next-token;
4038 redo B;
4039 }
4040
4041 ## Clear back to table row context
4042 while (not {
4043 tr => 1, html => 1,
4044 }->{$open_elements->[-1]->[1]}) {
4045 !!!parse-error;
4046 pop @$open_elements;
4047 }
4048
4049 pop @$open_elements; # tr
4050 $insertion_mode = 'in table body';
4051 !!!next-token;
4052 redo B;
4053 } elsif ($token->{tag_name} eq 'table') {
4054 ## As if </tr>
4055 ## have an element in table scope
4056 my $i;
4057 INSCOPE: for (reverse 0..$#$open_elements) {
4058 my $node = $open_elements->[$_];
4059 if ($node->[1] eq 'tr') {
4060 $i = $_;
4061 last INSCOPE;
4062 } elsif ({
4063 table => 1, html => 1,
4064 }->{$node->[1]}) {
4065 last INSCOPE;
4066 }
4067 } # INSCOPE
4068 unless (defined $i) {
4069 !!!parse-error;
4070 ## Ignore the token
4071 !!!next-token;
4072 redo B;
4073 }
4074
4075 ## Clear back to table row context
4076 while (not {
4077 tr => 1, html => 1,
4078 }->{$open_elements->[-1]->[1]}) {
4079 !!!parse-error;
4080 pop @$open_elements;
4081 }
4082
4083 pop @$open_elements; # tr
4084 $insertion_mode = 'in table body';
4085 ## reprocess
4086 redo B;
4087 } elsif ({
4088 tbody => 1, tfoot => 1, thead => 1,
4089 }->{$token->{tag_name}}) {
4090 ## have an element in table scope
4091 my $i;
4092 INSCOPE: for (reverse 0..$#$open_elements) {
4093 my $node = $open_elements->[$_];
4094 if ($node->[1] eq $token->{tag_name}) {
4095 $i = $_;
4096 last INSCOPE;
4097 } elsif ({
4098 table => 1, html => 1,
4099 }->{$node->[1]}) {
4100 last INSCOPE;
4101 }
4102 } # INSCOPE
4103 unless (defined $i) {
4104 !!!parse-error;
4105 ## Ignore the token
4106 !!!next-token;
4107 redo B;
4108 }
4109
4110 ## As if </tr>
4111 ## have an element in table scope
4111 ## NOTE: |$i| is already declared by the scan just above; reset it
4111 ## rather than redeclaring it with |my| in the same scope, so that
4111 ## this second scan starts from an undefined index.
4112 undef $i;
4113 INSCOPE: for (reverse 0..$#$open_elements) {
4114 my $node = $open_elements->[$_];
4115 if ($node->[1] eq 'tr') {
4116 $i = $_;
4117 last INSCOPE;
4118 } elsif ({
4119 table => 1, html => 1,
4120 }->{$node->[1]}) {
4121 last INSCOPE;
4122 }
4123 } # INSCOPE
4124 unless (defined $i) {
4125 !!!parse-error;
4126 ## Ignore the token
4127 !!!next-token;
4128 redo B;
4129 }
4130
4131 ## Clear back to table row context
4132 while (not {
4133 tr => 1, html => 1,
4134 }->{$open_elements->[-1]->[1]}) {
4135 !!!parse-error;
4136 pop @$open_elements;
4137 }
4138
4139 pop @$open_elements; # tr
4140 $insertion_mode = 'in table body';
4141 ## reprocess
4142 redo B;
4143 } elsif ({
4144 body => 1, caption => 1, col => 1,
4145 colgroup => 1, html => 1, td => 1, th => 1,
4146 }->{$token->{tag_name}}) {
4147 !!!parse-error;
4148 ## Ignore the token
4149 !!!next-token;
4150 redo B;
4151 } else {
4152 #
4153 }
4154 } else {
4155 #
4156 }
4157
4158 ## As if in table
4159 !!!parse-error;
4160 $in_body->($insert_to_foster);
4161 redo B;
4162 } elsif ($insertion_mode eq 'in cell') {
4163 if ($token->{type} eq 'character') {
4164 ## NOTE: This is a code clone of "character in body".
4165 $reconstruct_active_formatting_elements->($insert_to_current);
4166
4167 $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4168
4169 !!!next-token;
4170 redo B;
4171 } elsif ($token->{type} eq 'comment') {
4172 ## NOTE: This is a code clone of "comment in body".
4173 my $comment = $self->{document}->create_comment ($token->{data});
4174 $open_elements->[-1]->[0]->append_child ($comment);
4175 !!!next-token;
4176 redo B;
4177 } elsif ($token->{type} eq 'start tag') {
4178 if ({
4179 caption => 1, col => 1, colgroup => 1,
4180 tbody => 1, td => 1, tfoot => 1, th => 1,
4181 thead => 1, tr => 1,
4182 }->{$token->{tag_name}}) {
4183 ## have an element in table scope
4184 my $tn;
4185 INSCOPE: for (reverse 0..$#$open_elements) {
4186 my $node = $open_elements->[$_];
4187 if ($node->[1] eq 'td' or $node->[1] eq 'th') {
4188 $tn = $node->[1];
4189 last INSCOPE;
4190 } elsif ({
4191 table => 1, html => 1,
4192 }->{$node->[1]}) {
4193 last INSCOPE;
4194 }
4195 } # INSCOPE
4196 unless (defined $tn) {
4197 !!!parse-error;
4198 ## Ignore the token
4199 !!!next-token;
4200 redo B;
4201 }
4202
4203 ## Close the cell
4204 !!!back-token; # <?>
4205 $token = {type => 'end tag', tag_name => $tn};
4206 redo B;
4207 } else {
4208 #
4209 }
4210 } elsif ($token->{type} eq 'end tag') {
4211 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
4212 ## have an element in table scope
4213 my $i;
4214 INSCOPE: for (reverse 0..$#$open_elements) {
4215 my $node = $open_elements->[$_];
4216 if ($node->[1] eq $token->{tag_name}) {
4217 $i = $_;
4218 last INSCOPE;
4219 } elsif ({
4220 table => 1, html => 1,
4221 }->{$node->[1]}) {
4222 last INSCOPE;
4223 }
4224 } # INSCOPE
4225 unless (defined $i) {
4226 !!!parse-error;
4227 ## Ignore the token
4228 !!!next-token;
4229 redo B;
4230 }
4231
4232 ## generate implied end tags
4233 if ({
4234 dd => 1, dt => 1, li => 1, p => 1,
4235 td => ($token->{tag_name} eq 'th'),
4236 th => ($token->{tag_name} eq 'td'),
4237 tr => 1,
4238 }->{$open_elements->[-1]->[1]}) {
4239 !!!back-token;
4240 $token = {type => 'end tag',
4241 tag_name => $open_elements->[-1]->[1]}; # MUST
4242 redo B;
4243 }
4244
4245 if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
4246 !!!parse-error;
4247 }
4248
4249 splice @$open_elements, $i;
4250
4251 $clear_up_to_marker->();
4252
4253 $insertion_mode = 'in row';
4254
4255 !!!next-token;
4256 redo B;
4257 } elsif ({
4258 body => 1, caption => 1, col => 1,
4259 colgroup => 1, html => 1,
4260 }->{$token->{tag_name}}) {
4261 !!!parse-error;
4262 ## Ignore the token
4263 !!!next-token;
4264 redo B;
4265 } elsif ({
4266 table => 1, tbody => 1, tfoot => 1,
4267 thead => 1, tr => 1,
4268 }->{$token->{tag_name}}) {
4269 ## have an element in table scope
4270 my $i;
4271 my $tn;
4272 INSCOPE: for (reverse 0..$#$open_elements) {
4273 my $node = $open_elements->[$_];
4274 if ($node->[1] eq $token->{tag_name}) {
4275 $i = $_;
4276 last INSCOPE;
4277 } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
4278 $tn = $node->[1];
4279 ## NOTE: There is exactly one |td| or |th| element
4280 ## in scope in the stack of open elements by definition.
4281 } elsif ({
4282 table => 1, html => 1,
4283 }->{$node->[1]}) {
4284 last INSCOPE;
4285 }
4286 } # INSCOPE
4287 unless (defined $i) {
4288 !!!parse-error;
4289 ## Ignore the token
4290 !!!next-token;
4291 redo B;
4292 }
4293
4294 ## Close the cell
4295 !!!back-token; # </?>
4296 $token = {type => 'end tag', tag_name => $tn};
4297 redo B;
4298 } else {
4299 #
4300 }
4301 } else {
4302 #
4303 }
4304
4305 $in_body->($insert_to_current);
4306 redo B;
4307 } elsif ($insertion_mode eq 'in select') {
4308 if ($token->{type} eq 'character') {
4309 $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4310 !!!next-token;
4311 redo B;
4312 } elsif ($token->{type} eq 'comment') {
4313 my $comment = $self->{document}->create_comment ($token->{data});
4314 $open_elements->[-1]->[0]->append_child ($comment);
4315 !!!next-token;
4316 redo B;
4317 } elsif ($token->{type} eq 'start tag') {
4318 if ($token->{tag_name} eq 'option') {
4319 if ($open_elements->[-1]->[1] eq 'option') {
4320 ## As if </option>
4321 pop @$open_elements;
4322 }
4323
4324 !!!insert-element ($token->{tag_name}, $token->{attributes});
4325 !!!next-token;
4326 redo B;
4327 } elsif ($token->{tag_name} eq 'optgroup') {
4328 if ($open_elements->[-1]->[1] eq 'option') {
4329 ## As if </option>
4330 pop @$open_elements;
4331 }
4332
4333 if ($open_elements->[-1]->[1] eq 'optgroup') {
4334 ## As if </optgroup>
4335 pop @$open_elements;
4336 }
4337
4338 !!!insert-element ($token->{tag_name}, $token->{attributes});
4339 !!!next-token;
4340 redo B;
4341 } elsif ($token->{tag_name} eq 'select') {
4342 !!!parse-error;
4343 ## As if </select> instead
4344 ## have an element in table scope
4345 my $i;
4346 INSCOPE: for (reverse 0..$#$open_elements) {
4347 my $node = $open_elements->[$_];
4348 if ($node->[1] eq $token->{tag_name}) {
4349 $i = $_;
4350 last INSCOPE;
4351 } elsif ({
4352 table => 1, html => 1,
4353 }->{$node->[1]}) {
4354 last INSCOPE;
4355 }
4356 } # INSCOPE
4357 unless (defined $i) {
4358 !!!parse-error;
4359 ## Ignore the token
4360 !!!next-token;
4361 redo B;
4362 }
4363
4364 splice @$open_elements, $i;
4365
4366 $reset_insertion_mode->();
4367
4368 !!!next-token;
4369 redo B;
4370 } else {
4371 #
4372 }
4373 } elsif ($token->{type} eq 'end tag') {
4374 if ($token->{tag_name} eq 'optgroup') {
4375 if ($open_elements->[-1]->[1] eq 'option' and
4376 $open_elements->[-2]->[1] eq 'optgroup') {
4377 ## As if </option>
4378 splice @$open_elements, -2;
4379 } elsif ($open_elements->[-1]->[1] eq 'optgroup') {
4380 pop @$open_elements;
4381 } else {
4382 !!!parse-error;
4383 ## Ignore the token
4384 }
4385 !!!next-token;
4386 redo B;
4387 } elsif ($token->{tag_name} eq 'option') {
4388 if ($open_elements->[-1]->[1] eq 'option') {
4389 pop @$open_elements;
4390 } else {
4391 !!!parse-error;
4392 ## Ignore the token
4393 }
4394 !!!next-token;
4395 redo B;
4396 } elsif ($token->{tag_name} eq 'select') {
4397 ## have an element in table scope
4398 my $i;
4399 INSCOPE: for (reverse 0..$#$open_elements) {
4400 my $node = $open_elements->[$_];
4401 if ($node->[1] eq $token->{tag_name}) {
4402 $i = $_;
4403 last INSCOPE;
4404 } elsif ({
4405 table => 1, html => 1,
4406 }->{$node->[1]}) {
4407 last INSCOPE;
4408 }
4409 } # INSCOPE
4410 unless (defined $i) {
4411 !!!parse-error;
4412 ## Ignore the token
4413 !!!next-token;
4414 redo B;
4415 }
4416
4417 splice @$open_elements, $i;
4418
4419 $reset_insertion_mode->();
4420
4421 !!!next-token;
4422 redo B;
4423 } elsif ({
4424 caption => 1, table => 1, tbody => 1,
4425 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
4426 }->{$token->{tag_name}}) {
4427 !!!parse-error;
4428
4429 ## have an element in table scope
4430 my $i;
4431 INSCOPE: for (reverse 0..$#$open_elements) {
4432 my $node = $open_elements->[$_];
4433 if ($node->[1] eq $token->{tag_name}) {
4434 $i = $_;
4435 last INSCOPE;
4436 } elsif ({
4437 table => 1, html => 1,
4438 }->{$node->[1]}) {
4439 last INSCOPE;
4440 }
4441 } # INSCOPE
4442 unless (defined $i) {
4443 ## Ignore the token
4444 !!!next-token;
4445 redo B;
4446 }
4447
4448 ## As if </select>
4449 ## have an element in table scope
4450 undef $i;
4451 INSCOPE: for (reverse 0..$#$open_elements) {
4452 my $node = $open_elements->[$_];
4453 if ($node->[1] eq 'select') {
4454 $i = $_;
4455 last INSCOPE;
4456 } elsif ({
4457 table => 1, html => 1,
4458 }->{$node->[1]}) {
4459 last INSCOPE;
4460 }
4461 } # INSCOPE
4462 unless (defined $i) {
4463 !!!parse-error;
4464 ## Ignore the </select> token
4465 !!!next-token; ## TODO: ok?
4466 redo B;
4467 }
4468
4469 splice @$open_elements, $i;
4470
4471 $reset_insertion_mode->();
4472
4473 ## reprocess
4474 redo B;
4475 } else {
4476 #
4477 }
4478 } else {
4479 #
4480 }
4481
4482 !!!parse-error;
4483 ## Ignore the token
4484 !!!next-token;
4485 redo B;
4486 } elsif ($insertion_mode eq 'after body') {
4487 if ($token->{type} eq 'character') {
4488 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4489 ## As if in body
4490 $reconstruct_active_formatting_elements->($insert_to_current);
4491
4492 $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4493
4494 unless (length $token->{data}) {
4495 !!!next-token;
4496 redo B;
4497 }
4498 }
4499
4500 #
4501 } elsif ($token->{type} eq 'comment') {
4502 my $comment = $self->{document}->create_comment ($token->{data});
4503 $open_elements->[0]->[0]->append_child ($comment);
4504 !!!next-token;
4505 redo B;
4506 } elsif ($token->{type} eq 'end tag') {
4507 if ($token->{tag_name} eq 'html') {
4508 ## TODO: if inner_html, parse-error, ignore the token; otherwise,
4509
4510 $phase = 'trailing end';
4511 !!!next-token;
4512 redo B;
4513 } else {
4514 #
4515 }
4516 } else {
4517 #
4518 }
4519
4520 !!!parse-error ('data after body');
4521 $insertion_mode = 'in body';
4522 ## reprocess
4523 redo B;
4524 } elsif ($insertion_mode eq 'in frameset') {
4525 if ($token->{type} eq 'character') {
4526 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4527 $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4528
4529 unless (length $token->{data}) {
4530 !!!next-token;
4531 redo B;
4532 }
4533 }
4534
4535 #
4536 } elsif ($token->{type} eq 'comment') {
4537 my $comment = $self->{document}->create_comment ($token->{data});
4538 $open_elements->[-1]->[0]->append_child ($comment);
4539 !!!next-token;
4540 redo B;
4541 } elsif ($token->{type} eq 'start tag') {
4542 if ($token->{tag_name} eq 'frameset') {
4543 !!!insert-element ($token->{tag_name}, $token->{attributes});
4544 !!!next-token;
4545 redo B;
4546 } elsif ($token->{tag_name} eq 'frame') {
4547 !!!insert-element ($token->{tag_name}, $token->{attributes});
4548 pop @$open_elements;
4549 !!!next-token;
4550 redo B;
4551 } elsif ($token->{tag_name} eq 'noframes') {
4552 $in_body->($insert_to_current);
4553 redo B;
4554 } else {
4555 #
4556 }
4557 } elsif ($token->{type} eq 'end tag') {
4558 if ($token->{tag_name} eq 'frameset') {
4559 if ($open_elements->[-1]->[1] eq 'html' and
4560 @$open_elements == 1) {
4561 !!!parse-error;
4562 ## Ignore the token
4563 !!!next-token;
4564 } else {
4565 pop @$open_elements;
4566 !!!next-token;
4567 }
4568
4569 ## if not inner_html and
4570 if ($open_elements->[-1]->[1] ne 'frameset') {
4571 $insertion_mode = 'after frameset';
4572 }
4573 redo B;
4574 } else {
4575 #
4576 }
4577 } else {
4578 #
4579 }
4580
4581 !!!parse-error;
4582 ## Ignore the token
4583 !!!next-token;
4584 redo B;
4585 } elsif ($insertion_mode eq 'after frameset') {
4586 if ($token->{type} eq 'character') {
4587 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4588 $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4589
4590 unless (length $token->{data}) {
4591 !!!next-token;
4592 redo B;
4593 }
4594 }
4595
4596 #
4597 } elsif ($token->{type} eq 'comment') {
4598 my $comment = $self->{document}->create_comment ($token->{data});
4599 $open_elements->[-1]->[0]->append_child ($comment);
4600 !!!next-token;
4601 redo B;
4602 } elsif ($token->{type} eq 'start tag') {
4603 if ($token->{tag_name} eq 'noframes') {
4604 $in_body->($insert_to_current);
4605 redo B;
4606 } else {
4607 #
4608 }
4609 } elsif ($token->{type} eq 'end tag') {
4610 if ($token->{tag_name} eq 'html') {
4611 $phase = 'trailing end';
4612 !!!next-token;
4613 redo B;
4614 } else {
4615 #
4616 }
4617 } else {
4618 #
4619 }
4620
4621 !!!parse-error;
4622 ## Ignore the token
4623 !!!next-token;
4624 redo B;
4625
4626 ## ISSUE: An issue in spec there
4627 } else {
4628 die "$0: $insertion_mode: Unknown insertion mode";
4629 }
4630 }
4631 } elsif ($phase eq 'trailing end') {
4632 ## states in the main stage is preserved yet # MUST
4633
4634 if ($token->{type} eq 'DOCTYPE') {
4635 !!!parse-error;
4636 ## Ignore the token
4637 !!!next-token;
4638 redo B;
4639 } elsif ($token->{type} eq 'comment') {
4640 my $comment = $self->{document}->create_comment ($token->{data});
4641 $self->{document}->append_child ($comment);
4642 !!!next-token;
4643 redo B;
4644 } elsif ($token->{type} eq 'character') {
4645 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4646 my $data = $1;
4647 ## As if in the main phase.
4648 ## NOTE: The insertion mode in the main phase
4649 ## just before the phase has been changed to the trailing
4650 ## end phase is either "after body" or "after frameset".
4651 $reconstruct_active_formatting_elements->($insert_to_current)
4652 if $phase eq 'main';
4653
4654 $open_elements->[-1]->[0]->manakai_append_text ($data);
4655
4656 unless (length $token->{data}) {
4657 !!!next-token;
4658 redo B;
4659 }
4660 }
4661
4662 !!!parse-error;
4663 $phase = 'main';
4664 ## reprocess
4665 redo B;
4666 } elsif ($token->{type} eq 'start tag' or
4667 $token->{type} eq 'end tag') {
4668 !!!parse-error;
4669 $phase = 'main';
4670 ## reprocess
4671 redo B;
4672 } elsif ($token->{type} eq 'end-of-file') {
4673 ## Stop parsing
4674 last B;
4675 } else {
4676 die "$0: $token->{type}: Unknown token";
4677 }
4678 }
4679 } # B
4680
4681 ## Stop parsing # MUST
4682
4683 ## TODO: script stuffs
4684 } # _construct_tree
4685
## get_inner_html ($class, $node, $on_error)
##
## Serializes the children of |$node| into an HTML string.
##
## Arguments:
##   $class    - The invocant (this is called as a class method); unused.
##   $node     - The node whose children are serialized.  The code calls
##               |node_type|, |parent_node| and |child_nodes| on it (and
##               |namespace_uri| / |local_name| on element ancestors).
##   $on_error - Optional code reference.  It is invoked with any child
##               node whose node type is not handled below.
##
## Returns: a reference to the serialized string.
sub get_inner_html ($$$) {
  my ($class, $node, $on_error) = @_;

  ## Elements whose text content is emitted literally (not escaped).
  my $is_cdata_element = {
    style => 1, script => 1, xmp => 1, iframe => 1,
    noembed => 1, noframes => 1, noscript => 1,
  };

  ## Elements for which neither children nor an end tag are emitted.
  my $is_void_element = {
    area => 1, base => 1, basefont => 1, bgsound => 1,
    br => 1, col => 1, embed => 1, frame => 1, hr => 1,
    img => 1, input => 1, link => 1, meta => 1, param => 1,
    spacer => 1, wbr => 1,
  };

  ## Escapes text and attribute values.  |&| has to be replaced first,
  ## otherwise the ampersands of the other replacements would be
  ## escaped again.
  my $escape = sub {
    my $value = shift;
    $value =~ s/&/&amp;/g;
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;
    $value =~ s/"/&quot;/g;
    return $value;
  };

  ## Step 1
  my $s = '';

  ## Text is emitted literally when |$node| itself or one of its
  ## ancestors is a CDATA-like element in the XHTML namespace.
  my $in_cdata;
  my $parent = $node;
  while (defined $parent) {
    if ($parent->node_type == 1) {
      ## |namespace_uri| may be undefined; guard the string comparison.
      my $ns = $parent->namespace_uri;
      if (defined $ns and $ns eq 'http://www.w3.org/1999/xhtml' and
          $is_cdata_element->{$parent->local_name}) { ## TODO: case thingy
        $in_cdata = 1;
        last; # One match is enough; no need to visit further ancestors.
      }
    }
    $parent = $parent->parent_node;
  }

  ## Step 2
  ## |@node| is a work queue processed from the front.  Besides node
  ## objects it holds plain strings: pending end tags, and the marker
  ## 'cdata-out' that switches literal text mode off again.
  my @node = @{$node->child_nodes};
  C: while (@node) {
    my $child = shift @node;
    unless (ref $child) {
      if ($child eq 'cdata-out') {
        $in_cdata = 0;
      } else {
        $s .= $child; # end tag
      }
      next C;
    }

    my $nt = $child->node_type;
    if ($nt == 1) { # Element
      my $tag_name = lc $child->tag_name; ## ISSUE: Definition of "lowercase"
      $s .= '<' . $tag_name;

      ## ISSUE: Non-html elements

      my @attrs = @{$child->attributes}; # sort order MUST be stable
      for my $attr (@attrs) { # order is implementation dependent
        my $attr_name = lc $attr->name; ## ISSUE: Definition of "lowercase"
        $s .= ' ' . $attr_name . '="' . $escape->($attr->value) . '"';
      }
      $s .= '>';

      next C if $is_void_element->{$tag_name};

      ## Only the outermost CDATA-like element schedules the marker, so
      ## that literal mode ends right after its own end tag.
      if (not $in_cdata and $is_cdata_element->{$tag_name}) {
        unshift @node, 'cdata-out';
        $in_cdata = 1;
      }

      unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';
    } elsif ($nt == 3 or $nt == 4) { # Text or CDATA section
      $s .= $in_cdata ? $child->data : $escape->($child->data);
    } elsif ($nt == 8) { # Comment
      $s .= '<!--' . $child->data . '-->';
    } elsif ($nt == 10) { # Document type
      $s .= '<!DOCTYPE ' . $child->name . '>';
    } elsif ($nt == 5) { # entrefs
      ## The children replace the entity reference in place, so they go
      ## to the FRONT of the queue.  Appending them to the end would
      ## emit them after the following siblings and after the end tags
      ## of the enclosing elements.
      unshift @node, @{$child->child_nodes};
    } else {
      $on_error->($child) if defined $on_error;
    }
    ## ISSUE: This code does not support PIs.
  } # C

  ## Step 3
  return \$s;
} # get_inner_html
4782
4783 1;
4784 # $Date: 2007/05/01 07:46:42 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24