/[suikacvs]/markup/html/whatpm/What/HTML.pm
Suika

Contents of /markup/html/whatpm/What/HTML.pm

Parent Directory | Revision Log


Revision 1.6 - (show annotations) (download)
Mon Apr 30 12:06:12 2007 UTC (18 years, 11 months ago) by wakaba
Branch: MAIN
Changes since 1.5: +310 -14136 lines
++ whatpm/What/ChangeLog	30 Apr 2007 12:05:44 -0000
	* mkhtmlparser.pl, Makefile: References to the |HTML-consume-entity.src|
	are removed.

	* HTML.pm.src: Tokenizer's handling of named entities is rewritten.

	* HTML-consume-entity.src: Removed.

2007-04-30  Wakaba  <wakaba@suika.fam.cx>

1 package What::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.4 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 ## This is a very, very early version of an HTML parser.
6
7 my $permitted_slash_tag_name = { # start-tag names for which the tokenizer accepts "/" directly before ">" without calling the parse_error callback
8 base => 1,
9 link => 1,
10 meta => 1,
11 hr => 1,
12 br => 1,
13 img=> 1,
14 embed => 1,
15 param => 1,
16 area => 1,
17 col => 1,
18 input => 1,
19 };
20
21 my $entity_char = { # entity name (no "&" or ";") => the single character it stands for; keys are case-sensitive. NOTE(review): the lookup code is outside this excerpt
22 AElig => "\x{00C6}",
23 Aacute => "\x{00C1}",
24 Acirc => "\x{00C2}",
25 Agrave => "\x{00C0}",
26 Alpha => "\x{0391}",
27 Aring => "\x{00C5}",
28 Atilde => "\x{00C3}",
29 Auml => "\x{00C4}",
30 Beta => "\x{0392}",
31 Ccedil => "\x{00C7}",
32 Chi => "\x{03A7}",
33 Dagger => "\x{2021}",
34 Delta => "\x{0394}",
35 ETH => "\x{00D0}",
36 Eacute => "\x{00C9}",
37 Ecirc => "\x{00CA}",
38 Egrave => "\x{00C8}",
39 Epsilon => "\x{0395}",
40 Eta => "\x{0397}",
41 Euml => "\x{00CB}",
42 Gamma => "\x{0393}",
43 Iacute => "\x{00CD}",
44 Icirc => "\x{00CE}",
45 Igrave => "\x{00CC}",
46 Iota => "\x{0399}",
47 Iuml => "\x{00CF}",
48 Kappa => "\x{039A}",
49 Lambda => "\x{039B}",
50 Mu => "\x{039C}",
51 Ntilde => "\x{00D1}",
52 Nu => "\x{039D}",
53 OElig => "\x{0152}",
54 Oacute => "\x{00D3}",
55 Ocirc => "\x{00D4}",
56 Ograve => "\x{00D2}",
57 Omega => "\x{03A9}",
58 Omicron => "\x{039F}",
59 Oslash => "\x{00D8}",
60 Otilde => "\x{00D5}",
61 Ouml => "\x{00D6}",
62 Phi => "\x{03A6}",
63 Pi => "\x{03A0}",
64 Prime => "\x{2033}",
65 Psi => "\x{03A8}",
66 Rho => "\x{03A1}",
67 Scaron => "\x{0160}",
68 Sigma => "\x{03A3}",
69 THORN => "\x{00DE}",
70 Tau => "\x{03A4}",
71 Theta => "\x{0398}",
72 Uacute => "\x{00DA}",
73 Ucirc => "\x{00DB}",
74 Ugrave => "\x{00D9}",
75 Upsilon => "\x{03A5}",
76 Uuml => "\x{00DC}",
77 Xi => "\x{039E}",
78 Yacute => "\x{00DD}",
79 Yuml => "\x{0178}",
80 Zeta => "\x{0396}",
81 aacute => "\x{00E1}",
82 acirc => "\x{00E2}",
83 acute => "\x{00B4}",
84 aelig => "\x{00E6}",
85 agrave => "\x{00E0}",
86 alefsym => "\x{2135}",
87 alpha => "\x{03B1}",
88 amp => "\x{0026}",
89 AMP => "\x{0026}",
90 and => "\x{2227}",
91 ang => "\x{2220}",
92 apos => "\x{0027}",
93 aring => "\x{00E5}",
94 asymp => "\x{2248}",
95 atilde => "\x{00E3}",
96 auml => "\x{00E4}",
97 bdquo => "\x{201E}",
98 beta => "\x{03B2}",
99 brvbar => "\x{00A6}",
100 bull => "\x{2022}",
101 cap => "\x{2229}",
102 ccedil => "\x{00E7}",
103 cedil => "\x{00B8}",
104 cent => "\x{00A2}",
105 chi => "\x{03C7}",
106 circ => "\x{02C6}",
107 clubs => "\x{2663}",
108 cong => "\x{2245}",
109 copy => "\x{00A9}",
110 COPY => "\x{00A9}",
111 crarr => "\x{21B5}",
112 cup => "\x{222A}",
113 curren => "\x{00A4}",
114 dArr => "\x{21D3}",
115 dagger => "\x{2020}",
116 darr => "\x{2193}",
117 deg => "\x{00B0}",
118 delta => "\x{03B4}",
119 diams => "\x{2666}",
120 divide => "\x{00F7}",
121 eacute => "\x{00E9}",
122 ecirc => "\x{00EA}",
123 egrave => "\x{00E8}",
124 empty => "\x{2205}",
125 emsp => "\x{2003}",
126 ensp => "\x{2002}",
127 epsilon => "\x{03B5}",
128 equiv => "\x{2261}",
129 eta => "\x{03B7}",
130 eth => "\x{00F0}",
131 euml => "\x{00EB}",
132 euro => "\x{20AC}",
133 exist => "\x{2203}",
134 fnof => "\x{0192}",
135 forall => "\x{2200}",
136 frac12 => "\x{00BD}",
137 frac14 => "\x{00BC}",
138 frac34 => "\x{00BE}",
139 frasl => "\x{2044}",
140 gamma => "\x{03B3}",
141 ge => "\x{2265}",
142 gt => "\x{003E}",
143 GT => "\x{003E}",
144 hArr => "\x{21D4}",
145 harr => "\x{2194}",
146 hearts => "\x{2665}",
147 hellip => "\x{2026}",
148 iacute => "\x{00ED}",
149 icirc => "\x{00EE}",
150 iexcl => "\x{00A1}",
151 igrave => "\x{00EC}",
152 image => "\x{2111}",
153 infin => "\x{221E}",
154 int => "\x{222B}",
155 iota => "\x{03B9}",
156 iquest => "\x{00BF}",
157 isin => "\x{2208}",
158 iuml => "\x{00EF}",
159 kappa => "\x{03BA}",
160 lArr => "\x{21D0}",
161 lambda => "\x{03BB}",
162 lang => "\x{2329}",
163 laquo => "\x{00AB}",
164 larr => "\x{2190}",
165 lceil => "\x{2308}",
166 ldquo => "\x{201C}",
167 le => "\x{2264}",
168 lfloor => "\x{230A}",
169 lowast => "\x{2217}",
170 loz => "\x{25CA}",
171 lrm => "\x{200E}",
172 lsaquo => "\x{2039}",
173 lsquo => "\x{2018}",
174 lt => "\x{003C}",
175 LT => "\x{003C}",
176 macr => "\x{00AF}",
177 mdash => "\x{2014}",
178 micro => "\x{00B5}",
179 middot => "\x{00B7}",
180 minus => "\x{2212}",
181 mu => "\x{03BC}",
182 nabla => "\x{2207}",
183 nbsp => "\x{00A0}",
184 ndash => "\x{2013}",
185 ne => "\x{2260}",
186 ni => "\x{220B}",
187 not => "\x{00AC}",
188 notin => "\x{2209}",
189 nsub => "\x{2284}",
190 ntilde => "\x{00F1}",
191 nu => "\x{03BD}",
192 oacute => "\x{00F3}",
193 ocirc => "\x{00F4}",
194 oelig => "\x{0153}",
195 ograve => "\x{00F2}",
196 oline => "\x{203E}",
197 omega => "\x{03C9}",
198 omicron => "\x{03BF}",
199 oplus => "\x{2295}",
200 or => "\x{2228}",
201 ordf => "\x{00AA}",
202 ordm => "\x{00BA}",
203 oslash => "\x{00F8}",
204 otilde => "\x{00F5}",
205 otimes => "\x{2297}",
206 ouml => "\x{00F6}",
207 para => "\x{00B6}",
208 part => "\x{2202}",
209 permil => "\x{2030}",
210 perp => "\x{22A5}",
211 phi => "\x{03C6}",
212 pi => "\x{03C0}",
213 piv => "\x{03D6}",
214 plusmn => "\x{00B1}",
215 pound => "\x{00A3}",
216 prime => "\x{2032}",
217 prod => "\x{220F}",
218 prop => "\x{221D}",
219 psi => "\x{03C8}",
220 quot => "\x{0022}",
221 QUOT => "\x{0022}",
222 rArr => "\x{21D2}",
223 radic => "\x{221A}",
224 rang => "\x{232A}",
225 raquo => "\x{00BB}",
226 rarr => "\x{2192}",
227 rceil => "\x{2309}",
228 rdquo => "\x{201D}",
229 real => "\x{211C}",
230 reg => "\x{00AE}",
231 REG => "\x{00AE}",
232 rfloor => "\x{230B}",
233 rho => "\x{03C1}",
234 rlm => "\x{200F}",
235 rsaquo => "\x{203A}",
236 rsquo => "\x{2019}",
237 sbquo => "\x{201A}",
238 scaron => "\x{0161}",
239 sdot => "\x{22C5}",
240 sect => "\x{00A7}",
241 shy => "\x{00AD}",
242 sigma => "\x{03C3}",
243 sigmaf => "\x{03C2}",
244 sim => "\x{223C}",
245 spades => "\x{2660}",
246 sub => "\x{2282}",
247 sube => "\x{2286}",
248 sum => "\x{2211}",
249 sup => "\x{2283}",
250 sup1 => "\x{00B9}",
251 sup2 => "\x{00B2}",
252 sup3 => "\x{00B3}",
253 supe => "\x{2287}",
254 szlig => "\x{00DF}",
255 tau => "\x{03C4}",
256 there4 => "\x{2234}",
257 theta => "\x{03B8}",
258 thetasym => "\x{03D1}",
259 thinsp => "\x{2009}",
260 thorn => "\x{00FE}",
261 tilde => "\x{02DC}",
262 times => "\x{00D7}",
263 trade => "\x{2122}",
264 uArr => "\x{21D1}",
265 uacute => "\x{00FA}",
266 uarr => "\x{2191}",
267 ucirc => "\x{00FB}",
268 ugrave => "\x{00F9}",
269 uml => "\x{00A8}",
270 upsih => "\x{03D2}",
271 upsilon => "\x{03C5}",
272 uuml => "\x{00FC}",
273 weierp => "\x{2118}",
274 xi => "\x{03BE}",
275 yacute => "\x{00FD}",
276 yen => "\x{00A5}",
277 yuml => "\x{00FF}",
278 zeta => "\x{03B6}",
279 zwj => "\x{200D}",
280 zwnj => "\x{200C}",
281 };
282
283 my $special_category = { # set (name => 1) of lower-case element names in the "special" category. NOTE(review): consumers are outside this excerpt
284 address => 1, area => 1, base => 1, basefont => 1, bgsound => 1,
285 blockquote => 1, body => 1, br => 1, center => 1, col => 1, colgroup => 1,
286 dd => 1, dir => 1, div => 1, dl => 1, dt => 1, embed => 1, fieldset => 1,
287 form => 1, frame => 1, frameset => 1, h1 => 1, h2 => 1, h3 => 1,
288 h4 => 1, h5 => 1, h6 => 1, head => 1, hr => 1, iframe => 1, image => 1,
289 img => 1, input => 1, isindex => 1, li => 1, link => 1, listing => 1,
290 menu => 1, meta => 1, noembed => 1, noframes => 1, noscript => 1,
291 ol => 1, optgroup => 1, option => 1, p => 1, param => 1, plaintext => 1,
292 pre => 1, script => 1, select => 1, spacer => 1, style => 1, tbody => 1,
293 textarea => 1, tfoot => 1, thead => 1, title => 1, tr => 1, ul => 1, wbr => 1,
294 };
295 my $scoping_category = { # set (name => 1) of lower-case element names in the "scoping" category. NOTE(review): consumers are outside this excerpt
296 button => 1, caption => 1, html => 1, marquee => 1, object => 1,
297 table => 1, td => 1, th => 1,
298 };
299 my $formatting_category = { # set (name => 1) of lower-case element names in the "formatting" category
300 a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
301 s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1, # "strike" is the element name; the misspelt key "strile" could never match a real tag
302 };
303 # $phrasing_category: all other elements
304
305 sub new ($) { # Constructor: returns a new object of the invoking class with the default callbacks installed.
306 my $class = shift;
307 my $self = bless {}, $class;
308 $self->{set_next_input_character} = sub { # default input callback: reports end of input (-1); callers invoke it as ->($self)
309 $_[0]->{next_input_character} = -1; # act on the parser passed as the first argument, not the outer $self, so the stored closure does not form a reference cycle
310 };
311 $self->{parse_error} = sub {
312 # Default parse-error callback: does nothing.
313 };
314 return $self;
315 } # new
316
317 ## Implementations MUST act as if they used the state machine in the spec
318
319 sub _initialize_tokenizer ($) { # Resets the tokenizer fields on $self and primes {next_input_character}.
320 my $self = shift;
321 $self->{state} = 'data'; # MUST
322 $self->{content_model_flag} = 'PCDATA'; # be
323 undef $self->{current_token}; # start tag, end tag, comment, or DOCTYPE
324 undef $self->{current_attribute};
325 undef $self->{last_emitted_start_tag_name};
326 undef $self->{last_attribute_value_state};
327 $self->{char} = []; # pushed-back character codes; _get_next_token shifts from here before asking the callback
328 # $self->{next_input_character}
329
330 if (@{$self->{char}}) { # NOTE(review): {char} was set to [] just above, so this branch looks unreachable here
331 $self->{next_input_character} = shift @{$self->{char}};
332 } else {
333 $self->{set_next_input_character}->($self); # callback is expected to set {next_input_character}; the default from new() sets -1
334 }
335
336 $self->{token} = []; # queue of pending tokens; _get_next_token returns from it first when non-empty
337 } # _initialize_tokenizer
338
339 ## A token has:
340 ## ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
341 ## 'character', or 'end-of-file'
342 ## ->{name} (DOCTYPE) / ->{tag_name} (start tag, end tag)
343 ## ISSUE: the spec need s/tagname/tag name/
344 ## ->{error} == 1 or 0 (DOCTYPE)
345 ## ->{attributes} isa HASH (start tag, end tag)
346 ## ->{data} (comment, character)
347
348 ## Macros
349 ## Macros MUST be preceded by three EXCLAMATION MARKs.
350 ## emit ($token)
351 ## Emits the specified token.
352
353 ## Emitted token MUST immediately be handled by the tree construction state.
354
355 ## Before each step, UA MAY check to see if either one of the scripts in
356 ## "list of scripts that will execute as soon as possible" or the first
357 ## script in the "list of scripts that will execute asynchronously",
358 ## has completed loading. If one has, then it MUST be executed
359 ## and removed from the list.
360
361 sub _get_next_token ($) {
362 my $self = shift;
363 if (@{$self->{token}}) {
364 return shift @{$self->{token}};
365 }
366
367 A: {
368 if ($self->{state} eq 'data') {
369 if ($self->{next_input_character} == 0x0026) { # &
370 if ($self->{content_model_flag} eq 'PCDATA' or
371 $self->{content_model_flag} eq 'RCDATA') {
372 $self->{state} = 'entity data';
373
374 if (@{$self->{char}}) {
375 $self->{next_input_character} = shift @{$self->{char}};
376 } else {
377 $self->{set_next_input_character}->($self);
378 }
379
380 redo A;
381 } else {
382 #
383 }
384 } elsif ($self->{next_input_character} == 0x003C) { # <
385 if ($self->{content_model_flag} ne 'PLAINTEXT') {
386 $self->{state} = 'tag open';
387
388 if (@{$self->{char}}) {
389 $self->{next_input_character} = shift @{$self->{char}};
390 } else {
391 $self->{set_next_input_character}->($self);
392 }
393
394 redo A;
395 } else {
396 #
397 }
398 } elsif ($self->{next_input_character} == -1) {
399 return ({type => 'end-of-file'});
400 last A; ## TODO: ok?
401 }
402 # Anything else
403 my $token = {type => 'character',
404 data => chr $self->{next_input_character}};
405 ## Stay in the data state
406
407 if (@{$self->{char}}) {
408 $self->{next_input_character} = shift @{$self->{char}};
409 } else {
410 $self->{set_next_input_character}->($self);
411 }
412
413
414 return ($token);
415
416 redo A;
417 } elsif ($self->{state} eq 'entity data') {
418 ## (cannot happen in CDATA state)
419
420 my $token = $self->_tokenize_attempt_to_consume_an_entity;
421
422 $self->{state} = 'data';
423 # next-input-character is already done
424
425 unless (defined $token) {
426 return ({type => 'character', data => '&'});
427 } else {
428 return ($token);
429 }
430
431 redo A;
432 } elsif ($self->{state} eq 'tag open') {
433 if ($self->{content_model_flag} eq 'RCDATA' or
434 $self->{content_model_flag} eq 'CDATA') {
435 if ($self->{next_input_character} == 0x002F) { # /
436
437 if (@{$self->{char}}) {
438 $self->{next_input_character} = shift @{$self->{char}};
439 } else {
440 $self->{set_next_input_character}->($self);
441 }
442
443 $self->{state} = 'close tag open';
444 redo A;
445 } else {
446 ## reconsume
447 $self->{state} = 'data';
448
449 return (type => 'character', data => {'/'});
450
451 redo A;
452 }
453 } elsif ($self->{content_model_flag} eq 'PCDATA') {
454 if ($self->{next_input_character} == 0x0021) { # !
455 $self->{state} = 'markup declaration open';
456
457 if (@{$self->{char}}) {
458 $self->{next_input_character} = shift @{$self->{char}};
459 } else {
460 $self->{set_next_input_character}->($self);
461 }
462
463 redo A;
464 } elsif ($self->{next_input_character} == 0x002F) { # /
465 $self->{state} = 'close tag open';
466
467 if (@{$self->{char}}) {
468 $self->{next_input_character} = shift @{$self->{char}};
469 } else {
470 $self->{set_next_input_character}->($self);
471 }
472
473 redo A;
474 } elsif (0x0041 <= $self->{next_input_character} and
475 $self->{next_input_character} <= 0x005A) { # A..Z
476 $self->{current_token}
477 = {type => 'start tag',
478 tag_name => chr ($self->{next_input_character} + 0x0020)};
479 $self->{state} = 'tag name';
480
481 if (@{$self->{char}}) {
482 $self->{next_input_character} = shift @{$self->{char}};
483 } else {
484 $self->{set_next_input_character}->($self);
485 }
486
487 redo A;
488 } elsif (0x0061 <= $self->{next_input_character} and
489 $self->{next_input_character} <= 0x007A) { # a..z
490 $self->{current_token} = {type => 'start tag',
491 tag_name => chr ($self->{next_input_character})};
492 $self->{state} = 'tag name';
493
494 if (@{$self->{char}}) {
495 $self->{next_input_character} = shift @{$self->{char}};
496 } else {
497 $self->{set_next_input_character}->($self);
498 }
499
500 redo A;
501 } elsif ($self->{next_input_character} == 0x003E) { # >
502 $self->{parse_error}->();
503 $self->{state} = 'data';
504
505 if (@{$self->{char}}) {
506 $self->{next_input_character} = shift @{$self->{char}};
507 } else {
508 $self->{set_next_input_character}->($self);
509 }
510
511
512 return ({type => 'character', data => '<>'});
513
514 redo A;
515 } elsif ($self->{next_input_character} == 0x003F) { # ?
516 $self->{parse_error}->();
517 $self->{state} = 'bogus comment';
518 ## $self->{next_input_character} is intentionally left as is
519 redo A;
520 } else {
521 $self->{parse_error}->();
522 $self->{state} = 'data';
523 ## reconsume
524
525 return ({type => 'character', data => '<'});
526
527 redo A;
528 }
529 } else {
530 die "$0: $self->{content_model_flag}: Unknown content model flag";
531 }
532 } elsif ($self->{state} eq 'close tag open') {
533 if ($self->{content_model_flag} eq 'RCDATA' or
534 $self->{content_model_flag} eq 'CDATA') {
535 my @next_char;
536 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
537 push @next_char, $self->{next_input_character};
538 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
539 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
540 if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
541
542 if (@{$self->{char}}) {
543 $self->{next_input_character} = shift @{$self->{char}};
544 } else {
545 $self->{set_next_input_character}->($self);
546 }
547
548 next TAGNAME;
549 } else {
550 $self->{parse_error}->();
551 $self->{next_input_character} = shift @next_char; # reconsume
552 unshift @{$self->{char}}, (@next_char);
553 $self->{state} = 'data';
554
555 return ({type => 'character', data => '</'});
556
557 redo A;
558 }
559 }
560 push @next_char, $self->{next_input_character};
561
562 unless ($self->{next_input_character} == 0x0009 or # HT
563 $self->{next_input_character} == 0x000A or # LF
564 $self->{next_input_character} == 0x000B or # VT
565 $self->{next_input_character} == 0x000C or # FF
566 $self->{next_input_character} == 0x0020 or # SP
567 $self->{next_input_character} == 0x003E or # >
568 $self->{next_input_character} == 0x002F or # /
569 $self->{next_input_character} == 0x003C or # <
570 $self->{next_input_character} == -1) {
571 $self->{parse_error}->();
572 $self->{next_input_character} = shift @next_char; # reconsume
573 unshift @{$self->{char}}, (@next_char);
574 $self->{state} = 'data';
575
576 return ({type => 'character', data => '</'});
577
578 redo A;
579 } else {
580 $self->{next_input_character} = shift @next_char;
581 unshift @{$self->{char}}, (@next_char);
582 # and consume...
583 }
584 }
585
586 if (0x0041 <= $self->{next_input_character} and
587 $self->{next_input_character} <= 0x005A) { # A..Z
588 $self->{current_token} = {type => 'end tag',
589 tag_name => chr ($self->{next_input_character} + 0x0020)};
590 $self->{state} = 'tag name';
591
592 if (@{$self->{char}}) {
593 $self->{next_input_character} = shift @{$self->{char}};
594 } else {
595 $self->{set_next_input_character}->($self);
596 }
597
598 redo A;
599 } elsif (0x0061 <= $self->{next_input_character} and
600 $self->{next_input_character} <= 0x007A) { # a..z
601 $self->{current_token} = {type => 'end tag',
602 tag_name => chr ($self->{next_input_character})};
603 $self->{state} = 'tag name';
604
605 if (@{$self->{char}}) {
606 $self->{next_input_character} = shift @{$self->{char}};
607 } else {
608 $self->{set_next_input_character}->($self);
609 }
610
611 redo A;
612 } elsif ($self->{next_input_character} == 0x003E) { # >
613 $self->{parse_error}->();
614 $self->{state} = 'data';
615
616 if (@{$self->{char}}) {
617 $self->{next_input_character} = shift @{$self->{char}};
618 } else {
619 $self->{set_next_input_character}->($self);
620 }
621
622 redo A;
623 } elsif ($self->{next_input_character} == -1) {
624 $self->{parse_error}->();
625 $self->{state} = 'data';
626 # reconsume
627
628 return ({type => 'character', data => '</'});
629
630 redo A;
631 } else {
632 $self->{parse_error}->();
633 $self->{state} = 'bogus comment';
634 ## $self->{next_input_character} is intentionally left as is
635 redo A;
636 }
637 } elsif ($self->{state} eq 'tag name') {
638 if ($self->{next_input_character} == 0x0009 or # HT
639 $self->{next_input_character} == 0x000A or # LF
640 $self->{next_input_character} == 0x000B or # VT
641 $self->{next_input_character} == 0x000C or # FF
642 $self->{next_input_character} == 0x0020) { # SP
643 $self->{state} = 'before attribute name';
644
645 if (@{$self->{char}}) {
646 $self->{next_input_character} = shift @{$self->{char}};
647 } else {
648 $self->{set_next_input_character}->($self);
649 }
650
651 redo A;
652 } elsif ($self->{next_input_character} == 0x003E) { # >
653 if ($self->{current_token}->{type} eq 'start tag') {
654 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
655 } elsif ($self->{current_token}->{type} eq 'end tag') {
656 $self->{content_model_flag} = 'PCDATA'; # MUST
657 if ($self->{current_token}->{attributes}) {
658 $self->{parse_error}->();
659 }
660 } else {
661 die "$0: $self->{current_token}->{type}: Unknown token type";
662 }
663 $self->{state} = 'data';
664
665 if (@{$self->{char}}) {
666 $self->{next_input_character} = shift @{$self->{char}};
667 } else {
668 $self->{set_next_input_character}->($self);
669 }
670
671
672 return ($self->{current_token}); # start tag or end tag
673 undef $self->{current_token};
674
675 redo A;
676 } elsif (0x0041 <= $self->{next_input_character} and
677 $self->{next_input_character} <= 0x005A) { # A..Z
678 $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
679 # start tag or end tag
680 ## Stay in this state
681
682 if (@{$self->{char}}) {
683 $self->{next_input_character} = shift @{$self->{char}};
684 } else {
685 $self->{set_next_input_character}->($self);
686 }
687
688 redo A;
689 } elsif ($self->{next_input_character} == 0x003C or # <
690 $self->{next_input_character} == -1) {
691 $self->{parse_error}->();
692 if ($self->{current_token}->{type} eq 'start tag') {
693 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
694 } elsif ($self->{current_token}->{type} eq 'end tag') {
695 $self->{content_model_flag} = 'PCDATA'; # MUST
696 if ($self->{current_token}->{attributes}) {
697 $self->{parse_error}->();
698 }
699 } else {
700 die "$0: $self->{current_token}->{type}: Unknown token type";
701 }
702 $self->{state} = 'data';
703 # reconsume
704
705 return ($self->{current_token}); # start tag or end tag
706 undef $self->{current_token};
707
708 redo A;
709 } elsif ($self->{next_input_character} == 0x002F) { # /
710
711 if (@{$self->{char}}) {
712 $self->{next_input_character} = shift @{$self->{char}};
713 } else {
714 $self->{set_next_input_character}->($self);
715 }
716
717 if ($self->{next_input_character} == 0x003E and # >
718 $self->{current_token}->{type} eq 'start tag' and
719 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
720 # permitted slash
721 #
722 } else {
723 $self->{parse_error}->();
724 }
725 $self->{state} = 'before attribute name';
726 # next-input-character is already done
727 redo A;
728 } else {
729 $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
730 # start tag or end tag
731 ## Stay in the state
732
733 if (@{$self->{char}}) {
734 $self->{next_input_character} = shift @{$self->{char}};
735 } else {
736 $self->{set_next_input_character}->($self);
737 }
738
739 redo A;
740 }
741 } elsif ($self->{state} eq 'before attribute name') {
742 if ($self->{next_input_character} == 0x0009 or # HT
743 $self->{next_input_character} == 0x000A or # LF
744 $self->{next_input_character} == 0x000B or # VT
745 $self->{next_input_character} == 0x000C or # FF
746 $self->{next_input_character} == 0x0020) { # SP
747 ## Stay in the state
748
749 if (@{$self->{char}}) {
750 $self->{next_input_character} = shift @{$self->{char}};
751 } else {
752 $self->{set_next_input_character}->($self);
753 }
754
755 redo A;
756 } elsif ($self->{next_input_character} == 0x003E) { # >
757 if ($self->{current_token}->{type} eq 'start tag') {
758 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
759 } elsif ($self->{current_token}->{type} eq 'end tag') {
760 $self->{content_model_flag} = 'PCDATA'; # MUST
761 if ($self->{current_token}->{attributes}) {
762 $self->{parse_error}->();
763 }
764 } else {
765 die "$0: $self->{current_token}->{type}: Unknown token type";
766 }
767 $self->{state} = 'data';
768
769 if (@{$self->{char}}) {
770 $self->{next_input_character} = shift @{$self->{char}};
771 } else {
772 $self->{set_next_input_character}->($self);
773 }
774
775
776 return ($self->{current_token}); # start tag or end tag
777 undef $self->{current_token};
778
779 redo A;
780 } elsif (0x0041 <= $self->{next_input_character} and
781 $self->{next_input_character} <= 0x005A) { # A..Z
782 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
783 value => ''};
784 $self->{state} = 'attribute name';
785
786 if (@{$self->{char}}) {
787 $self->{next_input_character} = shift @{$self->{char}};
788 } else {
789 $self->{set_next_input_character}->($self);
790 }
791
792 redo A;
793 } elsif ($self->{next_input_character} == 0x002F) { # /
794
795 if (@{$self->{char}}) {
796 $self->{next_input_character} = shift @{$self->{char}};
797 } else {
798 $self->{set_next_input_character}->($self);
799 }
800
801 if ($self->{next_input_character} == 0x003E and # >
802 $self->{current_token}->{type} eq 'start tag' and
803 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
804 # permitted slash
805 #
806 } else {
807 $self->{parse_error}->();
808 }
809 ## Stay in the state
810 # next-input-character is already done
811 redo A;
812 } elsif ($self->{next_input_character} == 0x003C or # <
813 $self->{next_input_character} == -1) {
814 $self->{parse_error}->();
815 if ($self->{current_token}->{type} eq 'start tag') {
816 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
817 } elsif ($self->{current_token}->{type} eq 'end tag') {
818 $self->{content_model_flag} = 'PCDATA'; # MUST
819 if ($self->{current_token}->{attributes}) {
820 $self->{parse_error}->();
821 }
822 } else {
823 die "$0: $self->{current_token}->{type}: Unknown token type";
824 }
825 $self->{state} = 'data';
826 # reconsume
827
828 return ($self->{current_token}); # start tag or end tag
829 undef $self->{current_token};
830
831 redo A;
832 } else {
833 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
834 value => ''};
835 $self->{state} = 'attribute name';
836
837 if (@{$self->{char}}) {
838 $self->{next_input_character} = shift @{$self->{char}};
839 } else {
840 $self->{set_next_input_character}->($self);
841 }
842
843 redo A;
844 }
845 } elsif ($self->{state} eq 'attribute name') {
846 my $before_leave = sub {
847 if (exists $self->{current_token}->{attributes} # start tag or end tag
848 ->{$self->{current_attribute}->{name}}) { # MUST
849 $self->{parse_error}->();
850 ## Discard $self->{current_attribute} # MUST
851 } else {
852 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
853 = $self->{current_attribute};
854 }
855 }; # $before_leave
856
857 if ($self->{next_input_character} == 0x0009 or # HT
858 $self->{next_input_character} == 0x000A or # LF
859 $self->{next_input_character} == 0x000B or # VT
860 $self->{next_input_character} == 0x000C or # FF
861 $self->{next_input_character} == 0x0020) { # SP
862 $before_leave->();
863 $self->{state} = 'after attribute name';
864
865 if (@{$self->{char}}) {
866 $self->{next_input_character} = shift @{$self->{char}};
867 } else {
868 $self->{set_next_input_character}->($self);
869 }
870
871 redo A;
872 } elsif ($self->{next_input_character} == 0x003D) { # =
873 $before_leave->();
874 $self->{state} = 'before attribute value';
875
876 if (@{$self->{char}}) {
877 $self->{next_input_character} = shift @{$self->{char}};
878 } else {
879 $self->{set_next_input_character}->($self);
880 }
881
882 redo A;
883 } elsif ($self->{next_input_character} == 0x003E) { # >
884 $before_leave->();
885 if ($self->{current_token}->{type} eq 'start tag') {
886 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
887 } elsif ($self->{current_token}->{type} eq 'end tag') {
888 $self->{content_model_flag} = 'PCDATA'; # MUST
889 if ($self->{current_token}->{attributes}) {
890 $self->{parse_error}->();
891 }
892 } else {
893 die "$0: $self->{current_token}->{type}: Unknown token type";
894 }
895 $self->{state} = 'data';
896
897 if (@{$self->{char}}) {
898 $self->{next_input_character} = shift @{$self->{char}};
899 } else {
900 $self->{set_next_input_character}->($self);
901 }
902
903
904 return ($self->{current_token}); # start tag or end tag
905 undef $self->{current_token};
906
907 redo A;
908 } elsif (0x0041 <= $self->{next_input_character} and
909 $self->{next_input_character} <= 0x005A) { # A..Z
910 $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
911 ## Stay in the state
912
913 if (@{$self->{char}}) {
914 $self->{next_input_character} = shift @{$self->{char}};
915 } else {
916 $self->{set_next_input_character}->($self);
917 }
918
919 redo A;
920 } elsif ($self->{next_input_character} == 0x002F) { # /
921 $before_leave->();
922
923 if (@{$self->{char}}) {
924 $self->{next_input_character} = shift @{$self->{char}};
925 } else {
926 $self->{set_next_input_character}->($self);
927 }
928
929 if ($self->{next_input_character} == 0x003E and # >
930 $self->{current_token}->{type} eq 'start tag' and
931 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
932 # permitted slash
933 #
934 } else {
935 $self->{parse_error}->();
936 }
937 $self->{state} = 'before attribute name';
938 # next-input-character is already done
939 redo A;
940 } elsif ($self->{next_input_character} == 0x003C or # <
941 $self->{next_input_character} == -1) {
942 $self->{parse_error}->();
943 $before_leave->();
944 if ($self->{current_token}->{type} eq 'start tag') {
945 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
946 } elsif ($self->{current_token}->{type} eq 'end tag') {
947 $self->{content_model_flag} = 'PCDATA'; # MUST
948 if ($self->{current_token}->{attributes}) {
949 $self->{parse_error}->();
950 }
951 } else {
952 die "$0: $self->{current_token}->{type}: Unknown token type";
953 }
954 $self->{state} = 'data';
955 # reconsume
956
957 return ($self->{current_token}); # start tag or end tag
958 undef $self->{current_token};
959
960 redo A;
961 } else {
962 $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
963 ## Stay in the state
964
965 if (@{$self->{char}}) {
966 $self->{next_input_character} = shift @{$self->{char}};
967 } else {
968 $self->{set_next_input_character}->($self);
969 }
970
971 redo A;
972 }
973 } elsif ($self->{state} eq 'after attribute name') {
974 if ($self->{next_input_character} == 0x0009 or # HT
975 $self->{next_input_character} == 0x000A or # LF
976 $self->{next_input_character} == 0x000B or # VT
977 $self->{next_input_character} == 0x000C or # FF
978 $self->{next_input_character} == 0x0020) { # SP
979 ## Stay in the state
980
981 if (@{$self->{char}}) {
982 $self->{next_input_character} = shift @{$self->{char}};
983 } else {
984 $self->{set_next_input_character}->($self);
985 }
986
987 redo A;
988 } elsif ($self->{next_input_character} == 0x003D) { # =
989 $self->{state} = 'before attribute value';
990
991 if (@{$self->{char}}) {
992 $self->{next_input_character} = shift @{$self->{char}};
993 } else {
994 $self->{set_next_input_character}->($self);
995 }
996
997 redo A;
998 } elsif ($self->{next_input_character} == 0x003E) { # >
999 if ($self->{current_token}->{type} eq 'start tag') {
1000 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1001 } elsif ($self->{current_token}->{type} eq 'end tag') {
1002 $self->{content_model_flag} = 'PCDATA'; # MUST
1003 if ($self->{current_token}->{attributes}) {
1004 $self->{parse_error}->();
1005 }
1006 } else {
1007 die "$0: $self->{current_token}->{type}: Unknown token type";
1008 }
1009 $self->{state} = 'data';
1010
1011 if (@{$self->{char}}) {
1012 $self->{next_input_character} = shift @{$self->{char}};
1013 } else {
1014 $self->{set_next_input_character}->($self);
1015 }
1016
1017
1018 return ($self->{current_token}); # start tag or end tag
1019 undef $self->{current_token};
1020
1021 redo A;
1022 } elsif (0x0041 <= $self->{next_input_character} and
1023 $self->{next_input_character} <= 0x005A) { # A..Z
1024 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
1025 value => ''};
1026 $self->{state} = 'attribute name';
1027
1028 if (@{$self->{char}}) {
1029 $self->{next_input_character} = shift @{$self->{char}};
1030 } else {
1031 $self->{set_next_input_character}->($self);
1032 }
1033
1034 redo A;
1035 } elsif ($self->{next_input_character} == 0x002F) { # /
1036
1037 if (@{$self->{char}}) {
1038 $self->{next_input_character} = shift @{$self->{char}};
1039 } else {
1040 $self->{set_next_input_character}->($self);
1041 }
1042
1043 if ($self->{next_input_character} == 0x003E and # >
1044 $self->{current_token}->{type} eq 'start tag' and
1045 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
1046 # permitted slash
1047 #
1048 } else {
1049 $self->{parse_error}->();
1050 }
1051 $self->{state} = 'before attribute name';
1052 # next-input-character is already done
1053 redo A;
1054 } elsif ($self->{next_input_character} == 0x003C or # <
1055 $self->{next_input_character} == -1) {
1056 $self->{parse_error}->();
1057 if ($self->{current_token}->{type} eq 'start tag') {
1058 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1059 } elsif ($self->{current_token}->{type} eq 'end tag') {
1060 $self->{content_model_flag} = 'PCDATA'; # MUST
1061 if ($self->{current_token}->{attributes}) {
1062 $self->{parse_error}->();
1063 }
1064 } else {
1065 die "$0: $self->{current_token}->{type}: Unknown token type";
1066 }
1067 $self->{state} = 'data';
1068 # reconsume
1069
1070 return ($self->{current_token}); # start tag or end tag
1071 undef $self->{current_token};
1072
1073 redo A;
1074 } else {
1075 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
1076 value => ''};
1077 $self->{state} = 'attribute name';
1078
1079 if (@{$self->{char}}) {
1080 $self->{next_input_character} = shift @{$self->{char}};
1081 } else {
1082 $self->{set_next_input_character}->($self);
1083 }
1084
1085 redo A;
1086 }
1087 } elsif ($self->{state} eq 'before attribute value') {
1088 if ($self->{next_input_character} == 0x0009 or # HT
1089 $self->{next_input_character} == 0x000A or # LF
1090 $self->{next_input_character} == 0x000B or # VT
1091 $self->{next_input_character} == 0x000C or # FF
1092 $self->{next_input_character} == 0x0020) { # SP
1093 ## Stay in the state
1094
1095 if (@{$self->{char}}) {
1096 $self->{next_input_character} = shift @{$self->{char}};
1097 } else {
1098 $self->{set_next_input_character}->($self);
1099 }
1100
1101 redo A;
1102 } elsif ($self->{next_input_character} == 0x0022) { # "
1103 $self->{state} = 'attribute value (double-quoted)';
1104
1105 if (@{$self->{char}}) {
1106 $self->{next_input_character} = shift @{$self->{char}};
1107 } else {
1108 $self->{set_next_input_character}->($self);
1109 }
1110
1111 redo A;
1112 } elsif ($self->{next_input_character} == 0x0026) { # &
1113 $self->{state} = 'attribute value (unquoted)';
1114 ## reconsume
1115 redo A;
1116 } elsif ($self->{next_input_character} == 0x0027) { # '
1117 $self->{state} = 'attribute value (single-quoted)';
1118
1119 if (@{$self->{char}}) {
1120 $self->{next_input_character} = shift @{$self->{char}};
1121 } else {
1122 $self->{set_next_input_character}->($self);
1123 }
1124
1125 redo A;
1126 } elsif ($self->{next_input_character} == 0x003E) { # >
1127 if ($self->{current_token}->{type} eq 'start tag') {
1128 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1129 } elsif ($self->{current_token}->{type} eq 'end tag') {
1130 $self->{content_model_flag} = 'PCDATA'; # MUST
1131 if ($self->{current_token}->{attributes}) {
1132 $self->{parse_error}->();
1133 }
1134 } else {
1135 die "$0: $self->{current_token}->{type}: Unknown token type";
1136 }
1137 $self->{state} = 'data';
1138
1139 if (@{$self->{char}}) {
1140 $self->{next_input_character} = shift @{$self->{char}};
1141 } else {
1142 $self->{set_next_input_character}->($self);
1143 }
1144
1145
1146 return ($self->{current_token}); # start tag or end tag
1147 undef $self->{current_token};
1148
1149 redo A;
1150 } elsif ($self->{next_input_character} == 0x003C or # <
1151 $self->{next_input_character} == -1) {
1152 $self->{parse_error}->();
1153 if ($self->{current_token}->{type} eq 'start tag') {
1154 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1155 } elsif ($self->{current_token}->{type} eq 'end tag') {
1156 $self->{content_model_flag} = 'PCDATA'; # MUST
1157 if ($self->{current_token}->{attributes}) {
1158 $self->{parse_error}->();
1159 }
1160 } else {
1161 die "$0: $self->{current_token}->{type}: Unknown token type";
1162 }
1163 $self->{state} = 'data';
1164 ## reconsume
1165
1166 return ($self->{current_token}); # start tag or end tag
1167 undef $self->{current_token};
1168
1169 redo A;
1170 } else {
1171 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1172 $self->{state} = 'attribute value (unquoted)';
1173
1174 if (@{$self->{char}}) {
1175 $self->{next_input_character} = shift @{$self->{char}};
1176 } else {
1177 $self->{set_next_input_character}->($self);
1178 }
1179
1180 redo A;
1181 }
1182 } elsif ($self->{state} eq 'attribute value (double-quoted)') {
1183 if ($self->{next_input_character} == 0x0022) { # "
1184 $self->{state} = 'before attribute name';
1185
1186 if (@{$self->{char}}) {
1187 $self->{next_input_character} = shift @{$self->{char}};
1188 } else {
1189 $self->{set_next_input_character}->($self);
1190 }
1191
1192 redo A;
1193 } elsif ($self->{next_input_character} == 0x0026) { # &
1194 $self->{last_attribute_value_state} = 'attribute value (double-quoted)';
1195 $self->{state} = 'entity in attribute value';
1196
1197 if (@{$self->{char}}) {
1198 $self->{next_input_character} = shift @{$self->{char}};
1199 } else {
1200 $self->{set_next_input_character}->($self);
1201 }
1202
1203 redo A;
1204 } elsif ($self->{next_input_character} == -1) {
1205 $self->{parse_error}->();
1206 if ($self->{current_token}->{type} eq 'start tag') {
1207 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1208 } elsif ($self->{current_token}->{type} eq 'end tag') {
1209 $self->{content_model_flag} = 'PCDATA'; # MUST
1210 if ($self->{current_token}->{attributes}) {
1211 $self->{parse_error}->();
1212 }
1213 } else {
1214 die "$0: $self->{current_token}->{type}: Unknown token type";
1215 }
1216 $self->{state} = 'data';
1217 ## reconsume
1218
1219 return ($self->{current_token}); # start tag or end tag
1220 undef $self->{current_token};
1221
1222 redo A;
1223 } else {
1224 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1225 ## Stay in the state
1226
1227 if (@{$self->{char}}) {
1228 $self->{next_input_character} = shift @{$self->{char}};
1229 } else {
1230 $self->{set_next_input_character}->($self);
1231 }
1232
1233 redo A;
1234 }
1235 } elsif ($self->{state} eq 'attribute value (single-quoted)') {
1236 if ($self->{next_input_character} == 0x0027) { # '
1237 $self->{state} = 'before attribute name';
1238
1239 if (@{$self->{char}}) {
1240 $self->{next_input_character} = shift @{$self->{char}};
1241 } else {
1242 $self->{set_next_input_character}->($self);
1243 }
1244
1245 redo A;
1246 } elsif ($self->{next_input_character} == 0x0026) { # &
1247 $self->{last_attribute_value_state} = 'attribute value (single-quoted)';
1248 $self->{state} = 'entity in attribute value';
1249
1250 if (@{$self->{char}}) {
1251 $self->{next_input_character} = shift @{$self->{char}};
1252 } else {
1253 $self->{set_next_input_character}->($self);
1254 }
1255
1256 redo A;
1257 } elsif ($self->{next_input_character} == -1) {
1258 $self->{parse_error}->();
1259 if ($self->{current_token}->{type} eq 'start tag') {
1260 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1261 } elsif ($self->{current_token}->{type} eq 'end tag') {
1262 $self->{content_model_flag} = 'PCDATA'; # MUST
1263 if ($self->{current_token}->{attributes}) {
1264 $self->{parse_error}->();
1265 }
1266 } else {
1267 die "$0: $self->{current_token}->{type}: Unknown token type";
1268 }
1269 $self->{state} = 'data';
1270 ## reconsume
1271
1272 return ($self->{current_token}); # start tag or end tag
1273 undef $self->{current_token};
1274
1275 redo A;
1276 } else {
1277 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1278 ## Stay in the state
1279
1280 if (@{$self->{char}}) {
1281 $self->{next_input_character} = shift @{$self->{char}};
1282 } else {
1283 $self->{set_next_input_character}->($self);
1284 }
1285
1286 redo A;
1287 }
1288 } elsif ($self->{state} eq 'attribute value (unquoted)') {
1289 if ($self->{next_input_character} == 0x0009 or # HT
1290 $self->{next_input_character} == 0x000A or # LF
1291 $self->{next_input_character} == 0x000B or # HT
1292 $self->{next_input_character} == 0x000C or # FF
1293 $self->{next_input_character} == 0x0020) { # SP
1294 $self->{state} = 'before attribute name';
1295
1296 if (@{$self->{char}}) {
1297 $self->{next_input_character} = shift @{$self->{char}};
1298 } else {
1299 $self->{set_next_input_character}->($self);
1300 }
1301
1302 redo A;
1303 } elsif ($self->{next_input_character} == 0x0026) { # &
1304 $self->{last_attribute_value_state} = 'attribute value (unquoted)';
1305 $self->{state} = 'entity in attribute value';
1306
1307 if (@{$self->{char}}) {
1308 $self->{next_input_character} = shift @{$self->{char}};
1309 } else {
1310 $self->{set_next_input_character}->($self);
1311 }
1312
1313 redo A;
1314 } elsif ($self->{next_input_character} == 0x003E) { # >
1315 if ($self->{current_token}->{type} eq 'start tag') {
1316 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1317 } elsif ($self->{current_token}->{type} eq 'end tag') {
1318 $self->{content_model_flag} = 'PCDATA'; # MUST
1319 if ($self->{current_token}->{attributes}) {
1320 $self->{parse_error}->();
1321 }
1322 } else {
1323 die "$0: $self->{current_token}->{type}: Unknown token type";
1324 }
1325 $self->{state} = 'data';
1326
1327 if (@{$self->{char}}) {
1328 $self->{next_input_character} = shift @{$self->{char}};
1329 } else {
1330 $self->{set_next_input_character}->($self);
1331 }
1332
1333
1334 return ($self->{current_token}); # start tag or end tag
1335 undef $self->{current_token};
1336
1337 redo A;
1338 } elsif ($self->{next_input_character} == 0x003C or # <
1339 $self->{next_input_character} == -1) {
1340 $self->{parse_error}->();
1341 if ($self->{current_token}->{type} eq 'start tag') {
1342 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1343 } elsif ($self->{current_token}->{type} eq 'end tag') {
1344 $self->{content_model_flag} = 'PCDATA'; # MUST
1345 if ($self->{current_token}->{attributes}) {
1346 $self->{parse_error}->();
1347 }
1348 } else {
1349 die "$0: $self->{current_token}->{type}: Unknown token type";
1350 }
1351 $self->{state} = 'data';
1352 ## reconsume
1353
1354 return ($self->{current_token}); # start tag or end tag
1355 undef $self->{current_token};
1356
1357 redo A;
1358 } else {
1359 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1360 ## Stay in the state
1361
1362 if (@{$self->{char}}) {
1363 $self->{next_input_character} = shift @{$self->{char}};
1364 } else {
1365 $self->{set_next_input_character}->($self);
1366 }
1367
1368 redo A;
1369 }
1370 } elsif ($self->{state} eq 'entity in attribute value') {
1371 my $token = $self->_tokenize_attempt_to_consume_an_entity;
1372
1373 unless (defined $token) {
1374 $self->{current_attribute}->{value} .= '&';
1375 } else {
1376 $self->{current_attribute}->{value} .= $token->{data};
1377 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1378 }
1379
1380 $self->{state} = $self->{last_attribute_value_state};
1381 # next-input-character is already done
1382 redo A;
1383 } elsif ($self->{state} eq 'bogus comment') {
1384 ## (only happen if PCDATA state)
1385
1386 my $token = {type => 'comment', data => ''};
1387
1388 BC: {
1389 if ($self->{next_input_character} == 0x003E) { # >
1390 $self->{state} = 'data';
1391
1392 if (@{$self->{char}}) {
1393 $self->{next_input_character} = shift @{$self->{char}};
1394 } else {
1395 $self->{set_next_input_character}->($self);
1396 }
1397
1398
1399 return ($token);
1400
1401 redo A;
1402 } elsif ($self->{next_input_character} == -1) {
1403 $self->{state} = 'data';
1404 ## reconsume
1405
1406 return ($token);
1407
1408 redo A;
1409 } else {
1410 $token->{data} .= chr ($self->{next_input_character});
1411
1412 if (@{$self->{char}}) {
1413 $self->{next_input_character} = shift @{$self->{char}};
1414 } else {
1415 $self->{set_next_input_character}->($self);
1416 }
1417
1418 redo BC;
1419 }
1420 } # BC
1421 } elsif ($self->{state} eq 'markup declaration open') {
1422 ## (only happen if PCDATA state)
1423
1424 my @next_char;
1425 push @next_char, $self->{next_input_character};
1426
1427 if ($self->{next_input_character} == 0x002D) { # -
1428
1429 if (@{$self->{char}}) {
1430 $self->{next_input_character} = shift @{$self->{char}};
1431 } else {
1432 $self->{set_next_input_character}->($self);
1433 }
1434
1435 push @next_char, $self->{next_input_character};
1436 if ($self->{next_input_character} == 0x002D) { # -
1437 $self->{current_token} = {type => 'comment', data => ''};
1438 $self->{state} = 'comment';
1439
1440 if (@{$self->{char}}) {
1441 $self->{next_input_character} = shift @{$self->{char}};
1442 } else {
1443 $self->{set_next_input_character}->($self);
1444 }
1445
1446 redo A;
1447 }
1448 } elsif ($self->{next_input_character} == 0x0044 or # D
1449 $self->{next_input_character} == 0x0064) { # d
1450
1451 if (@{$self->{char}}) {
1452 $self->{next_input_character} = shift @{$self->{char}};
1453 } else {
1454 $self->{set_next_input_character}->($self);
1455 }
1456
1457 push @next_char, $self->{next_input_character};
1458 if ($self->{next_input_character} == 0x004F or # O
1459 $self->{next_input_character} == 0x006F) { # o
1460
1461 if (@{$self->{char}}) {
1462 $self->{next_input_character} = shift @{$self->{char}};
1463 } else {
1464 $self->{set_next_input_character}->($self);
1465 }
1466
1467 push @next_char, $self->{next_input_character};
1468 if ($self->{next_input_character} == 0x0043 or # C
1469 $self->{next_input_character} == 0x0063) { # c
1470
1471 if (@{$self->{char}}) {
1472 $self->{next_input_character} = shift @{$self->{char}};
1473 } else {
1474 $self->{set_next_input_character}->($self);
1475 }
1476
1477 push @next_char, $self->{next_input_character};
1478 if ($self->{next_input_character} == 0x0054 or # T
1479 $self->{next_input_character} == 0x0074) { # t
1480
1481 if (@{$self->{char}}) {
1482 $self->{next_input_character} = shift @{$self->{char}};
1483 } else {
1484 $self->{set_next_input_character}->($self);
1485 }
1486
1487 push @next_char, $self->{next_input_character};
1488 if ($self->{next_input_character} == 0x0059 or # Y
1489 $self->{next_input_character} == 0x0079) { # y
1490
1491 if (@{$self->{char}}) {
1492 $self->{next_input_character} = shift @{$self->{char}};
1493 } else {
1494 $self->{set_next_input_character}->($self);
1495 }
1496
1497 push @next_char, $self->{next_input_character};
1498 if ($self->{next_input_character} == 0x0050 or # P
1499 $self->{next_input_character} == 0x0070) { # p
1500
1501 if (@{$self->{char}}) {
1502 $self->{next_input_character} = shift @{$self->{char}};
1503 } else {
1504 $self->{set_next_input_character}->($self);
1505 }
1506
1507 push @next_char, $self->{next_input_character};
1508 if ($self->{next_input_character} == 0x0045 or # E
1509 $self->{next_input_character} == 0x0065) { # e
1510 ## ISSUE: What a stupid code this is!
1511 $self->{state} = 'DOCTYPE';
1512
1513 if (@{$self->{char}}) {
1514 $self->{next_input_character} = shift @{$self->{char}};
1515 } else {
1516 $self->{set_next_input_character}->($self);
1517 }
1518
1519 redo A;
1520 }
1521 }
1522 }
1523 }
1524 }
1525 }
1526 }
1527
1528 $self->{parse_error}->();
1529 $self->{next_input_character} = shift @next_char;
1530 unshift @{$self->{char}}, (@next_char);
1531 $self->{state} = 'bogus comment';
1532 redo A;
1533
1534 ## ISSUE: typos in spec: chacacters, is is a parse error
1535 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1536 } elsif ($self->{state} eq 'comment') {
1537 if ($self->{next_input_character} == 0x002D) { # -
1538 $self->{state} = 'comment dash';
1539
1540 if (@{$self->{char}}) {
1541 $self->{next_input_character} = shift @{$self->{char}};
1542 } else {
1543 $self->{set_next_input_character}->($self);
1544 }
1545
1546 redo A;
1547 } elsif ($self->{next_input_character} == -1) {
1548 $self->{parse_error}->();
1549 $self->{state} = 'data';
1550 ## reconsume
1551
1552 return ($self->{current_token}); # comment
1553 undef $self->{current_token};
1554
1555 redo A;
1556 } else {
1557 $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1558 ## Stay in the state
1559
1560 if (@{$self->{char}}) {
1561 $self->{next_input_character} = shift @{$self->{char}};
1562 } else {
1563 $self->{set_next_input_character}->($self);
1564 }
1565
1566 redo A;
1567 }
1568 } elsif ($self->{state} eq 'comment dash') {
1569 if ($self->{next_input_character} == 0x002D) { # -
1570 $self->{state} = 'comment end';
1571
1572 if (@{$self->{char}}) {
1573 $self->{next_input_character} = shift @{$self->{char}};
1574 } else {
1575 $self->{set_next_input_character}->($self);
1576 }
1577
1578 redo A;
1579 } elsif ($self->{next_input_character} == -1) {
1580 $self->{parse_error}->();
1581 $self->{state} = 'data';
1582 ## reconsume
1583
1584 return ($self->{current_token}); # comment
1585 undef $self->{current_token};
1586
1587 redo A;
1588 } else {
1589 $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1590 $self->{state} = 'comment';
1591
1592 if (@{$self->{char}}) {
1593 $self->{next_input_character} = shift @{$self->{char}};
1594 } else {
1595 $self->{set_next_input_character}->($self);
1596 }
1597
1598 redo A;
1599 }
1600 } elsif ($self->{state} eq 'comment end') {
1601 if ($self->{next_input_character} == 0x003E) { # >
1602 $self->{state} = 'data';
1603
1604 if (@{$self->{char}}) {
1605 $self->{next_input_character} = shift @{$self->{char}};
1606 } else {
1607 $self->{set_next_input_character}->($self);
1608 }
1609
1610
1611 return ($self->{current_token}); # comment
1612 undef $self->{current_token};
1613
1614 redo A;
1615 } elsif ($self->{next_input_character} == 0x002D) { # -
1616 $self->{parse_error}->();
1617 $self->{current_token}->{data} .= '-'; # comment
1618 ## Stay in the state
1619
1620 if (@{$self->{char}}) {
1621 $self->{next_input_character} = shift @{$self->{char}};
1622 } else {
1623 $self->{set_next_input_character}->($self);
1624 }
1625
1626 redo A;
1627 } elsif ($self->{next_input_character} == -1) {
1628 $self->{parse_error}->();
1629 $self->{state} = 'data';
1630 ## reconsume
1631
1632 return ($self->{current_token}); # comment
1633 undef $self->{current_token};
1634
1635 redo A;
1636 } else {
1637 $self->{parse_error}->();
1638 $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1639 $self->{state} = 'comment';
1640
1641 if (@{$self->{char}}) {
1642 $self->{next_input_character} = shift @{$self->{char}};
1643 } else {
1644 $self->{set_next_input_character}->($self);
1645 }
1646
1647 redo A;
1648 }
1649 } elsif ($self->{state} eq 'DOCTYPE') {
1650 if ($self->{next_input_character} == 0x0009 or # HT
1651 $self->{next_input_character} == 0x000A or # LF
1652 $self->{next_input_character} == 0x000B or # VT
1653 $self->{next_input_character} == 0x000C or # FF
1654 $self->{next_input_character} == 0x0020) { # SP
1655 $self->{state} = 'before DOCTYPE name';
1656
1657 if (@{$self->{char}}) {
1658 $self->{next_input_character} = shift @{$self->{char}};
1659 } else {
1660 $self->{set_next_input_character}->($self);
1661 }
1662
1663 redo A;
1664 } else {
1665 $self->{parse_error}->();
1666 $self->{state} = 'before DOCTYPE name';
1667 ## reconsume
1668 redo A;
1669 }
1670 } elsif ($self->{state} eq 'before DOCTYPE name') {
1671 if ($self->{next_input_character} == 0x0009 or # HT
1672 $self->{next_input_character} == 0x000A or # LF
1673 $self->{next_input_character} == 0x000B or # VT
1674 $self->{next_input_character} == 0x000C or # FF
1675 $self->{next_input_character} == 0x0020) { # SP
1676 ## Stay in the state
1677
1678 if (@{$self->{char}}) {
1679 $self->{next_input_character} = shift @{$self->{char}};
1680 } else {
1681 $self->{set_next_input_character}->($self);
1682 }
1683
1684 redo A;
1685 } elsif (0x0061 <= $self->{next_input_character} and
1686 $self->{next_input_character} <= 0x007A) { # a..z
1687 $self->{current_token} = {type => 'DOCTYPE',
1688 name => chr ($self->{next_input_character} - 0x0020),
1689 error => 1};
1690 $self->{state} = 'DOCTYPE name';
1691
1692 if (@{$self->{char}}) {
1693 $self->{next_input_character} = shift @{$self->{char}};
1694 } else {
1695 $self->{set_next_input_character}->($self);
1696 }
1697
1698 redo A;
1699 } elsif ($self->{next_input_character} == 0x003E) { # >
1700 $self->{parse_error}->();
1701 $self->{state} = 'data';
1702
1703 if (@{$self->{char}}) {
1704 $self->{next_input_character} = shift @{$self->{char}};
1705 } else {
1706 $self->{set_next_input_character}->($self);
1707 }
1708
1709
1710 return ({type => 'DOCTYPE', name => '', error => 1});
1711
1712 redo A;
1713 } elsif ($self->{next_input_character} == -1) {
1714 $self->{parse_error}->();
1715 $self->{state} = 'data';
1716 ## reconsume
1717
1718 return ({type => 'DOCTYPE', name => '', error => 1});
1719
1720 redo A;
1721 } else {
1722 $self->{current_token} = {type => 'DOCTYPE',
1723 name => chr ($self->{next_input_character}),
1724 error => 1};
1725 $self->{state} = 'DOCTYPE name';
1726
1727 if (@{$self->{char}}) {
1728 $self->{next_input_character} = shift @{$self->{char}};
1729 } else {
1730 $self->{set_next_input_character}->($self);
1731 }
1732
1733 redo A;
1734 }
1735 } elsif ($self->{state} eq 'DOCTYPE name') {
1736 if ($self->{next_input_character} == 0x0009 or # HT
1737 $self->{next_input_character} == 0x000A or # LF
1738 $self->{next_input_character} == 0x000B or # VT
1739 $self->{next_input_character} == 0x000C or # FF
1740 $self->{next_input_character} == 0x0020) { # SP
1741 $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1742 $self->{state} = 'after DOCTYPE name';
1743
1744 if (@{$self->{char}}) {
1745 $self->{next_input_character} = shift @{$self->{char}};
1746 } else {
1747 $self->{set_next_input_character}->($self);
1748 }
1749
1750 redo A;
1751 } elsif ($self->{next_input_character} == 0x003E) { # >
1752 $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1753 $self->{state} = 'data';
1754
1755 if (@{$self->{char}}) {
1756 $self->{next_input_character} = shift @{$self->{char}};
1757 } else {
1758 $self->{set_next_input_character}->($self);
1759 }
1760
1761
1762 return ($self->{current_token}); # DOCTYPE
1763 undef $self->{current_token};
1764
1765 redo A;
1766 } elsif (0x0061 <= $self->{next_input_character} and
1767 $self->{next_input_character} <= 0x007A) { # a..z
1768 $self->{current_token}->{name} .= chr ($self->{next_input_character} - 0x0020); # DOCTYPE
1769 #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
1770 ## Stay in the state
1771
1772 if (@{$self->{char}}) {
1773 $self->{next_input_character} = shift @{$self->{char}};
1774 } else {
1775 $self->{set_next_input_character}->($self);
1776 }
1777
1778 redo A;
1779 } elsif ($self->{next_input_character} == -1) {
1780 $self->{parse_error}->();
1781 $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
1782 $self->{state} = 'data';
1783 ## reconsume
1784
1785 return ($self->{current_token});
1786 undef $self->{current_token};
1787
1788 redo A;
1789 } else {
1790 $self->{current_token}->{name}
1791 .= chr ($self->{next_input_character}); # DOCTYPE
1792 #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
1793 ## Stay in the state
1794
1795 if (@{$self->{char}}) {
1796 $self->{next_input_character} = shift @{$self->{char}};
1797 } else {
1798 $self->{set_next_input_character}->($self);
1799 }
1800
1801 redo A;
1802 }
1803 } elsif ($self->{state} eq 'after DOCTYPE name') {
1804 if ($self->{next_input_character} == 0x0009 or # HT
1805 $self->{next_input_character} == 0x000A or # LF
1806 $self->{next_input_character} == 0x000B or # VT
1807 $self->{next_input_character} == 0x000C or # FF
1808 $self->{next_input_character} == 0x0020) { # SP
1809 ## Stay in the state
1810
1811 if (@{$self->{char}}) {
1812 $self->{next_input_character} = shift @{$self->{char}};
1813 } else {
1814 $self->{set_next_input_character}->($self);
1815 }
1816
1817 redo A;
1818 } elsif ($self->{next_input_character} == 0x003E) { # >
1819 $self->{state} = 'data';
1820
1821 if (@{$self->{char}}) {
1822 $self->{next_input_character} = shift @{$self->{char}};
1823 } else {
1824 $self->{set_next_input_character}->($self);
1825 }
1826
1827
1828 return ($self->{current_token}); # DOCTYPE
1829 undef $self->{current_token};
1830
1831 redo A;
1832 } elsif ($self->{next_input_character} == -1) {
1833 $self->{parse_error}->();
1834 $self->{state} = 'data';
1835 ## reconsume
1836
1837 return ($self->{current_token}); # DOCTYPE
1838 undef $self->{current_token};
1839
1840 redo A;
1841 } else {
1842 $self->{parse_error}->();
1843 $self->{current_token}->{error} = 1; # DOCTYPE
1844 $self->{state} = 'bogus DOCTYPE';
1845
1846 if (@{$self->{char}}) {
1847 $self->{next_input_character} = shift @{$self->{char}};
1848 } else {
1849 $self->{set_next_input_character}->($self);
1850 }
1851
1852 redo A;
1853 }
1854 } elsif ($self->{state} eq 'bogus DOCTYPE') {
1855 if ($self->{next_input_character} == 0x003E) { # >
1856 $self->{state} = 'data';
1857
1858 if (@{$self->{char}}) {
1859 $self->{next_input_character} = shift @{$self->{char}};
1860 } else {
1861 $self->{set_next_input_character}->($self);
1862 }
1863
1864
1865 return ($self->{current_token}); # DOCTYPE
1866 undef $self->{current_token};
1867
1868 redo A;
1869 } elsif ($self->{next_input_character} == -1) {
1870 $self->{parse_error}->();
1871 $self->{state} = 'data';
1872 ## reconsume
1873
1874 return ($self->{current_token}); # DOCTYPE
1875 undef $self->{current_token};
1876
1877 redo A;
1878 } else {
1879 ## Stay in the state
1880
1881 if (@{$self->{char}}) {
1882 $self->{next_input_character} = shift @{$self->{char}};
1883 } else {
1884 $self->{set_next_input_character}->($self);
1885 }
1886
1887 redo A;
1888 }
1889 } else {
1890 die "$0: $self->{state}: Unknown state";
1891 }
1892 } # A
1893
1894 die "$0: _get_next_token: unexpected case";
1895 } # _get_next_token
1896
## Attempt to consume an entity (character reference).
##
## On entry |$self->{next_input_character}| is the character that
## follows the U+0026 AMPERSAND (the "entity in attribute value" state
## advances past the "&" before calling this method).
##
## Returns:
##   - A hash reference |{type => 'character', data => $string}| if a
##     numeric or named reference was recognized.  The reference,
##     including a terminating ";" if there is one, has been consumed
##     and |next_input_character| is the character after it.
##   - |undef| if nothing was recognized.  In this case no character is
##     consumed: |next_input_character| is again the character that
##     followed the "&", and everything read ahead has been put back in
##     front of |$self->{char}|.
##
## |$self->{parse_error}| is invoked (with no argument) once for each
## parse error detected.
sub _tokenize_attempt_to_consume_an_entity ($) {
  my $self = shift;

  ## Consume one character: take it from the lookahead buffer
  ## |$self->{char}| if it is not empty, otherwise ask the input source.
  my $advance = sub {
    if (@{$self->{char}}) {
      $self->{next_input_character} = shift @{$self->{char}};
    } else {
      $self->{set_next_input_character}->($self);
    }
  };

  ## Un-consume characters: |$restored| becomes the next input character
  ## again; |@more| and then the current next input character are put
  ## back, in that order, in front of the lookahead buffer.
  my $push_back = sub {
    my ($restored, @more) = @_;
    unshift @{$self->{char}}, @more, $self->{next_input_character};
    $self->{next_input_character} = $restored;
  };

  ## Common tail of the decimal and hexadecimal cases: consume the
  ## optional ";" (its absence is a parse error) and build the token.
  my $finish_numeric = sub {
    my $code = shift;

    if ($self->{next_input_character} == 0x003B) { # ;
      $advance->();
    } else {
      $self->{parse_error}->();
    }

    ## TODO: check the definition for |a valid Unicode character|.
    if ($code > 1114111 or $code == 0) {
      $code = 0xFFFD; # REPLACEMENT CHARACTER
      ## ISSUE: Why this is not an error?
    }

    return {type => 'character', data => chr $code};
  };

  if ($self->{next_input_character} == 0x0023) { # #
    $advance->();

    if ($self->{next_input_character} == 0x0078 or # x
        $self->{next_input_character} == 0x0058) { # X
      my $x_char = $self->{next_input_character};
      $advance->();

      my $num; # stays undefined until at least one digit has been seen
      HEX: {
        my $c = $self->{next_input_character};
        my $digit;
        if (0x0030 <= $c and $c <= 0x0039) { # 0..9
          $digit = $c - 0x0030;
        } elsif (0x0061 <= $c and $c <= 0x0066) { # a..f
          ## ISSUE: the spec says U+0078, which is apparently incorrect
          $digit = $c - 0x0061 + 10;
        } elsif (0x0041 <= $c and $c <= 0x0046) { # A..F
          ## ISSUE: the spec says U+0058, which is apparently incorrect
          $digit = $c - 0x0041 + 10;
        } else {
          last HEX;
        }
        $num = ($num || 0) * 0x10 + $digit;
        $advance->();
        redo HEX;
      } # HEX

      unless (defined $num) { # no hexadecimal digit
        $self->{parse_error}->();
        ## Restore "#", "x" and the character read after the "x";
        ## the latter must not be lost.
        $push_back->(0x0023, $x_char);
        return undef;
      }

      return $finish_numeric->($num);
    } elsif (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
      my $code = $self->{next_input_character} - 0x0030;
      $advance->();

      while (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
        $code = $code * 10 + $self->{next_input_character} - 0x0030;
        $advance->();
      }

      return $finish_numeric->($code);
    } else {
      ## "&#" followed by something that is not a digit or "x"
      $self->{parse_error}->();
      $push_back->(0x0023);
      return undef;
    }
  } elsif ((0x0041 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x005A) or
           (0x0061 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x007A)) { # A..Z, a..z
    ## Code points read so far as candidate entity name characters
    my @name_chars = ($self->{next_input_character});
    my $entity_name = chr $self->{next_input_character};
    $advance->();

    ## Longest prefix of the candidate that is a known entity name
    my $matched_length;
    my $matched_value;

    while (@name_chars < 10 and
           ## NOTE: Some number greater than the maximum length of entity name
           ((0x0041 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x005A) or
            (0x0061 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x007A) or
            (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039))) {
      push @name_chars, $self->{next_input_character};
      $entity_name .= chr $self->{next_input_character};
      if (defined $entity_char->{$entity_name}) {
        $matched_length = @name_chars;
        $matched_value = $entity_char->{$entity_name};
      }
      $advance->();
    }

    if (defined $matched_length) {
      ## Only the matched name belongs to the reference; give back
      ## whatever was read beyond it so that it is tokenized normally
      ## (and so that a ";" after such trailing text is not swallowed).
      if ($matched_length < @name_chars) {
        $push_back->(@name_chars[$matched_length .. $#name_chars]);
      }

      if ($self->{next_input_character} == 0x003B) { # ;
        $advance->();
      } else {
        $self->{parse_error}->();
      }

      return {type => 'character', data => $matched_value};
    } else {
      $self->{parse_error}->();
      ## NOTE: No characters are consumed in the spec.  Putting the name
      ## back in the input (rather than emitting it as a separate token)
      ## keeps it in whatever context the caller is tokenizing, e.g.
      ## inside an attribute value.
      $push_back->(@name_chars);
      return undef;
    }
  } else {
    ## no characters are consumed
    $self->{parse_error}->();
    return undef;
  }
} # _tokenize_attempt_to_consume_an_entity
2081
## Prepare for tree construction: load What::NanoDOM on demand, create
## a fresh document object, store it as |$self->{document}| and switch
## its strict error checking off.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;

  require What::NanoDOM;

  my $document = What::NanoDOM::Document->new;
  $self->{document} = $document;

  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $document->strict_error_checking (0);
} # _initialize_tree_constructor
2090
## Finish tree construction: switch strict error checking of the
## document held in |$self->{document}| back on.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  ## TODO: Turn mutation events on
  $document->strict_error_checking (1);
} # _terminate_tree_constructor
2096
2097 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
2098
2099 sub _construct_tree ($) {
2100 my ($self) = @_;
2101
2102 ## When an interactive UA renders the $self->{document} available
2103 ## to the user, or when it begins accepting user input, is
2104 ## not defined.
2105
2106 ## Append a character: collect it and all subsequent consecutive
2107 ## characters and insert one Text node whose data is concatenation
2108 ## of all those characters. # MUST
2109
2110 my $token;
2111 $token = $self->_get_next_token;
2112
2113 my $phase = 'initial'; # MUST
2114
2115 my $open_elements = [];
2116 my $active_formatting_elements = [];
2117 my $head_element;
2118 my $form_element;
2119 my $insertion_mode = 'before head';
2120
my $reconstruct_active_formatting_elements = sub { # MUST
  ## Reconstruct the active formatting elements: starting from the end
  ## of |$active_formatting_elements|, find the run of trailing entries
  ## that are neither markers nor present in |$open_elements|, then
  ## re-open each of them (oldest first) by appending a shallow clone
  ## to the current node and pushing it onto |$open_elements|.
  ##
  ## Entries of both lists are |[$node, $tag_name]| pairs; a marker is
  ## an entry whose first member is the string |#marker|.
  ## Takes no arguments.

  ## Step 1
  return unless @$active_formatting_elements;

  ## Step 3
  ## NOTE: |$i| is a negative offset from the end of the list
  ## (-1 is the last entry) throughout this algorithm.
  my $i = -1;
  my $entry = $active_formatting_elements->[$i];

  ## Step 2
  return if $entry->[0] eq '#marker';
  for (@$open_elements) {
    return if $entry->[0] eq $_->[0];
  }

  ## Step 4
  S4: {
    ## No earlier entry exists: start creating from the first entry.
    last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];

    ## Step 5
    $i--;
    $entry = $active_formatting_elements->[$i];

    ## Step 6
    unless ($entry->[0] eq '#marker') {
      my $in_open_elements;
      OE: for (@$open_elements) {
        if ($entry->[0] eq $_->[0]) {
          $in_open_elements = 1;
          last OE;
        }
      }
      ## Neither a marker nor open: keep rewinding.
      redo S4 unless $in_open_elements;
    }

    ## Step 7
    $i++;
    $entry = $active_formatting_elements->[$i];
  } # S4

  S7: {
    ## Step 8
    my $clone = $entry->[0]->clone_node (0);

    ## Step 9
    $open_elements->[-1]->[0]->append_child ($clone);
    push @$open_elements, [$clone, $entry->[1]];

    ## Step 10
    $active_formatting_elements->[$i] = $open_elements->[-1];

    ## Step 11
    ## NOTE: |$i| is a negative offset, so the last entry is reached
    ## when it equals -1.  Comparing it with
    ## |$#$active_formatting_elements| would not match at the last
    ## entry and the loop would wrap around to index 0, cloning
    ## entries a second time.
    unless ($i == -1) {
      ## Step 7'
      $i++;
      $entry = $active_formatting_elements->[$i];

      redo S7;
    }
  } # S7
}; # $reconstruct_active_formatting_elements
2188
my $clear_up_to_marker = sub {
  ## Clear |$active_formatting_elements| up to the last marker: find
  ## the marker nearest to the end of the list and drop it together
  ## with every entry after it.  The list is left untouched when it
  ## contains no marker.
  my $idx = $#$active_formatting_elements;
  while ($idx >= 0) {
    if ($active_formatting_elements->[$idx]->[0] eq '#marker') {
      splice @$active_formatting_elements, $idx;
      return;
    }
    $idx--;
  }
  return;
}; # $clear_up_to_marker
2197
my $reset_insertion_mode = sub {
  ## Reset the insertion mode appropriately: walk |$open_elements|
  ## from the current node towards the root and set |$insertion_mode|
  ## according to the first element whose tag name decides the mode.
  ## Entries of |$open_elements| are |[$node, $tag_name]| pairs.

  ## Step 4..13: tag name -> insertion mode
  my %mode_for = (
    select => 'in select',
    td => 'in cell',
    th => 'in cell',
    tr => 'in row',
    tbody => 'in table body',
    thead => 'in table head',
    tfoot => 'in table foot',
    caption => 'in caption',
    colgroup => 'in column group',
    table => 'in table',
    head => 'in body', # not in head!
    body => 'in body',
    frameset => 'in frameset',
  );

  ## Step 1
  my $is_last;

  ## Step 2
  my $depth = -1;

  while (1) {
    my $node = $open_elements->[$depth];

    ## Step 3
    $is_last = 1 if $open_elements->[0]->[0] eq $node->[0];
    ## TODO: the element whose inner_html is set is neither td nor th, then $node = the element

    ## Step 4..13
    my $mode = $mode_for{$node->[1]};
    if (defined $mode) {
      $insertion_mode = $mode;
      return;
    }

    ## Step 14
    if ($node->[1] eq 'html') {
      $insertion_mode = defined $head_element ? 'after head' : 'before head';
      return;
    }

    ## Step 15
    if ($is_last) {
      $insertion_mode = 'in body';
      return;
    }

    ## Step 16, 17: move to the previous entry and try again
    $depth--;
  }
}; # $reset_insertion_mode
2250
my $style_start_tag = sub {
  ## Handle a |style| start tag: create the element, append it to the
  ## |head| element (when in the 'in head' mode and a head element
  ## exists) or to the current node, collect the following character
  ## tokens in CDATA mode as its text, and consume the token that
  ## follows the text.
  my $style_el = $self->{document}->create_element_ns
      (q<http://www.w3.org/1999/xhtml>, [undef, 'style']);

  ## $insertion_mode eq 'in head' and ... (always true)
  my $parent;
  if ($insertion_mode eq 'in head' and defined $head_element) {
    $parent = $head_element;
  } else {
    $parent = $open_elements->[-1]->[0];
  }
  $parent->append_child ($style_el);

  $self->{content_model_flag} = 'CDATA';

  ## Gather character tokens; stop if non-character token or
  ## tokenizer stops tokenising
  my $text = '';
  for ($token = $self->_get_next_token;
       $token->{type} eq 'character';
       $token = $self->_get_next_token) {
    $text .= $token->{data};
  }
  $style_el->manakai_append_text ($text) if length $text;

  $self->{content_model_flag} = 'PCDATA';

  unless ($token->{type} eq 'end tag' and $token->{tag_name} eq 'style') {
    $self->{parse_error}->();
    ## ISSUE: And ignore?
  }
  ## The token (the end tag if matched) is dropped either way.
  $token = $self->_get_next_token;
}; # $style_start_tag
2282
my $script_start_tag = sub {
  ## Handle a |script| start tag: create the element, collect the
  ## following character tokens in CDATA mode as its text, then append
  ## the element to the |head| element (when in the 'in head' mode and
  ## a head element exists) or to the current node, and consume the
  ## token that follows the text.
  my $script_el = $self->{document}->create_element_ns
      (q<http://www.w3.org/1999/xhtml>, [undef, 'script']);

  ## TODO: mark as "parser-inserted"

  $self->{content_model_flag} = 'CDATA';

  ## Gather character tokens; stop if non-character token or
  ## tokenizer stops tokenising
  my $text = '';
  $token = $self->_get_next_token;
  until ($token->{type} ne 'character') {
    $text .= $token->{data};
    $token = $self->_get_next_token;
  }
  $script_el->manakai_append_text ($text) if length $text;

  $self->{content_model_flag} = 'PCDATA';

  unless ($token->{type} eq 'end tag' and
          $token->{tag_name} eq 'script') {
    $self->{parse_error}->();
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  }

  ## TODO: inner_html mode then mark as "already executed" and skip
  {
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    my $parent = ($insertion_mode eq 'in head' and defined $head_element)
        ? $head_element
        : $open_elements->[-1]->[0];
    $parent->append_child ($script_el);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  }

  $token = $self->_get_next_token;
}; # $script_start_tag
2328
my $formatting_end_tag = sub {
  ## Process an end tag token for a formatting element whose tag name
  ## is given as the first argument, repeatedly applying the steps
  ## below until no matching formatting element is left (in which case
  ## a parse error is reported) or the simple case of Step 3 applies.
  ##
  ## Reads and updates the enclosing |$token|, |$open_elements| and
  ## |$active_formatting_elements|.  Entries of both lists are
  ## |[$node, $tag_name]| pairs; DOM methods must therefore be invoked
  ## on the first member of a pair, never on the pair itself.
  my $tag_name = shift;

  FET: {
    ## Step 1
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      $self->{parse_error}->();
      ## Ignore the token
      $token = $self->_get_next_token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#$open_elements) {
      my $node = $open_elements->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          $self->{parse_error}->();
          ## Ignore the token
          $token = $self->_get_next_token;
          return;
        }
      } elsif ({
                table => 1, caption => 1, td => 1, th => 1,
                button => 1, marquee => 1, object => 1, html => 1,
               }->{$node->[1]}) {
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      $self->{parse_error}->();
      ## NOTE: The formatting element is not necessarily the last
      ## entry of the list, so it is removed by index, not by |pop|.
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      $token = $self->_get_next_token; ## TODO: ok?
      return;
    }
    unless ($open_elements->[-1]->[0] eq $formatting_element->[0]) {
      $self->{parse_error}->();
    }

    ## Step 2
    ## NOTE: Scanning from the current node towards the formatting
    ## element and overwriting on every hit leaves the qualifying node
    ## that is nearest to the formatting element.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#$open_elements) {
      my $node = $open_elements->[$_];
      if (not $formatting_category->{$node->[1]} and
          #not $phrasing_category->{$node->[1]} and
          ($special_category->{$node->[1]} or
           $scoping_category->{$node->[1]})) {
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      splice @$open_elements, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      $token = $self->_get_next_token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $open_elements->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## NOTE: The bookmark is remembered as the node of the entry that
    ## precedes it.  |undef| means "at the beginning of the list"; a
    ## plain |[$index - 1]| would wrap around to the last entry when
    ## the formatting element is the first entry.
    my $bookmark_prev_el
        = $formatting_element_i_in_active > 0
            ? $active_formatting_elements
                  ->[$formatting_element_i_in_active - 1]->[0]
            : undef;

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $open_elements->[$node_i_in_open];

      ## Step 2
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        ## Not an active formatting element: drop it from the stack
        ## and look at the entry before it.
        splice @$open_elements, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      my $move_bookmark = $last_node->[0] eq $furthest_block->[0];

      ## Step 5
      if ($node->[0]->has_child_nodes ()) {
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $open_elements->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 4 (continued)
      ## NOTE: Done after Step 5 so that the bookmark refers to the
      ## node that is actually in the list once a clone has replaced
      ## the original entry.
      $bookmark_prev_el = $node->[0] if $move_bookmark;

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    $common_ancestor_node->[0]->append_child ($last_node->[0]);

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## Remove the formatting element and insert the clone just after
    ## the bookmarked entry (or at the beginning of the list).
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$active_formatting_elements, $_, 1;
        if (defined $i) {
          ## The bookmarked entry was after the removed one.
          $i--;
          last AFE;
        }
      } elsif (defined $bookmark_prev_el and
               $active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, (defined $i ? $i + 1 : 0), 0, $clone;

    ## Step 13
    ## Remove the formatting element and insert the clone just after
    ## the furthest block.
    undef $i;
    OE: for (reverse 0..$#$open_elements) {
      if ($open_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$open_elements, $_, 1;
        if (defined $i) {
          $i--;
          last OE;
        }
      } elsif ($open_elements->[$_]->[0] eq $furthest_block->[0]) {
        $i = $_;
      }
    } # OE
    ## NOTE: Length 0: the clone is inserted, it must not replace the
    ## entry that follows the furthest block.
    splice @$open_elements, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
2511
2512 my $in_body = sub {
2513 my $insert = shift;
2514 if ($token->{type} eq 'start tag') {
2515 if ($token->{tag_name} eq 'script') {
2516 $script_start_tag->();
2517 return;
2518 } elsif ($token->{tag_name} eq 'style') {
2519 $style_start_tag->();
2520 return;
2521 } elsif ({
2522 base => 1, link => 1, meta => 1, title => 1,
2523 }->{$token->{tag_name}}) {
2524 $self->{parse_error}->();
2525 ## NOTE: This is an "as if in head" code clone
2526 my $el;
2527
2528 $el = $self->{document}->create_element_ns
2529 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2530
2531 for my $attr_name (keys %{ $token->{attributes}}) {
2532 $el->set_attribute_ns (undef, [undef, $attr_name],
2533 $token->{attributes} ->{$attr_name}->{value});
2534 }
2535
2536 if (defined $head_element) {
2537 $head_element->append_child ($el);
2538 } else {
2539 $insert->($el);
2540 }
2541
2542 ## ISSUE: Issue on magical <base> in the spec
2543
2544 $token = $self->_get_next_token;
2545 return;
2546 } elsif ($token->{tag_name} eq 'body') {
2547 $self->{parse_error}->();
2548
2549 if (@$open_elements == 1 or
2550 $open_elements->[1]->[1] ne 'body') {
2551 ## Ignore the token
2552 } else {
2553 my $body_el = $open_elements->[1]->[0];
2554 for my $attr_name (keys %{$token->{attributes}}) {
2555 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
2556 $body_el->set_attribute_ns
2557 (undef, [undef, $attr_name],
2558 $token->{attributes}->{$attr_name}->{value});
2559 }
2560 }
2561 }
2562 $token = $self->_get_next_token;
2563 return;
2564 } elsif ({
2565 address => 1, blockquote => 1, center => 1, dir => 1,
2566 div => 1, dl => 1, fieldset => 1, listing => 1,
2567 menu => 1, ol => 1, p => 1, ul => 1,
2568 pre => 1,
2569 }->{$token->{tag_name}}) {
2570 ## has a p element in scope
2571 INSCOPE: for (reverse @$open_elements) {
2572 if ($_->[1] eq 'p') {
2573 unshift @{$self->{token}}, $token;
2574 $token = {type => 'end tag', tag_name => 'p'};
2575 return;
2576 } elsif ({
2577 table => 1, caption => 1, td => 1, th => 1,
2578 button => 1, marquee => 1, object => 1, html => 1,
2579 }->{$_->[1]}) {
2580 last INSCOPE;
2581 }
2582 } # INSCOPE
2583
2584
2585 {
2586 my $el;
2587
2588 $el = $self->{document}->create_element_ns
2589 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2590
2591 for my $attr_name (keys %{ $token->{attributes}}) {
2592 $el->set_attribute_ns (undef, [undef, $attr_name],
2593 $token->{attributes} ->{$attr_name}->{value});
2594 }
2595
2596 $insert->($el);
2597 push @$open_elements, [$el, $token->{tag_name}];
2598 }
2599
2600 if ($token->{tag_name} eq 'pre') {
2601 $token = $self->_get_next_token;
2602 if ($token->{type} eq 'character') {
2603 $token->{data} =~ s/^\x0A//;
2604 unless (length $token->{data}) {
2605 $token = $self->_get_next_token;
2606 }
2607 }
2608 } else {
2609 $token = $self->_get_next_token;
2610 }
2611 return;
2612 } elsif ($token->{tag_name} eq 'form') {
2613 if (defined $form_element) {
2614 $self->{parse_error}->();
2615 ## Ignore the token
2616 } else {
2617 ## has a p element in scope
2618 INSCOPE: for (reverse @$open_elements) {
2619 if ($_->[1] eq 'p') {
2620 unshift @{$self->{token}}, $token;
2621 $token = {type => 'end tag', tag_name => 'p'};
2622 return;
2623 } elsif ({
2624 table => 1, caption => 1, td => 1, th => 1,
2625 button => 1, marquee => 1, object => 1, html => 1,
2626 }->{$_->[1]}) {
2627 last INSCOPE;
2628 }
2629 } # INSCOPE
2630
2631
2632 {
2633 my $el;
2634
2635 $el = $self->{document}->create_element_ns
2636 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2637
2638 for my $attr_name (keys %{ $token->{attributes}}) {
2639 $el->set_attribute_ns (undef, [undef, $attr_name],
2640 $token->{attributes} ->{$attr_name}->{value});
2641 }
2642
2643 $insert->($el);
2644 push @$open_elements, [$el, $token->{tag_name}];
2645 }
2646
2647 $form_element = $open_elements->[-1]->[0];
2648 $token = $self->_get_next_token;
2649 return;
2650 }
2651 } elsif ($token->{tag_name} eq 'li') {
2652 ## has a p element in scope
2653 INSCOPE: for (reverse @$open_elements) {
2654 if ($_->[1] eq 'p') {
2655 unshift @{$self->{token}}, $token;
2656 $token = {type => 'end tag', tag_name => 'p'};
2657 return;
2658 } elsif ({
2659 table => 1, caption => 1, td => 1, th => 1,
2660 button => 1, marquee => 1, object => 1, html => 1,
2661 }->{$_->[1]}) {
2662 last INSCOPE;
2663 }
2664 } # INSCOPE
2665
2666 ## Step 1
2667 my $i = -1;
2668 my $node = $open_elements->[$i];
2669 LI: {
2670 ## Step 2
2671 if ($node->[1] eq 'li') {
2672 splice @$open_elements, $i;
2673 last LI;
2674 }
2675
2676 ## Step 3
2677 if (not $formatting_category->{$node->[1]} and
2678 #not $phrasing_category->{$node->[1]} and
2679 ($special_category->{$node->[1]} or
2680 $scoping_category->{$node->[1]}) and
2681 $node->[1] ne 'address' and $node->[1] ne 'div') {
2682 last LI;
2683 }
2684
2685 ## Step 4
2686 $i++;
2687 $node = $open_elements->[$i];
2688 redo LI;
2689 } # LI
2690
2691
2692 {
2693 my $el;
2694
2695 $el = $self->{document}->create_element_ns
2696 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2697
2698 for my $attr_name (keys %{ $token->{attributes}}) {
2699 $el->set_attribute_ns (undef, [undef, $attr_name],
2700 $token->{attributes} ->{$attr_name}->{value});
2701 }
2702
2703 $insert->($el);
2704 push @$open_elements, [$el, $token->{tag_name}];
2705 }
2706
2707 $token = $self->_get_next_token;
2708 return;
2709 } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
2710 ## has a p element in scope
2711 INSCOPE: for (reverse @$open_elements) {
2712 if ($_->[1] eq 'p') {
2713 unshift @{$self->{token}}, $token;
2714 $token = {type => 'end tag', tag_name => 'p'};
2715 return;
2716 } elsif ({
2717 table => 1, caption => 1, td => 1, th => 1,
2718 button => 1, marquee => 1, object => 1, html => 1,
2719 }->{$_->[1]}) {
2720 last INSCOPE;
2721 }
2722 } # INSCOPE
2723
2724 ## Step 1
2725 my $i = -1;
2726 my $node = $open_elements->[$i];
2727 LI: {
2728 ## Step 2
2729 if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
2730 splice @$open_elements, $i;
2731 last LI;
2732 }
2733
2734 ## Step 3
2735 if (not $formatting_category->{$node->[1]} and
2736 #not $phrasing_category->{$node->[1]} and
2737 ($special_category->{$node->[1]} or
2738 $scoping_category->{$node->[1]}) and
2739 $node->[1] ne 'address' and $node->[1] ne 'div') {
2740 last LI;
2741 }
2742
2743 ## Step 4
2744 $i++;
2745 $node = $open_elements->[$i];
2746 redo LI;
2747 } # LI
2748
2749
2750 {
2751 my $el;
2752
2753 $el = $self->{document}->create_element_ns
2754 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2755
2756 for my $attr_name (keys %{ $token->{attributes}}) {
2757 $el->set_attribute_ns (undef, [undef, $attr_name],
2758 $token->{attributes} ->{$attr_name}->{value});
2759 }
2760
2761 $insert->($el);
2762 push @$open_elements, [$el, $token->{tag_name}];
2763 }
2764
2765 $token = $self->_get_next_token;
2766 return;
2767 } elsif ($token->{tag_name} eq 'plaintext') {
2768 ## has a p element in scope
2769 INSCOPE: for (reverse @$open_elements) {
2770 if ($_->[1] eq 'p') {
2771 unshift @{$self->{token}}, $token;
2772 $token = {type => 'end tag', tag_name => 'p'};
2773 return;
2774 } elsif ({
2775 table => 1, caption => 1, td => 1, th => 1,
2776 button => 1, marquee => 1, object => 1, html => 1,
2777 }->{$_->[1]}) {
2778 last INSCOPE;
2779 }
2780 } # INSCOPE
2781
2782
2783 {
2784 my $el;
2785
2786 $el = $self->{document}->create_element_ns
2787 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2788
2789 for my $attr_name (keys %{ $token->{attributes}}) {
2790 $el->set_attribute_ns (undef, [undef, $attr_name],
2791 $token->{attributes} ->{$attr_name}->{value});
2792 }
2793
2794 $insert->($el);
2795 push @$open_elements, [$el, $token->{tag_name}];
2796 }
2797
2798
2799 $self->{content_model_flag} = 'PLAINTEXT';
2800
2801 $token = $self->_get_next_token;
2802 return;
2803 } elsif ({
2804 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
2805 }->{$token->{tag_name}}) {
2806 ## has a p element in scope
2807 INSCOPE: for (reverse 0..$#$open_elements) {
2808 my $node = $open_elements->[$_];
2809 if ($node->[1] eq 'p') {
2810 unshift @{$self->{token}}, $token;
2811 $token = {type => 'end tag', tag_name => 'p'};
2812 return;
2813 } elsif ({
2814 table => 1, caption => 1, td => 1, th => 1,
2815 button => 1, marquee => 1, object => 1, html => 1,
2816 }->{$node->[1]}) {
2817 last INSCOPE;
2818 }
2819 } # INSCOPE
2820
2821 ## has an element in scope
2822 my $i;
2823 INSCOPE: for (reverse 0..$#$open_elements) {
2824 my $node = $open_elements->[$_];
2825 if ({
2826 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
2827 }->{$node->[1]}) {
2828 $i = $_;
2829 last INSCOPE;
2830 } elsif ({
2831 table => 1, caption => 1, td => 1, th => 1,
2832 button => 1, marquee => 1, object => 1, html => 1,
2833 }->{$node->[1]}) {
2834 last INSCOPE;
2835 }
2836 } # INSCOPE
2837
2838 if (defined $i) {
2839 $self->{parse_error}->();
2840 splice @$open_elements, $i;
2841 }
2842
2843
2844 {
2845 my $el;
2846
2847 $el = $self->{document}->create_element_ns
2848 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2849
2850 for my $attr_name (keys %{ $token->{attributes}}) {
2851 $el->set_attribute_ns (undef, [undef, $attr_name],
2852 $token->{attributes} ->{$attr_name}->{value});
2853 }
2854
2855 $insert->($el);
2856 push @$open_elements, [$el, $token->{tag_name}];
2857 }
2858
2859
2860 $token = $self->_get_next_token;
2861 return;
2862 } elsif ($token->{tag_name} eq 'a') {
2863 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
2864 my $node = $active_formatting_elements->[$i];
2865 if ($node->[1] eq 'a') {
2866 $self->{parse_error}->();
2867
2868 unshift @{$self->{token}}, $token;
2869 $token = {type => 'end tag', tag_name => 'a'};
2870 $formatting_end_tag->($token->{tag_name});
2871
2872 splice @$active_formatting_elements, $i;
2873 OE: for (reverse 0..$#$open_elements) {
2874 if ($open_elements->[$_]->[0] eq $node->[0]) {
2875 splice @$open_elements, $_;
2876 last OE;
2877 }
2878 } # OE
2879 last AFE;
2880 } elsif ($node->[0] eq '#marker') {
2881 last AFE;
2882 }
2883 } # AFE
2884
2885 $reconstruct_active_formatting_elements->();
2886
2887
2888 {
2889 my $el;
2890
2891 $el = $self->{document}->create_element_ns
2892 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2893
2894 for my $attr_name (keys %{ $token->{attributes}}) {
2895 $el->set_attribute_ns (undef, [undef, $attr_name],
2896 $token->{attributes} ->{$attr_name}->{value});
2897 }
2898
2899 $insert->($el);
2900 push @$open_elements, [$el, $token->{tag_name}];
2901 }
2902
2903 push @$active_formatting_elements, $open_elements->[-1];
2904
2905 $token = $self->_get_next_token;
2906 return;
2907 } elsif ({
2908 b => 1, big => 1, em => 1, font => 1, i => 1,
2909 nobr => 1, s => 1, small => 1, strile => 1,
2910 strong => 1, tt => 1, u => 1,
2911 }->{$token->{tag_name}}) {
2912 $reconstruct_active_formatting_elements->();
2913
2914
2915 {
2916 my $el;
2917
2918 $el = $self->{document}->create_element_ns
2919 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2920
2921 for my $attr_name (keys %{ $token->{attributes}}) {
2922 $el->set_attribute_ns (undef, [undef, $attr_name],
2923 $token->{attributes} ->{$attr_name}->{value});
2924 }
2925
2926 $insert->($el);
2927 push @$open_elements, [$el, $token->{tag_name}];
2928 }
2929
2930 push @$active_formatting_elements, $open_elements->[-1];
2931
2932 $token = $self->_get_next_token;
2933 return;
2934 } elsif ($token->{tag_name} eq 'button') {
2935 ## has a button element in scope
2936 INSCOPE: for (reverse 0..$#$open_elements) {
2937 my $node = $open_elements->[$_];
2938 if ($node->[1] eq 'button') {
2939 $self->{parse_error}->();
2940 unshift @{$self->{token}}, $token;
2941 $token = {type => 'end tag', tag_name => 'button'};
2942 return;
2943 } elsif ({
2944 table => 1, caption => 1, td => 1, th => 1,
2945 button => 1, marquee => 1, object => 1, html => 1,
2946 }->{$node->[1]}) {
2947 last INSCOPE;
2948 }
2949 } # INSCOPE
2950
2951 $reconstruct_active_formatting_elements->();
2952
2953
2954 {
2955 my $el;
2956
2957 $el = $self->{document}->create_element_ns
2958 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2959
2960 for my $attr_name (keys %{ $token->{attributes}}) {
2961 $el->set_attribute_ns (undef, [undef, $attr_name],
2962 $token->{attributes} ->{$attr_name}->{value});
2963 }
2964
2965 $insert->($el);
2966 push @$open_elements, [$el, $token->{tag_name}];
2967 }
2968
2969 push @$active_formatting_elements, ['#marker', ''];
2970
2971 $token = $self->_get_next_token;
2972 return;
2973 } elsif ($token->{tag_name} eq 'marquee' or
2974 $token->{tag_name} eq 'object') {
2975 $reconstruct_active_formatting_elements->();
2976
2977
2978 {
2979 my $el;
2980
2981 $el = $self->{document}->create_element_ns
2982 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
2983
2984 for my $attr_name (keys %{ $token->{attributes}}) {
2985 $el->set_attribute_ns (undef, [undef, $attr_name],
2986 $token->{attributes} ->{$attr_name}->{value});
2987 }
2988
2989 $insert->($el);
2990 push @$open_elements, [$el, $token->{tag_name}];
2991 }
2992
2993 push @$active_formatting_elements, ['#marker', ''];
2994
2995 $token = $self->_get_next_token;
2996 return;
2997 } elsif ($token->{tag_name} eq 'xmp') {
2998 $reconstruct_active_formatting_elements->();
2999
3000
3001 {
3002 my $el;
3003
3004 $el = $self->{document}->create_element_ns
3005 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3006
3007 for my $attr_name (keys %{ $token->{attributes}}) {
3008 $el->set_attribute_ns (undef, [undef, $attr_name],
3009 $token->{attributes} ->{$attr_name}->{value});
3010 }
3011
3012 $insert->($el);
3013 push @$open_elements, [$el, $token->{tag_name}];
3014 }
3015
3016
3017 $self->{content_model_flag} = 'CDATA';
3018
3019 $token = $self->_get_next_token;
3020 return;
3021 } elsif ($token->{tag_name} eq 'tbale') {
3022 ## has a p element in scope
3023 INSCOPE: for (reverse @$open_elements) {
3024 if ($_->[1] eq 'p') {
3025 unshift @{$self->{token}}, $token;
3026 $token = {type => 'end tag', tag_name => 'p'};
3027 return;
3028 } elsif ({
3029 table => 1, caption => 1, td => 1, th => 1,
3030 button => 1, marquee => 1, object => 1, html => 1,
3031 }->{$_->[1]}) {
3032 last INSCOPE;
3033 }
3034 } # INSCOPE
3035
3036
3037 {
3038 my $el;
3039
3040 $el = $self->{document}->create_element_ns
3041 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3042
3043 for my $attr_name (keys %{ $token->{attributes}}) {
3044 $el->set_attribute_ns (undef, [undef, $attr_name],
3045 $token->{attributes} ->{$attr_name}->{value});
3046 }
3047
3048 $insert->($el);
3049 push @$open_elements, [$el, $token->{tag_name}];
3050 }
3051
3052
3053 $insertion_mode = 'in table';
3054
3055 $token = $self->_get_next_token;
3056 return;
3057 } elsif ({
3058 area => 1, basefont => 1, bgsound => 1, br => 1,
3059 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
3060 image => 1,
3061 }->{$token->{tag_name}}) {
3062 if ($token->{tag_name} eq 'image') {
3063 $self->{parse_error}->();
3064 $token->{tag_name} = 'img';
3065 }
3066
3067 $reconstruct_active_formatting_elements->();
3068
3069
3070 {
3071 my $el;
3072
3073 $el = $self->{document}->create_element_ns
3074 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3075
3076 for my $attr_name (keys %{ $token->{attributes}}) {
3077 $el->set_attribute_ns (undef, [undef, $attr_name],
3078 $token->{attributes} ->{$attr_name}->{value});
3079 }
3080
3081 $insert->($el);
3082 push @$open_elements, [$el, $token->{tag_name}];
3083 }
3084
3085 pop @$open_elements;
3086
3087 $token = $self->_get_next_token;
3088 return;
3089 } elsif ($token->{tag_name} eq 'hr') {
3090 ## has a p element in scope
3091 INSCOPE: for (reverse @$open_elements) {
3092 if ($_->[1] eq 'p') {
3093 unshift @{$self->{token}}, $token;
3094 $token = {type => 'end tag', tag_name => 'p'};
3095 return;
3096 } elsif ({
3097 table => 1, caption => 1, td => 1, th => 1,
3098 button => 1, marquee => 1, object => 1, html => 1,
3099 }->{$_->[1]}) {
3100 last INSCOPE;
3101 }
3102 } # INSCOPE
3103
3104
3105 {
3106 my $el;
3107
3108 $el = $self->{document}->create_element_ns
3109 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3110
3111 for my $attr_name (keys %{ $token->{attributes}}) {
3112 $el->set_attribute_ns (undef, [undef, $attr_name],
3113 $token->{attributes} ->{$attr_name}->{value});
3114 }
3115
3116 $insert->($el);
3117 push @$open_elements, [$el, $token->{tag_name}];
3118 }
3119
3120 pop @$open_elements;
3121
3122 $token = $self->_get_next_token;
3123 return;
3124 } elsif ($token->{tag_name} eq 'input') {
3125 $reconstruct_active_formatting_elements->();
3126
3127
3128 {
3129 my $el;
3130
3131 $el = $self->{document}->create_element_ns
3132 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3133
3134 for my $attr_name (keys %{ $token->{attributes}}) {
3135 $el->set_attribute_ns (undef, [undef, $attr_name],
3136 $token->{attributes} ->{$attr_name}->{value});
3137 }
3138
3139 $insert->($el);
3140 push @$open_elements, [$el, $token->{tag_name}];
3141 }
3142
3143 ## TODO: associate with $form_element if defined
3144 pop @$open_elements;
3145
3146 $token = $self->_get_next_token;
3147 return;
3148 } elsif ($token->{tag_name} eq 'isindex') {
3149 $self->{parse_error}->();
3150
3151 if (defined $form_element) {
3152 ## Ignore the token
3153 $token = $self->_get_next_token;
3154 return;
3155 } else {
3156 my $at = $token->{attributes};
3157 $at->{name} = {name => 'name', value => 'isindex'};
3158 my @tokens = (
3159 {type => 'start tag', tag_name => 'form'},
3160 {type => 'start tag', tag_name => 'hr'},
3161 {type => 'start tag', tag_name => 'p'},
3162 {type => 'start tag', tag_name => 'label'},
3163 {type => 'character',
3164 data => 'This is a searchable index. Insert your search keywords here: '}, # SHOULD
3165 ## TODO: make this configurable
3166 {type => 'start tag', tag_name => 'input', attributes => $at},
3167 #{type => 'character', data => ''}, # SHOULD
3168 {type => 'end tag', tag_name => 'label'},
3169 {type => 'end tag', tag_name => 'p'},
3170 {type => 'start tag', tag_name => 'hr'},
3171 {type => 'end tag', tag_name => 'form'},
3172 );
3173 $token = shift @tokens;
3174 unshift @{$self->{token}}, (@tokens);
3175 return;
3176 }
3177 } elsif ({
3178 textarea => 1,
3179 noembed => 1,
3180 noframes => 1,
3181 noscript => 0, ## TODO: 1 if scripting is enabled
3182 }->{$token->{tag_name}}) {
3183 my $tag_name = $token->{tag_name};
3184 my $el;
3185
3186 $el = $self->{document}->create_element_ns
3187 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3188
3189 for my $attr_name (keys %{ $token->{attributes}}) {
3190 $el->set_attribute_ns (undef, [undef, $attr_name],
3191 $token->{attributes} ->{$attr_name}->{value});
3192 }
3193
3194
3195 if ($token->{tag_name} eq 'textarea') {
3196 ## TODO: form_element if defined
3197 $self->{content_model_flag} = 'RCDATA';
3198 } else {
3199 $self->{content_model_flag} = 'CDATA';
3200 }
3201
3202 $insert->($el);
3203
3204 my $text = '';
3205 $token = $self->_get_next_token;
3206 while ($token->{type} eq 'character') {
3207 $text .= $token->{data};
3208 $token = $self->_get_next_token;
3209 }
3210 if (length $text) {
3211 $el->manakai_append_text ($text);
3212 }
3213
3214 $self->{content_model_flag} = 'PCDATA';
3215
3216 if ($token->{type} eq 'end tag' and
3217 $token->{tag_name} eq $tag_name) {
3218 ## Ignore the token
3219 } else {
3220 $self->{parse_error}->();
3221 ## ISSUE: And ignore?
3222 }
3223 $token = $self->_get_next_token;
3224 return;
3225 } elsif ($token->{type} eq 'select') {
3226 $reconstruct_active_formatting_elements->();
3227
3228
3229 {
3230 my $el;
3231
3232 $el = $self->{document}->create_element_ns
3233 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3234
3235 for my $attr_name (keys %{ $token->{attributes}}) {
3236 $el->set_attribute_ns (undef, [undef, $attr_name],
3237 $token->{attributes} ->{$attr_name}->{value});
3238 }
3239
3240 $insert->($el);
3241 push @$open_elements, [$el, $token->{tag_name}];
3242 }
3243
3244
3245 $insertion_mode = 'in select';
3246 $token = $self->_get_next_token;
3247 return;
3248 } elsif ({
3249 caption => 1, col => 1, colgroup => 1, frame => 1,
3250 frameset => 1, head => 1, option => 1, optgroup => 1,
3251 tbody => 1, td => 1, tfoot => 1, th => 1,
3252 thead => 1, tr => 1,
3253 }->{$token->{tag_name}}) {
3254 $self->{parse_error}->();
3255 ## Ignore the token
3256 $token = $self->_get_next_token;
3257 return;
3258
3259 ## ISSUE: An issue on HTML5 new elements in the spec.
3260 } else {
3261 $reconstruct_active_formatting_elements->();
3262
3263
3264 {
3265 my $el;
3266
3267 $el = $self->{document}->create_element_ns
3268 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3269
3270 for my $attr_name (keys %{ $token->{attributes}}) {
3271 $el->set_attribute_ns (undef, [undef, $attr_name],
3272 $token->{attributes} ->{$attr_name}->{value});
3273 }
3274
3275 $insert->($el);
3276 push @$open_elements, [$el, $token->{tag_name}];
3277 }
3278
3279
3280 $token = $self->_get_next_token;
3281 return;
3282 }
3283 } elsif ($token->{type} eq 'end tag') {
3284 if ($token->{tag_name} eq 'body') {
3285 if (@$open_elements > 1 and $open_elements->[1]->[1] eq 'body') {
3286 ## ISSUE: There is an issue in the spec.
3287 if ($open_elements->[-1]->[1] ne 'body') {
3288 $self->{parse_error}->();
3289 }
3290 $insertion_mode = 'after body';
3291 $token = $self->_get_next_token;
3292 return;
3293 } else {
3294 $self->{parse_error}->();
3295 ## Ignore the token
3296 $token = $self->_get_next_token;
3297 return;
3298 }
3299 } elsif ($token->{tag_name} eq 'html') {
3300 if (@$open_elements > 1 and $open_elements->[1]->[1] eq 'body') {
3301 ## ISSUE: There is an issue in the spec.
3302 if ($open_elements->[-1]->[1] ne 'body') {
3303 $self->{parse_error}->();
3304 }
3305 $insertion_mode = 'after body';
3306 ## reprocess
3307 return;
3308 } else {
3309 $self->{parse_error}->();
3310 ## Ignore the token
3311 $token = $self->_get_next_token;
3312 return;
3313 }
3314 } elsif ({
3315 address => 1, blockquote => 1, center => 1, dir => 1,
3316 div => 1, dl => 1, fieldset => 1, listing => 1,
3317 menu => 1, ol => 1, pre => 1, ul => 1,
3318 form => 1,
3319 p => 1,
3320 dd => 1, dt => 1, li => 1,
3321 button => 1, marquee => 1, object => 1,
3322 }->{$token->{tag_name}}) {
3323 ## has an element in scope
3324 my $i;
3325 INSCOPE: for (reverse 0..$#$open_elements) {
3326 my $node = $open_elements->[$_];
3327 if ($node->[1] eq $token->{tag_name}) {
3328 ## generate implied end tags
3329 if ({
3330 dd => ($token->{tag_name} ne 'dd'),
3331 dt => ($token->{tag_name} ne 'dt'),
3332 li => ($token->{tag_name} ne 'li'),
3333 p => ($token->{tag_name} ne 'p'),
3334 td => 1, th => 1, tr => 1,
3335 }->{$open_elements->[-1]->[1]}) {
3336 unshift @{$self->{token}}, $token;
3337 $token = {type => 'end tag',
3338 tag_name => $open_elements->[-1]->[1]}; # MUST
3339 return;
3340 }
3341 $i = $_;
3342 last INSCOPE unless $token->{tag_name} eq 'p';
3343 } elsif ({
3344 table => 1, caption => 1, td => 1, th => 1,
3345 button => 1, marquee => 1, object => 1, html => 1,
3346 }->{$node->[1]}) {
3347 last INSCOPE;
3348 }
3349 } # INSCOPE
3350
3351 if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
3352 $self->{parse_error}->();
3353 }
3354
3355 splice @$open_elements, $i if defined $i;
3356 undef $form_element if $token->{tag_name} eq 'form';
3357 $clear_up_to_marker->()
3358 if {
3359 button => 1, marquee => 1, object => 1,
3360 }->{$token->{tag_name}};
3361 $token = $self->_get_next_token;
3362 return;
3363 } elsif ({
3364 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
3365 }->{$token->{tag_name}}) {
3366 ## has an element in scope
3367 my $i;
3368 INSCOPE: for (reverse 0..$#$open_elements) {
3369 my $node = $open_elements->[$_];
3370 if ({
3371 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
3372 }->{$node->[1]}) {
3373 ## generate implied end tags
3374 if ({
3375 dd => 1, dt => 1, li => 1, p => 1,
3376 td => 1, th => 1, tr => 1,
3377 }->{$open_elements->[-1]->[1]}) {
3378 unshift @{$self->{token}}, $token;
3379 $token = {type => 'end tag',
3380 tag_name => $open_elements->[-1]->[1]}; # MUST
3381 return;
3382 }
3383 $i = $_;
3384 last INSCOPE;
3385 } elsif ({
3386 table => 1, caption => 1, td => 1, th => 1,
3387 button => 1, marquee => 1, object => 1, html => 1,
3388 }->{$node->[1]}) {
3389 last INSCOPE;
3390 }
3391 } # INSCOPE
3392
3393 if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
3394 $self->{parse_error}->();
3395 }
3396
3397 splice @$open_elements, $i if defined $i;
3398 $token = $self->_get_next_token;
3399 return;
} elsif ({
    a => 1,
    b => 1, big => 1, em => 1, font => 1, i => 1,
    nobr => 1, s => 1, small => 1, strike => 1,
    strong => 1, tt => 1, u => 1,
  }->{$token->{tag_name}}) {
  ## End tag of a formatting element: delegate to the
  ## |$formatting_end_tag| closure, passing the tag name.
  ## NOTE: The key used to be misspelled |strile|, so |</strike>| never
  ## reached this arm and was handled by the generic end tag steps.
  $formatting_end_tag->($token->{tag_name});
  return;
3408 } elsif ({
3409 caption => 1, col => 1, colgroup => 1, frame => 1,
3410 frameset => 1, head => 1, option => 1, optgroup => 1,
3411 tbody => 1, td => 1, tfoot => 1, th => 1,
3412 thead => 1, tr => 1,
3413 area => 1, basefont => 1, bgsound => 1, br => 1,
3414 embed => 1, hr => 1, iframe => 1, image => 1,
3415 img => 1, input => 1, isindex=> 1, noembed => 1,
3416 noframes => 1, param => 1, select => 1, spacer => 1,
3417 table => 1, textarea => 1, wbr => 1,
3418 noscript => 0, ## TODO: if scripting is enabled
3419 }->{$token->{tag_name}}) {
3420 $self->{parse_error}->();
3421 ## Ignore the token
3422 $token = $self->_get_next_token;
3423 return;
3424
3425 ## ISSUE: Issue on HTML5 new elements in spec
3426
3427 } else {
3428 ## Step 1
3429 my $node_i = -1;
3430 my $node = $open_elements->[$node_i];
3431
3432 ## Step 2
3433 S2: {
3434 if ($node->[1] eq $token->{tag_name}) {
3435 ## Step 1
3436 ## generate implied end tags
3437 if ({
3438 dd => 1, dt => 1, li => 1, p => 1,
3439 td => 1, th => 1, tr => 1,
3440 }->{$open_elements->[-1]->[1]}) {
3441 unshift @{$self->{token}}, $token;
3442 $token = {type => 'end tag',
3443 tag_name => $open_elements->[-1]->[1]}; # MUST
3444 return;
3445 }
3446
3447 ## Step 2
3448 if ($token->{tag_name} ne $open_elements->[-1]->[1]) {
3449 $self->{parse_error}->();
3450 }
3451
3452 ## Step 3
3453 splice @$open_elements, $node_i;
3454 last S2;
3455 } else {
3456 ## Step 3
3457 if (not $formatting_category->{$node->[1]} and
3458 #not $phrasing_category->{$node->[1]} and
3459 ($special_category->{$node->[1]} or
3460 $scoping_category->{$node->[1]})) {
3461 $self->{parse_error}->();
3462 ## Ignore the token
3463 $token = $self->_get_next_token;
3464 last S2;
3465 }
3466 }
3467
3468 ## Step 4
3469 $node_i--;
3470 $node = $open_elements->[$node_i];
3471
3472 ## Step 5;
3473 redo S2;
3474 } # S2
3475 }
3476 }
3477 }; # $in_body
3478
3479 B: {
3480 if ($phase eq 'initial') {
3481 if ($token->{type} eq 'DOCTYPE') {
3482 if ($token->{error}) {
3483 ## ISSUE: Spec currently left this case undefined.
3484 }
3485 my $doctype = $self->{document}->create_document_type_definition
3486 ($token->{name});
3487 $self->{document}->append_child ($doctype);
3488 $phase = 'root element';
3489 $token = $self->_get_next_token;
3490 redo B;
3491 } elsif ({
3492 comment => 1,
3493 'start tag' => 1,
3494 'end tag' => 1,
3495 'end-of-file' => 1,
3496 }->{$token->{type}}) {
3497 ## ISSUE: Spec currently left this case undefined.
3498 $phase = 'root element';
3499 ## reprocess
3500 redo B;
3501 } elsif ($token->{type} eq 'character') {
3502 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3503 $self->{document}->manakai_append_text ($1);
3504 ## ISSUE: DOM3 Core does not allow Document > Text
3505 unless (length $token->{data}) {
3506 ## Stay in the phase
3507 $token = $self->_get_next_token;
3508 redo B;
3509 }
3510 }
3511 ## ISSUE: Spec currently left this case undefined.
3512 $phase = 'root element';
3513 ## reprocess
3514 redo B;
3515 } else {
3516 die "$0: $token->{type}: Unknown token";
3517 }
3518 } elsif ($phase eq 'root element') {
3519 if ($token->{type} eq 'DOCTYPE') {
3520 $self->{parse_error}->();
3521 ## Ignore the token
3522 ## Stay in the phase
3523 $token = $self->_get_next_token;
3524 redo B;
3525 } elsif ($token->{type} eq 'comment') {
3526 my $comment = $self->{document}->create_comment ($token->{data});
3527 $self->{document}->append_child ($comment);
3528 ## Stay in the phase
3529 $token = $self->_get_next_token;
3530 redo B;
3531 } elsif ($token->{type} eq 'character') {
3532 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3533 $self->{document}->manakai_append_text ($1);
3534 ## ISSUE: DOM3 Core does not allow Document > Text
3535 unless (length $token->{data}) {
3536 ## Stay in the phase
3537 $token = $self->_get_next_token;
3538 redo B;
3539 }
3540 }
3541 #
3542 } elsif ({
3543 'start tag' => 1,
3544 'end tag' => 1,
3545 'end-of-file' => 1,
3546 }->{$token->{type}}) {
3547 ## ISSUE: There is an issue in the spec
3548 #
3549 } else {
3550 die "$0: $token->{type}: Unknown token";
3551 }
3552 my $root_element;
3553 $root_element = $self->{document}->create_element_ns
3554 (q<http://www.w3.org/1999/xhtml>, [undef, 'html']);
3555
3556 $self->{document}->append_child ($root_element);
3557 $open_elements = [[$root_element, 'html']];
3558 $phase = 'main';
3559 ## reprocess
3560 redo B;
3561 } elsif ($phase eq 'main') {
3562 if ($token->{type} eq 'DOCTYPE') {
3563 $self->{parse_error}->();
3564 ## Ignore the token
3565 ## Stay in the phase
3566 $token = $self->_get_next_token;
3567 redo B;
} elsif ($token->{type} eq 'start tag' and
         $token->{tag_name} eq 'html') {
  ## A start tag |html| in the main phase: merge its attributes into
  ## the root element (the bottom entry of the stack of open elements),
  ## never overwriting an attribute the root element already has.
  ## TODO: unless it is the first start tag token, parse-error
  my $top_el = $open_elements->[0]->[0];
  for my $attr_name (keys %{$token->{attributes}}) {
    unless ($top_el->has_attribute_ns (undef, $attr_name)) {
      ## The token's attribute table is keyed by attribute name; each
      ## entry holds the value under |value|.  The lookup must go
      ## through |$attr_name|, otherwise every attribute would receive
      ## the record of an attribute literally named "value" (or undef).
      $top_el->set_attribute_ns (undef, [undef, $attr_name],
                                 $token->{attributes}->{$attr_name}->{value});
    }
  }
  $token = $self->_get_next_token;
  redo B;
3580 } elsif ($token->{type} eq 'end-of-file') {
3581 ## Generate implied end tags
3582 if ({
3583 dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
3584 }->{$open_elements->[-1]->[1]}) {
3585 unshift @{$self->{token}}, $token;
3586 $token = {type => 'end tag', tag_name => $open_elements->[-1]->[1]};
3587 redo B;
3588 }
3589
3590 if (@$open_elements > 2 or
3591 (@$open_elements == 2 and $open_elements->[1]->[1] ne 'body')) {
3592 $self->{parse_error}->();
3593 } else {
3594 ## TODO: inner_html parser and @$open_elements > 1 and $open_elements->[1] ne 'body', then parse-error
3595 }
3596
3597 ## Stop parsing
3598 last B;
3599
3600 ## ISSUE: There is an issue in the spec.
3601 } else {
3602 if ($insertion_mode eq 'before head') {
3603 if ($token->{type} eq 'character') {
3604 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3605 $open_elements->[-1]->[0]->manakai_append_text ($1);
3606 unless (length $token->{data}) {
3607 $token = $self->_get_next_token;
3608 redo B;
3609 }
3610 }
3611 ## As if <head>
3612
3613 $head_element = $self->{document}->create_element_ns
3614 (q<http://www.w3.org/1999/xhtml>, [undef, 'head']);
3615
3616 $open_elements->[-1]->[0]->append_child ($head_element);
3617 push @$open_elements, [$head_element, 'head'];
3618 $insertion_mode = 'in head';
3619 ## reprocess
3620 redo B;
3621 } elsif ($token->{type} eq 'comment') {
3622 my $comment = $self->{document}->create_comment ($token->{data});
3623 $open_elements->[-1]->[0]->append_child ($comment);
3624 $token = $self->_get_next_token;
3625 redo B;
3626 } elsif ($token->{type} eq 'start tag') {
3627 my $attr = $token->{tag_name} eq 'head' ? $token->{attributes} : {};
3628
3629 $head_element = $self->{document}->create_element_ns
3630 (q<http://www.w3.org/1999/xhtml>, [undef, 'head']);
3631
3632 for my $attr_name (keys %{ $attr}) {
3633 $head_element->set_attribute_ns (undef, [undef, $attr_name],
3634 $attr ->{$attr_name}->{value});
3635 }
3636
3637 $open_elements->[-1]->[0]->append_child ($head_element);
3638 push @$open_elements, [$head_element, 'head'];
3639 $insertion_mode = 'in head';
3640 if ($token->{tag_name} eq 'head') {
3641 $token = $self->_get_next_token;
3642 #} elsif ({
3643 # base => 1, link => 1, meta => 1,
3644 # script => 1, style => 1, title => 1,
3645 # }->{$token->{tag_name}}) {
3646 # ## reprocess
3647 } else {
3648 ## reprocess
3649 }
3650 redo B;
} elsif ($token->{type} eq 'end tag') {
  ## End tags seen in the "before head" insertion mode.
  if ($token->{tag_name} eq 'html') {
    ## As if <head>: create an attribute-less |head| element, make it
    ## the current node and let the "in head" mode see the same token.
    $head_element = $self->{document}->create_element_ns
        (q<http://www.w3.org/1999/xhtml>, [undef, 'head']);

    $open_elements->[-1]->[0]->append_child ($head_element);
    push @$open_elements, [$head_element, 'head'];
    $insertion_mode = 'in head';
    ## reprocess
    redo B;
  } else {
    $self->{parse_error}->();
    ## Ignore the token.  The next token has to be fetched before
    ## restarting the dispatch block: a bare |redo B| with the same
    ## token would take this very branch again and never terminate.
    $token = $self->_get_next_token;
    redo B;
  }
3668 } else {
3669 die "$0: $token->{type}: Unknown type";
3670 }
3671 } elsif ($insertion_mode eq 'in head') {
3672 if ($token->{type} eq 'character') {
3673 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3674 $open_elements->[-1]->[0]->manakai_append_text ($1);
3675 unless (length $token->{data}) {
3676 $token = $self->_get_next_token;
3677 redo B;
3678 }
3679 }
3680
3681 #
3682 } elsif ($token->{type} eq 'comment') {
3683 my $comment = $self->{document}->create_comment ($token->{data});
3684 $open_elements->[-1]->[0]->append_child ($comment);
3685 $token = $self->_get_next_token;
3686 redo B;
3687 } elsif ($token->{type} eq 'start tag') {
3688 if ($token->{tag_name} eq 'title') {
3689 my $title_el;
3690 $title_el = $self->{document}->create_element_ns
3691 (q<http://www.w3.org/1999/xhtml>, [undef, 'title']);
3692
3693 (defined $head_element ? $head_element : $open_elements->[-1]->[0])
3694 ->append_child ($title_el);
3695 $self->{content_model_flag} = 'RCDATA';
3696
3697 my $text = '';
3698 $token = $self->_get_next_token;
3699 while ($token->{type} eq 'character') {
3700 $text .= $token->{data};
3701 $token = $self->_get_next_token;
3702 }
3703 if (length $text) {
3704 $title_el->manakai_append_text ($text);
3705 }
3706
3707 $self->{content_model_flag} = 'PCDATA';
3708
3709 if ($token->{type} eq 'end tag' and
3710 $token->{tag_name} eq 'title') {
3711 ## Ignore the token
3712 } else {
3713 $self->{parse_error}->();
3714 ## ISSUE: And ignore?
3715 }
3716 $token = $self->_get_next_token;
3717 redo B;
3718 } elsif ($token->{tag_name} eq 'style') {
3719 $style_start_tag->();
3720 redo B;
3721 } elsif ($token->{tag_name} eq 'script') {
3722 $script_start_tag->();
3723 redo B;
3724 } elsif ({base => 1, link => 1, meta => 1}->{$token->{tag_name}}) {
3725 ## NOTE: There are "as if in head" code clones
3726 my $el;
3727
3728 $el = $self->{document}->create_element_ns
3729 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3730
3731 for my $attr_name (keys %{ $token->{attributes}}) {
3732 $el->set_attribute_ns (undef, [undef, $attr_name],
3733 $token->{attributes} ->{$attr_name}->{value});
3734 }
3735
3736 (defined $head_element ? $head_element : $open_elements->[-1]->[0])
3737 ->append_child ($el);
3738
3739 ## ISSUE: Issue on magical <base> in the spec
3740
3741 $token = $self->_get_next_token;
3742 redo B;
3743 } elsif ($token->{tag_name} eq 'head') {
3744 $self->{parse_error}->();
3745 ## Ignore the token
3746 $token = $self->_get_next_token;
3747 redo B;
3748 } else {
3749 #
3750 }
3751 } elsif ($token->{type} eq 'end tag') {
3752 if ($token->{tag_name} eq 'head') {
3753 if ($open_elements->[-1]->[1] eq 'head') {
3754 pop @$open_elements;
3755 } else {
3756 $self->{parse_error}->();
3757 }
3758 $insertion_mode = 'after head';
3759 $token = $self->_get_next_token;
3760 redo B;
3761 } elsif ($token->{tag_name} eq 'html') {
3762 #
3763 } else {
3764 $self->{parse_error}->();
3765 ## Ignore the token
3766 $token = $self->_get_next_token;
3767 redo B;
3768 }
3769 } else {
3770 #
3771 }
3772
3773 if ($open_elements->[-1]->[1] eq 'head') {
3774 ## As if </head>
3775 pop @$open_elements;
3776 }
3777 $insertion_mode = 'after head';
3778 ## reprocess
3779 redo B;
3780
3781 ## ISSUE: An issue in the spec.
3782 } elsif ($insertion_mode eq 'after head') {
3783 if ($token->{type} eq 'character') {
3784 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3785 $open_elements->[-1]->[0]->manakai_append_text ($1);
3786 unless (length $token->{data}) {
3787 $token = $self->_get_next_token;
3788 redo B;
3789 }
3790 }
3791
3792 #
3793 } elsif ($token->{type} eq 'comment') {
3794 my $comment = $self->{document}->create_comment ($token->{data});
3795 $open_elements->[-1]->[0]->append_child ($comment);
3796 $token = $self->_get_next_token;
3797 redo B;
3798 } elsif ($token->{type} eq 'start tag') {
3799 if ($token->{tag_name} eq 'body') {
3800
3801 {
3802 my $el;
3803
3804 $el = $self->{document}->create_element_ns
3805 (q<http://www.w3.org/1999/xhtml>, [undef, 'body']);
3806
3807 for my $attr_name (keys %{ $token->{attributes}}) {
3808 $el->set_attribute_ns (undef, [undef, $attr_name],
3809 $token->{attributes} ->{$attr_name}->{value});
3810 }
3811
3812 $open_elements->[-1]->[0]->append_child ($el);
3813 push @$open_elements, [$el, 'body'];
3814 }
3815
3816 $insertion_mode = 'in body';
3817 $token = $self->_get_next_token;
3818 redo B;
3819 } elsif ($token->{tag_name} eq 'frameset') {
3820
3821 {
3822 my $el;
3823
3824 $el = $self->{document}->create_element_ns
3825 (q<http://www.w3.org/1999/xhtml>, [undef, 'frameset']);
3826
3827 for my $attr_name (keys %{ $token->{attributes}}) {
3828 $el->set_attribute_ns (undef, [undef, $attr_name],
3829 $token->{attributes} ->{$attr_name}->{value});
3830 }
3831
3832 $open_elements->[-1]->[0]->append_child ($el);
3833 push @$open_elements, [$el, 'frameset'];
3834 }
3835
3836 $insertion_mode = 'in frameset';
3837 $token = $self->_get_next_token;
3838 redo B;
3839 } elsif ({
3840 base => 1, link => 1, meta => 1,
3841 script=> 1, style => 1, title => 1,
3842 }->{$token->{tag_name}}) {
3843 $self->{parse_error}->();
3844 $insertion_mode = 'in head';
3845 ## reprocess
3846 redo B;
3847 } else {
3848 #
3849 }
3850 } else {
3851 #
3852 }
3853
3854 ## As if <body>
3855
3856 {
3857 my $el;
3858
3859 $el = $self->{document}->create_element_ns
3860 (q<http://www.w3.org/1999/xhtml>, [undef, 'body']);
3861
3862 $open_elements->[-1]->[0]->append_child ($el);
3863 push @$open_elements, [$el, 'body'];
3864 }
3865
3866 $insertion_mode = 'in body';
3867 ## reprocess
3868 redo B;
3869 } elsif ($insertion_mode eq 'in body') {
3870 if ($token->{type} eq 'character') {
3871 ## NOTE: There is a code clone of "character in body".
3872 $reconstruct_active_formatting_elements->();
3873
3874 $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
3875
3876 $token = $self->_get_next_token;
3877 redo B;
3878 } elsif ($token->{type} eq 'comment') {
3879 ## NOTE: There is a code clone of "comment in body".
3880 my $comment = $self->{document}->create_comment ($token->{data});
3881 $open_elements->[-1]->[0]->append_child ($comment);
3882 $token = $self->_get_next_token;
3883 redo B;
3884 } else {
3885 $in_body->(sub {
3886 $open_elements->[-1]->[0]->append_child (shift);
3887 });
3888 redo B;
3889 }
3890 } elsif ($insertion_mode eq 'in table') {
3891 if ($token->{type} eq 'character') {
3892 $reconstruct_active_formatting_elements->();
3893
3894 $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
3895
3896 $token = $self->_get_next_token;
3897 redo B;
3898 } elsif ($token->{type} eq 'comment') {
3899 my $comment = $self->{document}->create_comment ($token->{data});
3900 $open_elements->[-1]->[0]->append_child ($comment);
3901 $token = $self->_get_next_token;
3902 redo B;
3903 } elsif ($token->{type} eq 'start tag') {
3904 if ({
3905 caption => 1,
3906 colgroup => 1,
3907 tbody => 1, tfoot => 1, thead => 1,
3908 }->{$token->{tag_name}}) {
3909 ## Clear back to table context
3910 while ($open_elements->[-1]->[1] ne 'table' and
3911 $open_elements->[-1]->[1] ne 'html') {
3912 $self->{parse_error}->();
3913 pop @$open_elements;
3914 }
3915
3916 push @$active_formatting_elements, ['#marker', '']
3917 if $token->{tag_name} eq 'caption';
3918
3919
3920 {
3921 my $el;
3922
3923 $el = $self->{document}->create_element_ns
3924 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
3925
3926 for my $attr_name (keys %{ $token->{attributes}}) {
3927 $el->set_attribute_ns (undef, [undef, $attr_name],
3928 $token->{attributes} ->{$attr_name}->{value});
3929 }
3930
3931 $open_elements->[-1]->[0]->append_child ($el);
3932 push @$open_elements, [$el, $token->{tag_name}];
3933 }
3934
3935 $insertion_mode = {
3936 caption => 'in caption',
3937 colgroup => 'in column group',
3938 tbody => 'in table body',
3939 tfoot => 'in table body',
3940 thead => 'in table body',
3941 }->{$token->{tag_name}};
3942 $token = $self->_get_next_token;
3943 redo B;
3944 } elsif ({
3945 col => 1,
3946 td => 1, th => 1, tr => 1,
3947 }->{$token->{tag_name}}) {
3948 ## Clear back to table context
3949 while ($open_elements->[-1]->[1] ne 'table' and
3950 $open_elements->[-1]->[1] ne 'html') {
3951 $self->{parse_error}->();
3952 pop @$open_elements;
3953 }
3954
3955
3956 {
3957 my $el;
3958
3959 $el = $self->{document}->create_element_ns
3960 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name} eq 'col' ? 'colgroup' : 'tbody']);
3961
3962 $open_elements->[-1]->[0]->append_child ($el);
3963 push @$open_elements, [$el, $token->{tag_name} eq 'col' ? 'colgroup' : 'tbody'];
3964 }
3965
3966 $insertion_mode = $token->{tag_name} eq 'col'
3967 ? 'in column group' : 'in table body';
3968 ## reprocess
3969 redo B;
3970 } elsif ($token->{tag_name} eq 'table') {
3971 ## NOTE: There are code clones for this "table in table"
3972 $self->{parse_error}->();
3973
3974 ## As if </table>
3975 ## have a table element in table scope
3976 my $i;
3977 INSCOPE: for (reverse 0..$#$open_elements) {
3978 my $node = $open_elements->[$_];
3979 if ($node->[1] eq 'table') {
3980 $i = $_;
3981 last INSCOPE;
3982 } elsif ({
3983 table => 1, html => 1,
3984 }->{$node->[1]}) {
3985 last INSCOPE;
3986 }
3987 } # INSCOPE
3988 unless (defined $i) {
3989 $self->{parse_error}->();
3990 ## Ignore tokens </table><table>
3991 $token = $self->_get_next_token;
3992 redo B;
3993 }
3994
3995 ## generate implied end tags
3996 if ({
3997 dd => 1, dt => 1, li => 1, p => 1,
3998 td => 1, th => 1, tr => 1,
3999 }->{$open_elements->[-1]->[1]}) {
4000 unshift @{$self->{token}}, $token; # <table>
4001 $token = {type => 'end tag', tag_name => 'table'};
4002 unshift @{$self->{token}}, $token;
4003 $token = {type => 'end tag',
4004 tag_name => $open_elements->[-1]->[1]}; # MUST
4005 redo B;
4006 }
4007
4008 if ($open_elements->[-1]->[1] ne 'table') {
4009 $self->{parse_error}->();
4010 }
4011
4012 splice @$open_elements, $i;
4013
4014 $reset_insertion_mode->();
4015
4016 ## reprocess
4017 redo B;
4018 } else {
4019 #
4020 }
4021 } elsif ($token->{type} eq 'end tag') {
4022 if ($token->{tag_name} eq 'table') {
4023 ## have a table element in table scope
4024 my $i;
4025 INSCOPE: for (reverse 0..$#$open_elements) {
4026 my $node = $open_elements->[$_];
4027 if ($node->[1] eq $token->{tag_name}) {
4028 $i = $_;
4029 last INSCOPE;
4030 } elsif ({
4031 table => 1, html => 1,
4032 }->{$node->[1]}) {
4033 last INSCOPE;
4034 }
4035 } # INSCOPE
4036 unless (defined $i) {
4037 $self->{parse_error}->();
4038 ## Ignore the token
4039 $token = $self->_get_next_token;
4040 redo B;
4041 }
4042
4043 ## generate implied end tags
4044 if ({
4045 dd => 1, dt => 1, li => 1, p => 1,
4046 td => 1, th => 1, tr => 1,
4047 }->{$open_elements->[-1]->[1]}) {
4048 unshift @{$self->{token}}, $token;
4049 $token = {type => 'end tag',
4050 tag_name => $open_elements->[-1]->[1]}; # MUST
4051 redo B;
4052 }
4053
4054 if ($open_elements->[-1]->[1] ne 'table') {
4055 $self->{parse_error}->();
4056 }
4057
4058 splice @$open_elements, $i;
4059
4060 $reset_insertion_mode->();
4061
4062 $token = $self->_get_next_token;
4063 redo B;
4064 } elsif ({
4065 body => 1, caption => 1, col => 1, colgroup => 1,
4066 html => 1, tbody => 1, td => 1, tfoot => 1, th => 1,
4067 thead => 1, tr => 1,
4068 }->{$token->{tag_name}}) {
4069 $self->{parse_error}->();
4070 ## Ignore the token
4071 $token = $self->_get_next_token;
4072 redo B;
4073 } else {
4074 #
4075 }
4076 } else {
4077 #
4078 }
4079
4080 ## NOTE: There are code clones of "misc in table".
4081 $self->{parse_error}->();
4082 $in_body->(sub {
4083 my $child = shift;
4084 if ({
4085 table => 1, tbody => 1, tfoot => 1,
4086 thead => 1, tr => 1,
4087 }->{$open_elements->[-1]->[1]}) {
4088 # MUST
4089 my $foster_parent_element;
4090 my $next_sibling;
4091 OE: for (reverse 0..$#$open_elements) {
4092 if ($open_elements->[$_]->[1] eq 'table') {
4093 my $parent = $open_elements->[$_]->[0]->parent_node;
4094 if (defined $parent and $parent->node_type == 1) {
4095 $foster_parent_element = $parent;
4096 $next_sibling = $open_elements->[$_]->[0];
4097 } else {
4098 $foster_parent_element
4099 = $open_elements->[$_ - 1]->[0];
4100 }
4101 last OE;
4102 }
4103 } # OE
4104 $foster_parent_element = $open_elements->[0]->[0]
4105 unless defined $foster_parent_element;
4106 $foster_parent_element->insert_before
4107 ($child, $next_sibling);
4108 } else {
4109 $open_elements->[-1]->[0]->append_child ($child);
4110 }
4111 });
4112 redo B;
4113 } elsif ($insertion_mode eq 'in caption') {
4114 if ($token->{type} eq 'start tag') {
4115 if ({
4116 caption => 1, col => 1, colgroup => 1, tbody => 1,
4117 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
4118 }->{$token->{tag_name}}) {
4119 $self->{parse_error}->();
4120
4121 ## As if </caption>
4122 ## have a table element in table scope
4123 my $i;
4124 INSCOPE: for (reverse 0..$#$open_elements) {
4125 my $node = $open_elements->[$_];
4126 if ($node->[1] eq 'caption') {
4127 $i = $_;
4128 last INSCOPE;
4129 } elsif ({
4130 table => 1, html => 1,
4131 }->{$node->[1]}) {
4132 last INSCOPE;
4133 }
4134 } # INSCOPE
4135 unless (defined $i) {
4136 $self->{parse_error}->();
4137 ## Ignore the token
4138 $token = $self->_get_next_token;
4139 redo B;
4140 }
4141
4142 ## generate implied end tags
4143 if ({
4144 dd => 1, dt => 1, li => 1, p => 1,
4145 td => 1, th => 1, tr => 1,
4146 }->{$open_elements->[-1]->[1]}) {
4147 unshift @{$self->{token}}, $token; # <?>
4148 $token = {type => 'end tag', tag_name => 'caption'};
4149 unshift @{$self->{token}}, $token;
4150 $token = {type => 'end tag',
4151 tag_name => $open_elements->[-1]->[1]}; # MUST
4152 redo B;
4153 }
4154
4155 if ($open_elements->[-1]->[1] ne 'caption') {
4156 $self->{parse_error}->();
4157 }
4158
4159 splice @$open_elements, $i;
4160
4161 $clear_up_to_marker->();
4162
4163 $insertion_mode = 'in table';
4164
4165 ## reprocess
4166 redo B;
4167 } else {
4168 #
4169 }
4170 } elsif ($token->{type} eq 'end tag') {
4171 if ($token->{tag_name} eq 'caption') {
4172 ## have a table element in table scope
4173 my $i;
4174 INSCOPE: for (reverse 0..$#$open_elements) {
4175 my $node = $open_elements->[$_];
4176 if ($node->[1] eq $token->{tag_name}) {
4177 $i = $_;
4178 last INSCOPE;
4179 } elsif ({
4180 table => 1, html => 1,
4181 }->{$node->[1]}) {
4182 last INSCOPE;
4183 }
4184 } # INSCOPE
4185 unless (defined $i) {
4186 $self->{parse_error}->();
4187 ## Ignore the token
4188 $token = $self->_get_next_token;
4189 redo B;
4190 }
4191
4192 ## generate implied end tags
4193 if ({
4194 dd => 1, dt => 1, li => 1, p => 1,
4195 td => 1, th => 1, tr => 1,
4196 }->{$open_elements->[-1]->[1]}) {
4197 unshift @{$self->{token}}, $token;
4198 $token = {type => 'end tag',
4199 tag_name => $open_elements->[-1]->[1]}; # MUST
4200 redo B;
4201 }
4202
4203 if ($open_elements->[-1]->[1] ne 'caption') {
4204 $self->{parse_error}->();
4205 }
4206
4207 splice @$open_elements, $i;
4208
4209 $clear_up_to_marker->();
4210
4211 $insertion_mode = 'in table';
4212
4213 $token = $self->_get_next_token;
4214 redo B;
4215 } elsif ($token->{tag_name} eq 'table') {
4216 $self->{parse_error}->();
4217
4218 ## As if </caption>
4219 ## have a table element in table scope
4220 my $i;
4221 INSCOPE: for (reverse 0..$#$open_elements) {
4222 my $node = $open_elements->[$_];
4223 if ($node->[1] eq 'caption') {
4224 $i = $_;
4225 last INSCOPE;
4226 } elsif ({
4227 table => 1, html => 1,
4228 }->{$node->[1]}) {
4229 last INSCOPE;
4230 }
4231 } # INSCOPE
4232 unless (defined $i) {
4233 $self->{parse_error}->();
4234 ## Ignore the token
4235 $token = $self->_get_next_token;
4236 redo B;
4237 }
4238
4239 ## generate implied end tags
4240 if ({
4241 dd => 1, dt => 1, li => 1, p => 1,
4242 td => 1, th => 1, tr => 1,
4243 }->{$open_elements->[-1]->[1]}) {
4244 unshift @{$self->{token}}, $token; # </table>
4245 $token = {type => 'end tag', tag_name => 'caption'};
4246 unshift @{$self->{token}}, $token;
4247 $token = {type => 'end tag',
4248 tag_name => $open_elements->[-1]->[1]}; # MUST
4249 redo B;
4250 }
4251
4252 if ($open_elements->[-1]->[1] ne 'caption') {
4253 $self->{parse_error}->();
4254 }
4255
4256 splice @$open_elements, $i;
4257
4258 $clear_up_to_marker->();
4259
4260 $insertion_mode = 'in table';
4261
4262 ## reprocess
4263 redo B;
} elsif ({
    body => 1, col => 1, colgroup => 1,
    html => 1, tbody => 1, td => 1, tfoot => 1,
    th => 1, thead => 1, tr => 1,
  }->{$token->{tag_name}}) {
  ## These end tags are parse errors in the "in caption" insertion
  ## mode and are dropped.
  $self->{parse_error}->();
  ## Ignore the token.  Advance to the next token first; restarting
  ## the dispatch block with the same token would select this branch
  ## again and loop forever.
  $token = $self->_get_next_token;
  redo B;
4272 } else {
4273 #
4274 }
4275 } else {
4276 #
4277 }
4278
4279 $in_body->(sub {
4280 $open_elements->[-1]->[0]->append_child (shift);
4281 });
4282 redo B;
4283 } elsif ($insertion_mode eq 'in column group') {
4284 if ($token->{type} eq 'character') {
4285 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4286 $open_elements->[-1]->[0]->manakai_append_text ($1);
4287 unless (length $token->{data}) {
4288 $token = $self->_get_next_token;
4289 redo B;
4290 }
4291 }
4292
4293 #
4294 } elsif ($token->{type} eq 'comment') {
4295 my $comment = $self->{document}->create_comment ($token->{data});
4296 $open_elements->[-1]->[0]->append_child ($comment);
4297 $token = $self->_get_next_token;
4298 redo B;
4299 } elsif ($token->{type} eq 'start tag') {
4300 if ($token->{tag_name} eq 'col') {
4301
4302 {
4303 my $el;
4304
4305 $el = $self->{document}->create_element_ns
4306 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
4307
4308 for my $attr_name (keys %{ $token->{attributes}}) {
4309 $el->set_attribute_ns (undef, [undef, $attr_name],
4310 $token->{attributes} ->{$attr_name}->{value});
4311 }
4312
4313 $open_elements->[-1]->[0]->append_child ($el);
4314 push @$open_elements, [$el, $token->{tag_name}];
4315 }
4316
4317 pop @$open_elements;
4318 $token = $self->_get_next_token;
4319 redo B;
4320 } else {
4321 #
4322 }
4323 } elsif ($token->{type} eq 'end tag') {
4324 if ($token->{tag_name} eq 'colgroup') {
4325 if ($open_elements->[-1]->[1] eq 'html') {
4326 $self->{parse_error}->();
4327 ## Ignore the token
4328 $token = $self->_get_next_token;
4329 redo B;
4330 } else {
4331 pop @$open_elements; # colgroup
4332 $insertion_mode = 'in table';
4333 $token = $self->_get_next_token;
4334 redo B;
4335 }
4336 } elsif ($token->{tag_name} eq 'col') {
4337 $self->{parse_error}->();
4338 ## Ignore the token
4339 $token = $self->_get_next_token;
4340 redo B;
4341 } else {
4342 #
4343 }
4344 } else {
4345 #
4346 }
4347
4348 ## As if </colgroup>
4349 if ($open_elements->[-1]->[1] eq 'html') {
4350 $self->{parse_error}->();
4351 ## Ignore the token
4352 $token = $self->_get_next_token;
4353 redo B;
4354 } else {
4355 pop @$open_elements; # colgroup
4356 $insertion_mode = 'in table';
4357 ## reprocess
4358 redo B;
4359 }
4360 } elsif ($insertion_mode eq 'in table body') {
4361 if ($token->{type} eq 'character') {
4362 ## Copied from 'in table'
4363 $reconstruct_active_formatting_elements->();
4364
4365 $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4366
4367 $token = $self->_get_next_token;
4368 redo B;
4369 } elsif ($token->{type} eq 'comment') {
4370 ## Copied from 'in table'
4371 my $comment = $self->{document}->create_comment ($token->{data});
4372 $open_elements->[-1]->[0]->append_child ($comment);
4373 $token = $self->_get_next_token;
4374 redo B;
4375 } elsif ($token->{type} eq 'start tag') {
4376 if ({
4377 tr => 1,
4378 th => 1, td => 1,
4379 }->{$token->{tag_name}}) {
4380 ## Clear back to table body context
4381 while (not {
4382 tbody => 1, tfoot => 1, thead => 1, html => 1,
4383 }->{$open_elements->[-1]->[1]}) {
4384 $self->{parse_error}->();
4385 pop @$open_elements;
4386 }
4387
4388 $insertion_mode = 'in row';
4389 if ($token->{tag_name} eq 'tr') {
4390
4391 {
4392 my $el;
4393
4394 $el = $self->{document}->create_element_ns
4395 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
4396
4397 for my $attr_name (keys %{ $token->{attributes}}) {
4398 $el->set_attribute_ns (undef, [undef, $attr_name],
4399 $token->{attributes} ->{$attr_name}->{value});
4400 }
4401
4402 $open_elements->[-1]->[0]->append_child ($el);
4403 push @$open_elements, [$el, $token->{tag_name}];
4404 }
4405
4406 $token = $self->_get_next_token;
4407 } else {
4408
4409 {
4410 my $el;
4411
4412 $el = $self->{document}->create_element_ns
4413 (q<http://www.w3.org/1999/xhtml>, [undef, 'tr']);
4414
4415 $open_elements->[-1]->[0]->append_child ($el);
4416 push @$open_elements, [$el, 'tr'];
4417 }
4418
4419 ## reprocess
4420 }
4421 redo B;
4422 } elsif ({
4423 caption => 1, col => 1, colgroup => 1,
4424 tbody => 1, tfoot => 1, thead => 1,
4425 }->{$token->{tag_name}}) {
4426 ## have an element in table scope
4427 my $i;
4428 INSCOPE: for (reverse 0..$#$open_elements) {
4429 my $node = $open_elements->[$_];
4430 if ({
4431 tbody => 1, thead => 1, tfoot => 1,
4432 }->{$node->[1]}) {
4433 $i = $_;
4434 last INSCOPE;
4435 } elsif ({
4436 table => 1, html => 1,
4437 }->{$node->[1]}) {
4438 last INSCOPE;
4439 }
4440 } # INSCOPE
4441 unless (defined $i) {
4442 $self->{parse_error}->();
4443 ## Ignore the token
4444 $token = $self->_get_next_token;
4445 redo B;
4446 }
4447
4448 ## Clear back to table body context
4449 while (not {
4450 tbody => 1, tfoot => 1, thead => 1, html => 1,
4451 }->{$open_elements->[-1]->[1]}) {
4452 $self->{parse_error}->();
4453 pop @$open_elements;
4454 }
4455
4456 ## As if <{current node}>
4457 ## have an element in table scope
4458 ## true by definition
4459
4460 ## Clear back to table body context
4461 ## nop by definition
4462
4463 pop @$open_elements;
4464 $insertion_mode = 'in table';
4465 ## reprocess
4466 redo B;
4467 } elsif ($token->{tag_name} eq 'table') {
4468 ## NOTE: This is a code clone of "table in table"
4469 $self->{parse_error}->();
4470
4471 ## As if </table>
4472 ## have a table element in table scope
4473 my $i;
4474 INSCOPE: for (reverse 0..$#$open_elements) {
4475 my $node = $open_elements->[$_];
4476 if ($node->[1] eq 'table') {
4477 $i = $_;
4478 last INSCOPE;
4479 } elsif ({
4480 table => 1, html => 1,
4481 }->{$node->[1]}) {
4482 last INSCOPE;
4483 }
4484 } # INSCOPE
4485 unless (defined $i) {
4486 $self->{parse_error}->();
4487 ## Ignore tokens </table><table>
4488 $token = $self->_get_next_token;
4489 redo B;
4490 }
4491
4492 ## generate implied end tags
4493 if ({
4494 dd => 1, dt => 1, li => 1, p => 1,
4495 td => 1, th => 1, tr => 1,
4496 }->{$open_elements->[-1]->[1]}) {
4497 unshift @{$self->{token}}, $token; # <table>
4498 $token = {type => 'end tag', tag_name => 'table'};
4499 unshift @{$self->{token}}, $token;
4500 $token = {type => 'end tag',
4501 tag_name => $open_elements->[-1]->[1]}; # MUST
4502 redo B;
4503 }
4504
4505 if ($open_elements->[-1]->[1] ne 'table') {
4506 $self->{parse_error}->();
4507 }
4508
4509 splice @$open_elements, $i;
4510
4511 $reset_insertion_mode->();
4512
4513 ## reprocess
4514 redo B;
4515 } else {
4516 #
4517 }
4518 } elsif ($token->{type} eq 'end tag') {
4519 if ({
4520 tbody => 1, tfoot => 1, thead => 1,
4521 }->{$token->{tag_name}}) {
4522 ## have an element in table scope
4523 my $i;
4524 INSCOPE: for (reverse 0..$#$open_elements) {
4525 my $node = $open_elements->[$_];
4526 if ($node->[1] eq $token->{tag_name}) {
4527 $i = $_;
4528 last INSCOPE;
4529 } elsif ({
4530 table => 1, html => 1,
4531 }->{$node->[1]}) {
4532 last INSCOPE;
4533 }
4534 } # INSCOPE
4535 unless (defined $i) {
4536 $self->{parse_error}->();
4537 ## Ignore the token
4538 $token = $self->_get_next_token;
4539 redo B;
4540 }
4541
4542 ## Clear back to table body context
4543 while (not {
4544 tbody => 1, tfoot => 1, thead => 1, html => 1,
4545 }->{$open_elements->[-1]->[1]}) {
4546 $self->{parse_error}->();
4547 pop @$open_elements;
4548 }
4549
4550 pop @$open_elements;
4551 $insertion_mode = 'in table';
4552 $token = $self->_get_next_token;
4553 redo B;
4554 } elsif ($token->{tag_name} eq 'table') {
4555 ## have an element in table scope
4556 my $i;
4557 INSCOPE: for (reverse 0..$#$open_elements) {
4558 my $node = $open_elements->[$_];
4559 if ({
4560 tbody => 1, thead => 1, tfoot => 1,
4561 }->{$node->[1]}) {
4562 $i = $_;
4563 last INSCOPE;
4564 } elsif ({
4565 table => 1, html => 1,
4566 }->{$node->[1]}) {
4567 last INSCOPE;
4568 }
4569 } # INSCOPE
4570 unless (defined $i) {
4571 $self->{parse_error}->();
4572 ## Ignore the token
4573 $token = $self->_get_next_token;
4574 redo B;
4575 }
4576
4577 ## Clear back to table body context
4578 while (not {
4579 tbody => 1, tfoot => 1, thead => 1, html => 1,
4580 }->{$open_elements->[-1]->[1]}) {
4581 $self->{parse_error}->();
4582 pop @$open_elements;
4583 }
4584
4585 ## As if <{current node}>
4586 ## have an element in table scope
4587 ## true by definition
4588
4589 ## Clear back to table body context
4590 ## nop by definition
4591
4592 pop @$open_elements;
4593 $insertion_mode = 'in table';
4594 ## reprocess
4595 redo B;
4596 } elsif ({
4597 body => 1, caption => 1, col => 1, colgroup => 1,
4598 html => 1, td => 1, th => 1, tr => 1,
4599 }->{$token->{tag_name}}) {
4600 $self->{parse_error}->();
4601 ## Ignore the token
4602 $token = $self->_get_next_token;
4603 redo B;
4604 } else {
4605 #
4606 }
4607 } else {
4608 #
4609 }
4610
4611 ## As if in table
4612 ## NOTE: This is a code clone of "misc in table".
4613 $self->{parse_error}->();
4614 $in_body->(sub {
4615 my $child = shift;
4616 if ({
4617 table => 1, tbody => 1, tfoot => 1,
4618 thead => 1, tr => 1,
4619 }->{$open_elements->[-1]->[1]}) {
4620 # MUST
4621 my $foster_parent_element;
4622 my $next_sibling;
4623 OE: for (reverse 0..$#$open_elements) {
4624 if ($open_elements->[$_]->[1] eq 'table') {
4625 my $parent = $open_elements->[$_]->[0]->parent_node;
4626 if (defined $parent and $parent->node_type == 1) {
4627 $foster_parent_element = $parent;
4628 $next_sibling = $open_elements->[$_]->[0];
4629 } else {
4630 $foster_parent_element
4631 = $open_elements->[$_ - 1]->[0];
4632 }
4633 last OE;
4634 }
4635 } # OE
4636 $foster_parent_element = $open_elements->[0]->[0]
4637 unless defined $foster_parent_element;
4638 $foster_parent_element->insert_before
4639 ($child, $next_sibling);
4640 } else {
4641 $open_elements->[-1]->[0]->append_child ($child);
4642 }
4643 });
4644 redo B;
4645 } elsif ($insertion_mode eq 'in row') {
4646 if ($token->{type} eq 'character') {
4647 ## Copied from 'in table'
4648 $reconstruct_active_formatting_elements->();
4649
4650 $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4651
4652 $token = $self->_get_next_token;
4653 redo B;
4654 } elsif ($token->{type} eq 'comment') {
4655 ## Copied from 'in table'
4656 my $comment = $self->{document}->create_comment ($token->{data});
4657 $open_elements->[-1]->[0]->append_child ($comment);
4658 $token = $self->_get_next_token;
4659 redo B;
4660 } elsif ($token->{type} eq 'start tag') {
4661 if ($token->{tag_name} eq 'th' or
4662 $token->{tag_name} eq 'td') {
4663 ## Clear back to table row context
4664 while (not {
4665 th => 1, td => 1, html => 1,
4666 }->{$open_elements->[-1]->[1]}) {
4667 $self->{parse_error}->();
4668 pop @$open_elements;
4669 }
4670
4671
4672 {
4673 my $el;
4674
4675 $el = $self->{document}->create_element_ns
4676 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
4677
4678 for my $attr_name (keys %{ $token->{attributes}}) {
4679 $el->set_attribute_ns (undef, [undef, $attr_name],
4680 $token->{attributes} ->{$attr_name}->{value});
4681 }
4682
4683 $open_elements->[-1]->[0]->append_child ($el);
4684 push @$open_elements, [$el, $token->{tag_name}];
4685 }
4686
4687 $insertion_mode = 'in cell';
4688
4689 push @$active_formatting_elements, ['#marker', ''];
4690
4691 $token = $self->_get_next_token;
4692 redo B;
4693 } elsif ({
4694 caption => 1, col => 1, colgroup => 1,
4695 tbody => 1, tfoot => 1, thead => 1, tr => 1,
4696 }->{$token->{tag_name}}) {
4697 ## As if </tr>
4698 ## have an element in table scope
4699 my $i;
4700 INSCOPE: for (reverse 0..$#$open_elements) {
4701 my $node = $open_elements->[$_];
4702 if ($node->[1] eq 'tr') {
4703 $i = $_;
4704 last INSCOPE;
4705 } elsif ({
4706 table => 1, html => 1,
4707 }->{$node->[1]}) {
4708 last INSCOPE;
4709 }
4710 } # INSCOPE
4711 unless (defined $i) {
4712 $self->{parse_error}->();
4713 ## Ignore the token
4714 $token = $self->_get_next_token;
4715 redo B;
4716 }
4717
4718 ## Clear back to table row context
4719 while (not {
4720 tr => 1, html => 1,
4721 }->{$open_elements->[-1]->[1]}) {
4722 $self->{parse_error}->();
4723 pop @$open_elements;
4724 }
4725
4726 pop @$open_elements; # tr
4727 $insertion_mode = 'in table body';
4728 ## reprocess
4729 redo B;
4730 } elsif ($token->{tag_name} eq 'table') {
4731 ## NOTE: This is a code clone of "table in table"
4732 $self->{parse_error}->();
4733
4734 ## As if </table>
4735 ## have a table element in table scope
4736 my $i;
4737 INSCOPE: for (reverse 0..$#$open_elements) {
4738 my $node = $open_elements->[$_];
4739 if ($node->[1] eq 'table') {
4740 $i = $_;
4741 last INSCOPE;
4742 } elsif ({
4743 table => 1, html => 1,
4744 }->{$node->[1]}) {
4745 last INSCOPE;
4746 }
4747 } # INSCOPE
4748 unless (defined $i) {
4749 $self->{parse_error}->();
4750 ## Ignore tokens </table><table>
4751 $token = $self->_get_next_token;
4752 redo B;
4753 }
4754
4755 ## generate implied end tags
4756 if ({
4757 dd => 1, dt => 1, li => 1, p => 1,
4758 td => 1, th => 1, tr => 1,
4759 }->{$open_elements->[-1]->[1]}) {
4760 unshift @{$self->{token}}, $token; # <table>
4761 $token = {type => 'end tag', tag_name => 'table'};
4762 unshift @{$self->{token}}, $token;
4763 $token = {type => 'end tag',
4764 tag_name => $open_elements->[-1]->[1]}; # MUST
4765 redo B;
4766 }
4767
4768 if ($open_elements->[-1]->[1] ne 'table') {
4769 $self->{parse_error}->();
4770 }
4771
4772 splice @$open_elements, $i;
4773
4774 $reset_insertion_mode->();
4775
4776 ## reprocess
4777 redo B;
4778 } else {
4779 #
4780 }
4781 } elsif ($token->{type} eq 'end tag') {
4782 if ($token->{tag_name} eq 'tr') {
4783 ## have an element in table scope
4784 my $i;
4785 INSCOPE: for (reverse 0..$#$open_elements) {
4786 my $node = $open_elements->[$_];
4787 if ($node->[1] eq $token->{tag_name}) {
4788 $i = $_;
4789 last INSCOPE;
4790 } elsif ({
4791 table => 1, html => 1,
4792 }->{$node->[1]}) {
4793 last INSCOPE;
4794 }
4795 } # INSCOPE
4796 unless (defined $i) {
4797 $self->{parse_error}->();
4798 ## Ignore the token
4799 $token = $self->_get_next_token;
4800 redo B;
4801 }
4802
4803 ## Clear back to table row context
4804 while (not {
4805 tr => 1, html => 1,
4806 }->{$open_elements->[-1]->[1]}) {
4807 $self->{parse_error}->();
4808 pop @$open_elements;
4809 }
4810
4811 pop @$open_elements; # tr
4812 $insertion_mode = 'in table body';
4813 $token = $self->_get_next_token;
4814 redo B;
4815 } elsif ($token->{tag_name} eq 'table') {
4816 ## As if </tr>
4817 ## have an element in table scope
4818 my $i;
4819 INSCOPE: for (reverse 0..$#$open_elements) {
4820 my $node = $open_elements->[$_];
4821 if ($node->[1] eq 'tr') {
4822 $i = $_;
4823 last INSCOPE;
4824 } elsif ({
4825 table => 1, html => 1,
4826 }->{$node->[1]}) {
4827 last INSCOPE;
4828 }
4829 } # INSCOPE
4830 unless (defined $i) {
4831 $self->{parse_error}->();
4832 ## Ignore the token
4833 $token = $self->_get_next_token;
4834 redo B;
4835 }
4836
4837 ## Clear back to table row context
4838 while (not {
4839 tr => 1, html => 1,
4840 }->{$open_elements->[-1]->[1]}) {
4841 $self->{parse_error}->();
4842 pop @$open_elements;
4843 }
4844
4845 pop @$open_elements; # tr
4846 $insertion_mode = 'in table body';
4847 ## reprocess
4848 redo B;
4849 } elsif ({
4850 tbody => 1, tfoot => 1, thead => 1,
4851 }->{$token->{tag_name}}) {
4852 ## have an element in table scope
4853 my $i;
4854 INSCOPE: for (reverse 0..$#$open_elements) {
4855 my $node = $open_elements->[$_];
4856 if ($node->[1] eq $token->{tag_name}) {
4857 $i = $_;
4858 last INSCOPE;
4859 } elsif ({
4860 table => 1, html => 1,
4861 }->{$node->[1]}) {
4862 last INSCOPE;
4863 }
4864 } # INSCOPE
4865 unless (defined $i) {
4866 $self->{parse_error}->();
4867 ## Ignore the token
4868 $token = $self->_get_next_token;
4869 redo B;
4870 }
4871
4872 ## As if </tr>
4873 ## have an element in table scope
4874 my $i;
4875 INSCOPE: for (reverse 0..$#$open_elements) {
4876 my $node = $open_elements->[$_];
4877 if ($node->[1] eq 'tr') {
4878 $i = $_;
4879 last INSCOPE;
4880 } elsif ({
4881 table => 1, html => 1,
4882 }->{$node->[1]}) {
4883 last INSCOPE;
4884 }
4885 } # INSCOPE
4886 unless (defined $i) {
4887 $self->{parse_error}->();
4888 ## Ignore the token
4889 $token = $self->_get_next_token;
4890 redo B;
4891 }
4892
4893 ## Clear back to table row context
4894 while (not {
4895 tr => 1, html => 1,
4896 }->{$open_elements->[-1]->[1]}) {
4897 $self->{parse_error}->();
4898 pop @$open_elements;
4899 }
4900
4901 pop @$open_elements; # tr
4902 $insertion_mode = 'in table body';
4903 ## reprocess
4904 redo B;
4905 } elsif ({
4906 body => 1, caption => 1, col => 1,
4907 colgroup => 1, html => 1, td => 1, th => 1,
4908 }->{$token->{tag_name}}) {
4909 $self->{parse_error}->();
4910 ## Ignore the token
4911 $token = $self->_get_next_token;
4912 redo B;
4913 } else {
4914 #
4915 }
4916 } else {
4917 #
4918 }
4919
4920 ## As if in table
4921 ## NOTE: This is a code clone of "misc in table".
4922 $self->{parse_error}->();
4923 $in_body->(sub {
4924 my $child = shift;
4925 if ({
4926 table => 1, tbody => 1, tfoot => 1,
4927 thead => 1, tr => 1,
4928 }->{$open_elements->[-1]->[1]}) {
4929 # MUST
4930 my $foster_parent_element;
4931 my $next_sibling;
4932 OE: for (reverse 0..$#$open_elements) {
4933 if ($open_elements->[$_]->[1] eq 'table') {
4934 my $parent = $open_elements->[$_]->[0]->parent_node;
4935 if (defined $parent and $parent->node_type == 1) {
4936 $foster_parent_element = $parent;
4937 $next_sibling = $open_elements->[$_]->[0];
4938 } else {
4939 $foster_parent_element
4940 = $open_elements->[$_ - 1]->[0];
4941 }
4942 last OE;
4943 }
4944 } # OE
4945 $foster_parent_element = $open_elements->[0]->[0]
4946 unless defined $foster_parent_element;
4947 $foster_parent_element->insert_before
4948 ($child, $next_sibling);
4949 } else {
4950 $open_elements->[-1]->[0]->append_child ($child);
4951 }
4952 });
4953 redo B;
4954 } elsif ($insertion_mode eq 'in cell') {
4955 if ($token->{type} eq 'character') {
4956 ## NOTE: This is a code clone of "character in body".
4957 $reconstruct_active_formatting_elements->();
4958
4959 $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
4960
4961 $token = $self->_get_next_token;
4962 redo B;
4963 } elsif ($token->{type} eq 'comment') {
4964 ## NOTE: This is a code clone of "comment in body".
4965 my $comment = $self->{document}->create_comment ($token->{data});
4966 $open_elements->[-1]->[0]->append_child ($comment);
4967 $token = $self->_get_next_token;
4968 redo B;
4969 } elsif ($token->{type} eq 'start tag') {
4970 if ({
4971 caption => 1, col => 1, colgroup => 1,
4972 tbody => 1, td => 1, tfoot => 1, th => 1,
4973 thead => 1, tr => 1,
4974 }->{$token->{tag_name}}) {
4975 ## have an element in table scope
4976 my $tn;
4977 INSCOPE: for (reverse 0..$#$open_elements) {
4978 my $node = $open_elements->[$_];
4979 if ($node->[1] eq 'td' or $node->[1] eq 'th') {
4980 $tn = $node->[1];
4981 last INSCOPE;
4982 } elsif ({
4983 table => 1, html => 1,
4984 }->{$node->[1]}) {
4985 last INSCOPE;
4986 }
4987 } # INSCOPE
4988 unless (defined $tn) {
4989 $self->{parse_error}->();
4990 ## Ignore the token
4991 $token = $self->_get_next_token;
4992 redo B;
4993 }
4994
4995 ## Close the cell
4996 unshift @{$self->{token}}, $token; # <?>
4997 $token = {type => 'end tag', tag_name => $tn};
4998 redo B;
4999 } else {
5000 #
5001 }
5002 } elsif ($token->{type} eq 'end tag') {
5003 if ($token->{type} eq 'td' or $token->{type} eq 'th') {
5004 ## have an element in table scope
5005 my $i;
5006 INSCOPE: for (reverse 0..$#$open_elements) {
5007 my $node = $open_elements->[$_];
5008 if ($node->[1] eq $token->{tag_name}) {
5009 $i = $_;
5010 last INSCOPE;
5011 } elsif ({
5012 table => 1, html => 1,
5013 }->{$node->[1]}) {
5014 last INSCOPE;
5015 }
5016 } # INSCOPE
5017 unless (defined $i) {
5018 $self->{parse_error}->();
5019 ## Ignore the token
5020 $token = $self->_get_next_token;
5021 redo B;
5022 }
5023
5024 ## generate implied end tags
5025 if ({
5026 dd => 1, dt => 1, li => 1, p => 1,
5027 td => ($token->{tag_name} eq 'th'),
5028 th => ($token->{tag_name} eq 'td'),
5029 tr => 1,
5030 }->{$open_elements->[-1]->[1]}) {
5031 unshift @{$self->{token}}, $token;
5032 $token = {type => 'end tag',
5033 tag_name => $open_elements->[-1]->[1]}; # MUST
5034 redo B;
5035 }
5036
5037 if ($open_elements->[-1]->[1] ne $token->{tag_name}) {
5038 $self->{parse_error}->();
5039 }
5040
5041 splice @$open_elements, $i;
5042
5043 $clear_up_to_marker->();
5044
5045 $insertion_mode = 'in row';
5046
5047 $token = $self->_get_next_token;
5048 redo B;
5049 } elsif ({
5050 body => 1, caption => 1, col => 1,
5051 colgroup => 1, html => 1,
5052 }->{$token->{tag_name}}) {
5053 $self->{parse_error}->();
5054 ## Ignore the token
5055 $token = $self->_get_next_token;
5056 redo B;
5057 } elsif ({
5058 table => 1, tbody => 1, tfoot => 1,
5059 thead => 1, tr => 1,
5060 }->{$token->{tag_name}}) {
5061 ## have an element in table scope
5062 my $i;
5063 my $tn;
5064 INSCOPE: for (reverse 0..$#$open_elements) {
5065 my $node = $open_elements->[$_];
5066 if ($node->[1] eq $token->{tag_name}) {
5067 $i = $_;
5068 $tn = $node->[1];
5069 last INSCOPE;
5070 } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
5071 $tn = $node->[1];
5072 ## NOTE: There is exactly one |td| or |th| element
5073 ## in scope in the stack of open elements by definition.
5074 } elsif ({
5075 table => 1, html => 1,
5076 }->{$node->[1]}) {
5077 last INSCOPE;
5078 }
5079 } # INSCOPE
5080 unless (defined $i) {
5081 $self->{parse_error}->();
5082 ## Ignore the token
5083 $token = $self->_get_next_token;
5084 redo B;
5085 }
5086
5087 ## Close the cell
5088 unshift @{$self->{token}}, $token; # </?>
5089 $token = {type => 'end tag', tag_name => $tn};
5090 redo B;
5091 } else {
5092 #
5093 }
5094 } else {
5095 #
5096 }
5097
5098 $in_body->(sub {
5099 $open_elements->[-1]->[0]->append_child (shift);
5100 });
5101 redo B;
5102 } elsif ($insertion_mode eq 'in select') {
5103 if ($token->{type} eq 'character') {
5104 $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
5105 $token = $self->_get_next_token;
5106 redo B;
5107 } elsif ($token->{type} eq 'comment') {
5108 my $comment = $self->{document}->create_comment ($token->{data});
5109 $open_elements->[-1]->[0]->append_child ($comment);
5110 $token = $self->_get_next_token;
5111 redo B;
5112 } elsif ($token->{type} eq 'start tag') {
5113 if ($token->{tag_name} eq 'option') {
5114 if ($open_elements->[-1]->[1] eq 'option') {
5115 ## As if </option>
5116 pop @$open_elements;
5117 }
5118
5119
5120 {
5121 my $el;
5122
5123 $el = $self->{document}->create_element_ns
5124 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
5125
5126 for my $attr_name (keys %{ $token->{attributes}}) {
5127 $el->set_attribute_ns (undef, [undef, $attr_name],
5128 $token->{attributes} ->{$attr_name}->{value});
5129 }
5130
5131 $open_elements->[-1]->[0]->append_child ($el);
5132 push @$open_elements, [$el, $token->{tag_name}];
5133 }
5134
5135 $token = $self->_get_next_token;
5136 redo B;
5137 } elsif ($token->{tag_name} eq 'optgroup') {
5138 if ($open_elements->[-1]->[1] eq 'option') {
5139 ## As if </option>
5140 pop @$open_elements;
5141 }
5142
5143 if ($open_elements->[-1]->[1] eq 'optgroup') {
5144 ## As if </optgroup>
5145 pop @$open_elements;
5146 }
5147
5148
5149 {
5150 my $el;
5151
5152 $el = $self->{document}->create_element_ns
5153 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
5154
5155 for my $attr_name (keys %{ $token->{attributes}}) {
5156 $el->set_attribute_ns (undef, [undef, $attr_name],
5157 $token->{attributes} ->{$attr_name}->{value});
5158 }
5159
5160 $open_elements->[-1]->[0]->append_child ($el);
5161 push @$open_elements, [$el, $token->{tag_name}];
5162 }
5163
5164 $token = $self->_get_next_token;
5165 redo B;
5166 } elsif ($token->{tag_name} eq 'select') {
5167 $self->{parse_error}->();
5168 ## As if </select> instead
5169 ## have an element in table scope
5170 my $i;
5171 INSCOPE: for (reverse 0..$#$open_elements) {
5172 my $node = $open_elements->[$_];
5173 if ($node->[1] eq $token->{tag_name}) {
5174 $i = $_;
5175 last INSCOPE;
5176 } elsif ({
5177 table => 1, html => 1,
5178 }->{$node->[1]}) {
5179 last INSCOPE;
5180 }
5181 } # INSCOPE
5182 unless (defined $i) {
5183 $self->{parse_error}->();
5184 ## Ignore the token
5185 $token = $self->_get_next_token;
5186 redo B;
5187 }
5188
5189 splice @$open_elements, $i;
5190
5191 $reset_insertion_mode->();
5192
5193 $token = $self->_get_next_token;
5194 redo B;
5195 } else {
5196 #
5197 }
5198 } elsif ($token->{type} eq 'end tag') {
5199 if ($token->{tag_name} eq 'optgroup') {
5200 if ($open_elements->[-1]->[1] eq 'option' and
5201 $open_elements->[-2]->[1] eq 'optgroup') {
5202 ## As if </option>
5203 splice @$open_elements, -2;
5204 } elsif ($open_elements->[-1]->[1] eq 'optgroup') {
5205 pop @$open_elements;
5206 } else {
5207 $self->{parse_error}->();
5208 ## Ignore the token
5209 }
5210 $token = $self->_get_next_token;
5211 redo B;
5212 } elsif ($token->{tag_name} eq 'option') {
5213 if ($open_elements->[-1]->[1] eq 'option') {
5214 pop @$open_elements;
5215 } else {
5216 $self->{parse_error}->();
5217 ## Ignore the token
5218 }
5219 $token = $self->_get_next_token;
5220 redo B;
5221 } elsif ($token->{tag_name} eq 'select') {
5222 ## have an element in table scope
5223 my $i;
5224 INSCOPE: for (reverse 0..$#$open_elements) {
5225 my $node = $open_elements->[$_];
5226 if ($node->[1] eq $token->{tag_name}) {
5227 $i = $_;
5228 last INSCOPE;
5229 } elsif ({
5230 table => 1, html => 1,
5231 }->{$node->[1]}) {
5232 last INSCOPE;
5233 }
5234 } # INSCOPE
5235 unless (defined $i) {
5236 $self->{parse_error}->();
5237 ## Ignore the token
5238 $token = $self->_get_next_token;
5239 redo B;
5240 }
5241
5242 splice @$open_elements, $i;
5243
5244 $reset_insertion_mode->();
5245
5246 $token = $self->_get_next_token;
5247 redo B;
5248 } elsif ({
5249 caption => 1, table => 1, tbody => 1,
5250 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
5251 }->{$token->{tag_name}}) {
5252 $self->{parse_error}->();
5253
5254 ## have an element in table scope
5255 my $i;
5256 INSCOPE: for (reverse 0..$#$open_elements) {
5257 my $node = $open_elements->[$_];
5258 if ($node->[1] eq $token->{tag_name}) {
5259 $i = $_;
5260 last INSCOPE;
5261 } elsif ({
5262 table => 1, html => 1,
5263 }->{$node->[1]}) {
5264 last INSCOPE;
5265 }
5266 } # INSCOPE
5267 unless (defined $i) {
5268 ## Ignore the token
5269 $token = $self->_get_next_token;
5270 redo B;
5271 }
5272
5273 ## As if </select>
5274 ## have an element in table scope
5275 undef $i;
5276 INSCOPE: for (reverse 0..$#$open_elements) {
5277 my $node = $open_elements->[$_];
5278 if ($node->[1] eq 'select') {
5279 $i = $_;
5280 last INSCOPE;
5281 } elsif ({
5282 table => 1, html => 1,
5283 }->{$node->[1]}) {
5284 last INSCOPE;
5285 }
5286 } # INSCOPE
5287 unless (defined $i) {
5288 $self->{parse_error}->();
5289 ## Ignore the </select> token
5290 $token = $self->_get_next_token; ## TODO: ok?
5291 redo B;
5292 }
5293
5294 splice @$open_elements, $i;
5295
5296 $reset_insertion_mode->();
5297
5298 ## reprocess
5299 redo B;
5300 } else {
5301 #
5302 }
5303 } else {
5304 #
5305 }
5306
5307 $self->{parse_error}->();
5308 ## Ignore the token
5309 redo B;
5310 } elsif ($insertion_mode eq 'after body') {
5311 if ($token->{type} eq 'character') {
5312 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5313 ## As if in body
5314 $reconstruct_active_formatting_elements->();
5315
5316 $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
5317
5318 unless (length $token->{data}) {
5319 $token = $self->_get_next_token;
5320 redo B;
5321 }
5322 }
5323
5324 #
5325 } elsif ($token->{type} eq 'comment') {
5326 my $comment = $self->{document}->create_comment ($token->{data});
5327 $open_elements->[0]->[0]->append_child ($comment);
5328 $token = $self->_get_next_token;
5329 redo B;
5330 } elsif ($token->{type} eq 'end tag') {
5331 if ($token->{type} eq 'html') {
5332 ## TODO: if inner_html, parse-error, ignore the token; otherwise,
5333
5334 $phase = 'trailing end';
5335 $token = $self->_get_next_token;
5336 redo B;
5337 } else {
5338 #
5339 }
5340 } else {
5341 #
5342 }
5343
5344 $self->{parse_error}->();
5345 $insertion_mode = 'in body';
5346 ## reprocess
5347 redo B;
5348 } elsif ($insertion_mode eq 'in frameset') {
5349 if ($token->{type} eq 'character') {
5350 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5351 $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
5352
5353 unless (length $token->{data}) {
5354 $token = $self->_get_next_token;
5355 redo B;
5356 }
5357 }
5358
5359 #
5360 } elsif ($token->{type} eq 'comment') {
5361 my $comment = $self->{document}->create_comment ($token->{data});
5362 $open_elements->[-1]->[0]->append_child ($comment);
5363 $token = $self->_get_next_token;
5364 redo B;
5365 } elsif ($token->{type} eq 'start tag') {
5366 if ($token->{tag_name} eq 'frameset') {
5367
5368 {
5369 my $el;
5370
5371 $el = $self->{document}->create_element_ns
5372 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
5373
5374 for my $attr_name (keys %{ $token->{attributes}}) {
5375 $el->set_attribute_ns (undef, [undef, $attr_name],
5376 $token->{attributes} ->{$attr_name}->{value});
5377 }
5378
5379 $open_elements->[-1]->[0]->append_child ($el);
5380 push @$open_elements, [$el, $token->{tag_name}];
5381 }
5382
5383 $token = $self->_get_next_token;
5384 redo B;
5385 } elsif ($token->{tag_name} eq 'frame') {
5386
5387 {
5388 my $el;
5389
5390 $el = $self->{document}->create_element_ns
5391 (q<http://www.w3.org/1999/xhtml>, [undef, $token->{tag_name}]);
5392
5393 for my $attr_name (keys %{ $token->{attributes}}) {
5394 $el->set_attribute_ns (undef, [undef, $attr_name],
5395 $token->{attributes} ->{$attr_name}->{value});
5396 }
5397
5398 $open_elements->[-1]->[0]->append_child ($el);
5399 push @$open_elements, [$el, $token->{tag_name}];
5400 }
5401
5402 pop @$open_elements;
5403 $token = $self->_get_next_token;
5404 redo B;
5405 } elsif ($token->{tag_name} eq 'noframes') {
5406 $in_body->(sub {
5407 $open_elements->[-1]->[0]->append_child (shift);
5408 });
5409 redo B;
5410 } else {
5411 #
5412 }
5413 } elsif ($token->{type} eq 'end tag') {
5414 if ($token->{tag_name} eq 'frameset') {
5415 if ($open_elements->[-1]->[1] eq 'html' and
5416 @$open_elements == 1) {
5417 $self->{parse_error}->();
5418 ## Ignore the token
5419 $token = $self->_get_next_token;
5420 } else {
5421 pop @$open_elements;
5422 $token = $self->_get_next_token;
5423 }
5424
5425 ## if not inner_html and
5426 if ($open_elements->[-1]->[1] ne 'frameset') {
5427 $insertion_mode = 'after frameset';
5428 }
5429 redo B;
5430 } else {
5431 #
5432 }
5433 } else {
5434 #
5435 }
5436
5437 $self->{parse_error}->();
5438 ## Ignore the token
5439 $token = $self->_get_next_token;
5440 redo B;
5441 } elsif ($insertion_mode eq 'after frameset') {
5442 if ($token->{type} eq 'character') {
5443 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5444 $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
5445
5446 unless (length $token->{data}) {
5447 $token = $self->_get_next_token;
5448 redo B;
5449 }
5450 }
5451
5452 #
5453 } elsif ($token->{type} eq 'comment') {
5454 my $comment = $self->{document}->create_comment ($token->{data});
5455 $open_elements->[-1]->[0]->append_child ($comment);
5456 $token = $self->_get_next_token;
5457 redo B;
5458 } elsif ($token->{type} eq 'start tag') {
5459 if ($token->{tag_name} eq 'noframes') {
5460 $in_body->(sub {
5461 $open_elements->[-1]->[0]->append_child (shift);
5462 });
5463 redo B;
5464 } else {
5465 #
5466 }
5467 } elsif ($token->{type} eq 'end tag') {
5468 if ($token->{tag_name} eq 'html') {
5469 $phase = 'trailing end';
5470 $token = $self->_get_next_token;
5471 redo B;
5472 } else {
5473 #
5474 }
5475 } else {
5476 #
5477 }
5478
5479 $self->{parse_error}->();
5480 ## Ignore the token
5481 $token = $self->_get_next_token;
5482 redo B;
5483
5484 ## ISSUE: An issue in spec there
5485 } else {
5486 die "$0: $insertion_mode: Unknown insertion mode";
5487 }
5488 }
5489 } elsif ($phase eq 'trailing end') {
5490 ## states in the main stage is preserved yet # MUST
5491
5492 if ($token->{type} eq 'DOCTYPE') {
5493 $self->{parse_error}->();
5494 ## Ignore the token
5495 $token = $self->_get_next_token;
5496 redo B;
5497 } elsif ($token->{type} eq 'comment') {
5498 my $comment = $self->{document}->create_comment ($token->{data});
5499 $self->{document}->append_child ($comment);
5500 $token = $self->_get_next_token;
5501 redo B;
5502 } elsif ($token->{type} eq 'character') {
5503 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5504 ## As if in the main phase.
5505 ## NOTE: The insertion mode in the main phase
5506 ## just before the phase has been changed to the trailing
5507 ## end phase is either "after body" or "after frameset".
5508 $reconstruct_active_formatting_elements->()
5509 if $phase eq 'main';
5510
5511 $open_elements->[-1]->[0]->manakai_append_text ($token->{data});
5512
5513 unless (length $token->{data}) {
5514 $token = $self->_get_next_token;
5515 redo B;
5516 }
5517 }
5518
5519 $self->{parse_error}->();
5520 $phase = 'main';
5521 ## reprocess
5522 redo B;
5523 } elsif ($token->{type} eq 'start tag' or
5524 $token->{type} eq 'end tag') {
5525 $self->{parse_error}->();
5526 $phase = 'main';
5527 ## reprocess
5528 redo B;
5529 } elsif ($token->{type} eq 'end-of-file') {
5530 ## Stop parsing
5531 last B;
5532 } else {
5533 die "$0: $token->{type}: Unknown token";
5534 }
5535 }
5536 } # B
5537
5538 ## Stop parsing # MUST
5539
5540 ## TODO: script stuffs
5541 } # _construct_tree
5542
## Serializes the descendants of |$node| into an HTML fragment string.
## (Presumably an implementation of the HTML5 draft's |innerHTML|
## getter steps, judging by the "Step 1/2/3" structure -- confirm
## against the spec revision this file tracks.)
##
## Arguments:
##   $class    - Invocant; not used by this method.
##   $node     - DOM node whose child nodes are serialized.  The node
##               itself is never emitted, only what is below it.
##   $on_error - Code reference invoked with each child node whose node
##               type is not one of 1, 3, 4, 5, 8 or 10.  Optional: when
##               it is false, such nodes are silently skipped.
##
## Returns a reference to the serialized string.
sub inner_html ($$$) {
  my ($class, $node, $on_error) = @_;

  ## Without a handler, an unserializable node would otherwise die with
  ## an unhelpful "not a CODE reference" error.
  $on_error ||= sub { };

  ## Elements whose text content is emitted literally, without escaping.
  my %is_cdata_element = map { $_ => 1 } qw(
    style script xmp iframe noembed noframes noscript
  );

  ## Elements that are written as a start tag only; their children and
  ## end tag are never emitted.
  my %is_void_element = map { $_ => 1 } qw(
    area base basefont bgsound br col embed frame hr
    img input link meta param spacer wbr
  );

  ## Escapes the four characters replaced in both attribute values and
  ## ordinary text.  |&| has to be handled first so that the entities
  ## produced by the later substitutions are not escaped a second time.
  my $escape = sub {
    my $value = shift;
    $value =~ s/&/&amp;/g;
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;
    $value =~ s/"/&quot;/g;
    return $value;
  };

  ## Step 1
  my $s = '';

  ## Text is emitted raw from the start if |$node| itself or any of its
  ## ancestors is one of the literal-text XHTML elements.
  my $in_cdata;
  my $parent = $node;
  while (defined $parent) {
    if ($parent->node_type == 1) {
      my $ns = $parent->namespace_uri;
      if (defined $ns and $ns eq 'http://www.w3.org/1999/xhtml' and
          $is_cdata_element{$parent->local_name}) { ## TODO: case thingy
        $in_cdata = 1;
      }
    }
    $parent = $parent->parent_node;
  }

  ## Step 2
  ## |@node| is a work list processed from the front.  It holds DOM
  ## nodes still to be serialized, interleaved with plain strings:
  ## pending end tags, and the marker 'cdata-out' which switches
  ## literal-text mode off again once a literal-text element is closed.
  my @node = @{$node->child_nodes};
  C: while (@node) {
    my $child = shift @node;
    unless (ref $child) {
      if ($child eq 'cdata-out') {
        $in_cdata = 0;
      } else {
        $s .= $child; # end tag
      }
      next C;
    }

    my $nt = $child->node_type;
    if ($nt == 1) { # Element
      my $tag_name = lc $child->tag_name; ## ISSUE: Definition of "lowercase"
      $s .= '<' . $tag_name;

      ## ISSUE: Non-html elements

      ## Attribute order is implementation dependent, but MUST be stable.
      for my $attr (@{$child->attributes}) {
        my $attr_name = lc $attr->name; ## ISSUE: Definition of "lowercase"
        $s .= ' ' . $attr_name . '="' . $escape->($attr->value) . '"';
      }
      $s .= '>';

      next C if $is_void_element{$tag_name};

      ## Entering a literal-text element: queue the marker first so that
      ## it ends up directly behind this element's end tag.
      if (not $in_cdata and $is_cdata_element{$tag_name}) {
        unshift @node, 'cdata-out';
        $in_cdata = 1;
      }

      unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';
    } elsif ($nt == 3 or $nt == 4) { # Text, CDATA section
      $s .= $in_cdata ? $child->data : $escape->($child->data);
    } elsif ($nt == 8) { # Comment
      $s .= '<!--' . $child->data . '-->';
    } elsif ($nt == 10) { # Document type
      $s .= '<!DOCTYPE ' . $child->name . '>';
    } elsif ($nt == 5) { # entrefs
      ## Expand the reference in place.  The children must go to the
      ## FRONT of the work list; appending them to the end would emit
      ## them after the following siblings and after the end tags of
      ## every element that is still open.
      unshift @node, @{$child->child_nodes};
    } else {
      $on_error->($child);
    }
  } # C

  ## Step 3
  return \$s;
} # inner_html
5638
5639 1;
5640 # $Date: 2007/04/30 11:45:24 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24