/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.45 - (show annotations) (download) (as text)
Sat Jul 21 08:17:43 2007 UTC (17 years, 3 months ago) by wakaba
Branch: MAIN
Changes since 1.44: +4 -4 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	21 Jul 2007 08:17:40 -0000
	* tree-test-2.dat: New tests for |thead|, |tbody|, and |tfoot|
	fragment parsing.

2007-07-21  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	21 Jul 2007 08:17:12 -0000
	* HTML.pm.src: There is no "in table head" or "in table foot"
	insertion mode!

2007-07-21  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.44 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 ## ISSUE:
6 ## var doc = implementation.createDocument (null, null, null);
7 ## doc.write ('');
8 ## alert (doc.compatMode);
9
10 ## ISSUE: HTML5 revision 967 says that the encoding layer MUST NOT
11 ## strip BOM and the HTML layer MUST ignore it. Whether we can do it
12 ## is not yet clear.
13 ## "{U+FEFF}..." in UTF-16BE/UTF-16LE is three or four characters?
14 ## "{U+FEFF}..." in GB18030?
15
## Start tags that may be written with a trailing "/" (e.g. |<br/>|).
## The tokenizer consults this table when it sees "/" inside a tag and
## reports a |nestc| parse error for any start tag not listed here.
my $permitted_slash_tag_name = {
  map { ($_ => 1) } qw(
    base link meta hr br img embed param area col input
  )
};
29
## Maps each code point in the C1 range 0x80..0x9F to a replacement
## code point. Slots without a dedicated replacement map to U+FFFD
## REPLACEMENT CHARACTER.
## NOTE(review): presumably consulted when a numeric character
## reference names a C1 code point -- the consumer is outside this
## part of the file.
my $c1_entity_char = do {
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
  );
  my %replacement_of;
  @replacement_of{0x80 .. 0x9F} = @replacement;
  \%replacement_of;
}; # $c1_entity_char
64
## Element names belonging to the "special" category (lookup set:
## every listed name maps to 1).
my $special_category = {
  map { ($_ => 1) } qw(
    address area base basefont bgsound blockquote body br center col
    colgroup dd dir div dl dt embed fieldset form frame frameset
    h1 h2 h3 h4 h5 h6 head hr iframe image img input isindex li link
    listing menu meta noembed noframes noscript ol optgroup option p
    param plaintext pre script select spacer style tbody textarea
    tfoot thead title tr ul wbr
  )
};
## Element names belonging to the "scoping" category (lookup set:
## every listed name maps to 1).
my $scoping_category = {
  map { ($_ => 1) } qw(
    button caption html marquee object table td th
  )
};
## Element names belonging to the "formatting" category (lookup set:
## every listed name maps to 1).
##
## The entry for |strike| used to be misspelled "strile", so |strike|
## elements were never recognized as formatting elements.
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
85 # $phrasing_category: all other elements
86
## Parses a string as an HTML document.
##
##   $class->parse_string ($string, $document [, $onerror])
##
## A fresh parser object is created for every call. |$string| is
## read in place (through a reference, not a copy), |$document| is
## stored as the tree construction target, and |$onerror|, when true,
## is invoked for every parse error with the error's key/value pairs
## followed by |line => ..., column => ...|. Without |$onerror|, each
## parse error is reported via |warn|.
##
## Returns |$document|.
sub parse_string ($$$;$) {
  my $self = shift->new;
  my $input = \$_[0];
  my $pos = 0;
  my ($line, $column) = (1, 0);

  $self->{document} = $_[1];

  ## NOTE: |set_inner_html| copies most of this method's code

  ## Input stream callback: shifts the three-character history,
  ## then loads the next character (or -1 at end of input) into
  ## |$self->{next_input_character}|, keeping |$line| and |$column|
  ## up to date.
  $self->{set_next_input_character} = sub {
    my $self = shift;

    pop @{$self->{prev_input_character}};
    unshift @{$self->{prev_input_character}}, $self->{next_input_character};

    if ($pos >= length $$input) {
      $self->{next_input_character} = -1; # end of input
      return;
    }

    $self->{next_input_character} = ord substr ($$input, $pos++, 1);
    $column++;

    my $c = $self->{next_input_character};
    if ($c == 0x000A or $c == 0x000D) { # LF or CR
      if ($c == 0x000D) {
        ## CR and CR LF are both delivered as a single LF.
        $pos++ if substr ($$input, $pos, 1) eq "\x0A";
        $self->{next_input_character} = 0x000A; # LF # MUST
      }
      $line++;
      $column = 0;
    } elsif ($c > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($c == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $self->{prev_input_character} = [-1, -1, -1];
  $self->{next_input_character} = -1;

  ## Default error handler: report on STDERR.
  my $onerror = $_[2] || sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  ## Every reported error is annotated with the current position.
  $self->{parse_error} = sub {
    $onerror->(@_, line => $line, column => $column);
  };

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  return $self->{document};
} # parse_string
140
## Creates a parser object with inert default callbacks:
##
##   |set_next_input_character| -- always reports end of input (-1);
##   |parse_error|              -- silently ignores the error.
##
## |parse_string| replaces both callbacks with working ones.
##
## The input callback takes the parser as its first argument (the
## same convention as the callback installed by |parse_string|)
## instead of closing over |$self|. Capturing |$self| would create a
## reference cycle (object -> closure -> object) and the object would
## never be freed.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;
  $self->{set_next_input_character} = sub {
    my $self = shift;
    $self->{next_input_character} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  return $self;
} # new
152
## Content model flag bits: which kinds of markup the tokenizer
## recognizes in character data.
sub CM_ENTITY         () { 1 << 0 } # & markup in data
sub CM_LIMITED_MARKUP () { 1 << 1 } # < markup in data (limited)
sub CM_FULL_MARKUP    () { 1 << 2 } # < markup in data (any)

## Content models, expressed as combinations of the bits above.
sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL     () { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL    () { CM_LIMITED_MARKUP | CM_ENTITY }
sub PCDATA_CONTENT_MODEL    () { CM_FULL_MARKUP | CM_ENTITY }
161
162 ## Implementations MUST act as if they used the state machine in the spec
163
## Resets the tokenizer: the state becomes |data|, the content model
## becomes PCDATA, all partially built token/attribute bookkeeping is
## cleared, and the first input character is loaded.
sub _initialize_tokenizer ($) {
  my $self = shift;

  $self->{state} = 'data'; # MUST
  $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST

  ## |current_token| is a start tag, end tag, comment, or DOCTYPE
  ## token while one is being built; nothing is being built yet.
  $self->{$_} = undef for qw(
    current_token
    current_attribute
    last_emitted_start_tag_name
    last_attribute_value_state
  );

  $self->{char} = [];
  # $self->{next_input_character}
  !!!next-input-character;
  $self->{token} = []; # queue drained by |_get_next_token|
  # $self->{escape}
} # _initialize_tokenizer
178
179 ## A token has:
180 ## ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
181 ## 'character', or 'end-of-file'
182 ## ->{name} (DOCTYPE, start tag (tag name), end tag (tag name))
183 ## ->{public_identifier} (DOCTYPE)
184 ## ->{system_identifier} (DOCTYPE)
185 ## ->{correct} == 1 or 0 (DOCTYPE)
186 ## ->{attributes} isa HASH (start tag, end tag)
187 ## ->{data} (comment, character)
188
189 ## An emitted token MUST immediately be handled by the tree construction stage.
190
191 ## Before each step, UA MAY check to see if either one of the scripts in
192 ## "list of scripts that will execute as soon as possible" or the first
193 ## script in the "list of scripts that will execute asynchronously",
194 ## has completed loading. If one has, then it MUST be executed
195 ## and removed from the list.
196
197 sub _get_next_token ($) {
198 my $self = shift;
199 if (@{$self->{token}}) {
200 return shift @{$self->{token}};
201 }
202
203 A: {
204 if ($self->{state} eq 'data') {
205 if ($self->{next_input_character} == 0x0026) { # &
206 if ($self->{content_model} & CM_ENTITY) { # PCDATA | RCDATA
207 $self->{state} = 'entity data';
208 !!!next-input-character;
209 redo A;
210 } else {
211 #
212 }
213 } elsif ($self->{next_input_character} == 0x002D) { # -
214 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
215 unless ($self->{escape}) {
216 if ($self->{prev_input_character}->[0] == 0x002D and # -
217 $self->{prev_input_character}->[1] == 0x0021 and # !
218 $self->{prev_input_character}->[2] == 0x003C) { # <
219 $self->{escape} = 1;
220 }
221 }
222 }
223
224 #
225 } elsif ($self->{next_input_character} == 0x003C) { # <
226 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
227 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
228 not $self->{escape})) {
229 $self->{state} = 'tag open';
230 !!!next-input-character;
231 redo A;
232 } else {
233 #
234 }
235 } elsif ($self->{next_input_character} == 0x003E) { # >
236 if ($self->{escape} and
237 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
238 if ($self->{prev_input_character}->[0] == 0x002D and # -
239 $self->{prev_input_character}->[1] == 0x002D) { # -
240 delete $self->{escape};
241 }
242 }
243
244 #
245 } elsif ($self->{next_input_character} == -1) {
246 !!!emit ({type => 'end-of-file'});
247 last A; ## TODO: ok?
248 }
249 # Anything else
250 my $token = {type => 'character',
251 data => chr $self->{next_input_character}};
252 ## Stay in the data state
253 !!!next-input-character;
254
255 !!!emit ($token);
256
257 redo A;
258 } elsif ($self->{state} eq 'entity data') {
259 ## (cannot happen in CDATA state)
260
261 my $token = $self->_tokenize_attempt_to_consume_an_entity (0);
262
263 $self->{state} = 'data';
264 # next-input-character is already done
265
266 unless (defined $token) {
267 !!!emit ({type => 'character', data => '&'});
268 } else {
269 !!!emit ($token);
270 }
271
272 redo A;
273 } elsif ($self->{state} eq 'tag open') {
274 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
275 if ($self->{next_input_character} == 0x002F) { # /
276 !!!next-input-character;
277 $self->{state} = 'close tag open';
278 redo A;
279 } else {
280 ## reconsume
281 $self->{state} = 'data';
282
283 !!!emit ({type => 'character', data => '<'});
284
285 redo A;
286 }
287 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
288 if ($self->{next_input_character} == 0x0021) { # !
289 $self->{state} = 'markup declaration open';
290 !!!next-input-character;
291 redo A;
292 } elsif ($self->{next_input_character} == 0x002F) { # /
293 $self->{state} = 'close tag open';
294 !!!next-input-character;
295 redo A;
296 } elsif (0x0041 <= $self->{next_input_character} and
297 $self->{next_input_character} <= 0x005A) { # A..Z
298 $self->{current_token}
299 = {type => 'start tag',
300 tag_name => chr ($self->{next_input_character} + 0x0020)};
301 $self->{state} = 'tag name';
302 !!!next-input-character;
303 redo A;
304 } elsif (0x0061 <= $self->{next_input_character} and
305 $self->{next_input_character} <= 0x007A) { # a..z
306 $self->{current_token} = {type => 'start tag',
307 tag_name => chr ($self->{next_input_character})};
308 $self->{state} = 'tag name';
309 !!!next-input-character;
310 redo A;
311 } elsif ($self->{next_input_character} == 0x003E) { # >
312 !!!parse-error (type => 'empty start tag');
313 $self->{state} = 'data';
314 !!!next-input-character;
315
316 !!!emit ({type => 'character', data => '<>'});
317
318 redo A;
319 } elsif ($self->{next_input_character} == 0x003F) { # ?
320 !!!parse-error (type => 'pio');
321 $self->{state} = 'bogus comment';
322 ## $self->{next_input_character} is intentionally left as is
323 redo A;
324 } else {
325 !!!parse-error (type => 'bare stago');
326 $self->{state} = 'data';
327 ## reconsume
328
329 !!!emit ({type => 'character', data => '<'});
330
331 redo A;
332 }
333 } else {
334 die "$0: $self->{content_model} in tag open";
335 }
336 } elsif ($self->{state} eq 'close tag open') {
337 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
338 if (defined $self->{last_emitted_start_tag_name}) {
339 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
340 my @next_char;
341 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
342 push @next_char, $self->{next_input_character};
343 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
344 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
345 if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
346 !!!next-input-character;
347 next TAGNAME;
348 } else {
349 $self->{next_input_character} = shift @next_char; # reconsume
350 !!!back-next-input-character (@next_char);
351 $self->{state} = 'data';
352
353 !!!emit ({type => 'character', data => '</'});
354
355 redo A;
356 }
357 }
358 push @next_char, $self->{next_input_character};
359
360 unless ($self->{next_input_character} == 0x0009 or # HT
361 $self->{next_input_character} == 0x000A or # LF
362 $self->{next_input_character} == 0x000B or # VT
363 $self->{next_input_character} == 0x000C or # FF
364 $self->{next_input_character} == 0x0020 or # SP
365 $self->{next_input_character} == 0x003E or # >
366 $self->{next_input_character} == 0x002F or # /
367 $self->{next_input_character} == -1) {
368 $self->{next_input_character} = shift @next_char; # reconsume
369 !!!back-next-input-character (@next_char);
370 $self->{state} = 'data';
371 !!!emit ({type => 'character', data => '</'});
372 redo A;
373 } else {
374 $self->{next_input_character} = shift @next_char;
375 !!!back-next-input-character (@next_char);
376 # and consume...
377 }
378 } else {
379 ## No start tag token has ever been emitted
380 # next-input-character is already done
381 $self->{state} = 'data';
382 !!!emit ({type => 'character', data => '</'});
383 redo A;
384 }
385 }
386
387 if (0x0041 <= $self->{next_input_character} and
388 $self->{next_input_character} <= 0x005A) { # A..Z
389 $self->{current_token} = {type => 'end tag',
390 tag_name => chr ($self->{next_input_character} + 0x0020)};
391 $self->{state} = 'tag name';
392 !!!next-input-character;
393 redo A;
394 } elsif (0x0061 <= $self->{next_input_character} and
395 $self->{next_input_character} <= 0x007A) { # a..z
396 $self->{current_token} = {type => 'end tag',
397 tag_name => chr ($self->{next_input_character})};
398 $self->{state} = 'tag name';
399 !!!next-input-character;
400 redo A;
401 } elsif ($self->{next_input_character} == 0x003E) { # >
402 !!!parse-error (type => 'empty end tag');
403 $self->{state} = 'data';
404 !!!next-input-character;
405 redo A;
406 } elsif ($self->{next_input_character} == -1) {
407 !!!parse-error (type => 'bare etago');
408 $self->{state} = 'data';
409 # reconsume
410
411 !!!emit ({type => 'character', data => '</'});
412
413 redo A;
414 } else {
415 !!!parse-error (type => 'bogus end tag');
416 $self->{state} = 'bogus comment';
417 ## $self->{next_input_character} is intentionally left as is
418 redo A;
419 }
420 } elsif ($self->{state} eq 'tag name') {
421 if ($self->{next_input_character} == 0x0009 or # HT
422 $self->{next_input_character} == 0x000A or # LF
423 $self->{next_input_character} == 0x000B or # VT
424 $self->{next_input_character} == 0x000C or # FF
425 $self->{next_input_character} == 0x0020) { # SP
426 $self->{state} = 'before attribute name';
427 !!!next-input-character;
428 redo A;
429 } elsif ($self->{next_input_character} == 0x003E) { # >
430 if ($self->{current_token}->{type} eq 'start tag') {
431 $self->{current_token}->{first_start_tag}
432 = not defined $self->{last_emitted_start_tag_name};
433 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
434 } elsif ($self->{current_token}->{type} eq 'end tag') {
435 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
436 if ($self->{current_token}->{attributes}) {
437 !!!parse-error (type => 'end tag attribute');
438 }
439 } else {
440 die "$0: $self->{current_token}->{type}: Unknown token type";
441 }
442 $self->{state} = 'data';
443 !!!next-input-character;
444
445 !!!emit ($self->{current_token}); # start tag or end tag
446
447 redo A;
448 } elsif (0x0041 <= $self->{next_input_character} and
449 $self->{next_input_character} <= 0x005A) { # A..Z
450 $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
451 # start tag or end tag
452 ## Stay in this state
453 !!!next-input-character;
454 redo A;
455 } elsif ($self->{next_input_character} == -1) {
456 !!!parse-error (type => 'unclosed tag');
457 if ($self->{current_token}->{type} eq 'start tag') {
458 $self->{current_token}->{first_start_tag}
459 = not defined $self->{last_emitted_start_tag_name};
460 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
461 } elsif ($self->{current_token}->{type} eq 'end tag') {
462 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
463 if ($self->{current_token}->{attributes}) {
464 !!!parse-error (type => 'end tag attribute');
465 }
466 } else {
467 die "$0: $self->{current_token}->{type}: Unknown token type";
468 }
469 $self->{state} = 'data';
470 # reconsume
471
472 !!!emit ($self->{current_token}); # start tag or end tag
473
474 redo A;
475 } elsif ($self->{next_input_character} == 0x002F) { # /
476 !!!next-input-character;
477 if ($self->{next_input_character} == 0x003E and # >
478 $self->{current_token}->{type} eq 'start tag' and
479 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
480 # permitted slash
481 #
482 } else {
483 !!!parse-error (type => 'nestc');
484 }
485 $self->{state} = 'before attribute name';
486 # next-input-character is already done
487 redo A;
488 } else {
489 $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
490 # start tag or end tag
491 ## Stay in the state
492 !!!next-input-character;
493 redo A;
494 }
495 } elsif ($self->{state} eq 'before attribute name') {
496 if ($self->{next_input_character} == 0x0009 or # HT
497 $self->{next_input_character} == 0x000A or # LF
498 $self->{next_input_character} == 0x000B or # VT
499 $self->{next_input_character} == 0x000C or # FF
500 $self->{next_input_character} == 0x0020) { # SP
501 ## Stay in the state
502 !!!next-input-character;
503 redo A;
504 } elsif ($self->{next_input_character} == 0x003E) { # >
505 if ($self->{current_token}->{type} eq 'start tag') {
506 $self->{current_token}->{first_start_tag}
507 = not defined $self->{last_emitted_start_tag_name};
508 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
509 } elsif ($self->{current_token}->{type} eq 'end tag') {
510 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
511 if ($self->{current_token}->{attributes}) {
512 !!!parse-error (type => 'end tag attribute');
513 }
514 } else {
515 die "$0: $self->{current_token}->{type}: Unknown token type";
516 }
517 $self->{state} = 'data';
518 !!!next-input-character;
519
520 !!!emit ($self->{current_token}); # start tag or end tag
521
522 redo A;
523 } elsif (0x0041 <= $self->{next_input_character} and
524 $self->{next_input_character} <= 0x005A) { # A..Z
525 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
526 value => ''};
527 $self->{state} = 'attribute name';
528 !!!next-input-character;
529 redo A;
530 } elsif ($self->{next_input_character} == 0x002F) { # /
531 !!!next-input-character;
532 if ($self->{next_input_character} == 0x003E and # >
533 $self->{current_token}->{type} eq 'start tag' and
534 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
535 # permitted slash
536 #
537 } else {
538 !!!parse-error (type => 'nestc');
539 }
540 ## Stay in the state
541 # next-input-character is already done
542 redo A;
543 } elsif ($self->{next_input_character} == -1) {
544 !!!parse-error (type => 'unclosed tag');
545 if ($self->{current_token}->{type} eq 'start tag') {
546 $self->{current_token}->{first_start_tag}
547 = not defined $self->{last_emitted_start_tag_name};
548 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
549 } elsif ($self->{current_token}->{type} eq 'end tag') {
550 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
551 if ($self->{current_token}->{attributes}) {
552 !!!parse-error (type => 'end tag attribute');
553 }
554 } else {
555 die "$0: $self->{current_token}->{type}: Unknown token type";
556 }
557 $self->{state} = 'data';
558 # reconsume
559
560 !!!emit ($self->{current_token}); # start tag or end tag
561
562 redo A;
563 } else {
564 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
565 value => ''};
566 $self->{state} = 'attribute name';
567 !!!next-input-character;
568 redo A;
569 }
570 } elsif ($self->{state} eq 'attribute name') {
571 my $before_leave = sub {
572 if (exists $self->{current_token}->{attributes} # start tag or end tag
573 ->{$self->{current_attribute}->{name}}) { # MUST
574 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name});
575 ## Discard $self->{current_attribute} # MUST
576 } else {
577 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
578 = $self->{current_attribute};
579 }
580 }; # $before_leave
581
582 if ($self->{next_input_character} == 0x0009 or # HT
583 $self->{next_input_character} == 0x000A or # LF
584 $self->{next_input_character} == 0x000B or # VT
585 $self->{next_input_character} == 0x000C or # FF
586 $self->{next_input_character} == 0x0020) { # SP
587 $before_leave->();
588 $self->{state} = 'after attribute name';
589 !!!next-input-character;
590 redo A;
591 } elsif ($self->{next_input_character} == 0x003D) { # =
592 $before_leave->();
593 $self->{state} = 'before attribute value';
594 !!!next-input-character;
595 redo A;
596 } elsif ($self->{next_input_character} == 0x003E) { # >
597 $before_leave->();
598 if ($self->{current_token}->{type} eq 'start tag') {
599 $self->{current_token}->{first_start_tag}
600 = not defined $self->{last_emitted_start_tag_name};
601 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
602 } elsif ($self->{current_token}->{type} eq 'end tag') {
603 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
604 if ($self->{current_token}->{attributes}) {
605 !!!parse-error (type => 'end tag attribute');
606 }
607 } else {
608 die "$0: $self->{current_token}->{type}: Unknown token type";
609 }
610 $self->{state} = 'data';
611 !!!next-input-character;
612
613 !!!emit ($self->{current_token}); # start tag or end tag
614
615 redo A;
616 } elsif (0x0041 <= $self->{next_input_character} and
617 $self->{next_input_character} <= 0x005A) { # A..Z
618 $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
619 ## Stay in the state
620 !!!next-input-character;
621 redo A;
622 } elsif ($self->{next_input_character} == 0x002F) { # /
623 $before_leave->();
624 !!!next-input-character;
625 if ($self->{next_input_character} == 0x003E and # >
626 $self->{current_token}->{type} eq 'start tag' and
627 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
628 # permitted slash
629 #
630 } else {
631 !!!parse-error (type => 'nestc');
632 }
633 $self->{state} = 'before attribute name';
634 # next-input-character is already done
635 redo A;
636 } elsif ($self->{next_input_character} == -1) {
637 !!!parse-error (type => 'unclosed tag');
638 $before_leave->();
639 if ($self->{current_token}->{type} eq 'start tag') {
640 $self->{current_token}->{first_start_tag}
641 = not defined $self->{last_emitted_start_tag_name};
642 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
643 } elsif ($self->{current_token}->{type} eq 'end tag') {
644 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
645 if ($self->{current_token}->{attributes}) {
646 !!!parse-error (type => 'end tag attribute');
647 }
648 } else {
649 die "$0: $self->{current_token}->{type}: Unknown token type";
650 }
651 $self->{state} = 'data';
652 # reconsume
653
654 !!!emit ($self->{current_token}); # start tag or end tag
655
656 redo A;
657 } else {
658 $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
659 ## Stay in the state
660 !!!next-input-character;
661 redo A;
662 }
663 } elsif ($self->{state} eq 'after attribute name') {
664 if ($self->{next_input_character} == 0x0009 or # HT
665 $self->{next_input_character} == 0x000A or # LF
666 $self->{next_input_character} == 0x000B or # VT
667 $self->{next_input_character} == 0x000C or # FF
668 $self->{next_input_character} == 0x0020) { # SP
669 ## Stay in the state
670 !!!next-input-character;
671 redo A;
672 } elsif ($self->{next_input_character} == 0x003D) { # =
673 $self->{state} = 'before attribute value';
674 !!!next-input-character;
675 redo A;
676 } elsif ($self->{next_input_character} == 0x003E) { # >
677 if ($self->{current_token}->{type} eq 'start tag') {
678 $self->{current_token}->{first_start_tag}
679 = not defined $self->{last_emitted_start_tag_name};
680 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
681 } elsif ($self->{current_token}->{type} eq 'end tag') {
682 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
683 if ($self->{current_token}->{attributes}) {
684 !!!parse-error (type => 'end tag attribute');
685 }
686 } else {
687 die "$0: $self->{current_token}->{type}: Unknown token type";
688 }
689 $self->{state} = 'data';
690 !!!next-input-character;
691
692 !!!emit ($self->{current_token}); # start tag or end tag
693
694 redo A;
695 } elsif (0x0041 <= $self->{next_input_character} and
696 $self->{next_input_character} <= 0x005A) { # A..Z
697 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
698 value => ''};
699 $self->{state} = 'attribute name';
700 !!!next-input-character;
701 redo A;
702 } elsif ($self->{next_input_character} == 0x002F) { # /
703 !!!next-input-character;
704 if ($self->{next_input_character} == 0x003E and # >
705 $self->{current_token}->{type} eq 'start tag' and
706 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
707 # permitted slash
708 #
709 } else {
710 !!!parse-error (type => 'nestc');
711 ## TODO: Different error type for <aa / bb> than <aa/>
712 }
713 $self->{state} = 'before attribute name';
714 # next-input-character is already done
715 redo A;
716 } elsif ($self->{next_input_character} == -1) {
717 !!!parse-error (type => 'unclosed tag');
718 if ($self->{current_token}->{type} eq 'start tag') {
719 $self->{current_token}->{first_start_tag}
720 = not defined $self->{last_emitted_start_tag_name};
721 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
722 } elsif ($self->{current_token}->{type} eq 'end tag') {
723 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
724 if ($self->{current_token}->{attributes}) {
725 !!!parse-error (type => 'end tag attribute');
726 }
727 } else {
728 die "$0: $self->{current_token}->{type}: Unknown token type";
729 }
730 $self->{state} = 'data';
731 # reconsume
732
733 !!!emit ($self->{current_token}); # start tag or end tag
734
735 redo A;
736 } else {
737 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
738 value => ''};
739 $self->{state} = 'attribute name';
740 !!!next-input-character;
741 redo A;
742 }
743 } elsif ($self->{state} eq 'before attribute value') {
744 if ($self->{next_input_character} == 0x0009 or # HT
745 $self->{next_input_character} == 0x000A or # LF
746 $self->{next_input_character} == 0x000B or # VT
747 $self->{next_input_character} == 0x000C or # FF
748 $self->{next_input_character} == 0x0020) { # SP
749 ## Stay in the state
750 !!!next-input-character;
751 redo A;
752 } elsif ($self->{next_input_character} == 0x0022) { # "
753 $self->{state} = 'attribute value (double-quoted)';
754 !!!next-input-character;
755 redo A;
756 } elsif ($self->{next_input_character} == 0x0026) { # &
757 $self->{state} = 'attribute value (unquoted)';
758 ## reconsume
759 redo A;
760 } elsif ($self->{next_input_character} == 0x0027) { # '
761 $self->{state} = 'attribute value (single-quoted)';
762 !!!next-input-character;
763 redo A;
764 } elsif ($self->{next_input_character} == 0x003E) { # >
765 if ($self->{current_token}->{type} eq 'start tag') {
766 $self->{current_token}->{first_start_tag}
767 = not defined $self->{last_emitted_start_tag_name};
768 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
769 } elsif ($self->{current_token}->{type} eq 'end tag') {
770 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
771 if ($self->{current_token}->{attributes}) {
772 !!!parse-error (type => 'end tag attribute');
773 }
774 } else {
775 die "$0: $self->{current_token}->{type}: Unknown token type";
776 }
777 $self->{state} = 'data';
778 !!!next-input-character;
779
780 !!!emit ($self->{current_token}); # start tag or end tag
781
782 redo A;
783 } elsif ($self->{next_input_character} == -1) {
784 !!!parse-error (type => 'unclosed tag');
785 if ($self->{current_token}->{type} eq 'start tag') {
786 $self->{current_token}->{first_start_tag}
787 = not defined $self->{last_emitted_start_tag_name};
788 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
789 } elsif ($self->{current_token}->{type} eq 'end tag') {
790 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
791 if ($self->{current_token}->{attributes}) {
792 !!!parse-error (type => 'end tag attribute');
793 }
794 } else {
795 die "$0: $self->{current_token}->{type}: Unknown token type";
796 }
797 $self->{state} = 'data';
798 ## reconsume
799
800 !!!emit ($self->{current_token}); # start tag or end tag
801
802 redo A;
803 } else {
804 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
805 $self->{state} = 'attribute value (unquoted)';
806 !!!next-input-character;
807 redo A;
808 }
809 } elsif ($self->{state} eq 'attribute value (double-quoted)') {
810 if ($self->{next_input_character} == 0x0022) { # "
811 $self->{state} = 'before attribute name';
812 !!!next-input-character;
813 redo A;
814 } elsif ($self->{next_input_character} == 0x0026) { # &
815 $self->{last_attribute_value_state} = 'attribute value (double-quoted)';
816 $self->{state} = 'entity in attribute value';
817 !!!next-input-character;
818 redo A;
819 } elsif ($self->{next_input_character} == -1) {
820 !!!parse-error (type => 'unclosed attribute value');
821 if ($self->{current_token}->{type} eq 'start tag') {
822 $self->{current_token}->{first_start_tag}
823 = not defined $self->{last_emitted_start_tag_name};
824 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
825 } elsif ($self->{current_token}->{type} eq 'end tag') {
826 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
827 if ($self->{current_token}->{attributes}) {
828 !!!parse-error (type => 'end tag attribute');
829 }
830 } else {
831 die "$0: $self->{current_token}->{type}: Unknown token type";
832 }
833 $self->{state} = 'data';
834 ## reconsume
835
836 !!!emit ($self->{current_token}); # start tag or end tag
837
838 redo A;
839 } else {
840 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
841 ## Stay in the state
842 !!!next-input-character;
843 redo A;
844 }
845 } elsif ($self->{state} eq 'attribute value (single-quoted)') {
846 if ($self->{next_input_character} == 0x0027) { # '
847 $self->{state} = 'before attribute name';
848 !!!next-input-character;
849 redo A;
850 } elsif ($self->{next_input_character} == 0x0026) { # &
851 $self->{last_attribute_value_state} = 'attribute value (single-quoted)';
852 $self->{state} = 'entity in attribute value';
853 !!!next-input-character;
854 redo A;
855 } elsif ($self->{next_input_character} == -1) {
856 !!!parse-error (type => 'unclosed attribute value');
857 if ($self->{current_token}->{type} eq 'start tag') {
858 $self->{current_token}->{first_start_tag}
859 = not defined $self->{last_emitted_start_tag_name};
860 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
861 } elsif ($self->{current_token}->{type} eq 'end tag') {
862 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
863 if ($self->{current_token}->{attributes}) {
864 !!!parse-error (type => 'end tag attribute');
865 }
866 } else {
867 die "$0: $self->{current_token}->{type}: Unknown token type";
868 }
869 $self->{state} = 'data';
870 ## reconsume
871
872 !!!emit ($self->{current_token}); # start tag or end tag
873
874 redo A;
875 } else {
876 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
877 ## Stay in the state
878 !!!next-input-character;
879 redo A;
880 }
881 } elsif ($self->{state} eq 'attribute value (unquoted)') {
882 if ($self->{next_input_character} == 0x0009 or # HT
883 $self->{next_input_character} == 0x000A or # LF
884 $self->{next_input_character} == 0x000B or # HT
885 $self->{next_input_character} == 0x000C or # FF
886 $self->{next_input_character} == 0x0020) { # SP
887 $self->{state} = 'before attribute name';
888 !!!next-input-character;
889 redo A;
890 } elsif ($self->{next_input_character} == 0x0026) { # &
891 $self->{last_attribute_value_state} = 'attribute value (unquoted)';
892 $self->{state} = 'entity in attribute value';
893 !!!next-input-character;
894 redo A;
895 } elsif ($self->{next_input_character} == 0x003E) { # >
896 if ($self->{current_token}->{type} eq 'start tag') {
897 $self->{current_token}->{first_start_tag}
898 = not defined $self->{last_emitted_start_tag_name};
899 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
900 } elsif ($self->{current_token}->{type} eq 'end tag') {
901 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
902 if ($self->{current_token}->{attributes}) {
903 !!!parse-error (type => 'end tag attribute');
904 }
905 } else {
906 die "$0: $self->{current_token}->{type}: Unknown token type";
907 }
908 $self->{state} = 'data';
909 !!!next-input-character;
910
911 !!!emit ($self->{current_token}); # start tag or end tag
912
913 redo A;
914 } elsif ($self->{next_input_character} == -1) {
915 !!!parse-error (type => 'unclosed tag');
916 if ($self->{current_token}->{type} eq 'start tag') {
917 $self->{current_token}->{first_start_tag}
918 = not defined $self->{last_emitted_start_tag_name};
919 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
920 } elsif ($self->{current_token}->{type} eq 'end tag') {
921 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
922 if ($self->{current_token}->{attributes}) {
923 !!!parse-error (type => 'end tag attribute');
924 }
925 } else {
926 die "$0: $self->{current_token}->{type}: Unknown token type";
927 }
928 $self->{state} = 'data';
929 ## reconsume
930
931 !!!emit ($self->{current_token}); # start tag or end tag
932
933 redo A;
934 } else {
935 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
936 ## Stay in the state
937 !!!next-input-character;
938 redo A;
939 }
940 } elsif ($self->{state} eq 'entity in attribute value') {
941 my $token = $self->_tokenize_attempt_to_consume_an_entity (1);
942
943 unless (defined $token) {
944 $self->{current_attribute}->{value} .= '&';
945 } else {
946 $self->{current_attribute}->{value} .= $token->{data};
947 ## ISSUE: spec says "append the returned character token to the current attribute's value"
948 }
949
950 $self->{state} = $self->{last_attribute_value_state};
951 # next-input-character is already done
952 redo A;
953 } elsif ($self->{state} eq 'bogus comment') {
954 ## (only happen if PCDATA state)
955
956 my $token = {type => 'comment', data => ''};
957
958 BC: {
959 if ($self->{next_input_character} == 0x003E) { # >
960 $self->{state} = 'data';
961 !!!next-input-character;
962
963 !!!emit ($token);
964
965 redo A;
966 } elsif ($self->{next_input_character} == -1) {
967 $self->{state} = 'data';
968 ## reconsume
969
970 !!!emit ($token);
971
972 redo A;
973 } else {
974 $token->{data} .= chr ($self->{next_input_character});
975 !!!next-input-character;
976 redo BC;
977 }
978 } # BC
979 } elsif ($self->{state} eq 'markup declaration open') {
980 ## (only happen if PCDATA state)
981
982 my @next_char;
983 push @next_char, $self->{next_input_character};
984
985 if ($self->{next_input_character} == 0x002D) { # -
986 !!!next-input-character;
987 push @next_char, $self->{next_input_character};
988 if ($self->{next_input_character} == 0x002D) { # -
989 $self->{current_token} = {type => 'comment', data => ''};
990 $self->{state} = 'comment start';
991 !!!next-input-character;
992 redo A;
993 }
994 } elsif ($self->{next_input_character} == 0x0044 or # D
995 $self->{next_input_character} == 0x0064) { # d
996 !!!next-input-character;
997 push @next_char, $self->{next_input_character};
998 if ($self->{next_input_character} == 0x004F or # O
999 $self->{next_input_character} == 0x006F) { # o
1000 !!!next-input-character;
1001 push @next_char, $self->{next_input_character};
1002 if ($self->{next_input_character} == 0x0043 or # C
1003 $self->{next_input_character} == 0x0063) { # c
1004 !!!next-input-character;
1005 push @next_char, $self->{next_input_character};
1006 if ($self->{next_input_character} == 0x0054 or # T
1007 $self->{next_input_character} == 0x0074) { # t
1008 !!!next-input-character;
1009 push @next_char, $self->{next_input_character};
1010 if ($self->{next_input_character} == 0x0059 or # Y
1011 $self->{next_input_character} == 0x0079) { # y
1012 !!!next-input-character;
1013 push @next_char, $self->{next_input_character};
1014 if ($self->{next_input_character} == 0x0050 or # P
1015 $self->{next_input_character} == 0x0070) { # p
1016 !!!next-input-character;
1017 push @next_char, $self->{next_input_character};
1018 if ($self->{next_input_character} == 0x0045 or # E
1019 $self->{next_input_character} == 0x0065) { # e
1020 ## ISSUE: What a stupid code this is!
1021 $self->{state} = 'DOCTYPE';
1022 !!!next-input-character;
1023 redo A;
1024 }
1025 }
1026 }
1027 }
1028 }
1029 }
1030 }
1031
1032 !!!parse-error (type => 'bogus comment');
1033 $self->{next_input_character} = shift @next_char;
1034 !!!back-next-input-character (@next_char);
1035 $self->{state} = 'bogus comment';
1036 redo A;
1037
1038 ## ISSUE: typos in spec: chacacters, is is a parse error
1039 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1040 } elsif ($self->{state} eq 'comment start') {
1041 if ($self->{next_input_character} == 0x002D) { # -
1042 $self->{state} = 'comment start dash';
1043 !!!next-input-character;
1044 redo A;
1045 } elsif ($self->{next_input_character} == 0x003E) { # >
1046 !!!parse-error (type => 'bogus comment');
1047 $self->{state} = 'data';
1048 !!!next-input-character;
1049
1050 !!!emit ($self->{current_token}); # comment
1051
1052 redo A;
1053 } elsif ($self->{next_input_character} == -1) {
1054 !!!parse-error (type => 'unclosed comment');
1055 $self->{state} = 'data';
1056 ## reconsume
1057
1058 !!!emit ($self->{current_token}); # comment
1059
1060 redo A;
1061 } else {
1062 $self->{current_token}->{data} # comment
1063 .= chr ($self->{next_input_character});
1064 $self->{state} = 'comment';
1065 !!!next-input-character;
1066 redo A;
1067 }
1068 } elsif ($self->{state} eq 'comment start dash') {
1069 if ($self->{next_input_character} == 0x002D) { # -
1070 $self->{state} = 'comment end';
1071 !!!next-input-character;
1072 redo A;
1073 } elsif ($self->{next_input_character} == 0x003E) { # >
1074 !!!parse-error (type => 'bogus comment');
1075 $self->{state} = 'data';
1076 !!!next-input-character;
1077
1078 !!!emit ($self->{current_token}); # comment
1079
1080 redo A;
1081 } elsif ($self->{next_input_character} == -1) {
1082 !!!parse-error (type => 'unclosed comment');
1083 $self->{state} = 'data';
1084 ## reconsume
1085
1086 !!!emit ($self->{current_token}); # comment
1087
1088 redo A;
1089 } else {
1090 $self->{current_token}->{data} # comment
1091 .= '-' . chr ($self->{next_input_character});
1092 $self->{state} = 'comment';
1093 !!!next-input-character;
1094 redo A;
1095 }
1096 } elsif ($self->{state} eq 'comment') {
1097 if ($self->{next_input_character} == 0x002D) { # -
1098 $self->{state} = 'comment end dash';
1099 !!!next-input-character;
1100 redo A;
1101 } elsif ($self->{next_input_character} == -1) {
1102 !!!parse-error (type => 'unclosed comment');
1103 $self->{state} = 'data';
1104 ## reconsume
1105
1106 !!!emit ($self->{current_token}); # comment
1107
1108 redo A;
1109 } else {
1110 $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1111 ## Stay in the state
1112 !!!next-input-character;
1113 redo A;
1114 }
1115 } elsif ($self->{state} eq 'comment end dash') {
1116 if ($self->{next_input_character} == 0x002D) { # -
1117 $self->{state} = 'comment end';
1118 !!!next-input-character;
1119 redo A;
1120 } elsif ($self->{next_input_character} == -1) {
1121 !!!parse-error (type => 'unclosed comment');
1122 $self->{state} = 'data';
1123 ## reconsume
1124
1125 !!!emit ($self->{current_token}); # comment
1126
1127 redo A;
1128 } else {
1129 $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1130 $self->{state} = 'comment';
1131 !!!next-input-character;
1132 redo A;
1133 }
1134 } elsif ($self->{state} eq 'comment end') {
1135 if ($self->{next_input_character} == 0x003E) { # >
1136 $self->{state} = 'data';
1137 !!!next-input-character;
1138
1139 !!!emit ($self->{current_token}); # comment
1140
1141 redo A;
1142 } elsif ($self->{next_input_character} == 0x002D) { # -
1143 !!!parse-error (type => 'dash in comment');
1144 $self->{current_token}->{data} .= '-'; # comment
1145 ## Stay in the state
1146 !!!next-input-character;
1147 redo A;
1148 } elsif ($self->{next_input_character} == -1) {
1149 !!!parse-error (type => 'unclosed comment');
1150 $self->{state} = 'data';
1151 ## reconsume
1152
1153 !!!emit ($self->{current_token}); # comment
1154
1155 redo A;
1156 } else {
1157 !!!parse-error (type => 'dash in comment');
1158 $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1159 $self->{state} = 'comment';
1160 !!!next-input-character;
1161 redo A;
1162 }
1163 } elsif ($self->{state} eq 'DOCTYPE') {
1164 if ($self->{next_input_character} == 0x0009 or # HT
1165 $self->{next_input_character} == 0x000A or # LF
1166 $self->{next_input_character} == 0x000B or # VT
1167 $self->{next_input_character} == 0x000C or # FF
1168 $self->{next_input_character} == 0x0020) { # SP
1169 $self->{state} = 'before DOCTYPE name';
1170 !!!next-input-character;
1171 redo A;
1172 } else {
1173 !!!parse-error (type => 'no space before DOCTYPE name');
1174 $self->{state} = 'before DOCTYPE name';
1175 ## reconsume
1176 redo A;
1177 }
1178 } elsif ($self->{state} eq 'before DOCTYPE name') {
1179 if ($self->{next_input_character} == 0x0009 or # HT
1180 $self->{next_input_character} == 0x000A or # LF
1181 $self->{next_input_character} == 0x000B or # VT
1182 $self->{next_input_character} == 0x000C or # FF
1183 $self->{next_input_character} == 0x0020) { # SP
1184 ## Stay in the state
1185 !!!next-input-character;
1186 redo A;
1187 } elsif ($self->{next_input_character} == 0x003E) { # >
1188 !!!parse-error (type => 'no DOCTYPE name');
1189 $self->{state} = 'data';
1190 !!!next-input-character;
1191
1192 !!!emit ({type => 'DOCTYPE'}); # incorrect
1193
1194 redo A;
1195 } elsif ($self->{next_input_character} == -1) {
1196 !!!parse-error (type => 'no DOCTYPE name');
1197 $self->{state} = 'data';
1198 ## reconsume
1199
1200 !!!emit ({type => 'DOCTYPE'}); # incorrect
1201
1202 redo A;
1203 } else {
1204 $self->{current_token}
1205 = {type => 'DOCTYPE',
1206 name => chr ($self->{next_input_character}),
1207 correct => 1};
1208 ## ISSUE: "Set the token's name name to the" in the spec
1209 $self->{state} = 'DOCTYPE name';
1210 !!!next-input-character;
1211 redo A;
1212 }
1213 } elsif ($self->{state} eq 'DOCTYPE name') {
1214 ## ISSUE: Redundant "First," in the spec.
1215 if ($self->{next_input_character} == 0x0009 or # HT
1216 $self->{next_input_character} == 0x000A or # LF
1217 $self->{next_input_character} == 0x000B or # VT
1218 $self->{next_input_character} == 0x000C or # FF
1219 $self->{next_input_character} == 0x0020) { # SP
1220 $self->{state} = 'after DOCTYPE name';
1221 !!!next-input-character;
1222 redo A;
1223 } elsif ($self->{next_input_character} == 0x003E) { # >
1224 $self->{state} = 'data';
1225 !!!next-input-character;
1226
1227 !!!emit ($self->{current_token}); # DOCTYPE
1228
1229 redo A;
1230 } elsif ($self->{next_input_character} == -1) {
1231 !!!parse-error (type => 'unclosed DOCTYPE');
1232 $self->{state} = 'data';
1233 ## reconsume
1234
1235 delete $self->{current_token}->{correct};
1236 !!!emit ($self->{current_token}); # DOCTYPE
1237
1238 redo A;
1239 } else {
1240 $self->{current_token}->{name}
1241 .= chr ($self->{next_input_character}); # DOCTYPE
1242 ## Stay in the state
1243 !!!next-input-character;
1244 redo A;
1245 }
1246 } elsif ($self->{state} eq 'after DOCTYPE name') {
1247 if ($self->{next_input_character} == 0x0009 or # HT
1248 $self->{next_input_character} == 0x000A or # LF
1249 $self->{next_input_character} == 0x000B or # VT
1250 $self->{next_input_character} == 0x000C or # FF
1251 $self->{next_input_character} == 0x0020) { # SP
1252 ## Stay in the state
1253 !!!next-input-character;
1254 redo A;
1255 } elsif ($self->{next_input_character} == 0x003E) { # >
1256 $self->{state} = 'data';
1257 !!!next-input-character;
1258
1259 !!!emit ($self->{current_token}); # DOCTYPE
1260
1261 redo A;
1262 } elsif ($self->{next_input_character} == -1) {
1263 !!!parse-error (type => 'unclosed DOCTYPE');
1264 $self->{state} = 'data';
1265 ## reconsume
1266
1267 delete $self->{current_token}->{correct};
1268 !!!emit ($self->{current_token}); # DOCTYPE
1269
1270 redo A;
1271 } elsif ($self->{next_input_character} == 0x0050 or # P
1272 $self->{next_input_character} == 0x0070) { # p
1273 !!!next-input-character;
1274 if ($self->{next_input_character} == 0x0055 or # U
1275 $self->{next_input_character} == 0x0075) { # u
1276 !!!next-input-character;
1277 if ($self->{next_input_character} == 0x0042 or # B
1278 $self->{next_input_character} == 0x0062) { # b
1279 !!!next-input-character;
1280 if ($self->{next_input_character} == 0x004C or # L
1281 $self->{next_input_character} == 0x006C) { # l
1282 !!!next-input-character;
1283 if ($self->{next_input_character} == 0x0049 or # I
1284 $self->{next_input_character} == 0x0069) { # i
1285 !!!next-input-character;
1286 if ($self->{next_input_character} == 0x0043 or # C
1287 $self->{next_input_character} == 0x0063) { # c
1288 $self->{state} = 'before DOCTYPE public identifier';
1289 !!!next-input-character;
1290 redo A;
1291 }
1292 }
1293 }
1294 }
1295 }
1296
1297 #
1298 } elsif ($self->{next_input_character} == 0x0053 or # S
1299 $self->{next_input_character} == 0x0073) { # s
1300 !!!next-input-character;
1301 if ($self->{next_input_character} == 0x0059 or # Y
1302 $self->{next_input_character} == 0x0079) { # y
1303 !!!next-input-character;
1304 if ($self->{next_input_character} == 0x0053 or # S
1305 $self->{next_input_character} == 0x0073) { # s
1306 !!!next-input-character;
1307 if ($self->{next_input_character} == 0x0054 or # T
1308 $self->{next_input_character} == 0x0074) { # t
1309 !!!next-input-character;
1310 if ($self->{next_input_character} == 0x0045 or # E
1311 $self->{next_input_character} == 0x0065) { # e
1312 !!!next-input-character;
1313 if ($self->{next_input_character} == 0x004D or # M
1314 $self->{next_input_character} == 0x006D) { # m
1315 $self->{state} = 'before DOCTYPE system identifier';
1316 !!!next-input-character;
1317 redo A;
1318 }
1319 }
1320 }
1321 }
1322 }
1323
1324 #
1325 } else {
1326 !!!next-input-character;
1327 #
1328 }
1329
1330 !!!parse-error (type => 'string after DOCTYPE name');
1331 $self->{state} = 'bogus DOCTYPE';
1332 # next-input-character is already done
1333 redo A;
1334 } elsif ($self->{state} eq 'before DOCTYPE public identifier') {
1335 if ({
1336 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1337 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1338 }->{$self->{next_input_character}}) {
1339 ## Stay in the state
1340 !!!next-input-character;
1341 redo A;
1342 } elsif ($self->{next_input_character} eq 0x0022) { # "
1343 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1344 $self->{state} = 'DOCTYPE public identifier (double-quoted)';
1345 !!!next-input-character;
1346 redo A;
1347 } elsif ($self->{next_input_character} eq 0x0027) { # '
1348 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1349 $self->{state} = 'DOCTYPE public identifier (single-quoted)';
1350 !!!next-input-character;
1351 redo A;
1352 } elsif ($self->{next_input_character} eq 0x003E) { # >
1353 !!!parse-error (type => 'no PUBLIC literal');
1354
1355 $self->{state} = 'data';
1356 !!!next-input-character;
1357
1358 delete $self->{current_token}->{correct};
1359 !!!emit ($self->{current_token}); # DOCTYPE
1360
1361 redo A;
1362 } elsif ($self->{next_input_character} == -1) {
1363 !!!parse-error (type => 'unclosed DOCTYPE');
1364
1365 $self->{state} = 'data';
1366 ## reconsume
1367
1368 delete $self->{current_token}->{correct};
1369 !!!emit ($self->{current_token}); # DOCTYPE
1370
1371 redo A;
1372 } else {
1373 !!!parse-error (type => 'string after PUBLIC');
1374 $self->{state} = 'bogus DOCTYPE';
1375 !!!next-input-character;
1376 redo A;
1377 }
1378 } elsif ($self->{state} eq 'DOCTYPE public identifier (double-quoted)') {
1379 if ($self->{next_input_character} == 0x0022) { # "
1380 $self->{state} = 'after DOCTYPE public identifier';
1381 !!!next-input-character;
1382 redo A;
1383 } elsif ($self->{next_input_character} == -1) {
1384 !!!parse-error (type => 'unclosed PUBLIC literal');
1385
1386 $self->{state} = 'data';
1387 ## reconsume
1388
1389 delete $self->{current_token}->{correct};
1390 !!!emit ($self->{current_token}); # DOCTYPE
1391
1392 redo A;
1393 } else {
1394 $self->{current_token}->{public_identifier} # DOCTYPE
1395 .= chr $self->{next_input_character};
1396 ## Stay in the state
1397 !!!next-input-character;
1398 redo A;
1399 }
1400 } elsif ($self->{state} eq 'DOCTYPE public identifier (single-quoted)') {
1401 if ($self->{next_input_character} == 0x0027) { # '
1402 $self->{state} = 'after DOCTYPE public identifier';
1403 !!!next-input-character;
1404 redo A;
1405 } elsif ($self->{next_input_character} == -1) {
1406 !!!parse-error (type => 'unclosed PUBLIC literal');
1407
1408 $self->{state} = 'data';
1409 ## reconsume
1410
1411 delete $self->{current_token}->{correct};
1412 !!!emit ($self->{current_token}); # DOCTYPE
1413
1414 redo A;
1415 } else {
1416 $self->{current_token}->{public_identifier} # DOCTYPE
1417 .= chr $self->{next_input_character};
1418 ## Stay in the state
1419 !!!next-input-character;
1420 redo A;
1421 }
1422 } elsif ($self->{state} eq 'after DOCTYPE public identifier') {
1423 if ({
1424 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1425 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1426 }->{$self->{next_input_character}}) {
1427 ## Stay in the state
1428 !!!next-input-character;
1429 redo A;
1430 } elsif ($self->{next_input_character} == 0x0022) { # "
1431 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1432 $self->{state} = 'DOCTYPE system identifier (double-quoted)';
1433 !!!next-input-character;
1434 redo A;
1435 } elsif ($self->{next_input_character} == 0x0027) { # '
1436 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1437 $self->{state} = 'DOCTYPE system identifier (single-quoted)';
1438 !!!next-input-character;
1439 redo A;
1440 } elsif ($self->{next_input_character} == 0x003E) { # >
1441 $self->{state} = 'data';
1442 !!!next-input-character;
1443
1444 !!!emit ($self->{current_token}); # DOCTYPE
1445
1446 redo A;
1447 } elsif ($self->{next_input_character} == -1) {
1448 !!!parse-error (type => 'unclosed DOCTYPE');
1449
1450 $self->{state} = 'data';
1451 ## reconsume
1452
1453 delete $self->{current_token}->{correct};
1454 !!!emit ($self->{current_token}); # DOCTYPE
1455
1456 redo A;
1457 } else {
1458 !!!parse-error (type => 'string after PUBLIC literal');
1459 $self->{state} = 'bogus DOCTYPE';
1460 !!!next-input-character;
1461 redo A;
1462 }
1463 } elsif ($self->{state} eq 'before DOCTYPE system identifier') {
1464 if ({
1465 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1466 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1467 }->{$self->{next_input_character}}) {
1468 ## Stay in the state
1469 !!!next-input-character;
1470 redo A;
1471 } elsif ($self->{next_input_character} == 0x0022) { # "
1472 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1473 $self->{state} = 'DOCTYPE system identifier (double-quoted)';
1474 !!!next-input-character;
1475 redo A;
1476 } elsif ($self->{next_input_character} == 0x0027) { # '
1477 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1478 $self->{state} = 'DOCTYPE system identifier (single-quoted)';
1479 !!!next-input-character;
1480 redo A;
1481 } elsif ($self->{next_input_character} == 0x003E) { # >
1482 !!!parse-error (type => 'no SYSTEM literal');
1483 $self->{state} = 'data';
1484 !!!next-input-character;
1485
1486 delete $self->{current_token}->{correct};
1487 !!!emit ($self->{current_token}); # DOCTYPE
1488
1489 redo A;
1490 } elsif ($self->{next_input_character} == -1) {
1491 !!!parse-error (type => 'unclosed DOCTYPE');
1492
1493 $self->{state} = 'data';
1494 ## reconsume
1495
1496 delete $self->{current_token}->{correct};
1497 !!!emit ($self->{current_token}); # DOCTYPE
1498
1499 redo A;
1500 } else {
1501 !!!parse-error (type => 'string after SYSTEM');
1502 $self->{state} = 'bogus DOCTYPE';
1503 !!!next-input-character;
1504 redo A;
1505 }
1506 } elsif ($self->{state} eq 'DOCTYPE system identifier (double-quoted)') {
1507 if ($self->{next_input_character} == 0x0022) { # "
1508 $self->{state} = 'after DOCTYPE system identifier';
1509 !!!next-input-character;
1510 redo A;
1511 } elsif ($self->{next_input_character} == -1) {
1512 !!!parse-error (type => 'unclosed SYSTEM literal');
1513
1514 $self->{state} = 'data';
1515 ## reconsume
1516
1517 delete $self->{current_token}->{correct};
1518 !!!emit ($self->{current_token}); # DOCTYPE
1519
1520 redo A;
1521 } else {
1522 $self->{current_token}->{system_identifier} # DOCTYPE
1523 .= chr $self->{next_input_character};
1524 ## Stay in the state
1525 !!!next-input-character;
1526 redo A;
1527 }
1528 } elsif ($self->{state} eq 'DOCTYPE system identifier (single-quoted)') {
1529 if ($self->{next_input_character} == 0x0027) { # '
1530 $self->{state} = 'after DOCTYPE system identifier';
1531 !!!next-input-character;
1532 redo A;
1533 } elsif ($self->{next_input_character} == -1) {
1534 !!!parse-error (type => 'unclosed SYSTEM literal');
1535
1536 $self->{state} = 'data';
1537 ## reconsume
1538
1539 delete $self->{current_token}->{correct};
1540 !!!emit ($self->{current_token}); # DOCTYPE
1541
1542 redo A;
1543 } else {
1544 $self->{current_token}->{system_identifier} # DOCTYPE
1545 .= chr $self->{next_input_character};
1546 ## Stay in the state
1547 !!!next-input-character;
1548 redo A;
1549 }
1550 } elsif ($self->{state} eq 'after DOCTYPE system identifier') {
1551 if ({
1552 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1553 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1554 }->{$self->{next_input_character}}) {
1555 ## Stay in the state
1556 !!!next-input-character;
1557 redo A;
1558 } elsif ($self->{next_input_character} == 0x003E) { # >
1559 $self->{state} = 'data';
1560 !!!next-input-character;
1561
1562 !!!emit ($self->{current_token}); # DOCTYPE
1563
1564 redo A;
1565 } elsif ($self->{next_input_character} == -1) {
1566 !!!parse-error (type => 'unclosed DOCTYPE');
1567
1568 $self->{state} = 'data';
1569 ## reconsume
1570
1571 delete $self->{current_token}->{correct};
1572 !!!emit ($self->{current_token}); # DOCTYPE
1573
1574 redo A;
1575 } else {
1576 !!!parse-error (type => 'string after SYSTEM literal');
1577 $self->{state} = 'bogus DOCTYPE';
1578 !!!next-input-character;
1579 redo A;
1580 }
1581 } elsif ($self->{state} eq 'bogus DOCTYPE') {
1582 if ($self->{next_input_character} == 0x003E) { # >
1583 $self->{state} = 'data';
1584 !!!next-input-character;
1585
1586 delete $self->{current_token}->{correct};
1587 !!!emit ($self->{current_token}); # DOCTYPE
1588
1589 redo A;
1590 } elsif ($self->{next_input_character} == -1) {
1591 !!!parse-error (type => 'unclosed DOCTYPE');
1592 $self->{state} = 'data';
1593 ## reconsume
1594
1595 delete $self->{current_token}->{correct};
1596 !!!emit ($self->{current_token}); # DOCTYPE
1597
1598 redo A;
1599 } else {
1600 ## Stay in the state
1601 !!!next-input-character;
1602 redo A;
1603 }
1604 } else {
1605 die "$0: $self->{state}: Unknown state";
1606 }
1607 } # A
1608
1609 die "$0: _get_next_token: unexpected case";
1610 } # _get_next_token
1611
## Attempt to consume an entity (character reference).
##
## The caller has already consumed the "&"; on entry
## |$self->{next_input_character}| is the character that follows it
## (-1 is the value the tokenizer uses for the end of the input).
##
## Arguments:
##   $self    - The parser object.
##   $in_attr - True if the reference occurs in an attribute value.
##              It only affects a named reference that is not closed
##              by ";" and is directly followed by more name
##              characters: in an attribute value the text is then
##              returned unchanged ("&" plus the consumed name).
##
## Returns a token |{type => 'character', data => $string}|, or |undef|
## if nothing has been consumed (the caller then has to emit the "&"
## itself).  Callers test the result with |defined|, so an explicit
## |undef| is returned.
sub _tokenize_attempt_to_consume_an_entity ($$) {
  my ($self, $in_attr) = @_;

  if ({
       0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
       0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
      }->{$self->{next_input_character}}) {
    ## Don't consume
    ## No error
    return undef;
  } elsif ($self->{next_input_character} == 0x0023) { # #
    !!!next-input-character;
    if ($self->{next_input_character} == 0x0078 or # x
        $self->{next_input_character} == 0x0058) { # X
      ## Hexadecimal character reference.  |$code| stays undefined
      ## until the first hexadecimal digit has been seen.
      my $code;
      X: {
        ## Only meaningful in the first iteration, where it holds the
        ## "x" or "X" that has to be pushed back if no digit follows.
        my $x_char = $self->{next_input_character};
        !!!next-input-character;
        if (0x0030 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x0039) { # 0..9
          $code ||= 0;
          $code *= 0x10;
          $code += $self->{next_input_character} - 0x0030;
          redo X;
        } elsif (0x0061 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x0066) { # a..f
          $code ||= 0;
          $code *= 0x10;
          $code += $self->{next_input_character} - 0x0060 + 9;
          redo X;
        } elsif (0x0041 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x0046) { # A..F
          $code ||= 0;
          $code *= 0x10;
          $code += $self->{next_input_character} - 0x0040 + 9;
          redo X;
        } elsif (not defined $code) { # no hexadecimal digit
          !!!parse-error (type => 'bare hcro');
          ## Unconsume "x" and the character after it; "#" becomes the
          ## next input character again.
          !!!back-next-input-character ($x_char, $self->{next_input_character});
          $self->{next_input_character} = 0x0023; # #
          return undef;
        } elsif ($self->{next_input_character} == 0x003B) { # ;
          !!!next-input-character;
        } else {
          !!!parse-error (type => 'no refc');
        }

        $code = $self->_tokenize_fix_character_reference_code ($code);

        return {type => 'character', data => chr $code};
      } # X
    } elsif (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
      ## Decimal character reference.
      my $code = $self->{next_input_character} - 0x0030;
      !!!next-input-character;

      while (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
        $code *= 10;
        $code += $self->{next_input_character} - 0x0030;

        !!!next-input-character;
      }

      if ($self->{next_input_character} == 0x003B) { # ;
        !!!next-input-character;
      } else {
        !!!parse-error (type => 'no refc');
      }

      $code = $self->_tokenize_fix_character_reference_code ($code);

      return {type => 'character', data => chr $code};
    } else {
      !!!parse-error (type => 'bare nero');
      ## Unconsume the character after "#"; "#" becomes the next input
      ## character again.
      !!!back-next-input-character ($self->{next_input_character});
      $self->{next_input_character} = 0x0023; # #
      return undef;
    }
  } elsif ((0x0041 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x005A) or
           (0x0061 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x007A)) {
    ## Named character reference.
    my $entity_name = chr $self->{next_input_character};
    !!!next-input-character;

    ## |$value| is the text to be returned for what has been consumed
    ## so far.  |$match| encodes the matching state:
    ##    0  - no entity name has matched yet;
    ##    1  - a name terminated by ";" has matched (final);
    ##   -1  - a name has matched without ";" and nothing follows it yet;
    ##  < -1 - a name has matched without ";" and more name characters
    ##         have been consumed after it.
    my $value = $entity_name;
    my $match = 0;
    require Whatpm::_NamedEntityList;
    our $EntityChar;

    while (length $entity_name < 10 and
           ## NOTE: Some number greater than the maximum length of entity name
           ((0x0041 <= $self->{next_input_character} and # A
             $self->{next_input_character} <= 0x005A) or # Z
            (0x0061 <= $self->{next_input_character} and # a
             $self->{next_input_character} <= 0x007A) or # z
            (0x0030 <= $self->{next_input_character} and # 0
             $self->{next_input_character} <= 0x0039) or # 9
            $self->{next_input_character} == 0x003B)) { # ;
      $entity_name .= chr $self->{next_input_character};
      if (defined $EntityChar->{$entity_name}) {
        if ($self->{next_input_character} == 0x003B) { # ;
          $value = $EntityChar->{$entity_name};
          $match = 1;
          !!!next-input-character;
          last;
        } else {
          $value = $EntityChar->{$entity_name};
          $match = -1;
          !!!next-input-character;
        }
      } else {
        $value .= chr $self->{next_input_character};
        $match *= 2; # 0 stays 0; -1 becomes -2, -2 becomes -4, ...
        !!!next-input-character;
      }
    }

    if ($match > 0) {
      return {type => 'character', data => $value};
    } elsif ($match < 0) {
      !!!parse-error (type => 'no refc');
      if ($in_attr and $match < -1) {
        ## In an attribute value, a name followed by further name
        ## characters is not expanded.
        return {type => 'character', data => '&'.$entity_name};
      } else {
        return {type => 'character', data => $value};
      }
    } else {
      !!!parse-error (type => 'bare ero');
      ## NOTE: No characters are consumed in the spec.
      return {type => 'character', data => '&'.$value};
    }
  } else {
    ## no characters are consumed
    !!!parse-error (type => 'bare ero');
    return undef;
  }
} # _tokenize_attempt_to_consume_an_entity

## Validate the code point of a numeric character reference.
##
## Reports a parse error and returns the replacement code point if
## |$code| is U+0000, a surrogate, greater than U+10FFFF, U+000D, or
## in the range U+0080..U+009F (replaced according to
## |$c1_entity_char|).  Otherwise |$code| is returned unchanged and no
## error is reported.
sub _tokenize_fix_character_reference_code ($$) {
  my ($self, $code) = @_;

  if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
    !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
    return 0xFFFD;
  } elsif ($code > 0x10FFFF) {
    !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
    return 0xFFFD;
  } elsif ($code == 0x000D) {
    !!!parse-error (type => 'CR character reference');
    return 0x000A;
  } elsif (0x80 <= $code and $code <= 0x9F) {
    !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
    return $c1_entity_char->{$code};
  }

  return $code;
} # _tokenize_fix_character_reference_code
1775
## Prepare |$self->{document}| for the tree construction stage.
##
## NOTE: $self->{document} MUST be specified before this method is called.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  ## Turn strict error checking off while the tree is being built.
  $document->strict_error_checking (0);

  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on

  ## Flag the document as an HTML document.
  $document->manakai_is_html (1); # MUST
} # _initialize_tree_constructor
1784
## Finish the tree construction stage: turn strict error checking of
## |$self->{document}| on again.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  $document->strict_error_checking (1);

  ## TODO: Turn mutation events on
} # _terminate_tree_constructor
1790
1791 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
1792
1793 { # tree construction stage
1794 my $token;
1795
## Run the tree construction stage: reset the tree construction state
## and then run the initial, root element, and main phases in order.
sub _construct_tree ($) {
  my $self = $_[0];

  ## When an interactive UA renders the $self->{document} available
  ## to the user, or when it begins accepting user input, is not
  ## defined.

  ## Append a character: collect it and all subsequent consecutive
  ## characters and insert one Text node whose data is the
  ## concatenation of all those characters. # MUST

  ## NOTE(review): presumably loads the first token into |$token|
  ## (the macro is expanded at build time) -- confirm.
  !!!next-token;

  ## Reset the tree construction state.
  $self->{insertion_mode} = 'before head';
  $self->{open_elements} = [];
  $self->{$_} = undef for qw(form_element head_element inner_html_node);

  $self->_tree_construction_initial; # MUST
  $self->_tree_construction_root_element;
  $self->_tree_construction_main;
} # _construct_tree
1819
## Tree construction: the initial phase.
##
## Works on the shared |$token|.  Leading white space characters and
## comments are consumed without leaving the phase (comments are appended
## to the document).  A DOCTYPE token is turned into a document type
## node, selects the compat mode of the document and is consumed.  Any
## other token raises a "no DOCTYPE" parse error, selects quirks mode and
## is left in |$token| so that the root element phase reprocesses it.
##
## Dies if |$token| has an unknown type.
sub _tree_construction_initial ($) {
  my $self = shift;
  INITIAL: {
    if ($token->{type} eq 'DOCTYPE') {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!parse-error (type => 'not HTML5');
      } elsif ($doctype_name ne 'HTML') {
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5');
      }

      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if (not $token->{correct} or $doctype_name ne 'HTML') {
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## The public identifier is upper-cased before it is compared, so
        ## every literal it is compared with below MUST be spelt in upper
        ## case as well, or the comparison can never succeed.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        if ({
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
          "-//IETF//DTD HTML 2.0//EN" => 1,
          "-//IETF//DTD HTML 2.1E//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN//" => 1,
          "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
          "-//IETF//DTD HTML 3.2//EN" => 1,
          "-//IETF//DTD HTML 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT//EN" => 1,
          "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
          "-//IETF//DTD HTML//EN" => 1,
          "-//IETF//DTD HTML//EN//2.0" => 1,
          "-//IETF//DTD HTML//EN//3.0" => 1,
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
          "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
          "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
          "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
          "-//W3C//DTD HTML 3.2//EN" => 1,
          "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
          "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
          ## The correctly spelt identifier.  The misspelt "EXPERIMETNAL"
          ## key that used to be the only one is kept right below it so
          ## that any input that matched before still matches.
          "-//W3C//DTD HTML EXPERIMENTAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
          "-//W3C//DTD W3 HTML//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
          "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
          "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
          "HTML" => 1,
        }->{$pubid}) {
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
          if (defined $token->{system_identifier}) {
            $self->{document}->manakai_compat_mode ('quirks');
          } else {
            $self->{document}->manakai_compat_mode ('limited quirks');
          }
        } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
          ## These literals were written in mixed case, which an
          ## upper-cased |$pubid| could never equal.
          $self->{document}->manakai_compat_mode ('limited quirks');
        }
      }
      if (defined $token->{system_identifier}) {
        ## The system identifier is lower-cased before the comparison.
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          $self->{document}->manakai_compat_mode ('quirks');
        }
      }

      ## Go to the root element phase.
      !!!next-token;
      return;
    } elsif ({
              'start tag' => 1,
              'end tag' => 1,
              'end-of-file' => 1,
             }->{$token->{type}}) {
      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} eq 'character') {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          ## Stay in the phase
          !!!next-token;
          redo INITIAL;
        }
      }

      ## What is left of the token starts with a non-space character.
      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} eq 'comment') {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the phase.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token";
    }
  } # INITIAL
} # _tree_construction_initial
1987
## Tree construction: the root element phase.
##
## Works on the shared |$token|.  DOCTYPE tokens are reported as parse
## errors and dropped, comments are appended to the document and leading
## white space is discarded.  Any other token makes the |html| root
## element be created, appended to the document and pushed onto the
## stack of open elements; that token stays in |$token| so that the main
## phase reprocesses it.
##
## Dies if |$token| has an unknown type.
sub _tree_construction_root_element ($) {
  my ($self) = @_;

  ## Token types that (besides non-space characters) end this phase.
  my %ends_phase = (
    'start tag' => 1,
    'end tag' => 1,
    'end-of-file' => 1,
  );

  ROOT: while (1) {
    my $type = $token->{type};

    if ($type eq 'DOCTYPE') {
      !!!parse-error (type => 'in html:#DOCTYPE');
      ## Ignore the token
      ## Stay in the phase
      !!!next-token;
      next ROOT;
    }

    if ($type eq 'comment') {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the phase
      !!!next-token;
      next ROOT;
    }

    if ($type eq 'character') {
      ## Leading white space is ignored; a token that consists of white
      ## space only is consumed and the phase continues.
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)// and # \x0D
          not length $token->{data}) {
        ## Stay in the phase
        !!!next-token;
        next ROOT;
      }
    } elsif (not $ends_phase{$type}) {
      die "$0: $token->{type}: Unknown token";
    }
    ## ISSUE: There is an issue in the spec (start tag, end tag and
    ## end-of-file tokens).

    my $root_element; !!!create-element ($root_element, 'html');
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, 'html'];
    ## reprocess
    return; ## Go to the main phase.
  } # ROOT
} # _tree_construction_root_element
2033
## Reset |$self->{insertion_mode}| appropriately.
##
## The stack of open elements (|$self->{open_elements}|, a list of
## |[node, local name]| pairs) is walked from its last entry towards its
## first one until an entry decides the mode.  When the first entry is
## reached and a fragment context (|$self->{inner_html_node}|) other
## than |td|/|th| is set, the context is examined instead of that entry.
sub _reset_insertion_mode ($) {
  my ($self) = @_;
  my $stack = $self->{open_elements};

  ## Step 4..13: element names that decide the mode on their own.
  my %mode_for = (
    select => 'in select',
    td => 'in cell',
    th => 'in cell',
    tr => 'in row',
    tbody => 'in table body',
    thead => 'in table body',
    tfoot => 'in table body',
    caption => 'in caption',
    colgroup => 'in column group',
    table => 'in table',
    head => 'in body', # not in head!
    body => 'in body',
    frameset => 'in frameset',
  );

  ## Step 1
  my $is_last = 0;

  ## Step 2, 16, 17: walk the stack from the current node upwards.
  for (my $depth = -1; ; $depth--) {
    my $node = $stack->[$depth];

    ## Step 3
    ## ISSUE: Oops! "If node is the first node in the stack of open
    ## elements, then set last to true. If the context element of the
    ## HTML fragment parsing algorithm is neither a td element nor a
    ## th element, then set node to the context element. (fragment case)":
    ## The second "if" is in the scope of the first "if"!?
    if ($stack->[0]->[0] eq $node->[0]) {
      $is_last = 1;
      my $context = $self->{inner_html_node};
      if (defined $context and
          not ($context->[1] eq 'td' or $context->[1] eq 'th')) {
        $node = $context;
      }
    }

    ## Step 4..13
    my $new_mode = $mode_for{$node->[1]};
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 14
    if ($node->[1] eq 'html') {
      $self->{insertion_mode}
          = defined $self->{head_element} ? 'after head' : 'before head';
      return;
    }

    ## Step 15
    if ($is_last) {
      $self->{insertion_mode} = 'in body';
      return;
    }
  }
} # _reset_insertion_mode
2102
2103 sub _tree_construction_main ($) {
2104 my $self = shift;
2105
2106 my $previous_insertion_mode;
2107
2108 my $active_formatting_elements = [];
2109
## Reconstruct the active formatting elements: starting after the last
## marker or the last entry that is still in the stack of open elements,
## every later entry of the list is shallow-cloned, the clone is inserted
## through the |$insert| callback, pushed onto the stack of open elements
## and stored in the list in place of the entry it was cloned from.
my $reconstruct_active_formatting_elements = sub { # MUST
  my ($insert) = @_;

  ## True if the given node is in the stack of open elements.
  my $is_open = sub {
    my ($wanted) = @_;
    for my $open (@{$self->{open_elements}}) {
      return 1 if $wanted eq $open->[0];
    }
    return 0;
  };

  ## Step 1
  return unless @$active_formatting_elements;

  ## Step 3
  my $pos = -1; # counted from the end of the list
  my $entry = $active_formatting_elements->[$pos];

  ## Step 2
  return if $entry->[0] eq '#marker';
  return if $is_open->($entry->[0]);

  REWIND: {
    ## Step 4
    last REWIND if $active_formatting_elements->[0]->[0] eq $entry->[0];

    ## Step 5
    $pos--;
    $entry = $active_formatting_elements->[$pos];

    ## Step 6: keep rewinding until a marker or an open element is found.
    redo REWIND
        unless $entry->[0] eq '#marker' or $is_open->($entry->[0]);

    ## Step 7
    $pos++;
    $entry = $active_formatting_elements->[$pos];
  } # REWIND

  CLONE: {
    ## Step 8
    my $clone = [$entry->[0]->clone_node (0), $entry->[1]];

    ## Step 9
    $insert->($clone->[0]);
    push @{$self->{open_elements}}, $clone;

    ## Step 10
    $active_formatting_elements->[$pos] = $self->{open_elements}->[-1];

    ## Step 11: done once the last entry of the list has been replaced.
    last CLONE if $clone->[0] eq $active_formatting_elements->[-1]->[0];

    ## Step 7'
    $pos++;
    $entry = $active_formatting_elements->[$pos];
    redo CLONE;
  } # CLONE
}; # $reconstruct_active_formatting_elements
2180
## Clear the list of active formatting elements up to the last marker:
## the last marker and every entry after it are removed.  The list is
## left untouched if it holds no marker.
my $clear_up_to_marker = sub {
  my $pos = $#$active_formatting_elements;
  while ($pos >= 0) {
    if ($active_formatting_elements->[$pos]->[0] eq '#marker') {
      splice @$active_formatting_elements, $pos;
      return;
    }
    $pos--;
  }
}; # $clear_up_to_marker
2189
## Generic CDATA/RCDATA element parsing.
##
## Arguments: the content model flag to tokenize the element content
## with (|CDATA_CONTENT_MODEL| or |RCDATA_CONTENT_MODEL|) and a callback
## that inserts the new element into the tree.  The element is created
## from the current start tag token, all following character tokens
## become its single Text child, and the token after them is consumed
## (with a parse error unless it is the matching end tag).
my $parse_rcdata = sub ($$) {
  my ($content_model_flag, $insert) = @_;

  ## Step 1
  my $start_tag_name = $token->{tag_name};
  my $el;
  !!!create-element ($el, $start_tag_name, $token->{attributes});

  ## Step 2
  $insert->($el); # /context node/->append_child ($el)

  ## Step 3
  $self->{content_model} = $content_model_flag; # CDATA or RCDATA
  delete $self->{escape}; # MUST

  ## Step 4: gather the data of the run of character tokens.
  my @chunks;
  !!!next-token;
  while ($token->{type} eq 'character') { # or until stop tokenizing
    push @chunks, $token->{data};
    !!!next-token;
  }
  my $content = join '', @chunks;

  ## Step 5
  if (length $content) {
    $el->append_child ($self->{document}->create_text_node ($content));
  }

  ## Step 6
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## Step 7: the matching end tag is ignored silently; any other token
  ## is a parse error.
  my $is_matching_end_tag = ($token->{type} eq 'end tag' and
                             $token->{tag_name} eq $start_tag_name);
  unless ($is_matching_end_tag) {
    if ($content_model_flag == CDATA_CONTENT_MODEL) {
      !!!parse-error (type => 'in CDATA:#'.$token->{type});
    } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
      !!!parse-error (type => 'in RCDATA:#'.$token->{type});
    } else {
      die "$0: $content_model_flag in parse_rcdata";
    }
  }
  !!!next-token;
}; # $parse_rcdata
2234
## Handle a |script| start tag token.
##
## The only argument is a callback that inserts the new element into the
## tree.  The element is created from the current token, the following
## character tokens (tokenized as CDATA) become its text, and the token
## after them is consumed.  The element is inserted only when the parser
## is not parsing a fragment (|$self->{inner_html_node}| not defined).
my $script_start_tag = sub ($) {
  my ($insert) = @_;

  my $script_el;
  !!!create-element ($script_el, 'script', $token->{attributes});
  ## TODO: mark as "parser-inserted"

  $self->{content_model} = CDATA_CONTENT_MODEL;
  delete $self->{escape}; # MUST

  ## Gather the data of the run of character tokens.
  my @chunks;
  !!!next-token;
  while ($token->{type} eq 'character') {
    push @chunks, $token->{data};
    !!!next-token;
  } # stop if non-character token or tokenizer stops tokenising
  my $script_text = join '', @chunks;
  $script_el->manakai_append_text ($script_text) if length $script_text;

  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## The |</script>| end tag is ignored silently; anything else is a
  ## parse error.
  unless ($token->{type} eq 'end tag' and
          $token->{tag_name} eq 'script') {
    !!!parse-error (type => 'in CDATA:#'.$token->{type});
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  }

  unless (defined $self->{inner_html_node}) {
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    $insert->($script_el);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  }
  ## TODO: mark as "already executed" (fragment case)

  !!!next-token;
}; # $script_start_tag
2280
## Handle an end tag token of a formatting element (the "adoption
## agency" steps).
##
## The only argument is the tag name of the end tag.  The steps are
## repeated (|redo FET|) until the closure returns; every return path
## consumes the current token first.
##
## Entries of |$active_formatting_elements| and |$self->{open_elements}|
## are |[node, local name]| pairs; a marker in the former is
## |['#marker', '']|.
my $formatting_end_tag = sub {
  my $tag_name = shift;

  FET: {
    ## Step 1: find the last entry named |$tag_name| after the last
    ## marker in the list of active formatting elements.
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!parse-error (type => 'unmatched end tag:'.$tag_name);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ({
                table => 1, caption => 1, td => 1, th => 1,
                button => 1, marquee => 1, object => 1, html => 1,
               }->{$node->[1]}) {
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
      ## Remove the formatting element itself from the list.  It is not
      ## necessarily the last entry, so a plain |pop| could remove an
      ## unrelated formatting element instead.
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
    }

    ## Step 2: the furthest block is the entry nearest to the formatting
    ## element, among those after it in the stack of open elements, that
    ## is special or scoping and not formatting.  The stack is scanned
    ## from its end, so the last assignment wins.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not $formatting_category->{$node->[1]} and
          #not $phrasing_category->{$node->[1]} and
          ($special_category->{$node->[1]} or
           $scoping_category->{$node->[1]})) {
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        last OE;
      }
    } # OE

    ## Step 3: no furthest block -- pop the stack up to and including
    ## the formatting element and drop it from the list.
    unless (defined $furthest_block) { # MUST
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6: the bookmark is remembered as the node of the entry that
    ## precedes the formatting element in the list.
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
            ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2: a node that is not in the list of active formatting
      ## elements is removed from the stack and skipped.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5: a node that has children is replaced by a shallow clone
      ## in both the list and the stack.
      if ($node->[0]->has_child_nodes ()) {
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    $common_ancestor_node->[0]->append_child ($last_node->[0]);

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10: move all children of the furthest block into the clone.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12: remove the formatting element from the list and insert
    ## the clone right after the bookmark.  |$i| tracks the index of the
    ## bookmark entry and is corrected when an entry before it is removed.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$active_formatting_elements, $_, 1;
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13: remove the formatting element from the stack of open
    ## elements and insert the clone right after the furthest block.
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        splice @{$self->{open_elements}}, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        $i = $_;
      }
    } # OE
    ## The clone is inserted (length 0), as in step 12.  A length of 1
    ## would replace, and so silently close, whatever element follows
    ## the furthest block in the stack.
    splice @{$self->{open_elements}}, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
2463
## Insertion callback: append the given node to the current node, i.e.
## the node of the last entry in the stack of open elements.
my $insert_to_current = sub {
  my ($child) = @_;
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($child);
}; # $insert_to_current
2467
## Insertion callback with foster parenting.
##
## Unless the current node is a |table|, |tbody|, |tfoot|, |thead| or
## |tr| element, the given node is simply appended to the current node.
## Otherwise it is inserted into the foster parent: just before the last
## |table| in the stack of open elements when that table has an element
## parent, else at the end of the entry preceding the table in the stack
## (or of the first entry when the stack holds no |table|).
my $insert_to_foster = sub {
  my ($child) = @_;
  my $stack = $self->{open_elements};

  my %needs_foster_parent = map { $_ => 1 } qw(table tbody tfoot thead tr);
  unless ($needs_foster_parent{$stack->[-1]->[1]}) {
    return $stack->[-1]->[0]->append_child ($child);
  }

  # MUST
  my ($foster_parent_element, $next_sibling);
  for my $entry_i (reverse 0..$#$stack) {
    my $entry = $stack->[$entry_i];
    next unless $entry->[1] eq 'table';

    my $table_parent = $entry->[0]->parent_node;
    if (defined $table_parent and $table_parent->node_type == 1) {
      ## The table has an element (node type 1) as its parent: insert
      ## just before the table.
      $foster_parent_element = $table_parent;
      $next_sibling = $entry->[0];
    } else {
      $foster_parent_element = $stack->[$entry_i - 1]->[0];
    }
    last;
  }
  $foster_parent_element = $stack->[0]->[0]
      unless defined $foster_parent_element;

  ## An undefined |$next_sibling| is passed on as it is.
  return $foster_parent_element->insert_before ($child, $next_sibling);
}; # $insert_to_foster
2498
2499 my $in_body = sub {
2500 my $insert = shift;
2501 if ($token->{type} eq 'start tag') {
2502 if ($token->{tag_name} eq 'script') {
2503 ## NOTE: This is an "as if in head" code clone
2504 $script_start_tag->($insert);
2505 return;
2506 } elsif ($token->{tag_name} eq 'style') {
2507 ## NOTE: This is an "as if in head" code clone
2508 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
2509 return;
2510 } elsif ({
2511 base => 1, link => 1,
2512 }->{$token->{tag_name}}) {
2513 ## NOTE: This is an "as if in head" code clone, only "-t" differs
2514 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2515 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2516 !!!next-token;
2517 return;
2518 } elsif ($token->{tag_name} eq 'meta') {
2519 ## NOTE: This is an "as if in head" code clone, only "-t" differs
2520 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2521 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2522
2523 unless ($self->{confident}) {
2524 my $charset;
2525 if ($token->{attributes}->{charset}) { ## TODO: And if supported
2526 $charset = $token->{attributes}->{charset}->{value};
2527 }
2528 if ($token->{attributes}->{'http-equiv'}) {
2529 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
2530 if ($token->{attributes}->{'http-equiv'}->{value}
2531 =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
2532 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
2533 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
2534 $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
2535 } ## TODO: And if supported
2536 }
2537 ## TODO: Change the encoding
2538 }
2539
2540 !!!next-token;
2541 return;
2542 } elsif ($token->{tag_name} eq 'title') {
2543 !!!parse-error (type => 'in body:title');
2544 ## NOTE: This is an "as if in head" code clone
2545 $parse_rcdata->(RCDATA_CONTENT_MODEL, sub {
2546 if (defined $self->{head_element}) {
2547 $self->{head_element}->append_child ($_[0]);
2548 } else {
2549 $insert->($_[0]);
2550 }
2551 });
2552 return;
2553 } elsif ($token->{tag_name} eq 'body') {
2554 !!!parse-error (type => 'in body:body');
2555
2556 if (@{$self->{open_elements}} == 1 or
2557 $self->{open_elements}->[1]->[1] ne 'body') {
2558 ## Ignore the token
2559 } else {
2560 my $body_el = $self->{open_elements}->[1]->[0];
2561 for my $attr_name (keys %{$token->{attributes}}) {
2562 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
2563 $body_el->set_attribute_ns
2564 (undef, [undef, $attr_name],
2565 $token->{attributes}->{$attr_name}->{value});
2566 }
2567 }
2568 }
2569 !!!next-token;
2570 return;
2571 } elsif ({
2572 address => 1, blockquote => 1, center => 1, dir => 1,
2573 div => 1, dl => 1, fieldset => 1, listing => 1,
2574 menu => 1, ol => 1, p => 1, ul => 1,
2575 pre => 1,
2576 }->{$token->{tag_name}}) {
2577 ## has a p element in scope
2578 INSCOPE: for (reverse @{$self->{open_elements}}) {
2579 if ($_->[1] eq 'p') {
2580 !!!back-token;
2581 $token = {type => 'end tag', tag_name => 'p'};
2582 return;
2583 } elsif ({
2584 table => 1, caption => 1, td => 1, th => 1,
2585 button => 1, marquee => 1, object => 1, html => 1,
2586 }->{$_->[1]}) {
2587 last INSCOPE;
2588 }
2589 } # INSCOPE
2590
2591 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2592 if ($token->{tag_name} eq 'pre') {
2593 !!!next-token;
2594 if ($token->{type} eq 'character') {
2595 $token->{data} =~ s/^\x0A//;
2596 unless (length $token->{data}) {
2597 !!!next-token;
2598 }
2599 }
2600 } else {
2601 !!!next-token;
2602 }
2603 return;
2604 } elsif ($token->{tag_name} eq 'form') {
2605 if (defined $self->{form_element}) {
2606 !!!parse-error (type => 'in form:form');
2607 ## Ignore the token
2608 !!!next-token;
2609 return;
2610 } else {
2611 ## has a p element in scope
2612 INSCOPE: for (reverse @{$self->{open_elements}}) {
2613 if ($_->[1] eq 'p') {
2614 !!!back-token;
2615 $token = {type => 'end tag', tag_name => 'p'};
2616 return;
2617 } elsif ({
2618 table => 1, caption => 1, td => 1, th => 1,
2619 button => 1, marquee => 1, object => 1, html => 1,
2620 }->{$_->[1]}) {
2621 last INSCOPE;
2622 }
2623 } # INSCOPE
2624
2625 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2626 $self->{form_element} = $self->{open_elements}->[-1]->[0];
2627 !!!next-token;
2628 return;
2629 }
2630 } elsif ($token->{tag_name} eq 'li') {
2631 ## has a p element in scope
2632 INSCOPE: for (reverse @{$self->{open_elements}}) {
2633 if ($_->[1] eq 'p') {
2634 !!!back-token;
2635 $token = {type => 'end tag', tag_name => 'p'};
2636 return;
2637 } elsif ({
2638 table => 1, caption => 1, td => 1, th => 1,
2639 button => 1, marquee => 1, object => 1, html => 1,
2640 }->{$_->[1]}) {
2641 last INSCOPE;
2642 }
2643 } # INSCOPE
2644
2645 ## Step 1
2646 my $i = -1;
2647 my $node = $self->{open_elements}->[$i];
2648 LI: {
2649 ## Step 2
2650 if ($node->[1] eq 'li') {
2651 if ($i != -1) {
2652 !!!parse-error (type => 'end tag missing:'.
2653 $self->{open_elements}->[-1]->[1]);
2654 }
2655 splice @{$self->{open_elements}}, $i;
2656 last LI;
2657 }
2658
2659 ## Step 3
2660 if (not $formatting_category->{$node->[1]} and
2661 #not $phrasing_category->{$node->[1]} and
2662 ($special_category->{$node->[1]} or
2663 $scoping_category->{$node->[1]}) and
2664 $node->[1] ne 'address' and $node->[1] ne 'div') {
2665 last LI;
2666 }
2667
2668 ## Step 4
2669 $i--;
2670 $node = $self->{open_elements}->[$i];
2671 redo LI;
2672 } # LI
2673
2674 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2675 !!!next-token;
2676 return;
2677 } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
2678 ## has a p element in scope
2679 INSCOPE: for (reverse @{$self->{open_elements}}) {
2680 if ($_->[1] eq 'p') {
2681 !!!back-token;
2682 $token = {type => 'end tag', tag_name => 'p'};
2683 return;
2684 } elsif ({
2685 table => 1, caption => 1, td => 1, th => 1,
2686 button => 1, marquee => 1, object => 1, html => 1,
2687 }->{$_->[1]}) {
2688 last INSCOPE;
2689 }
2690 } # INSCOPE
2691
2692 ## Step 1
2693 my $i = -1;
2694 my $node = $self->{open_elements}->[$i];
2695 LI: {
2696 ## Step 2
2697 if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
2698 if ($i != -1) {
2699 !!!parse-error (type => 'end tag missing:'.
2700 $self->{open_elements}->[-1]->[1]);
2701 }
2702 splice @{$self->{open_elements}}, $i;
2703 last LI;
2704 }
2705
2706 ## Step 3
2707 if (not $formatting_category->{$node->[1]} and
2708 #not $phrasing_category->{$node->[1]} and
2709 ($special_category->{$node->[1]} or
2710 $scoping_category->{$node->[1]}) and
2711 $node->[1] ne 'address' and $node->[1] ne 'div') {
2712 last LI;
2713 }
2714
2715 ## Step 4
2716 $i--;
2717 $node = $self->{open_elements}->[$i];
2718 redo LI;
2719 } # LI
2720
2721 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2722 !!!next-token;
2723 return;
2724 } elsif ($token->{tag_name} eq 'plaintext') {
2725 ## has a p element in scope
2726 INSCOPE: for (reverse @{$self->{open_elements}}) {
2727 if ($_->[1] eq 'p') {
2728 !!!back-token;
2729 $token = {type => 'end tag', tag_name => 'p'};
2730 return;
2731 } elsif ({
2732 table => 1, caption => 1, td => 1, th => 1,
2733 button => 1, marquee => 1, object => 1, html => 1,
2734 }->{$_->[1]}) {
2735 last INSCOPE;
2736 }
2737 } # INSCOPE
2738
2739 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2740
2741 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
2742
2743 !!!next-token;
2744 return;
2745 } elsif ({
2746 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
2747 }->{$token->{tag_name}}) {
2748 ## has a p element in scope
2749 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2750 my $node = $self->{open_elements}->[$_];
2751 if ($node->[1] eq 'p') {
2752 !!!back-token;
2753 $token = {type => 'end tag', tag_name => 'p'};
2754 return;
2755 } elsif ({
2756 table => 1, caption => 1, td => 1, th => 1,
2757 button => 1, marquee => 1, object => 1, html => 1,
2758 }->{$node->[1]}) {
2759 last INSCOPE;
2760 }
2761 } # INSCOPE
2762
2763 ## NOTE: See <http://html5.org/tools/web-apps-tracker?from=925&to=926>
2764 ## has an element in scope
2765 #my $i;
2766 #INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2767 # my $node = $self->{open_elements}->[$_];
2768 # if ({
2769 # h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
2770 # }->{$node->[1]}) {
2771 # $i = $_;
2772 # last INSCOPE;
2773 # } elsif ({
2774 # table => 1, caption => 1, td => 1, th => 1,
2775 # button => 1, marquee => 1, object => 1, html => 1,
2776 # }->{$node->[1]}) {
2777 # last INSCOPE;
2778 # }
2779 #} # INSCOPE
2780 #
2781 #if (defined $i) {
2782 # !!! parse-error (type => 'in hn:hn');
2783 # splice @{$self->{open_elements}}, $i;
2784 #}
2785
2786 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2787
2788 !!!next-token;
2789 return;
2790 } elsif ($token->{tag_name} eq 'a') {
2791 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
2792 my $node = $active_formatting_elements->[$i];
2793 if ($node->[1] eq 'a') {
2794 !!!parse-error (type => 'in a:a');
2795
2796 !!!back-token;
2797 $token = {type => 'end tag', tag_name => 'a'};
2798 $formatting_end_tag->($token->{tag_name});
2799
2800 AFE2: for (reverse 0..$#$active_formatting_elements) {
2801 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
2802 splice @$active_formatting_elements, $_, 1;
2803 last AFE2;
2804 }
2805 } # AFE2
2806 OE: for (reverse 0..$#{$self->{open_elements}}) {
2807 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
2808 splice @{$self->{open_elements}}, $_, 1;
2809 last OE;
2810 }
2811 } # OE
2812 last AFE;
2813 } elsif ($node->[0] eq '#marker') {
2814 last AFE;
2815 }
2816 } # AFE
2817
2818 $reconstruct_active_formatting_elements->($insert_to_current);
2819
2820 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2821 push @$active_formatting_elements, $self->{open_elements}->[-1];
2822
2823 !!!next-token;
2824 return;
2825 } elsif ({
2826 b => 1, big => 1, em => 1, font => 1, i => 1,
2827 s => 1, small => 1, strile => 1,
2828 strong => 1, tt => 1, u => 1,
2829 }->{$token->{tag_name}}) {
2830 $reconstruct_active_formatting_elements->($insert_to_current);
2831
2832 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2833 push @$active_formatting_elements, $self->{open_elements}->[-1];
2834
2835 !!!next-token;
2836 return;
2837 } elsif ($token->{tag_name} eq 'nobr') {
2838 $reconstruct_active_formatting_elements->($insert_to_current);
2839
2840 ## has a |nobr| element in scope
2841 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2842 my $node = $self->{open_elements}->[$_];
2843 if ($node->[1] eq 'nobr') {
2844 !!!parse-error (type => 'not closed:nobr');
2845 !!!back-token;
2846 $token = {type => 'end tag', tag_name => 'nobr'};
2847 return;
2848 } elsif ({
2849 table => 1, caption => 1, td => 1, th => 1,
2850 button => 1, marquee => 1, object => 1, html => 1,
2851 }->{$node->[1]}) {
2852 last INSCOPE;
2853 }
2854 } # INSCOPE
2855
2856 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2857 push @$active_formatting_elements, $self->{open_elements}->[-1];
2858
2859 !!!next-token;
2860 return;
2861 } elsif ($token->{tag_name} eq 'button') {
2862 ## has a button element in scope
2863 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2864 my $node = $self->{open_elements}->[$_];
2865 if ($node->[1] eq 'button') {
2866 !!!parse-error (type => 'in button:button');
2867 !!!back-token;
2868 $token = {type => 'end tag', tag_name => 'button'};
2869 return;
2870 } elsif ({
2871 table => 1, caption => 1, td => 1, th => 1,
2872 button => 1, marquee => 1, object => 1, html => 1,
2873 }->{$node->[1]}) {
2874 last INSCOPE;
2875 }
2876 } # INSCOPE
2877
2878 $reconstruct_active_formatting_elements->($insert_to_current);
2879
2880 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2881 push @$active_formatting_elements, ['#marker', ''];
2882
2883 !!!next-token;
2884 return;
2885 } elsif ($token->{tag_name} eq 'marquee' or
2886 $token->{tag_name} eq 'object') {
2887 $reconstruct_active_formatting_elements->($insert_to_current);
2888
2889 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2890 push @$active_formatting_elements, ['#marker', ''];
2891
2892 !!!next-token;
2893 return;
2894 } elsif ($token->{tag_name} eq 'xmp') {
2895 $reconstruct_active_formatting_elements->($insert_to_current);
2896 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
2897 return;
2898 } elsif ($token->{tag_name} eq 'table') {
2899 ## has a p element in scope
2900 INSCOPE: for (reverse @{$self->{open_elements}}) {
2901 if ($_->[1] eq 'p') {
2902 !!!back-token;
2903 $token = {type => 'end tag', tag_name => 'p'};
2904 return;
2905 } elsif ({
2906 table => 1, caption => 1, td => 1, th => 1,
2907 button => 1, marquee => 1, object => 1, html => 1,
2908 }->{$_->[1]}) {
2909 last INSCOPE;
2910 }
2911 } # INSCOPE
2912
2913 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2914
2915 $self->{insertion_mode} = 'in table';
2916
2917 !!!next-token;
2918 return;
2919 } elsif ({
2920 area => 1, basefont => 1, bgsound => 1, br => 1,
2921 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
2922 image => 1,
2923 }->{$token->{tag_name}}) {
2924 if ($token->{tag_name} eq 'image') {
2925 !!!parse-error (type => 'image');
2926 $token->{tag_name} = 'img';
2927 }
2928
2929 ## NOTE: There is an "as if <br>" code clone.
2930 $reconstruct_active_formatting_elements->($insert_to_current);
2931
2932 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2933 pop @{$self->{open_elements}};
2934
2935 !!!next-token;
2936 return;
2937 } elsif ($token->{tag_name} eq 'hr') {
2938 ## has a p element in scope
2939 INSCOPE: for (reverse @{$self->{open_elements}}) {
2940 if ($_->[1] eq 'p') {
2941 !!!back-token;
2942 $token = {type => 'end tag', tag_name => 'p'};
2943 return;
2944 } elsif ({
2945 table => 1, caption => 1, td => 1, th => 1,
2946 button => 1, marquee => 1, object => 1, html => 1,
2947 }->{$_->[1]}) {
2948 last INSCOPE;
2949 }
2950 } # INSCOPE
2951
2952 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2953 pop @{$self->{open_elements}};
2954
2955 !!!next-token;
2956 return;
2957 } elsif ($token->{tag_name} eq 'input') {
2958 $reconstruct_active_formatting_elements->($insert_to_current);
2959
2960 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2961 ## TODO: associate with $self->{form_element} if defined
2962 pop @{$self->{open_elements}};
2963
2964 !!!next-token;
2965 return;
2966 } elsif ($token->{tag_name} eq 'isindex') {
2967 !!!parse-error (type => 'isindex');
2968
2969 if (defined $self->{form_element}) {
2970 ## Ignore the token
2971 !!!next-token;
2972 return;
2973 } else {
2974 my $at = $token->{attributes};
2975 my $form_attrs;
2976 $form_attrs->{action} = $at->{action} if $at->{action};
2977 my $prompt_attr = $at->{prompt};
2978 $at->{name} = {name => 'name', value => 'isindex'};
2979 delete $at->{action};
2980 delete $at->{prompt};
2981 my @tokens = (
2982 {type => 'start tag', tag_name => 'form',
2983 attributes => $form_attrs},
2984 {type => 'start tag', tag_name => 'hr'},
2985 {type => 'start tag', tag_name => 'p'},
2986 {type => 'start tag', tag_name => 'label'},
2987 );
2988 if ($prompt_attr) {
2989 push @tokens, {type => 'character', data => $prompt_attr->{value}};
2990 } else {
2991 push @tokens, {type => 'character',
2992 data => 'This is a searchable index. Insert your search keywords here: '}; # SHOULD
2993 ## TODO: make this configurable
2994 }
2995 push @tokens,
2996 {type => 'start tag', tag_name => 'input', attributes => $at},
2997 #{type => 'character', data => ''}, # SHOULD
2998 {type => 'end tag', tag_name => 'label'},
2999 {type => 'end tag', tag_name => 'p'},
3000 {type => 'start tag', tag_name => 'hr'},
3001 {type => 'end tag', tag_name => 'form'};
3002 $token = shift @tokens;
3003 !!!back-token (@tokens);
3004 return;
3005 }
3006 } elsif ($token->{tag_name} eq 'textarea') {
3007 my $tag_name = $token->{tag_name};
3008 my $el;
3009 !!!create-element ($el, $token->{tag_name}, $token->{attributes});
3010
3011 ## TODO: $self->{form_element} if defined
3012 $self->{content_model} = RCDATA_CONTENT_MODEL;
3013 delete $self->{escape}; # MUST
3014
3015 $insert->($el);
3016
3017 my $text = '';
3018 !!!next-token;
3019 if ($token->{type} eq 'character') {
3020 $token->{data} =~ s/^\x0A//;
3021 unless (length $token->{data}) {
3022 !!!next-token;
3023 }
3024 }
3025 while ($token->{type} eq 'character') {
3026 $text .= $token->{data};
3027 !!!next-token;
3028 }
3029 if (length $text) {
3030 $el->manakai_append_text ($text);
3031 }
3032
3033 $self->{content_model} = PCDATA_CONTENT_MODEL;
3034
3035 if ($token->{type} eq 'end tag' and
3036 $token->{tag_name} eq $tag_name) {
3037 ## Ignore the token
3038 } else {
3039 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
3040 }
3041 !!!next-token;
3042 return;
3043 } elsif ({
3044 iframe => 1,
3045 noembed => 1,
3046 noframes => 1,
3047 noscript => 0, ## TODO: 1 if scripting is enabled
3048 }->{$token->{tag_name}}) {
3049 ## NOTE: There are two "as if in body" code clones.
3050 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
3051 return;
3052 } elsif ($token->{tag_name} eq 'select') {
3053 $reconstruct_active_formatting_elements->($insert_to_current);
3054
3055 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
3056
3057 $self->{insertion_mode} = 'in select';
3058 !!!next-token;
3059 return;
3060 } elsif ({
3061 caption => 1, col => 1, colgroup => 1, frame => 1,
3062 frameset => 1, head => 1, option => 1, optgroup => 1,
3063 tbody => 1, td => 1, tfoot => 1, th => 1,
3064 thead => 1, tr => 1,
3065 }->{$token->{tag_name}}) {
3066 !!!parse-error (type => 'in body:'.$token->{tag_name});
3067 ## Ignore the token
3068 !!!next-token;
3069 return;
3070
3071 ## ISSUE: An issue on HTML5 new elements in the spec.
3072 } else {
3073 $reconstruct_active_formatting_elements->($insert_to_current);
3074
3075 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
3076
3077 !!!next-token;
3078 return;
3079 }
3080 } elsif ($token->{type} eq 'end tag') {
3081 if ($token->{tag_name} eq 'body') {
3082 if (@{$self->{open_elements}} > 1 and
3083 $self->{open_elements}->[1]->[1] eq 'body') {
3084 for (@{$self->{open_elements}}) {
3085 unless ({
3086 dd => 1, dt => 1, li => 1, p => 1, td => 1,
3087 th => 1, tr => 1, body => 1, html => 1,
3088 tbody => 1, tfoot => 1, thead => 1,
3089 }->{$_->[1]}) {
3090 !!!parse-error (type => 'not closed:'.$_->[1]);
3091 }
3092 }
3093
3094 $self->{insertion_mode} = 'after body';
3095 !!!next-token;
3096 return;
3097 } else {
3098 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3099 ## Ignore the token
3100 !!!next-token;
3101 return;
3102 }
3103 } elsif ($token->{tag_name} eq 'html') {
3104 if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
3105 ## ISSUE: There is an issue in the spec.
3106 if ($self->{open_elements}->[-1]->[1] ne 'body') {
3107 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[1]->[1]);
3108 }
3109 $self->{insertion_mode} = 'after body';
3110 ## reprocess
3111 return;
3112 } else {
3113 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3114 ## Ignore the token
3115 !!!next-token;
3116 return;
3117 }
3118 } elsif ({
3119 address => 1, blockquote => 1, center => 1, dir => 1,
3120 div => 1, dl => 1, fieldset => 1, listing => 1,
3121 menu => 1, ol => 1, pre => 1, ul => 1,
3122 p => 1,
3123 dd => 1, dt => 1, li => 1,
3124 button => 1, marquee => 1, object => 1,
3125 }->{$token->{tag_name}}) {
3126 ## has an element in scope
3127 my $i;
3128 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3129 my $node = $self->{open_elements}->[$_];
3130 if ($node->[1] eq $token->{tag_name}) {
3131 ## generate implied end tags
3132 if ({
3133 dd => ($token->{tag_name} ne 'dd'),
3134 dt => ($token->{tag_name} ne 'dt'),
3135 li => ($token->{tag_name} ne 'li'),
3136 p => ($token->{tag_name} ne 'p'),
3137 td => 1, th => 1, tr => 1,
3138 tbody => 1, tfoot=> 1, thead => 1,
3139 }->{$self->{open_elements}->[-1]->[1]}) {
3140 !!!back-token;
3141 $token = {type => 'end tag',
3142 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3143 return;
3144 }
3145 $i = $_;
3146 last INSCOPE unless $token->{tag_name} eq 'p';
3147 } elsif ({
3148 table => 1, caption => 1, td => 1, th => 1,
3149 button => 1, marquee => 1, object => 1, html => 1,
3150 }->{$node->[1]}) {
3151 last INSCOPE;
3152 }
3153 } # INSCOPE
3154
3155 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3156 if (defined $i) {
3157 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3158 } else {
3159 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3160 }
3161 }
3162
3163 if (defined $i) {
3164 splice @{$self->{open_elements}}, $i;
3165 } elsif ($token->{tag_name} eq 'p') {
3166 ## As if <p>, then reprocess the current token
3167 my $el;
3168 !!!create-element ($el, 'p');
3169 $insert->($el);
3170 }
3171 $clear_up_to_marker->()
3172 if {
3173 button => 1, marquee => 1, object => 1,
3174 }->{$token->{tag_name}};
3175 !!!next-token;
3176 return;
3177 } elsif ($token->{tag_name} eq 'form') {
3178 ## has an element in scope
3179 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3180 my $node = $self->{open_elements}->[$_];
3181 if ($node->[1] eq $token->{tag_name}) {
3182 ## generate implied end tags
3183 if ({
3184 dd => 1, dt => 1, li => 1, p => 1,
3185 td => 1, th => 1, tr => 1,
3186 tbody => 1, tfoot=> 1, thead => 1,
3187 }->{$self->{open_elements}->[-1]->[1]}) {
3188 !!!back-token;
3189 $token = {type => 'end tag',
3190 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3191 return;
3192 }
3193 last INSCOPE;
3194 } elsif ({
3195 table => 1, caption => 1, td => 1, th => 1,
3196 button => 1, marquee => 1, object => 1, html => 1,
3197 }->{$node->[1]}) {
3198 last INSCOPE;
3199 }
3200 } # INSCOPE
3201
3202 if ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
3203 pop @{$self->{open_elements}};
3204 } else {
3205 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3206 }
3207
3208 undef $self->{form_element};
3209 !!!next-token;
3210 return;
3211 } elsif ({
3212 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
3213 }->{$token->{tag_name}}) {
3214 ## has an element in scope
3215 my $i;
3216 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3217 my $node = $self->{open_elements}->[$_];
3218 if ({
3219 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
3220 }->{$node->[1]}) {
3221 ## generate implied end tags
3222 if ({
3223 dd => 1, dt => 1, li => 1, p => 1,
3224 td => 1, th => 1, tr => 1,
3225 tbody => 1, tfoot=> 1, thead => 1,
3226 }->{$self->{open_elements}->[-1]->[1]}) {
3227 !!!back-token;
3228 $token = {type => 'end tag',
3229 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3230 return;
3231 }
3232 $i = $_;
3233 last INSCOPE;
3234 } elsif ({
3235 table => 1, caption => 1, td => 1, th => 1,
3236 button => 1, marquee => 1, object => 1, html => 1,
3237 }->{$node->[1]}) {
3238 last INSCOPE;
3239 }
3240 } # INSCOPE
3241
3242 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3243 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3244 }
3245
3246 splice @{$self->{open_elements}}, $i if defined $i;
3247 !!!next-token;
3248 return;
3249 } elsif ({
3250 a => 1,
3251 b => 1, big => 1, em => 1, font => 1, i => 1,
3252 nobr => 1, s => 1, small => 1, strile => 1,
3253 strong => 1, tt => 1, u => 1,
3254 }->{$token->{tag_name}}) {
3255 $formatting_end_tag->($token->{tag_name});
3256 return;
3257 } elsif ($token->{tag_name} eq 'br') {
3258 !!!parse-error (type => 'unmatched end tag:br');
3259
3260 ## As if <br>
3261 $reconstruct_active_formatting_elements->($insert_to_current);
3262
3263 my $el;
3264 !!!create-element ($el, 'br');
3265 $insert->($el);
3266
3267 ## Ignore the token.
3268 !!!next-token;
3269 return;
3270 } elsif ({
3271 caption => 1, col => 1, colgroup => 1, frame => 1,
3272 frameset => 1, head => 1, option => 1, optgroup => 1,
3273 tbody => 1, td => 1, tfoot => 1, th => 1,
3274 thead => 1, tr => 1,
3275 area => 1, basefont => 1, bgsound => 1,
3276 embed => 1, hr => 1, iframe => 1, image => 1,
3277 img => 1, input => 1, isindex => 1, noembed => 1,
3278 noframes => 1, param => 1, select => 1, spacer => 1,
3279 table => 1, textarea => 1, wbr => 1,
3280 noscript => 0, ## TODO: if scripting is enabled
3281 }->{$token->{tag_name}}) {
3282 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3283 ## Ignore the token
3284 !!!next-token;
3285 return;
3286
3287 ## ISSUE: Issue on HTML5 new elements in spec
3288
3289 } else {
3290 ## Step 1
3291 my $node_i = -1;
3292 my $node = $self->{open_elements}->[$node_i];
3293
3294 ## Step 2
3295 S2: {
3296 if ($node->[1] eq $token->{tag_name}) {
3297 ## Step 1
3298 ## generate implied end tags
3299 if ({
3300 dd => 1, dt => 1, li => 1, p => 1,
3301 td => 1, th => 1, tr => 1,
3302 tbody => 1, tfoot=> 1, thead => 1,
3303 }->{$self->{open_elements}->[-1]->[1]}) {
3304 !!!back-token;
3305 $token = {type => 'end tag',
3306 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3307 return;
3308 }
3309
3310 ## Step 2
3311 if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
3312 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3313 }
3314
3315 ## Step 3
3316 splice @{$self->{open_elements}}, $node_i;
3317
3318 !!!next-token;
3319 last S2;
3320 } else {
3321 ## Step 3
3322 if (not $formatting_category->{$node->[1]} and
3323 #not $phrasing_category->{$node->[1]} and
3324 ($special_category->{$node->[1]} or
3325 $scoping_category->{$node->[1]})) {
3326 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3327 ## Ignore the token
3328 !!!next-token;
3329 last S2;
3330 }
3331 }
3332
3333 ## Step 4
3334 $node_i--;
3335 $node = $self->{open_elements}->[$node_i];
3336
3337 ## Step 5;
3338 redo S2;
3339 } # S2
3340 return;
3341 }
3342 }
3343 }; # $in_body
3344
3345 B: {
3346 if ($token->{type} eq 'DOCTYPE') {
3347 !!!parse-error (type => 'DOCTYPE in the middle');
3348 ## Ignore the token
3349 ## Stay in the phase
3350 !!!next-token;
3351 redo B;
3352 } elsif ($token->{type} eq 'end-of-file') {
3353 if ($token->{insertion_mode} ne 'trailing end') {
3354 ## Generate implied end tags
3355 if ({
3356 dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
3357 tbody => 1, tfoot=> 1, thead => 1,
3358 }->{$self->{open_elements}->[-1]->[1]}) {
3359 !!!back-token;
3360 $token = {type => 'end tag', tag_name => $self->{open_elements}->[-1]->[1]};
3361 redo B;
3362 }
3363
3364 if (@{$self->{open_elements}} > 2 or
3365 (@{$self->{open_elements}} == 2 and $self->{open_elements}->[1]->[1] ne 'body')) {
3366 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3367 } elsif (defined $self->{inner_html_node} and
3368 @{$self->{open_elements}} > 1 and
3369 $self->{open_elements}->[1]->[1] ne 'body') {
3370 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3371 }
3372
3373 ## ISSUE: There is an issue in the spec.
3374 }
3375
3376 ## Stop parsing
3377 last B;
3378 } elsif ($token->{type} eq 'start tag' and
3379 $token->{tag_name} eq 'html') {
3380 if ($self->{insertion_mode} eq 'trailing end') {
3381 ## Turn into the main phase
3382 !!!parse-error (type => 'after html:html');
3383 $self->{insertion_mode} = $previous_insertion_mode;
3384 }
3385
3386 ## ISSUE: "aa<html>" is not a parse error.
3387 ## ISSUE: "<html>" in fragment is not a parse error.
3388 unless ($token->{first_start_tag}) {
3389 !!!parse-error (type => 'not first start tag');
3390 }
3391 my $top_el = $self->{open_elements}->[0]->[0];
3392 for my $attr_name (keys %{$token->{attributes}}) {
3393 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
3394 $top_el->set_attribute_ns
3395 (undef, [undef, $attr_name],
3396 $token->{attributes}->{$attr_name}->{value});
3397 }
3398 }
3399 !!!next-token;
3400 redo B;
3401 } elsif ($token->{type} eq 'comment') {
3402 my $comment = $self->{document}->create_comment ($token->{data});
3403 if ($self->{insertion_mode} eq 'trailing end') {
3404 $self->{document}->append_child ($comment);
3405 } elsif ($self->{insertion_mode} eq 'after body') {
3406 $self->{open_elements}->[0]->[0]->append_child ($comment);
3407 } else {
3408 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3409 }
3410 !!!next-token;
3411 redo B;
3412 } elsif ($self->{insertion_mode} eq 'before head') {
3413 if ($token->{type} eq 'character') {
3414 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3415 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3416 unless (length $token->{data}) {
3417 !!!next-token;
3418 redo B;
3419 }
3420 }
3421 ## As if <head>
3422 !!!create-element ($self->{head_element}, 'head');
3423 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3424 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3425 $self->{insertion_mode} = 'in head';
3426 ## reprocess
3427 redo B;
3428 } elsif ($token->{type} eq 'start tag') {
3429 my $attr = $token->{tag_name} eq 'head' ? $token->{attributes} : {};
3430 !!!create-element ($self->{head_element}, 'head', $attr);
3431 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3432 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3433 $self->{insertion_mode} = 'in head';
3434 if ($token->{tag_name} eq 'head') {
3435 !!!next-token;
3436 #} elsif ({
3437 # base => 1, link => 1, meta => 1,
3438 # script => 1, style => 1, title => 1,
3439 # }->{$token->{tag_name}}) {
3440 # ## reprocess
3441 } else {
3442 ## reprocess
3443 }
3444 redo B;
3445 } elsif ($token->{type} eq 'end tag') {
3446 if ({
3447 head => 1, body => 1, html => 1,
3448 p => 1, br => 1,
3449 }->{$token->{tag_name}}) {
3450 ## As if <head>
3451 !!!create-element ($self->{head_element}, 'head');
3452 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3453 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3454 $self->{insertion_mode} = 'in head';
3455 ## reprocess
3456 redo B;
3457 } else {
3458 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3459 ## Ignore the token ## ISSUE: An issue in the spec.
3460 !!!next-token;
3461 redo B;
3462 }
3463 } else {
3464 die "$0: $token->{type}: Unknown type";
3465 }
3466 } elsif ($self->{insertion_mode} eq 'in head' or
3467 $self->{insertion_mode} eq 'in head noscript' or
3468 $self->{insertion_mode} eq 'after head') {
3469 if ($token->{type} eq 'character') {
3470 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3471 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3472 unless (length $token->{data}) {
3473 !!!next-token;
3474 redo B;
3475 }
3476 }
3477
3478 #
3479 } elsif ($token->{type} eq 'start tag') {
3480 if ({base => ($self->{insertion_mode} eq 'in head' or
3481 $self->{insertion_mode} eq 'after head'),
3482 link => 1}->{$token->{tag_name}}) {
3483 ## NOTE: There is a "as if in head" code clone.
3484 if ($self->{insertion_mode} eq 'after head') {
3485 !!!parse-error (type => 'after head:'.$token->{tag_name});
3486 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3487 }
3488 !!!insert-element ($token->{tag_name}, $token->{attributes});
3489 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3490 pop @{$self->{open_elements}}
3491 if $self->{insertion_mode} eq 'after head';
3492 !!!next-token;
3493 redo B;
3494 } elsif ($token->{tag_name} eq 'meta') {
3495 ## NOTE: There is a "as if in head" code clone.
3496 if ($self->{insertion_mode} eq 'after head') {
3497 !!!parse-error (type => 'after head:'.$token->{tag_name});
3498 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3499 }
3500 !!!insert-element ($token->{tag_name}, $token->{attributes});
3501 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3502
3503 unless ($self->{confident}) {
3504 my $charset;
3505 if ($token->{attributes}->{charset}) { ## TODO: And if supported
3506 $charset = $token->{attributes}->{charset}->{value};
3507 }
3508 if ($token->{attributes}->{'http-equiv'}) {
3509 ## ISSUE: The algorithm name in the spec was incorrect, so it is not linked to its definition.
3510 if ($token->{attributes}->{'http-equiv'}->{value}
3511 =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
3512 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
3513 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
3514 $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
3515 } ## TODO: And if supported
3516 }
3517 ## TODO: Change the encoding
3518 }
3519
3520 ## TODO: Extracting |charset| from |meta|.
3521 pop @{$self->{open_elements}}
3522 if $self->{insertion_mode} eq 'after head';
3523 !!!next-token;
3524 redo B;
3525 } elsif ($token->{tag_name} eq 'title' and
3526 $self->{insertion_mode} eq 'in head') {
3527 ## NOTE: There is a "as if in head" code clone.
3528 if ($self->{insertion_mode} eq 'after head') {
3529 !!!parse-error (type => 'after head:'.$token->{tag_name});
3530 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3531 }
3532 my $parent = defined $self->{head_element} ? $self->{head_element}
3533 : $self->{open_elements}->[-1]->[0];
3534 $parse_rcdata->(RCDATA_CONTENT_MODEL,
3535 sub { $parent->append_child ($_[0]) });
3536 pop @{$self->{open_elements}}
3537 if $self->{insertion_mode} eq 'after head';
3538 redo B;
3539 } elsif ($token->{tag_name} eq 'style') {
3540 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
3541 ## insertion mode 'in head')
3542 ## NOTE: There is a "as if in head" code clone.
3543 if ($self->{insertion_mode} eq 'after head') {
3544 !!!parse-error (type => 'after head:'.$token->{tag_name});
3545 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3546 }
3547 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
3548 pop @{$self->{open_elements}}
3549 if $self->{insertion_mode} eq 'after head';
3550 redo B;
3551 } elsif ($token->{tag_name} eq 'noscript') {
3552 if ($self->{insertion_mode} eq 'in head') {
3553 ## NOTE: and scripting is disabled
3554 !!!insert-element ($token->{tag_name}, $token->{attributes});
3555 $self->{insertion_mode} = 'in head noscript';
3556 !!!next-token;
3557 redo B;
3558 } elsif ($self->{insertion_mode} eq 'in head noscript') {
3559 !!!parse-error (type => 'in noscript:noscript');
3560 ## Ignore the token
3561 !!!next-token;
3562 redo B;
3563 } else {
3564 #
3565 }
3566 } elsif ($token->{tag_name} eq 'head' and
3567 $self->{insertion_mode} ne 'after head') {
3568 !!!parse-error (type => 'in head:head'); # or in head noscript
3569 ## Ignore the token
3570 !!!next-token;
3571 redo B;
3572 } elsif ($self->{insertion_mode} ne 'in head noscript' and
3573 $token->{tag_name} eq 'script') {
3574 if ($self->{insertion_mode} eq 'after head') {
3575 !!!parse-error (type => 'after head:'.$token->{tag_name});
3576 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3577 }
3578 ## NOTE: There is a "as if in head" code clone.
3579 $script_start_tag->($insert_to_current);
3580 pop @{$self->{open_elements}}
3581 if $self->{insertion_mode} eq 'after head';
3582 redo B;
3583 } elsif ($self->{insertion_mode} eq 'after head' and
3584 $token->{tag_name} eq 'body') {
3585 !!!insert-element ('body', $token->{attributes});
3586 $self->{insertion_mode} = 'in body';
3587 !!!next-token;
3588 redo B;
3589 } elsif ($self->{insertion_mode} eq 'after head' and
3590 $token->{tag_name} eq 'frameset') {
3591 !!!insert-element ('frameset', $token->{attributes});
3592 $self->{insertion_mode} = 'in frameset';
3593 !!!next-token;
3594 redo B;
3595 } else {
3596 #
3597 }
3598 } elsif ($token->{type} eq 'end tag') {
3599 if ($self->{insertion_mode} eq 'in head' and
3600 $token->{tag_name} eq 'head') {
3601 pop @{$self->{open_elements}};
3602 $self->{insertion_mode} = 'after head';
3603 !!!next-token;
3604 redo B;
3605 } elsif ($self->{insertion_mode} eq 'in head noscript' and
3606 $token->{tag_name} eq 'noscript') {
3607 pop @{$self->{open_elements}};
3608 $self->{insertion_mode} = 'in head';
3609 !!!next-token;
3610 redo B;
3611 } elsif ($self->{insertion_mode} eq 'in head' and
3612 {
3613 body => 1, html => 1,
3614 p => 1, br => 1,
3615 }->{$token->{tag_name}}) {
3616 #
3617 } elsif ($self->{insertion_mode} eq 'in head noscript' and
3618 {
3619 p => 1, br => 1,
3620 }->{$token->{tag_name}}) {
3621 #
3622 } elsif ($self->{insertion_mode} ne 'after head') {
3623 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3624 ## Ignore the token
3625 !!!next-token;
3626 redo B;
3627 } else {
3628 #
3629 }
3630 } else {
3631 #
3632 }
3633
3634 ## As if </head> or </noscript> or <body>
3635 if ($self->{insertion_mode} eq 'in head') {
3636 pop @{$self->{open_elements}};
3637 $self->{insertion_mode} = 'after head';
3638 } elsif ($self->{insertion_mode} eq 'in head noscript') {
3639 pop @{$self->{open_elements}};
3640 !!!parse-error (type => 'in noscript:'.(defined $token->{tag_name} ? ($token->{type} eq 'end tag' ? '/' : '') . $token->{tag_name} : '#' . $token->{type}));
3641 $self->{insertion_mode} = 'in head';
3642 } else { # 'after head'
3643 !!!insert-element ('body');
3644 $self->{insertion_mode} = 'in body';
3645 }
3646 ## reprocess
3647 redo B;
3648
3649 ## ISSUE: An issue in the spec.
3650 } elsif ($self->{insertion_mode} eq 'in body' or
3651 $self->{insertion_mode} eq 'in cell' or
3652 $self->{insertion_mode} eq 'in caption') {
3653 if ($token->{type} eq 'character') {
3654 ## NOTE: There is a code clone of "character in body".
3655 $reconstruct_active_formatting_elements->($insert_to_current);
3656
3657 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3658
3659 !!!next-token;
3660 redo B;
3661 } elsif ($token->{type} eq 'start tag') {
3662 if ({
3663 caption => 1, col => 1, colgroup => 1, tbody => 1,
3664 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3665 }->{$token->{tag_name}}) {
3666 if ($self->{insertion_mode} eq 'in cell') {
3667 ## have an element in table scope
3668 my $tn;
3669 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3670 my $node = $self->{open_elements}->[$_];
3671 if ($node->[1] eq 'td' or $node->[1] eq 'th') {
3672 $tn = $node->[1];
3673 last INSCOPE;
3674 } elsif ({
3675 table => 1, html => 1,
3676 }->{$node->[1]}) {
3677 last INSCOPE;
3678 }
3679 } # INSCOPE
3680 unless (defined $tn) {
3681 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3682 ## Ignore the token
3683 !!!next-token;
3684 redo B;
3685 }
3686
3687 ## Close the cell
3688 !!!back-token; # <?>
3689 $token = {type => 'end tag', tag_name => $tn};
3690 redo B;
3691 } elsif ($self->{insertion_mode} eq 'in caption') {
3692 !!!parse-error (type => 'not closed:caption');
3693
3694 ## As if </caption>
3695 ## have a |caption| element in table scope
3696 my $i;
3697 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3698 my $node = $self->{open_elements}->[$_];
3699 if ($node->[1] eq 'caption') {
3700 $i = $_;
3701 last INSCOPE;
3702 } elsif ({
3703 table => 1, html => 1,
3704 }->{$node->[1]}) {
3705 last INSCOPE;
3706 }
3707 } # INSCOPE
3708 unless (defined $i) {
3709 !!!parse-error (type => 'unmatched end tag:caption');
3710 ## Ignore the token
3711 !!!next-token;
3712 redo B;
3713 }
3714
3715 ## generate implied end tags
3716 if ({
3717 dd => 1, dt => 1, li => 1, p => 1,
3718 td => 1, th => 1, tr => 1,
3719 tbody => 1, tfoot=> 1, thead => 1,
3720 }->{$self->{open_elements}->[-1]->[1]}) {
3721 !!!back-token; # <?>
3722 $token = {type => 'end tag', tag_name => 'caption'};
3723 !!!back-token;
3724 $token = {type => 'end tag',
3725 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3726 redo B;
3727 }
3728
3729 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3730 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3731 }
3732
3733 splice @{$self->{open_elements}}, $i;
3734
3735 $clear_up_to_marker->();
3736
3737 $self->{insertion_mode} = 'in table';
3738
3739 ## reprocess
3740 redo B;
3741 } else {
3742 #
3743 }
3744 } else {
3745 #
3746 }
3747 } elsif ($token->{type} eq 'end tag') {
3748 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
3749 if ($self->{insertion_mode} eq 'in cell') {
3750 ## have an element in table scope
3751 my $i;
3752 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3753 my $node = $self->{open_elements}->[$_];
3754 if ($node->[1] eq $token->{tag_name}) {
3755 $i = $_;
3756 last INSCOPE;
3757 } elsif ({
3758 table => 1, html => 1,
3759 }->{$node->[1]}) {
3760 last INSCOPE;
3761 }
3762 } # INSCOPE
3763 unless (defined $i) {
3764 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3765 ## Ignore the token
3766 !!!next-token;
3767 redo B;
3768 }
3769
3770 ## generate implied end tags
3771 if ({
3772 dd => 1, dt => 1, li => 1, p => 1,
3773 td => ($token->{tag_name} eq 'th'),
3774 th => ($token->{tag_name} eq 'td'),
3775 tr => 1,
3776 tbody => 1, tfoot=> 1, thead => 1,
3777 }->{$self->{open_elements}->[-1]->[1]}) {
3778 !!!back-token;
3779 $token = {type => 'end tag',
3780 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3781 redo B;
3782 }
3783
3784 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3785 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3786 }
3787
3788 splice @{$self->{open_elements}}, $i;
3789
3790 $clear_up_to_marker->();
3791
3792 $self->{insertion_mode} = 'in row';
3793
3794 !!!next-token;
3795 redo B;
3796 } elsif ($self->{insertion_mode} eq 'in caption') {
3797 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3798 ## Ignore the token
3799 !!!next-token;
3800 redo B;
3801 } else {
3802 #
3803 }
3804 } elsif ($token->{tag_name} eq 'caption') {
3805 if ($self->{insertion_mode} eq 'in caption') {
3806 ## have a |caption| element in table scope
3807 my $i;
3808 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3809 my $node = $self->{open_elements}->[$_];
3810 if ($node->[1] eq $token->{tag_name}) {
3811 $i = $_;
3812 last INSCOPE;
3813 } elsif ({
3814 table => 1, html => 1,
3815 }->{$node->[1]}) {
3816 last INSCOPE;
3817 }
3818 } # INSCOPE
3819 unless (defined $i) {
3820 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3821 ## Ignore the token
3822 !!!next-token;
3823 redo B;
3824 }
3825
3826 ## generate implied end tags
3827 if ({
3828 dd => 1, dt => 1, li => 1, p => 1,
3829 td => 1, th => 1, tr => 1,
3830 tbody => 1, tfoot=> 1, thead => 1,
3831 }->{$self->{open_elements}->[-1]->[1]}) {
3832 !!!back-token;
3833 $token = {type => 'end tag',
3834 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3835 redo B;
3836 }
3837
3838 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3839 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3840 }
3841
3842 splice @{$self->{open_elements}}, $i;
3843
3844 $clear_up_to_marker->();
3845
3846 $self->{insertion_mode} = 'in table';
3847
3848 !!!next-token;
3849 redo B;
3850 } elsif ($self->{insertion_mode} eq 'in cell') {
3851 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3852 ## Ignore the token
3853 !!!next-token;
3854 redo B;
3855 } else {
3856 #
3857 }
3858 } elsif ({
3859 table => 1, tbody => 1, tfoot => 1,
3860 thead => 1, tr => 1,
3861 }->{$token->{tag_name}} and
3862 $self->{insertion_mode} eq 'in cell') {
3863 ## have an element in table scope
3864 my $i;
3865 my $tn;
3866 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3867 my $node = $self->{open_elements}->[$_];
3868 if ($node->[1] eq $token->{tag_name}) {
3869 $i = $_;
3870 last INSCOPE;
3871 } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
3872 $tn = $node->[1];
3873 ## NOTE: There is exactly one |td| or |th| element
3874 ## in scope in the stack of open elements by definition.
3875 } elsif ({
3876 table => 1, html => 1,
3877 }->{$node->[1]}) {
3878 last INSCOPE;
3879 }
3880 } # INSCOPE
3881 unless (defined $i) {
3882 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3883 ## Ignore the token
3884 !!!next-token;
3885 redo B;
3886 }
3887
3888 ## Close the cell
3889 !!!back-token; # </?>
3890 $token = {type => 'end tag', tag_name => $tn};
3891 redo B;
3892 } elsif ($token->{tag_name} eq 'table' and
3893 $self->{insertion_mode} eq 'in caption') {
3894 !!!parse-error (type => 'not closed:caption');
3895
3896 ## As if </caption>
3897 ## have a table element in table scope
3898 my $i;
3899 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3900 my $node = $self->{open_elements}->[$_];
3901 if ($node->[1] eq 'caption') {
3902 $i = $_;
3903 last INSCOPE;
3904 } elsif ({
3905 table => 1, html => 1,
3906 }->{$node->[1]}) {
3907 last INSCOPE;
3908 }
3909 } # INSCOPE
3910 unless (defined $i) {
3911 !!!parse-error (type => 'unmatched end tag:caption');
3912 ## Ignore the token
3913 !!!next-token;
3914 redo B;
3915 }
3916
3917 ## generate implied end tags
3918 if ({
3919 dd => 1, dt => 1, li => 1, p => 1,
3920 td => 1, th => 1, tr => 1,
3921 tbody => 1, tfoot=> 1, thead => 1,
3922 }->{$self->{open_elements}->[-1]->[1]}) {
3923 !!!back-token; # </table>
3924 $token = {type => 'end tag', tag_name => 'caption'};
3925 !!!back-token;
3926 $token = {type => 'end tag',
3927 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3928 redo B;
3929 }
3930
3931 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3932 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3933 }
3934
3935 splice @{$self->{open_elements}}, $i;
3936
3937 $clear_up_to_marker->();
3938
3939 $self->{insertion_mode} = 'in table';
3940
3941 ## reprocess
3942 redo B;
3943 } elsif ({
3944 body => 1, col => 1, colgroup => 1, html => 1,
3945 }->{$token->{tag_name}}) {
3946 if ($self->{insertion_mode} eq 'in cell' or
3947 $self->{insertion_mode} eq 'in caption') {
3948 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3949 ## Ignore the token
3950 !!!next-token;
3951 redo B;
3952 } else {
3953 #
3954 }
3955 } elsif ({
3956 tbody => 1, tfoot => 1,
3957 thead => 1, tr => 1,
3958 }->{$token->{tag_name}} and
3959 $self->{insertion_mode} eq 'in caption') {
3960 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3961 ## Ignore the token
3962 !!!next-token;
3963 redo B;
3964 } else {
3965 #
3966 }
3967 } else {
3968 #
3969 }
3970
3971 $in_body->($insert_to_current);
3972 redo B;
3973 } elsif ($self->{insertion_mode} eq 'in table') {
3974 if ($token->{type} eq 'character') {
3975 ## NOTE: There are "character in table" code clones.
3976 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3977 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3978
3979 unless (length $token->{data}) {
3980 !!!next-token;
3981 redo B;
3982 }
3983 }
3984
3985 !!!parse-error (type => 'in table:#character');
3986
3987 ## As if in body, but insert into foster parent element
3988 ## ISSUE: Spec says that "whenever a node would be inserted
3989 ## into the current node" while characters might not be
3990 ## result in a new Text node.
3991 $reconstruct_active_formatting_elements->($insert_to_foster);
3992
3993 if ({ ## is the current node a table-structure element?
3994 table => 1, tbody => 1, tfoot => 1,
3995 thead => 1, tr => 1,
3996 }->{$self->{open_elements}->[-1]->[1]}) {
3997 # MUST: the text is not appended to the current node; it goes to a foster parent chosen below
3998 my $foster_parent_element; ## node that will receive the text
3999 my $next_sibling; ## reference node for insert_before; stays undef unless the table has an element parent
4000 my $prev_sibling; ## node just before the insertion point, if any
4001 OE: for (reverse 0..$#{$self->{open_elements}}) { ## scan the stack from the current node downwards
4002 if ($self->{open_elements}->[$_]->[1] eq 'table') { ## first |table| entry met by the scan
4003 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4004 if (defined $parent and $parent->node_type == 1) { ## 1 is presumably ELEMENT_NODE as in DOM -- confirm
4005 $foster_parent_element = $parent; ## insert into the table's parent ...
4006 $next_sibling = $self->{open_elements}->[$_]->[0]; ## ... immediately before the table itself
4007 $prev_sibling = $next_sibling->previous_sibling;
4008 } else {
4009 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0]; ## otherwise use the stack entry just below the table
4010 $prev_sibling = $foster_parent_element->last_child;
4011 }
4012 last OE;
4013 }
4014 } # OE
4015 $foster_parent_element = $self->{open_elements}->[0]->[0] and ## no |table| on the stack: fall back to the first stack entry
4016 $prev_sibling = $foster_parent_element->last_child
4017 unless defined $foster_parent_element;
4018 if (defined $prev_sibling and
4019 $prev_sibling->node_type == 3) { ## 3 is presumably TEXT_NODE as in DOM -- confirm
4020 $prev_sibling->manakai_append_text ($token->{data}); ## extend the adjacent text node instead of creating a new one
4021 } else {
4022 $foster_parent_element->insert_before
4023 ($self->{document}->create_text_node ($token->{data}),
4024 $next_sibling); ## NOTE(review): $next_sibling may be undef here; presumably that appends -- confirm
4025 }
4026 } else {
4027 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data}); ## current node is not table-structural: append to it directly
4028 }
4029
4030 !!!next-token;
4031 redo B;
4032 } elsif ($token->{type} eq 'start tag') {
4033 if ({
4034 caption => 1,
4035 colgroup => 1,
4036 tbody => 1, tfoot => 1, thead => 1,
4037 }->{$token->{tag_name}}) {
4038 ## Clear back to table context
4039 while ($self->{open_elements}->[-1]->[1] ne 'table' and
4040 $self->{open_elements}->[-1]->[1] ne 'html') {
4041 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4042 pop @{$self->{open_elements}};
4043 }
4044
4045 push @$active_formatting_elements, ['#marker', '']
4046 if $token->{tag_name} eq 'caption';
4047
4048 !!!insert-element ($token->{tag_name}, $token->{attributes});
4049 $self->{insertion_mode} = {
4050 caption => 'in caption',
4051 colgroup => 'in column group',
4052 tbody => 'in table body',
4053 tfoot => 'in table body',
4054 thead => 'in table body',
4055 }->{$token->{tag_name}};
4056 !!!next-token;
4057 redo B;
4058 } elsif ({
4059 col => 1,
4060 td => 1, th => 1, tr => 1,
4061 }->{$token->{tag_name}}) {
4062 ## Clear back to table context
4063 while ($self->{open_elements}->[-1]->[1] ne 'table' and
4064 $self->{open_elements}->[-1]->[1] ne 'html') {
4065 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4066 pop @{$self->{open_elements}};
4067 }
4068
4069 !!!insert-element ($token->{tag_name} eq 'col' ? 'colgroup' : 'tbody');
4070 $self->{insertion_mode} = $token->{tag_name} eq 'col'
4071 ? 'in column group' : 'in table body';
4072 ## reprocess
4073 redo B;
4074 } elsif ($token->{tag_name} eq 'table') {
4075 ## NOTE: There are code clones for this "table in table"
4076 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4077
4078 ## As if </table>
4079 ## have a table element in table scope
4080 my $i;
4081 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4082 my $node = $self->{open_elements}->[$_];
4083 if ($node->[1] eq 'table') {
4084 $i = $_;
4085 last INSCOPE;
4086 } elsif ({
4087 table => 1, html => 1,
4088 }->{$node->[1]}) {
4089 last INSCOPE;
4090 }
4091 } # INSCOPE
4092 unless (defined $i) {
4093 !!!parse-error (type => 'unmatched end tag:table');
4094 ## Ignore tokens </table><table>
4095 !!!next-token;
4096 redo B;
4097 }
4098
4099 ## generate implied end tags
4100 if ({
4101 dd => 1, dt => 1, li => 1, p => 1,
4102 td => 1, th => 1, tr => 1,
4103 tbody => 1, tfoot=> 1, thead => 1,
4104 }->{$self->{open_elements}->[-1]->[1]}) {
4105 !!!back-token; # <table>
4106 $token = {type => 'end tag', tag_name => 'table'};
4107 !!!back-token;
4108 $token = {type => 'end tag',
4109 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4110 redo B;
4111 }
4112
4113 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4114 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4115 }
4116
4117 splice @{$self->{open_elements}}, $i;
4118
4119 $self->_reset_insertion_mode;
4120
4121 ## reprocess
4122 redo B;
4123 } else {
4124 #
4125 }
4126 } elsif ($token->{type} eq 'end tag') {
4127 if ($token->{tag_name} eq 'table') {
4128 ## have a table element in table scope
4129 my $i;
4130 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4131 my $node = $self->{open_elements}->[$_];
4132 if ($node->[1] eq $token->{tag_name}) {
4133 $i = $_;
4134 last INSCOPE;
4135 } elsif ({
4136 table => 1, html => 1,
4137 }->{$node->[1]}) {
4138 last INSCOPE;
4139 }
4140 } # INSCOPE
4141 unless (defined $i) {
4142 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4143 ## Ignore the token
4144 !!!next-token;
4145 redo B;
4146 }
4147
4148 ## generate implied end tags
4149 if ({
4150 dd => 1, dt => 1, li => 1, p => 1,
4151 td => 1, th => 1, tr => 1,
4152 tbody => 1, tfoot=> 1, thead => 1,
4153 }->{$self->{open_elements}->[-1]->[1]}) {
4154 !!!back-token;
4155 $token = {type => 'end tag',
4156 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4157 redo B;
4158 }
4159
4160 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4161 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4162 }
4163
4164 splice @{$self->{open_elements}}, $i;
4165
4166 $self->_reset_insertion_mode;
4167
4168 !!!next-token;
4169 redo B;
4170 } elsif ({
4171 body => 1, caption => 1, col => 1, colgroup => 1,
4172 html => 1, tbody => 1, td => 1, tfoot => 1, th => 1,
4173 thead => 1, tr => 1,
4174 }->{$token->{tag_name}}) {
4175 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4176 ## Ignore the token
4177 !!!next-token;
4178 redo B;
4179 } else {
4180 #
4181 }
4182 } else {
4183 #
4184 }
4185
4186 !!!parse-error (type => 'in table:'.$token->{tag_name});
4187 $in_body->($insert_to_foster);
4188 redo B;
4189 } elsif ($self->{insertion_mode} eq 'in column group') {
4190 if ($token->{type} eq 'character') {
4191 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4192 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4193 unless (length $token->{data}) {
4194 !!!next-token;
4195 redo B;
4196 }
4197 }
4198
4199 #
4200 } elsif ($token->{type} eq 'start tag') {
4201 if ($token->{tag_name} eq 'col') {
4202 !!!insert-element ($token->{tag_name}, $token->{attributes});
4203 pop @{$self->{open_elements}};
4204 !!!next-token;
4205 redo B;
4206 } else {
4207 #
4208 }
4209 } elsif ($token->{type} eq 'end tag') {
4210 if ($token->{tag_name} eq 'colgroup') {
4211 if ($self->{open_elements}->[-1]->[1] eq 'html') {
4212 !!!parse-error (type => 'unmatched end tag:colgroup');
4213 ## Ignore the token
4214 !!!next-token;
4215 redo B;
4216 } else {
4217 pop @{$self->{open_elements}}; # colgroup
4218 $self->{insertion_mode} = 'in table';
4219 !!!next-token;
4220 redo B;
4221 }
4222 } elsif ($token->{tag_name} eq 'col') {
4223 !!!parse-error (type => 'unmatched end tag:col');
4224 ## Ignore the token
4225 !!!next-token;
4226 redo B;
4227 } else {
4228 #
4229 }
4230 } else {
4231 #
4232 }
4233
4234 ## As if </colgroup>
4235 if ($self->{open_elements}->[-1]->[1] eq 'html') {
4236 !!!parse-error (type => 'unmatched end tag:colgroup');
4237 ## Ignore the token
4238 !!!next-token;
4239 redo B;
4240 } else {
4241 pop @{$self->{open_elements}}; # colgroup
4242 $self->{insertion_mode} = 'in table';
4243 ## reprocess
4244 redo B;
4245 }
4246 } elsif ($self->{insertion_mode} eq 'in table body') {
4247 if ($token->{type} eq 'character') {
4248 ## NOTE: This is a "character in table" code clone.
4249 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4250 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4251
4252 unless (length $token->{data}) {
4253 !!!next-token;
4254 redo B;
4255 }
4256 }
4257
4258 !!!parse-error (type => 'in table:#character');
4259
4260 ## As if in body, but insert into foster parent element
4261 ## ISSUE: Spec says that "whenever a node would be inserted
4262 ## into the current node" while characters might not be
4263 ## result in a new Text node.
4264 $reconstruct_active_formatting_elements->($insert_to_foster);
4265
4266 if ({
4267 table => 1, tbody => 1, tfoot => 1,
4268 thead => 1, tr => 1,
4269 }->{$self->{open_elements}->[-1]->[1]}) {
4270 # MUST
4271 my $foster_parent_element;
4272 my $next_sibling;
4273 my $prev_sibling;
4274 OE: for (reverse 0..$#{$self->{open_elements}}) {
4275 if ($self->{open_elements}->[$_]->[1] eq 'table') {
4276 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4277 if (defined $parent and $parent->node_type == 1) {
4278 $foster_parent_element = $parent;
4279 $next_sibling = $self->{open_elements}->[$_]->[0];
4280 $prev_sibling = $next_sibling->previous_sibling;
4281 } else {
4282 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
4283 $prev_sibling = $foster_parent_element->last_child;
4284 }
4285 last OE;
4286 }
4287 } # OE
4288 $foster_parent_element = $self->{open_elements}->[0]->[0] and
4289 $prev_sibling = $foster_parent_element->last_child
4290 unless defined $foster_parent_element;
4291 if (defined $prev_sibling and
4292 $prev_sibling->node_type == 3) {
4293 $prev_sibling->manakai_append_text ($token->{data});
4294 } else {
4295 $foster_parent_element->insert_before
4296 ($self->{document}->create_text_node ($token->{data}),
4297 $next_sibling);
4298 }
4299 } else {
4300 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4301 }
4302
4303 !!!next-token;
4304 redo B;
4305 } elsif ($token->{type} eq 'start tag') {
4306 if ({
4307 tr => 1,
4308 th => 1, td => 1,
4309 }->{$token->{tag_name}}) {
4310 unless ($token->{tag_name} eq 'tr') {
4311 !!!parse-error (type => 'missing start tag:tr');
4312 }
4313
4314 ## Clear back to table body context
4315 while (not {
4316 tbody => 1, tfoot => 1, thead => 1, html => 1,
4317 }->{$self->{open_elements}->[-1]->[1]}) {
4318 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4319 pop @{$self->{open_elements}};
4320 }
4321
4322 $self->{insertion_mode} = 'in row';
4323 if ($token->{tag_name} eq 'tr') {
4324 !!!insert-element ($token->{tag_name}, $token->{attributes});
4325 !!!next-token;
4326 } else {
4327 !!!insert-element ('tr');
4328 ## reprocess
4329 }
4330 redo B;
4331 } elsif ({
4332 caption => 1, col => 1, colgroup => 1,
4333 tbody => 1, tfoot => 1, thead => 1,
4334 }->{$token->{tag_name}}) {
4335 ## have an element in table scope
4336 my $i;
4337 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4338 my $node = $self->{open_elements}->[$_];
4339 if ({
4340 tbody => 1, thead => 1, tfoot => 1,
4341 }->{$node->[1]}) {
4342 $i = $_;
4343 last INSCOPE;
4344 } elsif ({
4345 table => 1, html => 1,
4346 }->{$node->[1]}) {
4347 last INSCOPE;
4348 }
4349 } # INSCOPE
4350 unless (defined $i) {
4351 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4352 ## Ignore the token
4353 !!!next-token;
4354 redo B;
4355 }
4356
4357 ## Clear back to table body context
4358 while (not {
4359 tbody => 1, tfoot => 1, thead => 1, html => 1,
4360 }->{$self->{open_elements}->[-1]->[1]}) {
4361 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4362 pop @{$self->{open_elements}};
4363 }
4364
4365 ## As if <{current node}>
4366 ## have an element in table scope
4367 ## true by definition
4368
4369 ## Clear back to table body context
4370 ## nop by definition
4371
4372 pop @{$self->{open_elements}};
4373 $self->{insertion_mode} = 'in table';
4374 ## reprocess
4375 redo B;
4376 } elsif ($token->{tag_name} eq 'table') {
4377 ## NOTE: This is a code clone of "table in table"
4378 !!!parse-error (type => 'not closed:table');
4379
4380 ## As if </table>
4381 ## have a table element in table scope
4382 my $i;
4383 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4384 my $node = $self->{open_elements}->[$_];
4385 if ($node->[1] eq 'table') {
4386 $i = $_;
4387 last INSCOPE;
4388 } elsif ({
4389 table => 1, html => 1,
4390 }->{$node->[1]}) {
4391 last INSCOPE;
4392 }
4393 } # INSCOPE
4394 unless (defined $i) {
4395 !!!parse-error (type => 'unmatched end tag:table');
4396 ## Ignore tokens </table><table>
4397 !!!next-token;
4398 redo B;
4399 }
4400
4401 ## generate implied end tags
4402 if ({
4403 dd => 1, dt => 1, li => 1, p => 1,
4404 td => 1, th => 1, tr => 1,
4405 tbody => 1, tfoot=> 1, thead => 1,
4406 }->{$self->{open_elements}->[-1]->[1]}) {
4407 !!!back-token; # <table>
4408 $token = {type => 'end tag', tag_name => 'table'};
4409 !!!back-token;
4410 $token = {type => 'end tag',
4411 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4412 redo B;
4413 }
4414
4415 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4416 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4417 }
4418
4419 splice @{$self->{open_elements}}, $i;
4420
4421 $self->_reset_insertion_mode;
4422
4423 ## reprocess
4424 redo B;
4425 } else {
4426 #
4427 }
4428 } elsif ($token->{type} eq 'end tag') {
4429 if ({
4430 tbody => 1, tfoot => 1, thead => 1,
4431 }->{$token->{tag_name}}) {
4432 ## have an element in table scope
4433 my $i;
4434 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4435 my $node = $self->{open_elements}->[$_];
4436 if ($node->[1] eq $token->{tag_name}) {
4437 $i = $_;
4438 last INSCOPE;
4439 } elsif ({
4440 table => 1, html => 1,
4441 }->{$node->[1]}) {
4442 last INSCOPE;
4443 }
4444 } # INSCOPE
4445 unless (defined $i) {
4446 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4447 ## Ignore the token
4448 !!!next-token;
4449 redo B;
4450 }
4451
4452 ## Clear back to table body context
4453 while (not {
4454 tbody => 1, tfoot => 1, thead => 1, html => 1,
4455 }->{$self->{open_elements}->[-1]->[1]}) {
4456 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4457 pop @{$self->{open_elements}};
4458 }
4459
4460 pop @{$self->{open_elements}};
4461 $self->{insertion_mode} = 'in table';
4462 !!!next-token;
4463 redo B;
4464 } elsif ($token->{tag_name} eq 'table') {
4465 ## have an element in table scope
4466 my $i;
4467 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4468 my $node = $self->{open_elements}->[$_];
4469 if ({
4470 tbody => 1, thead => 1, tfoot => 1,
4471 }->{$node->[1]}) {
4472 $i = $_;
4473 last INSCOPE;
4474 } elsif ({
4475 table => 1, html => 1,
4476 }->{$node->[1]}) {
4477 last INSCOPE;
4478 }
4479 } # INSCOPE
4480 unless (defined $i) {
4481 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4482 ## Ignore the token
4483 !!!next-token;
4484 redo B;
4485 }
4486
4487 ## Clear back to table body context
4488 while (not {
4489 tbody => 1, tfoot => 1, thead => 1, html => 1,
4490 }->{$self->{open_elements}->[-1]->[1]}) {
4491 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4492 pop @{$self->{open_elements}};
4493 }
4494
4495 ## As if <{current node}>
4496 ## have an element in table scope
4497 ## true by definition
4498
4499 ## Clear back to table body context
4500 ## nop by definition
4501
4502 pop @{$self->{open_elements}};
4503 $self->{insertion_mode} = 'in table';
4504 ## reprocess
4505 redo B;
4506 } elsif ({
4507 body => 1, caption => 1, col => 1, colgroup => 1,
4508 html => 1, td => 1, th => 1, tr => 1,
4509 }->{$token->{tag_name}}) {
4510 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4511 ## Ignore the token
4512 !!!next-token;
4513 redo B;
4514 } else {
4515 #
4516 }
4517 } else {
4518 #
4519 }
4520
4521 ## As if in table
4522 !!!parse-error (type => 'in table:'.$token->{tag_name});
4523 $in_body->($insert_to_foster);
4524 redo B;
4525 } elsif ($self->{insertion_mode} eq 'in row') {
4526 if ($token->{type} eq 'character') {
4527 ## NOTE: This is a "character in table" code clone.
4528 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4529 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4530
4531 unless (length $token->{data}) {
4532 !!!next-token;
4533 redo B;
4534 }
4535 }
4536
4537 !!!parse-error (type => 'in table:#character');
4538
4539 ## As if in body, but insert into foster parent element
4540 ## ISSUE: Spec says that "whenever a node would be inserted
4541 ## into the current node" while characters might not be
4542 ## result in a new Text node.
4543 $reconstruct_active_formatting_elements->($insert_to_foster);
4544
4545 if ({
4546 table => 1, tbody => 1, tfoot => 1,
4547 thead => 1, tr => 1,
4548 }->{$self->{open_elements}->[-1]->[1]}) {
4549 # MUST
4550 my $foster_parent_element;
4551 my $next_sibling;
4552 my $prev_sibling;
4553 OE: for (reverse 0..$#{$self->{open_elements}}) {
4554 if ($self->{open_elements}->[$_]->[1] eq 'table') {
4555 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4556 if (defined $parent and $parent->node_type == 1) {
4557 $foster_parent_element = $parent;
4558 $next_sibling = $self->{open_elements}->[$_]->[0];
4559 $prev_sibling = $next_sibling->previous_sibling;
4560 } else {
4561 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
4562 $prev_sibling = $foster_parent_element->last_child;
4563 }
4564 last OE;
4565 }
4566 } # OE
4567 $foster_parent_element = $self->{open_elements}->[0]->[0] and
4568 $prev_sibling = $foster_parent_element->last_child
4569 unless defined $foster_parent_element;
4570 if (defined $prev_sibling and
4571 $prev_sibling->node_type == 3) {
4572 $prev_sibling->manakai_append_text ($token->{data});
4573 } else {
4574 $foster_parent_element->insert_before
4575 ($self->{document}->create_text_node ($token->{data}),
4576 $next_sibling);
4577 }
4578 } else {
4579 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4580 }
4581
4582 !!!next-token;
4583 redo B;
4584 } elsif ($token->{type} eq 'start tag') {
4585 if ($token->{tag_name} eq 'th' or
4586 $token->{tag_name} eq 'td') {
4587 ## Clear back to table row context
4588 while (not {
4589 tr => 1, html => 1,
4590 }->{$self->{open_elements}->[-1]->[1]}) {
4591 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4592 pop @{$self->{open_elements}};
4593 }
4594
4595 !!!insert-element ($token->{tag_name}, $token->{attributes});
4596 $self->{insertion_mode} = 'in cell';
4597
4598 push @$active_formatting_elements, ['#marker', ''];
4599
4600 !!!next-token;
4601 redo B;
4602 } elsif ({
4603 caption => 1, col => 1, colgroup => 1,
4604 tbody => 1, tfoot => 1, thead => 1, tr => 1,
4605 }->{$token->{tag_name}}) {
4606 ## As if </tr>
4607 ## have an element in table scope
4608 my $i;
4609 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4610 my $node = $self->{open_elements}->[$_];
4611 if ($node->[1] eq 'tr') {
4612 $i = $_;
4613 last INSCOPE;
4614 } elsif ({
4615 table => 1, html => 1,
4616 }->{$node->[1]}) {
4617 last INSCOPE;
4618 }
4619 } # INSCOPE
4620 unless (defined $i) { ## the INSCOPE scan above found no |tr| in table scope
4621 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4622 ## Ignore the token (error type spelled like every other 'unmatched end tag:' report)
4623 !!!next-token;
4624 redo B;
4625 }
4626
4627 ## Clear back to table row context
4628 while (not {
4629 tr => 1, html => 1,
4630 }->{$self->{open_elements}->[-1]->[1]}) {
4631 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4632 pop @{$self->{open_elements}};
4633 }
4634
4635 pop @{$self->{open_elements}}; # tr
4636 $self->{insertion_mode} = 'in table body';
4637 ## reprocess
4638 redo B;
4639 } elsif ($token->{tag_name} eq 'table') {
4640 ## NOTE: This is a code clone of "table in table"
4641 !!!parse-error (type => 'not closed:table');
4642
4643 ## As if </table>
4644 ## have a table element in table scope
4645 my $i;
4646 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4647 my $node = $self->{open_elements}->[$_];
4648 if ($node->[1] eq 'table') {
4649 $i = $_;
4650 last INSCOPE;
4651 } elsif ({
4652 table => 1, html => 1,
4653 }->{$node->[1]}) {
4654 last INSCOPE;
4655 }
4656 } # INSCOPE
4657 unless (defined $i) {
4658 !!!parse-error (type => 'unmatched end tag:table');
4659 ## Ignore tokens </table><table>
4660 !!!next-token;
4661 redo B;
4662 }
4663
4664 ## generate implied end tags
4665 if ({
4666 dd => 1, dt => 1, li => 1, p => 1,
4667 td => 1, th => 1, tr => 1,
4668 tbody => 1, tfoot=> 1, thead => 1,
4669 }->{$self->{open_elements}->[-1]->[1]}) {
4670 !!!back-token; # <table>
4671 $token = {type => 'end tag', tag_name => 'table'};
4672 !!!back-token;
4673 $token = {type => 'end tag',
4674 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4675 redo B;
4676 }
4677
4678 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4679 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4680 }
4681
4682 splice @{$self->{open_elements}}, $i;
4683
4684 $self->_reset_insertion_mode;
4685
4686 ## reprocess
4687 redo B;
4688 } else {
4689 #
4690 }
4691 } elsif ($token->{type} eq 'end tag') {
4692 if ($token->{tag_name} eq 'tr') {
4693 ## have an element in table scope
4694 my $i;
4695 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4696 my $node = $self->{open_elements}->[$_];
4697 if ($node->[1] eq $token->{tag_name}) {
4698 $i = $_;
4699 last INSCOPE;
4700 } elsif ({
4701 table => 1, html => 1,
4702 }->{$node->[1]}) {
4703 last INSCOPE;
4704 }
4705 } # INSCOPE
4706 unless (defined $i) {
4707 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4708 ## Ignore the token
4709 !!!next-token;
4710 redo B;
4711 }
4712
4713 ## Clear back to table row context
4714 while (not {
4715 tr => 1, html => 1,
4716 }->{$self->{open_elements}->[-1]->[1]}) {
4717 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4718 pop @{$self->{open_elements}};
4719 }
4720
4721 pop @{$self->{open_elements}}; # tr
4722 $self->{insertion_mode} = 'in table body';
4723 !!!next-token;
4724 redo B;
4725 } elsif ($token->{tag_name} eq 'table') {
4726 ## As if </tr>
4727 ## have an element in table scope
4728 my $i;
4729 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4730 my $node = $self->{open_elements}->[$_];
4731 if ($node->[1] eq 'tr') {
4732 $i = $_;
4733 last INSCOPE;
4734 } elsif ({
4735 table => 1, html => 1,
4736 }->{$node->[1]}) {
4737 last INSCOPE;
4738 }
4739 } # INSCOPE
4740 unless (defined $i) { ## the INSCOPE scan above found no |tr| in table scope
4741 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4742 ## Ignore the token (the error names the tag, not the token type)
4743 !!!next-token;
4744 redo B;
4745 }
4746
4747 ## Clear back to table row context
4748 while (not {
4749 tr => 1, html => 1,
4750 }->{$self->{open_elements}->[-1]->[1]}) {
4751 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4752 pop @{$self->{open_elements}};
4753 }
4754
4755 pop @{$self->{open_elements}}; # tr
4756 $self->{insertion_mode} = 'in table body';
4757 ## reprocess
4758 redo B;
4759 } elsif ({
4760 tbody => 1, tfoot => 1, thead => 1,
4761 }->{$token->{tag_name}}) {
4762 ## have an element in table scope
4763 my $i;
4764 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4765 my $node = $self->{open_elements}->[$_];
4766 if ($node->[1] eq $token->{tag_name}) {
4767 $i = $_;
4768 last INSCOPE;
4769 } elsif ({
4770 table => 1, html => 1,
4771 }->{$node->[1]}) {
4772 last INSCOPE;
4773 }
4774 } # INSCOPE
4775 unless (defined $i) {
4776 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4777 ## Ignore the token
4778 !!!next-token;
4779 redo B;
4780 }
4781
4782 ## As if </tr>
4783 ## have an element in table scope
4784 undef $i; ## reuse the $i declared for the scan above; a second |my| would mask it in the same scope
4785 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { ## scan the stack from the current node downwards
4786 my $node = $self->{open_elements}->[$_];
4787 if ($node->[1] eq 'tr') {
4788 $i = $_; ## remember where the |tr| sits
4789 last INSCOPE;
4790 } elsif ({
4791 table => 1, html => 1,
4792 }->{$node->[1]}) {
4793 last INSCOPE; ## |table| or |html| ends the scan; $i stays undef
4794 }
4795 } # INSCOPE
4796 unless (defined $i) {
4797 !!!parse-error (type => 'unmatched end tag:tr');
4798 ## Ignore the token
4799 !!!next-token;
4800 redo B;
4801 }
4802
4803 ## Clear back to table row context
4804 while (not {
4805 tr => 1, html => 1,
4806 }->{$self->{open_elements}->[-1]->[1]}) {
4807 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4808 pop @{$self->{open_elements}};
4809 }
4810
4811 pop @{$self->{open_elements}}; # tr
4812 $self->{insertion_mode} = 'in table body';
4813 ## reprocess
4814 redo B;
4815 } elsif ({
4816 body => 1, caption => 1, col => 1,
4817 colgroup => 1, html => 1, td => 1, th => 1,
4818 }->{$token->{tag_name}}) {
4819 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4820 ## Ignore the token
4821 !!!next-token;
4822 redo B;
4823 } else {
4824 #
4825 }
4826 } else {
4827 #
4828 }
4829
4830 ## As if in table
4831 !!!parse-error (type => 'in table:'.$token->{tag_name});
4832 $in_body->($insert_to_foster);
4833 redo B;
4834 } elsif ($self->{insertion_mode} eq 'in select') {
4835 if ($token->{type} eq 'character') {
4836 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4837 !!!next-token;
4838 redo B;
4839 } elsif ($token->{type} eq 'start tag') {
4840 if ($token->{tag_name} eq 'option') {
4841 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4842 ## As if </option>
4843 pop @{$self->{open_elements}};
4844 }
4845
4846 !!!insert-element ($token->{tag_name}, $token->{attributes});
4847 !!!next-token;
4848 redo B;
4849 } elsif ($token->{tag_name} eq 'optgroup') {
4850 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4851 ## As if </option>
4852 pop @{$self->{open_elements}};
4853 }
4854
4855 if ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4856 ## As if </optgroup>
4857 pop @{$self->{open_elements}};
4858 }
4859
4860 !!!insert-element ($token->{tag_name}, $token->{attributes});
4861 !!!next-token;
4862 redo B;
4863 } elsif ($token->{tag_name} eq 'select') {
4864 !!!parse-error (type => 'not closed:select');
4865 ## As if </select> instead
4866 ## have an element in table scope
4867 my $i;
4868 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4869 my $node = $self->{open_elements}->[$_];
4870 if ($node->[1] eq $token->{tag_name}) {
4871 $i = $_;
4872 last INSCOPE;
4873 } elsif ({
4874 table => 1, html => 1,
4875 }->{$node->[1]}) {
4876 last INSCOPE;
4877 }
4878 } # INSCOPE
4879 unless (defined $i) {
4880 !!!parse-error (type => 'unmatched end tag:select');
4881 ## Ignore the token
4882 !!!next-token;
4883 redo B;
4884 }
4885
4886 splice @{$self->{open_elements}}, $i;
4887
4888 $self->_reset_insertion_mode;
4889
4890 !!!next-token;
4891 redo B;
4892 } else {
4893 #
4894 }
4895 } elsif ($token->{type} eq 'end tag') {
4896 if ($token->{tag_name} eq 'optgroup') {
4897 if ($self->{open_elements}->[-1]->[1] eq 'option' and
4898 $self->{open_elements}->[-2]->[1] eq 'optgroup') {
4899 ## As if </option>
4900 splice @{$self->{open_elements}}, -2;
4901 } elsif ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4902 pop @{$self->{open_elements}};
4903 } else {
4904 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4905 ## Ignore the token
4906 }
4907 !!!next-token;
4908 redo B;
4909 } elsif ($token->{tag_name} eq 'option') {
4910 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4911 pop @{$self->{open_elements}};
4912 } else {
4913 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4914 ## Ignore the token
4915 }
4916 !!!next-token;
4917 redo B;
4918 } elsif ($token->{tag_name} eq 'select') {
4919 ## have an element in table scope
4920 my $i;
4921 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4922 my $node = $self->{open_elements}->[$_];
4923 if ($node->[1] eq $token->{tag_name}) {
4924 $i = $_;
4925 last INSCOPE;
4926 } elsif ({
4927 table => 1, html => 1,
4928 }->{$node->[1]}) {
4929 last INSCOPE;
4930 }
4931 } # INSCOPE
4932 unless (defined $i) {
4933 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4934 ## Ignore the token
4935 !!!next-token;
4936 redo B;
4937 }
4938
4939 splice @{$self->{open_elements}}, $i;
4940
4941 $self->_reset_insertion_mode;
4942
4943 !!!next-token;
4944 redo B;
4945 } elsif ({
4946 caption => 1, table => 1, tbody => 1,
4947 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
4948 }->{$token->{tag_name}}) {
4949 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4950
4951 ## have an element in table scope
4952 my $i;
4953 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4954 my $node = $self->{open_elements}->[$_];
4955 if ($node->[1] eq $token->{tag_name}) {
4956 $i = $_;
4957 last INSCOPE;
4958 } elsif ({
4959 table => 1, html => 1,
4960 }->{$node->[1]}) {
4961 last INSCOPE;
4962 }
4963 } # INSCOPE
4964 unless (defined $i) {
4965 ## Ignore the token
4966 !!!next-token;
4967 redo B;
4968 }
4969
4970 ## As if </select>
4971 ## have an element in table scope
4972 undef $i;
4973 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4974 my $node = $self->{open_elements}->[$_];
4975 if ($node->[1] eq 'select') {
4976 $i = $_;
4977 last INSCOPE;
4978 } elsif ({
4979 table => 1, html => 1,
4980 }->{$node->[1]}) {
4981 last INSCOPE;
4982 }
4983 } # INSCOPE
4984 unless (defined $i) {
4985 !!!parse-error (type => 'unmatched end tag:select');
4986 ## Ignore the </select> token
4987 !!!next-token; ## TODO: ok?
4988 redo B;
4989 }
4990
4991 splice @{$self->{open_elements}}, $i;
4992
4993 $self->_reset_insertion_mode;
4994
4995 ## reprocess
4996 redo B;
4997 } else {
4998 #
4999 }
5000 } else {
5001 #
5002 }
5003
5004 !!!parse-error (type => 'in select:'.$token->{tag_name});
5005 ## Ignore the token
5006 !!!next-token;
5007 redo B;
5008 } elsif ($self->{insertion_mode} eq 'after body') {
5009 if ($token->{type} eq 'character') {
5010 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5011 my $data = $1;
5012 ## As if in body
5013 $reconstruct_active_formatting_elements->($insert_to_current);
5014
5015 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5016
5017 unless (length $token->{data}) {
5018 !!!next-token;
5019 redo B;
5020 }
5021 }
5022
5023 #
5024 !!!parse-error (type => 'after body:#character');
5025 } elsif ($token->{type} eq 'start tag') {
5026 !!!parse-error (type => 'after body:'.$token->{tag_name});
5027 #
5028 } elsif ($token->{type} eq 'end tag') {
5029 if ($token->{tag_name} eq 'html') {
5030 if (defined $self->{inner_html_node}) {
5031 !!!parse-error (type => 'unmatched end tag:html');
5032 ## Ignore the token
5033 !!!next-token;
5034 redo B;
5035 } else {
5036 $previous_insertion_mode = $self->{insertion_mode};
5037 $self->{insertion_mode} = 'trailing end';
5038 !!!next-token;
5039 redo B;
5040 }
5041 } else {
5042 !!!parse-error (type => 'after body:/'.$token->{tag_name});
5043 }
5044 } else {
5045 die "$0: $token->{type}: Unknown token type";
5046 }
5047
5048 $self->{insertion_mode} = 'in body';
5049 ## reprocess
5050 redo B;
5051 } elsif ($self->{insertion_mode} eq 'in frameset') {
5052 if ($token->{type} eq 'character') {
5053 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5054 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5055
5056 unless (length $token->{data}) {
5057 !!!next-token;
5058 redo B;
5059 }
5060 }
5061
5062 !!!parse-error (type => 'in frameset:#character');
5063 ## Ignore the token
5064 !!!next-token;
5065 redo B;
5066 } elsif ($token->{type} eq 'start tag') {
5067 if ($token->{tag_name} eq 'frameset') {
5068 !!!insert-element ($token->{tag_name}, $token->{attributes});
5069 !!!next-token;
5070 redo B;
5071 } elsif ($token->{tag_name} eq 'frame') {
5072 !!!insert-element ($token->{tag_name}, $token->{attributes});
5073 pop @{$self->{open_elements}};
5074 !!!next-token;
5075 redo B;
5076 } elsif ($token->{tag_name} eq 'noframes') {
5077 ## NOTE: As if in body.
5078 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
5079 redo B;
5080 } else {
5081 !!!parse-error (type => 'in frameset:'.$token->{tag_name});
5082 ## Ignore the token
5083 !!!next-token;
5084 redo B;
5085 }
5086 } elsif ($token->{type} eq 'end tag') {
5087 if ($token->{tag_name} eq 'frameset') {
5088 if ($self->{open_elements}->[-1]->[1] eq 'html' and
5089 @{$self->{open_elements}} == 1) {
5090 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5091 ## Ignore the token
5092 !!!next-token;
5093 } else {
5094 pop @{$self->{open_elements}};
5095 !!!next-token;
5096 }
5097
5098 if (not defined $self->{inner_html_node} and
5099 $self->{open_elements}->[-1]->[1] ne 'frameset') {
5100 $self->{insertion_mode} = 'after frameset';
5101 }
5102 redo B;
5103 } else {
5104 !!!parse-error (type => 'in frameset:/'.$token->{tag_name});
5105 ## Ignore the token
5106 !!!next-token;
5107 redo B;
5108 }
5109 } else {
5110 die "$0: $token->{type}: Unknown token type";
5111 }
5112 } elsif ($self->{insertion_mode} eq 'after frameset') {
5113 if ($token->{type} eq 'character') {
5114 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5115 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5116
5117 unless (length $token->{data}) {
5118 !!!next-token;
5119 redo B;
5120 }
5121 }
5122
5123 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
5124 !!!parse-error (type => 'after frameset:#character');
5125
5126 ## Ignore the token.
5127 if (length $token->{data}) {
5128 ## reprocess the rest of characters
5129 } else {
5130 !!!next-token;
5131 }
5132 redo B;
5133 }
5134
5135 die qq[$0: Character "$token->{data}"];
5136 } elsif ($token->{type} eq 'start tag') {
5137 if ($token->{tag_name} eq 'noframes') {
5138 ## NOTE: As if in body.
5139 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
5140 redo B;
5141 } else {
5142 !!!parse-error (type => 'after frameset:'.$token->{tag_name});
5143 ## Ignore the token
5144 !!!next-token;
5145 redo B;
5146 }
5147 } elsif ($token->{type} eq 'end tag') {
5148 if ($token->{tag_name} eq 'html') {
5149 $previous_insertion_mode = $self->{insertion_mode};
5150 $self->{insertion_mode} = 'trailing end';
5151 !!!next-token;
5152 redo B;
5153 } else {
5154 !!!parse-error (type => 'after frameset:/'.$token->{tag_name});
5155 ## Ignore the token
5156 !!!next-token;
5157 redo B;
5158 }
5159 } else {
5160 die "$0: $token->{type}: Unknown token type";
5161 }
5162
5163 ## ISSUE: An issue in spec here
5164 } elsif ($self->{insertion_mode} eq 'trailing end') {
5165 ## states in the main stage is preserved yet # MUST
5166
5167 if ($token->{type} eq 'character') {
5168 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5169 my $data = $1;
5170 ## As if in the main phase.
5171 ## NOTE: The insertion mode in the main phase
5172 ## just before the phase has been changed to the trailing
5173 ## end phase is either "after body" or "after frameset".
5174 $reconstruct_active_formatting_elements->($insert_to_current);
5175
5176 $self->{open_elements}->[-1]->[0]->manakai_append_text ($data);
5177
5178 unless (length $token->{data}) {
5179 !!!next-token;
5180 redo B;
5181 }
5182 }
5183
5184 !!!parse-error (type => 'after html:#character');
5185 $self->{insertion_mode} = $previous_insertion_mode;
5186 ## reprocess
5187 redo B;
5188 } elsif ($token->{type} eq 'start tag') {
5189 !!!parse-error (type => 'after html:'.$token->{tag_name});
5190 $self->{insertion_mode} = $previous_insertion_mode;
5191 ## reprocess
5192 redo B;
5193 } elsif ($token->{type} eq 'end tag') {
5194 !!!parse-error (type => 'after html:/'.$token->{tag_name});
5195 $self->{insertion_mode} = $previous_insertion_mode;
5196 ## reprocess
5197 redo B;
5198 } else {
5199 die "$0: $token->{type}: Unknown token";
5200 }
5201 } else {
5202 die "$0: $self->{insertion_mode}: Unknown insertion mode";
5203 }
5204 } # B
5205
5206 ## Stop parsing # MUST
5207
5208 ## TODO: script stuffs
5209 } # _tree_construct_main
5210
## Replace the content of |$node| with the result of parsing a markup
## string (the |innerHTML| setter).
##
## Arguments (the sub is invoked as a class method):
##   $class    - Parser class; |parse_string| and |new| are called on it.
##   $node     - A Document node (node type 9) or an Element node
##               (node type 1).
##   string    - The markup to parse.  It is read through a reference
##               to the caller's argument and is not copied.
##   $onerror  - Optional code reference invoked for each parse error.
##
## Dies if |$node| has any other node type.
sub set_inner_html ($$$) {
  my ($class, $node) = @_[0, 1];
  my $s = \$_[2];
  my $onerror = $_[3];

  my $nt = $node->node_type;

  if ($nt == 9) { # Document
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## Take a snapshot of the child list before removing from it.
    my @old_children = @{$node->child_nodes};
    $node->remove_child ($_) for @old_children;

    ## Step 3, 4, 5 # MUST
    return $class->parse_string ($$s => $node, $onerror);
  }

  die "$0: |set_inner_html| is not defined for node of type $nt"
      unless $nt == 1;

  ## From here on, |$node| is an Element.

  ## TODO: If non-html element

  ## NOTE: Most of this code is copied from |parse_string|

  ## Step 1 # MUST
  ## Parse into a fresh HTML document rather than into |$node| itself.
  my $this_doc = $node->owner_document;
  my $doc = $this_doc->implementation->create_document;
  $doc->manakai_is_html (1);
  my $p = $class->new;
  $p->{document} = $doc;

  ## Step 9 # MUST
  ## Input stream: |$pos| is the read offset into |$$s|; |$line| and
  ## |$column| are updated as characters are consumed and are reported
  ## with parse errors.
  my $pos = 0;
  my $line = 1;
  my $column = 0;
  $p->{set_next_input_character} = sub {
    my $self = shift;

    ## Rotate the current character into the look-behind list.
    pop @{$self->{prev_input_character}};
    unshift @{$self->{prev_input_character}}, $self->{next_input_character};

    ## End of input is represented by -1.
    if ($pos >= length $$s) {
      $self->{next_input_character} = -1;
      return;
    }

    my $code = ord substr $$s, $pos++, 1;
    $self->{next_input_character} = $code;
    $column++;

    if ($code == 0x000A) { # LF
      $line++;
      $column = 0;
    } elsif ($code == 0x000D) { # CR
      ## A CR, or a CR LF pair, is reported as a single LF.
      $pos++ if substr ($$s, $pos, 1) eq "\x0A";
      $self->{next_input_character} = 0x000A; # LF # MUST
      $line++;
      $column = 0;
    } elsif ($code > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($code == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $p->{prev_input_character} = [-1, -1, -1];
  $p->{next_input_character} = -1;

  ## Without a caller-supplied handler, parse errors are |warn|ed.
  my $ponerror = $onerror || sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  $p->{parse_error} = sub {
    $ponerror->(@_, line => $line, column => $column);
  };

  $p->_initialize_tokenizer;
  $p->_initialize_tree_constructor;

  ## Step 2
  ## The initial content model depends on the context element's name;
  ## anything not listed uses PCDATA.
  my $node_ln = $node->local_name;
  my %content_model_of = (
    title => RCDATA_CONTENT_MODEL,
    textarea => RCDATA_CONTENT_MODEL,
    style => CDATA_CONTENT_MODEL,
    script => CDATA_CONTENT_MODEL,
    xmp => CDATA_CONTENT_MODEL,
    iframe => CDATA_CONTENT_MODEL,
    noembed => CDATA_CONTENT_MODEL,
    noframes => CDATA_CONTENT_MODEL,
    noscript => CDATA_CONTENT_MODEL,
    plaintext => PLAINTEXT_CONTENT_MODEL,
  );
  $p->{content_model} = $content_model_of{$node_ln};
  $p->{content_model} = PCDATA_CONTENT_MODEL
      unless defined $p->{content_model};
  ## ISSUE: What is "the name of the element"? local name?

  $p->{inner_html_node} = [$node, $node_ln];

  ## Step 4
  my $root = $doc->create_element_ns
      ('http://www.w3.org/1999/xhtml', [undef, 'html']);

  ## Step 5 # MUST
  $doc->append_child ($root);

  ## Step 6 # MUST
  push @{$p->{open_elements}}, [$root, 'html'];

  undef $p->{head_element};

  ## Step 7 # MUST
  $p->_reset_insertion_mode;

  ## Step 8 # MUST
  ## The nearest XHTML-namespace |form| among |$node| and its
  ## ancestors becomes the parser's form element.
  for (my $anc = $node; defined $anc; $anc = $anc->parent_node) {
    next unless $anc->node_type == 1;
    my $nsuri = $anc->namespace_uri;
    next unless defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml';
    if ($anc->local_name eq 'form') { ## TODO: case?
      $p->{form_element} = $anc;
      last;
    }
  }

  ## Step 3 # MUST
  ## Step 10 # MUST
  {
    ## The macro below is written in terms of |$self|.
    my $self = $p;
    !!!next-token;
  }
  $p->_tree_construction_main;

  ## Step 11 # MUST
  my @old_children = @{$node->child_nodes};
  $node->remove_child ($_) for @old_children;
  ## ISSUE: mutation events? read-only?

  ## Step 12 # MUST
  ## Move the parsed nodes from the temporary root into |$node|.
  my @new_children = @{$root->child_nodes};
  for my $new_child (@new_children) {
    $this_doc->adopt_node ($new_child);
    $node->append_child ($new_child);
  }
  ## ISSUE: mutation events?

  $p->_terminate_tree_constructor;
} # set_inner_html
5366
5367 } # tree construction stage
5368
## Serialize the children of |$node| as an HTML fragment (the
## |innerHTML| getter).
##
## Arguments (the sub is normally invoked as a method):
##   invocant   - Ignored.
##   $node      - The DOM node whose children are serialized.
##   $on_error  - Optional code reference; it is called with each child
##                node whose node type cannot be serialized.
##
## Returns a reference to the serialized string.
sub get_inner_html ($$$) {
  my (undef, $node, $on_error) = @_;

  ## Elements whose text content is emitted literally (no escaping).
  ## The same table is used for the ancestors of |$node| and for the
  ## elements met while serializing, so both give the same result.
  my $cdata_element = {
    style => 1, script => 1, xmp => 1, iframe => 1,
    noembed => 1, noframes => 1, noscript => 1,
    plaintext => 1,
  };

  ## Elements that are serialized as a start tag only.
  my $void_element = {
    area => 1, base => 1, basefont => 1, bgsound => 1,
    br => 1, col => 1, embed => 1, frame => 1, hr => 1,
    img => 1, input => 1, link => 1, meta => 1, param => 1,
    spacer => 1, wbr => 1,
  };

  ## Step 1
  my $s = '';

  ## Determine whether |$node| itself is, or is inside, a literal-text
  ## element.
  my $in_cdata;
  my $parent = $node;
  while (defined $parent) {
    if ($parent->node_type == 1) {
      ## The namespace URI may be undefined (null namespace).
      my $nsuri = $parent->namespace_uri;
      if (defined $nsuri and
          $nsuri eq 'http://www.w3.org/1999/xhtml' and
          $cdata_element->{$parent->local_name}) { ## TODO: case thingy
        $in_cdata = 1;
        last;
      }
    }
    $parent = $parent->parent_node;
  }

  ## Step 2
  ## |@node| is a work queue processed from the front.  Besides node
  ## objects it holds plain strings: an end tag to be appended to the
  ## output, or the marker |cdata-out|, which ends literal-text mode.
  my @node = @{$node->child_nodes};
  C: while (@node) {
    my $child = shift @node;
    unless (ref $child) {
      if ($child eq 'cdata-out') {
        $in_cdata = 0;
      } else {
        $s .= $child; # end tag
      }
      next C;
    }

    my $nt = $child->node_type;
    if ($nt == 1) { # Element
      my $tag_name = $child->tag_name; ## TODO: manakai_tag_name
      $s .= '<' . $tag_name;
      ## NOTE: Non-HTML case:
      ## <http://permalink.gmane.org/gmane.org.w3c.whatwg.discuss/11191>

      my @attrs = @{$child->attributes}; # sort order MUST be stable
      for my $attr (@attrs) { # order is implementation dependent
        my $attr_name = $attr->name; ## TODO: manakai_name
        $s .= ' ' . $attr_name . '="';
        my $attr_value = $attr->value;
        ## escape
        $attr_value =~ s/&/&amp;/g;
        $attr_value =~ s/</&lt;/g;
        $attr_value =~ s/>/&gt;/g;
        $attr_value =~ s/"/&quot;/g;
        $s .= $attr_value . '"';
      }
      $s .= '>';

      ## Void elements have neither content nor an end tag.
      next C if $void_element->{$tag_name};

      $s .= "\x0A" if $tag_name eq 'pre' or $tag_name eq 'textarea';

      ## The marker is queued first so that it ends up just after this
      ## element's end tag.
      if (not $in_cdata and $cdata_element->{$tag_name}) {
        unshift @node, 'cdata-out';
        $in_cdata = 1;
      }

      unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';
    } elsif ($nt == 3 or $nt == 4) { # Text or CDATASection
      if ($in_cdata) {
        $s .= $child->data;
      } else {
        my $value = $child->data;
        $value =~ s/&/&amp;/g;
        $value =~ s/</&lt;/g;
        $value =~ s/>/&gt;/g;
        $value =~ s/"/&quot;/g;
        $s .= $value;
      }
    } elsif ($nt == 8) { # Comment
      $s .= '<!--' . $child->data . '-->';
    } elsif ($nt == 10) { # DocumentType
      $s .= '<!DOCTYPE ' . $child->name . '>';
    } elsif ($nt == 5) { # entrefs
      ## The children take the place of the entity reference.  They
      ## must go to the FRONT of the queue; appending them to the end
      ## would emit them after the following siblings and after the
      ## end tags of enclosing elements.
      unshift @node, @{$child->child_nodes};
    } else {
      $on_error->($child) if defined $on_error;
    }
    ## ISSUE: This code does not support PIs.
  } # C

  ## Step 3
  return \$s;
} # get_inner_html
5468
5469 1;
5470 # $Date: 2007/07/21 07:34:32 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24