/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log


Revision 1.51 - (show annotations) (download) (as text)
Sat Jul 21 11:46:41 2007 UTC (18 years, 6 months ago) by wakaba
Branch: MAIN
Changes since 1.50: +147 -144 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/ChangeLog	21 Jul 2007 11:46:34 -0000
	* HTML.pm.src: The "trailing end" insertion mode
	is split into "after html body" and "after html frameset"
	insertion modes.  Their codes are merged with "after body"
	and "after frameset" codes.  |$previous_insertion_mode|
	has been removed.  "after frameset" code is
	merged with "in frameset" code.

2007-07-21  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.50 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 ## ISSUE:
6 ## var doc = implementation.createDocument (null, null, null);
7 ## doc.write ('');
8 ## alert (doc.compatMode);
9
10 ## ISSUE: HTML5 revision 967 says that the encoding layer MUST NOT
11 ## strip BOM and the HTML layer MUST ignore it. Whether we can do it
12 ## is not yet clear.
13 ## "{U+FEFF}..." in UTF-16BE/UTF-16LE is three or four characters?
14 ## "{U+FEFF}..." in GB18030?
15
## Tag names on which a "/" immediately followed by ">" is tolerated in a
## start tag.  The tokenizer raises a 'nestc' parse error for such a slash
## whenever the current start tag's name is not a key of this hash.
my $permitted_slash_tag_name = {
  map { ($_ => 1) } qw(
    base link meta hr br img embed param area col input
  )
};
29
## Replacement table for the code points 0x80..0x9F: each key is a code
## point in that range and each value is the code point to use instead
## (U+FFFD where no specific replacement is listed).
## NOTE(review): the values coincide with the Windows-1252 assignments; the
## code that consults this table is outside this excerpt -- presumably the
## numeric character reference handling.
my $c1_entity_char = do {
  my $first_code = 0x80;
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
  );
  +{ map { ($first_code + $_) => $replacement[$_] } 0 .. $#replacement };
}; # $c1_entity_char
64
## Membership set (name => 1) of the element names in the "special"
## category.  The code that consults it is outside this excerpt.
my $special_category = {
  map { ($_ => 1) } qw(
    address area base basefont bgsound blockquote body br center col
    colgroup dd dir div dl dt embed fieldset form frame frameset
    h1 h2 h3 h4 h5 h6 head hr iframe image img input isindex li link
    listing menu meta noembed noframes noscript ol optgroup option p
    param plaintext pre script select spacer style tbody textarea tfoot
    thead title tr ul wbr
  )
};
## Membership set (name => 1) of the element names in the "scoping"
## category.  The code that consults it is outside this excerpt.
my $scoping_category = {
  map { ($_ => 1) } qw(
    button caption html marquee object table td th
  )
};
## Membership set (name => 1) of the element names in the "formatting"
## category.  The code that consults it is outside this excerpt.
##
## The key |strike| was previously misspelled |strile|, which meant that a
## lookup for the <strike> element never matched this category.
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
85 # $phrasing_category: all other elements
86
## parse_string (invocant, $string, $document [, $onerror])
##
## Creates a fresh parser object by calling |new| on the invocant, feeds it
## the characters of $string, runs the tokenizer / tree constructor steps
## (defined elsewhere in the file) and returns $document, which is stored
## in and returned from $self->{document}.
##
## $onerror, if true, is called for every parse error with the options given
## by the reporting site followed by |line| and |column|; otherwise a
## default handler emits a warning via |warn|.
## The prototype declares one more optional argument, which the code in
## this method does not read.
87 sub parse_string ($$$;$) {
88 my $self = shift->new;
## Keep a reference to the caller's string instead of copying it.
89 my $s = \$_[0];
90 $self->{document} = $_[1];
91
92 ## NOTE: |set_inner_html| copies most of this method's code
93
## Reader state shared by the closures below: offset of the next unread
## character in $$s, and the position reported with parse errors.
94 my $i = 0;
95 my $line = 1;
96 my $column = 0;
## Character reader.  Called with the parser object as its first argument;
## advances $self->{next_input_character} by one input character.
97 $self->{set_next_input_character} = sub {
98 my $self = shift;
99
## Maintain a three-entry history of previous characters; index 0 is the
## character that was current before this call.
100 pop @{$self->{prev_input_character}};
101 unshift @{$self->{prev_input_character}}, $self->{next_input_character};
102
## At end of input the current character becomes -1.  The assigned value
## (-1) is true, so the |and return| always fires when the condition holds.
103 $self->{next_input_character} = -1 and return if $i >= length $$s;
104 $self->{next_input_character} = ord substr $$s, $i++, 1;
105 $column++;
106
107 if ($self->{next_input_character} == 0x000A) { # LF
108 $line++;
109 $column = 0;
110 } elsif ($self->{next_input_character} == 0x000D) { # CR
## CR and CR LF are both delivered as a single LF; the LF of a CR LF
## pair is skipped here.
111 $i++ if substr ($$s, $i, 1) eq "\x0A";
112 $self->{next_input_character} = 0x000A; # LF # MUST
113 $line++;
114 $column = 0;
115 } elsif ($self->{next_input_character} > 0x10FFFF) {
116 $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
117 } elsif ($self->{next_input_character} == 0x0000) { # NULL
## |!!!parse-error| is a preprocessor macro of the .src format; its
## expansion is not visible here (presumably it invokes
## $self->{parse_error}).
118 !!!parse-error (type => 'NULL');
119 $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
120 }
121 };
## No character has been read yet: history and current character are -1.
122 $self->{prev_input_character} = [-1, -1, -1];
123 $self->{next_input_character} = -1;
124
125 my $onerror = $_[2] || sub {
126 my (%opt) = @_;
127 warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
128 };
## Error reporter: forwards its arguments and appends the reader's current
## line and column.
129 $self->{parse_error} = sub {
130 $onerror->(@_, line => $line, column => $column);
131 };
132
133 $self->_initialize_tokenizer;
134 $self->_initialize_tree_constructor;
135 $self->_construct_tree;
136 $self->_terminate_tree_constructor;
137
138 return $self->{document};
139 } # parse_string
140
## Constructor.  Returns an empty parser object blessed into the invoking
## class, equipped with placeholder callbacks: a character reader that
## always reports end of input (-1) and a parse-error handler that does
## nothing.  |parse_string| replaces both callbacks with real ones.
## NOTE(review): the reader closure refers to the object it is stored in,
## which forms a reference cycle until that slot is overwritten.
sub new ($) {
  my ($class) = @_;
  my $parser = bless {}, $class;

  $parser->{set_next_input_character} = sub {
    $parser->{next_input_character} = -1;
  };
  $parser->{parse_error} = sub {
    ## Intentionally empty.
  };

  return $parser;
} # new
152
## Bit flags saying which markup the tokenizer recognizes in character data.
sub CM_ENTITY () { 1 << 0 }          # "&" markup in data
sub CM_LIMITED_MARKUP () { 1 << 1 }  # "<" markup in data (limited)
sub CM_FULL_MARKUP () { 1 << 2 }     # "<" markup in data (any)

## Content models, expressed as combinations of the flags above.
sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP | CM_ENTITY }
sub PCDATA_CONTENT_MODEL () { CM_FULL_MARKUP | CM_ENTITY }
161
162 ## Implementations MUST act as if they used the state machine in the spec.
163
## Resets the tokenizer fields of the parser object to their initial
## values: 'data' state, PCDATA content model, no token or attribute under
## construction, and an empty queue of pending tokens.
164 sub _initialize_tokenizer ($) {
165 my $self = shift;
166 $self->{state} = 'data'; # MUST
167 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
168 undef $self->{current_token}; # start tag, end tag, comment, or DOCTYPE
169 undef $self->{current_attribute};
## Compared against the name after "</" in the RCDATA/CDATA content models.
170 undef $self->{last_emitted_start_tag_name};
## Records which attribute value state an entity was encountered in.
171 undef $self->{last_attribute_value_state};
## NOTE(review): {char} is not read in the visible code; presumably it
## holds characters pushed back by |!!!back-next-input-character| -- confirm.
172 $self->{char} = [];
173 # $self->{next_input_character}
## Preprocessor macro; presumably loads the first input character by
## invoking $self->{set_next_input_character} -- expansion not visible here.
174 !!!next-input-character;
## Queue of tokens not yet returned; |_get_next_token| shifts from it first.
175 $self->{token} = [];
## Note that {escape} is only mentioned, not reset, by this method.
176 # $self->{escape}
177 } # _initialize_tokenizer
178
179 ## A token has:
180 ## ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
181 ## 'character', or 'end-of-file'
182 ## ->{name} (DOCTYPE); ->{tag_name} (start tag, end tag)
183 ## ->{public_identifier} (DOCTYPE)
184 ## ->{system_identifier} (DOCTYPE)
185 ## ->{correct} == 1 or 0 (DOCTYPE)
186 ## ->{attributes} isa HASH (start tag, end tag)
187 ## ->{data} (comment, character)
188
189 ## Emitted token MUST immediately be handled by the tree construction stage.
190
191 ## Before each step, UA MAY check to see if either one of the scripts in
192 ## "list of scripts that will execute as soon as possible" or the first
193 ## script in the "list of scripts that will execute asynchronously",
194 ## has completed loading. If one has, then it MUST be executed
195 ## and removed from the list.
196
197 sub _get_next_token ($) {
198 my $self = shift;
199 if (@{$self->{token}}) {
200 return shift @{$self->{token}};
201 }
202
203 A: {
204 if ($self->{state} eq 'data') {
205 if ($self->{next_input_character} == 0x0026) { # &
206 if ($self->{content_model} & CM_ENTITY) { # PCDATA | RCDATA
207 $self->{state} = 'entity data';
208 !!!next-input-character;
209 redo A;
210 } else {
211 #
212 }
213 } elsif ($self->{next_input_character} == 0x002D) { # -
214 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
215 unless ($self->{escape}) {
216 if ($self->{prev_input_character}->[0] == 0x002D and # -
217 $self->{prev_input_character}->[1] == 0x0021 and # !
218 $self->{prev_input_character}->[2] == 0x003C) { # <
219 $self->{escape} = 1;
220 }
221 }
222 }
223
224 #
225 } elsif ($self->{next_input_character} == 0x003C) { # <
226 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
227 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
228 not $self->{escape})) {
229 $self->{state} = 'tag open';
230 !!!next-input-character;
231 redo A;
232 } else {
233 #
234 }
235 } elsif ($self->{next_input_character} == 0x003E) { # >
236 if ($self->{escape} and
237 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
238 if ($self->{prev_input_character}->[0] == 0x002D and # -
239 $self->{prev_input_character}->[1] == 0x002D) { # -
240 delete $self->{escape};
241 }
242 }
243
244 #
245 } elsif ($self->{next_input_character} == -1) {
246 !!!emit ({type => 'end-of-file'});
247 last A; ## TODO: ok?
248 }
249 # Anything else
250 my $token = {type => 'character',
251 data => chr $self->{next_input_character}};
252 ## Stay in the data state
253 !!!next-input-character;
254
255 !!!emit ($token);
256
257 redo A;
258 } elsif ($self->{state} eq 'entity data') {
259 ## (cannot happen in CDATA state)
260
261 my $token = $self->_tokenize_attempt_to_consume_an_entity (0);
262
263 $self->{state} = 'data';
264 # next-input-character is already done
265
266 unless (defined $token) {
267 !!!emit ({type => 'character', data => '&'});
268 } else {
269 !!!emit ($token);
270 }
271
272 redo A;
273 } elsif ($self->{state} eq 'tag open') {
274 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
275 if ($self->{next_input_character} == 0x002F) { # /
276 !!!next-input-character;
277 $self->{state} = 'close tag open';
278 redo A;
279 } else {
280 ## reconsume
281 $self->{state} = 'data';
282
283 !!!emit ({type => 'character', data => '<'});
284
285 redo A;
286 }
287 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
288 if ($self->{next_input_character} == 0x0021) { # !
289 $self->{state} = 'markup declaration open';
290 !!!next-input-character;
291 redo A;
292 } elsif ($self->{next_input_character} == 0x002F) { # /
293 $self->{state} = 'close tag open';
294 !!!next-input-character;
295 redo A;
296 } elsif (0x0041 <= $self->{next_input_character} and
297 $self->{next_input_character} <= 0x005A) { # A..Z
298 $self->{current_token}
299 = {type => 'start tag',
300 tag_name => chr ($self->{next_input_character} + 0x0020)};
301 $self->{state} = 'tag name';
302 !!!next-input-character;
303 redo A;
304 } elsif (0x0061 <= $self->{next_input_character} and
305 $self->{next_input_character} <= 0x007A) { # a..z
306 $self->{current_token} = {type => 'start tag',
307 tag_name => chr ($self->{next_input_character})};
308 $self->{state} = 'tag name';
309 !!!next-input-character;
310 redo A;
311 } elsif ($self->{next_input_character} == 0x003E) { # >
312 !!!parse-error (type => 'empty start tag');
313 $self->{state} = 'data';
314 !!!next-input-character;
315
316 !!!emit ({type => 'character', data => '<>'});
317
318 redo A;
319 } elsif ($self->{next_input_character} == 0x003F) { # ?
320 !!!parse-error (type => 'pio');
321 $self->{state} = 'bogus comment';
322 ## $self->{next_input_character} is intentionally left as is
323 redo A;
324 } else {
325 !!!parse-error (type => 'bare stago');
326 $self->{state} = 'data';
327 ## reconsume
328
329 !!!emit ({type => 'character', data => '<'});
330
331 redo A;
332 }
333 } else {
334 die "$0: $self->{content_model} in tag open";
335 }
336 } elsif ($self->{state} eq 'close tag open') {
337 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
338 if (defined $self->{last_emitted_start_tag_name}) {
339 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
340 my @next_char;
341 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
342 push @next_char, $self->{next_input_character};
343 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
344 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
345 if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
346 !!!next-input-character;
347 next TAGNAME;
348 } else {
349 $self->{next_input_character} = shift @next_char; # reconsume
350 !!!back-next-input-character (@next_char);
351 $self->{state} = 'data';
352
353 !!!emit ({type => 'character', data => '</'});
354
355 redo A;
356 }
357 }
358 push @next_char, $self->{next_input_character};
359
360 unless ($self->{next_input_character} == 0x0009 or # HT
361 $self->{next_input_character} == 0x000A or # LF
362 $self->{next_input_character} == 0x000B or # VT
363 $self->{next_input_character} == 0x000C or # FF
364 $self->{next_input_character} == 0x0020 or # SP
365 $self->{next_input_character} == 0x003E or # >
366 $self->{next_input_character} == 0x002F or # /
367 $self->{next_input_character} == -1) {
368 $self->{next_input_character} = shift @next_char; # reconsume
369 !!!back-next-input-character (@next_char);
370 $self->{state} = 'data';
371 !!!emit ({type => 'character', data => '</'});
372 redo A;
373 } else {
374 $self->{next_input_character} = shift @next_char;
375 !!!back-next-input-character (@next_char);
376 # and consume...
377 }
378 } else {
379 ## No start tag token has ever been emitted
380 # next-input-character is already done
381 $self->{state} = 'data';
382 !!!emit ({type => 'character', data => '</'});
383 redo A;
384 }
385 }
386
387 if (0x0041 <= $self->{next_input_character} and
388 $self->{next_input_character} <= 0x005A) { # A..Z
389 $self->{current_token} = {type => 'end tag',
390 tag_name => chr ($self->{next_input_character} + 0x0020)};
391 $self->{state} = 'tag name';
392 !!!next-input-character;
393 redo A;
394 } elsif (0x0061 <= $self->{next_input_character} and
395 $self->{next_input_character} <= 0x007A) { # a..z
396 $self->{current_token} = {type => 'end tag',
397 tag_name => chr ($self->{next_input_character})};
398 $self->{state} = 'tag name';
399 !!!next-input-character;
400 redo A;
401 } elsif ($self->{next_input_character} == 0x003E) { # >
402 !!!parse-error (type => 'empty end tag');
403 $self->{state} = 'data';
404 !!!next-input-character;
405 redo A;
406 } elsif ($self->{next_input_character} == -1) {
407 !!!parse-error (type => 'bare etago');
408 $self->{state} = 'data';
409 # reconsume
410
411 !!!emit ({type => 'character', data => '</'});
412
413 redo A;
414 } else {
415 !!!parse-error (type => 'bogus end tag');
416 $self->{state} = 'bogus comment';
417 ## $self->{next_input_character} is intentionally left as is
418 redo A;
419 }
420 } elsif ($self->{state} eq 'tag name') {
421 if ($self->{next_input_character} == 0x0009 or # HT
422 $self->{next_input_character} == 0x000A or # LF
423 $self->{next_input_character} == 0x000B or # VT
424 $self->{next_input_character} == 0x000C or # FF
425 $self->{next_input_character} == 0x0020) { # SP
426 $self->{state} = 'before attribute name';
427 !!!next-input-character;
428 redo A;
429 } elsif ($self->{next_input_character} == 0x003E) { # >
430 if ($self->{current_token}->{type} eq 'start tag') {
431 $self->{current_token}->{first_start_tag}
432 = not defined $self->{last_emitted_start_tag_name};
433 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
434 } elsif ($self->{current_token}->{type} eq 'end tag') {
435 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
436 if ($self->{current_token}->{attributes}) {
437 !!!parse-error (type => 'end tag attribute');
438 }
439 } else {
440 die "$0: $self->{current_token}->{type}: Unknown token type";
441 }
442 $self->{state} = 'data';
443 !!!next-input-character;
444
445 !!!emit ($self->{current_token}); # start tag or end tag
446
447 redo A;
448 } elsif (0x0041 <= $self->{next_input_character} and
449 $self->{next_input_character} <= 0x005A) { # A..Z
450 $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
451 # start tag or end tag
452 ## Stay in this state
453 !!!next-input-character;
454 redo A;
455 } elsif ($self->{next_input_character} == -1) {
456 !!!parse-error (type => 'unclosed tag');
457 if ($self->{current_token}->{type} eq 'start tag') {
458 $self->{current_token}->{first_start_tag}
459 = not defined $self->{last_emitted_start_tag_name};
460 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
461 } elsif ($self->{current_token}->{type} eq 'end tag') {
462 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
463 if ($self->{current_token}->{attributes}) {
464 !!!parse-error (type => 'end tag attribute');
465 }
466 } else {
467 die "$0: $self->{current_token}->{type}: Unknown token type";
468 }
469 $self->{state} = 'data';
470 # reconsume
471
472 !!!emit ($self->{current_token}); # start tag or end tag
473
474 redo A;
475 } elsif ($self->{next_input_character} == 0x002F) { # /
476 !!!next-input-character;
477 if ($self->{next_input_character} == 0x003E and # >
478 $self->{current_token}->{type} eq 'start tag' and
479 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
480 # permitted slash
481 #
482 } else {
483 !!!parse-error (type => 'nestc');
484 }
485 $self->{state} = 'before attribute name';
486 # next-input-character is already done
487 redo A;
488 } else {
489 $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
490 # start tag or end tag
491 ## Stay in the state
492 !!!next-input-character;
493 redo A;
494 }
495 } elsif ($self->{state} eq 'before attribute name') {
496 if ($self->{next_input_character} == 0x0009 or # HT
497 $self->{next_input_character} == 0x000A or # LF
498 $self->{next_input_character} == 0x000B or # VT
499 $self->{next_input_character} == 0x000C or # FF
500 $self->{next_input_character} == 0x0020) { # SP
501 ## Stay in the state
502 !!!next-input-character;
503 redo A;
504 } elsif ($self->{next_input_character} == 0x003E) { # >
505 if ($self->{current_token}->{type} eq 'start tag') {
506 $self->{current_token}->{first_start_tag}
507 = not defined $self->{last_emitted_start_tag_name};
508 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
509 } elsif ($self->{current_token}->{type} eq 'end tag') {
510 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
511 if ($self->{current_token}->{attributes}) {
512 !!!parse-error (type => 'end tag attribute');
513 }
514 } else {
515 die "$0: $self->{current_token}->{type}: Unknown token type";
516 }
517 $self->{state} = 'data';
518 !!!next-input-character;
519
520 !!!emit ($self->{current_token}); # start tag or end tag
521
522 redo A;
523 } elsif (0x0041 <= $self->{next_input_character} and
524 $self->{next_input_character} <= 0x005A) { # A..Z
525 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
526 value => ''};
527 $self->{state} = 'attribute name';
528 !!!next-input-character;
529 redo A;
530 } elsif ($self->{next_input_character} == 0x002F) { # /
531 !!!next-input-character;
532 if ($self->{next_input_character} == 0x003E and # >
533 $self->{current_token}->{type} eq 'start tag' and
534 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
535 # permitted slash
536 #
537 } else {
538 !!!parse-error (type => 'nestc');
539 }
540 ## Stay in the state
541 # next-input-character is already done
542 redo A;
543 } elsif ($self->{next_input_character} == -1) {
544 !!!parse-error (type => 'unclosed tag');
545 if ($self->{current_token}->{type} eq 'start tag') {
546 $self->{current_token}->{first_start_tag}
547 = not defined $self->{last_emitted_start_tag_name};
548 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
549 } elsif ($self->{current_token}->{type} eq 'end tag') {
550 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
551 if ($self->{current_token}->{attributes}) {
552 !!!parse-error (type => 'end tag attribute');
553 }
554 } else {
555 die "$0: $self->{current_token}->{type}: Unknown token type";
556 }
557 $self->{state} = 'data';
558 # reconsume
559
560 !!!emit ($self->{current_token}); # start tag or end tag
561
562 redo A;
563 } else {
564 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
565 value => ''};
566 $self->{state} = 'attribute name';
567 !!!next-input-character;
568 redo A;
569 }
570 } elsif ($self->{state} eq 'attribute name') {
571 my $before_leave = sub {
572 if (exists $self->{current_token}->{attributes} # start tag or end tag
573 ->{$self->{current_attribute}->{name}}) { # MUST
574 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name});
575 ## Discard $self->{current_attribute} # MUST
576 } else {
577 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
578 = $self->{current_attribute};
579 }
580 }; # $before_leave
581
582 if ($self->{next_input_character} == 0x0009 or # HT
583 $self->{next_input_character} == 0x000A or # LF
584 $self->{next_input_character} == 0x000B or # VT
585 $self->{next_input_character} == 0x000C or # FF
586 $self->{next_input_character} == 0x0020) { # SP
587 $before_leave->();
588 $self->{state} = 'after attribute name';
589 !!!next-input-character;
590 redo A;
591 } elsif ($self->{next_input_character} == 0x003D) { # =
592 $before_leave->();
593 $self->{state} = 'before attribute value';
594 !!!next-input-character;
595 redo A;
596 } elsif ($self->{next_input_character} == 0x003E) { # >
597 $before_leave->();
598 if ($self->{current_token}->{type} eq 'start tag') {
599 $self->{current_token}->{first_start_tag}
600 = not defined $self->{last_emitted_start_tag_name};
601 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
602 } elsif ($self->{current_token}->{type} eq 'end tag') {
603 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
604 if ($self->{current_token}->{attributes}) {
605 !!!parse-error (type => 'end tag attribute');
606 }
607 } else {
608 die "$0: $self->{current_token}->{type}: Unknown token type";
609 }
610 $self->{state} = 'data';
611 !!!next-input-character;
612
613 !!!emit ($self->{current_token}); # start tag or end tag
614
615 redo A;
616 } elsif (0x0041 <= $self->{next_input_character} and
617 $self->{next_input_character} <= 0x005A) { # A..Z
618 $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
619 ## Stay in the state
620 !!!next-input-character;
621 redo A;
622 } elsif ($self->{next_input_character} == 0x002F) { # /
623 $before_leave->();
624 !!!next-input-character;
625 if ($self->{next_input_character} == 0x003E and # >
626 $self->{current_token}->{type} eq 'start tag' and
627 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
628 # permitted slash
629 #
630 } else {
631 !!!parse-error (type => 'nestc');
632 }
633 $self->{state} = 'before attribute name';
634 # next-input-character is already done
635 redo A;
636 } elsif ($self->{next_input_character} == -1) {
637 !!!parse-error (type => 'unclosed tag');
638 $before_leave->();
639 if ($self->{current_token}->{type} eq 'start tag') {
640 $self->{current_token}->{first_start_tag}
641 = not defined $self->{last_emitted_start_tag_name};
642 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
643 } elsif ($self->{current_token}->{type} eq 'end tag') {
644 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
645 if ($self->{current_token}->{attributes}) {
646 !!!parse-error (type => 'end tag attribute');
647 }
648 } else {
649 die "$0: $self->{current_token}->{type}: Unknown token type";
650 }
651 $self->{state} = 'data';
652 # reconsume
653
654 !!!emit ($self->{current_token}); # start tag or end tag
655
656 redo A;
657 } else {
658 $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
659 ## Stay in the state
660 !!!next-input-character;
661 redo A;
662 }
663 } elsif ($self->{state} eq 'after attribute name') {
664 if ($self->{next_input_character} == 0x0009 or # HT
665 $self->{next_input_character} == 0x000A or # LF
666 $self->{next_input_character} == 0x000B or # VT
667 $self->{next_input_character} == 0x000C or # FF
668 $self->{next_input_character} == 0x0020) { # SP
669 ## Stay in the state
670 !!!next-input-character;
671 redo A;
672 } elsif ($self->{next_input_character} == 0x003D) { # =
673 $self->{state} = 'before attribute value';
674 !!!next-input-character;
675 redo A;
676 } elsif ($self->{next_input_character} == 0x003E) { # >
677 if ($self->{current_token}->{type} eq 'start tag') {
678 $self->{current_token}->{first_start_tag}
679 = not defined $self->{last_emitted_start_tag_name};
680 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
681 } elsif ($self->{current_token}->{type} eq 'end tag') {
682 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
683 if ($self->{current_token}->{attributes}) {
684 !!!parse-error (type => 'end tag attribute');
685 }
686 } else {
687 die "$0: $self->{current_token}->{type}: Unknown token type";
688 }
689 $self->{state} = 'data';
690 !!!next-input-character;
691
692 !!!emit ($self->{current_token}); # start tag or end tag
693
694 redo A;
695 } elsif (0x0041 <= $self->{next_input_character} and
696 $self->{next_input_character} <= 0x005A) { # A..Z
697 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
698 value => ''};
699 $self->{state} = 'attribute name';
700 !!!next-input-character;
701 redo A;
702 } elsif ($self->{next_input_character} == 0x002F) { # /
703 !!!next-input-character;
704 if ($self->{next_input_character} == 0x003E and # >
705 $self->{current_token}->{type} eq 'start tag' and
706 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
707 # permitted slash
708 #
709 } else {
710 !!!parse-error (type => 'nestc');
711 ## TODO: Different error type for <aa / bb> than <aa/>
712 }
713 $self->{state} = 'before attribute name';
714 # next-input-character is already done
715 redo A;
716 } elsif ($self->{next_input_character} == -1) {
717 !!!parse-error (type => 'unclosed tag');
718 if ($self->{current_token}->{type} eq 'start tag') {
719 $self->{current_token}->{first_start_tag}
720 = not defined $self->{last_emitted_start_tag_name};
721 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
722 } elsif ($self->{current_token}->{type} eq 'end tag') {
723 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
724 if ($self->{current_token}->{attributes}) {
725 !!!parse-error (type => 'end tag attribute');
726 }
727 } else {
728 die "$0: $self->{current_token}->{type}: Unknown token type";
729 }
730 $self->{state} = 'data';
731 # reconsume
732
733 !!!emit ($self->{current_token}); # start tag or end tag
734
735 redo A;
736 } else {
737 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
738 value => ''};
739 $self->{state} = 'attribute name';
740 !!!next-input-character;
741 redo A;
742 }
743 } elsif ($self->{state} eq 'before attribute value') {
744 if ($self->{next_input_character} == 0x0009 or # HT
745 $self->{next_input_character} == 0x000A or # LF
746 $self->{next_input_character} == 0x000B or # VT
747 $self->{next_input_character} == 0x000C or # FF
748 $self->{next_input_character} == 0x0020) { # SP
749 ## Stay in the state
750 !!!next-input-character;
751 redo A;
752 } elsif ($self->{next_input_character} == 0x0022) { # "
753 $self->{state} = 'attribute value (double-quoted)';
754 !!!next-input-character;
755 redo A;
756 } elsif ($self->{next_input_character} == 0x0026) { # &
757 $self->{state} = 'attribute value (unquoted)';
758 ## reconsume
759 redo A;
760 } elsif ($self->{next_input_character} == 0x0027) { # '
761 $self->{state} = 'attribute value (single-quoted)';
762 !!!next-input-character;
763 redo A;
764 } elsif ($self->{next_input_character} == 0x003E) { # >
765 if ($self->{current_token}->{type} eq 'start tag') {
766 $self->{current_token}->{first_start_tag}
767 = not defined $self->{last_emitted_start_tag_name};
768 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
769 } elsif ($self->{current_token}->{type} eq 'end tag') {
770 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
771 if ($self->{current_token}->{attributes}) {
772 !!!parse-error (type => 'end tag attribute');
773 }
774 } else {
775 die "$0: $self->{current_token}->{type}: Unknown token type";
776 }
777 $self->{state} = 'data';
778 !!!next-input-character;
779
780 !!!emit ($self->{current_token}); # start tag or end tag
781
782 redo A;
783 } elsif ($self->{next_input_character} == -1) {
784 !!!parse-error (type => 'unclosed tag');
785 if ($self->{current_token}->{type} eq 'start tag') {
786 $self->{current_token}->{first_start_tag}
787 = not defined $self->{last_emitted_start_tag_name};
788 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
789 } elsif ($self->{current_token}->{type} eq 'end tag') {
790 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
791 if ($self->{current_token}->{attributes}) {
792 !!!parse-error (type => 'end tag attribute');
793 }
794 } else {
795 die "$0: $self->{current_token}->{type}: Unknown token type";
796 }
797 $self->{state} = 'data';
798 ## reconsume
799
800 !!!emit ($self->{current_token}); # start tag or end tag
801
802 redo A;
803 } else {
804 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
805 $self->{state} = 'attribute value (unquoted)';
806 !!!next-input-character;
807 redo A;
808 }
809 } elsif ($self->{state} eq 'attribute value (double-quoted)') {
810 if ($self->{next_input_character} == 0x0022) { # "
811 $self->{state} = 'before attribute name';
812 !!!next-input-character;
813 redo A;
814 } elsif ($self->{next_input_character} == 0x0026) { # &
815 $self->{last_attribute_value_state} = 'attribute value (double-quoted)';
816 $self->{state} = 'entity in attribute value';
817 !!!next-input-character;
818 redo A;
819 } elsif ($self->{next_input_character} == -1) {
820 !!!parse-error (type => 'unclosed attribute value');
821 if ($self->{current_token}->{type} eq 'start tag') {
822 $self->{current_token}->{first_start_tag}
823 = not defined $self->{last_emitted_start_tag_name};
824 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
825 } elsif ($self->{current_token}->{type} eq 'end tag') {
826 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
827 if ($self->{current_token}->{attributes}) {
828 !!!parse-error (type => 'end tag attribute');
829 }
830 } else {
831 die "$0: $self->{current_token}->{type}: Unknown token type";
832 }
833 $self->{state} = 'data';
834 ## reconsume
835
836 !!!emit ($self->{current_token}); # start tag or end tag
837
838 redo A;
839 } else {
840 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
841 ## Stay in the state
842 !!!next-input-character;
843 redo A;
844 }
845 } elsif ($self->{state} eq 'attribute value (single-quoted)') {
846 if ($self->{next_input_character} == 0x0027) { # '
847 $self->{state} = 'before attribute name';
848 !!!next-input-character;
849 redo A;
850 } elsif ($self->{next_input_character} == 0x0026) { # &
851 $self->{last_attribute_value_state} = 'attribute value (single-quoted)';
852 $self->{state} = 'entity in attribute value';
853 !!!next-input-character;
854 redo A;
855 } elsif ($self->{next_input_character} == -1) {
856 !!!parse-error (type => 'unclosed attribute value');
857 if ($self->{current_token}->{type} eq 'start tag') {
858 $self->{current_token}->{first_start_tag}
859 = not defined $self->{last_emitted_start_tag_name};
860 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
861 } elsif ($self->{current_token}->{type} eq 'end tag') {
862 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
863 if ($self->{current_token}->{attributes}) {
864 !!!parse-error (type => 'end tag attribute');
865 }
866 } else {
867 die "$0: $self->{current_token}->{type}: Unknown token type";
868 }
869 $self->{state} = 'data';
870 ## reconsume
871
872 !!!emit ($self->{current_token}); # start tag or end tag
873
874 redo A;
875 } else {
876 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
877 ## Stay in the state
878 !!!next-input-character;
879 redo A;
880 }
881 } elsif ($self->{state} eq 'attribute value (unquoted)') {
882 if ($self->{next_input_character} == 0x0009 or # HT
883 $self->{next_input_character} == 0x000A or # LF
884 $self->{next_input_character} == 0x000B or # VT
885 $self->{next_input_character} == 0x000C or # FF
886 $self->{next_input_character} == 0x0020) { # SP
887 $self->{state} = 'before attribute name';
888 !!!next-input-character;
889 redo A;
890 } elsif ($self->{next_input_character} == 0x0026) { # &
891 $self->{last_attribute_value_state} = 'attribute value (unquoted)';
892 $self->{state} = 'entity in attribute value';
893 !!!next-input-character;
894 redo A;
895 } elsif ($self->{next_input_character} == 0x003E) { # >
896 if ($self->{current_token}->{type} eq 'start tag') {
897 $self->{current_token}->{first_start_tag}
898 = not defined $self->{last_emitted_start_tag_name};
899 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
900 } elsif ($self->{current_token}->{type} eq 'end tag') {
901 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
902 if ($self->{current_token}->{attributes}) {
903 !!!parse-error (type => 'end tag attribute');
904 }
905 } else {
906 die "$0: $self->{current_token}->{type}: Unknown token type";
907 }
908 $self->{state} = 'data';
909 !!!next-input-character;
910
911 !!!emit ($self->{current_token}); # start tag or end tag
912
913 redo A;
914 } elsif ($self->{next_input_character} == -1) {
915 !!!parse-error (type => 'unclosed tag');
916 if ($self->{current_token}->{type} eq 'start tag') {
917 $self->{current_token}->{first_start_tag}
918 = not defined $self->{last_emitted_start_tag_name};
919 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
920 } elsif ($self->{current_token}->{type} eq 'end tag') {
921 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
922 if ($self->{current_token}->{attributes}) {
923 !!!parse-error (type => 'end tag attribute');
924 }
925 } else {
926 die "$0: $self->{current_token}->{type}: Unknown token type";
927 }
928 $self->{state} = 'data';
929 ## reconsume
930
931 !!!emit ($self->{current_token}); # start tag or end tag
932
933 redo A;
934 } else {
935 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
936 ## Stay in the state
937 !!!next-input-character;
938 redo A;
939 }
940 } elsif ($self->{state} eq 'entity in attribute value') {
941 my $token = $self->_tokenize_attempt_to_consume_an_entity (1);
942
943 unless (defined $token) {
944 $self->{current_attribute}->{value} .= '&';
945 } else {
946 $self->{current_attribute}->{value} .= $token->{data};
947 ## ISSUE: spec says "append the returned character token to the current attribute's value"
948 }
949
950 $self->{state} = $self->{last_attribute_value_state};
951 # next-input-character is already done
952 redo A;
953 } elsif ($self->{state} eq 'bogus comment') {
954 ## (only happen if PCDATA state)
955
956 my $token = {type => 'comment', data => ''};
957
958 BC: {
959 if ($self->{next_input_character} == 0x003E) { # >
960 $self->{state} = 'data';
961 !!!next-input-character;
962
963 !!!emit ($token);
964
965 redo A;
966 } elsif ($self->{next_input_character} == -1) {
967 $self->{state} = 'data';
968 ## reconsume
969
970 !!!emit ($token);
971
972 redo A;
973 } else {
974 $token->{data} .= chr ($self->{next_input_character});
975 !!!next-input-character;
976 redo BC;
977 }
978 } # BC
979 } elsif ($self->{state} eq 'markup declaration open') {
980 ## (only happen if PCDATA state)
981
982 my @next_char;
983 push @next_char, $self->{next_input_character};
984
985 if ($self->{next_input_character} == 0x002D) { # -
986 !!!next-input-character;
987 push @next_char, $self->{next_input_character};
988 if ($self->{next_input_character} == 0x002D) { # -
989 $self->{current_token} = {type => 'comment', data => ''};
990 $self->{state} = 'comment start';
991 !!!next-input-character;
992 redo A;
993 }
994 } elsif ($self->{next_input_character} == 0x0044 or # D
995 $self->{next_input_character} == 0x0064) { # d
996 !!!next-input-character;
997 push @next_char, $self->{next_input_character};
998 if ($self->{next_input_character} == 0x004F or # O
999 $self->{next_input_character} == 0x006F) { # o
1000 !!!next-input-character;
1001 push @next_char, $self->{next_input_character};
1002 if ($self->{next_input_character} == 0x0043 or # C
1003 $self->{next_input_character} == 0x0063) { # c
1004 !!!next-input-character;
1005 push @next_char, $self->{next_input_character};
1006 if ($self->{next_input_character} == 0x0054 or # T
1007 $self->{next_input_character} == 0x0074) { # t
1008 !!!next-input-character;
1009 push @next_char, $self->{next_input_character};
1010 if ($self->{next_input_character} == 0x0059 or # Y
1011 $self->{next_input_character} == 0x0079) { # y
1012 !!!next-input-character;
1013 push @next_char, $self->{next_input_character};
1014 if ($self->{next_input_character} == 0x0050 or # P
1015 $self->{next_input_character} == 0x0070) { # p
1016 !!!next-input-character;
1017 push @next_char, $self->{next_input_character};
1018 if ($self->{next_input_character} == 0x0045 or # E
1019 $self->{next_input_character} == 0x0065) { # e
1020 ## ISSUE: What a stupid code this is!
1021 $self->{state} = 'DOCTYPE';
1022 !!!next-input-character;
1023 redo A;
1024 }
1025 }
1026 }
1027 }
1028 }
1029 }
1030 }
1031
1032 !!!parse-error (type => 'bogus comment');
1033 $self->{next_input_character} = shift @next_char;
1034 !!!back-next-input-character (@next_char);
1035 $self->{state} = 'bogus comment';
1036 redo A;
1037
1038 ## ISSUE: typos in spec: chacacters, is is a parse error
1039 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1040 } elsif ($self->{state} eq 'comment start') {
1041 if ($self->{next_input_character} == 0x002D) { # -
1042 $self->{state} = 'comment start dash';
1043 !!!next-input-character;
1044 redo A;
1045 } elsif ($self->{next_input_character} == 0x003E) { # >
1046 !!!parse-error (type => 'bogus comment');
1047 $self->{state} = 'data';
1048 !!!next-input-character;
1049
1050 !!!emit ($self->{current_token}); # comment
1051
1052 redo A;
1053 } elsif ($self->{next_input_character} == -1) {
1054 !!!parse-error (type => 'unclosed comment');
1055 $self->{state} = 'data';
1056 ## reconsume
1057
1058 !!!emit ($self->{current_token}); # comment
1059
1060 redo A;
1061 } else {
1062 $self->{current_token}->{data} # comment
1063 .= chr ($self->{next_input_character});
1064 $self->{state} = 'comment';
1065 !!!next-input-character;
1066 redo A;
1067 }
1068 } elsif ($self->{state} eq 'comment start dash') {
1069 if ($self->{next_input_character} == 0x002D) { # -
1070 $self->{state} = 'comment end';
1071 !!!next-input-character;
1072 redo A;
1073 } elsif ($self->{next_input_character} == 0x003E) { # >
1074 !!!parse-error (type => 'bogus comment');
1075 $self->{state} = 'data';
1076 !!!next-input-character;
1077
1078 !!!emit ($self->{current_token}); # comment
1079
1080 redo A;
1081 } elsif ($self->{next_input_character} == -1) {
1082 !!!parse-error (type => 'unclosed comment');
1083 $self->{state} = 'data';
1084 ## reconsume
1085
1086 !!!emit ($self->{current_token}); # comment
1087
1088 redo A;
1089 } else {
1090 $self->{current_token}->{data} # comment
1091 .= '-' . chr ($self->{next_input_character});
1092 $self->{state} = 'comment';
1093 !!!next-input-character;
1094 redo A;
1095 }
1096 } elsif ($self->{state} eq 'comment') {
1097 if ($self->{next_input_character} == 0x002D) { # -
1098 $self->{state} = 'comment end dash';
1099 !!!next-input-character;
1100 redo A;
1101 } elsif ($self->{next_input_character} == -1) {
1102 !!!parse-error (type => 'unclosed comment');
1103 $self->{state} = 'data';
1104 ## reconsume
1105
1106 !!!emit ($self->{current_token}); # comment
1107
1108 redo A;
1109 } else {
1110 $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1111 ## Stay in the state
1112 !!!next-input-character;
1113 redo A;
1114 }
1115 } elsif ($self->{state} eq 'comment end dash') {
1116 if ($self->{next_input_character} == 0x002D) { # -
1117 $self->{state} = 'comment end';
1118 !!!next-input-character;
1119 redo A;
1120 } elsif ($self->{next_input_character} == -1) {
1121 !!!parse-error (type => 'unclosed comment');
1122 $self->{state} = 'data';
1123 ## reconsume
1124
1125 !!!emit ($self->{current_token}); # comment
1126
1127 redo A;
1128 } else {
1129 $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1130 $self->{state} = 'comment';
1131 !!!next-input-character;
1132 redo A;
1133 }
1134 } elsif ($self->{state} eq 'comment end') {
1135 if ($self->{next_input_character} == 0x003E) { # >
1136 $self->{state} = 'data';
1137 !!!next-input-character;
1138
1139 !!!emit ($self->{current_token}); # comment
1140
1141 redo A;
1142 } elsif ($self->{next_input_character} == 0x002D) { # -
1143 !!!parse-error (type => 'dash in comment');
1144 $self->{current_token}->{data} .= '-'; # comment
1145 ## Stay in the state
1146 !!!next-input-character;
1147 redo A;
1148 } elsif ($self->{next_input_character} == -1) {
1149 !!!parse-error (type => 'unclosed comment');
1150 $self->{state} = 'data';
1151 ## reconsume
1152
1153 !!!emit ($self->{current_token}); # comment
1154
1155 redo A;
1156 } else {
1157 !!!parse-error (type => 'dash in comment');
1158 $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1159 $self->{state} = 'comment';
1160 !!!next-input-character;
1161 redo A;
1162 }
1163 } elsif ($self->{state} eq 'DOCTYPE') {
1164 if ($self->{next_input_character} == 0x0009 or # HT
1165 $self->{next_input_character} == 0x000A or # LF
1166 $self->{next_input_character} == 0x000B or # VT
1167 $self->{next_input_character} == 0x000C or # FF
1168 $self->{next_input_character} == 0x0020) { # SP
1169 $self->{state} = 'before DOCTYPE name';
1170 !!!next-input-character;
1171 redo A;
1172 } else {
1173 !!!parse-error (type => 'no space before DOCTYPE name');
1174 $self->{state} = 'before DOCTYPE name';
1175 ## reconsume
1176 redo A;
1177 }
1178 } elsif ($self->{state} eq 'before DOCTYPE name') {
1179 if ($self->{next_input_character} == 0x0009 or # HT
1180 $self->{next_input_character} == 0x000A or # LF
1181 $self->{next_input_character} == 0x000B or # VT
1182 $self->{next_input_character} == 0x000C or # FF
1183 $self->{next_input_character} == 0x0020) { # SP
1184 ## Stay in the state
1185 !!!next-input-character;
1186 redo A;
1187 } elsif ($self->{next_input_character} == 0x003E) { # >
1188 !!!parse-error (type => 'no DOCTYPE name');
1189 $self->{state} = 'data';
1190 !!!next-input-character;
1191
1192 !!!emit ({type => 'DOCTYPE'}); # incorrect
1193
1194 redo A;
1195 } elsif ($self->{next_input_character} == -1) {
1196 !!!parse-error (type => 'no DOCTYPE name');
1197 $self->{state} = 'data';
1198 ## reconsume
1199
1200 !!!emit ({type => 'DOCTYPE'}); # incorrect
1201
1202 redo A;
1203 } else {
1204 $self->{current_token}
1205 = {type => 'DOCTYPE',
1206 name => chr ($self->{next_input_character}),
1207 correct => 1};
1208 ## ISSUE: "Set the token's name name to the" in the spec
1209 $self->{state} = 'DOCTYPE name';
1210 !!!next-input-character;
1211 redo A;
1212 }
1213 } elsif ($self->{state} eq 'DOCTYPE name') {
1214 ## ISSUE: Redundant "First," in the spec.
1215 if ($self->{next_input_character} == 0x0009 or # HT
1216 $self->{next_input_character} == 0x000A or # LF
1217 $self->{next_input_character} == 0x000B or # VT
1218 $self->{next_input_character} == 0x000C or # FF
1219 $self->{next_input_character} == 0x0020) { # SP
1220 $self->{state} = 'after DOCTYPE name';
1221 !!!next-input-character;
1222 redo A;
1223 } elsif ($self->{next_input_character} == 0x003E) { # >
1224 $self->{state} = 'data';
1225 !!!next-input-character;
1226
1227 !!!emit ($self->{current_token}); # DOCTYPE
1228
1229 redo A;
1230 } elsif ($self->{next_input_character} == -1) {
1231 !!!parse-error (type => 'unclosed DOCTYPE');
1232 $self->{state} = 'data';
1233 ## reconsume
1234
1235 delete $self->{current_token}->{correct};
1236 !!!emit ($self->{current_token}); # DOCTYPE
1237
1238 redo A;
1239 } else {
1240 $self->{current_token}->{name}
1241 .= chr ($self->{next_input_character}); # DOCTYPE
1242 ## Stay in the state
1243 !!!next-input-character;
1244 redo A;
1245 }
1246 } elsif ($self->{state} eq 'after DOCTYPE name') {
1247 if ($self->{next_input_character} == 0x0009 or # HT
1248 $self->{next_input_character} == 0x000A or # LF
1249 $self->{next_input_character} == 0x000B or # VT
1250 $self->{next_input_character} == 0x000C or # FF
1251 $self->{next_input_character} == 0x0020) { # SP
1252 ## Stay in the state
1253 !!!next-input-character;
1254 redo A;
1255 } elsif ($self->{next_input_character} == 0x003E) { # >
1256 $self->{state} = 'data';
1257 !!!next-input-character;
1258
1259 !!!emit ($self->{current_token}); # DOCTYPE
1260
1261 redo A;
1262 } elsif ($self->{next_input_character} == -1) {
1263 !!!parse-error (type => 'unclosed DOCTYPE');
1264 $self->{state} = 'data';
1265 ## reconsume
1266
1267 delete $self->{current_token}->{correct};
1268 !!!emit ($self->{current_token}); # DOCTYPE
1269
1270 redo A;
1271 } elsif ($self->{next_input_character} == 0x0050 or # P
1272 $self->{next_input_character} == 0x0070) { # p
1273 !!!next-input-character;
1274 if ($self->{next_input_character} == 0x0055 or # U
1275 $self->{next_input_character} == 0x0075) { # u
1276 !!!next-input-character;
1277 if ($self->{next_input_character} == 0x0042 or # B
1278 $self->{next_input_character} == 0x0062) { # b
1279 !!!next-input-character;
1280 if ($self->{next_input_character} == 0x004C or # L
1281 $self->{next_input_character} == 0x006C) { # l
1282 !!!next-input-character;
1283 if ($self->{next_input_character} == 0x0049 or # I
1284 $self->{next_input_character} == 0x0069) { # i
1285 !!!next-input-character;
1286 if ($self->{next_input_character} == 0x0043 or # C
1287 $self->{next_input_character} == 0x0063) { # c
1288 $self->{state} = 'before DOCTYPE public identifier';
1289 !!!next-input-character;
1290 redo A;
1291 }
1292 }
1293 }
1294 }
1295 }
1296
1297 #
1298 } elsif ($self->{next_input_character} == 0x0053 or # S
1299 $self->{next_input_character} == 0x0073) { # s
1300 !!!next-input-character;
1301 if ($self->{next_input_character} == 0x0059 or # Y
1302 $self->{next_input_character} == 0x0079) { # y
1303 !!!next-input-character;
1304 if ($self->{next_input_character} == 0x0053 or # S
1305 $self->{next_input_character} == 0x0073) { # s
1306 !!!next-input-character;
1307 if ($self->{next_input_character} == 0x0054 or # T
1308 $self->{next_input_character} == 0x0074) { # t
1309 !!!next-input-character;
1310 if ($self->{next_input_character} == 0x0045 or # E
1311 $self->{next_input_character} == 0x0065) { # e
1312 !!!next-input-character;
1313 if ($self->{next_input_character} == 0x004D or # M
1314 $self->{next_input_character} == 0x006D) { # m
1315 $self->{state} = 'before DOCTYPE system identifier';
1316 !!!next-input-character;
1317 redo A;
1318 }
1319 }
1320 }
1321 }
1322 }
1323
1324 #
1325 } else {
1326 !!!next-input-character;
1327 #
1328 }
1329
1330 !!!parse-error (type => 'string after DOCTYPE name');
1331 $self->{state} = 'bogus DOCTYPE';
1332 # next-input-character is already done
1333 redo A;
1334 } elsif ($self->{state} eq 'before DOCTYPE public identifier') {
1335 if ({
1336 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1337 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1338 }->{$self->{next_input_character}}) {
1339 ## Stay in the state
1340 !!!next-input-character;
1341 redo A;
1342 } elsif ($self->{next_input_character} eq 0x0022) { # "
1343 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1344 $self->{state} = 'DOCTYPE public identifier (double-quoted)';
1345 !!!next-input-character;
1346 redo A;
1347 } elsif ($self->{next_input_character} eq 0x0027) { # '
1348 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1349 $self->{state} = 'DOCTYPE public identifier (single-quoted)';
1350 !!!next-input-character;
1351 redo A;
1352 } elsif ($self->{next_input_character} eq 0x003E) { # >
1353 !!!parse-error (type => 'no PUBLIC literal');
1354
1355 $self->{state} = 'data';
1356 !!!next-input-character;
1357
1358 delete $self->{current_token}->{correct};
1359 !!!emit ($self->{current_token}); # DOCTYPE
1360
1361 redo A;
1362 } elsif ($self->{next_input_character} == -1) {
1363 !!!parse-error (type => 'unclosed DOCTYPE');
1364
1365 $self->{state} = 'data';
1366 ## reconsume
1367
1368 delete $self->{current_token}->{correct};
1369 !!!emit ($self->{current_token}); # DOCTYPE
1370
1371 redo A;
1372 } else {
1373 !!!parse-error (type => 'string after PUBLIC');
1374 $self->{state} = 'bogus DOCTYPE';
1375 !!!next-input-character;
1376 redo A;
1377 }
1378 } elsif ($self->{state} eq 'DOCTYPE public identifier (double-quoted)') {
1379 if ($self->{next_input_character} == 0x0022) { # "
1380 $self->{state} = 'after DOCTYPE public identifier';
1381 !!!next-input-character;
1382 redo A;
1383 } elsif ($self->{next_input_character} == -1) {
1384 !!!parse-error (type => 'unclosed PUBLIC literal');
1385
1386 $self->{state} = 'data';
1387 ## reconsume
1388
1389 delete $self->{current_token}->{correct};
1390 !!!emit ($self->{current_token}); # DOCTYPE
1391
1392 redo A;
1393 } else {
1394 $self->{current_token}->{public_identifier} # DOCTYPE
1395 .= chr $self->{next_input_character};
1396 ## Stay in the state
1397 !!!next-input-character;
1398 redo A;
1399 }
1400 } elsif ($self->{state} eq 'DOCTYPE public identifier (single-quoted)') {
1401 if ($self->{next_input_character} == 0x0027) { # '
1402 $self->{state} = 'after DOCTYPE public identifier';
1403 !!!next-input-character;
1404 redo A;
1405 } elsif ($self->{next_input_character} == -1) {
1406 !!!parse-error (type => 'unclosed PUBLIC literal');
1407
1408 $self->{state} = 'data';
1409 ## reconsume
1410
1411 delete $self->{current_token}->{correct};
1412 !!!emit ($self->{current_token}); # DOCTYPE
1413
1414 redo A;
1415 } else {
1416 $self->{current_token}->{public_identifier} # DOCTYPE
1417 .= chr $self->{next_input_character};
1418 ## Stay in the state
1419 !!!next-input-character;
1420 redo A;
1421 }
1422 } elsif ($self->{state} eq 'after DOCTYPE public identifier') {
1423 if ({
1424 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1425 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1426 }->{$self->{next_input_character}}) {
1427 ## Stay in the state
1428 !!!next-input-character;
1429 redo A;
1430 } elsif ($self->{next_input_character} == 0x0022) { # "
1431 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1432 $self->{state} = 'DOCTYPE system identifier (double-quoted)';
1433 !!!next-input-character;
1434 redo A;
1435 } elsif ($self->{next_input_character} == 0x0027) { # '
1436 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1437 $self->{state} = 'DOCTYPE system identifier (single-quoted)';
1438 !!!next-input-character;
1439 redo A;
1440 } elsif ($self->{next_input_character} == 0x003E) { # >
1441 $self->{state} = 'data';
1442 !!!next-input-character;
1443
1444 !!!emit ($self->{current_token}); # DOCTYPE
1445
1446 redo A;
1447 } elsif ($self->{next_input_character} == -1) {
1448 !!!parse-error (type => 'unclosed DOCTYPE');
1449
1450 $self->{state} = 'data';
1451 ## reconsume
1452
1453 delete $self->{current_token}->{correct};
1454 !!!emit ($self->{current_token}); # DOCTYPE
1455
1456 redo A;
1457 } else {
1458 !!!parse-error (type => 'string after PUBLIC literal');
1459 $self->{state} = 'bogus DOCTYPE';
1460 !!!next-input-character;
1461 redo A;
1462 }
1463 } elsif ($self->{state} eq 'before DOCTYPE system identifier') {
1464 if ({
1465 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1466 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1467 }->{$self->{next_input_character}}) {
1468 ## Stay in the state
1469 !!!next-input-character;
1470 redo A;
1471 } elsif ($self->{next_input_character} == 0x0022) { # "
1472 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1473 $self->{state} = 'DOCTYPE system identifier (double-quoted)';
1474 !!!next-input-character;
1475 redo A;
1476 } elsif ($self->{next_input_character} == 0x0027) { # '
1477 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1478 $self->{state} = 'DOCTYPE system identifier (single-quoted)';
1479 !!!next-input-character;
1480 redo A;
1481 } elsif ($self->{next_input_character} == 0x003E) { # >
1482 !!!parse-error (type => 'no SYSTEM literal');
1483 $self->{state} = 'data';
1484 !!!next-input-character;
1485
1486 delete $self->{current_token}->{correct};
1487 !!!emit ($self->{current_token}); # DOCTYPE
1488
1489 redo A;
1490 } elsif ($self->{next_input_character} == -1) {
1491 !!!parse-error (type => 'unclosed DOCTYPE');
1492
1493 $self->{state} = 'data';
1494 ## reconsume
1495
1496 delete $self->{current_token}->{correct};
1497 !!!emit ($self->{current_token}); # DOCTYPE
1498
1499 redo A;
1500 } else {
1501 !!!parse-error (type => 'string after SYSTEM');
1502 $self->{state} = 'bogus DOCTYPE';
1503 !!!next-input-character;
1504 redo A;
1505 }
1506 } elsif ($self->{state} eq 'DOCTYPE system identifier (double-quoted)') {
1507 if ($self->{next_input_character} == 0x0022) { # "
1508 $self->{state} = 'after DOCTYPE system identifier';
1509 !!!next-input-character;
1510 redo A;
1511 } elsif ($self->{next_input_character} == -1) {
1512 !!!parse-error (type => 'unclosed SYSTEM literal');
1513
1514 $self->{state} = 'data';
1515 ## reconsume
1516
1517 delete $self->{current_token}->{correct};
1518 !!!emit ($self->{current_token}); # DOCTYPE
1519
1520 redo A;
1521 } else {
1522 $self->{current_token}->{system_identifier} # DOCTYPE
1523 .= chr $self->{next_input_character};
1524 ## Stay in the state
1525 !!!next-input-character;
1526 redo A;
1527 }
1528 } elsif ($self->{state} eq 'DOCTYPE system identifier (single-quoted)') {
1529 if ($self->{next_input_character} == 0x0027) { # '
1530 $self->{state} = 'after DOCTYPE system identifier';
1531 !!!next-input-character;
1532 redo A;
1533 } elsif ($self->{next_input_character} == -1) {
1534 !!!parse-error (type => 'unclosed SYSTEM literal');
1535
1536 $self->{state} = 'data';
1537 ## reconsume
1538
1539 delete $self->{current_token}->{correct};
1540 !!!emit ($self->{current_token}); # DOCTYPE
1541
1542 redo A;
1543 } else {
1544 $self->{current_token}->{system_identifier} # DOCTYPE
1545 .= chr $self->{next_input_character};
1546 ## Stay in the state
1547 !!!next-input-character;
1548 redo A;
1549 }
1550 } elsif ($self->{state} eq 'after DOCTYPE system identifier') {
1551 if ({
1552 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1553 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1554 }->{$self->{next_input_character}}) {
1555 ## Stay in the state
1556 !!!next-input-character;
1557 redo A;
1558 } elsif ($self->{next_input_character} == 0x003E) { # >
1559 $self->{state} = 'data';
1560 !!!next-input-character;
1561
1562 !!!emit ($self->{current_token}); # DOCTYPE
1563
1564 redo A;
1565 } elsif ($self->{next_input_character} == -1) {
1566 !!!parse-error (type => 'unclosed DOCTYPE');
1567
1568 $self->{state} = 'data';
1569 ## reconsume
1570
1571 delete $self->{current_token}->{correct};
1572 !!!emit ($self->{current_token}); # DOCTYPE
1573
1574 redo A;
1575 } else {
1576 !!!parse-error (type => 'string after SYSTEM literal');
1577 $self->{state} = 'bogus DOCTYPE';
1578 !!!next-input-character;
1579 redo A;
1580 }
1581 } elsif ($self->{state} eq 'bogus DOCTYPE') {
1582 if ($self->{next_input_character} == 0x003E) { # >
1583 $self->{state} = 'data';
1584 !!!next-input-character;
1585
1586 delete $self->{current_token}->{correct};
1587 !!!emit ($self->{current_token}); # DOCTYPE
1588
1589 redo A;
1590 } elsif ($self->{next_input_character} == -1) {
1591 !!!parse-error (type => 'unclosed DOCTYPE');
1592 $self->{state} = 'data';
1593 ## reconsume
1594
1595 delete $self->{current_token}->{correct};
1596 !!!emit ($self->{current_token}); # DOCTYPE
1597
1598 redo A;
1599 } else {
1600 ## Stay in the state
1601 !!!next-input-character;
1602 redo A;
1603 }
1604 } else {
1605 die "$0: $self->{state}: Unknown state";
1606 }
1607 } # A
1608
1609 die "$0: _get_next_token: unexpected case";
1610 } # _get_next_token
1611
## Attempt to consume a character reference ("entity").  On entry
## |$self->{next_input_character}| is the character to examine first;
## the tokenizer states that call this method have already moved past
## the "&".
##
## Arguments:
##   $self    - The parser object.
##   $in_attr - True if the reference occurs in an attribute value.  It
##              only matters when a named entity was matched without a
##              trailing ";" and further name characters followed it.
##
## Returns |undef| when no reference is recognized (white space, "<",
## "&", end of input, a "#" or "#x" not followed by a digit, or any
## character that cannot start a reference).  Otherwise returns a hash
## reference of the form |{type => 'character', data => $string}|.
##
## NOTE(review): The |!!!...| lines are macros expanded by a preprocessor
## that is not part of this file.  |!!!next-input-character| presumably
## advances |$self->{next_input_character}| by one character and
## |!!!back-next-input-character| presumably pushes characters back to
## the input -- confirm against the macro definitions.
1612 sub _tokenize_attempt_to_consume_an_entity ($$) {
1613 my ($self, $in_attr) = @_;
1614
## A character that can never start a reference, or -1 (treated as
## end of input elsewhere in the tokenizer): give up immediately.
1615 if ({
1616 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
1617 0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
1618 }->{$self->{next_input_character}}) {
1619 ## Don't consume
1620 ## No error
1621 return undef;
1622 } elsif ($self->{next_input_character} == 0x0023) { # #
## Numeric character reference.
1623 !!!next-input-character;
1624 if ($self->{next_input_character} == 0x0078 or # x
1625 $self->{next_input_character} == 0x0058) { # X
## Hexadecimal form.  |$code| stays undefined until the first
## hexadecimal digit has been seen.
1626 my $code;
## The bare block X is used as a loop: every pass remembers the
## character being left behind in |$x_char| (the "x"/"X" on the first
## pass), advances the input, and |redo|es while digits keep coming.
1627 X: {
1628 my $x_char = $self->{next_input_character};
1629 !!!next-input-character;
1630 if (0x0030 <= $self->{next_input_character} and
1631 $self->{next_input_character} <= 0x0039) { # 0..9
1632 $code ||= 0;
1633 $code *= 0x10;
1634 $code += $self->{next_input_character} - 0x0030;
1635 redo X;
1636 } elsif (0x0061 <= $self->{next_input_character} and
1637 $self->{next_input_character} <= 0x0066) { # a..f
1638 $code ||= 0;
1639 $code *= 0x10;
## "a" (0x61) - 0x60 + 9 == 10
1640 $code += $self->{next_input_character} - 0x0060 + 9;
1641 redo X;
1642 } elsif (0x0041 <= $self->{next_input_character} and
1643 $self->{next_input_character} <= 0x0046) { # A..F
1644 $code ||= 0;
1645 $code *= 0x10;
## "A" (0x41) - 0x40 + 9 == 10
1646 $code += $self->{next_input_character} - 0x0040 + 9;
1647 redo X;
1648 } elsif (not defined $code) { # no hexadecimal digit
## "&#x" with no digit: hand back the "x"/"X" and the current
## character and make "#" the next input character again, then
## report that nothing was consumed.
1649 !!!parse-error (type => 'bare hcro');
1650 !!!back-next-input-character ($x_char, $self->{next_input_character});
1651 $self->{next_input_character} = 0x0023; # #
1652 return undef;
1653 } elsif ($self->{next_input_character} == 0x003B) { # ;
1654 !!!next-input-character;
1655 } else {
## Digits not terminated by ";": an error, but the reference is
## still accepted.
1656 !!!parse-error (type => 'no refc');
1657 }
1658
## Replace code points that are not emitted as is: zero and
## surrogates become U+FFFD, values above U+10FFFF become U+FFFD,
## CR becomes LF, and 0x80..0x9F are mapped through
## |$c1_entity_char|.  Each replacement is reported as a parse error.
1659 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
1660 !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
1661 $code = 0xFFFD;
1662 } elsif ($code > 0x10FFFF) {
1663 !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
1664 $code = 0xFFFD;
1665 } elsif ($code == 0x000D) {
1666 !!!parse-error (type => 'CR character reference');
1667 $code = 0x000A;
1668 } elsif (0x80 <= $code and $code <= 0x9F) {
1669 !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
1670 $code = $c1_entity_char->{$code};
1671 }
1672
1673 return {type => 'character', data => chr $code};
1674 } # X
1675 } elsif (0x0030 <= $self->{next_input_character} and
1676 $self->{next_input_character} <= 0x0039) { # 0..9
## Decimal form: the first digit seeds |$code|, the loop below folds
## in the remaining digits.
1677 my $code = $self->{next_input_character} - 0x0030;
1678 !!!next-input-character;
1679
1680 while (0x0030 <= $self->{next_input_character} and
1681 $self->{next_input_character} <= 0x0039) { # 0..9
1682 $code *= 10;
1683 $code += $self->{next_input_character} - 0x0030;
1684
1685 !!!next-input-character;
1686 }
1687
1688 if ($self->{next_input_character} == 0x003B) { # ;
1689 !!!next-input-character;
1690 } else {
1691 !!!parse-error (type => 'no refc');
1692 }
1693
## Same code point replacements as in the hexadecimal branch.
1694 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
1695 !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
1696 $code = 0xFFFD;
1697 } elsif ($code > 0x10FFFF) {
1698 !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
1699 $code = 0xFFFD;
1700 } elsif ($code == 0x000D) {
1701 !!!parse-error (type => 'CR character reference');
1702 $code = 0x000A;
1703 } elsif (0x80 <= $code and $code <= 0x9F) {
1704 !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
1705 $code = $c1_entity_char->{$code};
1706 }
1707
1708 return {type => 'character', data => chr $code};
1709 } else {
## "&#" followed by neither "x"/"X" nor a digit: hand back the
## current character, make "#" the next input character again and
## report that nothing was consumed.
1710 !!!parse-error (type => 'bare nero');
1711 !!!back-next-input-character ($self->{next_input_character});
1712 $self->{next_input_character} = 0x0023; # #
1713 return undef;
1714 }
1715 } elsif ((0x0041 <= $self->{next_input_character} and
1716 $self->{next_input_character} <= 0x005A) or
1717 (0x0061 <= $self->{next_input_character} and
1718 $self->{next_input_character} <= 0x007A)) {
## Named reference; the first character is an ASCII letter.
1719 my $entity_name = chr $self->{next_input_character};
1720 !!!next-input-character;
1721
## |$entity_name| is the raw name read so far.  |$value| is the text
## to return: the raw name until a table entry matches, then the
## entry's value plus any characters read after the match.
## |$match| is 0 while nothing has matched, 1 for a match ended by
## ";", and negative for a match without ";": -1 right after the
## match, doubled for every further character that does not yield a
## new match.
1722 my $value = $entity_name;
1723 my $match = 0;
## NOTE(review): |$EntityChar| is presumably populated by
## Whatpm::_NamedEntityList, keyed by entity name -- confirm there.
1724 require Whatpm::_NamedEntityList;
1725 our $EntityChar;
1726
1727 while (length $entity_name < 10 and
1728 ## NOTE: Some number greater than the maximum length of entity name
1729 ((0x0041 <= $self->{next_input_character} and # A
1730 $self->{next_input_character} <= 0x005A) or # Z
1731 (0x0061 <= $self->{next_input_character} and # a
1732 $self->{next_input_character} <= 0x007A) or # z
1733 (0x0030 <= $self->{next_input_character} and # 0
1734 $self->{next_input_character} <= 0x0039) or # 9
1735 $self->{next_input_character} == 0x003B)) { # ;
1736 $entity_name .= chr $self->{next_input_character};
1737 if (defined $EntityChar->{$entity_name}) {
1738 if ($self->{next_input_character} == 0x003B) { # ;
## Properly terminated reference: consume the ";" and stop.
1739 $value = $EntityChar->{$entity_name};
1740 $match = 1;
1741 !!!next-input-character;
1742 last;
1743 } else {
## Known name without ";": remember it but keep reading, since a
## longer name may still match.
1744 $value = $EntityChar->{$entity_name};
1745 $match = -1;
1746 !!!next-input-character;
1747 }
1748 } else {
## No entry for the name read so far: keep the character as text.
## Doubling leaves 0 at 0 and pushes a negative |$match| below -1.
1749 $value .= chr $self->{next_input_character};
1750 $match *= 2;
1751 !!!next-input-character;
1752 }
1753 }
1754
1755 if ($match > 0) {
1756 return {type => 'character', data => $value};
1757 } elsif ($match < 0) {
1758 !!!parse-error (type => 'no refc');
1759 if ($in_attr and $match < -1) {
## In an attribute value, a ";"-less match followed by more name
## characters is not expanded; the raw text is returned instead.
1760 return {type => 'character', data => '&'.$entity_name};
1761 } else {
1762 return {type => 'character', data => $value};
1763 }
1764 } else {
## Nothing matched: return the characters read, with the "&"
## restored, as character data.
1765 !!!parse-error (type => 'bare ero');
1766 ## NOTE: No characters are consumed in the spec.
1767 return {type => 'character', data => '&'.$value};
1768 }
1769 } else {
1770 ## no characters are consumed
1771 !!!parse-error (type => 'bare ero');
1772 return undef;
1773 }
1774 } # _tokenize_attempt_to_consume_an_entity
1775
## Prepare the target document for the tree construction stage.
##
## The caller MUST have stored the document object in
## |$self->{document}| before invoking this method.  The value of the
## final method call is returned implicitly.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  ## Switch strict error checking off on the document.
  $document->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on

  ## Flag the document as HTML. # MUST
  $document->manakai_is_html (1);
} # _initialize_tree_constructor
1784
## Counterpart of |_initialize_tree_constructor|: switch strict error
## checking back on for |$self->{document}|.  The value of that method
## call is returned implicitly.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;

  $self->{document}->strict_error_checking (1);
  ## TODO: Turn mutation events on
} # _terminate_tree_constructor
1790
1791 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
1792
1793 { # tree construction stage
1794 my $token;
1795
## Entry point of the tree construction stage: reset the per-parse
## state kept on the parser object, then run the three tree
## construction steps in order.
sub _construct_tree ($) {
  my $self = shift;

  ## NOTE: It is not defined when an interactive UA renders
  ## $self->{document} to the user, nor when it begins accepting
  ## user input.

  ## NOTE: "Append a character" means: collect it and all subsequent
  ## consecutive characters and insert one Text node whose data is
  ## the concatenation of all those characters. # MUST

  ## NOTE(review): Preprocessor macro; presumably it loads the first
  ## token into the file-scoped |$token| -- confirm against the macro
  ## definition.
  !!!next-token;

  ## Start from a clean state.
  $self->{insertion_mode} = 'before head';
  $self->{open_elements} = [];
  undef $self->{$_} for qw(form_element head_element inner_html_node);

  ## The value of the last call is returned implicitly.
  $self->_tree_construction_initial; # MUST
  $self->_tree_construction_root_element;
  $self->_tree_construction_main;
} # _construct_tree
1819
## The "initial" phase of the tree construction stage.
##
## Tokens are read through the file-scoped |$token|:
##
##   - DOCTYPE: a DocumentType node is appended to the document, the
##     compatibility mode is derived from the public and system
##     identifiers, the next token is fetched and the phase ends.
##   - start tag, end tag, end-of-file: 'no DOCTYPE' parse error, the
##     document is put in quirks mode and the phase ends with the token
##     left for reprocessing.
##   - character: leading white space is dropped; if nothing remains,
##     the next token is fetched and the phase continues, otherwise the
##     token is handled like a start tag.
##   - comment: a Comment node is appended to the document and the
##     phase continues with the next token.
##
## Any other token type is fatal.
sub _tree_construction_initial ($) {
  my $self = shift;
  INITIAL: {
    if ($token->{type} eq 'DOCTYPE') {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!parse-error (type => 'not HTML5');
      } elsif ($doctype_name ne 'HTML') {
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5');
      }

      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if (not $token->{correct} or $doctype_name ne 'HTML') {
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## The public identifier is upper-cased before it is looked up,
        ## so every literal it is compared with below MUST be written
        ## in upper case as well.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        if ({
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
          "-//IETF//DTD HTML 2.0//EN" => 1,
          "-//IETF//DTD HTML 2.1E//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN//" => 1,
          "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
          "-//IETF//DTD HTML 3.2//EN" => 1,
          "-//IETF//DTD HTML 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT//EN" => 1,
          "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
          "-//IETF//DTD HTML//EN" => 1,
          "-//IETF//DTD HTML//EN//2.0" => 1,
          "-//IETF//DTD HTML//EN//3.0" => 1,
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
          "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
          "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
          "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
          "-//W3C//DTD HTML 3.2//EN" => 1,
          "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
          "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
          ## The misspelt "EXPERIMETNAL" key is kept for backward
          ## compatibility; the correctly spelt identifier is what
          ## real documents carry.
          "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
          "-//W3C//DTD W3 HTML//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
          "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
          "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
          "HTML" => 1,
        }->{$pubid}) {
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
          ## HTML 4.01 Frameset/Transitional: quirks mode when the
          ## system identifier is missing, limited quirks mode when it
          ## is present.
          if (defined $token->{system_identifier}) {
            $self->{document}->manakai_compat_mode ('limited quirks');
          } else {
            $self->{document}->manakai_compat_mode ('quirks');
          }
        } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
          $self->{document}->manakai_compat_mode ('limited quirks');
        }
      }
      if (defined $token->{system_identifier}) {
        ## The system identifier is lower-cased before the comparison.
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          $self->{document}->manakai_compat_mode ('quirks');
        }
      }

      ## Go to the root element phase.
      !!!next-token;
      return;
    } elsif ({
              'start tag' => 1,
              'end tag' => 1,
              'end-of-file' => 1,
             }->{$token->{type}}) {
      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} eq 'character') {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          ## Stay in the phase
          !!!next-token;
          redo INITIAL;
        }
      }

      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} eq 'comment') {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the phase.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token";
    }
  } # INITIAL
} # _tree_construction_initial
1987
## The "root element" phase of the tree construction stage.
##
## DOCTYPE tokens (parse error), comments (appended to the document)
## and white space only character tokens are consumed here.  The first
## other token makes an |html| element be created, appended to the
## document and pushed onto the stack of open elements; that token is
## left in |$token| for reprocessing by the main phase.
sub _tree_construction_root_element ($) {
  my ($self) = @_;

  ## Token types that trigger creation of the root element as they are.
  my %starts_root = (
    'start tag' => 1,
    'end tag' => 1,
    'end-of-file' => 1,
  );

  ROOT: {
    my $type = $token->{type};

    if ($type eq 'DOCTYPE') {
      !!!parse-error (type => 'in html:#DOCTYPE');
      ## Ignore the token
      ## Stay in the phase
      !!!next-token;
      redo ROOT;
    }

    if ($type eq 'comment') {
      my $doc = $self->{document};
      $doc->append_child ($doc->create_comment ($token->{data}));
      ## Stay in the phase
      !!!next-token;
      redo ROOT;
    }

    if ($type eq 'character') {
      ## Leading white space is ignored; a token made of white space
      ## only is dropped entirely.
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)// and # \x0D
          not length $token->{data}) {
        ## Stay in the phase
        !!!next-token;
        redo ROOT;
      }
    } elsif (not $starts_root{$type}) {
      die "$0: $token->{type}: Unknown token";
    }
    ## ISSUE: There is an issue in the spec (start tag, end tag and
    ## end-of-file tokens).

    my $root_element;
    !!!create-element ($root_element, 'html');
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, 'html'];
    ## reprocess
    return; ## Go to the main phase.
  } # ROOT
} # _tree_construction_root_element
2033
## Reset |$self->{insertion_mode}| appropriately: walk the stack of
## open elements from its last entry towards its first one and select
## the insertion mode that corresponds to the first decisive element
## name found on the way.
sub _reset_insertion_mode ($) {
  my ($self) = @_;

  ## Element name -> insertion mode (Steps 4..13)
  my %mode_of = (
    select => 'in select',
    td => 'in cell',
    th => 'in cell',
    tr => 'in row',
    tbody => 'in table body',
    thead => 'in table body',
    tfoot => 'in table body',
    caption => 'in caption',
    colgroup => 'in column group',
    table => 'in table',
    head => 'in body', # not in head!
    body => 'in body',
    frameset => 'in frameset',
  );

  ## Step 1
  my $is_last;

  ## Step 2
  my $depth = -1;
  my $current = $self->{open_elements}->[$depth];

  while (1) {
    ## Step 3
    ## ISSUE: Oops! "If node is the first node in the stack of open
    ## elements, then set last to true. If the context element of the
    ## HTML fragment parsing algorithm is neither a td element nor a
    ## th element, then set node to the context element. (fragment case)":
    ## The second "if" is in the scope of the first "if"!?
    if ($self->{open_elements}->[0]->[0] eq $current->[0]) {
      $is_last = 1;
      my $context = $self->{inner_html_node};
      if (defined $context and
          not ($context->[1] eq 'td' or $context->[1] eq 'th')) {
        $current = $context;
      }
    }

    ## Step 4..13
    my $mode = $mode_of{$current->[1]};
    if (defined $mode) {
      $self->{insertion_mode} = $mode;
      return;
    }

    ## Step 14
    if ($current->[1] eq 'html') {
      $self->{insertion_mode}
          = defined $self->{head_element} ? 'after head' : 'before head';
      return;
    }

    ## Step 15
    if ($is_last) {
      $self->{insertion_mode} = 'in body';
      return;
    }

    ## Step 16
    $depth--;
    $current = $self->{open_elements}->[$depth];

    ## Step 17: back to Step 3
  }
} # _reset_insertion_mode
2102
2103 sub _tree_construction_main ($) {
2104 my $self = shift;
2105
2106 my $active_formatting_elements = [];
2107
## Reconstruct the active formatting elements.
##
## Argument: a code reference that is called with each freshly cloned
## element and is expected to insert it into the tree.
##
## Starting from the last entry of the list of active formatting
## elements, the list is rewound up to the entry that follows the
## closest marker or the closest entry still on the stack of open
## elements (or up to the first entry).  From there to the end of the
## list every entry is cloned, inserted, pushed onto the stack of open
## elements and replaced in the list by its clone.
my $reconstruct_active_formatting_elements = sub { # MUST
  my ($insert_node) = @_;
  my $afe = $active_formatting_elements;

  ## Step 1
  return if @$afe == 0;

  ## True if the given node is on the stack of open elements.
  my $on_stack = sub {
    my $item = $_[0];
    for my $open (@{$self->{open_elements}}) {
      return 1 if $item eq $open->[0];
    }
    return 0;
  };

  ## Step 3
  my $pos = -1;
  my $current = $afe->[$pos];

  ## Step 2
  return if $current->[0] eq '#marker';
  return if $on_stack->($current->[0]);

  ## Steps 4..7: rewind
  REWIND: while (not $afe->[0]->[0] eq $current->[0]) { # Step 4
    ## Step 5
    $pos--;
    $current = $afe->[$pos];

    ## Step 6
    next REWIND
        unless $current->[0] eq '#marker' or $on_stack->($current->[0]);

    ## Step 7
    $pos++;
    $current = $afe->[$pos];
    last REWIND;
  } # REWIND

  ## Steps 8..11: clone forward until the last entry has been replaced
  while (1) {
    ## Step 8
    my $copy = [$current->[0]->clone_node (0), $current->[1]];

    ## Step 9
    $insert_node->($copy->[0]);
    push @{$self->{open_elements}}, $copy;

    ## Step 10
    $afe->[$pos] = $self->{open_elements}->[-1];

    ## Step 11
    last if $copy->[0] eq $afe->[-1]->[0];

    ## Step 7'
    $pos++;
    $current = $afe->[$pos];
  }
}; # $reconstruct_active_formatting_elements
2178
## Remove the last marker of the list of active formatting elements
## together with every entry that follows it.  The list is left
## untouched if it contains no marker.
my $clear_up_to_marker = sub {
  my $idx = $#$active_formatting_elements;
  while ($idx >= 0) {
    if ($active_formatting_elements->[$idx]->[0] eq '#marker') {
      splice @$active_formatting_elements, $idx;
      return;
    }
    $idx--;
  }
}; # $clear_up_to_marker
2187
## Generic CDATA/RCDATA element parsing.
##
## Arguments: the content model flag to tokenize the element content
## with (CDATA_CONTENT_MODEL or RCDATA_CONTENT_MODEL) and a code
## reference that inserts the new element into the tree.
##
## The element is created from the current start tag token, the
## character tokens that follow are gathered into one Text node, and
## the token that ends the run is consumed (with a parse error unless
## it is the matching end tag).
my $parse_rcdata = sub ($$) {
  my ($content_model_flag, $insert) = @_;

  ## Step 1
  my $start_tag_name = $token->{tag_name};
  my $element;
  !!!create-element ($element, $start_tag_name, $token->{attributes});

  ## Step 2
  $insert->($element); # /context node/->append_child ($element)

  ## Step 3
  $self->{content_model} = $content_model_flag; # CDATA or RCDATA
  delete $self->{escape}; # MUST

  ## Step 4
  my @chunks;
  !!!next-token;
  until ($token->{type} ne 'character') { # or until stop tokenizing
    push @chunks, $token->{data};
    !!!next-token;
  }
  my $data = join '', @chunks;

  ## Step 5
  if (length $data) {
    my $text_node = $self->{document}->create_text_node ($data);
    $element->append_child ($text_node);
  }

  ## Step 6
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## Step 7
  unless ($token->{type} eq 'end tag' and
          $token->{tag_name} eq $start_tag_name) {
    if ($content_model_flag == CDATA_CONTENT_MODEL) {
      !!!parse-error (type => 'in CDATA:#'.$token->{type});
    } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
      !!!parse-error (type => 'in RCDATA:#'.$token->{type});
    } else {
      die "$0: $content_model_flag in parse_rcdata";
    }
  } ## Otherwise, ignore the token
  !!!next-token;
}; # $parse_rcdata
2232
## Process a |script| start tag token.
##
## Argument: a code reference that inserts the new element into the
## tree.  The |script| element is created from the current token, the
## character tokens that follow become its text content, and the
## element is inserted unless |$self->{inner_html_node}| is defined.
my $script_start_tag = sub ($) {
  my ($insert) = @_;

  my $script_el;
  !!!create-element ($script_el, 'script', $token->{attributes});
  ## TODO: mark as "parser-inserted"

  $self->{content_model} = CDATA_CONTENT_MODEL;
  delete $self->{escape}; # MUST

  ## Gather the script text.
  my @chunks;
  !!!next-token;
  until ($token->{type} ne 'character') {
    push @chunks, $token->{data};
    !!!next-token;
  } # stop if non-character token or tokenizer stops tokenising
  my $script_text = join '', @chunks;
  $script_el->manakai_append_text ($script_text) if length $script_text;

  $self->{content_model} = PCDATA_CONTENT_MODEL;

  unless ($token->{type} eq 'end tag' and
          $token->{tag_name} eq 'script') {
    !!!parse-error (type => 'in CDATA:#'.$token->{type});
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  } ## Otherwise, ignore the token

  unless (defined $self->{inner_html_node}) {
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    $insert->($script_el);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  } else {
    ## TODO: mark as "already executed"
  }

  !!!next-token;
}; # $script_start_tag
2278
## Process an end tag token of a formatting element (the "adoption
## agency" steps).
##
## Argument: the tag name of the end tag.
##
## The closure returns after having fetched the next token, either
## because the end tag is unmatched or because there is no furthest
## block left (Step 3).  Otherwise it rearranges the tree, the stack of
## open elements and the list of active formatting elements, and
## restarts from Step 1.
my $formatting_end_tag = sub {
  my $tag_name = shift;

  FET: {
    ## Step 1
    ## Find the last entry with the given tag name in the list of
    ## active formatting elements, not looking beyond the last marker.
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!parse-error (type => 'unmatched end tag:'.$tag_name);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ({
                table => 1, caption => 1, td => 1, th => 1,
                button => 1, marquee => 1, object => 1, html => 1,
               }->{$node->[1]}) {
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      ## The formatting element is not in the stack of open elements:
      ## remove that very entry from the list.  It is not necessarily
      ## the last entry, so |pop| would be wrong here.
      !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
    }

    ## Step 2
    ## Scanning from the last entry up to the formatting element, the
    ## last match kept is the one closest to the formatting element.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not $formatting_category->{$node->[1]} and
          #not $phrasing_category->{$node->[1]} and
          ($special_category->{$node->[1]} or
           $scoping_category->{$node->[1]})) {
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## The bookmark is represented by the node of the entry that
    ## precedes it in the list of active formatting elements.
    ## NOTE(review): if the formatting element is the first entry of
    ## the list, index -1 selects the last entry - confirm that this
    ## is intended.
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
            ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      ## A node that is not in the list of active formatting elements
      ## is dropped from the stack of open elements.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      if ($node->[0]->has_child_nodes ()) {
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    $common_ancestor_node->[0]->append_child ($last_node->[0]);

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## The child list is copied first, as children are moved away
    ## while it is iterated over.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## Remove the formatting element from the list and insert the
    ## clone just after the bookmark entry.  |$i| is decremented when
    ## the removal takes place below the bookmark entry already found.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$active_formatting_elements, $_, 1;
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    ## Remove the formatting element from the stack of open elements
    ## and *insert* the clone just after the furthest block.  The
    ## |splice| length MUST be 0: a length of 1 would overwrite the
    ## entry that follows the furthest block, if any.
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        splice @{$self->{open_elements}}, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        $i = $_;
      }
    } # OE
    splice @{$self->{open_elements}}, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
2461
## Append the given node to the current node, i.e. the node of the
## last entry of the stack of open elements.
my $insert_to_current = sub {
  my ($child) = @_;
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($child);
}; # $insert_to_current
2465
## Insert the given node, foster parenting it if necessary.
##
## Unless the current node is a |table|, |tbody|, |tfoot|, |thead| or
## |tr| element, the node is simply appended to the current node.
## Otherwise the last |table| entry of the stack of open elements is
## looked up: if that element has a parent whose node type is 1, the
## node is inserted into the parent just before the table; if not, it
## is appended to the element of the preceding stack entry.  Without
## any |table| entry, it is appended to the element of the first
## stack entry.
my $insert_to_foster = sub {
  my ($child) = @_;
  my $stack = $self->{open_elements};

  my %table_part = (
    table => 1, tbody => 1, tfoot => 1,
    thead => 1, tr => 1,
  );
  unless ($table_part{$stack->[-1]->[1]}) {
    return $stack->[-1]->[0]->append_child ($child);
  }

  # MUST
  my ($foster_parent_element, $next_sibling);
  for my $idx (reverse 0..$#$stack) {
    next unless $stack->[$idx]->[1] eq 'table';

    my $table = $stack->[$idx]->[0];
    my $parent = $table->parent_node;
    if (defined $parent and $parent->node_type == 1) {
      $foster_parent_element = $parent;
      $next_sibling = $table;
    } else {
      $foster_parent_element = $stack->[$idx - 1]->[0];
    }
    last;
  }
  $foster_parent_element = $stack->[0]->[0]
      unless defined $foster_parent_element;
  return $foster_parent_element->insert_before ($child, $next_sibling);
}; # $insert_to_foster
2496
2497 my $in_body = sub {
2498 my $insert = shift;
2499 if ($token->{type} eq 'start tag') {
2500 if ($token->{tag_name} eq 'script') {
2501 ## NOTE: This is an "as if in head" code clone
2502 $script_start_tag->($insert);
2503 return;
2504 } elsif ($token->{tag_name} eq 'style') {
2505 ## NOTE: This is an "as if in head" code clone
2506 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
2507 return;
2508 } elsif ({
2509 base => 1, link => 1,
2510 }->{$token->{tag_name}}) {
2511 ## NOTE: This is an "as if in head" code clone, only "-t" differs
2512 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2513 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2514 !!!next-token;
2515 return;
2516 } elsif ($token->{tag_name} eq 'meta') {
2517 ## NOTE: This is an "as if in head" code clone, only "-t" differs
2518 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2519 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2520
2521 unless ($self->{confident}) {
2522 my $charset;
2523 if ($token->{attributes}->{charset}) { ## TODO: And if supported
2524 $charset = $token->{attributes}->{charset}->{value};
2525 }
2526 if ($token->{attributes}->{'http-equiv'}) {
2527 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
2528 if ($token->{attributes}->{'http-equiv'}->{value}
2529 =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
2530 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
2531 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
2532 $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
2533 } ## TODO: And if supported
2534 }
2535 ## TODO: Change the encoding
2536 }
2537
2538 !!!next-token;
2539 return;
2540 } elsif ($token->{tag_name} eq 'title') {
2541 !!!parse-error (type => 'in body:title');
2542 ## NOTE: This is an "as if in head" code clone
2543 $parse_rcdata->(RCDATA_CONTENT_MODEL, sub {
2544 if (defined $self->{head_element}) {
2545 $self->{head_element}->append_child ($_[0]);
2546 } else {
2547 $insert->($_[0]);
2548 }
2549 });
2550 return;
2551 } elsif ($token->{tag_name} eq 'body') {
2552 !!!parse-error (type => 'in body:body');
2553
2554 if (@{$self->{open_elements}} == 1 or
2555 $self->{open_elements}->[1]->[1] ne 'body') {
2556 ## Ignore the token
2557 } else {
2558 my $body_el = $self->{open_elements}->[1]->[0];
2559 for my $attr_name (keys %{$token->{attributes}}) {
2560 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
2561 $body_el->set_attribute_ns
2562 (undef, [undef, $attr_name],
2563 $token->{attributes}->{$attr_name}->{value});
2564 }
2565 }
2566 }
2567 !!!next-token;
2568 return;
2569 } elsif ({
2570 address => 1, blockquote => 1, center => 1, dir => 1,
2571 div => 1, dl => 1, fieldset => 1, listing => 1,
2572 menu => 1, ol => 1, p => 1, ul => 1,
2573 pre => 1,
2574 }->{$token->{tag_name}}) {
2575 ## has a p element in scope
2576 INSCOPE: for (reverse @{$self->{open_elements}}) {
2577 if ($_->[1] eq 'p') {
2578 !!!back-token;
2579 $token = {type => 'end tag', tag_name => 'p'};
2580 return;
2581 } elsif ({
2582 table => 1, caption => 1, td => 1, th => 1,
2583 button => 1, marquee => 1, object => 1, html => 1,
2584 }->{$_->[1]}) {
2585 last INSCOPE;
2586 }
2587 } # INSCOPE
2588
2589 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2590 if ($token->{tag_name} eq 'pre') {
2591 !!!next-token;
2592 if ($token->{type} eq 'character') {
2593 $token->{data} =~ s/^\x0A//;
2594 unless (length $token->{data}) {
2595 !!!next-token;
2596 }
2597 }
2598 } else {
2599 !!!next-token;
2600 }
2601 return;
2602 } elsif ($token->{tag_name} eq 'form') {
2603 if (defined $self->{form_element}) {
2604 !!!parse-error (type => 'in form:form');
2605 ## Ignore the token
2606 !!!next-token;
2607 return;
2608 } else {
2609 ## has a p element in scope
2610 INSCOPE: for (reverse @{$self->{open_elements}}) {
2611 if ($_->[1] eq 'p') {
2612 !!!back-token;
2613 $token = {type => 'end tag', tag_name => 'p'};
2614 return;
2615 } elsif ({
2616 table => 1, caption => 1, td => 1, th => 1,
2617 button => 1, marquee => 1, object => 1, html => 1,
2618 }->{$_->[1]}) {
2619 last INSCOPE;
2620 }
2621 } # INSCOPE
2622
2623 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2624 $self->{form_element} = $self->{open_elements}->[-1]->[0];
2625 !!!next-token;
2626 return;
2627 }
2628 } elsif ($token->{tag_name} eq 'li') {
2629 ## has a p element in scope
2630 INSCOPE: for (reverse @{$self->{open_elements}}) {
2631 if ($_->[1] eq 'p') {
2632 !!!back-token;
2633 $token = {type => 'end tag', tag_name => 'p'};
2634 return;
2635 } elsif ({
2636 table => 1, caption => 1, td => 1, th => 1,
2637 button => 1, marquee => 1, object => 1, html => 1,
2638 }->{$_->[1]}) {
2639 last INSCOPE;
2640 }
2641 } # INSCOPE
2642
2643 ## Step 1
2644 my $i = -1;
2645 my $node = $self->{open_elements}->[$i];
2646 LI: {
2647 ## Step 2
2648 if ($node->[1] eq 'li') {
2649 if ($i != -1) {
2650 !!!parse-error (type => 'end tag missing:'.
2651 $self->{open_elements}->[-1]->[1]);
2652 }
2653 splice @{$self->{open_elements}}, $i;
2654 last LI;
2655 }
2656
2657 ## Step 3
2658 if (not $formatting_category->{$node->[1]} and
2659 #not $phrasing_category->{$node->[1]} and
2660 ($special_category->{$node->[1]} or
2661 $scoping_category->{$node->[1]}) and
2662 $node->[1] ne 'address' and $node->[1] ne 'div') {
2663 last LI;
2664 }
2665
2666 ## Step 4
2667 $i--;
2668 $node = $self->{open_elements}->[$i];
2669 redo LI;
2670 } # LI
2671
2672 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2673 !!!next-token;
2674 return;
2675 } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
2676 ## has a p element in scope
2677 INSCOPE: for (reverse @{$self->{open_elements}}) {
2678 if ($_->[1] eq 'p') {
2679 !!!back-token;
2680 $token = {type => 'end tag', tag_name => 'p'};
2681 return;
2682 } elsif ({
2683 table => 1, caption => 1, td => 1, th => 1,
2684 button => 1, marquee => 1, object => 1, html => 1,
2685 }->{$_->[1]}) {
2686 last INSCOPE;
2687 }
2688 } # INSCOPE
2689
2690 ## Step 1
2691 my $i = -1;
2692 my $node = $self->{open_elements}->[$i];
2693 LI: {
2694 ## Step 2
2695 if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
2696 if ($i != -1) {
2697 !!!parse-error (type => 'end tag missing:'.
2698 $self->{open_elements}->[-1]->[1]);
2699 }
2700 splice @{$self->{open_elements}}, $i;
2701 last LI;
2702 }
2703
2704 ## Step 3
2705 if (not $formatting_category->{$node->[1]} and
2706 #not $phrasing_category->{$node->[1]} and
2707 ($special_category->{$node->[1]} or
2708 $scoping_category->{$node->[1]}) and
2709 $node->[1] ne 'address' and $node->[1] ne 'div') {
2710 last LI;
2711 }
2712
2713 ## Step 4
2714 $i--;
2715 $node = $self->{open_elements}->[$i];
2716 redo LI;
2717 } # LI
2718
2719 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2720 !!!next-token;
2721 return;
2722 } elsif ($token->{tag_name} eq 'plaintext') {
2723 ## has a p element in scope
2724 INSCOPE: for (reverse @{$self->{open_elements}}) {
2725 if ($_->[1] eq 'p') {
2726 !!!back-token;
2727 $token = {type => 'end tag', tag_name => 'p'};
2728 return;
2729 } elsif ({
2730 table => 1, caption => 1, td => 1, th => 1,
2731 button => 1, marquee => 1, object => 1, html => 1,
2732 }->{$_->[1]}) {
2733 last INSCOPE;
2734 }
2735 } # INSCOPE
2736
2737 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2738
2739 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
2740
2741 !!!next-token;
2742 return;
2743 } elsif ({
2744 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
2745 }->{$token->{tag_name}}) {
2746 ## has a p element in scope
2747 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2748 my $node = $self->{open_elements}->[$_];
2749 if ($node->[1] eq 'p') {
2750 !!!back-token;
2751 $token = {type => 'end tag', tag_name => 'p'};
2752 return;
2753 } elsif ({
2754 table => 1, caption => 1, td => 1, th => 1,
2755 button => 1, marquee => 1, object => 1, html => 1,
2756 }->{$node->[1]}) {
2757 last INSCOPE;
2758 }
2759 } # INSCOPE
2760
2761 ## NOTE: See <http://html5.org/tools/web-apps-tracker?from=925&to=926>
2762 ## has an element in scope
2763 #my $i;
2764 #INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2765 # my $node = $self->{open_elements}->[$_];
2766 # if ({
2767 # h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
2768 # }->{$node->[1]}) {
2769 # $i = $_;
2770 # last INSCOPE;
2771 # } elsif ({
2772 # table => 1, caption => 1, td => 1, th => 1,
2773 # button => 1, marquee => 1, object => 1, html => 1,
2774 # }->{$node->[1]}) {
2775 # last INSCOPE;
2776 # }
2777 #} # INSCOPE
2778 #
2779 #if (defined $i) {
2780 # !!! parse-error (type => 'in hn:hn');
2781 # splice @{$self->{open_elements}}, $i;
2782 #}
2783
2784 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2785
2786 !!!next-token;
2787 return;
2788 } elsif ($token->{tag_name} eq 'a') {
2789 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
2790 my $node = $active_formatting_elements->[$i];
2791 if ($node->[1] eq 'a') {
2792 !!!parse-error (type => 'in a:a');
2793
2794 !!!back-token;
2795 $token = {type => 'end tag', tag_name => 'a'};
2796 $formatting_end_tag->($token->{tag_name});
2797
2798 AFE2: for (reverse 0..$#$active_formatting_elements) {
2799 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
2800 splice @$active_formatting_elements, $_, 1;
2801 last AFE2;
2802 }
2803 } # AFE2
2804 OE: for (reverse 0..$#{$self->{open_elements}}) {
2805 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
2806 splice @{$self->{open_elements}}, $_, 1;
2807 last OE;
2808 }
2809 } # OE
2810 last AFE;
2811 } elsif ($node->[0] eq '#marker') {
2812 last AFE;
2813 }
2814 } # AFE
2815
2816 $reconstruct_active_formatting_elements->($insert_to_current);
2817
2818 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2819 push @$active_formatting_elements, $self->{open_elements}->[-1];
2820
2821 !!!next-token;
2822 return;
2823 } elsif ({
2824 b => 1, big => 1, em => 1, font => 1, i => 1,
2825 s => 1, small => 1, strile => 1,
2826 strong => 1, tt => 1, u => 1,
2827 }->{$token->{tag_name}}) {
2828 $reconstruct_active_formatting_elements->($insert_to_current);
2829
2830 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2831 push @$active_formatting_elements, $self->{open_elements}->[-1];
2832
2833 !!!next-token;
2834 return;
2835 } elsif ($token->{tag_name} eq 'nobr') {
2836 $reconstruct_active_formatting_elements->($insert_to_current);
2837
2838 ## has a |nobr| element in scope
2839 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2840 my $node = $self->{open_elements}->[$_];
2841 if ($node->[1] eq 'nobr') {
2842 !!!parse-error (type => 'not closed:nobr');
2843 !!!back-token;
2844 $token = {type => 'end tag', tag_name => 'nobr'};
2845 return;
2846 } elsif ({
2847 table => 1, caption => 1, td => 1, th => 1,
2848 button => 1, marquee => 1, object => 1, html => 1,
2849 }->{$node->[1]}) {
2850 last INSCOPE;
2851 }
2852 } # INSCOPE
2853
2854 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2855 push @$active_formatting_elements, $self->{open_elements}->[-1];
2856
2857 !!!next-token;
2858 return;
2859 } elsif ($token->{tag_name} eq 'button') {
2860 ## has a button element in scope
2861 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2862 my $node = $self->{open_elements}->[$_];
2863 if ($node->[1] eq 'button') {
2864 !!!parse-error (type => 'in button:button');
2865 !!!back-token;
2866 $token = {type => 'end tag', tag_name => 'button'};
2867 return;
2868 } elsif ({
2869 table => 1, caption => 1, td => 1, th => 1,
2870 button => 1, marquee => 1, object => 1, html => 1,
2871 }->{$node->[1]}) {
2872 last INSCOPE;
2873 }
2874 } # INSCOPE
2875
2876 $reconstruct_active_formatting_elements->($insert_to_current);
2877
2878 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2879 push @$active_formatting_elements, ['#marker', ''];
2880
2881 !!!next-token;
2882 return;
2883 } elsif ($token->{tag_name} eq 'marquee' or
2884 $token->{tag_name} eq 'object') {
2885 $reconstruct_active_formatting_elements->($insert_to_current);
2886
2887 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2888 push @$active_formatting_elements, ['#marker', ''];
2889
2890 !!!next-token;
2891 return;
2892 } elsif ($token->{tag_name} eq 'xmp') {
2893 $reconstruct_active_formatting_elements->($insert_to_current);
2894 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
2895 return;
2896 } elsif ($token->{tag_name} eq 'table') {
2897 ## has a p element in scope
2898 INSCOPE: for (reverse @{$self->{open_elements}}) {
2899 if ($_->[1] eq 'p') {
2900 !!!back-token;
2901 $token = {type => 'end tag', tag_name => 'p'};
2902 return;
2903 } elsif ({
2904 table => 1, caption => 1, td => 1, th => 1,
2905 button => 1, marquee => 1, object => 1, html => 1,
2906 }->{$_->[1]}) {
2907 last INSCOPE;
2908 }
2909 } # INSCOPE
2910
2911 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2912
2913 $self->{insertion_mode} = 'in table';
2914
2915 !!!next-token;
2916 return;
2917 } elsif ({
2918 area => 1, basefont => 1, bgsound => 1, br => 1,
2919 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
2920 image => 1,
2921 }->{$token->{tag_name}}) {
2922 if ($token->{tag_name} eq 'image') {
2923 !!!parse-error (type => 'image');
2924 $token->{tag_name} = 'img';
2925 }
2926
2927 ## NOTE: There is an "as if <br>" code clone.
2928 $reconstruct_active_formatting_elements->($insert_to_current);
2929
2930 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2931 pop @{$self->{open_elements}};
2932
2933 !!!next-token;
2934 return;
2935 } elsif ($token->{tag_name} eq 'hr') {
2936 ## has a p element in scope
2937 INSCOPE: for (reverse @{$self->{open_elements}}) {
2938 if ($_->[1] eq 'p') {
2939 !!!back-token;
2940 $token = {type => 'end tag', tag_name => 'p'};
2941 return;
2942 } elsif ({
2943 table => 1, caption => 1, td => 1, th => 1,
2944 button => 1, marquee => 1, object => 1, html => 1,
2945 }->{$_->[1]}) {
2946 last INSCOPE;
2947 }
2948 } # INSCOPE
2949
2950 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2951 pop @{$self->{open_elements}};
2952
2953 !!!next-token;
2954 return;
2955 } elsif ($token->{tag_name} eq 'input') {
2956 $reconstruct_active_formatting_elements->($insert_to_current);
2957
2958 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2959 ## TODO: associate with $self->{form_element} if defined
2960 pop @{$self->{open_elements}};
2961
2962 !!!next-token;
2963 return;
2964 } elsif ($token->{tag_name} eq 'isindex') {
2965 !!!parse-error (type => 'isindex');
2966
2967 if (defined $self->{form_element}) {
2968 ## Ignore the token
2969 !!!next-token;
2970 return;
2971 } else {
2972 my $at = $token->{attributes};
2973 my $form_attrs;
2974 $form_attrs->{action} = $at->{action} if $at->{action};
2975 my $prompt_attr = $at->{prompt};
2976 $at->{name} = {name => 'name', value => 'isindex'};
2977 delete $at->{action};
2978 delete $at->{prompt};
2979 my @tokens = (
2980 {type => 'start tag', tag_name => 'form',
2981 attributes => $form_attrs},
2982 {type => 'start tag', tag_name => 'hr'},
2983 {type => 'start tag', tag_name => 'p'},
2984 {type => 'start tag', tag_name => 'label'},
2985 );
2986 if ($prompt_attr) {
2987 push @tokens, {type => 'character', data => $prompt_attr->{value}};
2988 } else {
2989 push @tokens, {type => 'character',
2990 data => 'This is a searchable index. Insert your search keywords here: '}; # SHOULD
2991 ## TODO: make this configurable
2992 }
2993 push @tokens,
2994 {type => 'start tag', tag_name => 'input', attributes => $at},
2995 #{type => 'character', data => ''}, # SHOULD
2996 {type => 'end tag', tag_name => 'label'},
2997 {type => 'end tag', tag_name => 'p'},
2998 {type => 'start tag', tag_name => 'hr'},
2999 {type => 'end tag', tag_name => 'form'};
3000 $token = shift @tokens;
3001 !!!back-token (@tokens);
3002 return;
3003 }
3004 } elsif ($token->{tag_name} eq 'textarea') {
3005 my $tag_name = $token->{tag_name};
3006 my $el;
3007 !!!create-element ($el, $token->{tag_name}, $token->{attributes});
3008
3009 ## TODO: $self->{form_element} if defined
3010 $self->{content_model} = RCDATA_CONTENT_MODEL;
3011 delete $self->{escape}; # MUST
3012
3013 $insert->($el);
3014
3015 my $text = '';
3016 !!!next-token;
3017 if ($token->{type} eq 'character') {
3018 $token->{data} =~ s/^\x0A//;
3019 unless (length $token->{data}) {
3020 !!!next-token;
3021 }
3022 }
3023 while ($token->{type} eq 'character') {
3024 $text .= $token->{data};
3025 !!!next-token;
3026 }
3027 if (length $text) {
3028 $el->manakai_append_text ($text);
3029 }
3030
3031 $self->{content_model} = PCDATA_CONTENT_MODEL;
3032
3033 if ($token->{type} eq 'end tag' and
3034 $token->{tag_name} eq $tag_name) {
3035 ## Ignore the token
3036 } else {
3037 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
3038 }
3039 !!!next-token;
3040 return;
3041 } elsif ({
3042 iframe => 1,
3043 noembed => 1,
3044 noframes => 1,
3045 noscript => 0, ## TODO: 1 if scripting is enabled
3046 }->{$token->{tag_name}}) {
3047 ## NOTE: There are two "as if in body" code clones.
3048 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
3049 return;
3050 } elsif ($token->{tag_name} eq 'select') {
3051 $reconstruct_active_formatting_elements->($insert_to_current);
3052
3053 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
3054
3055 $self->{insertion_mode} = 'in select';
3056 !!!next-token;
3057 return;
3058 } elsif ({
3059 caption => 1, col => 1, colgroup => 1, frame => 1,
3060 frameset => 1, head => 1, option => 1, optgroup => 1,
3061 tbody => 1, td => 1, tfoot => 1, th => 1,
3062 thead => 1, tr => 1,
3063 }->{$token->{tag_name}}) {
3064 !!!parse-error (type => 'in body:'.$token->{tag_name});
3065 ## Ignore the token
3066 !!!next-token;
3067 return;
3068
3069 ## ISSUE: An issue on HTML5 new elements in the spec.
3070 } else {
3071 $reconstruct_active_formatting_elements->($insert_to_current);
3072
3073 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
3074
3075 !!!next-token;
3076 return;
3077 }
3078 } elsif ($token->{type} eq 'end tag') {
3079 if ($token->{tag_name} eq 'body') {
3080 if (@{$self->{open_elements}} > 1 and
3081 $self->{open_elements}->[1]->[1] eq 'body') {
3082 for (@{$self->{open_elements}}) {
3083 unless ({
3084 dd => 1, dt => 1, li => 1, p => 1, td => 1,
3085 th => 1, tr => 1, body => 1, html => 1,
3086 tbody => 1, tfoot => 1, thead => 1,
3087 }->{$_->[1]}) {
3088 !!!parse-error (type => 'not closed:'.$_->[1]);
3089 }
3090 }
3091
3092 $self->{insertion_mode} = 'after body';
3093 !!!next-token;
3094 return;
3095 } else {
3096 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3097 ## Ignore the token
3098 !!!next-token;
3099 return;
3100 }
3101 } elsif ($token->{tag_name} eq 'html') {
3102 if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
3103 ## ISSUE: There is an issue in the spec.
3104 if ($self->{open_elements}->[-1]->[1] ne 'body') {
3105 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[1]->[1]);
3106 }
3107 $self->{insertion_mode} = 'after body';
3108 ## reprocess
3109 return;
3110 } else {
3111 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3112 ## Ignore the token
3113 !!!next-token;
3114 return;
3115 }
3116 } elsif ({
3117 address => 1, blockquote => 1, center => 1, dir => 1,
3118 div => 1, dl => 1, fieldset => 1, listing => 1,
3119 menu => 1, ol => 1, pre => 1, ul => 1,
3120 p => 1,
3121 dd => 1, dt => 1, li => 1,
3122 button => 1, marquee => 1, object => 1,
3123 }->{$token->{tag_name}}) {
3124 ## has an element in scope
3125 my $i;
3126 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3127 my $node = $self->{open_elements}->[$_];
3128 if ($node->[1] eq $token->{tag_name}) {
3129 ## generate implied end tags
3130 if ({
3131 dd => ($token->{tag_name} ne 'dd'),
3132 dt => ($token->{tag_name} ne 'dt'),
3133 li => ($token->{tag_name} ne 'li'),
3134 p => ($token->{tag_name} ne 'p'),
3135 td => 1, th => 1, tr => 1,
3136 tbody => 1, tfoot=> 1, thead => 1,
3137 }->{$self->{open_elements}->[-1]->[1]}) {
3138 !!!back-token;
3139 $token = {type => 'end tag',
3140 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3141 return;
3142 }
3143 $i = $_;
3144 last INSCOPE unless $token->{tag_name} eq 'p';
3145 } elsif ({
3146 table => 1, caption => 1, td => 1, th => 1,
3147 button => 1, marquee => 1, object => 1, html => 1,
3148 }->{$node->[1]}) {
3149 last INSCOPE;
3150 }
3151 } # INSCOPE
3152
3153 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3154 if (defined $i) {
3155 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3156 } else {
3157 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3158 }
3159 }
3160
3161 if (defined $i) {
3162 splice @{$self->{open_elements}}, $i;
3163 } elsif ($token->{tag_name} eq 'p') {
3164 ## As if <p>, then reprocess the current token
3165 my $el;
3166 !!!create-element ($el, 'p');
3167 $insert->($el);
3168 }
3169 $clear_up_to_marker->()
3170 if {
3171 button => 1, marquee => 1, object => 1,
3172 }->{$token->{tag_name}};
3173 !!!next-token;
3174 return;
3175 } elsif ($token->{tag_name} eq 'form') {
3176 ## has an element in scope
3177 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3178 my $node = $self->{open_elements}->[$_];
3179 if ($node->[1] eq $token->{tag_name}) {
3180 ## generate implied end tags
3181 if ({
3182 dd => 1, dt => 1, li => 1, p => 1,
3183 td => 1, th => 1, tr => 1,
3184 tbody => 1, tfoot=> 1, thead => 1,
3185 }->{$self->{open_elements}->[-1]->[1]}) {
3186 !!!back-token;
3187 $token = {type => 'end tag',
3188 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3189 return;
3190 }
3191 last INSCOPE;
3192 } elsif ({
3193 table => 1, caption => 1, td => 1, th => 1,
3194 button => 1, marquee => 1, object => 1, html => 1,
3195 }->{$node->[1]}) {
3196 last INSCOPE;
3197 }
3198 } # INSCOPE
3199
3200 if ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
3201 pop @{$self->{open_elements}};
3202 } else {
3203 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3204 }
3205
3206 undef $self->{form_element};
3207 !!!next-token;
3208 return;
3209 } elsif ({
3210 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
3211 }->{$token->{tag_name}}) {
3212 ## has an element in scope
3213 my $i;
3214 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3215 my $node = $self->{open_elements}->[$_];
3216 if ({
3217 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
3218 }->{$node->[1]}) {
3219 ## generate implied end tags
3220 if ({
3221 dd => 1, dt => 1, li => 1, p => 1,
3222 td => 1, th => 1, tr => 1,
3223 tbody => 1, tfoot=> 1, thead => 1,
3224 }->{$self->{open_elements}->[-1]->[1]}) {
3225 !!!back-token;
3226 $token = {type => 'end tag',
3227 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3228 return;
3229 }
3230 $i = $_;
3231 last INSCOPE;
3232 } elsif ({
3233 table => 1, caption => 1, td => 1, th => 1,
3234 button => 1, marquee => 1, object => 1, html => 1,
3235 }->{$node->[1]}) {
3236 last INSCOPE;
3237 }
3238 } # INSCOPE
3239
3240 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3241 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3242 }
3243
3244 splice @{$self->{open_elements}}, $i if defined $i;
3245 !!!next-token;
3246 return;
3247 } elsif ({
3248 a => 1,
3249 b => 1, big => 1, em => 1, font => 1, i => 1,
3250 nobr => 1, s => 1, small => 1, strile => 1,
3251 strong => 1, tt => 1, u => 1,
3252 }->{$token->{tag_name}}) {
3253 $formatting_end_tag->($token->{tag_name});
3254 return;
3255 } elsif ($token->{tag_name} eq 'br') {
3256 !!!parse-error (type => 'unmatched end tag:br');
3257
3258 ## As if <br>
3259 $reconstruct_active_formatting_elements->($insert_to_current);
3260
3261 my $el;
3262 !!!create-element ($el, 'br');
3263 $insert->($el);
3264
3265 ## Ignore the token.
3266 !!!next-token;
3267 return;
3268 } elsif ({
3269 caption => 1, col => 1, colgroup => 1, frame => 1,
3270 frameset => 1, head => 1, option => 1, optgroup => 1,
3271 tbody => 1, td => 1, tfoot => 1, th => 1,
3272 thead => 1, tr => 1,
3273 area => 1, basefont => 1, bgsound => 1,
3274 embed => 1, hr => 1, iframe => 1, image => 1,
3275 img => 1, input => 1, isindex => 1, noembed => 1,
3276 noframes => 1, param => 1, select => 1, spacer => 1,
3277 table => 1, textarea => 1, wbr => 1,
3278 noscript => 0, ## TODO: if scripting is enabled
3279 }->{$token->{tag_name}}) {
3280 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3281 ## Ignore the token
3282 !!!next-token;
3283 return;
3284
3285 ## ISSUE: Issue on HTML5 new elements in spec
3286
3287 } else {
3288 ## Step 1
3289 my $node_i = -1;
3290 my $node = $self->{open_elements}->[$node_i];
3291
3292 ## Step 2
3293 S2: {
3294 if ($node->[1] eq $token->{tag_name}) {
3295 ## Step 1
3296 ## generate implied end tags
3297 if ({
3298 dd => 1, dt => 1, li => 1, p => 1,
3299 td => 1, th => 1, tr => 1,
3300 tbody => 1, tfoot=> 1, thead => 1,
3301 }->{$self->{open_elements}->[-1]->[1]}) {
3302 !!!back-token;
3303 $token = {type => 'end tag',
3304 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3305 return;
3306 }
3307
3308 ## Step 2
3309 if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
3310 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3311 }
3312
3313 ## Step 3
3314 splice @{$self->{open_elements}}, $node_i;
3315
3316 !!!next-token;
3317 last S2;
3318 } else {
3319 ## Step 3
3320 if (not $formatting_category->{$node->[1]} and
3321 #not $phrasing_category->{$node->[1]} and
3322 ($special_category->{$node->[1]} or
3323 $scoping_category->{$node->[1]})) {
3324 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3325 ## Ignore the token
3326 !!!next-token;
3327 last S2;
3328 }
3329 }
3330
3331 ## Step 4
3332 $node_i--;
3333 $node = $self->{open_elements}->[$node_i];
3334
3335 ## Step 5;
3336 redo S2;
3337 } # S2
3338 return;
3339 }
3340 }
3341 }; # $in_body
3342
3343 B: {
3344 if ($token->{type} eq 'DOCTYPE') {
3345 !!!parse-error (type => 'DOCTYPE in the middle');
3346 ## Ignore the token
3347 ## Stay in the phase
3348 !!!next-token;
3349 redo B;
3350 } elsif ($token->{type} eq 'end-of-file') {
3351 if ($self->{insertion_mode} eq 'after html body' or
3352 $self->{insertion_mode} eq 'after html frameset') {
3353 #
3354 } else {
3355 ## Generate implied end tags
3356 if ({
3357 dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
3358 tbody => 1, tfoot=> 1, thead => 1,
3359 }->{$self->{open_elements}->[-1]->[1]}) {
3360 !!!back-token;
3361 $token = {type => 'end tag', tag_name => $self->{open_elements}->[-1]->[1]};
3362 redo B;
3363 }
3364
3365 if (@{$self->{open_elements}} > 2 or
3366 (@{$self->{open_elements}} == 2 and $self->{open_elements}->[1]->[1] ne 'body')) {
3367 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3368 } elsif (defined $self->{inner_html_node} and
3369 @{$self->{open_elements}} > 1 and
3370 $self->{open_elements}->[1]->[1] ne 'body') {
3371 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3372 }
3373
3374 ## ISSUE: There is an issue in the spec.
3375 }
3376
3377 ## Stop parsing
3378 last B;
3379 } elsif ($token->{type} eq 'start tag' and
3380 $token->{tag_name} eq 'html') {
3381 if ($self->{insertion_mode} eq 'after html body') {
3382 ## Turn into the main phase
3383 !!!parse-error (type => 'after html:html');
3384 $self->{insertion_mode} = 'after body';
3385 } elsif ($self->{insertion_mode} eq 'after html frameset') {
3386 ## Turn into the main phase
3387 !!!parse-error (type => 'after html:html');
3388 $self->{insertion_mode} = 'after frameset';
3389 }
3390
3391 ## ISSUE: "aa<html>" is not a parse error.
3392 ## ISSUE: "<html>" in fragment is not a parse error.
3393 unless ($token->{first_start_tag}) {
3394 !!!parse-error (type => 'not first start tag');
3395 }
3396 my $top_el = $self->{open_elements}->[0]->[0];
3397 for my $attr_name (keys %{$token->{attributes}}) {
3398 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
3399 $top_el->set_attribute_ns
3400 (undef, [undef, $attr_name],
3401 $token->{attributes}->{$attr_name}->{value});
3402 }
3403 }
3404 !!!next-token;
3405 redo B;
3406 } elsif ($token->{type} eq 'comment') {
3407 my $comment = $self->{document}->create_comment ($token->{data});
3408 if ($self->{insertion_mode} eq 'after html body' or
3409 $self->{insertion_mode} eq 'after html frameset') {
3410 $self->{document}->append_child ($comment);
3411 } elsif ($self->{insertion_mode} eq 'after body') {
3412 $self->{open_elements}->[0]->[0]->append_child ($comment);
3413 } else {
3414 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3415 }
3416 !!!next-token;
3417 redo B;
3418 } elsif ($self->{insertion_mode} eq 'in head' or
3419 $self->{insertion_mode} eq 'in head noscript' or
3420 $self->{insertion_mode} eq 'after head' or
3421 $self->{insertion_mode} eq 'before head') {
3422 if ($token->{type} eq 'character') {
3423 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3424 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3425 unless (length $token->{data}) {
3426 !!!next-token;
3427 redo B;
3428 }
3429 }
3430
3431 if ($self->{insertion_mode} eq 'before head') {
3432 ## As if <head>
3433 !!!create-element ($self->{head_element}, 'head');
3434 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3435 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3436
3437 ## Reprocess in the "in head" insertion mode...
3438 pop @{$self->{open_elements}};
3439
3440 ## Reprocess in the "after head" insertion mode...
3441 } elsif ($self->{insertion_mode} eq 'in head noscript') {
3442 ## As if </noscript>
3443 pop @{$self->{open_elements}};
3444 !!!parse-error (type => 'in noscript:#character');
3445
3446 ## Reprocess in the "in head" insertion mode...
3447 ## As if </head>
3448 pop @{$self->{open_elements}};
3449
3450 ## Reprocess in the "after head" insertion mode...
3451 } elsif ($self->{insertion_mode} eq 'in head') {
3452 pop @{$self->{open_elements}};
3453
3454 ## Reprocess in the "after head" insertion mode...
3455 }
3456
3457 ## "after head" insertion mode
3458 ## As if <body>
3459 !!!insert-element ('body');
3460 $self->{insertion_mode} = 'in body';
3461 ## reprocess
3462 redo B;
3463 } elsif ($token->{type} eq 'start tag') {
3464 if ($token->{tag_name} eq 'head') {
3465 if ($self->{insertion_mode} eq 'before head') {
3466 !!!create-element ($self->{head_element}, $token->{tag_name}, $token->{attributes});
3467 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3468 push @{$self->{open_elements}}, [$self->{head_element}, $token->{tag_name}];
3469 $self->{insertion_mode} = 'in head';
3470 !!!next-token;
3471 redo B;
3472 } elsif ($self->{insertion_mode} ne 'after head') {
3473 !!!parse-error (type => 'in head:head'); # or in head noscript
3474 ## Ignore the token
3475 !!!next-token;
3476 redo B;
3477 } else {
3478 #
3479 }
3480 } elsif ($self->{insertion_mode} eq 'before head') {
3481 ## As if <head>
3482 !!!create-element ($self->{head_element}, 'head');
3483 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3484 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3485
3486 $self->{insertion_mode} = 'in head';
3487 ## Reprocess in the "in head" insertion mode...
3488 }
3489
3490 if ($token->{tag_name} eq 'base') {
3491 if ($self->{insertion_mode} eq 'in head noscript') {
3492 ## As if </noscript>
3493 pop @{$self->{open_elements}};
3494 !!!parse-error (type => 'in noscript:base');
3495
3496 $self->{insertion_mode} = 'in head';
3497 ## Reprocess in the "in head" insertion mode...
3498 }
3499
3500 ## NOTE: There is a "as if in head" code clone.
3501 if ($self->{insertion_mode} eq 'after head') {
3502 !!!parse-error (type => 'after head:'.$token->{tag_name});
3503 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3504 }
3505 !!!insert-element ($token->{tag_name}, $token->{attributes});
3506 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3507 pop @{$self->{open_elements}}
3508 if $self->{insertion_mode} eq 'after head';
3509 !!!next-token;
3510 redo B;
3511 } elsif ($token->{tag_name} eq 'link') {
3512 ## NOTE: There is a "as if in head" code clone.
3513 if ($self->{insertion_mode} eq 'after head') {
3514 !!!parse-error (type => 'after head:'.$token->{tag_name});
3515 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3516 }
3517 !!!insert-element ($token->{tag_name}, $token->{attributes});
3518 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3519 pop @{$self->{open_elements}}
3520 if $self->{insertion_mode} eq 'after head';
3521 !!!next-token;
3522 redo B;
3523 } elsif ($token->{tag_name} eq 'meta') {
3524 ## NOTE: There is a "as if in head" code clone.
3525 if ($self->{insertion_mode} eq 'after head') {
3526 !!!parse-error (type => 'after head:'.$token->{tag_name});
3527 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3528 }
3529 !!!insert-element ($token->{tag_name}, $token->{attributes});
3530 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3531
3532 unless ($self->{confident}) {
3533 my $charset;
3534 if ($token->{attributes}->{charset}) { ## TODO: And if supported
3535 $charset = $token->{attributes}->{charset}->{value};
3536 }
3537 if ($token->{attributes}->{'http-equiv'}) {
3538 ## ISSUE: The algorithm name in the spec was incorrect, so it was not linked to its definition.
3539 if ($token->{attributes}->{'http-equiv'}->{value}
3540 =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
3541 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
3542 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
3543 $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
3544 } ## TODO: And if supported
3545 }
3546 ## TODO: Change the encoding
3547 }
3548
3549 ## TODO: Extracting |charset| from |meta|.
3550 pop @{$self->{open_elements}}
3551 if $self->{insertion_mode} eq 'after head';
3552 !!!next-token;
3553 redo B;
3554 } elsif ($token->{tag_name} eq 'title') {
3555 if ($self->{insertion_mode} eq 'in head noscript') {
3556 ## As if </noscript>
3557 pop @{$self->{open_elements}};
3558 !!!parse-error (type => 'in noscript:title');
3559
3560 $self->{insertion_mode} = 'in head';
3561 ## Reprocess in the "in head" insertion mode...
3562 } elsif ($self->{insertion_mode} eq 'after head') {
3563 !!!parse-error (type => 'after head:'.$token->{tag_name});
3564 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3565 }
3566
3567 ## NOTE: There is a "as if in head" code clone.
3568 my $parent = defined $self->{head_element} ? $self->{head_element}
3569 : $self->{open_elements}->[-1]->[0];
3570 $parse_rcdata->(RCDATA_CONTENT_MODEL,
3571 sub { $parent->append_child ($_[0]) });
3572 pop @{$self->{open_elements}}
3573 if $self->{insertion_mode} eq 'after head';
3574 redo B;
3575 } elsif ($token->{tag_name} eq 'style') {
3576 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
3577 ## insertion mode 'in head')
3578 ## NOTE: There is a "as if in head" code clone.
3579 if ($self->{insertion_mode} eq 'after head') {
3580 !!!parse-error (type => 'after head:'.$token->{tag_name});
3581 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3582 }
3583 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
3584 pop @{$self->{open_elements}}
3585 if $self->{insertion_mode} eq 'after head';
3586 redo B;
3587 } elsif ($token->{tag_name} eq 'noscript') {
3588 if ($self->{insertion_mode} eq 'in head') {
3589 ## NOTE: and scripting is disabled
3590 !!!insert-element ($token->{tag_name}, $token->{attributes});
3591 $self->{insertion_mode} = 'in head noscript';
3592 !!!next-token;
3593 redo B;
3594 } elsif ($self->{insertion_mode} eq 'in head noscript') {
3595 !!!parse-error (type => 'in noscript:noscript');
3596 ## Ignore the token
3597 !!!next-token;
3598 redo B;
3599 } else {
3600 #
3601 }
3602 } elsif ($token->{tag_name} eq 'script') {
3603 if ($self->{insertion_mode} eq 'in head noscript') {
3604 ## As if </noscript>
3605 pop @{$self->{open_elements}};
3606 !!!parse-error (type => 'in noscript:script');
3607
3608 $self->{insertion_mode} = 'in head';
3609 ## Reprocess in the "in head" insertion mode...
3610 } elsif ($self->{insertion_mode} eq 'after head') {
3611 !!!parse-error (type => 'after head:'.$token->{tag_name});
3612 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3613 }
3614
3615 ## NOTE: There is a "as if in head" code clone.
3616 $script_start_tag->($insert_to_current);
3617 pop @{$self->{open_elements}}
3618 if $self->{insertion_mode} eq 'after head';
3619 redo B;
3620 } elsif ($token->{tag_name} eq 'body' or
3621 $token->{tag_name} eq 'frameset') {
3622 if ($self->{insertion_mode} eq 'in head noscript') {
3623 ## As if </noscript>
3624 pop @{$self->{open_elements}};
3625 !!!parse-error (type => 'in noscript:'.$token->{tag_name});
3626
3627 ## Reprocess in the "in head" insertion mode...
3628 ## As if </head>
3629 pop @{$self->{open_elements}};
3630
3631 ## Reprocess in the "after head" insertion mode...
3632 } elsif ($self->{insertion_mode} eq 'in head') {
3633 pop @{$self->{open_elements}};
3634
3635 ## Reprocess in the "after head" insertion mode...
3636 }
3637
3638 ## "after head" insertion mode
3639 !!!insert-element ($token->{tag_name}, $token->{attributes});
3640 $self->{insertion_mode} = 'in '.$token->{tag_name};
3641 !!!next-token;
3642 redo B;
3643 } else {
3644 #
3645 }
3646
3647 if ($self->{insertion_mode} eq 'in head noscript') {
3648 ## As if </noscript>
3649 pop @{$self->{open_elements}};
3650 !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
3651
3652 ## Reprocess in the "in head" insertion mode...
3653 ## As if </head>
3654 pop @{$self->{open_elements}};
3655
3656 ## Reprocess in the "after head" insertion mode...
3657 } elsif ($self->{insertion_mode} eq 'in head') {
3658 ## As if </head>
3659 pop @{$self->{open_elements}};
3660
3661 ## Reprocess in the "after head" insertion mode...
3662 }
3663
3664 ## "after head" insertion mode
3665 ## As if <body>
3666 !!!insert-element ('body');
3667 $self->{insertion_mode} = 'in body';
3668 ## reprocess
3669 redo B;
3670 } elsif ($token->{type} eq 'end tag') {
3671 if ($token->{tag_name} eq 'head') {
3672 if ($self->{insertion_mode} eq 'before head') {
3673 ## As if <head>
3674 !!!create-element ($self->{head_element}, 'head');
3675 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3676 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3677
3678 ## Reprocess in the "in head" insertion mode...
3679 pop @{$self->{open_elements}};
3680 $self->{insertion_mode} = 'after head';
3681 !!!next-token;
3682 redo B;
3683 } elsif ($self->{insertion_mode} eq 'in head noscript') {
3684 ## As if </noscript>
3685 pop @{$self->{open_elements}};
3686 !!!parse-error (type => 'in noscript:script');
3687
3688 ## Reprocess in the "in head" insertion mode...
3689 pop @{$self->{open_elements}};
3690 $self->{insertion_mode} = 'after head';
3691 !!!next-token;
3692 redo B;
3693 } elsif ($self->{insertion_mode} eq 'in head') {
3694 pop @{$self->{open_elements}};
3695 $self->{insertion_mode} = 'after head';
3696 !!!next-token;
3697 redo B;
3698 } else {
3699 #
3700 }
3701 } elsif ($token->{tag_name} eq 'noscript') {
3702 if ($self->{insertion_mode} eq 'in head noscript') {
3703 pop @{$self->{open_elements}};
3704 $self->{insertion_mode} = 'in head';
3705 !!!next-token;
3706 redo B;
3707 } elsif ($self->{insertion_mode} eq 'before head') {
3708 !!!parse-error (type => 'unmatched end tag:noscript');
3709 ## Ignore the token ## ISSUE: An issue in the spec.
3710 !!!next-token;
3711 redo B;
3712 } else {
3713 #
3714 }
3715 } elsif ({
3716 body => 1, html => 1,
3717 }->{$token->{tag_name}}) {
3718 if ($self->{insertion_mode} eq 'before head') {
3719 ## As if <head>
3720 !!!create-element ($self->{head_element}, 'head');
3721 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3722 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3723
3724 $self->{insertion_mode} = 'in head';
3725 ## Reprocess in the "in head" insertion mode...
3726 } elsif ($self->{insertion_mode} eq 'in head noscript') {
3727 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3728 ## Ignore the token
3729 !!!next-token;
3730 redo B;
3731 }
3732
3733 #
3734 } elsif ({
3735 p => 1, br => 1,
3736 }->{$token->{tag_name}}) {
3737 if ($self->{insertion_mode} eq 'before head') {
3738 ## As if <head>
3739 !!!create-element ($self->{head_element}, 'head');
3740 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3741 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3742
3743 $self->{insertion_mode} = 'in head';
3744 ## Reprocess in the "in head" insertion mode...
3745 }
3746
3747 #
3748 } else {
3749 if ($self->{insertion_mode} ne 'after head') {
3750 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3751 ## Ignore the token
3752 !!!next-token;
3753 redo B;
3754 } else {
3755 #
3756 }
3757 }
3758
3759 if ($self->{insertion_mode} eq 'in head noscript') {
3760 ## As if </noscript>
3761 pop @{$self->{open_elements}};
3762 !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
3763
3764 ## Reprocess in the "in head" insertion mode...
3765 ## As if </head>
3766 pop @{$self->{open_elements}};
3767
3768 ## Reprocess in the "after head" insertion mode...
3769 } elsif ($self->{insertion_mode} eq 'in head') {
3770 ## As if </head>
3771 pop @{$self->{open_elements}};
3772
3773 ## Reprocess in the "after head" insertion mode...
3774 } elsif ($self->{insertion_mode} eq 'before head') {
3775 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3776 ## Ignore the token ## ISSUE: An issue in the spec.
3777 !!!next-token;
3778 redo B;
3779 }
3780
3781 ## "after head" insertion mode
3782 ## As if <body>
3783 !!!insert-element ('body');
3784 $self->{insertion_mode} = 'in body';
3785 ## reprocess
3786 redo B;
3787 } else {
3788 die "$0: $token->{type}: Unknown token type";
3789 }
3790
3791 ## ISSUE: An issue in the spec.
3792 } elsif ($self->{insertion_mode} eq 'in body' or
3793 $self->{insertion_mode} eq 'in cell' or
3794 $self->{insertion_mode} eq 'in caption') {
3795 if ($token->{type} eq 'character') {
3796 ## NOTE: There is a code clone of "character in body".
3797 $reconstruct_active_formatting_elements->($insert_to_current);
3798
3799 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3800
3801 !!!next-token;
3802 redo B;
3803 } elsif ($token->{type} eq 'start tag') {
3804 if ({
3805 caption => 1, col => 1, colgroup => 1, tbody => 1,
3806 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3807 }->{$token->{tag_name}}) {
3808 if ($self->{insertion_mode} eq 'in cell') {
3809 ## have an element in table scope
3810 my $tn;
3811 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3812 my $node = $self->{open_elements}->[$_];
3813 if ($node->[1] eq 'td' or $node->[1] eq 'th') {
3814 $tn = $node->[1];
3815 last INSCOPE;
3816 } elsif ({
3817 table => 1, html => 1,
3818 }->{$node->[1]}) {
3819 last INSCOPE;
3820 }
3821 } # INSCOPE
3822 unless (defined $tn) {
3823 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3824 ## Ignore the token
3825 !!!next-token;
3826 redo B;
3827 }
3828
3829 ## Close the cell
3830 !!!back-token; # <?>
3831 $token = {type => 'end tag', tag_name => $tn};
3832 redo B;
3833 } elsif ($self->{insertion_mode} eq 'in caption') {
3834 !!!parse-error (type => 'not closed:caption');
3835
3836 ## As if </caption>
3837 ## have a table element in table scope
3838 my $i;
3839 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3840 my $node = $self->{open_elements}->[$_];
3841 if ($node->[1] eq 'caption') {
3842 $i = $_;
3843 last INSCOPE;
3844 } elsif ({
3845 table => 1, html => 1,
3846 }->{$node->[1]}) {
3847 last INSCOPE;
3848 }
3849 } # INSCOPE
3850 unless (defined $i) {
3851 !!!parse-error (type => 'unmatched end tag:caption');
3852 ## Ignore the token
3853 !!!next-token;
3854 redo B;
3855 }
3856
3857 ## generate implied end tags
3858 if ({
3859 dd => 1, dt => 1, li => 1, p => 1,
3860 td => 1, th => 1, tr => 1,
3861 tbody => 1, tfoot=> 1, thead => 1,
3862 }->{$self->{open_elements}->[-1]->[1]}) {
3863 !!!back-token; # <?>
3864 $token = {type => 'end tag', tag_name => 'caption'};
3865 !!!back-token;
3866 $token = {type => 'end tag',
3867 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3868 redo B;
3869 }
3870
3871 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3872 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3873 }
3874
3875 splice @{$self->{open_elements}}, $i;
3876
3877 $clear_up_to_marker->();
3878
3879 $self->{insertion_mode} = 'in table';
3880
3881 ## reprocess
3882 redo B;
3883 } else {
3884 #
3885 }
3886 } else {
3887 #
3888 }
3889 } elsif ($token->{type} eq 'end tag') {
3890 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
3891 if ($self->{insertion_mode} eq 'in cell') {
3892 ## have an element in table scope
3893 my $i;
3894 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3895 my $node = $self->{open_elements}->[$_];
3896 if ($node->[1] eq $token->{tag_name}) {
3897 $i = $_;
3898 last INSCOPE;
3899 } elsif ({
3900 table => 1, html => 1,
3901 }->{$node->[1]}) {
3902 last INSCOPE;
3903 }
3904 } # INSCOPE
3905 unless (defined $i) {
3906 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3907 ## Ignore the token
3908 !!!next-token;
3909 redo B;
3910 }
3911
3912 ## generate implied end tags
3913 if ({
3914 dd => 1, dt => 1, li => 1, p => 1,
3915 td => ($token->{tag_name} eq 'th'),
3916 th => ($token->{tag_name} eq 'td'),
3917 tr => 1,
3918 tbody => 1, tfoot=> 1, thead => 1,
3919 }->{$self->{open_elements}->[-1]->[1]}) {
3920 !!!back-token;
3921 $token = {type => 'end tag',
3922 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3923 redo B;
3924 }
3925
3926 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3927 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3928 }
3929
3930 splice @{$self->{open_elements}}, $i;
3931
3932 $clear_up_to_marker->();
3933
3934 $self->{insertion_mode} = 'in row';
3935
3936 !!!next-token;
3937 redo B;
3938 } elsif ($self->{insertion_mode} eq 'in caption') {
3939 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3940 ## Ignore the token
3941 !!!next-token;
3942 redo B;
3943 } else {
3944 #
3945 }
3946 } elsif ($token->{tag_name} eq 'caption') {
3947 if ($self->{insertion_mode} eq 'in caption') {
3948 ## have a table element in table scope
3949 my $i;
3950 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3951 my $node = $self->{open_elements}->[$_];
3952 if ($node->[1] eq $token->{tag_name}) {
3953 $i = $_;
3954 last INSCOPE;
3955 } elsif ({
3956 table => 1, html => 1,
3957 }->{$node->[1]}) {
3958 last INSCOPE;
3959 }
3960 } # INSCOPE
3961 unless (defined $i) {
3962 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3963 ## Ignore the token
3964 !!!next-token;
3965 redo B;
3966 }
3967
3968 ## generate implied end tags
3969 if ({
3970 dd => 1, dt => 1, li => 1, p => 1,
3971 td => 1, th => 1, tr => 1,
3972 tbody => 1, tfoot=> 1, thead => 1,
3973 }->{$self->{open_elements}->[-1]->[1]}) {
3974 !!!back-token;
3975 $token = {type => 'end tag',
3976 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3977 redo B;
3978 }
3979
3980 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3981 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3982 }
3983
3984 splice @{$self->{open_elements}}, $i;
3985
3986 $clear_up_to_marker->();
3987
3988 $self->{insertion_mode} = 'in table';
3989
3990 !!!next-token;
3991 redo B;
3992 } elsif ($self->{insertion_mode} eq 'in cell') {
3993 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3994 ## Ignore the token
3995 !!!next-token;
3996 redo B;
3997 } else {
3998 #
3999 }
4000 } elsif ({
4001 table => 1, tbody => 1, tfoot => 1,
4002 thead => 1, tr => 1,
4003 }->{$token->{tag_name}} and
4004 $self->{insertion_mode} eq 'in cell') {
4005 ## have an element in table scope
4006 my $i;
4007 my $tn;
4008 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4009 my $node = $self->{open_elements}->[$_];
4010 if ($node->[1] eq $token->{tag_name}) {
4011 $i = $_;
4012 last INSCOPE;
4013 } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
4014 $tn = $node->[1];
4015 ## NOTE: There is exactly one |td| or |th| element
4016 ## in scope in the stack of open elements by definition.
4017 } elsif ({
4018 table => 1, html => 1,
4019 }->{$node->[1]}) {
4020 last INSCOPE;
4021 }
4022 } # INSCOPE
4023 unless (defined $i) {
4024 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4025 ## Ignore the token
4026 !!!next-token;
4027 redo B;
4028 }
4029
4030 ## Close the cell
4031 !!!back-token; # </?>
4032 $token = {type => 'end tag', tag_name => $tn};
4033 redo B;
4034 } elsif ($token->{tag_name} eq 'table' and
4035 $self->{insertion_mode} eq 'in caption') {
4036 !!!parse-error (type => 'not closed:caption');
4037
4038 ## As if </caption>
4039 ## have a table element in table scope
4040 my $i;
4041 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4042 my $node = $self->{open_elements}->[$_];
4043 if ($node->[1] eq 'caption') {
4044 $i = $_;
4045 last INSCOPE;
4046 } elsif ({
4047 table => 1, html => 1,
4048 }->{$node->[1]}) {
4049 last INSCOPE;
4050 }
4051 } # INSCOPE
4052 unless (defined $i) {
4053 !!!parse-error (type => 'unmatched end tag:caption');
4054 ## Ignore the token
4055 !!!next-token;
4056 redo B;
4057 }
4058
4059 ## generate implied end tags
4060 if ({
4061 dd => 1, dt => 1, li => 1, p => 1,
4062 td => 1, th => 1, tr => 1,
4063 tbody => 1, tfoot=> 1, thead => 1,
4064 }->{$self->{open_elements}->[-1]->[1]}) {
4065 !!!back-token; # </table>
4066 $token = {type => 'end tag', tag_name => 'caption'};
4067 !!!back-token;
4068 $token = {type => 'end tag',
4069 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4070 redo B;
4071 }
4072
4073 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
4074 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4075 }
4076
4077 splice @{$self->{open_elements}}, $i;
4078
4079 $clear_up_to_marker->();
4080
4081 $self->{insertion_mode} = 'in table';
4082
4083 ## reprocess
4084 redo B;
4085 } elsif ({
4086 body => 1, col => 1, colgroup => 1, html => 1,
4087 }->{$token->{tag_name}}) {
4088 if ($self->{insertion_mode} eq 'in cell' or
4089 $self->{insertion_mode} eq 'in caption') {
4090 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4091 ## Ignore the token
4092 !!!next-token;
4093 redo B;
4094 } else {
4095 #
4096 }
4097 } elsif ({
4098 tbody => 1, tfoot => 1,
4099 thead => 1, tr => 1,
4100 }->{$token->{tag_name}} and
4101 $self->{insertion_mode} eq 'in caption') {
4102 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4103 ## Ignore the token
4104 !!!next-token;
4105 redo B;
4106 } else {
4107 #
4108 }
4109 } else {
4110 #
4111 }
4112
4113 $in_body->($insert_to_current);
4114 redo B;
4115 } elsif ($self->{insertion_mode} eq 'in row' or
4116 $self->{insertion_mode} eq 'in table body' or
4117 $self->{insertion_mode} eq 'in table') {
4118 if ($token->{type} eq 'character') {
4119 ## NOTE: There are "character in table" code clones.
4120 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4121 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4122
4123 unless (length $token->{data}) {
4124 !!!next-token;
4125 redo B;
4126 }
4127 }
4128
4129 !!!parse-error (type => 'in table:#character');
4130
4131 ## As if in body, but insert into foster parent element
4132 ## ISSUE: Spec says that "whenever a node would be inserted
4133 ## into the current node" while characters might not
4134 ## result in a new Text node.
4135 $reconstruct_active_formatting_elements->($insert_to_foster);
4136
4137 if ({
4138 table => 1, tbody => 1, tfoot => 1,
4139 thead => 1, tr => 1,
4140 }->{$self->{open_elements}->[-1]->[1]}) {
4141 # MUST
4142 my $foster_parent_element;
4143 my $next_sibling;
4144 my $prev_sibling;
4145 OE: for (reverse 0..$#{$self->{open_elements}}) {
4146 if ($self->{open_elements}->[$_]->[1] eq 'table') {
4147 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4148 if (defined $parent and $parent->node_type == 1) {
4149 $foster_parent_element = $parent;
4150 $next_sibling = $self->{open_elements}->[$_]->[0];
4151 $prev_sibling = $next_sibling->previous_sibling;
4152 } else {
4153 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
4154 $prev_sibling = $foster_parent_element->last_child;
4155 }
4156 last OE;
4157 }
4158 } # OE
4159 $foster_parent_element = $self->{open_elements}->[0]->[0] and
4160 $prev_sibling = $foster_parent_element->last_child
4161 unless defined $foster_parent_element;
4162 if (defined $prev_sibling and
4163 $prev_sibling->node_type == 3) {
4164 $prev_sibling->manakai_append_text ($token->{data});
4165 } else {
4166 $foster_parent_element->insert_before
4167 ($self->{document}->create_text_node ($token->{data}),
4168 $next_sibling);
4169 }
4170 } else {
4171 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4172 }
4173
4174 !!!next-token;
4175 redo B;
4176 } elsif ($token->{type} eq 'start tag') {
4177 if ({
4178 tr => ($self->{insertion_mode} ne 'in row'),
4179 th => 1, td => 1,
4180 }->{$token->{tag_name}}) {
4181 if ($self->{insertion_mode} eq 'in table') {
4182 ## Clear back to table context
4183 while ($self->{open_elements}->[-1]->[1] ne 'table' and
4184 $self->{open_elements}->[-1]->[1] ne 'html') {
4185 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4186 pop @{$self->{open_elements}};
4187 }
4188
4189 !!!insert-element ('tbody');
4190 $self->{insertion_mode} = 'in table body';
4191 ## reprocess in the "in table body" insertion mode...
4192 }
4193
4194 if ($self->{insertion_mode} eq 'in table body') {
4195 unless ($token->{tag_name} eq 'tr') {
4196 !!!parse-error (type => 'missing start tag:tr');
4197 }
4198
4199 ## Clear back to table body context
4200 while (not {
4201 tbody => 1, tfoot => 1, thead => 1, html => 1,
4202 }->{$self->{open_elements}->[-1]->[1]}) {
4203 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4204 pop @{$self->{open_elements}};
4205 }
4206
4207 $self->{insertion_mode} = 'in row';
4208 if ($token->{tag_name} eq 'tr') {
4209 !!!insert-element ($token->{tag_name}, $token->{attributes});
4210 !!!next-token;
4211 redo B;
4212 } else {
4213 !!!insert-element ('tr');
4214 ## reprocess in the "in row" insertion mode
4215 }
4216 }
4217
4218 ## Clear back to table row context
4219 while (not {
4220 tr => 1, html => 1,
4221 }->{$self->{open_elements}->[-1]->[1]}) {
4222 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4223 pop @{$self->{open_elements}};
4224 }
4225
4226 !!!insert-element ($token->{tag_name}, $token->{attributes});
4227 $self->{insertion_mode} = 'in cell';
4228
4229 push @$active_formatting_elements, ['#marker', ''];
4230
4231 !!!next-token;
4232 redo B;
4233 } elsif ({
4234 caption => 1, col => 1, colgroup => 1,
4235 tbody => 1, tfoot => 1, thead => 1,
4236 tr => 1, # $self->{insertion_mode} eq 'in row'
4237 }->{$token->{tag_name}}) {
4238 if ($self->{insertion_mode} eq 'in row') {
4239 ## As if </tr>
4240 ## have an element in table scope
4241 my $i;
4242 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4243 my $node = $self->{open_elements}->[$_];
4244 if ($node->[1] eq 'tr') {
4245 $i = $_;
4246 last INSCOPE;
4247 } elsif ({
4248 table => 1, html => 1,
4249 }->{$node->[1]}) {
4250 last INSCOPE;
4251 }
4252 } # INSCOPE
4253 unless (defined $i) { # $i is set only when a |tr| was found in table scope
4254 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4255 ## Ignore the token: the implied </tr> has nothing to close
4256 !!!next-token;
4257 redo B;
4258 }
4259
4260 ## Clear back to table row context
4261 while (not {
4262 tr => 1, html => 1,
4263 }->{$self->{open_elements}->[-1]->[1]}) {
4264 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4265 pop @{$self->{open_elements}};
4266 }
4267
4268 pop @{$self->{open_elements}}; # tr
4269 $self->{insertion_mode} = 'in table body';
4270 if ($token->{tag_name} eq 'tr') {
4271 ## reprocess
4272 redo B;
4273 } else {
4274 ## reprocess in the "in table body" insertion mode...
4275 }
4276 }
4277
4278 if ($self->{insertion_mode} eq 'in table body') {
4279 ## have an element in table scope
4280 my $i;
4281 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4282 my $node = $self->{open_elements}->[$_];
4283 if ({
4284 tbody => 1, thead => 1, tfoot => 1,
4285 }->{$node->[1]}) {
4286 $i = $_;
4287 last INSCOPE;
4288 } elsif ({
4289 table => 1, html => 1,
4290 }->{$node->[1]}) {
4291 last INSCOPE;
4292 }
4293 } # INSCOPE
4294 unless (defined $i) {
4295 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4296 ## Ignore the token
4297 !!!next-token;
4298 redo B;
4299 }
4300
4301 ## Clear back to table body context
4302 while (not {
4303 tbody => 1, tfoot => 1, thead => 1, html => 1,
4304 }->{$self->{open_elements}->[-1]->[1]}) {
4305 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4306 pop @{$self->{open_elements}};
4307 }
4308
4309 ## As if <{current node}>
4310 ## have an element in table scope
4311 ## true by definition
4312
4313 ## Clear back to table body context
4314 ## nop by definition
4315
4316 pop @{$self->{open_elements}};
4317 $self->{insertion_mode} = 'in table';
4318 ## reprocess in "in table" insertion mode...
4319 }
4320
4321 if ($token->{tag_name} eq 'col') {
4322 ## Clear back to table context
4323 while ($self->{open_elements}->[-1]->[1] ne 'table' and
4324 $self->{open_elements}->[-1]->[1] ne 'html') {
4325 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4326 pop @{$self->{open_elements}};
4327 }
4328
4329 !!!insert-element ('colgroup');
4330 $self->{insertion_mode} = 'in column group';
4331 ## reprocess
4332 redo B;
4333 } elsif ({
4334 caption => 1,
4335 colgroup => 1,
4336 tbody => 1, tfoot => 1, thead => 1,
4337 }->{$token->{tag_name}}) {
4338 ## Clear back to table context
4339 while ($self->{open_elements}->[-1]->[1] ne 'table' and
4340 $self->{open_elements}->[-1]->[1] ne 'html') {
4341 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4342 pop @{$self->{open_elements}};
4343 }
4344
4345 push @$active_formatting_elements, ['#marker', '']
4346 if $token->{tag_name} eq 'caption';
4347
4348 !!!insert-element ($token->{tag_name}, $token->{attributes});
4349 $self->{insertion_mode} = {
4350 caption => 'in caption',
4351 colgroup => 'in column group',
4352 tbody => 'in table body',
4353 tfoot => 'in table body',
4354 thead => 'in table body',
4355 }->{$token->{tag_name}};
4356 !!!next-token;
4357 redo B;
4358 } else {
4359 die "$0: in table: <>: $token->{tag_name}";
4360 }
4361 } elsif ($token->{tag_name} eq 'table') {
4362 ## NOTE: There are code clones for this "table in table"
4363 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4364
4365 ## As if </table>
4366 ## have a table element in table scope
4367 my $i;
4368 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4369 my $node = $self->{open_elements}->[$_];
4370 if ($node->[1] eq 'table') {
4371 $i = $_;
4372 last INSCOPE;
4373 } elsif ({
4374 table => 1, html => 1,
4375 }->{$node->[1]}) {
4376 last INSCOPE;
4377 }
4378 } # INSCOPE
4379 unless (defined $i) {
4380 !!!parse-error (type => 'unmatched end tag:table');
4381 ## Ignore tokens </table><table>
4382 !!!next-token;
4383 redo B;
4384 }
4385
4386 ## generate implied end tags
4387 if ({
4388 dd => 1, dt => 1, li => 1, p => 1,
4389 td => 1, th => 1, tr => 1,
4390 tbody => 1, tfoot=> 1, thead => 1,
4391 }->{$self->{open_elements}->[-1]->[1]}) {
4392 !!!back-token; # <table>
4393 $token = {type => 'end tag', tag_name => 'table'};
4394 !!!back-token;
4395 $token = {type => 'end tag',
4396 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4397 redo B;
4398 }
4399
4400 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4401 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4402 }
4403
4404 splice @{$self->{open_elements}}, $i;
4405
4406 $self->_reset_insertion_mode;
4407
4408 ## reprocess
4409 redo B;
4410 } else {
4411 #
4412 }
4413 } elsif ($token->{type} eq 'end tag') {
4414 if ($token->{tag_name} eq 'tr' and
4415 $self->{insertion_mode} eq 'in row') {
4416 ## have an element in table scope
4417 my $i;
4418 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4419 my $node = $self->{open_elements}->[$_];
4420 if ($node->[1] eq $token->{tag_name}) {
4421 $i = $_;
4422 last INSCOPE;
4423 } elsif ({
4424 table => 1, html => 1,
4425 }->{$node->[1]}) {
4426 last INSCOPE;
4427 }
4428 } # INSCOPE
4429 unless (defined $i) {
4430 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4431 ## Ignore the token
4432 !!!next-token;
4433 redo B;
4434 }
4435
4436 ## Clear back to table row context
4437 while (not {
4438 tr => 1, html => 1,
4439 }->{$self->{open_elements}->[-1]->[1]}) {
4440 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4441 pop @{$self->{open_elements}};
4442 }
4443
4444 pop @{$self->{open_elements}}; # tr
4445 $self->{insertion_mode} = 'in table body';
4446 !!!next-token;
4447 redo B;
4448 } elsif ($token->{tag_name} eq 'table') {
4449 if ($self->{insertion_mode} eq 'in row') {
4450 ## As if </tr>
4451 ## have an element in table scope
4452 my $i;
4453 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4454 my $node = $self->{open_elements}->[$_];
4455 if ($node->[1] eq 'tr') {
4456 $i = $_;
4457 last INSCOPE;
4458 } elsif ({
4459 table => 1, html => 1,
4460 }->{$node->[1]}) {
4461 last INSCOPE;
4462 }
4463 } # INSCOPE
4464 unless (defined $i) { # $i is set only when a |tr| was found in table scope
4465 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4466 ## Ignore the token: the implied </tr> has nothing to close
4467 !!!next-token;
4468 redo B;
4469 }
4470
4471 ## Clear back to table row context
4472 while (not {
4473 tr => 1, html => 1,
4474 }->{$self->{open_elements}->[-1]->[1]}) {
4475 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4476 pop @{$self->{open_elements}};
4477 }
4478
4479 pop @{$self->{open_elements}}; # tr
4480 $self->{insertion_mode} = 'in table body';
4481 ## reprocess in the "in table body" insertion mode...
4482 }
4483
4484 if ($self->{insertion_mode} eq 'in table body') {
4485 ## have an element in table scope
4486 my $i;
4487 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4488 my $node = $self->{open_elements}->[$_];
4489 if ({
4490 tbody => 1, thead => 1, tfoot => 1,
4491 }->{$node->[1]}) {
4492 $i = $_;
4493 last INSCOPE;
4494 } elsif ({
4495 table => 1, html => 1,
4496 }->{$node->[1]}) {
4497 last INSCOPE;
4498 }
4499 } # INSCOPE
4500 unless (defined $i) {
4501 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4502 ## Ignore the token
4503 !!!next-token;
4504 redo B;
4505 }
4506
4507 ## Clear back to table body context
4508 while (not {
4509 tbody => 1, tfoot => 1, thead => 1, html => 1,
4510 }->{$self->{open_elements}->[-1]->[1]}) {
4511 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4512 pop @{$self->{open_elements}};
4513 }
4514
4515 ## As if <{current node}>
4516 ## have an element in table scope
4517 ## true by definition
4518
4519 ## Clear back to table body context
4520 ## nop by definition
4521
4522 pop @{$self->{open_elements}};
4523 $self->{insertion_mode} = 'in table';
4524 ## reprocess in the "in table" insertion mode...
4525 }
4526
4527 ## have a table element in table scope
4528 my $i;
4529 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4530 my $node = $self->{open_elements}->[$_];
4531 if ($node->[1] eq $token->{tag_name}) {
4532 $i = $_;
4533 last INSCOPE;
4534 } elsif ({
4535 table => 1, html => 1,
4536 }->{$node->[1]}) {
4537 last INSCOPE;
4538 }
4539 } # INSCOPE
4540 unless (defined $i) {
4541 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4542 ## Ignore the token
4543 !!!next-token;
4544 redo B;
4545 }
4546
4547 ## generate implied end tags
4548 if ({
4549 dd => 1, dt => 1, li => 1, p => 1,
4550 td => 1, th => 1, tr => 1,
4551 tbody => 1, tfoot=> 1, thead => 1,
4552 }->{$self->{open_elements}->[-1]->[1]}) {
4553 !!!back-token;
4554 $token = {type => 'end tag',
4555 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4556 redo B;
4557 }
4558
4559 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4560 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4561 }
4562
4563 splice @{$self->{open_elements}}, $i;
4564
4565 $self->_reset_insertion_mode;
4566
4567 !!!next-token;
4568 redo B;
4569 } elsif ({
4570 tbody => 1, tfoot => 1, thead => 1,
4571 }->{$token->{tag_name}} and
4572 ($self->{insertion_mode} eq 'in row' or
4573 $self->{insertion_mode} eq 'in table body')) {
4574 if ($self->{insertion_mode} eq 'in row') {
4575 ## have an element in table scope
4576 my $i;
4577 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4578 my $node = $self->{open_elements}->[$_];
4579 if ($node->[1] eq $token->{tag_name}) {
4580 $i = $_;
4581 last INSCOPE;
4582 } elsif ({
4583 table => 1, html => 1,
4584 }->{$node->[1]}) {
4585 last INSCOPE;
4586 }
4587 } # INSCOPE
4588 unless (defined $i) {
4589 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4590 ## Ignore the token
4591 !!!next-token;
4592 redo B;
4593 }
4594
4595 ## As if </tr>
4596 ## have an element in table scope
4597 undef $i; # reuse the |$i| declared earlier in this block rather than masking it
4598 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4599 my $node = $self->{open_elements}->[$_];
4600 if ($node->[1] eq 'tr') {
4601 $i = $_;
4602 last INSCOPE;
4603 } elsif ({
4604 table => 1, html => 1,
4605 }->{$node->[1]}) {
4606 last INSCOPE;
4607 }
4608 } # INSCOPE
4609 unless (defined $i) { # no |tr| in table scope
4610 !!!parse-error (type => 'unmatched end tag:tr');
4611 ## Ignore the token
4612 !!!next-token;
4613 redo B;
4614 }
4615
4616 ## Clear back to table row context
4617 while (not {
4618 tr => 1, html => 1,
4619 }->{$self->{open_elements}->[-1]->[1]}) {
4620 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4621 pop @{$self->{open_elements}};
4622 }
4623
4624 pop @{$self->{open_elements}}; # tr
4625 $self->{insertion_mode} = 'in table body';
4626 ## reprocess in the "in table body" insertion mode...
4627 }
4628
4629 ## have an element in table scope
4630 my $i;
4631 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4632 my $node = $self->{open_elements}->[$_];
4633 if ($node->[1] eq $token->{tag_name}) {
4634 $i = $_;
4635 last INSCOPE;
4636 } elsif ({
4637 table => 1, html => 1,
4638 }->{$node->[1]}) {
4639 last INSCOPE;
4640 }
4641 } # INSCOPE
4642 unless (defined $i) {
4643 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4644 ## Ignore the token
4645 !!!next-token;
4646 redo B;
4647 }
4648
4649 ## Clear back to table body context
4650 while (not {
4651 tbody => 1, tfoot => 1, thead => 1, html => 1,
4652 }->{$self->{open_elements}->[-1]->[1]}) {
4653 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4654 pop @{$self->{open_elements}};
4655 }
4656
4657 pop @{$self->{open_elements}};
4658 $self->{insertion_mode} = 'in table';
4659 !!!next-token;
4660 redo B;
4661 } elsif ({
4662 body => 1, caption => 1, col => 1, colgroup => 1,
4663 html => 1, td => 1, th => 1,
4664 tr => 1, # $self->{insertion_mode} eq 'in row'
4665 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} eq 'in table'
4666 }->{$token->{tag_name}}) {
4667 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4668 ## Ignore the token
4669 !!!next-token;
4670 redo B;
4671 } else {
4672 #
4673 }
4674 } else {
4675 die "$0: $token->{type}: Unknown token type";
4676 }
4677
4678 !!!parse-error (type => 'in table:'.$token->{tag_name});
4679 $in_body->($insert_to_foster);
4680 redo B;
4681 } elsif ($self->{insertion_mode} eq 'in column group') {
4682 if ($token->{type} eq 'character') {
4683 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4684 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4685 unless (length $token->{data}) {
4686 !!!next-token;
4687 redo B;
4688 }
4689 }
4690
4691 #
4692 } elsif ($token->{type} eq 'start tag') {
4693 if ($token->{tag_name} eq 'col') {
4694 !!!insert-element ($token->{tag_name}, $token->{attributes});
4695 pop @{$self->{open_elements}};
4696 !!!next-token;
4697 redo B;
4698 } else {
4699 #
4700 }
4701 } elsif ($token->{type} eq 'end tag') {
4702 if ($token->{tag_name} eq 'colgroup') {
4703 if ($self->{open_elements}->[-1]->[1] eq 'html') {
4704 !!!parse-error (type => 'unmatched end tag:colgroup');
4705 ## Ignore the token
4706 !!!next-token;
4707 redo B;
4708 } else {
4709 pop @{$self->{open_elements}}; # colgroup
4710 $self->{insertion_mode} = 'in table';
4711 !!!next-token;
4712 redo B;
4713 }
4714 } elsif ($token->{tag_name} eq 'col') {
4715 !!!parse-error (type => 'unmatched end tag:col');
4716 ## Ignore the token
4717 !!!next-token;
4718 redo B;
4719 } else {
4720 #
4721 }
4722 } else {
4723 #
4724 }
4725
4726 ## As if </colgroup>
4727 if ($self->{open_elements}->[-1]->[1] eq 'html') {
4728 !!!parse-error (type => 'unmatched end tag:colgroup');
4729 ## Ignore the token
4730 !!!next-token;
4731 redo B;
4732 } else {
4733 pop @{$self->{open_elements}}; # colgroup
4734 $self->{insertion_mode} = 'in table';
4735 ## reprocess
4736 redo B;
4737 }
4738 } elsif ($self->{insertion_mode} eq 'in select') {
4739 if ($token->{type} eq 'character') {
4740 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4741 !!!next-token;
4742 redo B;
4743 } elsif ($token->{type} eq 'start tag') {
4744 if ($token->{tag_name} eq 'option') {
4745 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4746 ## As if </option>
4747 pop @{$self->{open_elements}};
4748 }
4749
4750 !!!insert-element ($token->{tag_name}, $token->{attributes});
4751 !!!next-token;
4752 redo B;
4753 } elsif ($token->{tag_name} eq 'optgroup') {
4754 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4755 ## As if </option>
4756 pop @{$self->{open_elements}};
4757 }
4758
4759 if ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4760 ## As if </optgroup>
4761 pop @{$self->{open_elements}};
4762 }
4763
4764 !!!insert-element ($token->{tag_name}, $token->{attributes});
4765 !!!next-token;
4766 redo B;
4767 } elsif ($token->{tag_name} eq 'select') {
4768 !!!parse-error (type => 'not closed:select');
4769 ## As if </select> instead
4770 ## have an element in table scope
4771 my $i;
4772 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4773 my $node = $self->{open_elements}->[$_];
4774 if ($node->[1] eq $token->{tag_name}) {
4775 $i = $_;
4776 last INSCOPE;
4777 } elsif ({
4778 table => 1, html => 1,
4779 }->{$node->[1]}) {
4780 last INSCOPE;
4781 }
4782 } # INSCOPE
4783 unless (defined $i) {
4784 !!!parse-error (type => 'unmatched end tag:select');
4785 ## Ignore the token
4786 !!!next-token;
4787 redo B;
4788 }
4789
4790 splice @{$self->{open_elements}}, $i;
4791
4792 $self->_reset_insertion_mode;
4793
4794 !!!next-token;
4795 redo B;
4796 } else {
4797 #
4798 }
4799 } elsif ($token->{type} eq 'end tag') {
4800 if ($token->{tag_name} eq 'optgroup') {
4801 if ($self->{open_elements}->[-1]->[1] eq 'option' and
4802 $self->{open_elements}->[-2]->[1] eq 'optgroup') {
4803 ## As if </option>
4804 splice @{$self->{open_elements}}, -2;
4805 } elsif ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4806 pop @{$self->{open_elements}};
4807 } else {
4808 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4809 ## Ignore the token
4810 }
4811 !!!next-token;
4812 redo B;
4813 } elsif ($token->{tag_name} eq 'option') {
4814 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4815 pop @{$self->{open_elements}};
4816 } else {
4817 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4818 ## Ignore the token
4819 }
4820 !!!next-token;
4821 redo B;
4822 } elsif ($token->{tag_name} eq 'select') {
4823 ## have an element in table scope
4824 my $i;
4825 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4826 my $node = $self->{open_elements}->[$_];
4827 if ($node->[1] eq $token->{tag_name}) {
4828 $i = $_;
4829 last INSCOPE;
4830 } elsif ({
4831 table => 1, html => 1,
4832 }->{$node->[1]}) {
4833 last INSCOPE;
4834 }
4835 } # INSCOPE
4836 unless (defined $i) {
4837 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4838 ## Ignore the token
4839 !!!next-token;
4840 redo B;
4841 }
4842
4843 splice @{$self->{open_elements}}, $i;
4844
4845 $self->_reset_insertion_mode;
4846
4847 !!!next-token;
4848 redo B;
4849 } elsif ({
4850 caption => 1, table => 1, tbody => 1,
4851 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
4852 }->{$token->{tag_name}}) {
4853 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4854
4855 ## have an element in table scope
4856 my $i;
4857 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4858 my $node = $self->{open_elements}->[$_];
4859 if ($node->[1] eq $token->{tag_name}) {
4860 $i = $_;
4861 last INSCOPE;
4862 } elsif ({
4863 table => 1, html => 1,
4864 }->{$node->[1]}) {
4865 last INSCOPE;
4866 }
4867 } # INSCOPE
4868 unless (defined $i) {
4869 ## Ignore the token
4870 !!!next-token;
4871 redo B;
4872 }
4873
4874 ## As if </select>
4875 ## have an element in table scope
4876 undef $i;
4877 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4878 my $node = $self->{open_elements}->[$_];
4879 if ($node->[1] eq 'select') {
4880 $i = $_;
4881 last INSCOPE;
4882 } elsif ({
4883 table => 1, html => 1,
4884 }->{$node->[1]}) {
4885 last INSCOPE;
4886 }
4887 } # INSCOPE
4888 unless (defined $i) {
4889 !!!parse-error (type => 'unmatched end tag:select');
4890 ## Ignore the </select> token
4891 !!!next-token; ## TODO: ok?
4892 redo B;
4893 }
4894
4895 splice @{$self->{open_elements}}, $i;
4896
4897 $self->_reset_insertion_mode;
4898
4899 ## reprocess
4900 redo B;
4901 } else {
4902 #
4903 }
4904 } else {
4905 #
4906 }
4907
4908 !!!parse-error (type => 'in select:'.$token->{tag_name});
4909 ## Ignore the token
4910 !!!next-token;
4911 redo B;
4912 } elsif ($self->{insertion_mode} eq 'after body' or
4913 $self->{insertion_mode} eq 'after html body') {
4914 if ($token->{type} eq 'character') {
4915 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4916 my $data = $1; # keep the stripped whitespace; do not re-read $1 after the call below
4917 ## As if in body
4918 $reconstruct_active_formatting_elements->($insert_to_current);
4919
4920 $self->{open_elements}->[-1]->[0]->manakai_append_text ($data);
4921
4922 unless (length $token->{data}) { # token was whitespace only
4923 !!!next-token;
4924 redo B;
4925 }
4926 }
4927
4928 if ($self->{insertion_mode} eq 'after html body') {
4929 !!!parse-error (type => 'after html:#character');
4930
4931 ## Reprocess in the "main" phase, "after body" insertion mode...
4932 }
4933
4934 ## "after body" insertion mode
4935 !!!parse-error (type => 'after body:#character');
4936
4937 $self->{insertion_mode} = 'in body';
4938 ## reprocess
4939 redo B;
4940 } elsif ($token->{type} eq 'start tag') {
4941 if ($self->{insertion_mode} eq 'after html body') {
4942 !!!parse-error (type => 'after html:'.$token->{tag_name});
4943
4944 ## Reprocess in the "main" phase, "after body" insertion mode...
4945 }
4946
4947 ## "after body" insertion mode
4948 !!!parse-error (type => 'after body:'.$token->{tag_name});
4949
4950 $self->{insertion_mode} = 'in body';
4951 ## reprocess
4952 redo B;
4953 } elsif ($token->{type} eq 'end tag') {
4954 if ($self->{insertion_mode} eq 'after html body') {
4955 !!!parse-error (type => 'after html:/'.$token->{tag_name});
4956
4957 $self->{insertion_mode} = 'after body';
4958 ## Reprocess in the "main" phase, "after body" insertion mode...
4959 }
4960
4961 ## "after body" insertion mode
4962 if ($token->{tag_name} eq 'html') {
4963 if (defined $self->{inner_html_node}) {
4964 !!!parse-error (type => 'unmatched end tag:html');
4965 ## Ignore the token
4966 !!!next-token;
4967 redo B;
4968 } else {
4969 $self->{insertion_mode} = 'after html body';
4970 !!!next-token;
4971 redo B;
4972 }
4973 } else {
4974 !!!parse-error (type => 'after body:/'.$token->{tag_name});
4975
4976 $self->{insertion_mode} = 'in body';
4977 ## reprocess
4978 redo B;
4979 }
4980 } else {
4981 die "$0: $token->{type}: Unknown token type";
4982 }
4983 } elsif ($self->{insertion_mode} eq 'in frameset' or
4984 $self->{insertion_mode} eq 'after frameset' or
4985 $self->{insertion_mode} eq 'after html frameset') {
4986 if ($token->{type} eq 'character') {
4987 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4988 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4989
4990 unless (length $token->{data}) {
4991 !!!next-token;
4992 redo B;
4993 }
4994 }
4995
4996 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
4997 if ($self->{insertion_mode} eq 'in frameset') {
4998 !!!parse-error (type => 'in frameset:#character');
4999 } elsif ($self->{insertion_mode} eq 'after frameset') {
5000 !!!parse-error (type => 'after frameset:#character');
5001 } else { # "after html frameset"
5002 !!!parse-error (type => 'after html:#character');
5003
5004 $self->{insertion_mode} = 'after frameset';
5005 ## Reprocess in the "main" phase, "after frameset"...
5006 !!!parse-error (type => 'after frameset:#character');
5007 }
5008
5009 ## Ignore the token.
5010 if (length $token->{data}) {
5011 ## reprocess the rest of characters
5012 } else {
5013 !!!next-token;
5014 }
5015 redo B;
5016 }
5017
5018 die qq[$0: Character "$token->{data}"];
5019 } elsif ($token->{type} eq 'start tag') {
5020 if ($self->{insertion_mode} eq 'after html frameset') {
5021 !!!parse-error (type => 'after html:'.$token->{tag_name});
5022
5023 $self->{insertion_mode} = 'after frameset';
5024 ## Process in the "main" phase, "after frameset" insertion mode...
5025 }
5026
5027 if ($token->{tag_name} eq 'frameset' and
5028 $self->{insertion_mode} eq 'in frameset') {
5029 !!!insert-element ($token->{tag_name}, $token->{attributes});
5030 !!!next-token;
5031 redo B;
5032 } elsif ($token->{tag_name} eq 'frame' and
5033 $self->{insertion_mode} eq 'in frameset') {
5034 !!!insert-element ($token->{tag_name}, $token->{attributes});
5035 pop @{$self->{open_elements}};
5036 !!!next-token;
5037 redo B;
5038 } elsif ($token->{tag_name} eq 'noframes') {
5039 ## NOTE: As if in body.
5040 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
5041 redo B;
5042 } else {
5043 if ($self->{insertion_mode} eq 'in frameset') {
5044 !!!parse-error (type => 'in frameset:'.$token->{tag_name});
5045 } else {
5046 !!!parse-error (type => 'after frameset:'.$token->{tag_name});
5047 }
5048 ## Ignore the token
5049 !!!next-token;
5050 redo B;
5051 }
5052 } elsif ($token->{type} eq 'end tag') {
5053 if ($self->{insertion_mode} eq 'after html frameset') {
5054 !!!parse-error (type => 'after html:/'.$token->{tag_name});
5055
5056 $self->{insertion_mode} = 'after frameset';
5057 ## Process in the "main" phase, "after frameset" insertion mode...
5058 }
5059
5060 if ($token->{tag_name} eq 'frameset' and
5061 $self->{insertion_mode} eq 'in frameset') {
5062 if ($self->{open_elements}->[-1]->[1] eq 'html' and
5063 @{$self->{open_elements}} == 1) {
5064 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5065 ## Ignore the token
5066 !!!next-token;
5067 } else {
5068 pop @{$self->{open_elements}};
5069 !!!next-token;
5070 }
5071
5072 if (not defined $self->{inner_html_node} and
5073 $self->{open_elements}->[-1]->[1] ne 'frameset') {
5074 $self->{insertion_mode} = 'after frameset';
5075 }
5076 redo B;
5077 } elsif ($token->{tag_name} eq 'html' and
5078 $self->{insertion_mode} eq 'after frameset') {
5079 $self->{insertion_mode} = 'after html frameset';
5080 !!!next-token;
5081 redo B;
5082 } else {
5083 if ($self->{insertion_mode} eq 'in frameset') {
5084 !!!parse-error (type => 'in frameset:/'.$token->{tag_name});
5085 } else {
5086 !!!parse-error (type => 'after frameset:/'.$token->{tag_name});
5087 }
5088 ## Ignore the token
5089 !!!next-token;
5090 redo B;
5091 }
5092 } else {
5093 die "$0: $token->{type}: Unknown token type";
5094 }
5095
5096 ## ISSUE: An issue in spec here
5097 } else {
5098 die "$0: $self->{insertion_mode}: Unknown insertion mode";
5099 }
5100 } # B
5101
5102 ## NOTE: The "trailing end" phase in HTML5 is split into
5103 ## two insertion modes: "after html body" and "after html frameset".
5104 ## NOTE: States in the main stage is preserved while
5105 ## the parser stays in the trailing end phase. # MUST
5106
5107 ## Stop parsing # MUST
5108
5109 ## TODO: script stuffs
5110 } # _tree_construct_main
5111
## Replaces the content of |$node| with the result of parsing a string
## as HTML (the HTML5 |innerHTML| setter).
##
## Arguments:
##   $class - invocant; used to create the parser (|new|) and, for a
##            document, to call |parse_string|.
##   $node  - the node to fill: a Document (node type 9) or an Element
##            (node type 1).  Any other node type is fatal (|die|).
##   $_[0]  - the markup string; it is read through a reference and is
##            not copied.
##   $_[1]  - optional error handler.  For a document it is handed to
##            |parse_string|; for an element it receives the arguments
##            of each parse error plus |line| and |column|, and when it
##            is false the errors are reported with |warn| instead.
sub set_inner_html ($$$) {
  my ($class, $node) = splice @_, 0, 2;
  my $s = \$_[0];
  my $onerror = $_[1];

  my $nt = $node->node_type;
  die "$0: |set_inner_html| is not defined for node of type $nt"
      unless $nt == 9 or $nt == 1;

  if ($nt == 9) { # Document
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## The child list is copied before the children are removed.
    my @old_children = @{$node->child_nodes};
    $node->remove_child ($_) for @old_children;

    ## Step 3, 4, 5 # MUST
    return $class->parse_string ($$s => $node, $onerror);
  }

  ## From here on |$node| is an Element.
  ## TODO: If non-html element

  ## NOTE: Most of this code is copied from |parse_string|

  ## Step 1 # MUST
  ## A new HTML document, created by the implementation of the
  ## element's owner document, and a parser attached to it.
  my $owner_doc = $node->owner_document;
  my $work_doc = $owner_doc->implementation->create_document;
  $work_doc->manakai_is_html (1);
  my $parser = $class->new;
  $parser->{document} = $work_doc;

  ## Step 9 # MUST
  ## Input stream: the closure below walks |$$s| one character at a
  ## time, keeping the position and the line/column counters that the
  ## error reporter uses.
  my $offset = 0;
  my $line = 1;
  my $column = 0;
  $parser->{set_next_input_character} = sub {
    my $self = shift; # the macro below refers to |$self|

    ## Move the current character into the history of previous ones.
    pop @{$self->{prev_input_character}};
    unshift @{$self->{prev_input_character}}, $self->{next_input_character};

    if ($offset >= length $$s) {
      ## End of input is reported as -1.
      $self->{next_input_character} = -1;
      return;
    }

    my $char = ord substr $$s, $offset++, 1;
    $self->{next_input_character} = $char;
    $column++;

    if ($char == 0x000A) { # LF
      $line++;
      $column = 0;
    } elsif ($char == 0x000D) { # CR
      ## CR and CR LF are both delivered as a single LF.
      $offset++ if substr ($$s, $offset, 1) eq "\x0A";
      $self->{next_input_character} = 0x000A; # LF # MUST
      $line++;
      $column = 0;
    } elsif ($char > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($char == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $parser->{prev_input_character} = [-1, -1, -1];
  $parser->{next_input_character} = -1;

  ## Error reporting: the caller's handler if given, |warn| otherwise;
  ## the current line and column are appended to every report.
  my $ponerror = $onerror;
  $ponerror ||= sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  $parser->{parse_error} = sub {
    $ponerror->(@_, line => $line, column => $column);
  };

  $parser->_initialize_tokenizer;
  $parser->_initialize_tree_constructor;

  ## Step 2
  ## The tokenizer's content model is selected by the local name of
  ## the context element; PCDATA is used for any name not listed.
  my $node_ln = $node->local_name;
  my %model_for;
  $model_for{$_} = RCDATA_CONTENT_MODEL for qw(title textarea);
  $model_for{$_} = CDATA_CONTENT_MODEL
      for qw(style script xmp iframe noembed noframes noscript);
  $model_for{plaintext} = PLAINTEXT_CONTENT_MODEL;
  $parser->{content_model} = defined $model_for{$node_ln}
      ? $model_for{$node_ln} : PCDATA_CONTENT_MODEL;
  ## ISSUE: What is "the name of the element"? local name?

  $parser->{inner_html_node} = [$node, $node_ln];

  ## Step 4
  my $root = $work_doc->create_element_ns
      ('http://www.w3.org/1999/xhtml', [undef, 'html']);

  ## Step 5 # MUST
  $work_doc->append_child ($root);

  ## Step 6 # MUST
  push @{$parser->{open_elements}}, [$root, 'html'];

  undef $parser->{head_element};

  ## Step 7 # MUST
  $parser->_reset_insertion_mode;

  ## Step 8 # MUST
  ## The nearest XHTML |form| element, starting at |$node| itself and
  ## going up through the parents, becomes the form element pointer.
  for (my $anc = $node; defined $anc; $anc = $anc->parent_node) {
    next unless $anc->node_type == 1;
    my $nsuri = $anc->namespace_uri;
    next unless defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml';
    next unless $anc->local_name eq 'form'; ## TODO: case?
    $parser->{form_element} = $anc;
    last;
  }

  ## Step 3 # MUST
  ## Step 10 # MUST
  {
    my $self = $parser; # the macro below refers to |$self|
    !!!next-token;
  }
  $parser->_tree_construction_main;

  ## Step 11 # MUST
  my @old_children = @{$node->child_nodes};
  $node->remove_child ($_) for @old_children;
  ## ISSUE: mutation events? read-only?

  ## Step 12 # MUST
  ## Every child of the temporary root is adopted into the original
  ## document and appended to |$node|.
  my @new_children = @{$root->child_nodes};
  for my $new_child (@new_children) {
    $owner_doc->adopt_node ($new_child);
    $node->append_child ($new_child);
  }
  ## ISSUE: mutation events?

  return $parser->_terminate_tree_constructor;
} # set_inner_html
5267
5268 } # tree construction stage
5269
## Serializes the children of |$node| as an HTML fragment (the HTML5
## |innerHTML| getter).
##
## Arguments:
##   (invocant) - ignored.
##   $node      - the node whose children are serialized.
##   $on_error  - optional code reference; it is called with every
##                child node whose node type is not handled here.
##
## Returns a reference to the resulting string.
sub get_inner_html ($$$) {
  my (undef, $node, $on_error) = @_;

  ## Elements whose text content is written literally, without
  ## escaping.  The same table is used for the ancestors of |$node|
  ## and for its descendants, so that both checks always agree.
  my %is_literal_text = map { $_ => 1 } qw(
    style script xmp iframe noembed noframes noscript plaintext
  );

  ## Elements that are written as a start tag only.
  my %is_void = map { $_ => 1 } qw(
    area base basefont bgsound br col embed frame hr img input link
    meta param spacer wbr
  );

  ## Escapes text and attribute values.
  my $escape = sub {
    my $value = shift;
    $value =~ s/&/&amp;/g;
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;
    $value =~ s/"/&quot;/g;
    return $value;
  };

  ## Step 1
  my $s = '';

  ## Text is literal from the start if |$node| or one of its ancestors
  ## is an XHTML-namespace literal-text element.
  my $in_cdata;
  for (my $anc = $node; defined $anc; $anc = $anc->parent_node) {
    next unless $anc->node_type == 1;
    ## An element may have no namespace at all; guard before |eq|.
    my $nsuri = $anc->namespace_uri;
    next unless defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml';
    if ($is_literal_text{$anc->local_name}) { ## TODO: case thingy
      $in_cdata = 1;
      last;
    }
  }

  ## Step 2
  ## |@node| is a work queue processed from the front.  Besides nodes
  ## it holds plain strings: end tags to be written, and the marker
  ## 'cdata-out', which ends a literal-text section.
  my @node = @{$node->child_nodes};
  C: while (@node) {
    my $child = shift @node;
    unless (ref $child) {
      if ($child eq 'cdata-out') {
        $in_cdata = 0;
      } else {
        $s .= $child; # end tag
      }
      next C;
    }

    my $nt = $child->node_type;
    if ($nt == 1) { # Element
      my $tag_name = $child->tag_name; ## TODO: manakai_tag_name
      $s .= '<' . $tag_name;
      ## NOTE: Non-HTML case:
      ## <http://permalink.gmane.org/gmane.org.w3c.whatwg.discuss/11191>

      my @attrs = @{$child->attributes}; # sort order MUST be stable
      for my $attr (@attrs) { # order is implementation dependent
        my $attr_name = $attr->name; ## TODO: manakai_name
        $s .= ' ' . $attr_name . '="' . $escape->($attr->value) . '"';
      }
      $s .= '>';

      next C if $is_void{$tag_name};

      $s .= "\x0A" if $tag_name eq 'pre' or $tag_name eq 'textarea';

      if (not $in_cdata and $is_literal_text{$tag_name}) {
        ## The marker is queued first so that it ends up right after
        ## the end tag that is queued below.
        unshift @node, 'cdata-out';
        $in_cdata = 1;
      }

      unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';
    } elsif ($nt == 3 or $nt == 4) { # Text, CDATASection
      $s .= $in_cdata ? $child->data : $escape->($child->data);
    } elsif ($nt == 8) { # Comment
      $s .= '<!--' . $child->data . '-->';
    } elsif ($nt == 10) { # DocumentType
      $s .= '<!DOCTYPE ' . $child->name . '>';
    } elsif ($nt == 5) { # entrefs
      ## The children replace the reference at the front of the queue,
      ## so they are written at the position of the reference.
      ## Appending them at the end would write them after all following
      ## siblings and after the enclosing end tags.
      unshift @node, @{$child->child_nodes};
    } else {
      $on_error->($child) if defined $on_error;
    }
    ## ISSUE: This code does not support PIs.
  } # C

  ## Step 3
  return \$s;
} # get_inner_html
5369
5370 1;
5371 # $Date: 2007/07/21 10:59:40 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24