/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log


Revision 1.35 - (show annotations) (download) (as text)
Mon Jul 16 03:21:04 2007 UTC (17 years, 3 months ago) by wakaba
Branch: MAIN
Changes since 1.34: +33 -27 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	16 Jul 2007 03:18:55 -0000
2007-07-16  Wakaba  <wakaba@suika.fam.cx>

	* tree-test-1.dat: New tests for trailing end phase
	and white space in some insertion modes are added.

++ whatpm/Whatpm/ChangeLog	16 Jul 2007 03:18:16 -0000
	* HTML.pm.src: |$phase| has been removed; The |trailing end|
	phase is now an insertion mode.  Treatments for white
	space character tokens were incorrect for some
	insertion modes.  An old |meta| case was not removed.

2007-07-16  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.34 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 ## ISSUE:
6 ## var doc = implementation.createDocument (null, null, null);
7 ## doc.write ('');
8 ## alert (doc.compatMode);
9
10 ## ISSUE: HTML5 revision 967 says that the encoding layer MUST NOT
11 ## strip BOM and the HTML layer MUST ignore it. Whether we can do it
12 ## is not yet clear.
13 ## "{U+FEFF}..." in UTF-16BE/UTF-16LE is three or four characters?
14 ## "{U+FEFF}..." in GB18030?
15
## Names of the start tags on which a "/" immediately followed by ">"
## is accepted silently by the tokenizer; for any other tag (or any
## other position of the "/") the tokenizer reports a |nestc| parse error.
my $permitted_slash_tag_name = {
  map { ($_, 1) } qw(
    base link meta hr br img embed param area col input
  )
};
29
## Maps each code point in the range 0x80..0x9F to the code point that
## replaces it (0xFFFD, REPLACEMENT CHARACTER, where there is no
## counterpart).  The hash keys are the numeric code points.
## NOTE(review): the values appear to follow the Windows-1252
## assignments, and the table is presumably consulted when numeric
## character references are resolved -- the user is not visible here.
my $c1_entity_char = do {
  ## Replacements for 0x80, 0x81, ..., 0x9F, in that order.
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 80..87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 88..8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 90..97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 98..9F
  );
  +{ map { (0x80 + $_, $replacement[$_]) } 0 .. $#replacement };
}; # $c1_entity_char
64
## Tag names of the elements classified as "special".
## NOTE(review): presumably consulted by the tree construction code;
## no user of this table is visible in this part of the file.
my $special_category = {
  map { ($_, 1) } qw(
    address area base basefont bgsound blockquote body br center col
    colgroup dd dir div dl dt embed fieldset form frame frameset
    h1 h2 h3 h4 h5 h6 head hr iframe image img input isindex li link
    listing menu meta noembed noframes noscript ol optgroup option p
    param plaintext pre script select spacer style tbody textarea
    tfoot thead title tr ul wbr
  )
};
## Tag names of the elements classified as "scoping".
my $scoping_category = {
  map { ($_, 1) } qw(button caption html marquee object table td th)
};
## Tag names of the elements classified as "formatting".
## (The entry for |strike| used to be misspelled |strile|, so that
## element was never recognized as a formatting element.)
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
85 # $phrasing_category: all other elements
86
## Parses a string as an HTML document.
##
##   $doc = $class_or_parser->parse_string ($string, $document, $onerror);
##
## A fresh parser is obtained via |new|.  The characters of |$string|
## are handed to the tokenizer one at a time through the
## |set_next_input_character| callback installed here, the tokenizer
## and tree constructor are initialized and run, and |$document| (the
## second argument) is returned.
##
## |$onerror| is optional.  It is invoked for every parse error with
## the key/value pairs describing the error followed by |line| and
## |column|; when omitted, errors are reported with |warn|.
sub parse_string ($$$;$) {
  my $self = shift->new;
  my $input = \$_[0]; # reference to the caller's string (no copy is made)
  $self->{document} = $_[1];

  ## NOTE: |set_inner_html| copies most of this method's code

  ## Read position in the input and the position used in error reports.
  my ($offset, $line, $column) = (0, 1, 0);

  $self->{set_next_input_character} = sub {
    my $self = shift;

    ## Remember the character being left behind; only the three most
    ## recent ones are kept.
    pop @{$self->{prev_input_character}};
    unshift @{$self->{prev_input_character}}, $self->{next_input_character};

    if ($offset >= length $$input) {
      $self->{next_input_character} = -1; # end of input
      return;
    }

    my $char = ord substr ($$input, $offset++, 1);
    $self->{next_input_character} = $char;
    $column++;

    if ($char == 0x000A) { # LF
      $line++;
      $column = 0;
    } elsif ($char == 0x000D) { # CR
      ## Both CR LF and a lone CR are delivered as a single LF.
      $offset++ if substr ($$input, $offset, 1) eq "\x0A";
      $self->{next_input_character} = 0x000A; # LF # MUST
      $line++;
      $column = 0;
    } elsif ($char > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($char == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $self->{prev_input_character} = [-1, -1, -1];
  $self->{next_input_character} = -1;

  ## Default error handler: print a warning.
  my $onerror = $_[2] || sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  ## Every reported error is decorated with the current position.
  $self->{parse_error} = sub {
    $onerror->(@_, line => $line, column => $column);
  };

  for my $step (qw(_initialize_tokenizer _initialize_tree_constructor
                   _construct_tree _terminate_tree_constructor)) {
    $self->$step;
  }

  return $self->{document};
} # parse_string
140
## Creates a parser object.
##
##   $parser = Whatpm::HTML->new;
##
## The object is a blessed hash with two default callbacks:
##
##   |set_next_input_character| - sets |next_input_character| to -1
##       (end of input), i.e. the parser behaves as if the input
##       were empty until |parse_string| installs a real reader.
##   |parse_error| - ignores the error.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;

  ## The callback is stored inside the object it refers to.  Closing
  ## over |$self| directly would form a reference cycle and the parser
  ## would never be freed, so the closure only holds a weak reference.
  require Scalar::Util;
  my $parser = $self;
  Scalar::Util::weaken ($parser);
  $self->{set_next_input_character} = sub {
    $parser->{next_input_character} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  return $self;
} # new
152
153 ## Implementations MUST act as if they used the state machine described in the spec.
154
## Resets the tokenizer to its initial condition and reads the first
## input character.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## Initial state and content model flag.
  $self->{state} = 'data'; # MUST
  $self->{content_model_flag} = 'PCDATA';

  ## Nothing is under construction and nothing has been emitted yet.
  ## (|current_token| is a start tag, end tag, comment, or DOCTYPE.)
  $self->{$_} = undef
      for qw(current_token current_attribute
             last_emitted_start_tag_name last_attribute_value_state);

  ## NOTE(review): |char| looks like the list of characters pushed back
  ## by |!!!back-next-input-character|; the macro expansion is not
  ## visible here.  It has to exist before the first character is read.
  $self->{char} = [];
  # $self->{next_input_character}
  !!!next-input-character;

  ## Tokens waiting to be returned; |_get_next_token| serves this list
  ## first.
  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
169
170 ## A token has:
171 ## ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
172 ## 'character', or 'end-of-file'
173 ## ->{name} (DOCTYPE), ->{tag_name} (start tag, end tag)
174 ## ->{public_identifier} (DOCTYPE)
175 ## ->{system_identifier} (DOCTYPE)
176 ## ->{correct} == 1 or 0 (DOCTYPE)
177 ## ->{attributes} isa HASH (start tag, end tag)
178 ## ->{data} (comment, character)
179
180 ## Emitted token MUST immediately be handled by the tree construction state.
181
182 ## Before each step, UA MAY check to see if either one of the scripts in
183 ## "list of scripts that will execute as soon as possible" or the first
184 ## script in the "list of scripts that will execute asynchronously",
185 ## has completed loading. If one has, then it MUST be executed
186 ## and removed from the list.
187
188 sub _get_next_token ($) {
189 my $self = shift;
190 if (@{$self->{token}}) {
191 return shift @{$self->{token}};
192 }
193
194 A: {
195 if ($self->{state} eq 'data') {
196 if ($self->{next_input_character} == 0x0026) { # &
197 if ($self->{content_model_flag} eq 'PCDATA' or
198 $self->{content_model_flag} eq 'RCDATA') {
199 $self->{state} = 'entity data';
200 !!!next-input-character;
201 redo A;
202 } else {
203 #
204 }
205 } elsif ($self->{next_input_character} == 0x002D) { # -
206 if ($self->{content_model_flag} eq 'RCDATA' or
207 $self->{content_model_flag} eq 'CDATA') {
208 unless ($self->{escape}) {
209 if ($self->{prev_input_character}->[0] == 0x002D and # -
210 $self->{prev_input_character}->[1] == 0x0021 and # !
211 $self->{prev_input_character}->[2] == 0x003C) { # <
212 $self->{escape} = 1;
213 }
214 }
215 }
216
217 #
218 } elsif ($self->{next_input_character} == 0x003C) { # <
219 if ($self->{content_model_flag} eq 'PCDATA' or
220 (($self->{content_model_flag} eq 'CDATA' or
221 $self->{content_model_flag} eq 'RCDATA') and
222 not $self->{escape})) {
223 $self->{state} = 'tag open';
224 !!!next-input-character;
225 redo A;
226 } else {
227 #
228 }
229 } elsif ($self->{next_input_character} == 0x003E) { # >
230 if ($self->{escape} and
231 ($self->{content_model_flag} eq 'RCDATA' or
232 $self->{content_model_flag} eq 'CDATA')) {
233 if ($self->{prev_input_character}->[0] == 0x002D and # -
234 $self->{prev_input_character}->[1] == 0x002D) { # -
235 delete $self->{escape};
236 }
237 }
238
239 #
240 } elsif ($self->{next_input_character} == -1) {
241 !!!emit ({type => 'end-of-file'});
242 last A; ## TODO: ok?
243 }
244 # Anything else
245 my $token = {type => 'character',
246 data => chr $self->{next_input_character}};
247 ## Stay in the data state
248 !!!next-input-character;
249
250 !!!emit ($token);
251
252 redo A;
253 } elsif ($self->{state} eq 'entity data') {
254 ## (cannot happen in CDATA state)
255
256 my $token = $self->_tokenize_attempt_to_consume_an_entity (0);
257
258 $self->{state} = 'data';
259 # next-input-character is already done
260
261 unless (defined $token) {
262 !!!emit ({type => 'character', data => '&'});
263 } else {
264 !!!emit ($token);
265 }
266
267 redo A;
268 } elsif ($self->{state} eq 'tag open') {
269 if ($self->{content_model_flag} eq 'RCDATA' or
270 $self->{content_model_flag} eq 'CDATA') {
271 if ($self->{next_input_character} == 0x002F) { # /
272 !!!next-input-character;
273 $self->{state} = 'close tag open';
274 redo A;
275 } else {
276 ## reconsume
277 $self->{state} = 'data';
278
279 !!!emit ({type => 'character', data => '<'});
280
281 redo A;
282 }
283 } elsif ($self->{content_model_flag} eq 'PCDATA') {
284 if ($self->{next_input_character} == 0x0021) { # !
285 $self->{state} = 'markup declaration open';
286 !!!next-input-character;
287 redo A;
288 } elsif ($self->{next_input_character} == 0x002F) { # /
289 $self->{state} = 'close tag open';
290 !!!next-input-character;
291 redo A;
292 } elsif (0x0041 <= $self->{next_input_character} and
293 $self->{next_input_character} <= 0x005A) { # A..Z
294 $self->{current_token}
295 = {type => 'start tag',
296 tag_name => chr ($self->{next_input_character} + 0x0020)};
297 $self->{state} = 'tag name';
298 !!!next-input-character;
299 redo A;
300 } elsif (0x0061 <= $self->{next_input_character} and
301 $self->{next_input_character} <= 0x007A) { # a..z
302 $self->{current_token} = {type => 'start tag',
303 tag_name => chr ($self->{next_input_character})};
304 $self->{state} = 'tag name';
305 !!!next-input-character;
306 redo A;
307 } elsif ($self->{next_input_character} == 0x003E) { # >
308 !!!parse-error (type => 'empty start tag');
309 $self->{state} = 'data';
310 !!!next-input-character;
311
312 !!!emit ({type => 'character', data => '<>'});
313
314 redo A;
315 } elsif ($self->{next_input_character} == 0x003F) { # ?
316 !!!parse-error (type => 'pio');
317 $self->{state} = 'bogus comment';
318 ## $self->{next_input_character} is intentionally left as is
319 redo A;
320 } else {
321 !!!parse-error (type => 'bare stago');
322 $self->{state} = 'data';
323 ## reconsume
324
325 !!!emit ({type => 'character', data => '<'});
326
327 redo A;
328 }
329 } else {
330 die "$0: $self->{content_model_flag}: Unknown content model flag";
331 }
332 } elsif ($self->{state} eq 'close tag open') {
333 if ($self->{content_model_flag} eq 'RCDATA' or
334 $self->{content_model_flag} eq 'CDATA') {
335 if (defined $self->{last_emitted_start_tag_name}) {
336 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
337 my @next_char;
338 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
339 push @next_char, $self->{next_input_character};
340 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
341 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
342 if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
343 !!!next-input-character;
344 next TAGNAME;
345 } else {
346 $self->{next_input_character} = shift @next_char; # reconsume
347 !!!back-next-input-character (@next_char);
348 $self->{state} = 'data';
349
350 !!!emit ({type => 'character', data => '</'});
351
352 redo A;
353 }
354 }
355 push @next_char, $self->{next_input_character};
356
357 unless ($self->{next_input_character} == 0x0009 or # HT
358 $self->{next_input_character} == 0x000A or # LF
359 $self->{next_input_character} == 0x000B or # VT
360 $self->{next_input_character} == 0x000C or # FF
361 $self->{next_input_character} == 0x0020 or # SP
362 $self->{next_input_character} == 0x003E or # >
363 $self->{next_input_character} == 0x002F or # /
364 $self->{next_input_character} == -1) {
365 $self->{next_input_character} = shift @next_char; # reconsume
366 !!!back-next-input-character (@next_char);
367 $self->{state} = 'data';
368 !!!emit ({type => 'character', data => '</'});
369 redo A;
370 } else {
371 $self->{next_input_character} = shift @next_char;
372 !!!back-next-input-character (@next_char);
373 # and consume...
374 }
375 } else {
376 ## No start tag token has ever been emitted
377 # next-input-character is already done
378 $self->{state} = 'data';
379 !!!emit ({type => 'character', data => '</'});
380 redo A;
381 }
382 }
383
384 if (0x0041 <= $self->{next_input_character} and
385 $self->{next_input_character} <= 0x005A) { # A..Z
386 $self->{current_token} = {type => 'end tag',
387 tag_name => chr ($self->{next_input_character} + 0x0020)};
388 $self->{state} = 'tag name';
389 !!!next-input-character;
390 redo A;
391 } elsif (0x0061 <= $self->{next_input_character} and
392 $self->{next_input_character} <= 0x007A) { # a..z
393 $self->{current_token} = {type => 'end tag',
394 tag_name => chr ($self->{next_input_character})};
395 $self->{state} = 'tag name';
396 !!!next-input-character;
397 redo A;
398 } elsif ($self->{next_input_character} == 0x003E) { # >
399 !!!parse-error (type => 'empty end tag');
400 $self->{state} = 'data';
401 !!!next-input-character;
402 redo A;
403 } elsif ($self->{next_input_character} == -1) {
404 !!!parse-error (type => 'bare etago');
405 $self->{state} = 'data';
406 # reconsume
407
408 !!!emit ({type => 'character', data => '</'});
409
410 redo A;
411 } else {
412 !!!parse-error (type => 'bogus end tag');
413 $self->{state} = 'bogus comment';
414 ## $self->{next_input_character} is intentionally left as is
415 redo A;
416 }
417 } elsif ($self->{state} eq 'tag name') {
418 if ($self->{next_input_character} == 0x0009 or # HT
419 $self->{next_input_character} == 0x000A or # LF
420 $self->{next_input_character} == 0x000B or # VT
421 $self->{next_input_character} == 0x000C or # FF
422 $self->{next_input_character} == 0x0020) { # SP
423 $self->{state} = 'before attribute name';
424 !!!next-input-character;
425 redo A;
426 } elsif ($self->{next_input_character} == 0x003E) { # >
427 if ($self->{current_token}->{type} eq 'start tag') {
428 $self->{current_token}->{first_start_tag}
429 = not defined $self->{last_emitted_start_tag_name};
430 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
431 } elsif ($self->{current_token}->{type} eq 'end tag') {
432 $self->{content_model_flag} = 'PCDATA'; # MUST
433 if ($self->{current_token}->{attributes}) {
434 !!!parse-error (type => 'end tag attribute');
435 }
436 } else {
437 die "$0: $self->{current_token}->{type}: Unknown token type";
438 }
439 $self->{state} = 'data';
440 !!!next-input-character;
441
442 !!!emit ($self->{current_token}); # start tag or end tag
443
444 redo A;
445 } elsif (0x0041 <= $self->{next_input_character} and
446 $self->{next_input_character} <= 0x005A) { # A..Z
447 $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
448 # start tag or end tag
449 ## Stay in this state
450 !!!next-input-character;
451 redo A;
452 } elsif ($self->{next_input_character} == -1) {
453 !!!parse-error (type => 'unclosed tag');
454 if ($self->{current_token}->{type} eq 'start tag') {
455 $self->{current_token}->{first_start_tag}
456 = not defined $self->{last_emitted_start_tag_name};
457 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
458 } elsif ($self->{current_token}->{type} eq 'end tag') {
459 $self->{content_model_flag} = 'PCDATA'; # MUST
460 if ($self->{current_token}->{attributes}) {
461 !!!parse-error (type => 'end tag attribute');
462 }
463 } else {
464 die "$0: $self->{current_token}->{type}: Unknown token type";
465 }
466 $self->{state} = 'data';
467 # reconsume
468
469 !!!emit ($self->{current_token}); # start tag or end tag
470
471 redo A;
472 } elsif ($self->{next_input_character} == 0x002F) { # /
473 !!!next-input-character;
474 if ($self->{next_input_character} == 0x003E and # >
475 $self->{current_token}->{type} eq 'start tag' and
476 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
477 # permitted slash
478 #
479 } else {
480 !!!parse-error (type => 'nestc');
481 }
482 $self->{state} = 'before attribute name';
483 # next-input-character is already done
484 redo A;
485 } else {
486 $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
487 # start tag or end tag
488 ## Stay in the state
489 !!!next-input-character;
490 redo A;
491 }
492 } elsif ($self->{state} eq 'before attribute name') {
493 if ($self->{next_input_character} == 0x0009 or # HT
494 $self->{next_input_character} == 0x000A or # LF
495 $self->{next_input_character} == 0x000B or # VT
496 $self->{next_input_character} == 0x000C or # FF
497 $self->{next_input_character} == 0x0020) { # SP
498 ## Stay in the state
499 !!!next-input-character;
500 redo A;
501 } elsif ($self->{next_input_character} == 0x003E) { # >
502 if ($self->{current_token}->{type} eq 'start tag') {
503 $self->{current_token}->{first_start_tag}
504 = not defined $self->{last_emitted_start_tag_name};
505 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
506 } elsif ($self->{current_token}->{type} eq 'end tag') {
507 $self->{content_model_flag} = 'PCDATA'; # MUST
508 if ($self->{current_token}->{attributes}) {
509 !!!parse-error (type => 'end tag attribute');
510 }
511 } else {
512 die "$0: $self->{current_token}->{type}: Unknown token type";
513 }
514 $self->{state} = 'data';
515 !!!next-input-character;
516
517 !!!emit ($self->{current_token}); # start tag or end tag
518
519 redo A;
520 } elsif (0x0041 <= $self->{next_input_character} and
521 $self->{next_input_character} <= 0x005A) { # A..Z
522 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
523 value => ''};
524 $self->{state} = 'attribute name';
525 !!!next-input-character;
526 redo A;
527 } elsif ($self->{next_input_character} == 0x002F) { # /
528 !!!next-input-character;
529 if ($self->{next_input_character} == 0x003E and # >
530 $self->{current_token}->{type} eq 'start tag' and
531 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
532 # permitted slash
533 #
534 } else {
535 !!!parse-error (type => 'nestc');
536 }
537 ## Stay in the state
538 # next-input-character is already done
539 redo A;
540 } elsif ($self->{next_input_character} == -1) {
541 !!!parse-error (type => 'unclosed tag');
542 if ($self->{current_token}->{type} eq 'start tag') {
543 $self->{current_token}->{first_start_tag}
544 = not defined $self->{last_emitted_start_tag_name};
545 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
546 } elsif ($self->{current_token}->{type} eq 'end tag') {
547 $self->{content_model_flag} = 'PCDATA'; # MUST
548 if ($self->{current_token}->{attributes}) {
549 !!!parse-error (type => 'end tag attribute');
550 }
551 } else {
552 die "$0: $self->{current_token}->{type}: Unknown token type";
553 }
554 $self->{state} = 'data';
555 # reconsume
556
557 !!!emit ($self->{current_token}); # start tag or end tag
558
559 redo A;
560 } else {
561 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
562 value => ''};
563 $self->{state} = 'attribute name';
564 !!!next-input-character;
565 redo A;
566 }
567 } elsif ($self->{state} eq 'attribute name') {
568 my $before_leave = sub {
569 if (exists $self->{current_token}->{attributes} # start tag or end tag
570 ->{$self->{current_attribute}->{name}}) { # MUST
571 !!!parse-error (type => 'dupulicate attribute');
572 ## Discard $self->{current_attribute} # MUST
573 } else {
574 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
575 = $self->{current_attribute};
576 }
577 }; # $before_leave
578
579 if ($self->{next_input_character} == 0x0009 or # HT
580 $self->{next_input_character} == 0x000A or # LF
581 $self->{next_input_character} == 0x000B or # VT
582 $self->{next_input_character} == 0x000C or # FF
583 $self->{next_input_character} == 0x0020) { # SP
584 $before_leave->();
585 $self->{state} = 'after attribute name';
586 !!!next-input-character;
587 redo A;
588 } elsif ($self->{next_input_character} == 0x003D) { # =
589 $before_leave->();
590 $self->{state} = 'before attribute value';
591 !!!next-input-character;
592 redo A;
593 } elsif ($self->{next_input_character} == 0x003E) { # >
594 $before_leave->();
595 if ($self->{current_token}->{type} eq 'start tag') {
596 $self->{current_token}->{first_start_tag}
597 = not defined $self->{last_emitted_start_tag_name};
598 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
599 } elsif ($self->{current_token}->{type} eq 'end tag') {
600 $self->{content_model_flag} = 'PCDATA'; # MUST
601 if ($self->{current_token}->{attributes}) {
602 !!!parse-error (type => 'end tag attribute');
603 }
604 } else {
605 die "$0: $self->{current_token}->{type}: Unknown token type";
606 }
607 $self->{state} = 'data';
608 !!!next-input-character;
609
610 !!!emit ($self->{current_token}); # start tag or end tag
611
612 redo A;
613 } elsif (0x0041 <= $self->{next_input_character} and
614 $self->{next_input_character} <= 0x005A) { # A..Z
615 $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
616 ## Stay in the state
617 !!!next-input-character;
618 redo A;
619 } elsif ($self->{next_input_character} == 0x002F) { # /
620 $before_leave->();
621 !!!next-input-character;
622 if ($self->{next_input_character} == 0x003E and # >
623 $self->{current_token}->{type} eq 'start tag' and
624 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
625 # permitted slash
626 #
627 } else {
628 !!!parse-error (type => 'nestc');
629 }
630 $self->{state} = 'before attribute name';
631 # next-input-character is already done
632 redo A;
633 } elsif ($self->{next_input_character} == -1) {
634 !!!parse-error (type => 'unclosed tag');
635 $before_leave->();
636 if ($self->{current_token}->{type} eq 'start tag') {
637 $self->{current_token}->{first_start_tag}
638 = not defined $self->{last_emitted_start_tag_name};
639 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
640 } elsif ($self->{current_token}->{type} eq 'end tag') {
641 $self->{content_model_flag} = 'PCDATA'; # MUST
642 if ($self->{current_token}->{attributes}) {
643 !!!parse-error (type => 'end tag attribute');
644 }
645 } else {
646 die "$0: $self->{current_token}->{type}: Unknown token type";
647 }
648 $self->{state} = 'data';
649 # reconsume
650
651 !!!emit ($self->{current_token}); # start tag or end tag
652
653 redo A;
654 } else {
655 $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
656 ## Stay in the state
657 !!!next-input-character;
658 redo A;
659 }
660 } elsif ($self->{state} eq 'after attribute name') {
661 if ($self->{next_input_character} == 0x0009 or # HT
662 $self->{next_input_character} == 0x000A or # LF
663 $self->{next_input_character} == 0x000B or # VT
664 $self->{next_input_character} == 0x000C or # FF
665 $self->{next_input_character} == 0x0020) { # SP
666 ## Stay in the state
667 !!!next-input-character;
668 redo A;
669 } elsif ($self->{next_input_character} == 0x003D) { # =
670 $self->{state} = 'before attribute value';
671 !!!next-input-character;
672 redo A;
673 } elsif ($self->{next_input_character} == 0x003E) { # >
674 if ($self->{current_token}->{type} eq 'start tag') {
675 $self->{current_token}->{first_start_tag}
676 = not defined $self->{last_emitted_start_tag_name};
677 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
678 } elsif ($self->{current_token}->{type} eq 'end tag') {
679 $self->{content_model_flag} = 'PCDATA'; # MUST
680 if ($self->{current_token}->{attributes}) {
681 !!!parse-error (type => 'end tag attribute');
682 }
683 } else {
684 die "$0: $self->{current_token}->{type}: Unknown token type";
685 }
686 $self->{state} = 'data';
687 !!!next-input-character;
688
689 !!!emit ($self->{current_token}); # start tag or end tag
690
691 redo A;
692 } elsif (0x0041 <= $self->{next_input_character} and
693 $self->{next_input_character} <= 0x005A) { # A..Z
694 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
695 value => ''};
696 $self->{state} = 'attribute name';
697 !!!next-input-character;
698 redo A;
699 } elsif ($self->{next_input_character} == 0x002F) { # /
700 !!!next-input-character;
701 if ($self->{next_input_character} == 0x003E and # >
702 $self->{current_token}->{type} eq 'start tag' and
703 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
704 # permitted slash
705 #
706 } else {
707 !!!parse-error (type => 'nestc');
708 ## TODO: Different error type for <aa / bb> than <aa/>
709 }
710 $self->{state} = 'before attribute name';
711 # next-input-character is already done
712 redo A;
713 } elsif ($self->{next_input_character} == -1) {
714 !!!parse-error (type => 'unclosed tag');
715 if ($self->{current_token}->{type} eq 'start tag') {
716 $self->{current_token}->{first_start_tag}
717 = not defined $self->{last_emitted_start_tag_name};
718 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
719 } elsif ($self->{current_token}->{type} eq 'end tag') {
720 $self->{content_model_flag} = 'PCDATA'; # MUST
721 if ($self->{current_token}->{attributes}) {
722 !!!parse-error (type => 'end tag attribute');
723 }
724 } else {
725 die "$0: $self->{current_token}->{type}: Unknown token type";
726 }
727 $self->{state} = 'data';
728 # reconsume
729
730 !!!emit ($self->{current_token}); # start tag or end tag
731
732 redo A;
733 } else {
734 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
735 value => ''};
736 $self->{state} = 'attribute name';
737 !!!next-input-character;
738 redo A;
739 }
740 } elsif ($self->{state} eq 'before attribute value') {
741 if ($self->{next_input_character} == 0x0009 or # HT
742 $self->{next_input_character} == 0x000A or # LF
743 $self->{next_input_character} == 0x000B or # VT
744 $self->{next_input_character} == 0x000C or # FF
745 $self->{next_input_character} == 0x0020) { # SP
746 ## Stay in the state
747 !!!next-input-character;
748 redo A;
749 } elsif ($self->{next_input_character} == 0x0022) { # "
750 $self->{state} = 'attribute value (double-quoted)';
751 !!!next-input-character;
752 redo A;
753 } elsif ($self->{next_input_character} == 0x0026) { # &
754 $self->{state} = 'attribute value (unquoted)';
755 ## reconsume
756 redo A;
757 } elsif ($self->{next_input_character} == 0x0027) { # '
758 $self->{state} = 'attribute value (single-quoted)';
759 !!!next-input-character;
760 redo A;
761 } elsif ($self->{next_input_character} == 0x003E) { # >
762 if ($self->{current_token}->{type} eq 'start tag') {
763 $self->{current_token}->{first_start_tag}
764 = not defined $self->{last_emitted_start_tag_name};
765 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
766 } elsif ($self->{current_token}->{type} eq 'end tag') {
767 $self->{content_model_flag} = 'PCDATA'; # MUST
768 if ($self->{current_token}->{attributes}) {
769 !!!parse-error (type => 'end tag attribute');
770 }
771 } else {
772 die "$0: $self->{current_token}->{type}: Unknown token type";
773 }
774 $self->{state} = 'data';
775 !!!next-input-character;
776
777 !!!emit ($self->{current_token}); # start tag or end tag
778
779 redo A;
780 } elsif ($self->{next_input_character} == -1) {
781 !!!parse-error (type => 'unclosed tag');
782 if ($self->{current_token}->{type} eq 'start tag') {
783 $self->{current_token}->{first_start_tag}
784 = not defined $self->{last_emitted_start_tag_name};
785 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
786 } elsif ($self->{current_token}->{type} eq 'end tag') {
787 $self->{content_model_flag} = 'PCDATA'; # MUST
788 if ($self->{current_token}->{attributes}) {
789 !!!parse-error (type => 'end tag attribute');
790 }
791 } else {
792 die "$0: $self->{current_token}->{type}: Unknown token type";
793 }
794 $self->{state} = 'data';
795 ## reconsume
796
797 !!!emit ($self->{current_token}); # start tag or end tag
798
799 redo A;
800 } else {
801 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
802 $self->{state} = 'attribute value (unquoted)';
803 !!!next-input-character;
804 redo A;
805 }
806 } elsif ($self->{state} eq 'attribute value (double-quoted)') {
807 if ($self->{next_input_character} == 0x0022) { # "
808 $self->{state} = 'before attribute name';
809 !!!next-input-character;
810 redo A;
811 } elsif ($self->{next_input_character} == 0x0026) { # &
812 $self->{last_attribute_value_state} = 'attribute value (double-quoted)';
813 $self->{state} = 'entity in attribute value';
814 !!!next-input-character;
815 redo A;
816 } elsif ($self->{next_input_character} == -1) {
817 !!!parse-error (type => 'unclosed attribute value');
818 if ($self->{current_token}->{type} eq 'start tag') {
819 $self->{current_token}->{first_start_tag}
820 = not defined $self->{last_emitted_start_tag_name};
821 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
822 } elsif ($self->{current_token}->{type} eq 'end tag') {
823 $self->{content_model_flag} = 'PCDATA'; # MUST
824 if ($self->{current_token}->{attributes}) {
825 !!!parse-error (type => 'end tag attribute');
826 }
827 } else {
828 die "$0: $self->{current_token}->{type}: Unknown token type";
829 }
830 $self->{state} = 'data';
831 ## reconsume
832
833 !!!emit ($self->{current_token}); # start tag or end tag
834
835 redo A;
836 } else {
837 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
838 ## Stay in the state
839 !!!next-input-character;
840 redo A;
841 }
842 } elsif ($self->{state} eq 'attribute value (single-quoted)') {
843 if ($self->{next_input_character} == 0x0027) { # '
844 $self->{state} = 'before attribute name';
845 !!!next-input-character;
846 redo A;
847 } elsif ($self->{next_input_character} == 0x0026) { # &
848 $self->{last_attribute_value_state} = 'attribute value (single-quoted)';
849 $self->{state} = 'entity in attribute value';
850 !!!next-input-character;
851 redo A;
852 } elsif ($self->{next_input_character} == -1) {
853 !!!parse-error (type => 'unclosed attribute value');
854 if ($self->{current_token}->{type} eq 'start tag') {
855 $self->{current_token}->{first_start_tag}
856 = not defined $self->{last_emitted_start_tag_name};
857 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
858 } elsif ($self->{current_token}->{type} eq 'end tag') {
859 $self->{content_model_flag} = 'PCDATA'; # MUST
860 if ($self->{current_token}->{attributes}) {
861 !!!parse-error (type => 'end tag attribute');
862 }
863 } else {
864 die "$0: $self->{current_token}->{type}: Unknown token type";
865 }
866 $self->{state} = 'data';
867 ## reconsume
868
869 !!!emit ($self->{current_token}); # start tag or end tag
870
871 redo A;
872 } else {
873 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
874 ## Stay in the state
875 !!!next-input-character;
876 redo A;
877 }
878 } elsif ($self->{state} eq 'attribute value (unquoted)') {
879 if ($self->{next_input_character} == 0x0009 or # HT
880 $self->{next_input_character} == 0x000A or # LF
881 $self->{next_input_character} == 0x000B or # HT
882 $self->{next_input_character} == 0x000C or # FF
883 $self->{next_input_character} == 0x0020) { # SP
884 $self->{state} = 'before attribute name';
885 !!!next-input-character;
886 redo A;
887 } elsif ($self->{next_input_character} == 0x0026) { # &
888 $self->{last_attribute_value_state} = 'attribute value (unquoted)';
889 $self->{state} = 'entity in attribute value';
890 !!!next-input-character;
891 redo A;
892 } elsif ($self->{next_input_character} == 0x003E) { # >
893 if ($self->{current_token}->{type} eq 'start tag') {
894 $self->{current_token}->{first_start_tag}
895 = not defined $self->{last_emitted_start_tag_name};
896 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
897 } elsif ($self->{current_token}->{type} eq 'end tag') {
898 $self->{content_model_flag} = 'PCDATA'; # MUST
899 if ($self->{current_token}->{attributes}) {
900 !!!parse-error (type => 'end tag attribute');
901 }
902 } else {
903 die "$0: $self->{current_token}->{type}: Unknown token type";
904 }
905 $self->{state} = 'data';
906 !!!next-input-character;
907
908 !!!emit ($self->{current_token}); # start tag or end tag
909
910 redo A;
911 } elsif ($self->{next_input_character} == -1) {
912 !!!parse-error (type => 'unclosed tag');
913 if ($self->{current_token}->{type} eq 'start tag') {
914 $self->{current_token}->{first_start_tag}
915 = not defined $self->{last_emitted_start_tag_name};
916 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
917 } elsif ($self->{current_token}->{type} eq 'end tag') {
918 $self->{content_model_flag} = 'PCDATA'; # MUST
919 if ($self->{current_token}->{attributes}) {
920 !!!parse-error (type => 'end tag attribute');
921 }
922 } else {
923 die "$0: $self->{current_token}->{type}: Unknown token type";
924 }
925 $self->{state} = 'data';
926 ## reconsume
927
928 !!!emit ($self->{current_token}); # start tag or end tag
929
930 redo A;
931 } else {
932 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
933 ## Stay in the state
934 !!!next-input-character;
935 redo A;
936 }
937 } elsif ($self->{state} eq 'entity in attribute value') {
938 my $token = $self->_tokenize_attempt_to_consume_an_entity (1);
939
940 unless (defined $token) {
941 $self->{current_attribute}->{value} .= '&';
942 } else {
943 $self->{current_attribute}->{value} .= $token->{data};
944 ## ISSUE: spec says "append the returned character token to the current attribute's value"
945 }
946
947 $self->{state} = $self->{last_attribute_value_state};
948 # next-input-character is already done
949 redo A;
950 } elsif ($self->{state} eq 'bogus comment') {
951 ## (only happen if PCDATA state)
952
953 my $token = {type => 'comment', data => ''};
954
955 BC: {
956 if ($self->{next_input_character} == 0x003E) { # >
957 $self->{state} = 'data';
958 !!!next-input-character;
959
960 !!!emit ($token);
961
962 redo A;
963 } elsif ($self->{next_input_character} == -1) {
964 $self->{state} = 'data';
965 ## reconsume
966
967 !!!emit ($token);
968
969 redo A;
970 } else {
971 $token->{data} .= chr ($self->{next_input_character});
972 !!!next-input-character;
973 redo BC;
974 }
975 } # BC
976 } elsif ($self->{state} eq 'markup declaration open') {
977 ## (only happen if PCDATA state)
978
979 my @next_char;
980 push @next_char, $self->{next_input_character};
981
982 if ($self->{next_input_character} == 0x002D) { # -
983 !!!next-input-character;
984 push @next_char, $self->{next_input_character};
985 if ($self->{next_input_character} == 0x002D) { # -
986 $self->{current_token} = {type => 'comment', data => ''};
987 $self->{state} = 'comment start';
988 !!!next-input-character;
989 redo A;
990 }
991 } elsif ($self->{next_input_character} == 0x0044 or # D
992 $self->{next_input_character} == 0x0064) { # d
993 !!!next-input-character;
994 push @next_char, $self->{next_input_character};
995 if ($self->{next_input_character} == 0x004F or # O
996 $self->{next_input_character} == 0x006F) { # o
997 !!!next-input-character;
998 push @next_char, $self->{next_input_character};
999 if ($self->{next_input_character} == 0x0043 or # C
1000 $self->{next_input_character} == 0x0063) { # c
1001 !!!next-input-character;
1002 push @next_char, $self->{next_input_character};
1003 if ($self->{next_input_character} == 0x0054 or # T
1004 $self->{next_input_character} == 0x0074) { # t
1005 !!!next-input-character;
1006 push @next_char, $self->{next_input_character};
1007 if ($self->{next_input_character} == 0x0059 or # Y
1008 $self->{next_input_character} == 0x0079) { # y
1009 !!!next-input-character;
1010 push @next_char, $self->{next_input_character};
1011 if ($self->{next_input_character} == 0x0050 or # P
1012 $self->{next_input_character} == 0x0070) { # p
1013 !!!next-input-character;
1014 push @next_char, $self->{next_input_character};
1015 if ($self->{next_input_character} == 0x0045 or # E
1016 $self->{next_input_character} == 0x0065) { # e
1017 ## ISSUE: What a stupid code this is!
1018 $self->{state} = 'DOCTYPE';
1019 !!!next-input-character;
1020 redo A;
1021 }
1022 }
1023 }
1024 }
1025 }
1026 }
1027 }
1028
1029 !!!parse-error (type => 'bogus comment');
1030 $self->{next_input_character} = shift @next_char;
1031 !!!back-next-input-character (@next_char);
1032 $self->{state} = 'bogus comment';
1033 redo A;
1034
1035 ## ISSUE: typos in spec: chacacters, is is a parse error
1036 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1037 } elsif ($self->{state} eq 'comment start') {
1038 if ($self->{next_input_character} == 0x002D) { # -
1039 $self->{state} = 'comment start dash';
1040 !!!next-input-character;
1041 redo A;
1042 } elsif ($self->{next_input_character} == 0x003E) { # >
1043 !!!parse-error (type => 'bogus comment');
1044 $self->{state} = 'data';
1045 !!!next-input-character;
1046
1047 !!!emit ($self->{current_token}); # comment
1048
1049 redo A;
1050 } elsif ($self->{next_input_character} == -1) {
1051 !!!parse-error (type => 'unclosed comment');
1052 $self->{state} = 'data';
1053 ## reconsume
1054
1055 !!!emit ($self->{current_token}); # comment
1056
1057 redo A;
1058 } else {
1059 $self->{current_token}->{data} # comment
1060 .= chr ($self->{next_input_character});
1061 $self->{state} = 'comment';
1062 !!!next-input-character;
1063 redo A;
1064 }
1065 } elsif ($self->{state} eq 'comment start dash') {
1066 if ($self->{next_input_character} == 0x002D) { # -
1067 $self->{state} = 'comment end';
1068 !!!next-input-character;
1069 redo A;
1070 } elsif ($self->{next_input_character} == 0x003E) { # >
1071 !!!parse-error (type => 'bogus comment');
1072 $self->{state} = 'data';
1073 !!!next-input-character;
1074
1075 !!!emit ($self->{current_token}); # comment
1076
1077 redo A;
1078 } elsif ($self->{next_input_character} == -1) {
1079 !!!parse-error (type => 'unclosed comment');
1080 $self->{state} = 'data';
1081 ## reconsume
1082
1083 !!!emit ($self->{current_token}); # comment
1084
1085 redo A;
1086 } else {
1087 $self->{current_token}->{data} # comment
1088 .= '-' . chr ($self->{next_input_character});
1089 $self->{state} = 'comment';
1090 !!!next-input-character;
1091 redo A;
1092 }
1093 } elsif ($self->{state} eq 'comment') {
1094 if ($self->{next_input_character} == 0x002D) { # -
1095 $self->{state} = 'comment end dash';
1096 !!!next-input-character;
1097 redo A;
1098 } elsif ($self->{next_input_character} == -1) {
1099 !!!parse-error (type => 'unclosed comment');
1100 $self->{state} = 'data';
1101 ## reconsume
1102
1103 !!!emit ($self->{current_token}); # comment
1104
1105 redo A;
1106 } else {
1107 $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1108 ## Stay in the state
1109 !!!next-input-character;
1110 redo A;
1111 }
1112 } elsif ($self->{state} eq 'comment end dash') {
1113 if ($self->{next_input_character} == 0x002D) { # -
1114 $self->{state} = 'comment end';
1115 !!!next-input-character;
1116 redo A;
1117 } elsif ($self->{next_input_character} == -1) {
1118 !!!parse-error (type => 'unclosed comment');
1119 $self->{state} = 'data';
1120 ## reconsume
1121
1122 !!!emit ($self->{current_token}); # comment
1123
1124 redo A;
1125 } else {
1126 $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1127 $self->{state} = 'comment';
1128 !!!next-input-character;
1129 redo A;
1130 }
1131 } elsif ($self->{state} eq 'comment end') {
1132 if ($self->{next_input_character} == 0x003E) { # >
1133 $self->{state} = 'data';
1134 !!!next-input-character;
1135
1136 !!!emit ($self->{current_token}); # comment
1137
1138 redo A;
1139 } elsif ($self->{next_input_character} == 0x002D) { # -
1140 !!!parse-error (type => 'dash in comment');
1141 $self->{current_token}->{data} .= '-'; # comment
1142 ## Stay in the state
1143 !!!next-input-character;
1144 redo A;
1145 } elsif ($self->{next_input_character} == -1) {
1146 !!!parse-error (type => 'unclosed comment');
1147 $self->{state} = 'data';
1148 ## reconsume
1149
1150 !!!emit ($self->{current_token}); # comment
1151
1152 redo A;
1153 } else {
1154 !!!parse-error (type => 'dash in comment');
1155 $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1156 $self->{state} = 'comment';
1157 !!!next-input-character;
1158 redo A;
1159 }
1160 } elsif ($self->{state} eq 'DOCTYPE') {
1161 if ($self->{next_input_character} == 0x0009 or # HT
1162 $self->{next_input_character} == 0x000A or # LF
1163 $self->{next_input_character} == 0x000B or # VT
1164 $self->{next_input_character} == 0x000C or # FF
1165 $self->{next_input_character} == 0x0020) { # SP
1166 $self->{state} = 'before DOCTYPE name';
1167 !!!next-input-character;
1168 redo A;
1169 } else {
1170 !!!parse-error (type => 'no space before DOCTYPE name');
1171 $self->{state} = 'before DOCTYPE name';
1172 ## reconsume
1173 redo A;
1174 }
1175 } elsif ($self->{state} eq 'before DOCTYPE name') {
1176 if ($self->{next_input_character} == 0x0009 or # HT
1177 $self->{next_input_character} == 0x000A or # LF
1178 $self->{next_input_character} == 0x000B or # VT
1179 $self->{next_input_character} == 0x000C or # FF
1180 $self->{next_input_character} == 0x0020) { # SP
1181 ## Stay in the state
1182 !!!next-input-character;
1183 redo A;
1184 } elsif ($self->{next_input_character} == 0x003E) { # >
1185 !!!parse-error (type => 'no DOCTYPE name');
1186 $self->{state} = 'data';
1187 !!!next-input-character;
1188
1189 !!!emit ({type => 'DOCTYPE'}); # incorrect
1190
1191 redo A;
1192 } elsif ($self->{next_input_character} == -1) {
1193 !!!parse-error (type => 'no DOCTYPE name');
1194 $self->{state} = 'data';
1195 ## reconsume
1196
1197 !!!emit ({type => 'DOCTYPE'}); # incorrect
1198
1199 redo A;
1200 } else {
1201 $self->{current_token}
1202 = {type => 'DOCTYPE',
1203 name => chr ($self->{next_input_character}),
1204 correct => 1};
1205 ## ISSUE: "Set the token's name name to the" in the spec
1206 $self->{state} = 'DOCTYPE name';
1207 !!!next-input-character;
1208 redo A;
1209 }
1210 } elsif ($self->{state} eq 'DOCTYPE name') {
1211 ## ISSUE: Redundant "First," in the spec.
1212 if ($self->{next_input_character} == 0x0009 or # HT
1213 $self->{next_input_character} == 0x000A or # LF
1214 $self->{next_input_character} == 0x000B or # VT
1215 $self->{next_input_character} == 0x000C or # FF
1216 $self->{next_input_character} == 0x0020) { # SP
1217 $self->{state} = 'after DOCTYPE name';
1218 !!!next-input-character;
1219 redo A;
1220 } elsif ($self->{next_input_character} == 0x003E) { # >
1221 $self->{state} = 'data';
1222 !!!next-input-character;
1223
1224 !!!emit ($self->{current_token}); # DOCTYPE
1225
1226 redo A;
1227 } elsif ($self->{next_input_character} == -1) {
1228 !!!parse-error (type => 'unclosed DOCTYPE');
1229 $self->{state} = 'data';
1230 ## reconsume
1231
1232 delete $self->{current_token}->{correct};
1233 !!!emit ($self->{current_token}); # DOCTYPE
1234
1235 redo A;
1236 } else {
1237 $self->{current_token}->{name}
1238 .= chr ($self->{next_input_character}); # DOCTYPE
1239 ## Stay in the state
1240 !!!next-input-character;
1241 redo A;
1242 }
1243 } elsif ($self->{state} eq 'after DOCTYPE name') {
1244 if ($self->{next_input_character} == 0x0009 or # HT
1245 $self->{next_input_character} == 0x000A or # LF
1246 $self->{next_input_character} == 0x000B or # VT
1247 $self->{next_input_character} == 0x000C or # FF
1248 $self->{next_input_character} == 0x0020) { # SP
1249 ## Stay in the state
1250 !!!next-input-character;
1251 redo A;
1252 } elsif ($self->{next_input_character} == 0x003E) { # >
1253 $self->{state} = 'data';
1254 !!!next-input-character;
1255
1256 !!!emit ($self->{current_token}); # DOCTYPE
1257
1258 redo A;
1259 } elsif ($self->{next_input_character} == -1) {
1260 !!!parse-error (type => 'unclosed DOCTYPE');
1261 $self->{state} = 'data';
1262 ## reconsume
1263
1264 delete $self->{current_token}->{correct};
1265 !!!emit ($self->{current_token}); # DOCTYPE
1266
1267 redo A;
1268 } elsif ($self->{next_input_character} == 0x0050 or # P
1269 $self->{next_input_character} == 0x0070) { # p
1270 !!!next-input-character;
1271 if ($self->{next_input_character} == 0x0055 or # U
1272 $self->{next_input_character} == 0x0075) { # u
1273 !!!next-input-character;
1274 if ($self->{next_input_character} == 0x0042 or # B
1275 $self->{next_input_character} == 0x0062) { # b
1276 !!!next-input-character;
1277 if ($self->{next_input_character} == 0x004C or # L
1278 $self->{next_input_character} == 0x006C) { # l
1279 !!!next-input-character;
1280 if ($self->{next_input_character} == 0x0049 or # I
1281 $self->{next_input_character} == 0x0069) { # i
1282 !!!next-input-character;
1283 if ($self->{next_input_character} == 0x0043 or # C
1284 $self->{next_input_character} == 0x0063) { # c
1285 $self->{state} = 'before DOCTYPE public identifier';
1286 !!!next-input-character;
1287 redo A;
1288 }
1289 }
1290 }
1291 }
1292 }
1293
1294 #
1295 } elsif ($self->{next_input_character} == 0x0053 or # S
1296 $self->{next_input_character} == 0x0073) { # s
1297 !!!next-input-character;
1298 if ($self->{next_input_character} == 0x0059 or # Y
1299 $self->{next_input_character} == 0x0079) { # y
1300 !!!next-input-character;
1301 if ($self->{next_input_character} == 0x0053 or # S
1302 $self->{next_input_character} == 0x0073) { # s
1303 !!!next-input-character;
1304 if ($self->{next_input_character} == 0x0054 or # T
1305 $self->{next_input_character} == 0x0074) { # t
1306 !!!next-input-character;
1307 if ($self->{next_input_character} == 0x0045 or # E
1308 $self->{next_input_character} == 0x0065) { # e
1309 !!!next-input-character;
1310 if ($self->{next_input_character} == 0x004D or # M
1311 $self->{next_input_character} == 0x006D) { # m
1312 $self->{state} = 'before DOCTYPE system identifier';
1313 !!!next-input-character;
1314 redo A;
1315 }
1316 }
1317 }
1318 }
1319 }
1320
1321 #
1322 } else {
1323 !!!next-input-character;
1324 #
1325 }
1326
1327 !!!parse-error (type => 'string after DOCTYPE name');
1328 $self->{state} = 'bogus DOCTYPE';
1329 # next-input-character is already done
1330 redo A;
1331 } elsif ($self->{state} eq 'before DOCTYPE public identifier') {
1332 if ({
1333 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1334 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1335 }->{$self->{next_input_character}}) {
1336 ## Stay in the state
1337 !!!next-input-character;
1338 redo A;
1339 } elsif ($self->{next_input_character} eq 0x0022) { # "
1340 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1341 $self->{state} = 'DOCTYPE public identifier (double-quoted)';
1342 !!!next-input-character;
1343 redo A;
1344 } elsif ($self->{next_input_character} eq 0x0027) { # '
1345 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1346 $self->{state} = 'DOCTYPE public identifier (single-quoted)';
1347 !!!next-input-character;
1348 redo A;
1349 } elsif ($self->{next_input_character} eq 0x003E) { # >
1350 !!!parse-error (type => 'no PUBLIC literal');
1351
1352 $self->{state} = 'data';
1353 !!!next-input-character;
1354
1355 delete $self->{current_token}->{correct};
1356 !!!emit ($self->{current_token}); # DOCTYPE
1357
1358 redo A;
1359 } elsif ($self->{next_input_character} == -1) {
1360 !!!parse-error (type => 'unclosed DOCTYPE');
1361
1362 $self->{state} = 'data';
1363 ## reconsume
1364
1365 delete $self->{current_token}->{correct};
1366 !!!emit ($self->{current_token}); # DOCTYPE
1367
1368 redo A;
1369 } else {
1370 !!!parse-error (type => 'string after PUBLIC');
1371 $self->{state} = 'bogus DOCTYPE';
1372 !!!next-input-character;
1373 redo A;
1374 }
1375 } elsif ($self->{state} eq 'DOCTYPE public identifier (double-quoted)') {
1376 if ($self->{next_input_character} == 0x0022) { # "
1377 $self->{state} = 'after DOCTYPE public identifier';
1378 !!!next-input-character;
1379 redo A;
1380 } elsif ($self->{next_input_character} == -1) {
1381 !!!parse-error (type => 'unclosed PUBLIC literal');
1382
1383 $self->{state} = 'data';
1384 ## reconsume
1385
1386 delete $self->{current_token}->{correct};
1387 !!!emit ($self->{current_token}); # DOCTYPE
1388
1389 redo A;
1390 } else {
1391 $self->{current_token}->{public_identifier} # DOCTYPE
1392 .= chr $self->{next_input_character};
1393 ## Stay in the state
1394 !!!next-input-character;
1395 redo A;
1396 }
1397 } elsif ($self->{state} eq 'DOCTYPE public identifier (single-quoted)') {
1398 if ($self->{next_input_character} == 0x0027) { # '
1399 $self->{state} = 'after DOCTYPE public identifier';
1400 !!!next-input-character;
1401 redo A;
1402 } elsif ($self->{next_input_character} == -1) {
1403 !!!parse-error (type => 'unclosed PUBLIC literal');
1404
1405 $self->{state} = 'data';
1406 ## reconsume
1407
1408 delete $self->{current_token}->{correct};
1409 !!!emit ($self->{current_token}); # DOCTYPE
1410
1411 redo A;
1412 } else {
1413 $self->{current_token}->{public_identifier} # DOCTYPE
1414 .= chr $self->{next_input_character};
1415 ## Stay in the state
1416 !!!next-input-character;
1417 redo A;
1418 }
1419 } elsif ($self->{state} eq 'after DOCTYPE public identifier') {
1420 if ({
1421 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1422 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1423 }->{$self->{next_input_character}}) {
1424 ## Stay in the state
1425 !!!next-input-character;
1426 redo A;
1427 } elsif ($self->{next_input_character} == 0x0022) { # "
1428 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1429 $self->{state} = 'DOCTYPE system identifier (double-quoted)';
1430 !!!next-input-character;
1431 redo A;
1432 } elsif ($self->{next_input_character} == 0x0027) { # '
1433 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1434 $self->{state} = 'DOCTYPE system identifier (single-quoted)';
1435 !!!next-input-character;
1436 redo A;
1437 } elsif ($self->{next_input_character} == 0x003E) { # >
1438 $self->{state} = 'data';
1439 !!!next-input-character;
1440
1441 !!!emit ($self->{current_token}); # DOCTYPE
1442
1443 redo A;
1444 } elsif ($self->{next_input_character} == -1) {
1445 !!!parse-error (type => 'unclosed DOCTYPE');
1446
1447 $self->{state} = 'data';
1448 ## reconsume
1449
1450 delete $self->{current_token}->{correct};
1451 !!!emit ($self->{current_token}); # DOCTYPE
1452
1453 redo A;
1454 } else {
1455 !!!parse-error (type => 'string after PUBLIC literal');
1456 $self->{state} = 'bogus DOCTYPE';
1457 !!!next-input-character;
1458 redo A;
1459 }
1460 } elsif ($self->{state} eq 'before DOCTYPE system identifier') {
1461 if ({
1462 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1463 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1464 }->{$self->{next_input_character}}) {
1465 ## Stay in the state
1466 !!!next-input-character;
1467 redo A;
1468 } elsif ($self->{next_input_character} == 0x0022) { # "
1469 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1470 $self->{state} = 'DOCTYPE system identifier (double-quoted)';
1471 !!!next-input-character;
1472 redo A;
1473 } elsif ($self->{next_input_character} == 0x0027) { # '
1474 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1475 $self->{state} = 'DOCTYPE system identifier (single-quoted)';
1476 !!!next-input-character;
1477 redo A;
1478 } elsif ($self->{next_input_character} == 0x003E) { # >
1479 !!!parse-error (type => 'no SYSTEM literal');
1480 $self->{state} = 'data';
1481 !!!next-input-character;
1482
1483 delete $self->{current_token}->{correct};
1484 !!!emit ($self->{current_token}); # DOCTYPE
1485
1486 redo A;
1487 } elsif ($self->{next_input_character} == -1) {
1488 !!!parse-error (type => 'unclosed DOCTYPE');
1489
1490 $self->{state} = 'data';
1491 ## reconsume
1492
1493 delete $self->{current_token}->{correct};
1494 !!!emit ($self->{current_token}); # DOCTYPE
1495
1496 redo A;
1497 } else {
1498 !!!parse-error (type => 'string after SYSTEM');
1499 $self->{state} = 'bogus DOCTYPE';
1500 !!!next-input-character;
1501 redo A;
1502 }
1503 } elsif ($self->{state} eq 'DOCTYPE system identifier (double-quoted)') {
1504 if ($self->{next_input_character} == 0x0022) { # "
1505 $self->{state} = 'after DOCTYPE system identifier';
1506 !!!next-input-character;
1507 redo A;
1508 } elsif ($self->{next_input_character} == -1) {
1509 !!!parse-error (type => 'unclosed SYSTEM literal');
1510
1511 $self->{state} = 'data';
1512 ## reconsume
1513
1514 delete $self->{current_token}->{correct};
1515 !!!emit ($self->{current_token}); # DOCTYPE
1516
1517 redo A;
1518 } else {
1519 $self->{current_token}->{system_identifier} # DOCTYPE
1520 .= chr $self->{next_input_character};
1521 ## Stay in the state
1522 !!!next-input-character;
1523 redo A;
1524 }
1525 } elsif ($self->{state} eq 'DOCTYPE system identifier (single-quoted)') {
1526 if ($self->{next_input_character} == 0x0027) { # '
1527 $self->{state} = 'after DOCTYPE system identifier';
1528 !!!next-input-character;
1529 redo A;
1530 } elsif ($self->{next_input_character} == -1) {
1531 !!!parse-error (type => 'unclosed SYSTEM literal');
1532
1533 $self->{state} = 'data';
1534 ## reconsume
1535
1536 delete $self->{current_token}->{correct};
1537 !!!emit ($self->{current_token}); # DOCTYPE
1538
1539 redo A;
1540 } else {
1541 $self->{current_token}->{system_identifier} # DOCTYPE
1542 .= chr $self->{next_input_character};
1543 ## Stay in the state
1544 !!!next-input-character;
1545 redo A;
1546 }
1547 } elsif ($self->{state} eq 'after DOCTYPE system identifier') {
1548 if ({
1549 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1550 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1551 }->{$self->{next_input_character}}) {
1552 ## Stay in the state
1553 !!!next-input-character;
1554 redo A;
1555 } elsif ($self->{next_input_character} == 0x003E) { # >
1556 $self->{state} = 'data';
1557 !!!next-input-character;
1558
1559 !!!emit ($self->{current_token}); # DOCTYPE
1560
1561 redo A;
1562 } elsif ($self->{next_input_character} == -1) {
1563 !!!parse-error (type => 'unclosed DOCTYPE');
1564
1565 $self->{state} = 'data';
1566 ## reconsume
1567
1568 delete $self->{current_token}->{correct};
1569 !!!emit ($self->{current_token}); # DOCTYPE
1570
1571 redo A;
1572 } else {
1573 !!!parse-error (type => 'string after SYSTEM literal');
1574 $self->{state} = 'bogus DOCTYPE';
1575 !!!next-input-character;
1576 redo A;
1577 }
1578 } elsif ($self->{state} eq 'bogus DOCTYPE') {
1579 if ($self->{next_input_character} == 0x003E) { # >
1580 $self->{state} = 'data';
1581 !!!next-input-character;
1582
1583 delete $self->{current_token}->{correct};
1584 !!!emit ($self->{current_token}); # DOCTYPE
1585
1586 redo A;
1587 } elsif ($self->{next_input_character} == -1) {
1588 !!!parse-error (type => 'unclosed DOCTYPE');
1589 $self->{state} = 'data';
1590 ## reconsume
1591
1592 delete $self->{current_token}->{correct};
1593 !!!emit ($self->{current_token}); # DOCTYPE
1594
1595 redo A;
1596 } else {
1597 ## Stay in the state
1598 !!!next-input-character;
1599 redo A;
1600 }
1601 } else {
1602 die "$0: $self->{state}: Unknown state";
1603 }
1604 } # A
1605
1606 die "$0: _get_next_token: unexpected case";
1607 } # _get_next_token
1608
## Attempt to consume an entity (character reference).
##
## Expected to be called after the "&" has been read, with
## |$self->{next_input_character}| holding the character that follows it.
##
## Arguments:
##   $self    - The parser object.
##   $in_attr - True if the reference occurs in an attribute value.  In
##              that case a named entity is only recognized when it is
##              terminated by ";".
##
## Returns a character token (|{type => 'character', data => ...}|), or
## |undef| if no entity was consumed.  When |undef| is returned the input
## is left such that the characters following the "&" are read again.
sub _tokenize_attempt_to_consume_an_entity ($$) {
  my ($self, $in_attr) = @_;

  if ({
       0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
       0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, &, -1 # 0x000D # CR
      }->{$self->{next_input_character}}) {
    ## Don't consume
    ## No error
    return undef;
  } elsif ($self->{next_input_character} == 0x0023) { # #
    !!!next-input-character;
    if ($self->{next_input_character} == 0x0078 or # x
        $self->{next_input_character} == 0x0058) { # X
      ## Hexadecimal character reference.  |$code| stays undefined until
      ## the first hexadecimal digit has been seen.
      my $code;
      X: {
        ## On the first pass this is the "x"/"X"; it is only used for
        ## pushing the character back when no digit follows.
        my $x_char = $self->{next_input_character};
        !!!next-input-character;
        if (0x0030 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x0039) { # 0..9
          $code ||= 0;
          $code *= 0x10;
          $code += $self->{next_input_character} - 0x0030;
          redo X;
        } elsif (0x0061 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x0066) { # a..f
          $code ||= 0;
          $code *= 0x10;
          $code += $self->{next_input_character} - 0x0060 + 9;
          redo X;
        } elsif (0x0041 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x0046) { # A..F
          $code ||= 0;
          $code *= 0x10;
          $code += $self->{next_input_character} - 0x0040 + 9;
          redo X;
        } elsif (not defined $code) { # no hexadecimal digit
          !!!parse-error (type => 'bare hcro');
          ## Push back both the "x"/"X" and the character that follows
          ## it before restoring "#" as the next input character.
          ## Pushing back only the "x" would silently drop the
          ## character after "&#x" from the input stream.  This mirrors
          ## what the decimal ("bare nero") case below does.
          !!!back-next-input-character ($x_char, $self->{next_input_character});
          $self->{next_input_character} = 0x0023; # #
          return undef;
        } elsif ($self->{next_input_character} == 0x003B) { # ;
          !!!next-input-character;
        } else {
          !!!parse-error (type => 'no refc');
        }

        ## Replace code points that must not be produced as they are.
        if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
          !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
          $code = 0xFFFD;
        } elsif ($code > 0x10FFFF) {
          !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
          $code = 0xFFFD;
        } elsif ($code == 0x000D) {
          !!!parse-error (type => 'CR character reference');
          $code = 0x000A;
        } elsif (0x80 <= $code and $code <= 0x9F) {
          !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
          $code = $c1_entity_char->{$code};
        }

        return {type => 'character', data => chr $code};
      } # X
    } elsif (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
      ## Decimal character reference.
      my $code = $self->{next_input_character} - 0x0030;
      !!!next-input-character;

      while (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
        $code *= 10;
        $code += $self->{next_input_character} - 0x0030;

        !!!next-input-character;
      }

      if ($self->{next_input_character} == 0x003B) { # ;
        !!!next-input-character;
      } else {
        !!!parse-error (type => 'no refc');
      }

      ## Replace code points that must not be produced as they are.
      if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
        !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
        $code = 0xFFFD;
      } elsif ($code > 0x10FFFF) {
        !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
        $code = 0xFFFD;
      } elsif ($code == 0x000D) {
        !!!parse-error (type => 'CR character reference');
        $code = 0x000A;
      } elsif (0x80 <= $code and $code <= 0x9F) {
        !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
        $code = $c1_entity_char->{$code};
      }

      return {type => 'character', data => chr $code};
    } else {
      ## "&#" followed by neither "x"/"X" nor a decimal digit.
      !!!parse-error (type => 'bare nero');
      !!!back-next-input-character ($self->{next_input_character});
      $self->{next_input_character} = 0x0023; # #
      return undef;
    }
  } elsif ((0x0041 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x005A) or
           (0x0061 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x007A)) {
    ## Named entity reference.
    my $entity_name = chr $self->{next_input_character};
    !!!next-input-character;

    ## |$value| accumulates the text to return.  |$match| is 1 if a
    ## name terminated by ";" matched, -1 if a name matched without
    ## ";" (outside of attribute values only), and 0 if nothing
    ## matched.
    my $value = $entity_name;
    my $match = 0;
    require Whatpm::_NamedEntityList;
    our $EntityChar;

    while (length $entity_name < 10 and
           ## NOTE: Some number greater than the maximum length of entity name
           ((0x0041 <= $self->{next_input_character} and # A
             $self->{next_input_character} <= 0x005A) or # Z
            (0x0061 <= $self->{next_input_character} and # a
             $self->{next_input_character} <= 0x007A) or # z
            (0x0030 <= $self->{next_input_character} and # 0
             $self->{next_input_character} <= 0x0039) or # 9
            $self->{next_input_character} == 0x003B)) { # ;
      $entity_name .= chr $self->{next_input_character};
      if (defined $EntityChar->{$entity_name}) {
        if ($self->{next_input_character} == 0x003B) { # ;
          $value = $EntityChar->{$entity_name};
          $match = 1;
          !!!next-input-character;
          last;
        } elsif (not $in_attr) {
          ## Matched without ";": remember it, but keep scanning since
          ## a longer name might still match.
          $value = $EntityChar->{$entity_name};
          $match = -1;
        } else {
          $value .= chr $self->{next_input_character};
        }
      } else {
        $value .= chr $self->{next_input_character};
      }
      !!!next-input-character;
    }

    if ($match > 0) {
      return {type => 'character', data => $value};
    } elsif ($match < 0) {
      !!!parse-error (type => 'no refc');
      return {type => 'character', data => $value};
    } else {
      !!!parse-error (type => 'bare ero');
      ## NOTE: No characters are consumed in the spec.
      return {type => 'character', data => '&'.$value};
    }
  } else {
    ## no characters are consumed
    !!!parse-error (type => 'bare ero');
    return undef;
  }
} # _tokenize_attempt_to_consume_an_entity
1768
## Prepare the document object for the tree construction stage.
## NOTE: $self->{document} MUST be specified before this method is called
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  ## Turn strict error checking off.
  $doc->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on

  ## Mark the document as an HTML document.
  $doc->manakai_is_html (1); # MUST
} # _initialize_tree_constructor
1777
## Restore the document object's settings after the tree construction
## stage.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  ## Turn strict error checking back on.
  $doc->strict_error_checking (1);
  ## TODO: Turn mutation events on
} # _terminate_tree_constructor
1783
1784 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
1785
1786 { # tree construction stage
1787 my $token;
1788
## Run the tree construction stage: fetch the first token, reset the
## tree construction state kept in the parser object, and then run the
## initial, root element and main steps in this order.
sub _construct_tree ($) {
  my $self = $_[0];

  ## When an interactive UA renders the $self->{document} available
  ## to the user, or when it begins accepting user input, is
  ## not defined.

  ## Append a character: collect it and all subsequent consecutive
  ## characters and insert one Text node whose data is the
  ## concatenation of all those characters. # MUST

  !!!next-token;

  ## Reset the tree construction state.
  $self->{insertion_mode} = 'before head';
  $self->{open_elements} = [];
  ## These three members start out as |undef|.
  @$self{qw(form_element head_element inner_html_node)} = ();

  $self->_tree_construction_initial; # MUST
  $self->_tree_construction_root_element;
  $self->_tree_construction_main;
} # _construct_tree
1812
## The "initial" step of tree construction.
##
## Consumes tokens (via the file-scoped |$token|) until something other
## than leading white space or a comment is seen:
##
##   - DOCTYPE: a DocumentType node is appended to the document and the
##     document's compat mode is chosen from the DOCTYPE name, public
##     identifier and system identifier; the next token is then fetched.
##   - start tag / end tag / end-of-file / non-space characters: a
##     'no DOCTYPE' parse error is reported, the document is put into
##     quirks mode, and the current token is left for reprocessing.
##   - comment: a Comment node is appended to the document.
##
## Dies on a token type it does not know.
sub _tree_construction_initial ($) {
  my $self = shift;
  INITIAL: {
    if ($token->{type} eq 'DOCTYPE') {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!parse-error (type => 'not HTML5');
      } elsif ($doctype_name ne 'HTML') {
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5');
      }

      my $doctype = $self->{document}->create_document_type_definition
        ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if (not $token->{correct} or $doctype_name ne 'HTML') {
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## The public identifier is matched case-insensitively: it is
        ## upper-cased here, so every literal it is compared with below
        ## MUST be written in upper case as well.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        if ({
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
          "-//IETF//DTD HTML 2.0//EN" => 1,
          "-//IETF//DTD HTML 2.1E//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN//" => 1,
          "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
          "-//IETF//DTD HTML 3.2//EN" => 1,
          "-//IETF//DTD HTML 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT//EN" => 1,
          "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
          "-//IETF//DTD HTML//EN" => 1,
          "-//IETF//DTD HTML//EN//2.0" => 1,
          "-//IETF//DTD HTML//EN//3.0" => 1,
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
          "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
          "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
          "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
          "-//W3C//DTD HTML 3.2//EN" => 1,
          "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
          "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
          "-//W3C//DTD W3 HTML//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
          "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
          "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
          "HTML" => 1,
        }->{$pubid}) {
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
          ## HTML 4.01 Frameset/Transitional: quirks mode when the
          ## system identifier is missing, limited quirks mode when it
          ## is present.
          if (defined $token->{system_identifier}) {
            $self->{document}->manakai_compat_mode ('limited quirks');
          } else {
            $self->{document}->manakai_compat_mode ('quirks');
          }
        } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
          $self->{document}->manakai_compat_mode ('limited quirks');
        }
      }
      if (defined $token->{system_identifier}) {
        ## The system identifier is matched case-insensitively too, but
        ## in lower case.
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          $self->{document}->manakai_compat_mode ('quirks');
        }
      }

      ## Go to the root element phase.
      !!!next-token;
      return;
    } elsif ({
              'start tag' => 1,
              'end tag' => 1,
              'end-of-file' => 1,
             }->{$token->{type}}) {
      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} eq 'character') {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          ## Stay in the phase
          !!!next-token;
          redo INITIAL;
        }
      }

      ## Non-space character data remains: there is no DOCTYPE.
      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} eq 'comment') {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the phase.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token";
    }
  } # INITIAL
} # _tree_construction_initial
1980
## The "root element" step of tree construction.
##
## DOCTYPE tokens are reported as parse errors and dropped, comments
## are appended to the document, and leading white space in character
## tokens is stripped.  As soon as any other content is seen (a tag,
## end-of-file, or non-space characters) an |html| element is created,
## appended to the document and pushed onto the stack of open
## elements; the current token is left for reprocessing.
##
## Dies on a token type it does not know.
sub _tree_construction_root_element ($) {
  my ($self) = @_;

  TOKEN: while (1) {
    my $type = $token->{type};

    if ($type eq 'DOCTYPE') {
      !!!parse-error (type => 'in html:#DOCTYPE');
      ## Ignore the token and stay in the phase
      !!!next-token;
      next TOKEN;
    }

    if ($type eq 'comment') {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the phase
      !!!next-token;
      next TOKEN;
    }

    if ($type eq 'character') {
      my $stripped
          = ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//); # \x0D
      if ($stripped and not length $token->{data}) {
        ## The token was white space only: ignore it and stay in the
        ## phase
        !!!next-token;
        next TOKEN;
      }
      last TOKEN;
    }

    ## ISSUE: There is an issue in the spec
    last TOKEN if $type eq 'start tag'
        or $type eq 'end tag'
        or $type eq 'end-of-file';

    die "$0: $token->{type}: Unknown token";
  } # TOKEN

  my $root_element; !!!create-element ($root_element, 'html');
  $self->{document}->append_child ($root_element);
  push @{$self->{open_elements}}, [$root_element, 'html'];
  ## reprocess
  return; ## Go to the main phase.
} # _tree_construction_root_element
2026
## Reset the insertion mode appropriately.
##
## Walks the stack of open elements ($self->{open_elements}, a list of
## [node, local name] pairs) from the current node upwards and sets
## $self->{insertion_mode} from the first element whose name selects a
## mode.  Reads $self->{inner_html_node} (the context element, if any)
## and $self->{head_element}.  Returns nothing.
sub _reset_insertion_mode ($) {
  my ($self) = @_;

  ## Element name -> insertion mode (steps 4..13).
  my %mode_for = (
    select => 'in select',
    td => 'in cell',
    th => 'in cell',
    tr => 'in row',
    tbody => 'in table body',
    thead => 'in table head',
    tfoot => 'in table foot',
    caption => 'in caption',
    colgroup => 'in column group',
    table => 'in table',
    head => 'in body', # not in head!
    body => 'in body',
    frameset => 'in frameset',
  );

  ## Step 1
  my $is_last;

  ## Step 2
  my $depth = -1;
  my $node = $self->{open_elements}->[$depth];

  while (1) {
    ## Step 3
    ## ISSUE: Oops! "If node is the first node in the stack of open
    ## elements, then set last to true. If the context element of the
    ## HTML fragment parsing algorithm is neither a td element nor a
    ## th element, then set node to the context element. (fragment case)":
    ## The second "if" is in the scope of the first "if"!?
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $is_last = 1;
      if (defined $self->{inner_html_node}) {
        my $context_name = $self->{inner_html_node}->[1];
        $node = $self->{inner_html_node}
            unless $context_name eq 'td' or $context_name eq 'th';
      }
    }

    ## Step 4..13
    my $mode = $mode_for{$node->[1]};
    if (defined $mode) {
      $self->{insertion_mode} = $mode;
      return;
    }

    ## Step 14
    if ($node->[1] eq 'html') {
      $self->{insertion_mode}
          = defined $self->{head_element} ? 'after head' : 'before head';
      return;
    }

    ## Step 15
    if ($is_last) {
      $self->{insertion_mode} = 'in body';
      return;
    }

    ## Step 16
    $depth--;
    $node = $self->{open_elements}->[$depth];

    ## Step 17: loop back to step 3
  }
} # _reset_insertion_mode
2095
2096 sub _tree_construction_main ($) {
2097 my $self = shift;
2098
2099 my $previous_insertion_mode;
2100
2101 my $active_formatting_elements = [];
2102
## Reconstruct the active formatting elements.
##
## Takes one argument, a code reference that is called with each newly
## cloned node to insert it into the tree.  Entries at the end of the
## list of active formatting elements that are not on the stack of
## open elements are shallow-cloned, inserted, pushed onto the stack
## and substituted for the old entry in the list.
my $reconstruct_active_formatting_elements = sub { # MUST
  my ($insert) = @_;

  ## True if the given node is on the stack of open elements.
  my $is_open = sub {
    my $candidate = shift;
    for my $open (@{$self->{open_elements}}) {
      return 1 if $candidate eq $open->[0];
    }
    return 0;
  };

  ## Step 1
  return unless @$active_formatting_elements;

  ## Step 3
  ## NOTE: |$i| is a negative index counted from the end of the list.
  my $i = -1;
  my $entry = $active_formatting_elements->[$i];

  ## Step 2
  return if $entry->[0] eq '#marker';
  return if $is_open->($entry->[0]);

  ## Step 4: stop rewinding once the first entry has been reached
  until ($active_formatting_elements->[0]->[0] eq $entry->[0]) {
    ## Step 5
    $i--;
    $entry = $active_formatting_elements->[$i];

    ## Step 6: keep rewinding unless this entry is a marker or open
    if ($entry->[0] eq '#marker' or $is_open->($entry->[0])) {
      ## Step 7
      $i++;
      $entry = $active_formatting_elements->[$i];
      last;
    }
  }

  while (1) {
    ## Step 8
    my $clone = [$entry->[0]->clone_node (0), $entry->[1]];

    ## Step 9
    $insert->($clone->[0]);
    push @{$self->{open_elements}}, $clone;

    ## Step 10
    $active_formatting_elements->[$i] = $self->{open_elements}->[-1];

    ## Step 11
    last if $clone->[0] eq $active_formatting_elements->[-1]->[0];

    ## Step 7'
    $i++;
    $entry = $active_formatting_elements->[$i];
  }
}; # $reconstruct_active_formatting_elements
2173
## Clear the list of active formatting elements up to the last marker:
## the last '#marker' entry and every entry after it are removed.  The
## list is left untouched when it contains no marker.
my $clear_up_to_marker = sub {
  my $index = $#$active_formatting_elements;
  while ($index >= 0) {
    if ($active_formatting_elements->[$index]->[0] eq '#marker') {
      splice @$active_formatting_elements, $index;
      return;
    }
    $index--;
  }
}; # $clear_up_to_marker
2182
## Generic CDATA/RCDATA element parsing.
##
## Arguments: the content model flag to tokenize the element content
## with ('CDATA' or 'RCDATA'), and a code reference that inserts the
## new element into the tree.  An element is created for the current
## start tag token, all following character tokens are collected into
## a single Text node child, and the token after them is consumed
## (with a parse error unless it is the matching end tag).
my $parse_rcdata = sub ($$) {
  my $content_model_flag = shift;
  my $insert = shift;

  ## Step 1
  my $start_tag_name = $token->{tag_name};
  my $el;
  !!!create-element ($el, $start_tag_name, $token->{attributes});

  ## Step 2
  $insert->($el); # /context node/->append_child ($el)

  ## Step 3
  $self->{content_model_flag} = $content_model_flag; # CDATA or RCDATA
  delete $self->{escape}; # MUST

  ## Step 4
  my $data = '';
  !!!next-token;
  while (1) { # or until stop tokenizing
    last unless $token->{type} eq 'character';
    $data .= $token->{data};
    !!!next-token;
  }

  ## Step 5
  if (length $data) {
    my $text_node = $self->{document}->create_text_node ($data);
    $el->append_child ($text_node);
  }

  ## Step 6
  $self->{content_model_flag} = 'PCDATA';

  ## Step 7
  ## The matching end tag is silently ignored; anything else is an
  ## error.
  unless ($token->{type} eq 'end tag' and
          $token->{tag_name} eq $start_tag_name) {
    !!!parse-error (type => 'in '.$content_model_flag.':#'.$token->{type});
  }
  !!!next-token;
}; # $parse_rcdata
2223
## Handling of a |script| start tag.
##
## Takes a code reference that inserts the new element into the tree.
## A |script| element is created for the current token, the following
## character tokens are tokenized as CDATA and appended to it as text,
## and the token after them is consumed (with a parse error unless it
## is a |script| end tag).  The element is only inserted when no
## innerHTML context node ($self->{inner_html_node}) is set.
my $script_start_tag = sub ($) {
  my ($insert) = @_;
  my $script_el;
  !!!create-element ($script_el, 'script', $token->{attributes});
  ## TODO: mark as "parser-inserted"

  $self->{content_model_flag} = 'CDATA';
  delete $self->{escape}; # MUST

  my $source = '';
  !!!next-token;
  while (1) {
    ## stop if non-character token or tokenizer stops tokenising
    last unless $token->{type} eq 'character';
    $source .= $token->{data};
    !!!next-token;
  }
  $script_el->manakai_append_text ($source) if length $source;

  $self->{content_model_flag} = 'PCDATA';

  ## A |script| end tag is silently ignored; anything else is an error.
  unless ($token->{type} eq 'end tag' and
          $token->{tag_name} eq 'script') {
    !!!parse-error (type => 'in CDATA:#'.$token->{type});
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  }

  unless (defined $self->{inner_html_node}) {
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    $insert->($script_el);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  } ## TODO: otherwise (innerHTML case), mark as "already executed"

  !!!next-token;
}; # $script_start_tag
2269
## End tag handling for formatting elements (the "adoption agency"
## steps).
##
## Takes the tag name of the end tag being processed.  Works on the
## list of active formatting elements and on the stack of open
## elements ($self->{open_elements}); entries of both are
## [node, local name] pairs.  The whole procedure is repeated (step 14)
## until no matching formatting element is left, at which point an
## 'unmatched end tag' parse error is reported, the next token is
## fetched and the closure returns.
my $formatting_end_tag = sub {
  my $tag_name = shift;

  FET: {
    ## Step 1
    ## Find the last entry with this tag name after the last marker.
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!parse-error (type => 'unmatched end tag:'.$tag_name);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ({
                table => 1, caption => 1, td => 1, th => 1,
                button => 1, marquee => 1, object => 1, html => 1,
               }->{$node->[1]}) {
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
      ## The formatting element is not on the stack of open elements:
      ## remove that very entry from the list.  It is not necessarily
      ## the last entry, so |pop| would remove the wrong one.
      splice @$active_formatting_elements,
          $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
    }

    ## Step 2
    ## The loop runs from the current node upwards and overwrites its
    ## result on every hit, so it ends with the topmost qualifying
    ## node below the formatting element.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not $formatting_category->{$node->[1]} and
          #not $phrasing_category->{$node->[1]} and
          ($special_category->{$node->[1]} or
           $scoping_category->{$node->[1]})) {
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      ## No furthest block: pop the formatting element and everything
      ## below it off the stack and drop it from the list.
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## The bookmark is remembered as "the node of the entry before
    ## which the formatting element sits".
    ## NOTE(review): when the formatting element is the first entry,
    ## the index below is -1 and wraps around to the last entry; this
    ## looks unintended -- confirm.
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
            ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        ## Not an active formatting element: drop the node from the
        ## stack and go on with the node above it.
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      if ($node->[0]->has_child_nodes ()) {
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    $common_ancestor_node->[0]->append_child ($last_node->[0]);

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## Copy the child list first, since appending moves the children.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## Remove the formatting element from the list and insert the
    ## clone right after the bookmarked entry.  |$i| is adjusted when
    ## the removal shifts the bookmarked entry down by one.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$active_formatting_elements, $_, 1;
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    ## Remove the formatting element from the stack of open elements
    ## and insert the clone immediately below the furthest block.
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        splice @{$self->{open_elements}}, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        $i = $_;
      }
    } # OE
    ## The clone is inserted (length 0), not substituted for an
    ## existing entry: elements below the furthest block MUST stay on
    ## the stack.
    splice @{$self->{open_elements}}, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
2452
## Insertion callback: appends the given node to the current node,
## i.e. the node of the last entry on the stack of open elements.
my $insert_to_current = sub {
  my $current_node = $self->{open_elements}->[-1]->[0];
  return $current_node->append_child ($_[0]);
}; # $insert_to_current
2456
## Insertion callback with foster parenting.
##
## When the current node is a |table|, |tbody|, |tfoot|, |thead| or
## |tr| element, the child is not appended to it.  Instead the last
## |table| on the stack of open elements is looked up: if it has a
## parent node of node type 1, the child is inserted into that parent
## just before the table; otherwise it is appended to the element
## preceding the table on the stack.  If there is no |table| on the
## stack, the first element of the stack is used.  In every other case
## the child is simply appended to the current node.
my $insert_to_foster = sub {
  my ($child) = @_;

  my $current_name = $self->{open_elements}->[-1]->[1];
  unless ({
           table => 1, tbody => 1, tfoot => 1,
           thead => 1, tr => 1,
          }->{$current_name}) {
    return $self->{open_elements}->[-1]->[0]->append_child ($child);
  }

  # MUST
  my ($foster_parent_element, $next_sibling);
  for my $index (reverse 0..$#{$self->{open_elements}}) {
    my $entry = $self->{open_elements}->[$index];
    next unless $entry->[1] eq 'table';

    my $parent = $entry->[0]->parent_node;
    if (defined $parent and $parent->node_type == 1) {
      $foster_parent_element = $parent;
      $next_sibling = $entry->[0];
    } else {
      $foster_parent_element = $self->{open_elements}->[$index - 1]->[0];
    }
    last;
  }
  $foster_parent_element = $self->{open_elements}->[0]->[0]
      unless defined $foster_parent_element;

  ## |$next_sibling| is undefined unless the table's parent is used.
  return $foster_parent_element->insert_before ($child, $next_sibling);
}; # $insert_to_foster
2487
2488 my $in_body = sub {
2489 my $insert = shift;
2490 if ($token->{type} eq 'start tag') {
2491 if ($token->{tag_name} eq 'script') {
2492 ## NOTE: This is an "as if in head" code clone
2493 $script_start_tag->($insert);
2494 return;
2495 } elsif ($token->{tag_name} eq 'style') {
2496 ## NOTE: This is an "as if in head" code clone
2497 $parse_rcdata->('CDATA', $insert);
2498 return;
2499 } elsif ({
2500 base => 1, link => 1,
2501 }->{$token->{tag_name}}) {
2502 ## NOTE: This is an "as if in head" code clone, only "-t" differs
2503 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2504 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2505 !!!next-token;
2506 return;
2507 } elsif ($token->{tag_name} eq 'meta') {
2508 ## NOTE: This is an "as if in head" code clone, only "-t" differs
2509 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2510 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2511
2512 unless ($self->{confident}) {
2513 my $charset;
2514 if ($token->{attributes}->{charset}) { ## TODO: And if supported
2515 $charset = $token->{attributes}->{charset}->{value};
2516 }
2517 if ($token->{attributes}->{'http-equiv'}) {
2518 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
2519 if ($token->{attributes}->{'http-equiv'}->{value}
2520 =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
2521 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
2522 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
2523 $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
2524 } ## TODO: And if supported
2525 }
2526 ## TODO: Change the encoding
2527 }
2528
2529 !!!next-token;
2530 return;
2531 } elsif ($token->{tag_name} eq 'title') {
2532 !!!parse-error (type => 'in body:title');
2533 ## NOTE: This is an "as if in head" code clone
2534 $parse_rcdata->('RCDATA', sub {
2535 if (defined $self->{head_element}) {
2536 $self->{head_element}->append_child ($_[0]);
2537 } else {
2538 $insert->($_[0]);
2539 }
2540 });
2541 return;
2542 } elsif ($token->{tag_name} eq 'body') {
2543 !!!parse-error (type => 'in body:body');
2544
2545 if (@{$self->{open_elements}} == 1 or
2546 $self->{open_elements}->[1]->[1] ne 'body') {
2547 ## Ignore the token
2548 } else {
2549 my $body_el = $self->{open_elements}->[1]->[0];
2550 for my $attr_name (keys %{$token->{attributes}}) {
2551 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
2552 $body_el->set_attribute_ns
2553 (undef, [undef, $attr_name],
2554 $token->{attributes}->{$attr_name}->{value});
2555 }
2556 }
2557 }
2558 !!!next-token;
2559 return;
2560 } elsif ({
2561 address => 1, blockquote => 1, center => 1, dir => 1,
2562 div => 1, dl => 1, fieldset => 1, listing => 1,
2563 menu => 1, ol => 1, p => 1, ul => 1,
2564 pre => 1,
2565 }->{$token->{tag_name}}) {
2566 ## has a p element in scope
2567 INSCOPE: for (reverse @{$self->{open_elements}}) {
2568 if ($_->[1] eq 'p') {
2569 !!!back-token;
2570 $token = {type => 'end tag', tag_name => 'p'};
2571 return;
2572 } elsif ({
2573 table => 1, caption => 1, td => 1, th => 1,
2574 button => 1, marquee => 1, object => 1, html => 1,
2575 }->{$_->[1]}) {
2576 last INSCOPE;
2577 }
2578 } # INSCOPE
2579
2580 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2581 if ($token->{tag_name} eq 'pre') {
2582 !!!next-token;
2583 if ($token->{type} eq 'character') {
2584 $token->{data} =~ s/^\x0A//;
2585 unless (length $token->{data}) {
2586 !!!next-token;
2587 }
2588 }
2589 } else {
2590 !!!next-token;
2591 }
2592 return;
2593 } elsif ($token->{tag_name} eq 'form') {
2594 if (defined $self->{form_element}) {
2595 !!!parse-error (type => 'in form:form');
2596 ## Ignore the token
2597 !!!next-token;
2598 return;
2599 } else {
2600 ## has a p element in scope
2601 INSCOPE: for (reverse @{$self->{open_elements}}) {
2602 if ($_->[1] eq 'p') {
2603 !!!back-token;
2604 $token = {type => 'end tag', tag_name => 'p'};
2605 return;
2606 } elsif ({
2607 table => 1, caption => 1, td => 1, th => 1,
2608 button => 1, marquee => 1, object => 1, html => 1,
2609 }->{$_->[1]}) {
2610 last INSCOPE;
2611 }
2612 } # INSCOPE
2613
2614 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2615 $self->{form_element} = $self->{open_elements}->[-1]->[0];
2616 !!!next-token;
2617 return;
2618 }
2619 } elsif ($token->{tag_name} eq 'li') {
2620 ## has a p element in scope
2621 INSCOPE: for (reverse @{$self->{open_elements}}) {
2622 if ($_->[1] eq 'p') {
2623 !!!back-token;
2624 $token = {type => 'end tag', tag_name => 'p'};
2625 return;
2626 } elsif ({
2627 table => 1, caption => 1, td => 1, th => 1,
2628 button => 1, marquee => 1, object => 1, html => 1,
2629 }->{$_->[1]}) {
2630 last INSCOPE;
2631 }
2632 } # INSCOPE
2633
2634 ## Step 1
2635 my $i = -1;
2636 my $node = $self->{open_elements}->[$i];
2637 LI: {
2638 ## Step 2
2639 if ($node->[1] eq 'li') {
2640 if ($i != -1) {
2641 !!!parse-error (type => 'end tag missing:'.
2642 $self->{open_elements}->[-1]->[1]);
2643 }
2644 splice @{$self->{open_elements}}, $i;
2645 last LI;
2646 }
2647
2648 ## Step 3
2649 if (not $formatting_category->{$node->[1]} and
2650 #not $phrasing_category->{$node->[1]} and
2651 ($special_category->{$node->[1]} or
2652 $scoping_category->{$node->[1]}) and
2653 $node->[1] ne 'address' and $node->[1] ne 'div') {
2654 last LI;
2655 }
2656
2657 ## Step 4
2658 $i--;
2659 $node = $self->{open_elements}->[$i];
2660 redo LI;
2661 } # LI
2662
2663 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2664 !!!next-token;
2665 return;
2666 } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
2667 ## has a p element in scope
2668 INSCOPE: for (reverse @{$self->{open_elements}}) {
2669 if ($_->[1] eq 'p') {
2670 !!!back-token;
2671 $token = {type => 'end tag', tag_name => 'p'};
2672 return;
2673 } elsif ({
2674 table => 1, caption => 1, td => 1, th => 1,
2675 button => 1, marquee => 1, object => 1, html => 1,
2676 }->{$_->[1]}) {
2677 last INSCOPE;
2678 }
2679 } # INSCOPE
2680
2681 ## Step 1
2682 my $i = -1;
2683 my $node = $self->{open_elements}->[$i];
2684 LI: {
2685 ## Step 2
2686 if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
2687 if ($i != -1) {
2688 !!!parse-error (type => 'end tag missing:'.
2689 $self->{open_elements}->[-1]->[1]);
2690 }
2691 splice @{$self->{open_elements}}, $i;
2692 last LI;
2693 }
2694
2695 ## Step 3
2696 if (not $formatting_category->{$node->[1]} and
2697 #not $phrasing_category->{$node->[1]} and
2698 ($special_category->{$node->[1]} or
2699 $scoping_category->{$node->[1]}) and
2700 $node->[1] ne 'address' and $node->[1] ne 'div') {
2701 last LI;
2702 }
2703
2704 ## Step 4
2705 $i--;
2706 $node = $self->{open_elements}->[$i];
2707 redo LI;
2708 } # LI
2709
2710 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2711 !!!next-token;
2712 return;
2713 } elsif ($token->{tag_name} eq 'plaintext') {
2714 ## has a p element in scope
2715 INSCOPE: for (reverse @{$self->{open_elements}}) {
2716 if ($_->[1] eq 'p') {
2717 !!!back-token;
2718 $token = {type => 'end tag', tag_name => 'p'};
2719 return;
2720 } elsif ({
2721 table => 1, caption => 1, td => 1, th => 1,
2722 button => 1, marquee => 1, object => 1, html => 1,
2723 }->{$_->[1]}) {
2724 last INSCOPE;
2725 }
2726 } # INSCOPE
2727
2728 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2729
2730 $self->{content_model_flag} = 'PLAINTEXT';
2731
2732 !!!next-token;
2733 return;
2734 } elsif ({
2735 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
2736 }->{$token->{tag_name}}) {
2737 ## has a p element in scope
2738 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2739 my $node = $self->{open_elements}->[$_];
2740 if ($node->[1] eq 'p') {
2741 !!!back-token;
2742 $token = {type => 'end tag', tag_name => 'p'};
2743 return;
2744 } elsif ({
2745 table => 1, caption => 1, td => 1, th => 1,
2746 button => 1, marquee => 1, object => 1, html => 1,
2747 }->{$node->[1]}) {
2748 last INSCOPE;
2749 }
2750 } # INSCOPE
2751
2752 ## NOTE: See <http://html5.org/tools/web-apps-tracker?from=925&to=926>
2753 ## has an element in scope
2754 #my $i;
2755 #INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2756 # my $node = $self->{open_elements}->[$_];
2757 # if ({
2758 # h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
2759 # }->{$node->[1]}) {
2760 # $i = $_;
2761 # last INSCOPE;
2762 # } elsif ({
2763 # table => 1, caption => 1, td => 1, th => 1,
2764 # button => 1, marquee => 1, object => 1, html => 1,
2765 # }->{$node->[1]}) {
2766 # last INSCOPE;
2767 # }
2768 #} # INSCOPE
2769 #
2770 #if (defined $i) {
2771 # !!! parse-error (type => 'in hn:hn');
2772 # splice @{$self->{open_elements}}, $i;
2773 #}
2774
2775 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2776
2777 !!!next-token;
2778 return;
2779 } elsif ($token->{tag_name} eq 'a') {
2780 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
2781 my $node = $active_formatting_elements->[$i];
2782 if ($node->[1] eq 'a') {
2783 !!!parse-error (type => 'in a:a');
2784
2785 !!!back-token;
2786 $token = {type => 'end tag', tag_name => 'a'};
2787 $formatting_end_tag->($token->{tag_name});
2788
2789 AFE2: for (reverse 0..$#$active_formatting_elements) {
2790 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
2791 splice @$active_formatting_elements, $_, 1;
2792 last AFE2;
2793 }
2794 } # AFE2
2795 OE: for (reverse 0..$#{$self->{open_elements}}) {
2796 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
2797 splice @{$self->{open_elements}}, $_, 1;
2798 last OE;
2799 }
2800 } # OE
2801 last AFE;
2802 } elsif ($node->[0] eq '#marker') {
2803 last AFE;
2804 }
2805 } # AFE
2806
2807 $reconstruct_active_formatting_elements->($insert_to_current);
2808
2809 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2810 push @$active_formatting_elements, $self->{open_elements}->[-1];
2811
2812 !!!next-token;
2813 return;
2814 } elsif ({
2815 b => 1, big => 1, em => 1, font => 1, i => 1,
2816 s => 1, small => 1, strile => 1,
2817 strong => 1, tt => 1, u => 1,
2818 }->{$token->{tag_name}}) {
2819 $reconstruct_active_formatting_elements->($insert_to_current);
2820
2821 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2822 push @$active_formatting_elements, $self->{open_elements}->[-1];
2823
2824 !!!next-token;
2825 return;
2826 } elsif ($token->{tag_name} eq 'nobr') {
2827 $reconstruct_active_formatting_elements->($insert_to_current);
2828
2829 ## has a |nobr| element in scope
2830 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2831 my $node = $self->{open_elements}->[$_];
2832 if ($node->[1] eq 'nobr') {
2833 !!!parse-error (type => 'not closed:nobr');
2834 !!!back-token;
2835 $token = {type => 'end tag', tag_name => 'nobr'};
2836 return;
2837 } elsif ({
2838 table => 1, caption => 1, td => 1, th => 1,
2839 button => 1, marquee => 1, object => 1, html => 1,
2840 }->{$node->[1]}) {
2841 last INSCOPE;
2842 }
2843 } # INSCOPE
2844
2845 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2846 push @$active_formatting_elements, $self->{open_elements}->[-1];
2847
2848 !!!next-token;
2849 return;
2850 } elsif ($token->{tag_name} eq 'button') {
2851 ## has a button element in scope
2852 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2853 my $node = $self->{open_elements}->[$_];
2854 if ($node->[1] eq 'button') {
2855 !!!parse-error (type => 'in button:button');
2856 !!!back-token;
2857 $token = {type => 'end tag', tag_name => 'button'};
2858 return;
2859 } elsif ({
2860 table => 1, caption => 1, td => 1, th => 1,
2861 button => 1, marquee => 1, object => 1, html => 1,
2862 }->{$node->[1]}) {
2863 last INSCOPE;
2864 }
2865 } # INSCOPE
2866
2867 $reconstruct_active_formatting_elements->($insert_to_current);
2868
2869 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2870 push @$active_formatting_elements, ['#marker', ''];
2871
2872 !!!next-token;
2873 return;
2874 } elsif ($token->{tag_name} eq 'marquee' or
2875 $token->{tag_name} eq 'object') {
2876 $reconstruct_active_formatting_elements->($insert_to_current);
2877
2878 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2879 push @$active_formatting_elements, ['#marker', ''];
2880
2881 !!!next-token;
2882 return;
2883 } elsif ($token->{tag_name} eq 'xmp') {
2884 $reconstruct_active_formatting_elements->($insert_to_current);
2885 $parse_rcdata->('CDATA', $insert);
2886 return;
2887 } elsif ($token->{tag_name} eq 'table') {
2888 ## has a p element in scope
2889 INSCOPE: for (reverse @{$self->{open_elements}}) {
2890 if ($_->[1] eq 'p') {
2891 !!!back-token;
2892 $token = {type => 'end tag', tag_name => 'p'};
2893 return;
2894 } elsif ({
2895 table => 1, caption => 1, td => 1, th => 1,
2896 button => 1, marquee => 1, object => 1, html => 1,
2897 }->{$_->[1]}) {
2898 last INSCOPE;
2899 }
2900 } # INSCOPE
2901
2902 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2903
2904 $self->{insertion_mode} = 'in table';
2905
2906 !!!next-token;
2907 return;
2908 } elsif ({
2909 area => 1, basefont => 1, bgsound => 1, br => 1,
2910 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
2911 image => 1,
2912 }->{$token->{tag_name}}) {
2913 if ($token->{tag_name} eq 'image') {
2914 !!!parse-error (type => 'image');
2915 $token->{tag_name} = 'img';
2916 }
2917
2918 ## NOTE: There is an "as if <br>" code clone.
2919 $reconstruct_active_formatting_elements->($insert_to_current);
2920
2921 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2922 pop @{$self->{open_elements}};
2923
2924 !!!next-token;
2925 return;
2926 } elsif ($token->{tag_name} eq 'hr') {
2927 ## has a p element in scope
2928 INSCOPE: for (reverse @{$self->{open_elements}}) {
2929 if ($_->[1] eq 'p') {
2930 !!!back-token;
2931 $token = {type => 'end tag', tag_name => 'p'};
2932 return;
2933 } elsif ({
2934 table => 1, caption => 1, td => 1, th => 1,
2935 button => 1, marquee => 1, object => 1, html => 1,
2936 }->{$_->[1]}) {
2937 last INSCOPE;
2938 }
2939 } # INSCOPE
2940
2941 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2942 pop @{$self->{open_elements}};
2943
2944 !!!next-token;
2945 return;
2946 } elsif ($token->{tag_name} eq 'input') {
2947 $reconstruct_active_formatting_elements->($insert_to_current);
2948
2949 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2950 ## TODO: associate with $self->{form_element} if defined
2951 pop @{$self->{open_elements}};
2952
2953 !!!next-token;
2954 return;
2955 } elsif ($token->{tag_name} eq 'isindex') {
2956 !!!parse-error (type => 'isindex');
2957
2958 if (defined $self->{form_element}) {
2959 ## Ignore the token
2960 !!!next-token;
2961 return;
2962 } else {
2963 my $at = $token->{attributes};
2964 my $form_attrs;
2965 $form_attrs->{action} = $at->{action} if $at->{action};
2966 my $prompt_attr = $at->{prompt};
2967 $at->{name} = {name => 'name', value => 'isindex'};
2968 delete $at->{action};
2969 delete $at->{prompt};
2970 my @tokens = (
2971 {type => 'start tag', tag_name => 'form',
2972 attributes => $form_attrs},
2973 {type => 'start tag', tag_name => 'hr'},
2974 {type => 'start tag', tag_name => 'p'},
2975 {type => 'start tag', tag_name => 'label'},
2976 );
2977 if ($prompt_attr) {
2978 push @tokens, {type => 'character', data => $prompt_attr->{value}};
2979 } else {
2980 push @tokens, {type => 'character',
2981 data => 'This is a searchable index. Insert your search keywords here: '}; # SHOULD
2982 ## TODO: make this configurable
2983 }
2984 push @tokens,
2985 {type => 'start tag', tag_name => 'input', attributes => $at},
2986 #{type => 'character', data => ''}, # SHOULD
2987 {type => 'end tag', tag_name => 'label'},
2988 {type => 'end tag', tag_name => 'p'},
2989 {type => 'start tag', tag_name => 'hr'},
2990 {type => 'end tag', tag_name => 'form'};
2991 $token = shift @tokens;
2992 !!!back-token (@tokens);
2993 return;
2994 }
2995 } elsif ($token->{tag_name} eq 'textarea') {
2996 my $tag_name = $token->{tag_name};
2997 my $el;
2998 !!!create-element ($el, $token->{tag_name}, $token->{attributes});
2999
3000 ## TODO: $self->{form_element} if defined
3001 $self->{content_model_flag} = 'RCDATA';
3002 delete $self->{escape}; # MUST
3003
3004 $insert->($el);
3005
3006 my $text = '';
3007 !!!next-token;
3008 if ($token->{type} eq 'character') {
3009 $token->{data} =~ s/^\x0A//;
3010 unless (length $token->{data}) {
3011 !!!next-token;
3012 }
3013 }
3014 while ($token->{type} eq 'character') {
3015 $text .= $token->{data};
3016 !!!next-token;
3017 }
3018 if (length $text) {
3019 $el->manakai_append_text ($text);
3020 }
3021
3022 $self->{content_model_flag} = 'PCDATA';
3023
3024 if ($token->{type} eq 'end tag' and
3025 $token->{tag_name} eq $tag_name) {
3026 ## Ignore the token
3027 } else {
3028 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
3029 }
3030 !!!next-token;
3031 return;
3032 } elsif ({
3033 iframe => 1,
3034 noembed => 1,
3035 noframes => 1,
3036 noscript => 0, ## TODO: 1 if scripting is enabled
3037 }->{$token->{tag_name}}) {
3038 $parse_rcdata->('CDATA', $insert);
3039 return;
3040 } elsif ($token->{tag_name} eq 'select') {
3041 $reconstruct_active_formatting_elements->($insert_to_current);
3042
3043 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
3044
3045 $self->{insertion_mode} = 'in select';
3046 !!!next-token;
3047 return;
3048 } elsif ({
3049 caption => 1, col => 1, colgroup => 1, frame => 1,
3050 frameset => 1, head => 1, option => 1, optgroup => 1,
3051 tbody => 1, td => 1, tfoot => 1, th => 1,
3052 thead => 1, tr => 1,
3053 }->{$token->{tag_name}}) {
3054 !!!parse-error (type => 'in body:'.$token->{tag_name});
3055 ## Ignore the token
3056 !!!next-token;
3057 return;
3058
3059 ## ISSUE: An issue on HTML5 new elements in the spec.
3060 } else {
3061 $reconstruct_active_formatting_elements->($insert_to_current);
3062
3063 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
3064
3065 !!!next-token;
3066 return;
3067 }
3068 } elsif ($token->{type} eq 'end tag') {
3069 if ($token->{tag_name} eq 'body') {
3070 if (@{$self->{open_elements}} > 1 and
3071 $self->{open_elements}->[1]->[1] eq 'body') {
3072 for (@{$self->{open_elements}}) {
3073 unless ({
3074 dd => 1, dt => 1, li => 1, p => 1, td => 1,
3075 th => 1, tr => 1, body => 1, html => 1,
3076 tbody => 1, tfoot => 1, thead => 1,
3077 }->{$_->[1]}) {
3078 !!!parse-error (type => 'not closed:'.$_->[1]);
3079 }
3080 }
3081
3082 $self->{insertion_mode} = 'after body';
3083 !!!next-token;
3084 return;
3085 } else {
3086 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3087 ## Ignore the token
3088 !!!next-token;
3089 return;
3090 }
3091 } elsif ($token->{tag_name} eq 'html') {
3092 if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
3093 ## ISSUE: There is an issue in the spec.
3094 if ($self->{open_elements}->[-1]->[1] ne 'body') {
3095 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[1]->[1]);
3096 }
3097 $self->{insertion_mode} = 'after body';
3098 ## reprocess
3099 return;
3100 } else {
3101 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3102 ## Ignore the token
3103 !!!next-token;
3104 return;
3105 }
3106 } elsif ({
3107 address => 1, blockquote => 1, center => 1, dir => 1,
3108 div => 1, dl => 1, fieldset => 1, listing => 1,
3109 menu => 1, ol => 1, pre => 1, ul => 1,
3110 p => 1,
3111 dd => 1, dt => 1, li => 1,
3112 button => 1, marquee => 1, object => 1,
3113 }->{$token->{tag_name}}) {
3114 ## has an element in scope
3115 my $i;
3116 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3117 my $node = $self->{open_elements}->[$_];
3118 if ($node->[1] eq $token->{tag_name}) {
3119 ## generate implied end tags
3120 if ({
3121 dd => ($token->{tag_name} ne 'dd'),
3122 dt => ($token->{tag_name} ne 'dt'),
3123 li => ($token->{tag_name} ne 'li'),
3124 p => ($token->{tag_name} ne 'p'),
3125 td => 1, th => 1, tr => 1,
3126 tbody => 1, tfoot=> 1, thead => 1,
3127 }->{$self->{open_elements}->[-1]->[1]}) {
3128 !!!back-token;
3129 $token = {type => 'end tag',
3130 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3131 return;
3132 }
3133 $i = $_;
3134 last INSCOPE unless $token->{tag_name} eq 'p';
3135 } elsif ({
3136 table => 1, caption => 1, td => 1, th => 1,
3137 button => 1, marquee => 1, object => 1, html => 1,
3138 }->{$node->[1]}) {
3139 last INSCOPE;
3140 }
3141 } # INSCOPE
3142
3143 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3144 if (defined $i) {
3145 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3146 } else {
3147 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3148 }
3149 }
3150
3151 if (defined $i) {
3152 splice @{$self->{open_elements}}, $i;
3153 } elsif ($token->{tag_name} eq 'p') {
3154 ## As if <p>, then reprocess the current token
3155 my $el;
3156 !!!create-element ($el, 'p');
3157 $insert->($el);
3158 }
3159 $clear_up_to_marker->()
3160 if {
3161 button => 1, marquee => 1, object => 1,
3162 }->{$token->{tag_name}};
3163 !!!next-token;
3164 return;
3165 } elsif ($token->{tag_name} eq 'form') {
3166 ## has an element in scope
3167 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3168 my $node = $self->{open_elements}->[$_];
3169 if ($node->[1] eq $token->{tag_name}) {
3170 ## generate implied end tags
3171 if ({
3172 dd => 1, dt => 1, li => 1, p => 1,
3173 td => 1, th => 1, tr => 1,
3174 tbody => 1, tfoot=> 1, thead => 1,
3175 }->{$self->{open_elements}->[-1]->[1]}) {
3176 !!!back-token;
3177 $token = {type => 'end tag',
3178 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3179 return;
3180 }
3181 last INSCOPE;
3182 } elsif ({
3183 table => 1, caption => 1, td => 1, th => 1,
3184 button => 1, marquee => 1, object => 1, html => 1,
3185 }->{$node->[1]}) {
3186 last INSCOPE;
3187 }
3188 } # INSCOPE
3189
3190 if ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
3191 pop @{$self->{open_elements}};
3192 } else {
3193 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3194 }
3195
3196 undef $self->{form_element};
3197 !!!next-token;
3198 return;
3199 } elsif ({
3200 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
3201 }->{$token->{tag_name}}) {
3202 ## has an element in scope
3203 my $i;
3204 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3205 my $node = $self->{open_elements}->[$_];
3206 if ({
3207 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
3208 }->{$node->[1]}) {
3209 ## generate implied end tags
3210 if ({
3211 dd => 1, dt => 1, li => 1, p => 1,
3212 td => 1, th => 1, tr => 1,
3213 tbody => 1, tfoot=> 1, thead => 1,
3214 }->{$self->{open_elements}->[-1]->[1]}) {
3215 !!!back-token;
3216 $token = {type => 'end tag',
3217 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3218 return;
3219 }
3220 $i = $_;
3221 last INSCOPE;
3222 } elsif ({
3223 table => 1, caption => 1, td => 1, th => 1,
3224 button => 1, marquee => 1, object => 1, html => 1,
3225 }->{$node->[1]}) {
3226 last INSCOPE;
3227 }
3228 } # INSCOPE
3229
3230 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3231 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3232 }
3233
3234 splice @{$self->{open_elements}}, $i if defined $i;
3235 !!!next-token;
3236 return;
3237 } elsif ({
3238 a => 1,
3239 b => 1, big => 1, em => 1, font => 1, i => 1,
3240 nobr => 1, s => 1, small => 1, strile => 1,
3241 strong => 1, tt => 1, u => 1,
3242 }->{$token->{tag_name}}) {
3243 $formatting_end_tag->($token->{tag_name});
3244 return;
3245 } elsif ($token->{tag_name} eq 'br') {
3246 !!!parse-error (type => 'unmatched end tag:br');
3247
3248 ## As if <br>
3249 $reconstruct_active_formatting_elements->($insert_to_current);
3250
3251 my $el;
3252 !!!create-element ($el, 'br');
3253 $insert->($el);
3254
3255 ## Ignore the token.
3256 !!!next-token;
3257 return;
3258 } elsif ({
3259 caption => 1, col => 1, colgroup => 1, frame => 1,
3260 frameset => 1, head => 1, option => 1, optgroup => 1,
3261 tbody => 1, td => 1, tfoot => 1, th => 1,
3262 thead => 1, tr => 1,
3263 area => 1, basefont => 1, bgsound => 1,
3264 embed => 1, hr => 1, iframe => 1, image => 1,
3265 img => 1, input => 1, isindex => 1, noembed => 1,
3266 noframes => 1, param => 1, select => 1, spacer => 1,
3267 table => 1, textarea => 1, wbr => 1,
3268 noscript => 0, ## TODO: if scripting is enabled
3269 }->{$token->{tag_name}}) {
3270 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3271 ## Ignore the token
3272 !!!next-token;
3273 return;
3274
3275 ## ISSUE: Issue on HTML5 new elements in spec
3276
3277 } else {
3278 ## Step 1
3279 my $node_i = -1;
3280 my $node = $self->{open_elements}->[$node_i];
3281
3282 ## Step 2
3283 S2: {
3284 if ($node->[1] eq $token->{tag_name}) {
3285 ## Step 1
3286 ## generate implied end tags
3287 if ({
3288 dd => 1, dt => 1, li => 1, p => 1,
3289 td => 1, th => 1, tr => 1,
3290 tbody => 1, tfoot=> 1, thead => 1,
3291 }->{$self->{open_elements}->[-1]->[1]}) {
3292 !!!back-token;
3293 $token = {type => 'end tag',
3294 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3295 return;
3296 }
3297
3298 ## Step 2
3299 if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
3300 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3301 }
3302
3303 ## Step 3
3304 splice @{$self->{open_elements}}, $node_i;
3305
3306 !!!next-token;
3307 last S2;
3308 } else {
3309 ## Step 3
3310 if (not $formatting_category->{$node->[1]} and
3311 #not $phrasing_category->{$node->[1]} and
3312 ($special_category->{$node->[1]} or
3313 $scoping_category->{$node->[1]})) {
3314 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3315 ## Ignore the token
3316 !!!next-token;
3317 last S2;
3318 }
3319 }
3320
3321 ## Step 4
3322 $node_i--;
3323 $node = $self->{open_elements}->[$node_i];
3324
3325 ## Step 5;
3326 redo S2;
3327 } # S2
3328 return;
3329 }
3330 }
3331 }; # $in_body
3332
3333 B: {
3334 if ($self->{insertion_mode} ne 'trailing end') {
3335 if ($token->{type} eq 'DOCTYPE') {
3336 !!!parse-error (type => 'in html:#DOCTYPE');
3337 ## Ignore the token
3338 ## Stay in the current insertion mode
3339 !!!next-token;
3340 redo B;
3341 } elsif ($token->{type} eq 'start tag' and
3342 $token->{tag_name} eq 'html') {
3343 ## ISSUE: "aa<html>" is not a parse error.
3344 ## ISSUE: "<html>" in fragment is not a parse error.
3345 unless ($token->{first_start_tag}) {
3346 !!!parse-error (type => 'not first start tag');
3347 }
3348 my $top_el = $self->{open_elements}->[0]->[0];
3349 for my $attr_name (keys %{$token->{attributes}}) {
3350 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
3351 $top_el->set_attribute_ns
3352 (undef, [undef, $attr_name],
3353 $token->{attributes}->{$attr_name}->{value});
3354 }
3355 }
3356 !!!next-token;
3357 redo B;
3358 } elsif ($token->{type} eq 'end-of-file') {
3359 ## Generate implied end tags
3360 if ({
3361 dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
3362 tbody => 1, tfoot=> 1, thead => 1,
3363 }->{$self->{open_elements}->[-1]->[1]}) {
3364 !!!back-token;
3365 $token = {type => 'end tag', tag_name => $self->{open_elements}->[-1]->[1]};
3366 redo B;
3367 }
3368
3369 if (@{$self->{open_elements}} > 2 or
3370 (@{$self->{open_elements}} == 2 and $self->{open_elements}->[1]->[1] ne 'body')) {
3371 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3372 } elsif (defined $self->{inner_html_node} and
3373 @{$self->{open_elements}} > 1 and
3374 $self->{open_elements}->[1]->[1] ne 'body') {
3375 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3376 }
3377
3378 ## Stop parsing
3379 last B;
3380
3381 ## ISSUE: There is an issue in the spec.
3382 } else {
3383 if ($self->{insertion_mode} eq 'before head') {
3384 if ($token->{type} eq 'character') {
3385 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3386 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3387 unless (length $token->{data}) {
3388 !!!next-token;
3389 redo B;
3390 }
3391 }
3392 ## As if <head>
3393 !!!create-element ($self->{head_element}, 'head');
3394 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3395 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3396 $self->{insertion_mode} = 'in head';
3397 ## reprocess
3398 redo B;
3399 } elsif ($token->{type} eq 'comment') {
3400 my $comment = $self->{document}->create_comment ($token->{data});
3401 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3402 !!!next-token;
3403 redo B;
3404 } elsif ($token->{type} eq 'start tag') {
3405 my $attr = $token->{tag_name} eq 'head' ? $token->{attributes} : {};
3406 !!!create-element ($self->{head_element}, 'head', $attr);
3407 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3408 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3409 $self->{insertion_mode} = 'in head';
3410 if ($token->{tag_name} eq 'head') {
3411 !!!next-token;
3412 #} elsif ({
3413 # base => 1, link => 1, meta => 1,
3414 # script => 1, style => 1, title => 1,
3415 # }->{$token->{tag_name}}) {
3416 # ## reprocess
3417 } else {
3418 ## reprocess
3419 }
3420 redo B;
3421 } elsif ($token->{type} eq 'end tag') {
3422 if ({
3423 head => 1, body => 1, html => 1,
3424 p => 1, br => 1,
3425 }->{$token->{tag_name}}) {
3426 ## As if <head>
3427 !!!create-element ($self->{head_element}, 'head');
3428 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3429 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3430 $self->{insertion_mode} = 'in head';
3431 ## reprocess
3432 redo B;
3433 } else {
3434 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3435 ## Ignore the token ## ISSUE: An issue in the spec.
3436 !!!next-token;
3437 redo B;
3438 }
3439 } else {
3440 die "$0: $token->{type}: Unknown type";
3441 }
3442 } elsif ($self->{insertion_mode} eq 'in head' or
3443 $self->{insertion_mode} eq 'in head noscript' or
3444 $self->{insertion_mode} eq 'after head') {
3445 if ($token->{type} eq 'character') {
3446 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3447 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3448 unless (length $token->{data}) {
3449 !!!next-token;
3450 redo B;
3451 }
3452 }
3453
3454 #
3455 } elsif ($token->{type} eq 'comment') {
3456 my $comment = $self->{document}->create_comment ($token->{data});
3457 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3458 !!!next-token;
3459 redo B;
3460 } elsif ($token->{type} eq 'start tag') {
3461 if ({base => ($self->{insertion_mode} eq 'in head' or
3462 $self->{insertion_mode} eq 'after head'),
3463 link => 1}->{$token->{tag_name}}) {
3464 ## NOTE: There is an "as if in head" code clone.
3465 if ($self->{insertion_mode} eq 'after head') {
3466 !!!parse-error (type => 'after head:'.$token->{tag_name});
3467 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3468 }
3469 !!!insert-element ($token->{tag_name}, $token->{attributes});
3470 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3471 pop @{$self->{open_elements}}
3472 if $self->{insertion_mode} eq 'after head';
3473 !!!next-token;
3474 redo B;
3475 } elsif ($token->{tag_name} eq 'meta') {
3476 ## NOTE: There is an "as if in head" code clone.
3477 if ($self->{insertion_mode} eq 'after head') {
3478 !!!parse-error (type => 'after head:'.$token->{tag_name});
3479 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3480 }
3481 !!!insert-element ($token->{tag_name}, $token->{attributes});
3482 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3483
3484 unless ($self->{confident}) {
3485 my $charset;
3486 if ($token->{attributes}->{charset}) { ## TODO: And if supported
3487 $charset = $token->{attributes}->{charset}->{value};
3488 }
3489 if ($token->{attributes}->{'http-equiv'}) {
3490 ## ISSUE: The algorithm name in the spec was incorrect, so it was not linked to its definition.
3491 if ($token->{attributes}->{'http-equiv'}->{value}
3492 =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
3493 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
3494 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
3495 $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
3496 } ## TODO: And if supported
3497 }
3498 ## TODO: Change the encoding
3499 }
3500
3501 ## TODO: Extracting |charset| from |meta|.
3502 pop @{$self->{open_elements}}
3503 if $self->{insertion_mode} eq 'after head';
3504 !!!next-token;
3505 redo B;
3506 } elsif ($token->{tag_name} eq 'title' and
3507 $self->{insertion_mode} eq 'in head') {
3508 ## NOTE: There is an "as if in head" code clone.
3509 if ($self->{insertion_mode} eq 'after head') {
3510 !!!parse-error (type => 'after head:'.$token->{tag_name});
3511 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3512 }
3513 my $parent = defined $self->{head_element} ? $self->{head_element}
3514 : $self->{open_elements}->[-1]->[0];
3515 $parse_rcdata->('RCDATA', sub { $parent->append_child ($_[0]) });
3516 pop @{$self->{open_elements}}
3517 if $self->{insertion_mode} eq 'after head';
3518 redo B;
3519 } elsif ($token->{tag_name} eq 'style') {
3520 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
3521 ## insertion mode 'in head')
3522 ## NOTE: There is an "as if in head" code clone.
3523 if ($self->{insertion_mode} eq 'after head') {
3524 !!!parse-error (type => 'after head:'.$token->{tag_name});
3525 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3526 }
3527 $parse_rcdata->('CDATA', $insert_to_current);
3528 pop @{$self->{open_elements}}
3529 if $self->{insertion_mode} eq 'after head';
3530 redo B;
3531 } elsif ($token->{tag_name} eq 'noscript') {
3532 if ($self->{insertion_mode} eq 'in head') {
3533 ## NOTE: and scripting is disabled
3534 !!!insert-element ($token->{tag_name}, $token->{attributes});
3535 $self->{insertion_mode} = 'in head noscript';
3536 !!!next-token;
3537 redo B;
3538 } elsif ($self->{insertion_mode} eq 'in head noscript') {
3539 !!!parse-error (type => 'in noscript:noscript');
3540 ## Ignore the token
3541 redo B;
3542 } else {
3543 #
3544 }
3545 } elsif ($token->{tag_name} eq 'head' and
3546 $self->{insertion_mode} ne 'after head') {
3547 !!!parse-error (type => 'in head:head'); # or in head noscript
3548 ## Ignore the token
3549 !!!next-token;
3550 redo B;
3551 } elsif ($self->{insertion_mode} ne 'in head noscript' and
3552 $token->{tag_name} eq 'script') {
3553 if ($self->{insertion_mode} eq 'after head') {
3554 !!!parse-error (type => 'after head:'.$token->{tag_name});
3555 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3556 }
3557 ## NOTE: There is an "as if in head" code clone.
3558 $script_start_tag->($insert_to_current);
3559 pop @{$self->{open_elements}}
3560 if $self->{insertion_mode} eq 'after head';
3561 redo B;
3562 } elsif ($self->{insertion_mode} eq 'after head' and
3563 $token->{tag_name} eq 'body') {
3564 !!!insert-element ('body', $token->{attributes});
3565 $self->{insertion_mode} = 'in body';
3566 !!!next-token;
3567 redo B;
3568 } elsif ($self->{insertion_mode} eq 'after head' and
3569 $token->{tag_name} eq 'frameset') {
3570 !!!insert-element ('frameset', $token->{attributes});
3571 $self->{insertion_mode} = 'in frameset';
3572 !!!next-token;
3573 redo B;
3574 } else {
3575 #
3576 }
3577 } elsif ($token->{type} eq 'end tag') {
3578 if ($self->{insertion_mode} eq 'in head' and
3579 $token->{tag_name} eq 'head') {
3580 pop @{$self->{open_elements}};
3581 $self->{insertion_mode} = 'after head';
3582 !!!next-token;
3583 redo B;
3584 } elsif ($self->{insertion_mode} eq 'in head noscript' and
3585 $token->{tag_name} eq 'noscript') {
3586 pop @{$self->{open_elements}};
3587 $self->{insertion_mode} = 'in head';
3588 !!!next-token;
3589 redo B;
3590 } elsif ($self->{insertion_mode} eq 'in head' and
3591 {
3592 body => 1, html => 1,
3593 p => 1, br => 1,
3594 }->{$token->{tag_name}}) {
3595 #
3596 } elsif ($self->{insertion_mode} eq 'in head noscript' and
3597 {
3598 p => 1, br => 1,
3599 }->{$token->{tag_name}}) {
3600 #
3601 } elsif ($self->{insertion_mode} ne 'after head') {
3602 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3603 ## Ignore the token
3604 !!!next-token;
3605 redo B;
3606 } else {
3607 #
3608 }
3609 } else {
3610 #
3611 }
3612
3613 ## As if </head> or </noscript> or <body>
3614 if ($self->{insertion_mode} eq 'in head') {
3615 pop @{$self->{open_elements}};
3616 $self->{insertion_mode} = 'after head';
3617 } elsif ($self->{insertion_mode} eq 'in head noscript') {
3618 pop @{$self->{open_elements}};
3619 !!!parse-error (type => 'in noscript:'.(defined $token->{tag_name} ? ($token->{type} eq 'end tag' ? '/' : '') . $token->{tag_name} : '#' . $token->{type}));
3620 $self->{insertion_mode} = 'in head';
3621 } else { # 'after head'
3622 !!!insert-element ('body');
3623 $self->{insertion_mode} = 'in body';
3624 }
3625 ## reprocess
3626 redo B;
3627
3628 ## ISSUE: An issue in the spec.
3629 } elsif ($self->{insertion_mode} eq 'in body') {
3630 if ($token->{type} eq 'character') {
3631 ## NOTE: There is a code clone of "character in body".
3632 $reconstruct_active_formatting_elements->($insert_to_current);
3633
3634 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3635
3636 !!!next-token;
3637 redo B;
3638 } elsif ($token->{type} eq 'comment') {
3639 ## NOTE: There is a code clone of "comment in body".
3640 my $comment = $self->{document}->create_comment ($token->{data});
3641 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3642 !!!next-token;
3643 redo B;
3644 } else {
3645 $in_body->($insert_to_current);
3646 redo B;
3647 }
3648 } elsif ($self->{insertion_mode} eq 'in table') {
3649 if ($token->{type} eq 'character') {
3650 ## NOTE: There are "character in table" code clones.
3651 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3652 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3653
3654 unless (length $token->{data}) {
3655 !!!next-token;
3656 redo B;
3657 }
3658 }
3659
3660 !!!parse-error (type => 'in table:#character');
3661
3662 ## As if in body, but insert into foster parent element
3663 ## ISSUE: The spec says "whenever a node would be inserted
3664 ## into the current node", while character tokens might not
3665 ## result in a new Text node.
3666 $reconstruct_active_formatting_elements->($insert_to_foster);
3667
3668 if ({
3669 table => 1, tbody => 1, tfoot => 1,
3670 thead => 1, tr => 1,
3671 }->{$self->{open_elements}->[-1]->[1]}) {
3672 # MUST
3673 my $foster_parent_element;
3674 my $next_sibling;
3675 my $prev_sibling;
3676 OE: for (reverse 0..$#{$self->{open_elements}}) {
3677 if ($self->{open_elements}->[$_]->[1] eq 'table') {
3678 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3679 if (defined $parent and $parent->node_type == 1) {
3680 $foster_parent_element = $parent;
3681 $next_sibling = $self->{open_elements}->[$_]->[0];
3682 $prev_sibling = $next_sibling->previous_sibling;
3683 } else {
3684 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
3685 $prev_sibling = $foster_parent_element->last_child;
3686 }
3687 last OE;
3688 }
3689 } # OE
3690 $foster_parent_element = $self->{open_elements}->[0]->[0] and
3691 $prev_sibling = $foster_parent_element->last_child
3692 unless defined $foster_parent_element;
3693 if (defined $prev_sibling and
3694 $prev_sibling->node_type == 3) {
3695 $prev_sibling->manakai_append_text ($token->{data});
3696 } else {
3697 $foster_parent_element->insert_before
3698 ($self->{document}->create_text_node ($token->{data}),
3699 $next_sibling);
3700 }
3701 } else {
3702 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3703 }
3704
3705 !!!next-token;
3706 redo B;
3707 } elsif ($token->{type} eq 'comment') {
3708 my $comment = $self->{document}->create_comment ($token->{data});
3709 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3710 !!!next-token;
3711 redo B;
3712 } elsif ($token->{type} eq 'start tag') {
3713 if ({
3714 caption => 1,
3715 colgroup => 1,
3716 tbody => 1, tfoot => 1, thead => 1,
3717 }->{$token->{tag_name}}) {
3718 ## Clear back to table context
3719 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3720 $self->{open_elements}->[-1]->[1] ne 'html') {
3721 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3722 pop @{$self->{open_elements}};
3723 }
3724
3725 push @$active_formatting_elements, ['#marker', '']
3726 if $token->{tag_name} eq 'caption';
3727
3728 !!!insert-element ($token->{tag_name}, $token->{attributes});
3729 $self->{insertion_mode} = {
3730 caption => 'in caption',
3731 colgroup => 'in column group',
3732 tbody => 'in table body',
3733 tfoot => 'in table body',
3734 thead => 'in table body',
3735 }->{$token->{tag_name}};
3736 !!!next-token;
3737 redo B;
3738 } elsif ({
3739 col => 1,
3740 td => 1, th => 1, tr => 1,
3741 }->{$token->{tag_name}}) {
3742 ## Clear back to table context
3743 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3744 $self->{open_elements}->[-1]->[1] ne 'html') {
3745 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3746 pop @{$self->{open_elements}};
3747 }
3748
3749 !!!insert-element ($token->{tag_name} eq 'col' ? 'colgroup' : 'tbody');
3750 $self->{insertion_mode} = $token->{tag_name} eq 'col'
3751 ? 'in column group' : 'in table body';
3752 ## reprocess
3753 redo B;
3754 } elsif ($token->{tag_name} eq 'table') {
3755 ## NOTE: There are code clones for this "table in table"
3756 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3757
3758 ## As if </table>
3759 ## have a table element in table scope
3760 my $i;
3761 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3762 my $node = $self->{open_elements}->[$_];
3763 if ($node->[1] eq 'table') {
3764 $i = $_;
3765 last INSCOPE;
3766 } elsif ({
3767 table => 1, html => 1,
3768 }->{$node->[1]}) {
3769 last INSCOPE;
3770 }
3771 } # INSCOPE
3772 unless (defined $i) {
3773 !!!parse-error (type => 'unmatched end tag:table');
3774 ## Ignore tokens </table><table>
3775 !!!next-token;
3776 redo B;
3777 }
3778
3779 ## generate implied end tags
3780 if ({
3781 dd => 1, dt => 1, li => 1, p => 1,
3782 td => 1, th => 1, tr => 1,
3783 tbody => 1, tfoot=> 1, thead => 1,
3784 }->{$self->{open_elements}->[-1]->[1]}) {
3785 !!!back-token; # <table>
3786 $token = {type => 'end tag', tag_name => 'table'};
3787 !!!back-token;
3788 $token = {type => 'end tag',
3789 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3790 redo B;
3791 }
3792
3793 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3794 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3795 }
3796
3797 splice @{$self->{open_elements}}, $i;
3798
3799 $self->_reset_insertion_mode;
3800
3801 ## reprocess
3802 redo B;
3803 } else {
3804 #
3805 }
3806 } elsif ($token->{type} eq 'end tag') {
3807 if ($token->{tag_name} eq 'table') {
3808 ## have a table element in table scope
3809 my $i;
3810 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3811 my $node = $self->{open_elements}->[$_];
3812 if ($node->[1] eq $token->{tag_name}) {
3813 $i = $_;
3814 last INSCOPE;
3815 } elsif ({
3816 table => 1, html => 1,
3817 }->{$node->[1]}) {
3818 last INSCOPE;
3819 }
3820 } # INSCOPE
3821 unless (defined $i) {
3822 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3823 ## Ignore the token
3824 !!!next-token;
3825 redo B;
3826 }
3827
3828 ## generate implied end tags
3829 if ({
3830 dd => 1, dt => 1, li => 1, p => 1,
3831 td => 1, th => 1, tr => 1,
3832 tbody => 1, tfoot=> 1, thead => 1,
3833 }->{$self->{open_elements}->[-1]->[1]}) {
3834 !!!back-token;
3835 $token = {type => 'end tag',
3836 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3837 redo B;
3838 }
3839
3840 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3841 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3842 }
3843
3844 splice @{$self->{open_elements}}, $i;
3845
3846 $self->_reset_insertion_mode;
3847
3848 !!!next-token;
3849 redo B;
3850 } elsif ({
3851 body => 1, caption => 1, col => 1, colgroup => 1,
3852 html => 1, tbody => 1, td => 1, tfoot => 1, th => 1,
3853 thead => 1, tr => 1,
3854 }->{$token->{tag_name}}) {
3855 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3856 ## Ignore the token
3857 !!!next-token;
3858 redo B;
3859 } else {
3860 #
3861 }
3862 } else {
3863 #
3864 }
3865
3866 !!!parse-error (type => 'in table:'.$token->{tag_name});
3867 $in_body->($insert_to_foster);
3868 redo B;
3869 } elsif ($self->{insertion_mode} eq 'in caption') {
3870 if ($token->{type} eq 'character') {
3871 ## NOTE: This is a code clone of "character in body".
3872 $reconstruct_active_formatting_elements->($insert_to_current);
3873
3874 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3875
3876 !!!next-token;
3877 redo B;
3878 } elsif ($token->{type} eq 'comment') {
3879 ## NOTE: This is a code clone of "comment in body".
3880 my $comment = $self->{document}->create_comment ($token->{data});
3881 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3882 !!!next-token;
3883 redo B;
3884 } elsif ($token->{type} eq 'start tag') {
3885 if ({
3886 caption => 1, col => 1, colgroup => 1, tbody => 1,
3887 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3888 }->{$token->{tag_name}}) {
3889 !!!parse-error (type => 'not closed:caption');
3890
3891 ## As if </caption>
3892 ## have a table element in table scope
3893 my $i;
3894 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3895 my $node = $self->{open_elements}->[$_];
3896 if ($node->[1] eq 'caption') {
3897 $i = $_;
3898 last INSCOPE;
3899 } elsif ({
3900 table => 1, html => 1,
3901 }->{$node->[1]}) {
3902 last INSCOPE;
3903 }
3904 } # INSCOPE
3905 unless (defined $i) {
3906 !!!parse-error (type => 'unmatched end tag:caption');
3907 ## Ignore the token
3908 !!!next-token;
3909 redo B;
3910 }
3911
3912 ## generate implied end tags
3913 if ({
3914 dd => 1, dt => 1, li => 1, p => 1,
3915 td => 1, th => 1, tr => 1,
3916 tbody => 1, tfoot=> 1, thead => 1,
3917 }->{$self->{open_elements}->[-1]->[1]}) {
3918 !!!back-token; # <?>
3919 $token = {type => 'end tag', tag_name => 'caption'};
3920 !!!back-token;
3921 $token = {type => 'end tag',
3922 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3923 redo B;
3924 }
3925
3926 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3927 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3928 }
3929
3930 splice @{$self->{open_elements}}, $i;
3931
3932 $clear_up_to_marker->();
3933
3934 $self->{insertion_mode} = 'in table';
3935
3936 ## reprocess
3937 redo B;
3938 } else {
3939 #
3940 }
3941 } elsif ($token->{type} eq 'end tag') {
3942 if ($token->{tag_name} eq 'caption') {
3943 ## have a table element in table scope
3944 my $i;
3945 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3946 my $node = $self->{open_elements}->[$_];
3947 if ($node->[1] eq $token->{tag_name}) {
3948 $i = $_;
3949 last INSCOPE;
3950 } elsif ({
3951 table => 1, html => 1,
3952 }->{$node->[1]}) {
3953 last INSCOPE;
3954 }
3955 } # INSCOPE
3956 unless (defined $i) {
3957 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3958 ## Ignore the token
3959 !!!next-token;
3960 redo B;
3961 }
3962
3963 ## generate implied end tags
3964 if ({
3965 dd => 1, dt => 1, li => 1, p => 1,
3966 td => 1, th => 1, tr => 1,
3967 tbody => 1, tfoot=> 1, thead => 1,
3968 }->{$self->{open_elements}->[-1]->[1]}) {
3969 !!!back-token;
3970 $token = {type => 'end tag',
3971 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3972 redo B;
3973 }
3974
3975 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3976 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3977 }
3978
3979 splice @{$self->{open_elements}}, $i;
3980
3981 $clear_up_to_marker->();
3982
3983 $self->{insertion_mode} = 'in table';
3984
3985 !!!next-token;
3986 redo B;
3987 } elsif ($token->{tag_name} eq 'table') {
3988 !!!parse-error (type => 'not closed:caption');
3989
3990 ## As if </caption>
3991 ## have a table element in table scope
3992 my $i;
3993 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3994 my $node = $self->{open_elements}->[$_];
3995 if ($node->[1] eq 'caption') {
3996 $i = $_;
3997 last INSCOPE;
3998 } elsif ({
3999 table => 1, html => 1,
4000 }->{$node->[1]}) {
4001 last INSCOPE;
4002 }
4003 } # INSCOPE
4004 unless (defined $i) {
4005 !!!parse-error (type => 'unmatched end tag:caption');
4006 ## Ignore the token
4007 !!!next-token;
4008 redo B;
4009 }
4010
4011 ## generate implied end tags
4012 if ({
4013 dd => 1, dt => 1, li => 1, p => 1,
4014 td => 1, th => 1, tr => 1,
4015 tbody => 1, tfoot=> 1, thead => 1,
4016 }->{$self->{open_elements}->[-1]->[1]}) {
4017 !!!back-token; # </table>
4018 $token = {type => 'end tag', tag_name => 'caption'};
4019 !!!back-token;
4020 $token = {type => 'end tag',
4021 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4022 redo B;
4023 }
4024
4025 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
4026 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4027 }
4028
4029 splice @{$self->{open_elements}}, $i;
4030
4031 $clear_up_to_marker->();
4032
4033 $self->{insertion_mode} = 'in table';
4034
4035 ## reprocess
4036 redo B;
4037 } elsif ({
4038 body => 1, col => 1, colgroup => 1,
4039 html => 1, tbody => 1, td => 1, tfoot => 1,
4040 th => 1, thead => 1, tr => 1,
4041 }->{$token->{tag_name}}) {
## An end tag named in the enclosing condition cannot be handled in
## the "in caption" insertion mode: report it and drop it.
!!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
## Ignore the token.  The next token has to be fetched before
## restarting the loop; otherwise |redo B| would reprocess this same
## end tag in the same insertion mode and never terminate.
!!!next-token;
redo B;
4045 } else {
4046 #
4047 }
4048 } else {
4049 #
4050 }
4051
4052 $in_body->($insert_to_current);
4053 redo B;
4054 } elsif ($self->{insertion_mode} eq 'in column group') {
4055 if ($token->{type} eq 'character') {
4056 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4057 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4058 unless (length $token->{data}) {
4059 !!!next-token;
4060 redo B;
4061 }
4062 }
4063
4064 #
4065 } elsif ($token->{type} eq 'comment') {
4066 my $comment = $self->{document}->create_comment ($token->{data});
4067 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4068 !!!next-token;
4069 redo B;
4070 } elsif ($token->{type} eq 'start tag') {
4071 if ($token->{tag_name} eq 'col') {
4072 !!!insert-element ($token->{tag_name}, $token->{attributes});
4073 pop @{$self->{open_elements}};
4074 !!!next-token;
4075 redo B;
4076 } else {
4077 #
4078 }
4079 } elsif ($token->{type} eq 'end tag') {
4080 if ($token->{tag_name} eq 'colgroup') {
4081 if ($self->{open_elements}->[-1]->[1] eq 'html') {
4082 !!!parse-error (type => 'unmatched end tag:colgroup');
4083 ## Ignore the token
4084 !!!next-token;
4085 redo B;
4086 } else {
4087 pop @{$self->{open_elements}}; # colgroup
4088 $self->{insertion_mode} = 'in table';
4089 !!!next-token;
4090 redo B;
4091 }
4092 } elsif ($token->{tag_name} eq 'col') {
4093 !!!parse-error (type => 'unmatched end tag:col');
4094 ## Ignore the token
4095 !!!next-token;
4096 redo B;
4097 } else {
4098 #
4099 }
4100 } else {
4101 #
4102 }
4103
4104 ## As if </colgroup>
4105 if ($self->{open_elements}->[-1]->[1] eq 'html') {
4106 !!!parse-error (type => 'unmatched end tag:colgroup');
4107 ## Ignore the token
4108 !!!next-token;
4109 redo B;
4110 } else {
4111 pop @{$self->{open_elements}}; # colgroup
4112 $self->{insertion_mode} = 'in table';
4113 ## reprocess
4114 redo B;
4115 }
4116 } elsif ($self->{insertion_mode} eq 'in table body') {
4117 if ($token->{type} eq 'character') {
4118 ## NOTE: This is a "character in table" code clone.
4119 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4120 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4121
4122 unless (length $token->{data}) {
4123 !!!next-token;
4124 redo B;
4125 }
4126 }
4127
4128 !!!parse-error (type => 'in table:#character');
4129
4130 ## As if in body, but insert into foster parent element
4131 ## ISSUE: Spec says that "whenever a node would be inserted
4132 ## into the current node" while characters might not be
4133 ## result in a new Text node.
4134 $reconstruct_active_formatting_elements->($insert_to_foster);
4135
4136 if ({
4137 table => 1, tbody => 1, tfoot => 1,
4138 thead => 1, tr => 1,
4139 }->{$self->{open_elements}->[-1]->[1]}) {
4140 # MUST
4141 my $foster_parent_element;
4142 my $next_sibling;
4143 my $prev_sibling;
4144 OE: for (reverse 0..$#{$self->{open_elements}}) {
4145 if ($self->{open_elements}->[$_]->[1] eq 'table') {
4146 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4147 if (defined $parent and $parent->node_type == 1) {
4148 $foster_parent_element = $parent;
4149 $next_sibling = $self->{open_elements}->[$_]->[0];
4150 $prev_sibling = $next_sibling->previous_sibling;
4151 } else {
4152 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
4153 $prev_sibling = $foster_parent_element->last_child;
4154 }
4155 last OE;
4156 }
4157 } # OE
4158 $foster_parent_element = $self->{open_elements}->[0]->[0] and
4159 $prev_sibling = $foster_parent_element->last_child
4160 unless defined $foster_parent_element;
4161 if (defined $prev_sibling and
4162 $prev_sibling->node_type == 3) {
4163 $prev_sibling->manakai_append_text ($token->{data});
4164 } else {
4165 $foster_parent_element->insert_before
4166 ($self->{document}->create_text_node ($token->{data}),
4167 $next_sibling);
4168 }
4169 } else {
4170 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4171 }
4172
4173 !!!next-token;
4174 redo B;
4175 } elsif ($token->{type} eq 'comment') {
4176 ## Copied from 'in table'
4177 my $comment = $self->{document}->create_comment ($token->{data});
4178 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4179 !!!next-token;
4180 redo B;
4181 } elsif ($token->{type} eq 'start tag') {
4182 if ({
4183 tr => 1,
4184 th => 1, td => 1,
4185 }->{$token->{tag_name}}) {
4186 unless ($token->{tag_name} eq 'tr') {
4187 !!!parse-error (type => 'missing start tag:tr');
4188 }
4189
4190 ## Clear back to table body context
4191 while (not {
4192 tbody => 1, tfoot => 1, thead => 1, html => 1,
4193 }->{$self->{open_elements}->[-1]->[1]}) {
4194 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4195 pop @{$self->{open_elements}};
4196 }
4197
4198 $self->{insertion_mode} = 'in row';
4199 if ($token->{tag_name} eq 'tr') {
4200 !!!insert-element ($token->{tag_name}, $token->{attributes});
4201 !!!next-token;
4202 } else {
4203 !!!insert-element ('tr');
4204 ## reprocess
4205 }
4206 redo B;
4207 } elsif ({
4208 caption => 1, col => 1, colgroup => 1,
4209 tbody => 1, tfoot => 1, thead => 1,
4210 }->{$token->{tag_name}}) {
4211 ## have an element in table scope
4212 my $i;
4213 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4214 my $node = $self->{open_elements}->[$_];
4215 if ({
4216 tbody => 1, thead => 1, tfoot => 1,
4217 }->{$node->[1]}) {
4218 $i = $_;
4219 last INSCOPE;
4220 } elsif ({
4221 table => 1, html => 1,
4222 }->{$node->[1]}) {
4223 last INSCOPE;
4224 }
4225 } # INSCOPE
4226 unless (defined $i) {
4227 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4228 ## Ignore the token
4229 !!!next-token;
4230 redo B;
4231 }
4232
4233 ## Clear back to table body context
4234 while (not {
4235 tbody => 1, tfoot => 1, thead => 1, html => 1,
4236 }->{$self->{open_elements}->[-1]->[1]}) {
4237 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4238 pop @{$self->{open_elements}};
4239 }
4240
4241 ## As if <{current node}>
4242 ## have an element in table scope
4243 ## true by definition
4244
4245 ## Clear back to table body context
4246 ## nop by definition
4247
4248 pop @{$self->{open_elements}};
4249 $self->{insertion_mode} = 'in table';
4250 ## reprocess
4251 redo B;
4252 } elsif ($token->{tag_name} eq 'table') {
4253 ## NOTE: This is a code clone of "table in table"
4254 !!!parse-error (type => 'not closed:table');
4255
4256 ## As if </table>
4257 ## have a table element in table scope
4258 my $i;
4259 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4260 my $node = $self->{open_elements}->[$_];
4261 if ($node->[1] eq 'table') {
4262 $i = $_;
4263 last INSCOPE;
4264 } elsif ({
4265 table => 1, html => 1,
4266 }->{$node->[1]}) {
4267 last INSCOPE;
4268 }
4269 } # INSCOPE
4270 unless (defined $i) {
4271 !!!parse-error (type => 'unmatched end tag:table');
4272 ## Ignore tokens </table><table>
4273 !!!next-token;
4274 redo B;
4275 }
4276
4277 ## generate implied end tags
4278 if ({
4279 dd => 1, dt => 1, li => 1, p => 1,
4280 td => 1, th => 1, tr => 1,
4281 tbody => 1, tfoot=> 1, thead => 1,
4282 }->{$self->{open_elements}->[-1]->[1]}) {
4283 !!!back-token; # <table>
4284 $token = {type => 'end tag', tag_name => 'table'};
4285 !!!back-token;
4286 $token = {type => 'end tag',
4287 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4288 redo B;
4289 }
4290
4291 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4292 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4293 }
4294
4295 splice @{$self->{open_elements}}, $i;
4296
4297 $self->_reset_insertion_mode;
4298
4299 ## reprocess
4300 redo B;
4301 } else {
4302 #
4303 }
4304 } elsif ($token->{type} eq 'end tag') {
4305 if ({
4306 tbody => 1, tfoot => 1, thead => 1,
4307 }->{$token->{tag_name}}) {
4308 ## have an element in table scope
4309 my $i;
4310 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4311 my $node = $self->{open_elements}->[$_];
4312 if ($node->[1] eq $token->{tag_name}) {
4313 $i = $_;
4314 last INSCOPE;
4315 } elsif ({
4316 table => 1, html => 1,
4317 }->{$node->[1]}) {
4318 last INSCOPE;
4319 }
4320 } # INSCOPE
4321 unless (defined $i) {
4322 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4323 ## Ignore the token
4324 !!!next-token;
4325 redo B;
4326 }
4327
4328 ## Clear back to table body context
4329 while (not {
4330 tbody => 1, tfoot => 1, thead => 1, html => 1,
4331 }->{$self->{open_elements}->[-1]->[1]}) {
4332 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4333 pop @{$self->{open_elements}};
4334 }
4335
4336 pop @{$self->{open_elements}};
4337 $self->{insertion_mode} = 'in table';
4338 !!!next-token;
4339 redo B;
4340 } elsif ($token->{tag_name} eq 'table') {
4341 ## have an element in table scope
4342 my $i;
4343 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4344 my $node = $self->{open_elements}->[$_];
4345 if ({
4346 tbody => 1, thead => 1, tfoot => 1,
4347 }->{$node->[1]}) {
4348 $i = $_;
4349 last INSCOPE;
4350 } elsif ({
4351 table => 1, html => 1,
4352 }->{$node->[1]}) {
4353 last INSCOPE;
4354 }
4355 } # INSCOPE
4356 unless (defined $i) {
4357 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4358 ## Ignore the token
4359 !!!next-token;
4360 redo B;
4361 }
4362
4363 ## Clear back to table body context
4364 while (not {
4365 tbody => 1, tfoot => 1, thead => 1, html => 1,
4366 }->{$self->{open_elements}->[-1]->[1]}) {
4367 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4368 pop @{$self->{open_elements}};
4369 }
4370
4371 ## As if <{current node}>
4372 ## have an element in table scope
4373 ## true by definition
4374
4375 ## Clear back to table body context
4376 ## nop by definition
4377
4378 pop @{$self->{open_elements}};
4379 $self->{insertion_mode} = 'in table';
4380 ## reprocess
4381 redo B;
4382 } elsif ({
4383 body => 1, caption => 1, col => 1, colgroup => 1,
4384 html => 1, td => 1, th => 1, tr => 1,
4385 }->{$token->{tag_name}}) {
4386 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4387 ## Ignore the token
4388 !!!next-token;
4389 redo B;
4390 } else {
4391 #
4392 }
4393 } else {
4394 #
4395 }
4396
4397 ## As if in table
4398 !!!parse-error (type => 'in table:'.$token->{tag_name});
4399 $in_body->($insert_to_foster);
4400 redo B;
4401 } elsif ($self->{insertion_mode} eq 'in row') {
4402 if ($token->{type} eq 'character') {
4403 ## NOTE: This is a "character in table" code clone.
4404 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4405 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4406
4407 unless (length $token->{data}) {
4408 !!!next-token;
4409 redo B;
4410 }
4411 }
4412
4413 !!!parse-error (type => 'in table:#character');
4414
4415 ## As if in body, but insert into foster parent element
4416 ## ISSUE: Spec says that "whenever a node would be inserted
4417 ## into the current node" while characters might not be
4418 ## result in a new Text node.
4419 $reconstruct_active_formatting_elements->($insert_to_foster);
4420
4421 if ({
4422 table => 1, tbody => 1, tfoot => 1,
4423 thead => 1, tr => 1,
4424 }->{$self->{open_elements}->[-1]->[1]}) {
4425 # MUST
4426 my $foster_parent_element;
4427 my $next_sibling;
4428 my $prev_sibling;
4429 OE: for (reverse 0..$#{$self->{open_elements}}) {
4430 if ($self->{open_elements}->[$_]->[1] eq 'table') {
4431 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4432 if (defined $parent and $parent->node_type == 1) {
4433 $foster_parent_element = $parent;
4434 $next_sibling = $self->{open_elements}->[$_]->[0];
4435 $prev_sibling = $next_sibling->previous_sibling;
4436 } else {
4437 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
4438 $prev_sibling = $foster_parent_element->last_child;
4439 }
4440 last OE;
4441 }
4442 } # OE
4443 $foster_parent_element = $self->{open_elements}->[0]->[0] and
4444 $prev_sibling = $foster_parent_element->last_child
4445 unless defined $foster_parent_element;
4446 if (defined $prev_sibling and
4447 $prev_sibling->node_type == 3) {
4448 $prev_sibling->manakai_append_text ($token->{data});
4449 } else {
4450 $foster_parent_element->insert_before
4451 ($self->{document}->create_text_node ($token->{data}),
4452 $next_sibling);
4453 }
4454 } else {
4455 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4456 }
4457
4458 !!!next-token;
4459 redo B;
4460 } elsif ($token->{type} eq 'comment') {
4461 ## Copied from 'in table'
4462 my $comment = $self->{document}->create_comment ($token->{data});
4463 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4464 !!!next-token;
4465 redo B;
4466 } elsif ($token->{type} eq 'start tag') {
4467 if ($token->{tag_name} eq 'th' or
4468 $token->{tag_name} eq 'td') {
4469 ## Clear back to table row context
4470 while (not {
4471 tr => 1, html => 1,
4472 }->{$self->{open_elements}->[-1]->[1]}) {
4473 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4474 pop @{$self->{open_elements}};
4475 }
4476
4477 !!!insert-element ($token->{tag_name}, $token->{attributes});
4478 $self->{insertion_mode} = 'in cell';
4479
4480 push @$active_formatting_elements, ['#marker', ''];
4481
4482 !!!next-token;
4483 redo B;
4484 } elsif ({
4485 caption => 1, col => 1, colgroup => 1,
4486 tbody => 1, tfoot => 1, thead => 1, tr => 1,
4487 }->{$token->{tag_name}}) {
4488 ## As if </tr>
4489 ## have an element in table scope
4490 my $i;
4491 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4492 my $node = $self->{open_elements}->[$_];
4493 if ($node->[1] eq 'tr') {
4494 $i = $_;
4495 last INSCOPE;
4496 } elsif ({
4497 table => 1, html => 1,
4498 }->{$node->[1]}) {
4499 last INSCOPE;
4500 }
4501 } # INSCOPE
unless (defined $i) {
  ## The scan above found no |tr| element in table scope, so the
  ## implied </tr> cannot be applied.  Report it using the same
  ## 'unmatched end tag:' error type spelling as every other branch,
  ## then drop the current token.
  !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
  ## Ignore the token
  !!!next-token;
  redo B;
}
4508
4509 ## Clear back to table row context
4510 while (not {
4511 tr => 1, html => 1,
4512 }->{$self->{open_elements}->[-1]->[1]}) {
4513 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4514 pop @{$self->{open_elements}};
4515 }
4516
4517 pop @{$self->{open_elements}}; # tr
4518 $self->{insertion_mode} = 'in table body';
4519 ## reprocess
4520 redo B;
4521 } elsif ($token->{tag_name} eq 'table') {
4522 ## NOTE: This is a code clone of "table in table"
4523 !!!parse-error (type => 'not closed:table');
4524
4525 ## As if </table>
4526 ## have a table element in table scope
4527 my $i;
4528 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4529 my $node = $self->{open_elements}->[$_];
4530 if ($node->[1] eq 'table') {
4531 $i = $_;
4532 last INSCOPE;
4533 } elsif ({
4534 table => 1, html => 1,
4535 }->{$node->[1]}) {
4536 last INSCOPE;
4537 }
4538 } # INSCOPE
4539 unless (defined $i) {
4540 !!!parse-error (type => 'unmatched end tag:table');
4541 ## Ignore tokens </table><table>
4542 !!!next-token;
4543 redo B;
4544 }
4545
4546 ## generate implied end tags
4547 if ({
4548 dd => 1, dt => 1, li => 1, p => 1,
4549 td => 1, th => 1, tr => 1,
4550 tbody => 1, tfoot=> 1, thead => 1,
4551 }->{$self->{open_elements}->[-1]->[1]}) {
4552 !!!back-token; # <table>
4553 $token = {type => 'end tag', tag_name => 'table'};
4554 !!!back-token;
4555 $token = {type => 'end tag',
4556 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4557 redo B;
4558 }
4559
4560 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4561 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4562 }
4563
4564 splice @{$self->{open_elements}}, $i;
4565
4566 $self->_reset_insertion_mode;
4567
4568 ## reprocess
4569 redo B;
4570 } else {
4571 #
4572 }
4573 } elsif ($token->{type} eq 'end tag') {
4574 if ($token->{tag_name} eq 'tr') {
4575 ## have an element in table scope
4576 my $i;
4577 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4578 my $node = $self->{open_elements}->[$_];
4579 if ($node->[1] eq $token->{tag_name}) {
4580 $i = $_;
4581 last INSCOPE;
4582 } elsif ({
4583 table => 1, html => 1,
4584 }->{$node->[1]}) {
4585 last INSCOPE;
4586 }
4587 } # INSCOPE
4588 unless (defined $i) {
4589 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4590 ## Ignore the token
4591 !!!next-token;
4592 redo B;
4593 }
4594
4595 ## Clear back to table row context
4596 while (not {
4597 tr => 1, html => 1,
4598 }->{$self->{open_elements}->[-1]->[1]}) {
4599 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4600 pop @{$self->{open_elements}};
4601 }
4602
4603 pop @{$self->{open_elements}}; # tr
4604 $self->{insertion_mode} = 'in table body';
4605 !!!next-token;
4606 redo B;
4607 } elsif ($token->{tag_name} eq 'table') {
4608 ## As if </tr>
4609 ## have an element in table scope
4610 my $i;
4611 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4612 my $node = $self->{open_elements}->[$_];
4613 if ($node->[1] eq 'tr') {
4614 $i = $_;
4615 last INSCOPE;
4616 } elsif ({
4617 table => 1, html => 1,
4618 }->{$node->[1]}) {
4619 last INSCOPE;
4620 }
4621 } # INSCOPE
unless (defined $i) {
  ## The scan above found no |tr| element in table scope, so the
  ## implied </tr> cannot be applied.  The error type carries the
  ## token's tag name (not its token type, which would always read
  ## "end tag" here).
  !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
  ## Ignore the token
  !!!next-token;
  redo B;
}
4628
4629 ## Clear back to table row context
4630 while (not {
4631 tr => 1, html => 1,
4632 }->{$self->{open_elements}->[-1]->[1]}) {
4633 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4634 pop @{$self->{open_elements}};
4635 }
4636
4637 pop @{$self->{open_elements}}; # tr
4638 $self->{insertion_mode} = 'in table body';
4639 ## reprocess
4640 redo B;
4641 } elsif ({
4642 tbody => 1, tfoot => 1, thead => 1,
4643 }->{$token->{tag_name}}) {
4644 ## have an element in table scope
4645 my $i;
4646 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4647 my $node = $self->{open_elements}->[$_];
4648 if ($node->[1] eq $token->{tag_name}) {
4649 $i = $_;
4650 last INSCOPE;
4651 } elsif ({
4652 table => 1, html => 1,
4653 }->{$node->[1]}) {
4654 last INSCOPE;
4655 }
4656 } # INSCOPE
4657 unless (defined $i) {
4658 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4659 ## Ignore the token
4660 !!!next-token;
4661 redo B;
4662 }
4663
## As if </tr>
## have an element in table scope
## |$i| was already declared for the scope check earlier in this
## branch; reset it for the second scan rather than redeclaring it
## with |my| in the same scope.
undef $i;
4667 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4668 my $node = $self->{open_elements}->[$_];
4669 if ($node->[1] eq 'tr') {
4670 $i = $_;
4671 last INSCOPE;
4672 } elsif ({
4673 table => 1, html => 1,
4674 }->{$node->[1]}) {
4675 last INSCOPE;
4676 }
4677 } # INSCOPE
4678 unless (defined $i) {
4679 !!!parse-error (type => 'unmatched end tag:tr');
4680 ## Ignore the token
4681 !!!next-token;
4682 redo B;
4683 }
4684
4685 ## Clear back to table row context
4686 while (not {
4687 tr => 1, html => 1,
4688 }->{$self->{open_elements}->[-1]->[1]}) {
4689 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4690 pop @{$self->{open_elements}};
4691 }
4692
4693 pop @{$self->{open_elements}}; # tr
4694 $self->{insertion_mode} = 'in table body';
4695 ## reprocess
4696 redo B;
4697 } elsif ({
4698 body => 1, caption => 1, col => 1,
4699 colgroup => 1, html => 1, td => 1, th => 1,
4700 }->{$token->{tag_name}}) {
4701 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4702 ## Ignore the token
4703 !!!next-token;
4704 redo B;
4705 } else {
4706 #
4707 }
4708 } else {
4709 #
4710 }
4711
4712 ## As if in table
4713 !!!parse-error (type => 'in table:'.$token->{tag_name});
4714 $in_body->($insert_to_foster);
4715 redo B;
4716 } elsif ($self->{insertion_mode} eq 'in cell') {
4717 if ($token->{type} eq 'character') {
4718 ## NOTE: This is a code clone of "character in body".
4719 $reconstruct_active_formatting_elements->($insert_to_current);
4720
4721 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4722
4723 !!!next-token;
4724 redo B;
4725 } elsif ($token->{type} eq 'comment') {
4726 ## NOTE: This is a code clone of "comment in body".
4727 my $comment = $self->{document}->create_comment ($token->{data});
4728 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4729 !!!next-token;
4730 redo B;
4731 } elsif ($token->{type} eq 'start tag') {
4732 if ({
4733 caption => 1, col => 1, colgroup => 1,
4734 tbody => 1, td => 1, tfoot => 1, th => 1,
4735 thead => 1, tr => 1,
4736 }->{$token->{tag_name}}) {
4737 ## have an element in table scope
4738 my $tn;
4739 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4740 my $node = $self->{open_elements}->[$_];
4741 if ($node->[1] eq 'td' or $node->[1] eq 'th') {
4742 $tn = $node->[1];
4743 last INSCOPE;
4744 } elsif ({
4745 table => 1, html => 1,
4746 }->{$node->[1]}) {
4747 last INSCOPE;
4748 }
4749 } # INSCOPE
4750 unless (defined $tn) {
4751 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4752 ## Ignore the token
4753 !!!next-token;
4754 redo B;
4755 }
4756
4757 ## Close the cell
4758 !!!back-token; # <?>
4759 $token = {type => 'end tag', tag_name => $tn};
4760 redo B;
4761 } else {
4762 #
4763 }
4764 } elsif ($token->{type} eq 'end tag') {
4765 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
4766 ## have an element in table scope
4767 my $i;
4768 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4769 my $node = $self->{open_elements}->[$_];
4770 if ($node->[1] eq $token->{tag_name}) {
4771 $i = $_;
4772 last INSCOPE;
4773 } elsif ({
4774 table => 1, html => 1,
4775 }->{$node->[1]}) {
4776 last INSCOPE;
4777 }
4778 } # INSCOPE
4779 unless (defined $i) {
4780 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4781 ## Ignore the token
4782 !!!next-token;
4783 redo B;
4784 }
4785
4786 ## generate implied end tags
4787 if ({
4788 dd => 1, dt => 1, li => 1, p => 1,
4789 td => ($token->{tag_name} eq 'th'),
4790 th => ($token->{tag_name} eq 'td'),
4791 tr => 1,
4792 tbody => 1, tfoot=> 1, thead => 1,
4793 }->{$self->{open_elements}->[-1]->[1]}) {
4794 !!!back-token;
4795 $token = {type => 'end tag',
4796 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4797 redo B;
4798 }
4799
4800 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
4801 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4802 }
4803
4804 splice @{$self->{open_elements}}, $i;
4805
4806 $clear_up_to_marker->();
4807
4808 $self->{insertion_mode} = 'in row';
4809
4810 !!!next-token;
4811 redo B;
4812 } elsif ({
4813 body => 1, caption => 1, col => 1,
4814 colgroup => 1, html => 1,
4815 }->{$token->{tag_name}}) {
4816 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4817 ## Ignore the token
4818 !!!next-token;
4819 redo B;
4820 } elsif ({
4821 table => 1, tbody => 1, tfoot => 1,
4822 thead => 1, tr => 1,
4823 }->{$token->{tag_name}}) {
4824 ## have an element in table scope
4825 my $i;
4826 my $tn;
4827 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4828 my $node = $self->{open_elements}->[$_];
4829 if ($node->[1] eq $token->{tag_name}) {
4830 $i = $_;
4831 last INSCOPE;
4832 } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
4833 $tn = $node->[1];
4834 ## NOTE: There is exactly one |td| or |th| element
4835 ## in scope in the stack of open elements by definition.
4836 } elsif ({
4837 table => 1, html => 1,
4838 }->{$node->[1]}) {
4839 last INSCOPE;
4840 }
4841 } # INSCOPE
4842 unless (defined $i) {
4843 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4844 ## Ignore the token
4845 !!!next-token;
4846 redo B;
4847 }
4848
4849 ## Close the cell
4850 !!!back-token; # </?>
4851 $token = {type => 'end tag', tag_name => $tn};
4852 redo B;
4853 } else {
4854 #
4855 }
4856 } else {
4857 #
4858 }
4859
4860 $in_body->($insert_to_current);
4861 redo B;
4862 } elsif ($self->{insertion_mode} eq 'in select') {
4863 if ($token->{type} eq 'character') {
4864 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4865 !!!next-token;
4866 redo B;
4867 } elsif ($token->{type} eq 'comment') {
4868 my $comment = $self->{document}->create_comment ($token->{data});
4869 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4870 !!!next-token;
4871 redo B;
4872 } elsif ($token->{type} eq 'start tag') {
4873 if ($token->{tag_name} eq 'option') {
4874 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4875 ## As if </option>
4876 pop @{$self->{open_elements}};
4877 }
4878
4879 !!!insert-element ($token->{tag_name}, $token->{attributes});
4880 !!!next-token;
4881 redo B;
4882 } elsif ($token->{tag_name} eq 'optgroup') {
4883 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4884 ## As if </option>
4885 pop @{$self->{open_elements}};
4886 }
4887
4888 if ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4889 ## As if </optgroup>
4890 pop @{$self->{open_elements}};
4891 }
4892
4893 !!!insert-element ($token->{tag_name}, $token->{attributes});
4894 !!!next-token;
4895 redo B;
4896 } elsif ($token->{tag_name} eq 'select') {
4897 !!!parse-error (type => 'not closed:select');
4898 ## As if </select> instead
4899 ## have an element in table scope
4900 my $i;
4901 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4902 my $node = $self->{open_elements}->[$_];
4903 if ($node->[1] eq $token->{tag_name}) {
4904 $i = $_;
4905 last INSCOPE;
4906 } elsif ({
4907 table => 1, html => 1,
4908 }->{$node->[1]}) {
4909 last INSCOPE;
4910 }
4911 } # INSCOPE
4912 unless (defined $i) {
4913 !!!parse-error (type => 'unmatched end tag:select');
4914 ## Ignore the token
4915 !!!next-token;
4916 redo B;
4917 }
4918
4919 splice @{$self->{open_elements}}, $i;
4920
4921 $self->_reset_insertion_mode;
4922
4923 !!!next-token;
4924 redo B;
4925 } else {
4926 #
4927 }
4928 } elsif ($token->{type} eq 'end tag') {
4929 if ($token->{tag_name} eq 'optgroup') {
4930 if ($self->{open_elements}->[-1]->[1] eq 'option' and
4931 $self->{open_elements}->[-2]->[1] eq 'optgroup') {
4932 ## As if </option>
4933 splice @{$self->{open_elements}}, -2;
4934 } elsif ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4935 pop @{$self->{open_elements}};
4936 } else {
4937 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4938 ## Ignore the token
4939 }
4940 !!!next-token;
4941 redo B;
4942 } elsif ($token->{tag_name} eq 'option') {
4943 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4944 pop @{$self->{open_elements}};
4945 } else {
4946 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4947 ## Ignore the token
4948 }
4949 !!!next-token;
4950 redo B;
4951 } elsif ($token->{tag_name} eq 'select') {
4952 ## have an element in table scope
4953 my $i;
4954 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4955 my $node = $self->{open_elements}->[$_];
4956 if ($node->[1] eq $token->{tag_name}) {
4957 $i = $_;
4958 last INSCOPE;
4959 } elsif ({
4960 table => 1, html => 1,
4961 }->{$node->[1]}) {
4962 last INSCOPE;
4963 }
4964 } # INSCOPE
4965 unless (defined $i) {
4966 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4967 ## Ignore the token
4968 !!!next-token;
4969 redo B;
4970 }
4971
4972 splice @{$self->{open_elements}}, $i;
4973
4974 $self->_reset_insertion_mode;
4975
4976 !!!next-token;
4977 redo B;
4978 } elsif ({
4979 caption => 1, table => 1, tbody => 1,
4980 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
4981 }->{$token->{tag_name}}) {
4982 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4983
4984 ## have an element in table scope
4985 my $i;
4986 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4987 my $node = $self->{open_elements}->[$_];
4988 if ($node->[1] eq $token->{tag_name}) {
4989 $i = $_;
4990 last INSCOPE;
4991 } elsif ({
4992 table => 1, html => 1,
4993 }->{$node->[1]}) {
4994 last INSCOPE;
4995 }
4996 } # INSCOPE
4997 unless (defined $i) {
4998 ## Ignore the token
4999 !!!next-token;
5000 redo B;
5001 }
5002
5003 ## As if </select>
5004 ## have an element in table scope
5005 undef $i;
5006 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5007 my $node = $self->{open_elements}->[$_];
5008 if ($node->[1] eq 'select') {
5009 $i = $_;
5010 last INSCOPE;
5011 } elsif ({
5012 table => 1, html => 1,
5013 }->{$node->[1]}) {
5014 last INSCOPE;
5015 }
5016 } # INSCOPE
5017 unless (defined $i) {
5018 !!!parse-error (type => 'unmatched end tag:select');
5019 ## Ignore the </select> token
5020 !!!next-token; ## TODO: ok?
5021 redo B;
5022 }
5023
5024 splice @{$self->{open_elements}}, $i;
5025
5026 $self->_reset_insertion_mode;
5027
5028 ## reprocess
5029 redo B;
5030 } else {
5031 #
5032 }
5033 } else {
5034 #
5035 }
5036
5037 !!!parse-error (type => 'in select:'.$token->{tag_name});
5038 ## Ignore the token
5039 !!!next-token;
5040 redo B;
5041 } elsif ($self->{insertion_mode} eq 'after body') {
5042 if ($token->{type} eq 'character') {
5043 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5044 my $data = $1;
5045 ## As if in body
5046 $reconstruct_active_formatting_elements->($insert_to_current);
5047
5048 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5049
5050 unless (length $token->{data}) {
5051 !!!next-token;
5052 redo B;
5053 }
5054 }
5055
5056 #
5057 !!!parse-error (type => 'after body:#'.$token->{type});
5058 } elsif ($token->{type} eq 'comment') {
5059 my $comment = $self->{document}->create_comment ($token->{data});
5060 $self->{open_elements}->[0]->[0]->append_child ($comment);
5061 !!!next-token;
5062 redo B;
5063 } elsif ($token->{type} eq 'start tag') {
5064 !!!parse-error (type => 'after body:'.$token->{tag_name});
5065 #
5066 } elsif ($token->{type} eq 'end tag') {
5067 if ($token->{tag_name} eq 'html') {
5068 if (defined $self->{inner_html_node}) {
5069 !!!parse-error (type => 'unmatched end tag:html');
5070 ## Ignore the token
5071 !!!next-token;
5072 redo B;
5073 } else {
5074 $previous_insertion_mode = $self->{insertion_mode};
5075 $self->{insertion_mode} = 'trailing end';
5076 !!!next-token;
5077 redo B;
5078 }
5079 } else {
5080 !!!parse-error (type => 'after body:/'.$token->{tag_name});
5081 }
5082 } else {
5083 !!!parse-error (type => 'after body:#'.$token->{type});
5084 }
5085
5086 $self->{insertion_mode} = 'in body';
5087 ## reprocess
5088 redo B;
5089 } elsif ($self->{insertion_mode} eq 'in frameset') {
5090 if ($token->{type} eq 'character') {
5091 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5092 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5093
5094 unless (length $token->{data}) {
5095 !!!next-token;
5096 redo B;
5097 }
5098 }
5099
5100 #
5101 } elsif ($token->{type} eq 'comment') {
5102 my $comment = $self->{document}->create_comment ($token->{data});
5103 $self->{open_elements}->[-1]->[0]->append_child ($comment);
5104 !!!next-token;
5105 redo B;
5106 } elsif ($token->{type} eq 'start tag') {
5107 if ($token->{tag_name} eq 'frameset') {
5108 !!!insert-element ($token->{tag_name}, $token->{attributes});
5109 !!!next-token;
5110 redo B;
5111 } elsif ($token->{tag_name} eq 'frame') {
5112 !!!insert-element ($token->{tag_name}, $token->{attributes});
5113 pop @{$self->{open_elements}};
5114 !!!next-token;
5115 redo B;
5116 } elsif ($token->{tag_name} eq 'noframes') {
5117 $in_body->($insert_to_current);
5118 redo B;
5119 } else {
5120 #
5121 }
5122 } elsif ($token->{type} eq 'end tag') {
5123 if ($token->{tag_name} eq 'frameset') {
5124 if ($self->{open_elements}->[-1]->[1] eq 'html' and
5125 @{$self->{open_elements}} == 1) {
5126 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5127 ## Ignore the token
5128 !!!next-token;
5129 } else {
5130 pop @{$self->{open_elements}};
5131 !!!next-token;
5132 }
5133
5134 ## if not inner_html and
5135 if ($self->{open_elements}->[-1]->[1] ne 'frameset') {
5136 $self->{insertion_mode} = 'after frameset';
5137 }
5138 redo B;
5139 } else {
5140 #
5141 }
5142 } else {
5143 #
5144 }
5145
5146 if (defined $token->{tag_name}) {
5147 !!!parse-error (type => 'in frameset:'.($token->{type} eq 'end tag' ? '/' : '').$token->{tag_name});
5148 } else {
5149 !!!parse-error (type => 'in frameset:#'.$token->{type});
5150 }
5151 ## Ignore the token
5152 !!!next-token;
5153 redo B;
5154 } elsif ($self->{insertion_mode} eq 'after frameset') {
5155 if ($token->{type} eq 'character') {
5156 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5157 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5158
5159 unless (length $token->{data}) {
5160 !!!next-token;
5161 redo B;
5162 }
5163 }
5164
5165 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
5166 !!!parse-error (type => 'after frameset:#character');
5167
5168 ## Ignore the token.
5169 if (length $token->{data}) {
5170 ## reprocess the rest of characters
5171 } else {
5172 !!!next-token;
5173 }
5174 redo B;
5175 }
5176 } elsif ($token->{type} eq 'comment') {
5177 my $comment = $self->{document}->create_comment ($token->{data});
5178 $self->{open_elements}->[-1]->[0]->append_child ($comment);
5179 !!!next-token;
5180 redo B;
5181 } elsif ($token->{type} eq 'start tag') {
5182 if ($token->{tag_name} eq 'noframes') {
5183 $in_body->($insert_to_current);
5184 redo B;
5185 } else {
5186 #
5187 }
5188 } elsif ($token->{type} eq 'end tag') {
5189 if ($token->{tag_name} eq 'html') {
5190 $previous_insertion_mode = $self->{insertion_mode};
5191 $self->{insertion_mode} = 'trailing end';
5192 !!!next-token;
5193 redo B;
5194 } else {
5195 #
5196 }
5197 } else {
5198 die "$0: $token->{type}: Unknown token type";
5199 }
5200
5201 !!!parse-error (type => 'after frameset:'.($token->{tag_name} eq 'end tag' ? '/' : '').$token->{tag_name});
5202 ## Ignore the token
5203 !!!next-token;
5204 redo B;
5205
5206 ## ISSUE: An issue in spec there
5207 } else {
5208 die "$0: $self->{insertion_mode}: Unknown insertion mode";
5209 }
5210 }
5211 } elsif ($self->{insertion_mode} eq 'trailing end') {
5212 ## states in the main stage is preserved yet # MUST
5213
5214 if ($token->{type} eq 'DOCTYPE') {
5215 !!!parse-error (type => 'after html:#DOCTYPE');
5216 ## Ignore the token
5217 !!!next-token;
5218 redo B;
5219 } elsif ($token->{type} eq 'comment') {
5220 my $comment = $self->{document}->create_comment ($token->{data});
5221 $self->{document}->append_child ($comment);
5222 !!!next-token;
5223 redo B;
5224 } elsif ($token->{type} eq 'character') {
5225 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5226 my $data = $1;
5227 ## As if in the main phase.
5228 ## NOTE: The insertion mode in the main phase
5229 ## just before the phase has been changed to the trailing
5230 ## end phase is either "after body" or "after frameset".
5231 $reconstruct_active_formatting_elements->($insert_to_current);
5232
5233 $self->{open_elements}->[-1]->[0]->manakai_append_text ($data);
5234
5235 unless (length $token->{data}) {
5236 !!!next-token;
5237 redo B;
5238 }
5239 }
5240
5241 !!!parse-error (type => 'after html:#character');
5242 $self->{insertion_mode} = $previous_insertion_mode;
5243 ## reprocess
5244 redo B;
5245 } elsif ($token->{type} eq 'start tag' or
5246 $token->{type} eq 'end tag') {
5247 !!!parse-error (type => 'after html:'.($token->{type} eq 'end tag' ? '/' : '').$token->{tag_name});
5248 $self->{insertion_mode} = $previous_insertion_mode;
5249 ## reprocess
5250 redo B;
5251 } elsif ($token->{type} eq 'end-of-file') {
5252 ## Stop parsing
5253 last B;
5254 } else {
5255 die "$0: $token->{type}: Unknown token";
5256 }
5257 }
5258 } # B
5259
5260 ## Stop parsing # MUST
5261
5262 ## TODO: script stuffs
5263 } # _tree_construct_main
5264
## Implements the |innerHTML| setter for a Document or an Element node.
##
##   $class->set_inner_html ($node, $string, $onerror);
##
## Arguments:
##   $class    The invocant; used to call |parse_string| and |new|.
##   $node     The node whose children are replaced.  Only node types
##             9 (Document) and 1 (Element) are supported.
##   $string   The markup to parse.  It is read through a reference to
##             the caller's argument, not copied.
##   $onerror  Optional code reference invoked for parse errors.  For an
##             Element node, when it is false a default handler |warn|s
##             the error type with the line and column.
##
## Dies if |$node| has any other node type.
sub set_inner_html ($$$) {
  my ($class, $node) = (shift, shift);
  my ($s, $onerror) = (\$_[0], $_[1]);

  my $nt = $node->node_type;
  unless ($nt == 9 or $nt == 1) {
    die "$0: |set_inner_html| is not defined for node of type $nt";
  }

  if ($nt == 9) {
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## Iterate over a snapshot of the child list while removing.
    my @old_children = @{$node->child_nodes};
    for my $old_child (@old_children) {
      $node->remove_child ($old_child);
    }

    ## Step 3, 4, 5 # MUST
    return $class->parse_string ($$s => $node, $onerror);
  }

  ## From here on, |$node| is an Element.

  ## TODO: If non-html element

  ## NOTE: Most of this code is copied from |parse_string|

  ## Step 1 # MUST
  ## The fragment is parsed into a fresh document, not into |$node|'s own.
  my $this_doc = $node->owner_document;
  my $doc = $this_doc->implementation->create_document;
  $doc->manakai_is_html (1);
  my $p = $class->new;
  $p->{document} = $doc;

  ## Step 9 # MUST
  ## Input stream state shared by the two closures below.
  my $i = 0;
  my $line = 1;
  my $column = 0;
  $p->{set_next_input_character} = sub {
    my $self = shift;

    ## Slide the history of previously read characters by one.
    pop @{$self->{prev_input_character}};
    unshift @{$self->{prev_input_character}}, $self->{next_input_character};

    ## -1 marks the end of the input string.
    if ($i >= length $$s) {
      $self->{next_input_character} = -1;
      return;
    }

    $self->{next_input_character} = ord substr $$s, $i++, 1;
    $column++;

    if ($self->{next_input_character} == 0x000A) { # LF
      $line++;
      $column = 0;
    } elsif ($self->{next_input_character} == 0x000D) { # CR
      ## CR and CR LF are both reported as a single LF.
      $i++ if substr ($$s, $i, 1) eq "\x0A";
      $self->{next_input_character} = 0x000A; # LF # MUST
      $line++;
      $column = 0;
    } elsif ($self->{next_input_character} > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($self->{next_input_character} == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $p->{prev_input_character} = [-1, -1, -1];
  $p->{next_input_character} = -1;

  ## Use the caller's handler if given, otherwise |warn| about the error.
  my $ponerror = $onerror;
  $ponerror ||= sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  ## Every report is extended with the current input position.
  $p->{parse_error} = sub {
    $ponerror->(@_, line => $line, column => $column);
  };

  $p->_initialize_tokenizer;
  $p->_initialize_tree_constructor;

  ## Step 2
  ## Initial content model flag by context element name; anything not
  ## listed here is parsed as PCDATA.
  my %content_model_of = (
    title     => 'RCDATA',
    textarea  => 'RCDATA',
    style     => 'CDATA',
    script    => 'CDATA',
    xmp       => 'CDATA',
    iframe    => 'CDATA',
    noembed   => 'CDATA',
    noframes  => 'CDATA',
    noscript  => 'CDATA',
    plaintext => 'PLAINTEXT',
  );
  my $node_ln = $node->local_name;
  $p->{content_model_flag} = $content_model_of{$node_ln} || 'PCDATA';
  ## ISSUE: What is "the name of the element"? local name?

  $p->{inner_html_node} = [$node, $node_ln];

  ## Step 4
  my $root = $doc->create_element_ns
      ('http://www.w3.org/1999/xhtml', [undef, 'html']);

  ## Step 5 # MUST
  $doc->append_child ($root);

  ## Step 6 # MUST
  push @{$p->{open_elements}}, [$root, 'html'];

  $p->{head_element} = undef;

  ## Step 7 # MUST
  $p->_reset_insertion_mode;

  ## Step 8 # MUST
  ## The nearest XHTML |form| among |$node| and its ancestors, if any,
  ## becomes the parser's form element.
  AN: for (my $anode = $node; defined $anode; $anode = $anode->parent_node) {
    next AN unless $anode->node_type == 1;
    my $nsuri = $anode->namespace_uri;
    next AN unless defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml';
    next AN unless $anode->local_name eq 'form'; ## TODO: case?
    $p->{form_element} = $anode;
    last AN;
  } # AN

  ## Step 3 # MUST
  ## Step 10 # MUST
  {
    ## The macro below refers to a variable named |$self|.
    my $self = $p;
    !!!next-token;
  }
  $p->_tree_construction_main;

  ## Step 11 # MUST
  my @old_children = @{$node->child_nodes};
  for my $old_child (@old_children) {
    $node->remove_child ($old_child);
  }
  ## ISSUE: mutation events? read-only?

  ## Step 12 # MUST
  ## Move the parsed children of the temporary root under |$node|.
  my @new_children = @{$root->child_nodes};
  for my $new_child (@new_children) {
    $this_doc->adopt_node ($new_child);
    $node->append_child ($new_child);
  }
  ## ISSUE: mutation events?

  $p->_terminate_tree_constructor;
} # set_inner_html
5418
5419 } # tree construction stage
5420
## Serializes the children of |$node| as an HTML fragment (the
## |innerHTML| getter).
##
##   my $ref = $class->get_inner_html ($node, $on_error);
##
## Arguments:
##   (invocant)  Ignored.
##   $node       The node whose children are serialized.  |$node| itself
##               is not serialized, but it and its ancestors decide
##               whether text is initially written literally.
##   $on_error   Optional code reference.  It is called with each node
##               whose node type is none of 1, 3, 4, 5, 8 and 10; such
##               nodes contribute nothing to the result.
##
## Returns a reference to the serialized string.
sub get_inner_html ($$$) {
  my (undef, $node, $on_error) = @_;

  ## Elements whose text content is written without escaping.
  my $cdata_element = {
    style => 1, script => 1, xmp => 1, iframe => 1,
    noembed => 1, noframes => 1, noscript => 1,
    plaintext => 1,
  };

  ## Elements serialized as a start tag only (no children, no end tag).
  my $void_element = {
    area => 1, base => 1, basefont => 1, bgsound => 1,
    br => 1, col => 1, embed => 1, frame => 1, hr => 1,
    img => 1, input => 1, link => 1, meta => 1, param => 1,
    spacer => 1, wbr => 1,
  };

  ## Escapes the characters that are replaced by character references
  ## in attribute values and in ordinary text.
  my $escape = sub {
    my $value = shift;
    $value =~ s/&/&amp;/g;
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;
    $value =~ s/"/&quot;/g;
    return $value;
  };

  ## Step 1
  my $s = '';

  ## Text is literal from the start if |$node| or one of its ancestors
  ## is an XHTML-namespace element listed in |$cdata_element|.
  my $in_cdata;
  ANCESTOR: for (my $anc = $node; defined $anc; $anc = $anc->parent_node) {
    next ANCESTOR unless $anc->node_type == 1;
    my $nsuri = $anc->namespace_uri;
    next ANCESTOR
        unless defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml';
    if ($cdata_element->{$anc->local_name}) { ## TODO: case thingy
      $in_cdata = 1;
      last ANCESTOR;
    }
  } # ANCESTOR

  ## Step 2
  ## |@node| is a work queue in document order.  Besides node objects it
  ## holds plain strings: end tags to emit, and the marker |cdata-out|
  ## which ends the literal-text state opened by an element below.
  my @node = @{$node->child_nodes};
  C: while (@node) {
    my $child = shift @node;
    unless (ref $child) {
      if ($child eq 'cdata-out') {
        $in_cdata = 0;
      } else {
        $s .= $child; # end tag
      }
      next C;
    }

    my $nt = $child->node_type;
    if ($nt == 1) { # Element
      my $tag_name = $child->tag_name; ## TODO: manakai_tag_name
      $s .= '<' . $tag_name;
      ## NOTE: Non-HTML case:
      ## <http://permalink.gmane.org/gmane.org.w3c.whatwg.discuss/11191>

      my @attrs = @{$child->attributes}; # sort order MUST be stable
      for my $attr (@attrs) { # order is implementation dependent
        my $attr_name = $attr->name; ## TODO: manakai_name
        $s .= ' ' . $attr_name . '="' . $escape->($attr->value) . '"';
      }
      $s .= '>';

      next C if $void_element->{$tag_name};

      $s .= "\x0A" if $tag_name eq 'pre' or $tag_name eq 'textarea';

      ## Queued first so that, after the |unshift| below, the marker
      ## follows this element's end tag.
      if (not $in_cdata and $cdata_element->{$tag_name}) {
        unshift @node, 'cdata-out';
        $in_cdata = 1;
      }

      unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';
    } elsif ($nt == 3 or $nt == 4) { # Text, CDATASection
      $s .= $in_cdata ? $child->data : $escape->($child->data);
    } elsif ($nt == 8) { # Comment
      $s .= '<!--' . $child->data . '-->';
    } elsif ($nt == 10) { # DocumentType
      $s .= '<!DOCTYPE ' . $child->name . '>';
    } elsif ($nt == 5) { # entrefs
      ## The children take the place of the reference itself, so they go
      ## to the front of the queue; appending them at the end would emit
      ## them after the following siblings and enclosing end tags.
      unshift @node, @{$child->child_nodes};
    } else {
      $on_error->($child) if defined $on_error;
    }
    ## ISSUE: This code does not support PIs.
  } # C

  ## Step 3
  return \$s;
} # get_inner_html
5520
5521 1;
5522 # $Date: 2007/07/16 01:52:27 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24