/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.43 - (show annotations) (download) (as text)
Sat Jul 21 07:21:44 2007 UTC (17 years, 3 months ago) by wakaba
Branch: MAIN
Changes since 1.42: +217 -219 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/ChangeLog	21 Jul 2007 07:21:25 -0000
	* HTML.pm.src: Codes for "in cell" insertion mode
	is merged to the "in body" insertion mode code.

2007-07-21  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.42 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 ## ISSUE:
6 ## var doc = implementation.createDocument (null, null, null);
7 ## doc.write ('');
8 ## alert (doc.compatMode);
9
10 ## ISSUE: HTML5 revision 967 says that the encoding layer MUST NOT
11 ## strip BOM and the HTML layer MUST ignore it. Whether we can do it
12 ## is not yet clear.
13 ## "{U+FEFF}..." in UTF-16BE/UTF-16LE is three or four characters?
14 ## "{U+FEFF}..." in GB18030?
15
## Start tag names for which a "/" immediately followed by ">" is
## accepted by the tokenizer without reporting a |nestc| parse error
## (e.g. |<br/>|).  Any other use of "/" inside a tag is reported.
my $permitted_slash_tag_name = {
  map { ($_ => 1) } qw(
    base link meta hr br img embed param area col input
  ),
};
29
## Replacement code point for each code point in the range 0x80..0x9F.
## The replacements follow the Windows-1252 assignments for those byte
## values; positions with no assignment map to U+FFFD.
## NOTE(review): The table is not consulted in this part of the file;
## presumably it is applied to numeric character references - confirm
## against the entity consumption code.
my $c1_entity_char = do {
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
  );
  my %char_for;
  @char_for{0x80 .. 0x9F} = @replacement;
  \%char_for;
}; # $c1_entity_char
64
## Element names belonging to the "special" category.
## NOTE(review): Not consulted in this part of the file; presumably
## used by the tree construction code - confirm there.
my $special_category = {
  map { ($_ => 1) } qw(
    address area base basefont bgsound
    blockquote body br center col colgroup
    dd dir div dl dt embed fieldset
    form frame frameset h1 h2 h3
    h4 h5 h6 head hr iframe image
    img input isindex li link listing
    menu meta noembed noframes noscript
    ol optgroup option p param plaintext
    pre script select spacer style tbody
    textarea tfoot thead title tr ul wbr
  ),
};
## Element names belonging to the "scoping" category.
## NOTE(review): Not consulted in this part of the file; presumably
## used by the tree construction code - confirm there.
my $scoping_category = {
  map { ($_ => 1) } qw(
    button caption html marquee object table td th
  ),
};
## Element names belonging to the "formatting" category.
## NOTE: The key |strike| used to be misspelled as |strile|, so that
## |<strike>| elements were never recognized as formatting elements.
## NOTE(review): Not consulted in this part of the file; presumably
## used by the tree construction code - confirm there.
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
85 # $phrasing_category: all other elements
86
## Parses a string as an HTML document.
##
## Arguments (after the invocant, on which |new| is called to create
## a fresh parser object):
##   $_[0]  The input string.  Only a reference to it is taken; the
##          string is not copied.
##   $_[1]  The document object; stored as |$self->{document}| and
##          returned at the end.
##   $_[2]  Optional error handler (code reference).  It receives the
##          error options as a key/value list with |line| and |column|
##          appended.  If false, a default handler that |warn|s is used.
##
## Returns |$self->{document}|.
##
## NOTE: Lines beginning with |!!!| are macros; their expansion is not
## part of this file (presumably done by a preprocessor that generates
## HTML.pm - TODO confirm).
87 sub parse_string ($$$;$) {
88 my $self = shift->new;
89 my $s = \$_[0];
90 $self->{document} = $_[1];
91
92 ## NOTE: |set_inner_html| copies most of this method's code
93
# Input position, shared by the two closures below: $i is the offset
# of the next unread character in $$s; $line and $column are reported
# to the error handler.
94 my $i = 0;
95 my $line = 1;
96 my $column = 0;
# Callback that advances the input by one character.  It expects the
# parser object as its first argument.
97 $self->{set_next_input_character} = sub {
98 my $self = shift;
99
# Shift the three-entry history: drop the oldest entry and put the
# character being left at the front, so that index 0 is always the
# most recently consumed character.
100 pop @{$self->{prev_input_character}};
101 unshift @{$self->{prev_input_character}}, $self->{next_input_character};
102
# -1 marks the end of the input.  The assignment yields -1 (true), so
# the |return| is always reached when the condition holds.
103 $self->{next_input_character} = -1 and return if $i >= length $$s;
104 $self->{next_input_character} = ord substr $$s, $i++, 1;
105 $column++;
106
107 if ($self->{next_input_character} == 0x000A) { # LF
108 $line++;
109 $column = 0;
110 } elsif ($self->{next_input_character} == 0x000D) { # CR
# CR and CR LF are both reported as a single LF; for CR LF the LF is
# skipped here.
111 $i++ if substr ($$s, $i, 1) eq "\x0A";
112 $self->{next_input_character} = 0x000A; # LF # MUST
113 $line++;
114 $column = 0;
115 } elsif ($self->{next_input_character} > 0x10FFFF) {
116 $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
117 } elsif ($self->{next_input_character} == 0x0000) { # NULL
118 !!!parse-error (type => 'NULL');
119 $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
120 }
121 };
# Initial state: no character consumed yet, history filled with the
# end-of-input marker.
122 $self->{prev_input_character} = [-1, -1, -1];
123 $self->{next_input_character} = -1;
124
125 my $onerror = $_[2] || sub {
126 my (%opt) = @_;
127 warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
128 };
# Error callback stored in the parser: forwards its arguments to the
# handler together with the current input position.
129 $self->{parse_error} = sub {
130 $onerror->(@_, line => $line, column => $column);
131 };
132
# Run the parser stages in order.
133 $self->_initialize_tokenizer;
134 $self->_initialize_tree_constructor;
135 $self->_construct_tree;
136 $self->_terminate_tree_constructor;
137
138 return $self->{document};
139 } # parse_string
140
## Creates a new parser object blessed into the invocant class.
##
## The object is initialized with two default callbacks:
##   |set_next_input_character|  Sets |next_input_character| to -1
##                               (the end-of-input marker), i.e. the
##                               default input is empty.
##   |parse_error|               Does nothing.
## |parse_string| replaces both of them.
##
## Returns the new object.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;

  ## The default input callback is stored in the object and refers to
  ## the object.  It must hold only a weak reference; a strong one
  ## would form a reference cycle and the parser would never be freed.
  require Scalar::Util;
  my $weak_self = $self;
  Scalar::Util::weaken ($weak_self);

  $self->{set_next_input_character} = sub {
    $weak_self->{next_input_character} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  return $self;
} # new
152
## Content model feature flags (bit mask).
use constant CM_ENTITY => 0b001;         # & markup in data
use constant CM_LIMITED_MARKUP => 0b010; # < markup in data (limited)
use constant CM_FULL_MARKUP => 0b100;    # < markup in data (any)

## Content models, expressed as combinations of the flags above.
use constant PLAINTEXT_CONTENT_MODEL => 0;
use constant CDATA_CONTENT_MODEL => CM_LIMITED_MARKUP;
use constant RCDATA_CONTENT_MODEL => CM_ENTITY | CM_LIMITED_MARKUP;
use constant PCDATA_CONTENT_MODEL => CM_ENTITY | CM_FULL_MARKUP;
161
162 ## Implementations MUST act as if they used the state machine in the spec
163
## Resets the tokenizer fields of the parser object and reads the
## first input character.
##
## NOTE: The |!!!next-input-character| line is a macro; its expansion
## is not part of this file (presumably it invokes
## |$self->{set_next_input_character}| - TODO confirm).
164 sub _initialize_tokenizer ($) {
165 my $self = shift;
# The tokenizer starts in the data state with the PCDATA content
# model (the two trailing comments read "MUST be").
166 $self->{state} = 'data'; # MUST
167 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
168 undef $self->{current_token}; # start tag, end tag, comment, or DOCTYPE
169 undef $self->{current_attribute};
170 undef $self->{last_emitted_start_tag_name};
171 undef $self->{last_attribute_value_state};
# NOTE(review): |char| is presumably the buffer of characters pushed
# back by |!!!back-next-input-character| - confirm.
172 $self->{char} = [];
173 # $self->{next_input_character}
174 !!!next-input-character;
# Queue of tokens that are ready to be returned; |_get_next_token|
# returns the first entry of this queue before tokenizing any further.
175 $self->{token} = [];
176 # $self->{escape}
177 } # _initialize_tokenizer
178
179 ## A token has:
180 ## ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
181 ## 'character', or 'end-of-file'
182 ## ->{name} (DOCTYPE); ->{tag_name} (start tag, end tag)
183 ## ->{public_identifier} (DOCTYPE)
184 ## ->{system_identifier} (DOCTYPE)
185 ## ->{correct} == 1 or 0 (DOCTYPE)
186 ## ->{attributes} isa HASH (start tag, end tag)
187 ## ->{data} (comment, character)
188
189 ## Emitted token MUST immediately be handled by the tree construction state.
190
191 ## Before each step, UA MAY check to see if either one of the scripts in
192 ## "list of scripts that will execute as soon as possible" or the first
193 ## script in the "list of scripts that will execute asynchronously",
194 ## has completed loading. If one has, then it MUST be executed
195 ## and removed from the list.
196
197 sub _get_next_token ($) {
198 my $self = shift;
199 if (@{$self->{token}}) {
200 return shift @{$self->{token}};
201 }
202
203 A: {
204 if ($self->{state} eq 'data') {
205 if ($self->{next_input_character} == 0x0026) { # &
206 if ($self->{content_model} & CM_ENTITY) { # PCDATA | RCDATA
207 $self->{state} = 'entity data';
208 !!!next-input-character;
209 redo A;
210 } else {
211 #
212 }
213 } elsif ($self->{next_input_character} == 0x002D) { # -
214 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
215 unless ($self->{escape}) {
216 if ($self->{prev_input_character}->[0] == 0x002D and # -
217 $self->{prev_input_character}->[1] == 0x0021 and # !
218 $self->{prev_input_character}->[2] == 0x003C) { # <
219 $self->{escape} = 1;
220 }
221 }
222 }
223
224 #
225 } elsif ($self->{next_input_character} == 0x003C) { # <
226 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
227 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
228 not $self->{escape})) {
229 $self->{state} = 'tag open';
230 !!!next-input-character;
231 redo A;
232 } else {
233 #
234 }
235 } elsif ($self->{next_input_character} == 0x003E) { # >
236 if ($self->{escape} and
237 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
238 if ($self->{prev_input_character}->[0] == 0x002D and # -
239 $self->{prev_input_character}->[1] == 0x002D) { # -
240 delete $self->{escape};
241 }
242 }
243
244 #
245 } elsif ($self->{next_input_character} == -1) {
246 !!!emit ({type => 'end-of-file'});
247 last A; ## TODO: ok?
248 }
249 # Anything else
250 my $token = {type => 'character',
251 data => chr $self->{next_input_character}};
252 ## Stay in the data state
253 !!!next-input-character;
254
255 !!!emit ($token);
256
257 redo A;
258 } elsif ($self->{state} eq 'entity data') {
259 ## (cannot happen in CDATA state)
260
261 my $token = $self->_tokenize_attempt_to_consume_an_entity (0);
262
263 $self->{state} = 'data';
264 # next-input-character is already done
265
266 unless (defined $token) {
267 !!!emit ({type => 'character', data => '&'});
268 } else {
269 !!!emit ($token);
270 }
271
272 redo A;
273 } elsif ($self->{state} eq 'tag open') {
274 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
275 if ($self->{next_input_character} == 0x002F) { # /
276 !!!next-input-character;
277 $self->{state} = 'close tag open';
278 redo A;
279 } else {
280 ## reconsume
281 $self->{state} = 'data';
282
283 !!!emit ({type => 'character', data => '<'});
284
285 redo A;
286 }
287 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
288 if ($self->{next_input_character} == 0x0021) { # !
289 $self->{state} = 'markup declaration open';
290 !!!next-input-character;
291 redo A;
292 } elsif ($self->{next_input_character} == 0x002F) { # /
293 $self->{state} = 'close tag open';
294 !!!next-input-character;
295 redo A;
296 } elsif (0x0041 <= $self->{next_input_character} and
297 $self->{next_input_character} <= 0x005A) { # A..Z
298 $self->{current_token}
299 = {type => 'start tag',
300 tag_name => chr ($self->{next_input_character} + 0x0020)};
301 $self->{state} = 'tag name';
302 !!!next-input-character;
303 redo A;
304 } elsif (0x0061 <= $self->{next_input_character} and
305 $self->{next_input_character} <= 0x007A) { # a..z
306 $self->{current_token} = {type => 'start tag',
307 tag_name => chr ($self->{next_input_character})};
308 $self->{state} = 'tag name';
309 !!!next-input-character;
310 redo A;
311 } elsif ($self->{next_input_character} == 0x003E) { # >
312 !!!parse-error (type => 'empty start tag');
313 $self->{state} = 'data';
314 !!!next-input-character;
315
316 !!!emit ({type => 'character', data => '<>'});
317
318 redo A;
319 } elsif ($self->{next_input_character} == 0x003F) { # ?
320 !!!parse-error (type => 'pio');
321 $self->{state} = 'bogus comment';
322 ## $self->{next_input_character} is intentionally left as is
323 redo A;
324 } else {
325 !!!parse-error (type => 'bare stago');
326 $self->{state} = 'data';
327 ## reconsume
328
329 !!!emit ({type => 'character', data => '<'});
330
331 redo A;
332 }
333 } else {
334 die "$0: $self->{content_model} in tag open";
335 }
336 } elsif ($self->{state} eq 'close tag open') {
337 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
338 if (defined $self->{last_emitted_start_tag_name}) {
339 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
340 my @next_char;
341 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
342 push @next_char, $self->{next_input_character};
343 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
344 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
345 if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
346 !!!next-input-character;
347 next TAGNAME;
348 } else {
349 $self->{next_input_character} = shift @next_char; # reconsume
350 !!!back-next-input-character (@next_char);
351 $self->{state} = 'data';
352
353 !!!emit ({type => 'character', data => '</'});
354
355 redo A;
356 }
357 }
358 push @next_char, $self->{next_input_character};
359
360 unless ($self->{next_input_character} == 0x0009 or # HT
361 $self->{next_input_character} == 0x000A or # LF
362 $self->{next_input_character} == 0x000B or # VT
363 $self->{next_input_character} == 0x000C or # FF
364 $self->{next_input_character} == 0x0020 or # SP
365 $self->{next_input_character} == 0x003E or # >
366 $self->{next_input_character} == 0x002F or # /
367 $self->{next_input_character} == -1) {
368 $self->{next_input_character} = shift @next_char; # reconsume
369 !!!back-next-input-character (@next_char);
370 $self->{state} = 'data';
371 !!!emit ({type => 'character', data => '</'});
372 redo A;
373 } else {
374 $self->{next_input_character} = shift @next_char;
375 !!!back-next-input-character (@next_char);
376 # and consume...
377 }
378 } else {
379 ## No start tag token has ever been emitted
380 # next-input-character is already done
381 $self->{state} = 'data';
382 !!!emit ({type => 'character', data => '</'});
383 redo A;
384 }
385 }
386
387 if (0x0041 <= $self->{next_input_character} and
388 $self->{next_input_character} <= 0x005A) { # A..Z
389 $self->{current_token} = {type => 'end tag',
390 tag_name => chr ($self->{next_input_character} + 0x0020)};
391 $self->{state} = 'tag name';
392 !!!next-input-character;
393 redo A;
394 } elsif (0x0061 <= $self->{next_input_character} and
395 $self->{next_input_character} <= 0x007A) { # a..z
396 $self->{current_token} = {type => 'end tag',
397 tag_name => chr ($self->{next_input_character})};
398 $self->{state} = 'tag name';
399 !!!next-input-character;
400 redo A;
401 } elsif ($self->{next_input_character} == 0x003E) { # >
402 !!!parse-error (type => 'empty end tag');
403 $self->{state} = 'data';
404 !!!next-input-character;
405 redo A;
406 } elsif ($self->{next_input_character} == -1) {
407 !!!parse-error (type => 'bare etago');
408 $self->{state} = 'data';
409 # reconsume
410
411 !!!emit ({type => 'character', data => '</'});
412
413 redo A;
414 } else {
415 !!!parse-error (type => 'bogus end tag');
416 $self->{state} = 'bogus comment';
417 ## $self->{next_input_character} is intentionally left as is
418 redo A;
419 }
420 } elsif ($self->{state} eq 'tag name') {
421 if ($self->{next_input_character} == 0x0009 or # HT
422 $self->{next_input_character} == 0x000A or # LF
423 $self->{next_input_character} == 0x000B or # VT
424 $self->{next_input_character} == 0x000C or # FF
425 $self->{next_input_character} == 0x0020) { # SP
426 $self->{state} = 'before attribute name';
427 !!!next-input-character;
428 redo A;
429 } elsif ($self->{next_input_character} == 0x003E) { # >
430 if ($self->{current_token}->{type} eq 'start tag') {
431 $self->{current_token}->{first_start_tag}
432 = not defined $self->{last_emitted_start_tag_name};
433 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
434 } elsif ($self->{current_token}->{type} eq 'end tag') {
435 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
436 if ($self->{current_token}->{attributes}) {
437 !!!parse-error (type => 'end tag attribute');
438 }
439 } else {
440 die "$0: $self->{current_token}->{type}: Unknown token type";
441 }
442 $self->{state} = 'data';
443 !!!next-input-character;
444
445 !!!emit ($self->{current_token}); # start tag or end tag
446
447 redo A;
448 } elsif (0x0041 <= $self->{next_input_character} and
449 $self->{next_input_character} <= 0x005A) { # A..Z
450 $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
451 # start tag or end tag
452 ## Stay in this state
453 !!!next-input-character;
454 redo A;
455 } elsif ($self->{next_input_character} == -1) {
456 !!!parse-error (type => 'unclosed tag');
457 if ($self->{current_token}->{type} eq 'start tag') {
458 $self->{current_token}->{first_start_tag}
459 = not defined $self->{last_emitted_start_tag_name};
460 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
461 } elsif ($self->{current_token}->{type} eq 'end tag') {
462 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
463 if ($self->{current_token}->{attributes}) {
464 !!!parse-error (type => 'end tag attribute');
465 }
466 } else {
467 die "$0: $self->{current_token}->{type}: Unknown token type";
468 }
469 $self->{state} = 'data';
470 # reconsume
471
472 !!!emit ($self->{current_token}); # start tag or end tag
473
474 redo A;
475 } elsif ($self->{next_input_character} == 0x002F) { # /
476 !!!next-input-character;
477 if ($self->{next_input_character} == 0x003E and # >
478 $self->{current_token}->{type} eq 'start tag' and
479 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
480 # permitted slash
481 #
482 } else {
483 !!!parse-error (type => 'nestc');
484 }
485 $self->{state} = 'before attribute name';
486 # next-input-character is already done
487 redo A;
488 } else {
489 $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
490 # start tag or end tag
491 ## Stay in the state
492 !!!next-input-character;
493 redo A;
494 }
495 } elsif ($self->{state} eq 'before attribute name') {
496 if ($self->{next_input_character} == 0x0009 or # HT
497 $self->{next_input_character} == 0x000A or # LF
498 $self->{next_input_character} == 0x000B or # VT
499 $self->{next_input_character} == 0x000C or # FF
500 $self->{next_input_character} == 0x0020) { # SP
501 ## Stay in the state
502 !!!next-input-character;
503 redo A;
504 } elsif ($self->{next_input_character} == 0x003E) { # >
505 if ($self->{current_token}->{type} eq 'start tag') {
506 $self->{current_token}->{first_start_tag}
507 = not defined $self->{last_emitted_start_tag_name};
508 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
509 } elsif ($self->{current_token}->{type} eq 'end tag') {
510 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
511 if ($self->{current_token}->{attributes}) {
512 !!!parse-error (type => 'end tag attribute');
513 }
514 } else {
515 die "$0: $self->{current_token}->{type}: Unknown token type";
516 }
517 $self->{state} = 'data';
518 !!!next-input-character;
519
520 !!!emit ($self->{current_token}); # start tag or end tag
521
522 redo A;
523 } elsif (0x0041 <= $self->{next_input_character} and
524 $self->{next_input_character} <= 0x005A) { # A..Z
525 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
526 value => ''};
527 $self->{state} = 'attribute name';
528 !!!next-input-character;
529 redo A;
530 } elsif ($self->{next_input_character} == 0x002F) { # /
531 !!!next-input-character;
532 if ($self->{next_input_character} == 0x003E and # >
533 $self->{current_token}->{type} eq 'start tag' and
534 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
535 # permitted slash
536 #
537 } else {
538 !!!parse-error (type => 'nestc');
539 }
540 ## Stay in the state
541 # next-input-character is already done
542 redo A;
543 } elsif ($self->{next_input_character} == -1) {
544 !!!parse-error (type => 'unclosed tag');
545 if ($self->{current_token}->{type} eq 'start tag') {
546 $self->{current_token}->{first_start_tag}
547 = not defined $self->{last_emitted_start_tag_name};
548 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
549 } elsif ($self->{current_token}->{type} eq 'end tag') {
550 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
551 if ($self->{current_token}->{attributes}) {
552 !!!parse-error (type => 'end tag attribute');
553 }
554 } else {
555 die "$0: $self->{current_token}->{type}: Unknown token type";
556 }
557 $self->{state} = 'data';
558 # reconsume
559
560 !!!emit ($self->{current_token}); # start tag or end tag
561
562 redo A;
563 } else {
564 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
565 value => ''};
566 $self->{state} = 'attribute name';
567 !!!next-input-character;
568 redo A;
569 }
570 } elsif ($self->{state} eq 'attribute name') {
571 my $before_leave = sub {
572 if (exists $self->{current_token}->{attributes} # start tag or end tag
573 ->{$self->{current_attribute}->{name}}) { # MUST
574 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name});
575 ## Discard $self->{current_attribute} # MUST
576 } else {
577 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
578 = $self->{current_attribute};
579 }
580 }; # $before_leave
581
582 if ($self->{next_input_character} == 0x0009 or # HT
583 $self->{next_input_character} == 0x000A or # LF
584 $self->{next_input_character} == 0x000B or # VT
585 $self->{next_input_character} == 0x000C or # FF
586 $self->{next_input_character} == 0x0020) { # SP
587 $before_leave->();
588 $self->{state} = 'after attribute name';
589 !!!next-input-character;
590 redo A;
591 } elsif ($self->{next_input_character} == 0x003D) { # =
592 $before_leave->();
593 $self->{state} = 'before attribute value';
594 !!!next-input-character;
595 redo A;
596 } elsif ($self->{next_input_character} == 0x003E) { # >
597 $before_leave->();
598 if ($self->{current_token}->{type} eq 'start tag') {
599 $self->{current_token}->{first_start_tag}
600 = not defined $self->{last_emitted_start_tag_name};
601 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
602 } elsif ($self->{current_token}->{type} eq 'end tag') {
603 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
604 if ($self->{current_token}->{attributes}) {
605 !!!parse-error (type => 'end tag attribute');
606 }
607 } else {
608 die "$0: $self->{current_token}->{type}: Unknown token type";
609 }
610 $self->{state} = 'data';
611 !!!next-input-character;
612
613 !!!emit ($self->{current_token}); # start tag or end tag
614
615 redo A;
616 } elsif (0x0041 <= $self->{next_input_character} and
617 $self->{next_input_character} <= 0x005A) { # A..Z
618 $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
619 ## Stay in the state
620 !!!next-input-character;
621 redo A;
622 } elsif ($self->{next_input_character} == 0x002F) { # /
623 $before_leave->();
624 !!!next-input-character;
625 if ($self->{next_input_character} == 0x003E and # >
626 $self->{current_token}->{type} eq 'start tag' and
627 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
628 # permitted slash
629 #
630 } else {
631 !!!parse-error (type => 'nestc');
632 }
633 $self->{state} = 'before attribute name';
634 # next-input-character is already done
635 redo A;
636 } elsif ($self->{next_input_character} == -1) {
637 !!!parse-error (type => 'unclosed tag');
638 $before_leave->();
639 if ($self->{current_token}->{type} eq 'start tag') {
640 $self->{current_token}->{first_start_tag}
641 = not defined $self->{last_emitted_start_tag_name};
642 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
643 } elsif ($self->{current_token}->{type} eq 'end tag') {
644 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
645 if ($self->{current_token}->{attributes}) {
646 !!!parse-error (type => 'end tag attribute');
647 }
648 } else {
649 die "$0: $self->{current_token}->{type}: Unknown token type";
650 }
651 $self->{state} = 'data';
652 # reconsume
653
654 !!!emit ($self->{current_token}); # start tag or end tag
655
656 redo A;
657 } else {
658 $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
659 ## Stay in the state
660 !!!next-input-character;
661 redo A;
662 }
663 } elsif ($self->{state} eq 'after attribute name') {
664 if ($self->{next_input_character} == 0x0009 or # HT
665 $self->{next_input_character} == 0x000A or # LF
666 $self->{next_input_character} == 0x000B or # VT
667 $self->{next_input_character} == 0x000C or # FF
668 $self->{next_input_character} == 0x0020) { # SP
669 ## Stay in the state
670 !!!next-input-character;
671 redo A;
672 } elsif ($self->{next_input_character} == 0x003D) { # =
673 $self->{state} = 'before attribute value';
674 !!!next-input-character;
675 redo A;
676 } elsif ($self->{next_input_character} == 0x003E) { # >
677 if ($self->{current_token}->{type} eq 'start tag') {
678 $self->{current_token}->{first_start_tag}
679 = not defined $self->{last_emitted_start_tag_name};
680 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
681 } elsif ($self->{current_token}->{type} eq 'end tag') {
682 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
683 if ($self->{current_token}->{attributes}) {
684 !!!parse-error (type => 'end tag attribute');
685 }
686 } else {
687 die "$0: $self->{current_token}->{type}: Unknown token type";
688 }
689 $self->{state} = 'data';
690 !!!next-input-character;
691
692 !!!emit ($self->{current_token}); # start tag or end tag
693
694 redo A;
695 } elsif (0x0041 <= $self->{next_input_character} and
696 $self->{next_input_character} <= 0x005A) { # A..Z
697 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
698 value => ''};
699 $self->{state} = 'attribute name';
700 !!!next-input-character;
701 redo A;
702 } elsif ($self->{next_input_character} == 0x002F) { # /
703 !!!next-input-character;
704 if ($self->{next_input_character} == 0x003E and # >
705 $self->{current_token}->{type} eq 'start tag' and
706 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
707 # permitted slash
708 #
709 } else {
710 !!!parse-error (type => 'nestc');
711 ## TODO: Different error type for <aa / bb> than <aa/>
712 }
713 $self->{state} = 'before attribute name';
714 # next-input-character is already done
715 redo A;
716 } elsif ($self->{next_input_character} == -1) {
717 !!!parse-error (type => 'unclosed tag');
718 if ($self->{current_token}->{type} eq 'start tag') {
719 $self->{current_token}->{first_start_tag}
720 = not defined $self->{last_emitted_start_tag_name};
721 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
722 } elsif ($self->{current_token}->{type} eq 'end tag') {
723 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
724 if ($self->{current_token}->{attributes}) {
725 !!!parse-error (type => 'end tag attribute');
726 }
727 } else {
728 die "$0: $self->{current_token}->{type}: Unknown token type";
729 }
730 $self->{state} = 'data';
731 # reconsume
732
733 !!!emit ($self->{current_token}); # start tag or end tag
734
735 redo A;
736 } else {
737 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
738 value => ''};
739 $self->{state} = 'attribute name';
740 !!!next-input-character;
741 redo A;
742 }
743 } elsif ($self->{state} eq 'before attribute value') {
744 if ($self->{next_input_character} == 0x0009 or # HT
745 $self->{next_input_character} == 0x000A or # LF
746 $self->{next_input_character} == 0x000B or # VT
747 $self->{next_input_character} == 0x000C or # FF
748 $self->{next_input_character} == 0x0020) { # SP
749 ## Stay in the state
750 !!!next-input-character;
751 redo A;
752 } elsif ($self->{next_input_character} == 0x0022) { # "
753 $self->{state} = 'attribute value (double-quoted)';
754 !!!next-input-character;
755 redo A;
756 } elsif ($self->{next_input_character} == 0x0026) { # &
757 $self->{state} = 'attribute value (unquoted)';
758 ## reconsume
759 redo A;
760 } elsif ($self->{next_input_character} == 0x0027) { # '
761 $self->{state} = 'attribute value (single-quoted)';
762 !!!next-input-character;
763 redo A;
764 } elsif ($self->{next_input_character} == 0x003E) { # >
765 if ($self->{current_token}->{type} eq 'start tag') {
766 $self->{current_token}->{first_start_tag}
767 = not defined $self->{last_emitted_start_tag_name};
768 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
769 } elsif ($self->{current_token}->{type} eq 'end tag') {
770 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
771 if ($self->{current_token}->{attributes}) {
772 !!!parse-error (type => 'end tag attribute');
773 }
774 } else {
775 die "$0: $self->{current_token}->{type}: Unknown token type";
776 }
777 $self->{state} = 'data';
778 !!!next-input-character;
779
780 !!!emit ($self->{current_token}); # start tag or end tag
781
782 redo A;
783 } elsif ($self->{next_input_character} == -1) {
784 !!!parse-error (type => 'unclosed tag');
785 if ($self->{current_token}->{type} eq 'start tag') {
786 $self->{current_token}->{first_start_tag}
787 = not defined $self->{last_emitted_start_tag_name};
788 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
789 } elsif ($self->{current_token}->{type} eq 'end tag') {
790 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
791 if ($self->{current_token}->{attributes}) {
792 !!!parse-error (type => 'end tag attribute');
793 }
794 } else {
795 die "$0: $self->{current_token}->{type}: Unknown token type";
796 }
797 $self->{state} = 'data';
798 ## reconsume
799
800 !!!emit ($self->{current_token}); # start tag or end tag
801
802 redo A;
803 } else {
804 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
805 $self->{state} = 'attribute value (unquoted)';
806 !!!next-input-character;
807 redo A;
808 }
809 } elsif ($self->{state} eq 'attribute value (double-quoted)') {
810 if ($self->{next_input_character} == 0x0022) { # "
811 $self->{state} = 'before attribute name';
812 !!!next-input-character;
813 redo A;
814 } elsif ($self->{next_input_character} == 0x0026) { # &
815 $self->{last_attribute_value_state} = 'attribute value (double-quoted)';
816 $self->{state} = 'entity in attribute value';
817 !!!next-input-character;
818 redo A;
819 } elsif ($self->{next_input_character} == -1) {
820 !!!parse-error (type => 'unclosed attribute value');
821 if ($self->{current_token}->{type} eq 'start tag') {
822 $self->{current_token}->{first_start_tag}
823 = not defined $self->{last_emitted_start_tag_name};
824 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
825 } elsif ($self->{current_token}->{type} eq 'end tag') {
826 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
827 if ($self->{current_token}->{attributes}) {
828 !!!parse-error (type => 'end tag attribute');
829 }
830 } else {
831 die "$0: $self->{current_token}->{type}: Unknown token type";
832 }
833 $self->{state} = 'data';
834 ## reconsume
835
836 !!!emit ($self->{current_token}); # start tag or end tag
837
838 redo A;
839 } else {
840 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
841 ## Stay in the state
842 !!!next-input-character;
843 redo A;
844 }
845 } elsif ($self->{state} eq 'attribute value (single-quoted)') {
846 if ($self->{next_input_character} == 0x0027) { # '
847 $self->{state} = 'before attribute name';
848 !!!next-input-character;
849 redo A;
850 } elsif ($self->{next_input_character} == 0x0026) { # &
851 $self->{last_attribute_value_state} = 'attribute value (single-quoted)';
852 $self->{state} = 'entity in attribute value';
853 !!!next-input-character;
854 redo A;
855 } elsif ($self->{next_input_character} == -1) {
856 !!!parse-error (type => 'unclosed attribute value');
857 if ($self->{current_token}->{type} eq 'start tag') {
858 $self->{current_token}->{first_start_tag}
859 = not defined $self->{last_emitted_start_tag_name};
860 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
861 } elsif ($self->{current_token}->{type} eq 'end tag') {
862 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
863 if ($self->{current_token}->{attributes}) {
864 !!!parse-error (type => 'end tag attribute');
865 }
866 } else {
867 die "$0: $self->{current_token}->{type}: Unknown token type";
868 }
869 $self->{state} = 'data';
870 ## reconsume
871
872 !!!emit ($self->{current_token}); # start tag or end tag
873
874 redo A;
875 } else {
876 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
877 ## Stay in the state
878 !!!next-input-character;
879 redo A;
880 }
881 } elsif ($self->{state} eq 'attribute value (unquoted)') {
882 if ($self->{next_input_character} == 0x0009 or # HT
883 $self->{next_input_character} == 0x000A or # LF
884 $self->{next_input_character} == 0x000B or # VT
885 $self->{next_input_character} == 0x000C or # FF
886 $self->{next_input_character} == 0x0020) { # SP
887 $self->{state} = 'before attribute name';
888 !!!next-input-character;
889 redo A;
890 } elsif ($self->{next_input_character} == 0x0026) { # &
891 $self->{last_attribute_value_state} = 'attribute value (unquoted)';
892 $self->{state} = 'entity in attribute value';
893 !!!next-input-character;
894 redo A;
895 } elsif ($self->{next_input_character} == 0x003E) { # >
896 if ($self->{current_token}->{type} eq 'start tag') {
897 $self->{current_token}->{first_start_tag}
898 = not defined $self->{last_emitted_start_tag_name};
899 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
900 } elsif ($self->{current_token}->{type} eq 'end tag') {
901 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
902 if ($self->{current_token}->{attributes}) {
903 !!!parse-error (type => 'end tag attribute');
904 }
905 } else {
906 die "$0: $self->{current_token}->{type}: Unknown token type";
907 }
908 $self->{state} = 'data';
909 !!!next-input-character;
910
911 !!!emit ($self->{current_token}); # start tag or end tag
912
913 redo A;
914 } elsif ($self->{next_input_character} == -1) {
915 !!!parse-error (type => 'unclosed tag');
916 if ($self->{current_token}->{type} eq 'start tag') {
917 $self->{current_token}->{first_start_tag}
918 = not defined $self->{last_emitted_start_tag_name};
919 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
920 } elsif ($self->{current_token}->{type} eq 'end tag') {
921 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
922 if ($self->{current_token}->{attributes}) {
923 !!!parse-error (type => 'end tag attribute');
924 }
925 } else {
926 die "$0: $self->{current_token}->{type}: Unknown token type";
927 }
928 $self->{state} = 'data';
929 ## reconsume
930
931 !!!emit ($self->{current_token}); # start tag or end tag
932
933 redo A;
934 } else {
935 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
936 ## Stay in the state
937 !!!next-input-character;
938 redo A;
939 }
940 } elsif ($self->{state} eq 'entity in attribute value') {
941 my $token = $self->_tokenize_attempt_to_consume_an_entity (1);
942
943 unless (defined $token) {
944 $self->{current_attribute}->{value} .= '&';
945 } else {
946 $self->{current_attribute}->{value} .= $token->{data};
947 ## ISSUE: spec says "append the returned character token to the current attribute's value"
948 }
949
950 $self->{state} = $self->{last_attribute_value_state};
951 # next-input-character is already done
952 redo A;
953 } elsif ($self->{state} eq 'bogus comment') {
954 ## (only happen if PCDATA state)
955
956 my $token = {type => 'comment', data => ''};
957
958 BC: {
959 if ($self->{next_input_character} == 0x003E) { # >
960 $self->{state} = 'data';
961 !!!next-input-character;
962
963 !!!emit ($token);
964
965 redo A;
966 } elsif ($self->{next_input_character} == -1) {
967 $self->{state} = 'data';
968 ## reconsume
969
970 !!!emit ($token);
971
972 redo A;
973 } else {
974 $token->{data} .= chr ($self->{next_input_character});
975 !!!next-input-character;
976 redo BC;
977 }
978 } # BC
979 } elsif ($self->{state} eq 'markup declaration open') {
980 ## (only happen if PCDATA state)
981
982 my @next_char;
983 push @next_char, $self->{next_input_character};
984
985 if ($self->{next_input_character} == 0x002D) { # -
986 !!!next-input-character;
987 push @next_char, $self->{next_input_character};
988 if ($self->{next_input_character} == 0x002D) { # -
989 $self->{current_token} = {type => 'comment', data => ''};
990 $self->{state} = 'comment start';
991 !!!next-input-character;
992 redo A;
993 }
994 } elsif ($self->{next_input_character} == 0x0044 or # D
995 $self->{next_input_character} == 0x0064) { # d
996 !!!next-input-character;
997 push @next_char, $self->{next_input_character};
998 if ($self->{next_input_character} == 0x004F or # O
999 $self->{next_input_character} == 0x006F) { # o
1000 !!!next-input-character;
1001 push @next_char, $self->{next_input_character};
1002 if ($self->{next_input_character} == 0x0043 or # C
1003 $self->{next_input_character} == 0x0063) { # c
1004 !!!next-input-character;
1005 push @next_char, $self->{next_input_character};
1006 if ($self->{next_input_character} == 0x0054 or # T
1007 $self->{next_input_character} == 0x0074) { # t
1008 !!!next-input-character;
1009 push @next_char, $self->{next_input_character};
1010 if ($self->{next_input_character} == 0x0059 or # Y
1011 $self->{next_input_character} == 0x0079) { # y
1012 !!!next-input-character;
1013 push @next_char, $self->{next_input_character};
1014 if ($self->{next_input_character} == 0x0050 or # P
1015 $self->{next_input_character} == 0x0070) { # p
1016 !!!next-input-character;
1017 push @next_char, $self->{next_input_character};
1018 if ($self->{next_input_character} == 0x0045 or # E
1019 $self->{next_input_character} == 0x0065) { # e
1020 ## ISSUE: What a stupid code this is!
1021 $self->{state} = 'DOCTYPE';
1022 !!!next-input-character;
1023 redo A;
1024 }
1025 }
1026 }
1027 }
1028 }
1029 }
1030 }
1031
1032 !!!parse-error (type => 'bogus comment');
1033 $self->{next_input_character} = shift @next_char;
1034 !!!back-next-input-character (@next_char);
1035 $self->{state} = 'bogus comment';
1036 redo A;
1037
1038 ## ISSUE: typos in spec: chacacters, is is a parse error
1039 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1040 } elsif ($self->{state} eq 'comment start') {
1041 if ($self->{next_input_character} == 0x002D) { # -
1042 $self->{state} = 'comment start dash';
1043 !!!next-input-character;
1044 redo A;
1045 } elsif ($self->{next_input_character} == 0x003E) { # >
1046 !!!parse-error (type => 'bogus comment');
1047 $self->{state} = 'data';
1048 !!!next-input-character;
1049
1050 !!!emit ($self->{current_token}); # comment
1051
1052 redo A;
1053 } elsif ($self->{next_input_character} == -1) {
1054 !!!parse-error (type => 'unclosed comment');
1055 $self->{state} = 'data';
1056 ## reconsume
1057
1058 !!!emit ($self->{current_token}); # comment
1059
1060 redo A;
1061 } else {
1062 $self->{current_token}->{data} # comment
1063 .= chr ($self->{next_input_character});
1064 $self->{state} = 'comment';
1065 !!!next-input-character;
1066 redo A;
1067 }
1068 } elsif ($self->{state} eq 'comment start dash') {
1069 if ($self->{next_input_character} == 0x002D) { # -
1070 $self->{state} = 'comment end';
1071 !!!next-input-character;
1072 redo A;
1073 } elsif ($self->{next_input_character} == 0x003E) { # >
1074 !!!parse-error (type => 'bogus comment');
1075 $self->{state} = 'data';
1076 !!!next-input-character;
1077
1078 !!!emit ($self->{current_token}); # comment
1079
1080 redo A;
1081 } elsif ($self->{next_input_character} == -1) {
1082 !!!parse-error (type => 'unclosed comment');
1083 $self->{state} = 'data';
1084 ## reconsume
1085
1086 !!!emit ($self->{current_token}); # comment
1087
1088 redo A;
1089 } else {
1090 $self->{current_token}->{data} # comment
1091 .= '-' . chr ($self->{next_input_character});
1092 $self->{state} = 'comment';
1093 !!!next-input-character;
1094 redo A;
1095 }
1096 } elsif ($self->{state} eq 'comment') {
1097 if ($self->{next_input_character} == 0x002D) { # -
1098 $self->{state} = 'comment end dash';
1099 !!!next-input-character;
1100 redo A;
1101 } elsif ($self->{next_input_character} == -1) {
1102 !!!parse-error (type => 'unclosed comment');
1103 $self->{state} = 'data';
1104 ## reconsume
1105
1106 !!!emit ($self->{current_token}); # comment
1107
1108 redo A;
1109 } else {
1110 $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1111 ## Stay in the state
1112 !!!next-input-character;
1113 redo A;
1114 }
1115 } elsif ($self->{state} eq 'comment end dash') {
1116 if ($self->{next_input_character} == 0x002D) { # -
1117 $self->{state} = 'comment end';
1118 !!!next-input-character;
1119 redo A;
1120 } elsif ($self->{next_input_character} == -1) {
1121 !!!parse-error (type => 'unclosed comment');
1122 $self->{state} = 'data';
1123 ## reconsume
1124
1125 !!!emit ($self->{current_token}); # comment
1126
1127 redo A;
1128 } else {
1129 $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1130 $self->{state} = 'comment';
1131 !!!next-input-character;
1132 redo A;
1133 }
1134 } elsif ($self->{state} eq 'comment end') {
1135 if ($self->{next_input_character} == 0x003E) { # >
1136 $self->{state} = 'data';
1137 !!!next-input-character;
1138
1139 !!!emit ($self->{current_token}); # comment
1140
1141 redo A;
1142 } elsif ($self->{next_input_character} == 0x002D) { # -
1143 !!!parse-error (type => 'dash in comment');
1144 $self->{current_token}->{data} .= '-'; # comment
1145 ## Stay in the state
1146 !!!next-input-character;
1147 redo A;
1148 } elsif ($self->{next_input_character} == -1) {
1149 !!!parse-error (type => 'unclosed comment');
1150 $self->{state} = 'data';
1151 ## reconsume
1152
1153 !!!emit ($self->{current_token}); # comment
1154
1155 redo A;
1156 } else {
1157 !!!parse-error (type => 'dash in comment');
1158 $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1159 $self->{state} = 'comment';
1160 !!!next-input-character;
1161 redo A;
1162 }
1163 } elsif ($self->{state} eq 'DOCTYPE') {
1164 if ($self->{next_input_character} == 0x0009 or # HT
1165 $self->{next_input_character} == 0x000A or # LF
1166 $self->{next_input_character} == 0x000B or # VT
1167 $self->{next_input_character} == 0x000C or # FF
1168 $self->{next_input_character} == 0x0020) { # SP
1169 $self->{state} = 'before DOCTYPE name';
1170 !!!next-input-character;
1171 redo A;
1172 } else {
1173 !!!parse-error (type => 'no space before DOCTYPE name');
1174 $self->{state} = 'before DOCTYPE name';
1175 ## reconsume
1176 redo A;
1177 }
1178 } elsif ($self->{state} eq 'before DOCTYPE name') {
1179 if ($self->{next_input_character} == 0x0009 or # HT
1180 $self->{next_input_character} == 0x000A or # LF
1181 $self->{next_input_character} == 0x000B or # VT
1182 $self->{next_input_character} == 0x000C or # FF
1183 $self->{next_input_character} == 0x0020) { # SP
1184 ## Stay in the state
1185 !!!next-input-character;
1186 redo A;
1187 } elsif ($self->{next_input_character} == 0x003E) { # >
1188 !!!parse-error (type => 'no DOCTYPE name');
1189 $self->{state} = 'data';
1190 !!!next-input-character;
1191
1192 !!!emit ({type => 'DOCTYPE'}); # incorrect
1193
1194 redo A;
1195 } elsif ($self->{next_input_character} == -1) {
1196 !!!parse-error (type => 'no DOCTYPE name');
1197 $self->{state} = 'data';
1198 ## reconsume
1199
1200 !!!emit ({type => 'DOCTYPE'}); # incorrect
1201
1202 redo A;
1203 } else {
1204 $self->{current_token}
1205 = {type => 'DOCTYPE',
1206 name => chr ($self->{next_input_character}),
1207 correct => 1};
1208 ## ISSUE: "Set the token's name name to the" in the spec
1209 $self->{state} = 'DOCTYPE name';
1210 !!!next-input-character;
1211 redo A;
1212 }
1213 } elsif ($self->{state} eq 'DOCTYPE name') {
1214 ## ISSUE: Redundant "First," in the spec.
1215 if ($self->{next_input_character} == 0x0009 or # HT
1216 $self->{next_input_character} == 0x000A or # LF
1217 $self->{next_input_character} == 0x000B or # VT
1218 $self->{next_input_character} == 0x000C or # FF
1219 $self->{next_input_character} == 0x0020) { # SP
1220 $self->{state} = 'after DOCTYPE name';
1221 !!!next-input-character;
1222 redo A;
1223 } elsif ($self->{next_input_character} == 0x003E) { # >
1224 $self->{state} = 'data';
1225 !!!next-input-character;
1226
1227 !!!emit ($self->{current_token}); # DOCTYPE
1228
1229 redo A;
1230 } elsif ($self->{next_input_character} == -1) {
1231 !!!parse-error (type => 'unclosed DOCTYPE');
1232 $self->{state} = 'data';
1233 ## reconsume
1234
1235 delete $self->{current_token}->{correct};
1236 !!!emit ($self->{current_token}); # DOCTYPE
1237
1238 redo A;
1239 } else {
1240 $self->{current_token}->{name}
1241 .= chr ($self->{next_input_character}); # DOCTYPE
1242 ## Stay in the state
1243 !!!next-input-character;
1244 redo A;
1245 }
1246 } elsif ($self->{state} eq 'after DOCTYPE name') {
1247 if ($self->{next_input_character} == 0x0009 or # HT
1248 $self->{next_input_character} == 0x000A or # LF
1249 $self->{next_input_character} == 0x000B or # VT
1250 $self->{next_input_character} == 0x000C or # FF
1251 $self->{next_input_character} == 0x0020) { # SP
1252 ## Stay in the state
1253 !!!next-input-character;
1254 redo A;
1255 } elsif ($self->{next_input_character} == 0x003E) { # >
1256 $self->{state} = 'data';
1257 !!!next-input-character;
1258
1259 !!!emit ($self->{current_token}); # DOCTYPE
1260
1261 redo A;
1262 } elsif ($self->{next_input_character} == -1) {
1263 !!!parse-error (type => 'unclosed DOCTYPE');
1264 $self->{state} = 'data';
1265 ## reconsume
1266
1267 delete $self->{current_token}->{correct};
1268 !!!emit ($self->{current_token}); # DOCTYPE
1269
1270 redo A;
1271 } elsif ($self->{next_input_character} == 0x0050 or # P
1272 $self->{next_input_character} == 0x0070) { # p
1273 !!!next-input-character;
1274 if ($self->{next_input_character} == 0x0055 or # U
1275 $self->{next_input_character} == 0x0075) { # u
1276 !!!next-input-character;
1277 if ($self->{next_input_character} == 0x0042 or # B
1278 $self->{next_input_character} == 0x0062) { # b
1279 !!!next-input-character;
1280 if ($self->{next_input_character} == 0x004C or # L
1281 $self->{next_input_character} == 0x006C) { # l
1282 !!!next-input-character;
1283 if ($self->{next_input_character} == 0x0049 or # I
1284 $self->{next_input_character} == 0x0069) { # i
1285 !!!next-input-character;
1286 if ($self->{next_input_character} == 0x0043 or # C
1287 $self->{next_input_character} == 0x0063) { # c
1288 $self->{state} = 'before DOCTYPE public identifier';
1289 !!!next-input-character;
1290 redo A;
1291 }
1292 }
1293 }
1294 }
1295 }
1296
1297 #
1298 } elsif ($self->{next_input_character} == 0x0053 or # S
1299 $self->{next_input_character} == 0x0073) { # s
1300 !!!next-input-character;
1301 if ($self->{next_input_character} == 0x0059 or # Y
1302 $self->{next_input_character} == 0x0079) { # y
1303 !!!next-input-character;
1304 if ($self->{next_input_character} == 0x0053 or # S
1305 $self->{next_input_character} == 0x0073) { # s
1306 !!!next-input-character;
1307 if ($self->{next_input_character} == 0x0054 or # T
1308 $self->{next_input_character} == 0x0074) { # t
1309 !!!next-input-character;
1310 if ($self->{next_input_character} == 0x0045 or # E
1311 $self->{next_input_character} == 0x0065) { # e
1312 !!!next-input-character;
1313 if ($self->{next_input_character} == 0x004D or # M
1314 $self->{next_input_character} == 0x006D) { # m
1315 $self->{state} = 'before DOCTYPE system identifier';
1316 !!!next-input-character;
1317 redo A;
1318 }
1319 }
1320 }
1321 }
1322 }
1323
1324 #
1325 } else {
1326 !!!next-input-character;
1327 #
1328 }
1329
1330 !!!parse-error (type => 'string after DOCTYPE name');
1331 $self->{state} = 'bogus DOCTYPE';
1332 # next-input-character is already done
1333 redo A;
1334 } elsif ($self->{state} eq 'before DOCTYPE public identifier') {
1335 if ({
1336 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1337 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1338 }->{$self->{next_input_character}}) {
1339 ## Stay in the state
1340 !!!next-input-character;
1341 redo A;
1342 } elsif ($self->{next_input_character} eq 0x0022) { # "
1343 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1344 $self->{state} = 'DOCTYPE public identifier (double-quoted)';
1345 !!!next-input-character;
1346 redo A;
1347 } elsif ($self->{next_input_character} eq 0x0027) { # '
1348 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1349 $self->{state} = 'DOCTYPE public identifier (single-quoted)';
1350 !!!next-input-character;
1351 redo A;
1352 } elsif ($self->{next_input_character} eq 0x003E) { # >
1353 !!!parse-error (type => 'no PUBLIC literal');
1354
1355 $self->{state} = 'data';
1356 !!!next-input-character;
1357
1358 delete $self->{current_token}->{correct};
1359 !!!emit ($self->{current_token}); # DOCTYPE
1360
1361 redo A;
1362 } elsif ($self->{next_input_character} == -1) {
1363 !!!parse-error (type => 'unclosed DOCTYPE');
1364
1365 $self->{state} = 'data';
1366 ## reconsume
1367
1368 delete $self->{current_token}->{correct};
1369 !!!emit ($self->{current_token}); # DOCTYPE
1370
1371 redo A;
1372 } else {
1373 !!!parse-error (type => 'string after PUBLIC');
1374 $self->{state} = 'bogus DOCTYPE';
1375 !!!next-input-character;
1376 redo A;
1377 }
1378 } elsif ($self->{state} eq 'DOCTYPE public identifier (double-quoted)') {
1379 if ($self->{next_input_character} == 0x0022) { # "
1380 $self->{state} = 'after DOCTYPE public identifier';
1381 !!!next-input-character;
1382 redo A;
1383 } elsif ($self->{next_input_character} == -1) {
1384 !!!parse-error (type => 'unclosed PUBLIC literal');
1385
1386 $self->{state} = 'data';
1387 ## reconsume
1388
1389 delete $self->{current_token}->{correct};
1390 !!!emit ($self->{current_token}); # DOCTYPE
1391
1392 redo A;
1393 } else {
1394 $self->{current_token}->{public_identifier} # DOCTYPE
1395 .= chr $self->{next_input_character};
1396 ## Stay in the state
1397 !!!next-input-character;
1398 redo A;
1399 }
1400 } elsif ($self->{state} eq 'DOCTYPE public identifier (single-quoted)') {
1401 if ($self->{next_input_character} == 0x0027) { # '
1402 $self->{state} = 'after DOCTYPE public identifier';
1403 !!!next-input-character;
1404 redo A;
1405 } elsif ($self->{next_input_character} == -1) {
1406 !!!parse-error (type => 'unclosed PUBLIC literal');
1407
1408 $self->{state} = 'data';
1409 ## reconsume
1410
1411 delete $self->{current_token}->{correct};
1412 !!!emit ($self->{current_token}); # DOCTYPE
1413
1414 redo A;
1415 } else {
1416 $self->{current_token}->{public_identifier} # DOCTYPE
1417 .= chr $self->{next_input_character};
1418 ## Stay in the state
1419 !!!next-input-character;
1420 redo A;
1421 }
1422 } elsif ($self->{state} eq 'after DOCTYPE public identifier') {
1423 if ({
1424 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1425 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1426 }->{$self->{next_input_character}}) {
1427 ## Stay in the state
1428 !!!next-input-character;
1429 redo A;
1430 } elsif ($self->{next_input_character} == 0x0022) { # "
1431 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1432 $self->{state} = 'DOCTYPE system identifier (double-quoted)';
1433 !!!next-input-character;
1434 redo A;
1435 } elsif ($self->{next_input_character} == 0x0027) { # '
1436 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1437 $self->{state} = 'DOCTYPE system identifier (single-quoted)';
1438 !!!next-input-character;
1439 redo A;
1440 } elsif ($self->{next_input_character} == 0x003E) { # >
1441 $self->{state} = 'data';
1442 !!!next-input-character;
1443
1444 !!!emit ($self->{current_token}); # DOCTYPE
1445
1446 redo A;
1447 } elsif ($self->{next_input_character} == -1) {
1448 !!!parse-error (type => 'unclosed DOCTYPE');
1449
1450 $self->{state} = 'data';
1451 ## reconsume
1452
1453 delete $self->{current_token}->{correct};
1454 !!!emit ($self->{current_token}); # DOCTYPE
1455
1456 redo A;
1457 } else {
1458 !!!parse-error (type => 'string after PUBLIC literal');
1459 $self->{state} = 'bogus DOCTYPE';
1460 !!!next-input-character;
1461 redo A;
1462 }
1463 } elsif ($self->{state} eq 'before DOCTYPE system identifier') {
1464 if ({
1465 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1466 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1467 }->{$self->{next_input_character}}) {
1468 ## Stay in the state
1469 !!!next-input-character;
1470 redo A;
1471 } elsif ($self->{next_input_character} == 0x0022) { # "
1472 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1473 $self->{state} = 'DOCTYPE system identifier (double-quoted)';
1474 !!!next-input-character;
1475 redo A;
1476 } elsif ($self->{next_input_character} == 0x0027) { # '
1477 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1478 $self->{state} = 'DOCTYPE system identifier (single-quoted)';
1479 !!!next-input-character;
1480 redo A;
1481 } elsif ($self->{next_input_character} == 0x003E) { # >
1482 !!!parse-error (type => 'no SYSTEM literal');
1483 $self->{state} = 'data';
1484 !!!next-input-character;
1485
1486 delete $self->{current_token}->{correct};
1487 !!!emit ($self->{current_token}); # DOCTYPE
1488
1489 redo A;
1490 } elsif ($self->{next_input_character} == -1) {
1491 !!!parse-error (type => 'unclosed DOCTYPE');
1492
1493 $self->{state} = 'data';
1494 ## reconsume
1495
1496 delete $self->{current_token}->{correct};
1497 !!!emit ($self->{current_token}); # DOCTYPE
1498
1499 redo A;
1500 } else {
1501 !!!parse-error (type => 'string after SYSTEM');
1502 $self->{state} = 'bogus DOCTYPE';
1503 !!!next-input-character;
1504 redo A;
1505 }
1506 } elsif ($self->{state} eq 'DOCTYPE system identifier (double-quoted)') {
1507 if ($self->{next_input_character} == 0x0022) { # "
1508 $self->{state} = 'after DOCTYPE system identifier';
1509 !!!next-input-character;
1510 redo A;
1511 } elsif ($self->{next_input_character} == -1) {
1512 !!!parse-error (type => 'unclosed SYSTEM literal');
1513
1514 $self->{state} = 'data';
1515 ## reconsume
1516
1517 delete $self->{current_token}->{correct};
1518 !!!emit ($self->{current_token}); # DOCTYPE
1519
1520 redo A;
1521 } else {
1522 $self->{current_token}->{system_identifier} # DOCTYPE
1523 .= chr $self->{next_input_character};
1524 ## Stay in the state
1525 !!!next-input-character;
1526 redo A;
1527 }
1528 } elsif ($self->{state} eq 'DOCTYPE system identifier (single-quoted)') {
1529 if ($self->{next_input_character} == 0x0027) { # '
1530 $self->{state} = 'after DOCTYPE system identifier';
1531 !!!next-input-character;
1532 redo A;
1533 } elsif ($self->{next_input_character} == -1) {
1534 !!!parse-error (type => 'unclosed SYSTEM literal');
1535
1536 $self->{state} = 'data';
1537 ## reconsume
1538
1539 delete $self->{current_token}->{correct};
1540 !!!emit ($self->{current_token}); # DOCTYPE
1541
1542 redo A;
1543 } else {
1544 $self->{current_token}->{system_identifier} # DOCTYPE
1545 .= chr $self->{next_input_character};
1546 ## Stay in the state
1547 !!!next-input-character;
1548 redo A;
1549 }
1550 } elsif ($self->{state} eq 'after DOCTYPE system identifier') {
1551 if ({
1552 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1553 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1554 }->{$self->{next_input_character}}) {
1555 ## Stay in the state
1556 !!!next-input-character;
1557 redo A;
1558 } elsif ($self->{next_input_character} == 0x003E) { # >
1559 $self->{state} = 'data';
1560 !!!next-input-character;
1561
1562 !!!emit ($self->{current_token}); # DOCTYPE
1563
1564 redo A;
1565 } elsif ($self->{next_input_character} == -1) {
1566 !!!parse-error (type => 'unclosed DOCTYPE');
1567
1568 $self->{state} = 'data';
1569 ## reconsume
1570
1571 delete $self->{current_token}->{correct};
1572 !!!emit ($self->{current_token}); # DOCTYPE
1573
1574 redo A;
1575 } else {
1576 !!!parse-error (type => 'string after SYSTEM literal');
1577 $self->{state} = 'bogus DOCTYPE';
1578 !!!next-input-character;
1579 redo A;
1580 }
1581 } elsif ($self->{state} eq 'bogus DOCTYPE') {
1582 if ($self->{next_input_character} == 0x003E) { # >
1583 $self->{state} = 'data';
1584 !!!next-input-character;
1585
1586 delete $self->{current_token}->{correct};
1587 !!!emit ($self->{current_token}); # DOCTYPE
1588
1589 redo A;
1590 } elsif ($self->{next_input_character} == -1) {
1591 !!!parse-error (type => 'unclosed DOCTYPE');
1592 $self->{state} = 'data';
1593 ## reconsume
1594
1595 delete $self->{current_token}->{correct};
1596 !!!emit ($self->{current_token}); # DOCTYPE
1597
1598 redo A;
1599 } else {
1600 ## Stay in the state
1601 !!!next-input-character;
1602 redo A;
1603 }
1604 } else {
1605 die "$0: $self->{state}: Unknown state";
1606 }
1607 } # A
1608
1609 die "$0: _get_next_token: unexpected case";
1610 } # _get_next_token
1611
## Attempts to consume an entity (character reference).  The "&" that
## introduces the entity has already been consumed by the caller, so
## |$self->{next_input_character}| is the character that follows it.
##
##   $in_attr - True if the entity occurs in an attribute value.  It only
##              changes what is returned for a named entity that lacks
##              the closing ";" and is followed by more name characters.
##
## Returns a character token (|{type => 'character', data => ...}|), or
## |undef| if no entity is consumed; the caller then has to handle the
## "&" by itself.
sub _tokenize_attempt_to_consume_an_entity ($$) {
  my ($self, $in_attr) = @_;

  ## Characters after which the "&" does not start an entity.  -1 is the
  ## value the tokenizer tests for at the end of the input.  0x000D (CR)
  ## is not in the list.
  my $not_entity_start = {
    0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF
    0x0020 => 1, 0x003C => 1, 0x0026 => 1,              # SP, <, &
    -1 => 1,
  };
  if ($not_entity_start->{$self->{next_input_character}}) {
    ## Don't consume
    ## No error
    return undef;
  }

  if ($self->{next_input_character} == 0x0023) { # #
    ## Numeric character reference
    !!!next-input-character;

    my $code;
    if ($self->{next_input_character} == 0x0078 or # x
        $self->{next_input_character} == 0x0058) { # X
      ## Hexadecimal: remember the "x" so that it can be pushed back
      ## if no digit follows.
      my $x_char = $self->{next_input_character};
      !!!next-input-character;

      while (1) {
        my $lookahead = $self->{next_input_character};
        my $digit;
        if (0x0030 <= $lookahead and $lookahead <= 0x0039) { # 0..9
          $digit = $lookahead - 0x0030;
        } elsif (0x0061 <= $lookahead and $lookahead <= 0x0066) { # a..f
          $digit = $lookahead - 0x0060 + 9;
        } elsif (0x0041 <= $lookahead and $lookahead <= 0x0046) { # A..F
          $digit = $lookahead - 0x0040 + 9;
        } else {
          last;
        }
        $code = ($code || 0) * 0x10 + $digit;
        !!!next-input-character;
      }

      unless (defined $code) { # no hexadecimal digit
        !!!parse-error (type => 'bare hcro');
        !!!back-next-input-character ($x_char, $self->{next_input_character});
        $self->{next_input_character} = 0x0023; # #
        return undef;
      }
    } elsif (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
      ## Decimal
      $code = $self->{next_input_character} - 0x0030;
      !!!next-input-character;

      while (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
        $code = $code * 10 + ($self->{next_input_character} - 0x0030);
        !!!next-input-character;
      }
    } else {
      ## "&#" is not followed by any digit or "x".
      !!!parse-error (type => 'bare nero');
      !!!back-next-input-character ($self->{next_input_character});
      $self->{next_input_character} = 0x0023; # #
      return undef;
    }

    ## The closing ";" is consumed if present; its absence is an error
    ## but the reference is still accepted.
    if ($self->{next_input_character} == 0x003B) { # ;
      !!!next-input-character;
    } else {
      !!!parse-error (type => 'no refc');
    }

    ## Replace code points that must not be returned as they are.
    if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
      !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
      $code = 0xFFFD;
    } elsif ($code > 0x10FFFF) {
      !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
      $code = 0xFFFD;
    } elsif ($code == 0x000D) {
      !!!parse-error (type => 'CR character reference');
      $code = 0x000A;
    } elsif (0x80 <= $code and $code <= 0x9F) {
      !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
      $code = $c1_entity_char->{$code};
    }

    return {type => 'character', data => chr $code};
  }

  my $first_char = $self->{next_input_character};
  unless ((0x0041 <= $first_char and $first_char <= 0x005A) or # A..Z
          (0x0061 <= $first_char and $first_char <= 0x007A)) { # a..z
    ## no characters are consumed
    !!!parse-error (type => 'bare ero');
    return undef;
  }

  ## Named entity
  my $entity_name = chr $first_char;
  !!!next-input-character;

  ## $value is the text to be returned: the replacement of the longest
  ## name matched so far, followed by any characters read after it.
  ## $match is 0 while nothing has matched, 1 for a match terminated by
  ## ";", -1 for a match without ";", and is doubled for every character
  ## that does not produce a match (so < -1 means "matched, then more
  ## characters were read").
  my $value = $entity_name;
  my $match = 0;
  require Whatpm::_NamedEntityList;
  our $EntityChar;

  ## NOTE: 10 is some number greater than the maximum length of entity name
  while (length ($entity_name) < 10) {
    my $lookahead = $self->{next_input_character};
    my $is_name_char = (
      (0x0041 <= $lookahead and $lookahead <= 0x005A) or # A..Z
      (0x0061 <= $lookahead and $lookahead <= 0x007A) or # a..z
      (0x0030 <= $lookahead and $lookahead <= 0x0039) or # 0..9
      $lookahead == 0x003B                               # ;
    );
    last unless $is_name_char;

    $entity_name .= chr $lookahead;
    if (defined $EntityChar->{$entity_name}) {
      $value = $EntityChar->{$entity_name};
      if ($lookahead == 0x003B) { # ;
        $match = 1;
        !!!next-input-character;
        last;
      }
      $match = -1;
    } else {
      $value .= chr $lookahead;
      $match *= 2;
    }
    !!!next-input-character;
  }

  if ($match > 0) {
    return {type => 'character', data => $value};
  }

  if ($match == 0) {
    !!!parse-error (type => 'bare ero');
    ## NOTE: No characters are consumed in the spec.
    return {type => 'character', data => '&'.$value};
  }

  ## $match < 0: a name matched, but only without the closing ";".
  !!!parse-error (type => 'no refc');
  my $data = ($in_attr and $match < -1) ? '&'.$entity_name : $value;
  return {type => 'character', data => $data};
} # _tokenize_attempt_to_consume_an_entity
1775
## Prepares the document for the tree construction stage: strict error
## checking is switched off and the document is flagged as an HTML
## document.  Returns whatever |manakai_is_html| returns.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  ## NOTE: The caller MUST set $self->{document} before calling this method
  my $doc = $self->{document};
  $doc->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  return $doc->manakai_is_html (1); # MUST
} # _initialize_tree_constructor
1784
## Counterpart of |_initialize_tree_constructor|: switches the document's
## strict error checking back on.  Returns whatever
## |strict_error_checking| returns.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  ## TODO: Turn mutation events on
  return $self->{document}->strict_error_checking (1);
} # _terminate_tree_constructor
1790
1791 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
1792
1793 { # tree construction stage
1794 my $token;
1795
## Entry point of the tree construction stage: fetches the first token,
## resets the tree construction state kept in $self and then runs the
## initial, root element and main phases in this order.  Returns
## whatever |_tree_construction_main| returns.
sub _construct_tree ($) {
  my ($self) = @_;

  ## When an interactive UA renders $self->{document} available to the
  ## user, and when it begins accepting user input, are not defined.

  ## Append a character: collect it and all subsequent consecutive
  ## characters and insert one Text node whose data is the
  ## concatenation of all those characters. # MUST

  !!!next-token;

  ## Reset the state; the element pointers are set to undef.
  $self->{$_} = undef for qw(form_element head_element inner_html_node);
  $self->{open_elements} = [];
  $self->{insertion_mode} = 'before head';

  $self->_tree_construction_initial; # MUST
  $self->_tree_construction_root_element;
  return $self->_tree_construction_main;
} # _construct_tree
1819
## Initial phase of the tree construction stage.
##
## Works on the file-scoped $token and loops until the phase is over:
##
##   - DOCTYPE token: reports 'not HTML5' where applicable, appends a
##     document type node to $self->{document}, selects the document's
##     compatibility mode from the DOCTYPE name and the public/system
##     identifiers, fetches the next token and returns.
##   - Start tag, end tag, end-of-file, or character data that is not
##     only white space: reports 'no DOCTYPE', selects quirks mode and
##     returns, leaving $token in place to be reprocessed.
##   - Leading white space is dropped and comments are appended to the
##     document; in both cases the phase continues with the next token.
##
## Dies if the token type is not one of the above.
sub _tree_construction_initial ($) {
  my $self = shift;
  INITIAL: {
    if ($token->{type} eq 'DOCTYPE') {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!parse-error (type => 'not HTML5');
      } elsif ($doctype_name ne 'HTML') {
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5');
      }

      my $doctype = $self->{document}->create_document_type_definition
        ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if (not $token->{correct} or $doctype_name ne 'HTML') {
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## The public identifier is upper-cased so that the comparisons
        ## are ASCII case-insensitive.  Every literal compared with
        ## $pubid below therefore has to be written in upper case.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        if ({
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
          "-//IETF//DTD HTML 2.0//EN" => 1,
          "-//IETF//DTD HTML 2.1E//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN//" => 1,
          "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
          "-//IETF//DTD HTML 3.2//EN" => 1,
          "-//IETF//DTD HTML 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT//EN" => 1,
          "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
          "-//IETF//DTD HTML//EN" => 1,
          "-//IETF//DTD HTML//EN//2.0" => 1,
          "-//IETF//DTD HTML//EN//3.0" => 1,
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
          "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
          "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
          "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
          "-//W3C//DTD HTML 3.2//EN" => 1,
          "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
          "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
          ## This key used to be misspelled ("EXPERIMETNAL"), so the
          ## real identifier could never match it.
          "-//W3C//DTD HTML EXPERIMENTAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
          "-//W3C//DTD W3 HTML//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
          "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
          "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
          "HTML" => 1,
        }->{$pubid}) {
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
          ## With a system identifier these DOCTYPEs select limited
          ## quirks mode; without one they select quirks mode.  (The
          ## two branches used to be the other way round.)
          if (defined $token->{system_identifier}) {
            $self->{document}->manakai_compat_mode ('limited quirks');
          } else {
            $self->{document}->manakai_compat_mode ('quirks');
          }
        } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
          ## These literals are upper case because $pubid is; the
          ## mixed-case spelling used before could never match.
          $self->{document}->manakai_compat_mode ('limited quirks');
        }
      }
      if (defined $token->{system_identifier}) {
        ## The system identifier is compared in lower case.
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          $self->{document}->manakai_compat_mode ('quirks');
        }
      }

      ## Go to the root element phase.
      !!!next-token;
      return;
    } elsif ({
              'start tag' => 1,
              'end tag' => 1,
              'end-of-file' => 1,
             }->{$token->{type}}) {
      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} eq 'character') {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the leading white space.

        unless (length $token->{data}) {
          ## Nothing but white space: stay in the phase.
          !!!next-token;
          redo INITIAL;
        }
      }

      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} eq 'comment') {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the phase.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token";
    }
  } # INITIAL
} # _tree_construction_initial
1987
## Root element phase of the tree construction stage.
##
## DOCTYPE tokens are reported ('in html:#DOCTYPE') and ignored,
## comments are appended to the document and leading white space is
## dropped.  The first other token causes an |html| element to be
## created, appended to the document and pushed onto
## $self->{open_elements}; the sub then returns with that token still
## in $token so that the main phase reprocesses it.  Dies if the token
## type is unknown.
sub _tree_construction_root_element ($) {
  my ($self) = @_;

  ROOT: {
    my $token_type = $token->{type};
    if ($token_type eq 'DOCTYPE') {
      !!!parse-error (type => 'in html:#DOCTYPE');
      ## Ignore the token and stay in the phase.
      !!!next-token;
      redo ROOT;
    } elsif ($token_type eq 'comment') {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the phase.
      !!!next-token;
      redo ROOT;
    } elsif ($token_type eq 'character') {
      ## Drop leading white space (U+000D is not part of the set).  If
      ## nothing is left, the whole token is ignored and the phase
      ## continues with the next token.
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//
          and not length $token->{data}) {
        !!!next-token;
        redo ROOT;
      }
    } elsif ($token_type eq 'start tag' or
             $token_type eq 'end tag' or
             $token_type eq 'end-of-file') {
      ## ISSUE: There is an issue in the spec
    } else {
      die "$0: $token->{type}: Unknown token";
    }

    ## Any token that reaches this point creates the root element.
    my $root_element; !!!create-element ($root_element, 'html');
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, 'html'];
    ## The token is reprocessed in the main phase.
    return;
  } # ROOT
} # _tree_construction_root_element
2033
## Reset $self->{insertion_mode} according to the stack of open
## elements.
##
## The stack ($self->{open_elements}, a list of [node, local name]
## pairs) is walked from its last entry towards its first one.  The
## first entry whose name has a fixed insertion mode decides the
## result.  When the first stack entry is reached and
## $self->{inner_html_node} is defined and is neither |td| nor |th|,
## that context element is examined in place of the stack entry.
sub _reset_insertion_mode ($) {
  my ($self) = @_;

  ## Steps 4..13: element names that directly select an insertion mode.
  my %mode_for = (
    select   => 'in select',
    td       => 'in cell',
    th       => 'in cell',
    tr       => 'in row',
    tbody    => 'in table body',
    thead    => 'in table head',
    tfoot    => 'in table foot',
    caption  => 'in caption',
    colgroup => 'in column group',
    table    => 'in table',
    head     => 'in body', # not in head!
    body     => 'in body',
    frameset => 'in frameset',
  );

  ## Step 1
  my $is_last;

  ## Step 2
  my $index = -1;
  my $node = $self->{open_elements}->[$index];

  while (1) {
    ## Step 3
    ## ISSUE: Oops! "If node is the first node in the stack of open
    ## elements, then set last to true. If the context element of the
    ## HTML fragment parsing algorithm is neither a td element nor a
    ## th element, then set node to the context element. (fragment case)":
    ## The second "if" is in the scope of the first "if"!?
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $is_last = 1;
      my $context = $self->{inner_html_node};
      if (defined $context) {
        $node = $context
            unless $context->[1] eq 'td' or $context->[1] eq 'th';
      }
    }

    ## Steps 4..13
    my $mode = $mode_for{$node->[1]};
    if (defined $mode) {
      $self->{insertion_mode} = $mode;
      return;
    }

    ## Step 14
    if ($node->[1] eq 'html') {
      $self->{insertion_mode}
          = defined $self->{head_element} ? 'after head' : 'before head';
      return;
    }

    ## Step 15
    if ($is_last) {
      $self->{insertion_mode} = 'in body';
      return;
    }

    ## Steps 16 and 17: move to the previous stack entry and repeat.
    $index--;
    $node = $self->{open_elements}->[$index];
  }
} # _reset_insertion_mode
2102
2103 sub _tree_construction_main ($) {
2104 my $self = shift;
2105
2106 my $previous_insertion_mode;
2107
2108 my $active_formatting_elements = [];
2109
## Reconstruct the active formatting elements.
##
## $insert is a code reference that is called with each freshly cloned
## node in order to put it into the tree.  Starting from the end of
## $active_formatting_elements, the entries that follow the last
## marker / last entry still found in $self->{open_elements} are
## shallow-cloned in list order; each clone is inserted, pushed onto
## the stack of open elements and stored in the list in place of the
## entry it was cloned from.
my $reconstruct_active_formatting_elements = sub { # MUST
  my ($insert) = @_;

  ## Step 1
  return unless @$active_formatting_elements;

  ## True if the given node is in the stack of open elements.
  my $is_open = sub {
    my $candidate = shift;
    for my $open (@{$self->{open_elements}}) {
      return 1 if $candidate eq $open->[0];
    }
    return 0;
  };

  ## Step 3
  my $pos = -1;
  my $entry = $active_formatting_elements->[$pos];

  ## Step 2
  return if $entry->[0] eq '#marker';
  return if $is_open->($entry->[0]);

  ## Step 4: stop rewinding once the first list entry is reached.
  until ($active_formatting_elements->[0]->[0] eq $entry->[0]) {
    ## Step 5
    $pos--;
    $entry = $active_formatting_elements->[$pos];

    ## Step 6: keep rewinding while the entry is neither a marker nor
    ## an open element.
    next unless $entry->[0] eq '#marker' or $is_open->($entry->[0]);

    ## Step 7: step forward again to the first entry to be cloned.
    $pos++;
    $entry = $active_formatting_elements->[$pos];
    last;
  }

  while (1) {
    ## Step 8
    my $clone = [$entry->[0]->clone_node (0), $entry->[1]];

    ## Step 9
    $insert->($clone->[0]);
    push @{$self->{open_elements}}, $clone;

    ## Step 10
    $active_formatting_elements->[$pos] = $self->{open_elements}->[-1];

    ## Step 11: done once the clone is the last entry of the list.
    last if $clone->[0] eq $active_formatting_elements->[-1]->[0];

    ## Step 7'
    $pos++;
    $entry = $active_formatting_elements->[$pos];
  }
}; # $reconstruct_active_formatting_elements
2180
## Remove the last '#marker' entry of $active_formatting_elements
## together with every entry after it.  The list is left untouched
## when it contains no marker.
my $clear_up_to_marker = sub {
  my $index = $#$active_formatting_elements;
  while ($index >= 0) {
    if ($active_formatting_elements->[$index]->[0] eq '#marker') {
      splice @$active_formatting_elements, $index;
      return;
    }
    $index--;
  }
}; # $clear_up_to_marker
2189
## Generic CDATA/RCDATA element handling for the start tag in $token.
##
## $content_model_flag is the content model the tokenizer is switched
## to while the element's text is read (CDATA_CONTENT_MODEL or
## RCDATA_CONTENT_MODEL; anything else dies in step 7 unless the
## matching end tag was found).  $insert is a code reference that
## receives the new element and puts it into the tree.
my $parse_rcdata = sub ($$) {
  my ($content_model_flag, $insert) = @_;

  ## Step 1
  my $start_tag_name = $token->{tag_name};
  my $el;
  !!!create-element ($el, $start_tag_name, $token->{attributes});

  ## Step 2
  $insert->($el); # /context node/->append_child ($el)

  ## Step 3
  $self->{content_model} = $content_model_flag; # CDATA or RCDATA
  delete $self->{escape}; # MUST

  ## Step 4: gather all consecutive character tokens.
  my @chunks;
  !!!next-token;
  while ($token->{type} eq 'character') { # or until stop tokenizing
    push @chunks, $token->{data};
    !!!next-token;
  }
  my $data = join '', @chunks;

  ## Step 5: an empty element gets no Text child.
  if (length $data) {
    my $text_node = $self->{document}->create_text_node ($data);
    $el->append_child ($text_node);
  }

  ## Step 6
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## Step 7: the matching end tag is silently consumed; any other
  ## token is reported before it is consumed.
  unless ($token->{type} eq 'end tag' and
          $token->{tag_name} eq $start_tag_name) {
    if ($content_model_flag == CDATA_CONTENT_MODEL) {
      !!!parse-error (type => 'in CDATA:#'.$token->{type});
    } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
      !!!parse-error (type => 'in RCDATA:#'.$token->{type});
    } else {
      die "$0: $content_model_flag in parse_rcdata";
    }
  }
  !!!next-token;
}; # $parse_rcdata
2234
## Handling of a |script| start tag held in $token.
##
## Creates the |script| element, reads its text with the tokenizer in
## the CDATA content model and appends it to the element.  Unless
## $self->{inner_html_node} is defined, the element is then handed to
## the code reference $insert, which puts it into the tree.
my $script_start_tag = sub ($) {
  my ($insert) = @_;

  my $script_el;
  !!!create-element ($script_el, 'script', $token->{attributes});
  ## TODO: mark as "parser-inserted"

  $self->{content_model} = CDATA_CONTENT_MODEL;
  delete $self->{escape}; # MUST

  ## Gather character tokens; stop at the first non-character token
  ## (or when the tokenizer stops tokenising).
  my @chunks;
  !!!next-token;
  while ($token->{type} eq 'character') {
    push @chunks, $token->{data};
    !!!next-token;
  }
  my $script_text = join '', @chunks;
  $script_el->manakai_append_text ($script_text) if length $script_text;

  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## The matching end tag is silently consumed; anything else is
  ## reported first.
  unless ($token->{type} eq 'end tag' and
          $token->{tag_name} eq 'script') {
    !!!parse-error (type => 'in CDATA:#'.$token->{type});
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  }

  unless (defined $self->{inner_html_node}) {
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    $insert->($script_el);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  } else {
    ## TODO: mark as "already executed"
  }

  !!!next-token;
}; # $script_start_tag
2280
## Handle an end tag of a formatting element.
##
## $tag_name is the name of the end tag.  The steps below follow the
## numbered steps of the HTML5 parsing algorithm for such end tags
## (presumably the "adoption agency algorithm" -- verify against the
## specification revision in use).  The algorithm is repeated (step 14)
## until one of the |return| statements is reached; every |return| is
## preceded by |!!!next-token|.
##
## The code works on $active_formatting_elements and
## $self->{open_elements}; both are lists of [node, local name] pairs.
my $formatting_end_tag = sub {
  my $tag_name = shift;

  FET: {
    ## Step 1
    ## Find the last entry with this tag name after the last marker.
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!parse-error (type => 'unmatched end tag:'.$tag_name);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ({
                table => 1, caption => 1, td => 1, th => 1,
                button => 1, marquee => 1, object => 1, html => 1,
               }->{$node->[1]}) {
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
      ## The formatting element is not in the stack of open elements:
      ## remove that very entry from the list.  (A plain |pop| would
      ## remove the wrong entry whenever the formatting element is not
      ## the last one in the list.)
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
    }

    ## Step 2
    ## The loop runs from the end of the stack back to the formatting
    ## element, so the last assignment wins: the furthest block is the
    ## qualifying entry closest to the formatting element.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not $formatting_category->{$node->[1]} and
          #not $phrasing_category->{$node->[1]} and
          ($special_category->{$node->[1]} or
           $scoping_category->{$node->[1]})) {
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        last OE;
      }
    } # OE

    ## Step 3
    ## No furthest block: pop the stack up to and including the
    ## formatting element and drop it from the list.
    unless (defined $furthest_block) { # MUST
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## The bookmark is remembered as the node of the list entry that
    ## precedes the formatting element.
    ## NOTE(review): when the formatting element is the first list
    ## entry, index -1 selects the last entry instead -- confirm
    ## whether that case needs separate handling.
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
            ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      ## A node that is not in the list of active formatting elements
      ## is removed from the stack and the walk continues.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      ## A node that already has children is replaced by a shallow
      ## clone in both lists.
      if ($node->[0]->has_child_nodes ()) {
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    $common_ancestor_node->[0]->append_child ($last_node->[0]);

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## Copy the child list first: moving the children modifies it.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## Remove the formatting element from the list and insert the
    ## clone after the bookmark.  $i tracks the bookmark's index and
    ## is decremented when an entry in front of it is removed.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$active_formatting_elements, $_, 1;
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    ## Remove the formatting element from the stack and insert the
    ## clone immediately after the furthest block.
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        splice @{$self->{open_elements}}, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        $i = $_;
      }
    } # OE
    ## The length argument is 0 so that the clone is inserted; a
    ## length of 1 would replace (and thereby drop) the stack entry
    ## that follows the furthest block.
    splice @{$self->{open_elements}}, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
2463
## Append the given node to the current node, i.e. to the node of the
## last entry of the stack of open elements.
my $insert_to_current = sub {
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($_[0]);
}; # $insert_to_current
2467
## Insert the given node, using foster parenting where necessary.
##
## If the current node is not one of |table|, |tbody|, |tfoot|,
## |thead| or |tr|, the node is simply appended to the current node.
## Otherwise the last |table| entry of the stack of open elements is
## looked up: if that table has a parent whose node_type is 1, the
## node is inserted into the parent just before the table; if not, it
## is appended to the node of the stack entry preceding the table.
## Without any |table| entry the node is appended to the node of the
## first stack entry.
my $insert_to_foster = sub {
  my ($child) = @_;
  my $stack = $self->{open_elements};
  my %table_context = map { $_ => 1 } qw(table tbody tfoot thead tr);

  unless ($table_context{$stack->[-1]->[1]}) {
    return $stack->[-1]->[0]->append_child ($child);
  }

  # MUST
  my $foster_parent_element;
  my $next_sibling;
  for my $index (reverse 0..$#{$stack}) {
    next unless $stack->[$index]->[1] eq 'table';
    my $table = $stack->[$index]->[0];
    my $parent = $table->parent_node;
    if (defined $parent and $parent->node_type == 1) {
      $foster_parent_element = $parent;
      $next_sibling = $table;
    } else {
      $foster_parent_element = $stack->[$index - 1]->[0];
    }
    last;
  }
  $foster_parent_element = $stack->[0]->[0]
      unless defined $foster_parent_element;
  $foster_parent_element->insert_before ($child, $next_sibling);
}; # $insert_to_foster
2498
2499 my $in_body = sub {
2500 my $insert = shift;
2501 if ($token->{type} eq 'start tag') {
2502 if ($token->{tag_name} eq 'script') {
2503 ## NOTE: This is an "as if in head" code clone
2504 $script_start_tag->($insert);
2505 return;
2506 } elsif ($token->{tag_name} eq 'style') {
2507 ## NOTE: This is an "as if in head" code clone
2508 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
2509 return;
2510 } elsif ({
2511 base => 1, link => 1,
2512 }->{$token->{tag_name}}) {
2513 ## NOTE: This is an "as if in head" code clone, only "-t" differs
2514 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2515 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2516 !!!next-token;
2517 return;
2518 } elsif ($token->{tag_name} eq 'meta') {
2519 ## NOTE: This is an "as if in head" code clone, only "-t" differs
2520 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2521 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2522
2523 unless ($self->{confident}) {
2524 my $charset;
2525 if ($token->{attributes}->{charset}) { ## TODO: And if supported
2526 $charset = $token->{attributes}->{charset}->{value};
2527 }
2528 if ($token->{attributes}->{'http-equiv'}) {
2529 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
2530 if ($token->{attributes}->{'http-equiv'}->{value}
2531 =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
2532 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
2533 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
2534 $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
2535 } ## TODO: And if supported
2536 }
2537 ## TODO: Change the encoding
2538 }
2539
2540 !!!next-token;
2541 return;
2542 } elsif ($token->{tag_name} eq 'title') {
2543 !!!parse-error (type => 'in body:title');
2544 ## NOTE: This is an "as if in head" code clone
2545 $parse_rcdata->(RCDATA_CONTENT_MODEL, sub {
2546 if (defined $self->{head_element}) {
2547 $self->{head_element}->append_child ($_[0]);
2548 } else {
2549 $insert->($_[0]);
2550 }
2551 });
2552 return;
2553 } elsif ($token->{tag_name} eq 'body') {
2554 !!!parse-error (type => 'in body:body');
2555
2556 if (@{$self->{open_elements}} == 1 or
2557 $self->{open_elements}->[1]->[1] ne 'body') {
2558 ## Ignore the token
2559 } else {
2560 my $body_el = $self->{open_elements}->[1]->[0];
2561 for my $attr_name (keys %{$token->{attributes}}) {
2562 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
2563 $body_el->set_attribute_ns
2564 (undef, [undef, $attr_name],
2565 $token->{attributes}->{$attr_name}->{value});
2566 }
2567 }
2568 }
2569 !!!next-token;
2570 return;
2571 } elsif ({
2572 address => 1, blockquote => 1, center => 1, dir => 1,
2573 div => 1, dl => 1, fieldset => 1, listing => 1,
2574 menu => 1, ol => 1, p => 1, ul => 1,
2575 pre => 1,
2576 }->{$token->{tag_name}}) {
2577 ## has a p element in scope
2578 INSCOPE: for (reverse @{$self->{open_elements}}) {
2579 if ($_->[1] eq 'p') {
2580 !!!back-token;
2581 $token = {type => 'end tag', tag_name => 'p'};
2582 return;
2583 } elsif ({
2584 table => 1, caption => 1, td => 1, th => 1,
2585 button => 1, marquee => 1, object => 1, html => 1,
2586 }->{$_->[1]}) {
2587 last INSCOPE;
2588 }
2589 } # INSCOPE
2590
2591 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2592 if ($token->{tag_name} eq 'pre') {
2593 !!!next-token;
2594 if ($token->{type} eq 'character') {
2595 $token->{data} =~ s/^\x0A//;
2596 unless (length $token->{data}) {
2597 !!!next-token;
2598 }
2599 }
2600 } else {
2601 !!!next-token;
2602 }
2603 return;
2604 } elsif ($token->{tag_name} eq 'form') {
2605 if (defined $self->{form_element}) {
2606 !!!parse-error (type => 'in form:form');
2607 ## Ignore the token
2608 !!!next-token;
2609 return;
2610 } else {
2611 ## has a p element in scope
2612 INSCOPE: for (reverse @{$self->{open_elements}}) {
2613 if ($_->[1] eq 'p') {
2614 !!!back-token;
2615 $token = {type => 'end tag', tag_name => 'p'};
2616 return;
2617 } elsif ({
2618 table => 1, caption => 1, td => 1, th => 1,
2619 button => 1, marquee => 1, object => 1, html => 1,
2620 }->{$_->[1]}) {
2621 last INSCOPE;
2622 }
2623 } # INSCOPE
2624
2625 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2626 $self->{form_element} = $self->{open_elements}->[-1]->[0];
2627 !!!next-token;
2628 return;
2629 }
2630 } elsif ($token->{tag_name} eq 'li') {
2631 ## has a p element in scope
2632 INSCOPE: for (reverse @{$self->{open_elements}}) {
2633 if ($_->[1] eq 'p') {
2634 !!!back-token;
2635 $token = {type => 'end tag', tag_name => 'p'};
2636 return;
2637 } elsif ({
2638 table => 1, caption => 1, td => 1, th => 1,
2639 button => 1, marquee => 1, object => 1, html => 1,
2640 }->{$_->[1]}) {
2641 last INSCOPE;
2642 }
2643 } # INSCOPE
2644
2645 ## Step 1
2646 my $i = -1;
2647 my $node = $self->{open_elements}->[$i];
2648 LI: {
2649 ## Step 2
2650 if ($node->[1] eq 'li') {
2651 if ($i != -1) {
2652 !!!parse-error (type => 'end tag missing:'.
2653 $self->{open_elements}->[-1]->[1]);
2654 }
2655 splice @{$self->{open_elements}}, $i;
2656 last LI;
2657 }
2658
2659 ## Step 3
2660 if (not $formatting_category->{$node->[1]} and
2661 #not $phrasing_category->{$node->[1]} and
2662 ($special_category->{$node->[1]} or
2663 $scoping_category->{$node->[1]}) and
2664 $node->[1] ne 'address' and $node->[1] ne 'div') {
2665 last LI;
2666 }
2667
2668 ## Step 4
2669 $i--;
2670 $node = $self->{open_elements}->[$i];
2671 redo LI;
2672 } # LI
2673
2674 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2675 !!!next-token;
2676 return;
2677 } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
2678 ## has a p element in scope
2679 INSCOPE: for (reverse @{$self->{open_elements}}) {
2680 if ($_->[1] eq 'p') {
2681 !!!back-token;
2682 $token = {type => 'end tag', tag_name => 'p'};
2683 return;
2684 } elsif ({
2685 table => 1, caption => 1, td => 1, th => 1,
2686 button => 1, marquee => 1, object => 1, html => 1,
2687 }->{$_->[1]}) {
2688 last INSCOPE;
2689 }
2690 } # INSCOPE
2691
2692 ## Step 1
2693 my $i = -1;
2694 my $node = $self->{open_elements}->[$i];
2695 LI: {
2696 ## Step 2
2697 if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
2698 if ($i != -1) {
2699 !!!parse-error (type => 'end tag missing:'.
2700 $self->{open_elements}->[-1]->[1]);
2701 }
2702 splice @{$self->{open_elements}}, $i;
2703 last LI;
2704 }
2705
2706 ## Step 3
2707 if (not $formatting_category->{$node->[1]} and
2708 #not $phrasing_category->{$node->[1]} and
2709 ($special_category->{$node->[1]} or
2710 $scoping_category->{$node->[1]}) and
2711 $node->[1] ne 'address' and $node->[1] ne 'div') {
2712 last LI;
2713 }
2714
2715 ## Step 4
2716 $i--;
2717 $node = $self->{open_elements}->[$i];
2718 redo LI;
2719 } # LI
2720
2721 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2722 !!!next-token;
2723 return;
2724 } elsif ($token->{tag_name} eq 'plaintext') {
2725 ## has a p element in scope
2726 INSCOPE: for (reverse @{$self->{open_elements}}) {
2727 if ($_->[1] eq 'p') {
2728 !!!back-token;
2729 $token = {type => 'end tag', tag_name => 'p'};
2730 return;
2731 } elsif ({
2732 table => 1, caption => 1, td => 1, th => 1,
2733 button => 1, marquee => 1, object => 1, html => 1,
2734 }->{$_->[1]}) {
2735 last INSCOPE;
2736 }
2737 } # INSCOPE
2738
2739 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2740
2741 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
2742
2743 !!!next-token;
2744 return;
2745 } elsif ({
2746 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
2747 }->{$token->{tag_name}}) {
2748 ## has a p element in scope
2749 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2750 my $node = $self->{open_elements}->[$_];
2751 if ($node->[1] eq 'p') {
2752 !!!back-token;
2753 $token = {type => 'end tag', tag_name => 'p'};
2754 return;
2755 } elsif ({
2756 table => 1, caption => 1, td => 1, th => 1,
2757 button => 1, marquee => 1, object => 1, html => 1,
2758 }->{$node->[1]}) {
2759 last INSCOPE;
2760 }
2761 } # INSCOPE
2762
2763 ## NOTE: See <http://html5.org/tools/web-apps-tracker?from=925&to=926>
2764 ## has an element in scope
2765 #my $i;
2766 #INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2767 # my $node = $self->{open_elements}->[$_];
2768 # if ({
2769 # h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
2770 # }->{$node->[1]}) {
2771 # $i = $_;
2772 # last INSCOPE;
2773 # } elsif ({
2774 # table => 1, caption => 1, td => 1, th => 1,
2775 # button => 1, marquee => 1, object => 1, html => 1,
2776 # }->{$node->[1]}) {
2777 # last INSCOPE;
2778 # }
2779 #} # INSCOPE
2780 #
2781 #if (defined $i) {
2782 # !!! parse-error (type => 'in hn:hn');
2783 # splice @{$self->{open_elements}}, $i;
2784 #}
2785
2786 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2787
2788 !!!next-token;
2789 return;
2790 } elsif ($token->{tag_name} eq 'a') {
2791 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
2792 my $node = $active_formatting_elements->[$i];
2793 if ($node->[1] eq 'a') {
2794 !!!parse-error (type => 'in a:a');
2795
2796 !!!back-token;
2797 $token = {type => 'end tag', tag_name => 'a'};
2798 $formatting_end_tag->($token->{tag_name});
2799
2800 AFE2: for (reverse 0..$#$active_formatting_elements) {
2801 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
2802 splice @$active_formatting_elements, $_, 1;
2803 last AFE2;
2804 }
2805 } # AFE2
2806 OE: for (reverse 0..$#{$self->{open_elements}}) {
2807 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
2808 splice @{$self->{open_elements}}, $_, 1;
2809 last OE;
2810 }
2811 } # OE
2812 last AFE;
2813 } elsif ($node->[0] eq '#marker') {
2814 last AFE;
2815 }
2816 } # AFE
2817
2818 $reconstruct_active_formatting_elements->($insert_to_current);
2819
2820 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2821 push @$active_formatting_elements, $self->{open_elements}->[-1];
2822
2823 !!!next-token;
2824 return;
2825 } elsif ({
2826 b => 1, big => 1, em => 1, font => 1, i => 1,
2827 s => 1, small => 1, strile => 1,
2828 strong => 1, tt => 1, u => 1,
2829 }->{$token->{tag_name}}) {
2830 $reconstruct_active_formatting_elements->($insert_to_current);
2831
2832 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2833 push @$active_formatting_elements, $self->{open_elements}->[-1];
2834
2835 !!!next-token;
2836 return;
2837 } elsif ($token->{tag_name} eq 'nobr') {
2838 $reconstruct_active_formatting_elements->($insert_to_current);
2839
2840 ## has a |nobr| element in scope
2841 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2842 my $node = $self->{open_elements}->[$_];
2843 if ($node->[1] eq 'nobr') {
2844 !!!parse-error (type => 'not closed:nobr');
2845 !!!back-token;
2846 $token = {type => 'end tag', tag_name => 'nobr'};
2847 return;
2848 } elsif ({
2849 table => 1, caption => 1, td => 1, th => 1,
2850 button => 1, marquee => 1, object => 1, html => 1,
2851 }->{$node->[1]}) {
2852 last INSCOPE;
2853 }
2854 } # INSCOPE
2855
2856 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2857 push @$active_formatting_elements, $self->{open_elements}->[-1];
2858
2859 !!!next-token;
2860 return;
2861 } elsif ($token->{tag_name} eq 'button') {
2862 ## has a button element in scope
2863 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2864 my $node = $self->{open_elements}->[$_];
2865 if ($node->[1] eq 'button') {
2866 !!!parse-error (type => 'in button:button');
2867 !!!back-token;
2868 $token = {type => 'end tag', tag_name => 'button'};
2869 return;
2870 } elsif ({
2871 table => 1, caption => 1, td => 1, th => 1,
2872 button => 1, marquee => 1, object => 1, html => 1,
2873 }->{$node->[1]}) {
2874 last INSCOPE;
2875 }
2876 } # INSCOPE
2877
2878 $reconstruct_active_formatting_elements->($insert_to_current);
2879
2880 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2881 push @$active_formatting_elements, ['#marker', ''];
2882
2883 !!!next-token;
2884 return;
2885 } elsif ($token->{tag_name} eq 'marquee' or
2886 $token->{tag_name} eq 'object') {
2887 $reconstruct_active_formatting_elements->($insert_to_current);
2888
2889 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2890 push @$active_formatting_elements, ['#marker', ''];
2891
2892 !!!next-token;
2893 return;
2894 } elsif ($token->{tag_name} eq 'xmp') {
2895 $reconstruct_active_formatting_elements->($insert_to_current);
2896 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
2897 return;
2898 } elsif ($token->{tag_name} eq 'table') {
2899 ## has a p element in scope
2900 INSCOPE: for (reverse @{$self->{open_elements}}) {
2901 if ($_->[1] eq 'p') {
2902 !!!back-token;
2903 $token = {type => 'end tag', tag_name => 'p'};
2904 return;
2905 } elsif ({
2906 table => 1, caption => 1, td => 1, th => 1,
2907 button => 1, marquee => 1, object => 1, html => 1,
2908 }->{$_->[1]}) {
2909 last INSCOPE;
2910 }
2911 } # INSCOPE
2912
2913 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2914
2915 $self->{insertion_mode} = 'in table';
2916
2917 !!!next-token;
2918 return;
2919 } elsif ({
2920 area => 1, basefont => 1, bgsound => 1, br => 1,
2921 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
2922 image => 1,
2923 }->{$token->{tag_name}}) {
2924 if ($token->{tag_name} eq 'image') {
2925 !!!parse-error (type => 'image');
2926 $token->{tag_name} = 'img';
2927 }
2928
2929 ## NOTE: There is an "as if <br>" code clone.
2930 $reconstruct_active_formatting_elements->($insert_to_current);
2931
2932 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2933 pop @{$self->{open_elements}};
2934
2935 !!!next-token;
2936 return;
2937 } elsif ($token->{tag_name} eq 'hr') {
2938 ## has a p element in scope
2939 INSCOPE: for (reverse @{$self->{open_elements}}) {
2940 if ($_->[1] eq 'p') {
2941 !!!back-token;
2942 $token = {type => 'end tag', tag_name => 'p'};
2943 return;
2944 } elsif ({
2945 table => 1, caption => 1, td => 1, th => 1,
2946 button => 1, marquee => 1, object => 1, html => 1,
2947 }->{$_->[1]}) {
2948 last INSCOPE;
2949 }
2950 } # INSCOPE
2951
2952 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2953 pop @{$self->{open_elements}};
2954
2955 !!!next-token;
2956 return;
2957 } elsif ($token->{tag_name} eq 'input') {
2958 $reconstruct_active_formatting_elements->($insert_to_current);
2959
2960 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2961 ## TODO: associate with $self->{form_element} if defined
2962 pop @{$self->{open_elements}};
2963
2964 !!!next-token;
2965 return;
2966 } elsif ($token->{tag_name} eq 'isindex') {
2967 !!!parse-error (type => 'isindex');
2968
2969 if (defined $self->{form_element}) {
2970 ## Ignore the token
2971 !!!next-token;
2972 return;
2973 } else {
2974 my $at = $token->{attributes};
2975 my $form_attrs;
2976 $form_attrs->{action} = $at->{action} if $at->{action};
2977 my $prompt_attr = $at->{prompt};
2978 $at->{name} = {name => 'name', value => 'isindex'};
2979 delete $at->{action};
2980 delete $at->{prompt};
2981 my @tokens = (
2982 {type => 'start tag', tag_name => 'form',
2983 attributes => $form_attrs},
2984 {type => 'start tag', tag_name => 'hr'},
2985 {type => 'start tag', tag_name => 'p'},
2986 {type => 'start tag', tag_name => 'label'},
2987 );
2988 if ($prompt_attr) {
2989 push @tokens, {type => 'character', data => $prompt_attr->{value}};
2990 } else {
2991 push @tokens, {type => 'character',
2992 data => 'This is a searchable index. Insert your search keywords here: '}; # SHOULD
2993 ## TODO: make this configurable
2994 }
2995 push @tokens,
2996 {type => 'start tag', tag_name => 'input', attributes => $at},
2997 #{type => 'character', data => ''}, # SHOULD
2998 {type => 'end tag', tag_name => 'label'},
2999 {type => 'end tag', tag_name => 'p'},
3000 {type => 'start tag', tag_name => 'hr'},
3001 {type => 'end tag', tag_name => 'form'};
3002 $token = shift @tokens;
3003 !!!back-token (@tokens);
3004 return;
3005 }
3006 } elsif ($token->{tag_name} eq 'textarea') {
3007 my $tag_name = $token->{tag_name};
3008 my $el;
3009 !!!create-element ($el, $token->{tag_name}, $token->{attributes});
3010
3011 ## TODO: $self->{form_element} if defined
3012 $self->{content_model} = RCDATA_CONTENT_MODEL;
3013 delete $self->{escape}; # MUST
3014
3015 $insert->($el);
3016
3017 my $text = '';
3018 !!!next-token;
3019 if ($token->{type} eq 'character') {
3020 $token->{data} =~ s/^\x0A//;
3021 unless (length $token->{data}) {
3022 !!!next-token;
3023 }
3024 }
3025 while ($token->{type} eq 'character') {
3026 $text .= $token->{data};
3027 !!!next-token;
3028 }
3029 if (length $text) {
3030 $el->manakai_append_text ($text);
3031 }
3032
3033 $self->{content_model} = PCDATA_CONTENT_MODEL;
3034
3035 if ($token->{type} eq 'end tag' and
3036 $token->{tag_name} eq $tag_name) {
3037 ## Ignore the token
3038 } else {
3039 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
3040 }
3041 !!!next-token;
3042 return;
3043 } elsif ({
3044 iframe => 1,
3045 noembed => 1,
3046 noframes => 1,
3047 noscript => 0, ## TODO: 1 if scripting is enabled
3048 }->{$token->{tag_name}}) {
3049 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
3050 return;
3051 } elsif ($token->{tag_name} eq 'select') {
3052 $reconstruct_active_formatting_elements->($insert_to_current);
3053
3054 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
3055
3056 $self->{insertion_mode} = 'in select';
3057 !!!next-token;
3058 return;
3059 } elsif ({
3060 caption => 1, col => 1, colgroup => 1, frame => 1,
3061 frameset => 1, head => 1, option => 1, optgroup => 1,
3062 tbody => 1, td => 1, tfoot => 1, th => 1,
3063 thead => 1, tr => 1,
3064 }->{$token->{tag_name}}) {
3065 !!!parse-error (type => 'in body:'.$token->{tag_name});
3066 ## Ignore the token
3067 !!!next-token;
3068 return;
3069
3070 ## ISSUE: An issue on HTML5 new elements in the spec.
3071 } else {
3072 $reconstruct_active_formatting_elements->($insert_to_current);
3073
3074 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
3075
3076 !!!next-token;
3077 return;
3078 }
3079 } elsif ($token->{type} eq 'end tag') {
3080 if ($token->{tag_name} eq 'body') {
3081 if (@{$self->{open_elements}} > 1 and
3082 $self->{open_elements}->[1]->[1] eq 'body') {
3083 for (@{$self->{open_elements}}) {
3084 unless ({
3085 dd => 1, dt => 1, li => 1, p => 1, td => 1,
3086 th => 1, tr => 1, body => 1, html => 1,
3087 tbody => 1, tfoot => 1, thead => 1,
3088 }->{$_->[1]}) {
3089 !!!parse-error (type => 'not closed:'.$_->[1]);
3090 }
3091 }
3092
3093 $self->{insertion_mode} = 'after body';
3094 !!!next-token;
3095 return;
3096 } else {
3097 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3098 ## Ignore the token
3099 !!!next-token;
3100 return;
3101 }
3102 } elsif ($token->{tag_name} eq 'html') {
3103 if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
3104 ## ISSUE: There is an issue in the spec.
3105 if ($self->{open_elements}->[-1]->[1] ne 'body') {
3106 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[1]->[1]);
3107 }
3108 $self->{insertion_mode} = 'after body';
3109 ## reprocess
3110 return;
3111 } else {
3112 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3113 ## Ignore the token
3114 !!!next-token;
3115 return;
3116 }
3117 } elsif ({
3118 address => 1, blockquote => 1, center => 1, dir => 1,
3119 div => 1, dl => 1, fieldset => 1, listing => 1,
3120 menu => 1, ol => 1, pre => 1, ul => 1,
3121 p => 1,
3122 dd => 1, dt => 1, li => 1,
3123 button => 1, marquee => 1, object => 1,
3124 }->{$token->{tag_name}}) {
3125 ## has an element in scope
3126 my $i;
3127 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3128 my $node = $self->{open_elements}->[$_];
3129 if ($node->[1] eq $token->{tag_name}) {
3130 ## generate implied end tags
3131 if ({
3132 dd => ($token->{tag_name} ne 'dd'),
3133 dt => ($token->{tag_name} ne 'dt'),
3134 li => ($token->{tag_name} ne 'li'),
3135 p => ($token->{tag_name} ne 'p'),
3136 td => 1, th => 1, tr => 1,
3137 tbody => 1, tfoot=> 1, thead => 1,
3138 }->{$self->{open_elements}->[-1]->[1]}) {
3139 !!!back-token;
3140 $token = {type => 'end tag',
3141 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3142 return;
3143 }
3144 $i = $_;
3145 last INSCOPE unless $token->{tag_name} eq 'p';
3146 } elsif ({
3147 table => 1, caption => 1, td => 1, th => 1,
3148 button => 1, marquee => 1, object => 1, html => 1,
3149 }->{$node->[1]}) {
3150 last INSCOPE;
3151 }
3152 } # INSCOPE
3153
3154 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3155 if (defined $i) {
3156 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3157 } else {
3158 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3159 }
3160 }
3161
3162 if (defined $i) {
3163 splice @{$self->{open_elements}}, $i;
3164 } elsif ($token->{tag_name} eq 'p') {
3165 ## As if <p>, then reprocess the current token
3166 my $el;
3167 !!!create-element ($el, 'p');
3168 $insert->($el);
3169 }
3170 $clear_up_to_marker->()
3171 if {
3172 button => 1, marquee => 1, object => 1,
3173 }->{$token->{tag_name}};
3174 !!!next-token;
3175 return;
3176 } elsif ($token->{tag_name} eq 'form') {
3177 ## has an element in scope
3178 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3179 my $node = $self->{open_elements}->[$_];
3180 if ($node->[1] eq $token->{tag_name}) {
3181 ## generate implied end tags
3182 if ({
3183 dd => 1, dt => 1, li => 1, p => 1,
3184 td => 1, th => 1, tr => 1,
3185 tbody => 1, tfoot=> 1, thead => 1,
3186 }->{$self->{open_elements}->[-1]->[1]}) {
3187 !!!back-token;
3188 $token = {type => 'end tag',
3189 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3190 return;
3191 }
3192 last INSCOPE;
3193 } elsif ({
3194 table => 1, caption => 1, td => 1, th => 1,
3195 button => 1, marquee => 1, object => 1, html => 1,
3196 }->{$node->[1]}) {
3197 last INSCOPE;
3198 }
3199 } # INSCOPE
3200
3201 if ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
3202 pop @{$self->{open_elements}};
3203 } else {
3204 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3205 }
3206
3207 undef $self->{form_element};
3208 !!!next-token;
3209 return;
3210 } elsif ({
3211 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
3212 }->{$token->{tag_name}}) {
3213 ## has an element in scope
3214 my $i;
3215 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3216 my $node = $self->{open_elements}->[$_];
3217 if ({
3218 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
3219 }->{$node->[1]}) {
3220 ## generate implied end tags
3221 if ({
3222 dd => 1, dt => 1, li => 1, p => 1,
3223 td => 1, th => 1, tr => 1,
3224 tbody => 1, tfoot=> 1, thead => 1,
3225 }->{$self->{open_elements}->[-1]->[1]}) {
3226 !!!back-token;
3227 $token = {type => 'end tag',
3228 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3229 return;
3230 }
3231 $i = $_;
3232 last INSCOPE;
3233 } elsif ({
3234 table => 1, caption => 1, td => 1, th => 1,
3235 button => 1, marquee => 1, object => 1, html => 1,
3236 }->{$node->[1]}) {
3237 last INSCOPE;
3238 }
3239 } # INSCOPE
3240
3241 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3242 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3243 }
3244
3245 splice @{$self->{open_elements}}, $i if defined $i;
3246 !!!next-token;
3247 return;
3248 } elsif ({
3249 a => 1,
3250 b => 1, big => 1, em => 1, font => 1, i => 1,
3251 nobr => 1, s => 1, small => 1, strile => 1,
3252 strong => 1, tt => 1, u => 1,
3253 }->{$token->{tag_name}}) {
3254 $formatting_end_tag->($token->{tag_name});
3255 return;
3256 } elsif ($token->{tag_name} eq 'br') {
3257 !!!parse-error (type => 'unmatched end tag:br');
3258
3259 ## As if <br>
3260 $reconstruct_active_formatting_elements->($insert_to_current);
3261
3262 my $el;
3263 !!!create-element ($el, 'br');
3264 $insert->($el);
3265
3266 ## Ignore the token.
3267 !!!next-token;
3268 return;
3269 } elsif ({
3270 caption => 1, col => 1, colgroup => 1, frame => 1,
3271 frameset => 1, head => 1, option => 1, optgroup => 1,
3272 tbody => 1, td => 1, tfoot => 1, th => 1,
3273 thead => 1, tr => 1,
3274 area => 1, basefont => 1, bgsound => 1,
3275 embed => 1, hr => 1, iframe => 1, image => 1,
3276 img => 1, input => 1, isindex => 1, noembed => 1,
3277 noframes => 1, param => 1, select => 1, spacer => 1,
3278 table => 1, textarea => 1, wbr => 1,
3279 noscript => 0, ## TODO: if scripting is enabled
3280 }->{$token->{tag_name}}) {
3281 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3282 ## Ignore the token
3283 !!!next-token;
3284 return;
3285
3286 ## ISSUE: Issue on HTML5 new elements in spec
3287
3288 } else {
3289 ## Step 1
3290 my $node_i = -1;
3291 my $node = $self->{open_elements}->[$node_i];
3292
3293 ## Step 2
3294 S2: {
3295 if ($node->[1] eq $token->{tag_name}) {
3296 ## Step 1
3297 ## generate implied end tags
3298 if ({
3299 dd => 1, dt => 1, li => 1, p => 1,
3300 td => 1, th => 1, tr => 1,
3301 tbody => 1, tfoot=> 1, thead => 1,
3302 }->{$self->{open_elements}->[-1]->[1]}) {
3303 !!!back-token;
3304 $token = {type => 'end tag',
3305 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3306 return;
3307 }
3308
3309 ## Step 2
3310 if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
3311 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3312 }
3313
3314 ## Step 3
3315 splice @{$self->{open_elements}}, $node_i;
3316
3317 !!!next-token;
3318 last S2;
3319 } else {
3320 ## Step 3
3321 if (not $formatting_category->{$node->[1]} and
3322 #not $phrasing_category->{$node->[1]} and
3323 ($special_category->{$node->[1]} or
3324 $scoping_category->{$node->[1]})) {
3325 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3326 ## Ignore the token
3327 !!!next-token;
3328 last S2;
3329 }
3330 }
3331
3332 ## Step 4
3333 $node_i--;
3334 $node = $self->{open_elements}->[$node_i];
3335
3336 ## Step 5;
3337 redo S2;
3338 } # S2
3339 return;
3340 }
3341 }
3342 }; # $in_body
3343
3344 B: {
3345 if ($token->{type} eq 'DOCTYPE') {
3346 !!!parse-error (type => 'DOCTYPE in the middle');
3347 ## Ignore the token
3348 ## Stay in the phase
3349 !!!next-token;
3350 redo B;
3351 } elsif ($token->{type} eq 'end-of-file') {
3352 if ($token->{insertion_mode} ne 'trailing end') {
3353 ## Generate implied end tags
3354 if ({
3355 dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
3356 tbody => 1, tfoot=> 1, thead => 1,
3357 }->{$self->{open_elements}->[-1]->[1]}) {
3358 !!!back-token;
3359 $token = {type => 'end tag', tag_name => $self->{open_elements}->[-1]->[1]};
3360 redo B;
3361 }
3362
3363 if (@{$self->{open_elements}} > 2 or
3364 (@{$self->{open_elements}} == 2 and $self->{open_elements}->[1]->[1] ne 'body')) {
3365 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3366 } elsif (defined $self->{inner_html_node} and
3367 @{$self->{open_elements}} > 1 and
3368 $self->{open_elements}->[1]->[1] ne 'body') {
3369 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3370 }
3371
3372 ## ISSUE: There is an issue in the spec.
3373 }
3374
3375 ## Stop parsing
3376 last B;
3377 } elsif ($token->{type} eq 'start tag' and
3378 $token->{tag_name} eq 'html') {
3379 if ($self->{insertion_mode} eq 'trailing end') {
3380 ## Turn into the main phase
3381 !!!parse-error (type => 'after html:html');
3382 $self->{insertion_mode} = $previous_insertion_mode;
3383 }
3384
3385 ## ISSUE: "aa<html>" is not a parse error.
3386 ## ISSUE: "<html>" in fragment is not a parse error.
3387 unless ($token->{first_start_tag}) {
3388 !!!parse-error (type => 'not first start tag');
3389 }
3390 my $top_el = $self->{open_elements}->[0]->[0];
3391 for my $attr_name (keys %{$token->{attributes}}) {
3392 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
3393 $top_el->set_attribute_ns
3394 (undef, [undef, $attr_name],
3395 $token->{attributes}->{$attr_name}->{value});
3396 }
3397 }
3398 !!!next-token;
3399 redo B;
3400 } elsif ($token->{type} eq 'comment') {
3401 my $comment = $self->{document}->create_comment ($token->{data});
3402 if ($self->{insertion_mode} eq 'trailing end') {
3403 $self->{document}->append_child ($comment);
3404 } elsif ($self->{insertion_mode} eq 'after body') {
3405 $self->{open_elements}->[0]->[0]->append_child ($comment);
3406 } else {
3407 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3408 }
3409 !!!next-token;
3410 redo B;
3411 } elsif ($self->{insertion_mode} eq 'before head') {
3412 if ($token->{type} eq 'character') {
3413 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3414 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3415 unless (length $token->{data}) {
3416 !!!next-token;
3417 redo B;
3418 }
3419 }
3420 ## As if <head>
3421 !!!create-element ($self->{head_element}, 'head');
3422 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3423 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3424 $self->{insertion_mode} = 'in head';
3425 ## reprocess
3426 redo B;
3427 } elsif ($token->{type} eq 'start tag') {
3428 my $attr = $token->{tag_name} eq 'head' ? $token->{attributes} : {};
3429 !!!create-element ($self->{head_element}, 'head', $attr);
3430 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3431 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3432 $self->{insertion_mode} = 'in head';
3433 if ($token->{tag_name} eq 'head') {
3434 !!!next-token;
3435 #} elsif ({
3436 # base => 1, link => 1, meta => 1,
3437 # script => 1, style => 1, title => 1,
3438 # }->{$token->{tag_name}}) {
3439 # ## reprocess
3440 } else {
3441 ## reprocess
3442 }
3443 redo B;
3444 } elsif ($token->{type} eq 'end tag') {
3445 if ({
3446 head => 1, body => 1, html => 1,
3447 p => 1, br => 1,
3448 }->{$token->{tag_name}}) {
3449 ## As if <head>
3450 !!!create-element ($self->{head_element}, 'head');
3451 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3452 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3453 $self->{insertion_mode} = 'in head';
3454 ## reprocess
3455 redo B;
3456 } else {
3457 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3458 ## Ignore the token ## ISSUE: An issue in the spec.
3459 !!!next-token;
3460 redo B;
3461 }
3462 } else {
3463 die "$0: $token->{type}: Unknown type";
3464 }
3465 } elsif ($self->{insertion_mode} eq 'in head' or
3466 $self->{insertion_mode} eq 'in head noscript' or
3467 $self->{insertion_mode} eq 'after head') {
3468 if ($token->{type} eq 'character') {
3469 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3470 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3471 unless (length $token->{data}) {
3472 !!!next-token;
3473 redo B;
3474 }
3475 }
3476
3477 #
3478 } elsif ($token->{type} eq 'start tag') {
3479 if ({base => ($self->{insertion_mode} eq 'in head' or
3480 $self->{insertion_mode} eq 'after head'),
3481 link => 1}->{$token->{tag_name}}) {
3482 ## NOTE: There is a "as if in head" code clone.
3483 if ($self->{insertion_mode} eq 'after head') {
3484 !!!parse-error (type => 'after head:'.$token->{tag_name});
3485 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3486 }
3487 !!!insert-element ($token->{tag_name}, $token->{attributes});
3488 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3489 pop @{$self->{open_elements}}
3490 if $self->{insertion_mode} eq 'after head';
3491 !!!next-token;
3492 redo B;
3493 } elsif ($token->{tag_name} eq 'meta') {
3494 ## NOTE: There is a "as if in head" code clone.
3495 if ($self->{insertion_mode} eq 'after head') {
3496 !!!parse-error (type => 'after head:'.$token->{tag_name});
3497 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3498 }
3499 !!!insert-element ($token->{tag_name}, $token->{attributes});
3500 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3501
3502 unless ($self->{confident}) {
3503 my $charset;
3504 if ($token->{attributes}->{charset}) { ## TODO: And if supported
3505 $charset = $token->{attributes}->{charset}->{value};
3506 }
3507 if ($token->{attributes}->{'http-equiv'}) {
3508 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
3509 if ($token->{attributes}->{'http-equiv'}->{value}
3510 =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
3511 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
3512 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
3513 $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
3514 } ## TODO: And if supported
3515 }
3516 ## TODO: Change the encoding
3517 }
3518
3519 ## TODO: Extracting |charset| from |meta|.
3520 pop @{$self->{open_elements}}
3521 if $self->{insertion_mode} eq 'after head';
3522 !!!next-token;
3523 redo B;
3524 } elsif ($token->{tag_name} eq 'title' and
3525 $self->{insertion_mode} eq 'in head') {
3526 ## NOTE: There is a "as if in head" code clone.
3527 if ($self->{insertion_mode} eq 'after head') {
3528 !!!parse-error (type => 'after head:'.$token->{tag_name});
3529 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3530 }
3531 my $parent = defined $self->{head_element} ? $self->{head_element}
3532 : $self->{open_elements}->[-1]->[0];
3533 $parse_rcdata->(RCDATA_CONTENT_MODEL,
3534 sub { $parent->append_child ($_[0]) });
3535 pop @{$self->{open_elements}}
3536 if $self->{insertion_mode} eq 'after head';
3537 redo B;
3538 } elsif ($token->{tag_name} eq 'style') {
3539 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
3540 ## insertion mode 'in head')
3541 ## NOTE: There is a "as if in head" code clone.
3542 if ($self->{insertion_mode} eq 'after head') {
3543 !!!parse-error (type => 'after head:'.$token->{tag_name});
3544 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3545 }
3546 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
3547 pop @{$self->{open_elements}}
3548 if $self->{insertion_mode} eq 'after head';
3549 redo B;
3550 } elsif ($token->{tag_name} eq 'noscript') {
3551 if ($self->{insertion_mode} eq 'in head') {
3552 ## NOTE: and scripting is disabled
3553 !!!insert-element ($token->{tag_name}, $token->{attributes});
3554 $self->{insertion_mode} = 'in head noscript';
3555 !!!next-token;
3556 redo B;
3557 } elsif ($self->{insertion_mode} eq 'in head noscript') {
3558 !!!parse-error (type => 'in noscript:noscript');
3559 ## Ignore the token
3560 !!!next-token;
3561 redo B;
3562 } else {
3563 #
3564 }
3565 } elsif ($token->{tag_name} eq 'head' and
3566 $self->{insertion_mode} ne 'after head') {
3567 !!!parse-error (type => 'in head:head'); # or in head noscript
3568 ## Ignore the token
3569 !!!next-token;
3570 redo B;
3571 } elsif ($self->{insertion_mode} ne 'in head noscript' and
3572 $token->{tag_name} eq 'script') {
3573 if ($self->{insertion_mode} eq 'after head') {
3574 !!!parse-error (type => 'after head:'.$token->{tag_name});
3575 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3576 }
3577 ## NOTE: There is a "as if in head" code clone.
3578 $script_start_tag->($insert_to_current);
3579 pop @{$self->{open_elements}}
3580 if $self->{insertion_mode} eq 'after head';
3581 redo B;
3582 } elsif ($self->{insertion_mode} eq 'after head' and
3583 $token->{tag_name} eq 'body') {
3584 !!!insert-element ('body', $token->{attributes});
3585 $self->{insertion_mode} = 'in body';
3586 !!!next-token;
3587 redo B;
3588 } elsif ($self->{insertion_mode} eq 'after head' and
3589 $token->{tag_name} eq 'frameset') {
3590 !!!insert-element ('frameset', $token->{attributes});
3591 $self->{insertion_mode} = 'in frameset';
3592 !!!next-token;
3593 redo B;
3594 } else {
3595 #
3596 }
3597 } elsif ($token->{type} eq 'end tag') {
3598 if ($self->{insertion_mode} eq 'in head' and
3599 $token->{tag_name} eq 'head') {
3600 pop @{$self->{open_elements}};
3601 $self->{insertion_mode} = 'after head';
3602 !!!next-token;
3603 redo B;
3604 } elsif ($self->{insertion_mode} eq 'in head noscript' and
3605 $token->{tag_name} eq 'noscript') {
3606 pop @{$self->{open_elements}};
3607 $self->{insertion_mode} = 'in head';
3608 !!!next-token;
3609 redo B;
3610 } elsif ($self->{insertion_mode} eq 'in head' and
3611 {
3612 body => 1, html => 1,
3613 p => 1, br => 1,
3614 }->{$token->{tag_name}}) {
3615 #
3616 } elsif ($self->{insertion_mode} eq 'in head noscript' and
3617 {
3618 p => 1, br => 1,
3619 }->{$token->{tag_name}}) {
3620 #
3621 } elsif ($self->{insertion_mode} ne 'after head') {
3622 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3623 ## Ignore the token
3624 !!!next-token;
3625 redo B;
3626 } else {
3627 #
3628 }
3629 } else {
3630 #
3631 }
3632
3633 ## As if </head> or </noscript> or <body>
3634 if ($self->{insertion_mode} eq 'in head') {
3635 pop @{$self->{open_elements}};
3636 $self->{insertion_mode} = 'after head';
3637 } elsif ($self->{insertion_mode} eq 'in head noscript') {
3638 pop @{$self->{open_elements}};
3639 !!!parse-error (type => 'in noscript:'.(defined $token->{tag_name} ? ($token->{type} eq 'end tag' ? '/' : '') . $token->{tag_name} : '#' . $token->{type}));
3640 $self->{insertion_mode} = 'in head';
3641 } else { # 'after head'
3642 !!!insert-element ('body');
3643 $self->{insertion_mode} = 'in body';
3644 }
3645 ## reprocess
3646 redo B;
3647
3648 ## ISSUE: An issue in the spec.
3649 } elsif ($self->{insertion_mode} eq 'in body' or
3650 $self->{insertion_mode} eq 'in cell' or
3651 $self->{insertion_mode} eq 'in caption') {
3652 if ($token->{type} eq 'character') {
3653 ## NOTE: There is a code clone of "character in body".
3654 $reconstruct_active_formatting_elements->($insert_to_current);
3655
3656 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3657
3658 !!!next-token;
3659 redo B;
3660 } elsif ($token->{type} eq 'start tag') {
3661 if ({
3662 caption => 1, col => 1, colgroup => 1, tbody => 1,
3663 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3664 }->{$token->{tag_name}}) {
3665 if ($self->{insertion_mode} eq 'in cell') {
3666 ## have an element in table scope
3667 my $tn;
3668 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3669 my $node = $self->{open_elements}->[$_];
3670 if ($node->[1] eq 'td' or $node->[1] eq 'th') {
3671 $tn = $node->[1];
3672 last INSCOPE;
3673 } elsif ({
3674 table => 1, html => 1,
3675 }->{$node->[1]}) {
3676 last INSCOPE;
3677 }
3678 } # INSCOPE
3679 unless (defined $tn) {
3680 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3681 ## Ignore the token
3682 !!!next-token;
3683 redo B;
3684 }
3685
3686 ## Close the cell
3687 !!!back-token; # <?>
3688 $token = {type => 'end tag', tag_name => $tn};
3689 redo B;
3690 } elsif ($self->{insertion_mode} eq 'in caption') {
3691 !!!parse-error (type => 'not closed:caption');
3692
3693 ## As if </caption>
3694 ## have a table element in table scope
3695 my $i;
3696 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3697 my $node = $self->{open_elements}->[$_];
3698 if ($node->[1] eq 'caption') {
3699 $i = $_;
3700 last INSCOPE;
3701 } elsif ({
3702 table => 1, html => 1,
3703 }->{$node->[1]}) {
3704 last INSCOPE;
3705 }
3706 } # INSCOPE
3707 unless (defined $i) {
3708 !!!parse-error (type => 'unmatched end tag:caption');
3709 ## Ignore the token
3710 !!!next-token;
3711 redo B;
3712 }
3713
3714 ## generate implied end tags
3715 if ({
3716 dd => 1, dt => 1, li => 1, p => 1,
3717 td => 1, th => 1, tr => 1,
3718 tbody => 1, tfoot=> 1, thead => 1,
3719 }->{$self->{open_elements}->[-1]->[1]}) {
3720 !!!back-token; # <?>
3721 $token = {type => 'end tag', tag_name => 'caption'};
3722 !!!back-token;
3723 $token = {type => 'end tag',
3724 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3725 redo B;
3726 }
3727
3728 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3729 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3730 }
3731
3732 splice @{$self->{open_elements}}, $i;
3733
3734 $clear_up_to_marker->();
3735
3736 $self->{insertion_mode} = 'in table';
3737
3738 ## reprocess
3739 redo B;
3740 } else {
3741 #
3742 }
3743 } else {
3744 #
3745 }
3746 } elsif ($token->{type} eq 'end tag') {
3747 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
3748 if ($self->{insertion_mode} eq 'in cell') {
3749 ## have an element in table scope
3750 my $i;
3751 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3752 my $node = $self->{open_elements}->[$_];
3753 if ($node->[1] eq $token->{tag_name}) {
3754 $i = $_;
3755 last INSCOPE;
3756 } elsif ({
3757 table => 1, html => 1,
3758 }->{$node->[1]}) {
3759 last INSCOPE;
3760 }
3761 } # INSCOPE
3762 unless (defined $i) {
3763 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3764 ## Ignore the token
3765 !!!next-token;
3766 redo B;
3767 }
3768
3769 ## generate implied end tags
3770 if ({
3771 dd => 1, dt => 1, li => 1, p => 1,
3772 td => ($token->{tag_name} eq 'th'),
3773 th => ($token->{tag_name} eq 'td'),
3774 tr => 1,
3775 tbody => 1, tfoot=> 1, thead => 1,
3776 }->{$self->{open_elements}->[-1]->[1]}) {
3777 !!!back-token;
3778 $token = {type => 'end tag',
3779 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3780 redo B;
3781 }
3782
3783 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3784 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3785 }
3786
3787 splice @{$self->{open_elements}}, $i;
3788
3789 $clear_up_to_marker->();
3790
3791 $self->{insertion_mode} = 'in row';
3792
3793 !!!next-token;
3794 redo B;
3795 } elsif ($self->{insertion_mode} eq 'in caption') {
3796 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3797 ## Ignore the token
3798 !!!next-token;
3799 redo B;
3800 } else {
3801 #
3802 }
3803 } elsif ($token->{tag_name} eq 'caption') {
3804 if ($self->{insertion_mode} eq 'in caption') {
3805 ## have a table element in table scope
3806 my $i;
3807 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3808 my $node = $self->{open_elements}->[$_];
3809 if ($node->[1] eq $token->{tag_name}) {
3810 $i = $_;
3811 last INSCOPE;
3812 } elsif ({
3813 table => 1, html => 1,
3814 }->{$node->[1]}) {
3815 last INSCOPE;
3816 }
3817 } # INSCOPE
3818 unless (defined $i) {
3819 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3820 ## Ignore the token
3821 !!!next-token;
3822 redo B;
3823 }
3824
3825 ## generate implied end tags
3826 if ({
3827 dd => 1, dt => 1, li => 1, p => 1,
3828 td => 1, th => 1, tr => 1,
3829 tbody => 1, tfoot=> 1, thead => 1,
3830 }->{$self->{open_elements}->[-1]->[1]}) {
3831 !!!back-token;
3832 $token = {type => 'end tag',
3833 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3834 redo B;
3835 }
3836
3837 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3838 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3839 }
3840
3841 splice @{$self->{open_elements}}, $i;
3842
3843 $clear_up_to_marker->();
3844
3845 $self->{insertion_mode} = 'in table';
3846
3847 !!!next-token;
3848 redo B;
3849 } elsif ($self->{insertion_mode} eq 'in cell') {
3850 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3851 ## Ignore the token
3852 !!!next-token;
3853 redo B;
3854 } else {
3855 #
3856 }
3857 } elsif ({
3858 table => 1, tbody => 1, tfoot => 1,
3859 thead => 1, tr => 1,
3860 }->{$token->{tag_name}} and
3861 $self->{insertion_mode} eq 'in cell') {
3862 ## have an element in table scope
3863 my $i;
3864 my $tn;
3865 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3866 my $node = $self->{open_elements}->[$_];
3867 if ($node->[1] eq $token->{tag_name}) {
3868 $i = $_;
3869 last INSCOPE;
3870 } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
3871 $tn = $node->[1];
3872 ## NOTE: There is exactly one |td| or |th| element
3873 ## in scope in the stack of open elements by definition.
3874 } elsif ({
3875 table => 1, html => 1,
3876 }->{$node->[1]}) {
3877 last INSCOPE;
3878 }
3879 } # INSCOPE
3880 unless (defined $i) {
3881 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3882 ## Ignore the token
3883 !!!next-token;
3884 redo B;
3885 }
3886
3887 ## Close the cell
3888 !!!back-token; # </?>
3889 $token = {type => 'end tag', tag_name => $tn};
3890 redo B;
3891 } elsif ($token->{tag_name} eq 'table' and
3892 $self->{insertion_mode} eq 'in caption') {
3893 !!!parse-error (type => 'not closed:caption');
3894
3895 ## As if </caption>
3896 ## have a table element in table scope
3897 my $i;
3898 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3899 my $node = $self->{open_elements}->[$_];
3900 if ($node->[1] eq 'caption') {
3901 $i = $_;
3902 last INSCOPE;
3903 } elsif ({
3904 table => 1, html => 1,
3905 }->{$node->[1]}) {
3906 last INSCOPE;
3907 }
3908 } # INSCOPE
3909 unless (defined $i) {
3910 !!!parse-error (type => 'unmatched end tag:caption');
3911 ## Ignore the token
3912 !!!next-token;
3913 redo B;
3914 }
3915
3916 ## generate implied end tags
3917 if ({
3918 dd => 1, dt => 1, li => 1, p => 1,
3919 td => 1, th => 1, tr => 1,
3920 tbody => 1, tfoot=> 1, thead => 1,
3921 }->{$self->{open_elements}->[-1]->[1]}) {
3922 !!!back-token; # </table>
3923 $token = {type => 'end tag', tag_name => 'caption'};
3924 !!!back-token;
3925 $token = {type => 'end tag',
3926 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3927 redo B;
3928 }
3929
3930 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3931 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3932 }
3933
3934 splice @{$self->{open_elements}}, $i;
3935
3936 $clear_up_to_marker->();
3937
3938 $self->{insertion_mode} = 'in table';
3939
3940 ## reprocess
3941 redo B;
3942 } elsif ({
3943 body => 1, col => 1, colgroup => 1, html => 1,
3944 }->{$token->{tag_name}}) {
3945 if ($self->{insertion_mode} eq 'in cell' or
3946 $self->{insertion_mode} eq 'in caption') {
3947 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3948 ## Ignore the token
3949 !!!next-token;
3950 redo B;
3951 } else {
3952 #
3953 }
3954 } elsif ({
3955 tbody => 1, tfoot => 1,
3956 thead => 1, tr => 1,
3957 }->{$token->{tag_name}} and
3958 $self->{insertion_mode} eq 'in caption') {
3959 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3960 ## Ignore the token
3961 !!!next-token;
3962 redo B;
3963 } else {
3964 #
3965 }
3966 } else {
3967 #
3968 }
3969
3970 $in_body->($insert_to_current);
3971 redo B;
3972 } elsif ($self->{insertion_mode} eq 'in table') {
3973 if ($token->{type} eq 'character') {
3974 ## NOTE: There are "character in table" code clones.
3975 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3976 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3977
3978 unless (length $token->{data}) {
3979 !!!next-token;
3980 redo B;
3981 }
3982 }
3983
3984 !!!parse-error (type => 'in table:#character');
3985
3986 ## As if in body, but insert into the foster parent element
3987 ## ISSUE: The spec says "whenever a node would be inserted
3988 ## into the current node", but characters might not
3989 ## result in a new Text node.
3990 $reconstruct_active_formatting_elements->($insert_to_foster);
3991
3992 if ({
3993 table => 1, tbody => 1, tfoot => 1,
3994 thead => 1, tr => 1,
3995 }->{$self->{open_elements}->[-1]->[1]}) {
3996 # MUST
3997 my $foster_parent_element;
3998 my $next_sibling;
3999 my $prev_sibling;
4000 OE: for (reverse 0..$#{$self->{open_elements}}) {
4001 if ($self->{open_elements}->[$_]->[1] eq 'table') {
4002 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4003 if (defined $parent and $parent->node_type == 1) {
4004 $foster_parent_element = $parent;
4005 $next_sibling = $self->{open_elements}->[$_]->[0];
4006 $prev_sibling = $next_sibling->previous_sibling;
4007 } else {
4008 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
4009 $prev_sibling = $foster_parent_element->last_child;
4010 }
4011 last OE;
4012 }
4013 } # OE
4014 $foster_parent_element = $self->{open_elements}->[0]->[0] and
4015 $prev_sibling = $foster_parent_element->last_child
4016 unless defined $foster_parent_element;
4017 if (defined $prev_sibling and
4018 $prev_sibling->node_type == 3) {
4019 $prev_sibling->manakai_append_text ($token->{data});
4020 } else {
4021 $foster_parent_element->insert_before
4022 ($self->{document}->create_text_node ($token->{data}),
4023 $next_sibling);
4024 }
4025 } else {
4026 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4027 }
4028
4029 !!!next-token;
4030 redo B;
4031 } elsif ($token->{type} eq 'start tag') {
4032 if ({
4033 caption => 1,
4034 colgroup => 1,
4035 tbody => 1, tfoot => 1, thead => 1,
4036 }->{$token->{tag_name}}) {
4037 ## Clear back to table context
4038 while ($self->{open_elements}->[-1]->[1] ne 'table' and
4039 $self->{open_elements}->[-1]->[1] ne 'html') {
4040 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4041 pop @{$self->{open_elements}};
4042 }
4043
4044 push @$active_formatting_elements, ['#marker', '']
4045 if $token->{tag_name} eq 'caption';
4046
4047 !!!insert-element ($token->{tag_name}, $token->{attributes});
4048 $self->{insertion_mode} = {
4049 caption => 'in caption',
4050 colgroup => 'in column group',
4051 tbody => 'in table body',
4052 tfoot => 'in table body',
4053 thead => 'in table body',
4054 }->{$token->{tag_name}};
4055 !!!next-token;
4056 redo B;
4057 } elsif ({
4058 col => 1,
4059 td => 1, th => 1, tr => 1,
4060 }->{$token->{tag_name}}) {
4061 ## Clear back to table context
4062 while ($self->{open_elements}->[-1]->[1] ne 'table' and
4063 $self->{open_elements}->[-1]->[1] ne 'html') {
4064 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4065 pop @{$self->{open_elements}};
4066 }
4067
4068 !!!insert-element ($token->{tag_name} eq 'col' ? 'colgroup' : 'tbody');
4069 $self->{insertion_mode} = $token->{tag_name} eq 'col'
4070 ? 'in column group' : 'in table body';
4071 ## reprocess
4072 redo B;
4073 } elsif ($token->{tag_name} eq 'table') {
4074 ## NOTE: There are code clones for this "table in table"
4075 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4076
4077 ## As if </table>
4078 ## have a table element in table scope
4079 my $i;
4080 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4081 my $node = $self->{open_elements}->[$_];
4082 if ($node->[1] eq 'table') {
4083 $i = $_;
4084 last INSCOPE;
4085 } elsif ({
4086 table => 1, html => 1,
4087 }->{$node->[1]}) {
4088 last INSCOPE;
4089 }
4090 } # INSCOPE
4091 unless (defined $i) {
4092 !!!parse-error (type => 'unmatched end tag:table');
4093 ## Ignore tokens </table><table>
4094 !!!next-token;
4095 redo B;
4096 }
4097
4098 ## generate implied end tags
4099 if ({
4100 dd => 1, dt => 1, li => 1, p => 1,
4101 td => 1, th => 1, tr => 1,
4102 tbody => 1, tfoot=> 1, thead => 1,
4103 }->{$self->{open_elements}->[-1]->[1]}) {
4104 !!!back-token; # <table>
4105 $token = {type => 'end tag', tag_name => 'table'};
4106 !!!back-token;
4107 $token = {type => 'end tag',
4108 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4109 redo B;
4110 }
4111
4112 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4113 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4114 }
4115
4116 splice @{$self->{open_elements}}, $i;
4117
4118 $self->_reset_insertion_mode;
4119
4120 ## reprocess
4121 redo B;
4122 } else {
4123 #
4124 }
4125 } elsif ($token->{type} eq 'end tag') {
4126 if ($token->{tag_name} eq 'table') {
4127 ## have a table element in table scope
4128 my $i;
4129 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4130 my $node = $self->{open_elements}->[$_];
4131 if ($node->[1] eq $token->{tag_name}) {
4132 $i = $_;
4133 last INSCOPE;
4134 } elsif ({
4135 table => 1, html => 1,
4136 }->{$node->[1]}) {
4137 last INSCOPE;
4138 }
4139 } # INSCOPE
4140 unless (defined $i) {
4141 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4142 ## Ignore the token
4143 !!!next-token;
4144 redo B;
4145 }
4146
4147 ## generate implied end tags
4148 if ({
4149 dd => 1, dt => 1, li => 1, p => 1,
4150 td => 1, th => 1, tr => 1,
4151 tbody => 1, tfoot=> 1, thead => 1,
4152 }->{$self->{open_elements}->[-1]->[1]}) {
4153 !!!back-token;
4154 $token = {type => 'end tag',
4155 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4156 redo B;
4157 }
4158
4159 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4160 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4161 }
4162
4163 splice @{$self->{open_elements}}, $i;
4164
4165 $self->_reset_insertion_mode;
4166
4167 !!!next-token;
4168 redo B;
4169 } elsif ({
4170 body => 1, caption => 1, col => 1, colgroup => 1,
4171 html => 1, tbody => 1, td => 1, tfoot => 1, th => 1,
4172 thead => 1, tr => 1,
4173 }->{$token->{tag_name}}) {
4174 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4175 ## Ignore the token
4176 !!!next-token;
4177 redo B;
4178 } else {
4179 #
4180 }
4181 } else {
4182 #
4183 }
4184
4185 !!!parse-error (type => 'in table:'.$token->{tag_name});
4186 $in_body->($insert_to_foster);
4187 redo B;
4188 } elsif ($self->{insertion_mode} eq 'in column group') {
4189 if ($token->{type} eq 'character') {
4190 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4191 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4192 unless (length $token->{data}) {
4193 !!!next-token;
4194 redo B;
4195 }
4196 }
4197
4198 #
4199 } elsif ($token->{type} eq 'start tag') {
4200 if ($token->{tag_name} eq 'col') {
4201 !!!insert-element ($token->{tag_name}, $token->{attributes});
4202 pop @{$self->{open_elements}};
4203 !!!next-token;
4204 redo B;
4205 } else {
4206 #
4207 }
4208 } elsif ($token->{type} eq 'end tag') {
4209 if ($token->{tag_name} eq 'colgroup') {
4210 if ($self->{open_elements}->[-1]->[1] eq 'html') {
4211 !!!parse-error (type => 'unmatched end tag:colgroup');
4212 ## Ignore the token
4213 !!!next-token;
4214 redo B;
4215 } else {
4216 pop @{$self->{open_elements}}; # colgroup
4217 $self->{insertion_mode} = 'in table';
4218 !!!next-token;
4219 redo B;
4220 }
4221 } elsif ($token->{tag_name} eq 'col') {
4222 !!!parse-error (type => 'unmatched end tag:col');
4223 ## Ignore the token
4224 !!!next-token;
4225 redo B;
4226 } else {
4227 #
4228 }
4229 } else {
4230 #
4231 }
4232
4233 ## As if </colgroup>
4234 if ($self->{open_elements}->[-1]->[1] eq 'html') {
4235 !!!parse-error (type => 'unmatched end tag:colgroup');
4236 ## Ignore the token
4237 !!!next-token;
4238 redo B;
4239 } else {
4240 pop @{$self->{open_elements}}; # colgroup
4241 $self->{insertion_mode} = 'in table';
4242 ## reprocess
4243 redo B;
4244 }
4245 } elsif ($self->{insertion_mode} eq 'in table body') {
4246 if ($token->{type} eq 'character') {
4247 ## NOTE: This is a "character in table" code clone.
4248 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4249 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4250
4251 unless (length $token->{data}) {
4252 !!!next-token;
4253 redo B;
4254 }
4255 }
4256
4257 !!!parse-error (type => 'in table:#character');
4258
4259 ## As if in body, but insert into the foster parent element
4260 ## ISSUE: The spec says "whenever a node would be inserted
4261 ## into the current node", but characters might not
4262 ## result in a new Text node.
4263 $reconstruct_active_formatting_elements->($insert_to_foster);
4264
4265 if ({
4266 table => 1, tbody => 1, tfoot => 1,
4267 thead => 1, tr => 1,
4268 }->{$self->{open_elements}->[-1]->[1]}) {
4269 # MUST
4270 my $foster_parent_element;
4271 my $next_sibling;
4272 my $prev_sibling;
4273 OE: for (reverse 0..$#{$self->{open_elements}}) {
4274 if ($self->{open_elements}->[$_]->[1] eq 'table') {
4275 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4276 if (defined $parent and $parent->node_type == 1) {
4277 $foster_parent_element = $parent;
4278 $next_sibling = $self->{open_elements}->[$_]->[0];
4279 $prev_sibling = $next_sibling->previous_sibling;
4280 } else {
4281 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
4282 $prev_sibling = $foster_parent_element->last_child;
4283 }
4284 last OE;
4285 }
4286 } # OE
4287 $foster_parent_element = $self->{open_elements}->[0]->[0] and
4288 $prev_sibling = $foster_parent_element->last_child
4289 unless defined $foster_parent_element;
4290 if (defined $prev_sibling and
4291 $prev_sibling->node_type == 3) {
4292 $prev_sibling->manakai_append_text ($token->{data});
4293 } else {
4294 $foster_parent_element->insert_before
4295 ($self->{document}->create_text_node ($token->{data}),
4296 $next_sibling);
4297 }
4298 } else {
4299 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4300 }
4301
4302 !!!next-token;
4303 redo B;
4304 } elsif ($token->{type} eq 'start tag') {
4305 if ({
4306 tr => 1,
4307 th => 1, td => 1,
4308 }->{$token->{tag_name}}) {
4309 unless ($token->{tag_name} eq 'tr') {
4310 !!!parse-error (type => 'missing start tag:tr');
4311 }
4312
4313 ## Clear back to table body context
4314 while (not {
4315 tbody => 1, tfoot => 1, thead => 1, html => 1,
4316 }->{$self->{open_elements}->[-1]->[1]}) {
4317 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4318 pop @{$self->{open_elements}};
4319 }
4320
4321 $self->{insertion_mode} = 'in row';
4322 if ($token->{tag_name} eq 'tr') {
4323 !!!insert-element ($token->{tag_name}, $token->{attributes});
4324 !!!next-token;
4325 } else {
4326 !!!insert-element ('tr');
4327 ## reprocess
4328 }
4329 redo B;
4330 } elsif ({
4331 caption => 1, col => 1, colgroup => 1,
4332 tbody => 1, tfoot => 1, thead => 1,
4333 }->{$token->{tag_name}}) {
4334 ## have an element in table scope
4335 my $i;
4336 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4337 my $node = $self->{open_elements}->[$_];
4338 if ({
4339 tbody => 1, thead => 1, tfoot => 1,
4340 }->{$node->[1]}) {
4341 $i = $_;
4342 last INSCOPE;
4343 } elsif ({
4344 table => 1, html => 1,
4345 }->{$node->[1]}) {
4346 last INSCOPE;
4347 }
4348 } # INSCOPE
4349 unless (defined $i) {
4350 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4351 ## Ignore the token
4352 !!!next-token;
4353 redo B;
4354 }
4355
4356 ## Clear back to table body context
4357 while (not {
4358 tbody => 1, tfoot => 1, thead => 1, html => 1,
4359 }->{$self->{open_elements}->[-1]->[1]}) {
4360 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4361 pop @{$self->{open_elements}};
4362 }
4363
4364 ## As if <{current node}>
4365 ## have an element in table scope
4366 ## true by definition
4367
4368 ## Clear back to table body context
4369 ## nop by definition
4370
4371 pop @{$self->{open_elements}};
4372 $self->{insertion_mode} = 'in table';
4373 ## reprocess
4374 redo B;
4375 } elsif ($token->{tag_name} eq 'table') {
4376 ## NOTE: This is a code clone of "table in table"
4377 !!!parse-error (type => 'not closed:table');
4378
4379 ## As if </table>
4380 ## have a table element in table scope
4381 my $i;
4382 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4383 my $node = $self->{open_elements}->[$_];
4384 if ($node->[1] eq 'table') {
4385 $i = $_;
4386 last INSCOPE;
4387 } elsif ({
4388 table => 1, html => 1,
4389 }->{$node->[1]}) {
4390 last INSCOPE;
4391 }
4392 } # INSCOPE
4393 unless (defined $i) {
4394 !!!parse-error (type => 'unmatched end tag:table');
4395 ## Ignore tokens </table><table>
4396 !!!next-token;
4397 redo B;
4398 }
4399
4400 ## generate implied end tags
4401 if ({
4402 dd => 1, dt => 1, li => 1, p => 1,
4403 td => 1, th => 1, tr => 1,
4404 tbody => 1, tfoot=> 1, thead => 1,
4405 }->{$self->{open_elements}->[-1]->[1]}) {
4406 !!!back-token; # <table>
4407 $token = {type => 'end tag', tag_name => 'table'};
4408 !!!back-token;
4409 $token = {type => 'end tag',
4410 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4411 redo B;
4412 }
4413
4414 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4415 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4416 }
4417
4418 splice @{$self->{open_elements}}, $i;
4419
4420 $self->_reset_insertion_mode;
4421
4422 ## reprocess
4423 redo B;
4424 } else {
4425 #
4426 }
4427 } elsif ($token->{type} eq 'end tag') {
4428 if ({
4429 tbody => 1, tfoot => 1, thead => 1,
4430 }->{$token->{tag_name}}) {
4431 ## have an element in table scope
4432 my $i;
4433 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4434 my $node = $self->{open_elements}->[$_];
4435 if ($node->[1] eq $token->{tag_name}) {
4436 $i = $_;
4437 last INSCOPE;
4438 } elsif ({
4439 table => 1, html => 1,
4440 }->{$node->[1]}) {
4441 last INSCOPE;
4442 }
4443 } # INSCOPE
4444 unless (defined $i) {
4445 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4446 ## Ignore the token
4447 !!!next-token;
4448 redo B;
4449 }
4450
4451 ## Clear back to table body context
4452 while (not {
4453 tbody => 1, tfoot => 1, thead => 1, html => 1,
4454 }->{$self->{open_elements}->[-1]->[1]}) {
4455 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4456 pop @{$self->{open_elements}};
4457 }
4458
4459 pop @{$self->{open_elements}};
4460 $self->{insertion_mode} = 'in table';
4461 !!!next-token;
4462 redo B;
4463 } elsif ($token->{tag_name} eq 'table') {
4464 ## have an element in table scope
4465 my $i;
4466 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4467 my $node = $self->{open_elements}->[$_];
4468 if ({
4469 tbody => 1, thead => 1, tfoot => 1,
4470 }->{$node->[1]}) {
4471 $i = $_;
4472 last INSCOPE;
4473 } elsif ({
4474 table => 1, html => 1,
4475 }->{$node->[1]}) {
4476 last INSCOPE;
4477 }
4478 } # INSCOPE
4479 unless (defined $i) {
4480 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4481 ## Ignore the token
4482 !!!next-token;
4483 redo B;
4484 }
4485
4486 ## Clear back to table body context
4487 while (not {
4488 tbody => 1, tfoot => 1, thead => 1, html => 1,
4489 }->{$self->{open_elements}->[-1]->[1]}) {
4490 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4491 pop @{$self->{open_elements}};
4492 }
4493
4494 ## As if <{current node}>
4495 ## have an element in table scope
4496 ## true by definition
4497
4498 ## Clear back to table body context
4499 ## nop by definition
4500
4501 pop @{$self->{open_elements}};
4502 $self->{insertion_mode} = 'in table';
4503 ## reprocess
4504 redo B;
4505 } elsif ({
4506 body => 1, caption => 1, col => 1, colgroup => 1,
4507 html => 1, td => 1, th => 1, tr => 1,
4508 }->{$token->{tag_name}}) {
4509 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4510 ## Ignore the token
4511 !!!next-token;
4512 redo B;
4513 } else {
4514 #
4515 }
4516 } else {
4517 #
4518 }
4519
4520 ## As if in table
4521 !!!parse-error (type => 'in table:'.$token->{tag_name});
4522 $in_body->($insert_to_foster);
4523 redo B;
4524 } elsif ($self->{insertion_mode} eq 'in row') {
4525 if ($token->{type} eq 'character') {
4526 ## NOTE: This is a "character in table" code clone.
4527 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4528 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4529
4530 unless (length $token->{data}) {
4531 !!!next-token;
4532 redo B;
4533 }
4534 }
4535
4536 !!!parse-error (type => 'in table:#character');
4537
4538 ## As if in body, but insert into the foster parent element
4539 ## ISSUE: The spec says "whenever a node would be inserted
4540 ## into the current node", but characters might not
4541 ## result in a new Text node.
4542 $reconstruct_active_formatting_elements->($insert_to_foster);
4543
4544 if ({
4545 table => 1, tbody => 1, tfoot => 1,
4546 thead => 1, tr => 1,
4547 }->{$self->{open_elements}->[-1]->[1]}) {
4548 # MUST
4549 my $foster_parent_element;
4550 my $next_sibling;
4551 my $prev_sibling;
4552 OE: for (reverse 0..$#{$self->{open_elements}}) {
4553 if ($self->{open_elements}->[$_]->[1] eq 'table') {
4554 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4555 if (defined $parent and $parent->node_type == 1) {
4556 $foster_parent_element = $parent;
4557 $next_sibling = $self->{open_elements}->[$_]->[0];
4558 $prev_sibling = $next_sibling->previous_sibling;
4559 } else {
4560 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
4561 $prev_sibling = $foster_parent_element->last_child;
4562 }
4563 last OE;
4564 }
4565 } # OE
4566 $foster_parent_element = $self->{open_elements}->[0]->[0] and
4567 $prev_sibling = $foster_parent_element->last_child
4568 unless defined $foster_parent_element;
4569 if (defined $prev_sibling and
4570 $prev_sibling->node_type == 3) {
4571 $prev_sibling->manakai_append_text ($token->{data});
4572 } else {
4573 $foster_parent_element->insert_before
4574 ($self->{document}->create_text_node ($token->{data}),
4575 $next_sibling);
4576 }
4577 } else {
4578 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4579 }
4580
4581 !!!next-token;
4582 redo B;
4583 } elsif ($token->{type} eq 'start tag') {
4584 if ($token->{tag_name} eq 'th' or
4585 $token->{tag_name} eq 'td') {
4586 ## Clear back to table row context
4587 while (not {
4588 tr => 1, html => 1,
4589 }->{$self->{open_elements}->[-1]->[1]}) {
4590 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4591 pop @{$self->{open_elements}};
4592 }
4593
4594 !!!insert-element ($token->{tag_name}, $token->{attributes});
4595 $self->{insertion_mode} = 'in cell';
4596
4597 push @$active_formatting_elements, ['#marker', ''];
4598
4599 !!!next-token;
4600 redo B;
4601 } elsif ({
4602 caption => 1, col => 1, colgroup => 1,
4603 tbody => 1, tfoot => 1, thead => 1, tr => 1,
4604 }->{$token->{tag_name}}) {
4605 ## As if </tr>
4606 ## have an element in table scope
4607 my $i;
4608 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4609 my $node = $self->{open_elements}->[$_];
4610 if ($node->[1] eq 'tr') {
4611 $i = $_;
4612 last INSCOPE;
4613 } elsif ({
4614 table => 1, html => 1,
4615 }->{$node->[1]}) {
4616 last INSCOPE;
4617 }
4618 } # INSCOPE
4619 unless (defined $i) {
4620 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4621 ## Ignore the token: no |tr| element is in table scope, so the implied </tr> cannot be processed
4622 !!!next-token;
4623 redo B;
4624 }
4625
4626 ## Clear back to table row context
4627 while (not {
4628 tr => 1, html => 1,
4629 }->{$self->{open_elements}->[-1]->[1]}) {
4630 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4631 pop @{$self->{open_elements}};
4632 }
4633
4634 pop @{$self->{open_elements}}; # tr
4635 $self->{insertion_mode} = 'in table body';
4636 ## reprocess
4637 redo B;
4638 } elsif ($token->{tag_name} eq 'table') {
4639 ## NOTE: This is a code clone of "table in table"
4640 !!!parse-error (type => 'not closed:table');
4641
4642 ## As if </table>
4643 ## have a table element in table scope
4644 my $i;
4645 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4646 my $node = $self->{open_elements}->[$_];
4647 if ($node->[1] eq 'table') {
4648 $i = $_;
4649 last INSCOPE;
4650 } elsif ({
4651 table => 1, html => 1,
4652 }->{$node->[1]}) {
4653 last INSCOPE;
4654 }
4655 } # INSCOPE
4656 unless (defined $i) {
4657 !!!parse-error (type => 'unmatched end tag:table');
4658 ## Ignore tokens </table><table>
4659 !!!next-token;
4660 redo B;
4661 }
4662
4663 ## generate implied end tags
4664 if ({
4665 dd => 1, dt => 1, li => 1, p => 1,
4666 td => 1, th => 1, tr => 1,
4667 tbody => 1, tfoot=> 1, thead => 1,
4668 }->{$self->{open_elements}->[-1]->[1]}) {
4669 !!!back-token; # <table>
4670 $token = {type => 'end tag', tag_name => 'table'};
4671 !!!back-token;
4672 $token = {type => 'end tag',
4673 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4674 redo B;
4675 }
4676
4677 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4678 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4679 }
4680
4681 splice @{$self->{open_elements}}, $i;
4682
4683 $self->_reset_insertion_mode;
4684
4685 ## reprocess
4686 redo B;
4687 } else {
4688 #
4689 }
4690 } elsif ($token->{type} eq 'end tag') {
4691 if ($token->{tag_name} eq 'tr') {
4692 ## have an element in table scope
4693 my $i;
4694 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4695 my $node = $self->{open_elements}->[$_];
4696 if ($node->[1] eq $token->{tag_name}) {
4697 $i = $_;
4698 last INSCOPE;
4699 } elsif ({
4700 table => 1, html => 1,
4701 }->{$node->[1]}) {
4702 last INSCOPE;
4703 }
4704 } # INSCOPE
4705 unless (defined $i) {
4706 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4707 ## Ignore the token
4708 !!!next-token;
4709 redo B;
4710 }
4711
4712 ## Clear back to table row context
4713 while (not {
4714 tr => 1, html => 1,
4715 }->{$self->{open_elements}->[-1]->[1]}) {
4716 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4717 pop @{$self->{open_elements}};
4718 }
4719
4720 pop @{$self->{open_elements}}; # tr
4721 $self->{insertion_mode} = 'in table body';
4722 !!!next-token;
4723 redo B;
4724 } elsif ($token->{tag_name} eq 'table') {
4725 ## As if </tr>
4726 ## have an element in table scope
4727 my $i;
4728 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4729 my $node = $self->{open_elements}->[$_];
4730 if ($node->[1] eq 'tr') {
4731 $i = $_;
4732 last INSCOPE;
4733 } elsif ({
4734 table => 1, html => 1,
4735 }->{$node->[1]}) {
4736 last INSCOPE;
4737 }
4738 } # INSCOPE
4739 unless (defined $i) {
4740 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4741 ## Ignore the token: no |tr| element is in table scope, so the implied </tr> cannot be processed
4742 !!!next-token;
4743 redo B;
4744 }
4745
4746 ## Clear back to table row context
4747 while (not {
4748 tr => 1, html => 1,
4749 }->{$self->{open_elements}->[-1]->[1]}) {
4750 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4751 pop @{$self->{open_elements}};
4752 }
4753
4754 pop @{$self->{open_elements}}; # tr
4755 $self->{insertion_mode} = 'in table body';
4756 ## reprocess
4757 redo B;
4758 } elsif ({
4759 tbody => 1, tfoot => 1, thead => 1,
4760 }->{$token->{tag_name}}) {
4761 ## have an element in table scope
4762 my $i;
4763 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4764 my $node = $self->{open_elements}->[$_];
4765 if ($node->[1] eq $token->{tag_name}) {
4766 $i = $_;
4767 last INSCOPE;
4768 } elsif ({
4769 table => 1, html => 1,
4770 }->{$node->[1]}) {
4771 last INSCOPE;
4772 }
4773 } # INSCOPE
4774 unless (defined $i) {
4775 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4776 ## Ignore the token
4777 !!!next-token;
4778 redo B;
4779 }
4780
4781 ## As if </tr>
4782 ## have a |tr| element in table scope (search stops at |table| or |html|)
4783 undef $i; ## reset the index declared for the preceding scope check instead of redeclaring it
4784 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4785 my $node = $self->{open_elements}->[$_];
4786 if ($node->[1] eq 'tr') {
4787 $i = $_;
4788 last INSCOPE;
4789 } elsif ({
4790 table => 1, html => 1,
4791 }->{$node->[1]}) {
4792 last INSCOPE;
4793 }
4794 } # INSCOPE
4794 } # INSCOPE
4795 unless (defined $i) {
4796 !!!parse-error (type => 'unmatched end tag:tr');
4797 ## Ignore the token
4798 !!!next-token;
4799 redo B;
4800 }
4801
4802 ## Clear back to table row context
4803 while (not {
4804 tr => 1, html => 1,
4805 }->{$self->{open_elements}->[-1]->[1]}) {
4806 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4807 pop @{$self->{open_elements}};
4808 }
4809
4810 pop @{$self->{open_elements}}; # tr
4811 $self->{insertion_mode} = 'in table body';
4812 ## reprocess
4813 redo B;
4814 } elsif ({
4815 body => 1, caption => 1, col => 1,
4816 colgroup => 1, html => 1, td => 1, th => 1,
4817 }->{$token->{tag_name}}) {
4818 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4819 ## Ignore the token
4820 !!!next-token;
4821 redo B;
4822 } else {
4823 #
4824 }
4825 } else {
4826 #
4827 }
4828
4829 ## As if in table
4830 !!!parse-error (type => 'in table:'.$token->{tag_name});
4831 $in_body->($insert_to_foster);
4832 redo B;
4833 } elsif ($self->{insertion_mode} eq 'in select') {
4834 if ($token->{type} eq 'character') {
4835 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4836 !!!next-token;
4837 redo B;
4838 } elsif ($token->{type} eq 'start tag') {
4839 if ($token->{tag_name} eq 'option') {
4840 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4841 ## As if </option>
4842 pop @{$self->{open_elements}};
4843 }
4844
4845 !!!insert-element ($token->{tag_name}, $token->{attributes});
4846 !!!next-token;
4847 redo B;
4848 } elsif ($token->{tag_name} eq 'optgroup') {
4849 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4850 ## As if </option>
4851 pop @{$self->{open_elements}};
4852 }
4853
4854 if ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4855 ## As if </optgroup>
4856 pop @{$self->{open_elements}};
4857 }
4858
4859 !!!insert-element ($token->{tag_name}, $token->{attributes});
4860 !!!next-token;
4861 redo B;
4862 } elsif ($token->{tag_name} eq 'select') {
4863 !!!parse-error (type => 'not closed:select');
4864 ## As if </select> instead
4865 ## have an element in table scope
4866 my $i;
4867 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4868 my $node = $self->{open_elements}->[$_];
4869 if ($node->[1] eq $token->{tag_name}) {
4870 $i = $_;
4871 last INSCOPE;
4872 } elsif ({
4873 table => 1, html => 1,
4874 }->{$node->[1]}) {
4875 last INSCOPE;
4876 }
4877 } # INSCOPE
4878 unless (defined $i) {
4879 !!!parse-error (type => 'unmatched end tag:select');
4880 ## Ignore the token
4881 !!!next-token;
4882 redo B;
4883 }
4884
4885 splice @{$self->{open_elements}}, $i;
4886
4887 $self->_reset_insertion_mode;
4888
4889 !!!next-token;
4890 redo B;
4891 } else {
4892 #
4893 }
4894 } elsif ($token->{type} eq 'end tag') {
4895 if ($token->{tag_name} eq 'optgroup') {
4896 if ($self->{open_elements}->[-1]->[1] eq 'option' and
4897 $self->{open_elements}->[-2]->[1] eq 'optgroup') {
4898 ## As if </option>
4899 splice @{$self->{open_elements}}, -2;
4900 } elsif ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4901 pop @{$self->{open_elements}};
4902 } else {
4903 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4904 ## Ignore the token
4905 }
4906 !!!next-token;
4907 redo B;
4908 } elsif ($token->{tag_name} eq 'option') {
4909 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4910 pop @{$self->{open_elements}};
4911 } else {
4912 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4913 ## Ignore the token
4914 }
4915 !!!next-token;
4916 redo B;
4917 } elsif ($token->{tag_name} eq 'select') {
4918 ## have an element in table scope
4919 my $i;
4920 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4921 my $node = $self->{open_elements}->[$_];
4922 if ($node->[1] eq $token->{tag_name}) {
4923 $i = $_;
4924 last INSCOPE;
4925 } elsif ({
4926 table => 1, html => 1,
4927 }->{$node->[1]}) {
4928 last INSCOPE;
4929 }
4930 } # INSCOPE
4931 unless (defined $i) {
4932 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4933 ## Ignore the token
4934 !!!next-token;
4935 redo B;
4936 }
4937
4938 splice @{$self->{open_elements}}, $i;
4939
4940 $self->_reset_insertion_mode;
4941
4942 !!!next-token;
4943 redo B;
4944 } elsif ({
4945 caption => 1, table => 1, tbody => 1,
4946 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
4947 }->{$token->{tag_name}}) {
4948 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4949
4950 ## have an element in table scope
4951 my $i;
4952 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4953 my $node = $self->{open_elements}->[$_];
4954 if ($node->[1] eq $token->{tag_name}) {
4955 $i = $_;
4956 last INSCOPE;
4957 } elsif ({
4958 table => 1, html => 1,
4959 }->{$node->[1]}) {
4960 last INSCOPE;
4961 }
4962 } # INSCOPE
4963 unless (defined $i) {
4964 ## Ignore the token
4965 !!!next-token;
4966 redo B;
4967 }
4968
4969 ## As if </select>
4970 ## have an element in table scope
4971 undef $i;
4972 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4973 my $node = $self->{open_elements}->[$_];
4974 if ($node->[1] eq 'select') {
4975 $i = $_;
4976 last INSCOPE;
4977 } elsif ({
4978 table => 1, html => 1,
4979 }->{$node->[1]}) {
4980 last INSCOPE;
4981 }
4982 } # INSCOPE
4983 unless (defined $i) {
4984 !!!parse-error (type => 'unmatched end tag:select');
4985 ## Ignore the </select> token
4986 !!!next-token; ## TODO: ok?
4987 redo B;
4988 }
4989
4990 splice @{$self->{open_elements}}, $i;
4991
4992 $self->_reset_insertion_mode;
4993
4994 ## reprocess
4995 redo B;
4996 } else {
4997 #
4998 }
4999 } else {
5000 #
5001 }
5002
5003 !!!parse-error (type => 'in select:'.$token->{tag_name});
5004 ## Ignore the token
5005 !!!next-token;
5006 redo B;
5007 } elsif ($self->{insertion_mode} eq 'after body') {
5008 if ($token->{type} eq 'character') {
5009 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5010 my $data = $1;
5011 ## As if in body
5012 $reconstruct_active_formatting_elements->($insert_to_current);
5013
5014 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5015
5016 unless (length $token->{data}) {
5017 !!!next-token;
5018 redo B;
5019 }
5020 }
5021
5022 #
5023 !!!parse-error (type => 'after body:#character');
5024 } elsif ($token->{type} eq 'start tag') {
5025 !!!parse-error (type => 'after body:'.$token->{tag_name});
5026 #
5027 } elsif ($token->{type} eq 'end tag') {
5028 if ($token->{tag_name} eq 'html') {
5029 if (defined $self->{inner_html_node}) {
5030 !!!parse-error (type => 'unmatched end tag:html');
5031 ## Ignore the token
5032 !!!next-token;
5033 redo B;
5034 } else {
5035 $previous_insertion_mode = $self->{insertion_mode};
5036 $self->{insertion_mode} = 'trailing end';
5037 !!!next-token;
5038 redo B;
5039 }
5040 } else {
5041 !!!parse-error (type => 'after body:/'.$token->{tag_name});
5042 }
5043 } else {
5044 die "$0: $token->{type}: Unknown token type";
5045 }
5046
5047 $self->{insertion_mode} = 'in body';
5048 ## reprocess
5049 redo B;
5050 } elsif ($self->{insertion_mode} eq 'in frameset') {
5051 if ($token->{type} eq 'character') {
5052 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5053 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5054
5055 unless (length $token->{data}) {
5056 !!!next-token;
5057 redo B;
5058 }
5059 }
5060
5061 !!!parse-error (type => 'in frameset:#character');
5062 ## Ignore the token
5063 !!!next-token;
5064 redo B;
5065 } elsif ($token->{type} eq 'start tag') {
5066 if ($token->{tag_name} eq 'frameset') {
5067 !!!insert-element ($token->{tag_name}, $token->{attributes});
5068 !!!next-token;
5069 redo B;
5070 } elsif ($token->{tag_name} eq 'frame') {
5071 !!!insert-element ($token->{tag_name}, $token->{attributes});
5072 pop @{$self->{open_elements}};
5073 !!!next-token;
5074 redo B;
5075 } elsif ($token->{tag_name} eq 'noframes') {
5076 $in_body->($insert_to_current);
5077 redo B;
5078 } else {
5079 !!!parse-error (type => 'in frameset:'.$token->{tag_name});
5080 ## Ignore the token
5081 !!!next-token;
5082 redo B;
5083 }
5084 } elsif ($token->{type} eq 'end tag') {
5085 if ($token->{tag_name} eq 'frameset') {
5086 if ($self->{open_elements}->[-1]->[1] eq 'html' and
5087 @{$self->{open_elements}} == 1) {
5088 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5089 ## Ignore the token
5090 !!!next-token;
5091 } else {
5092 pop @{$self->{open_elements}};
5093 !!!next-token;
5094 }
5095
5096 if (not defined $self->{inner_html_node} and
5097 $self->{open_elements}->[-1]->[1] ne 'frameset') {
5098 $self->{insertion_mode} = 'after frameset';
5099 }
5100 redo B;
5101 } else {
5102 !!!parse-error (type => 'in frameset:/'.$token->{tag_name});
5103 ## Ignore the token
5104 !!!next-token;
5105 redo B;
5106 }
5107 } else {
5108 die "$0: $token->{type}: Unknown token type";
5109 }
5110 } elsif ($self->{insertion_mode} eq 'after frameset') {
5111 if ($token->{type} eq 'character') {
5112 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5113 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5114
5115 unless (length $token->{data}) {
5116 !!!next-token;
5117 redo B;
5118 }
5119 }
5120
5121 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
5122 !!!parse-error (type => 'after frameset:#character');
5123
5124 ## Ignore the token.
5125 if (length $token->{data}) {
5126 ## reprocess the rest of characters
5127 } else {
5128 !!!next-token;
5129 }
5130 redo B;
5131 }
5132
5133 die qq[$0: Character "$token->{data}"];
5134 } elsif ($token->{type} eq 'start tag') {
5135 if ($token->{tag_name} eq 'noframes') {
5136 $in_body->($insert_to_current);
5137 redo B;
5138 } else {
5139 !!!parse-error (type => 'after frameset:'.$token->{tag_name});
5140 ## Ignore the token
5141 !!!next-token;
5142 redo B;
5143 }
5144 } elsif ($token->{type} eq 'end tag') {
5145 if ($token->{tag_name} eq 'html') {
5146 $previous_insertion_mode = $self->{insertion_mode};
5147 $self->{insertion_mode} = 'trailing end';
5148 !!!next-token;
5149 redo B;
5150 } else {
5151 !!!parse-error (type => 'after frameset:/'.$token->{tag_name});
5152 ## Ignore the token
5153 !!!next-token;
5154 redo B;
5155 }
5156 } else {
5157 die "$0: $token->{type}: Unknown token type";
5158 }
5159
5160 ## ISSUE: An issue in spec here
5161 } elsif ($self->{insertion_mode} eq 'trailing end') {
5162 ## states in the main stage is preserved yet # MUST
5163
5164 if ($token->{type} eq 'character') {
5165 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5166 my $data = $1;
5167 ## As if in the main phase.
5168 ## NOTE: The insertion mode in the main phase
5169 ## just before the phase has been changed to the trailing
5170 ## end phase is either "after body" or "after frameset".
5171 $reconstruct_active_formatting_elements->($insert_to_current);
5172
5173 $self->{open_elements}->[-1]->[0]->manakai_append_text ($data);
5174
5175 unless (length $token->{data}) {
5176 !!!next-token;
5177 redo B;
5178 }
5179 }
5180
5181 !!!parse-error (type => 'after html:#character');
5182 $self->{insertion_mode} = $previous_insertion_mode;
5183 ## reprocess
5184 redo B;
5185 } elsif ($token->{type} eq 'start tag') {
5186 !!!parse-error (type => 'after html:'.$token->{tag_name});
5187 $self->{insertion_mode} = $previous_insertion_mode;
5188 ## reprocess
5189 redo B;
5190 } elsif ($token->{type} eq 'end tag') {
5191 !!!parse-error (type => 'after html:/'.$token->{tag_name});
5192 $self->{insertion_mode} = $previous_insertion_mode;
5193 ## reprocess
5194 redo B;
5195 } else {
5196 die "$0: $token->{type}: Unknown token";
5197 }
5198 } else {
5199 die "$0: $self->{insertion_mode}: Unknown insertion mode";
5200 }
5201 } # B
5202
5203 ## Stop parsing # MUST
5204
5205 ## TODO: script stuffs
5206 } # _tree_construct_main
5207
## Implements the |innerHTML| setter: replaces the children of |$node|
## with the result of parsing a string as HTML.
##
## Invoked as a class method:
##   $class->set_inner_html ($node, $string, $onerror)
##
##   $node    - A node whose |node_type| is 9 (document) or 1 (element).
##   $string  - The markup.  It is read through a reference to the
##              caller's argument, so it is not copied.
##   $onerror - Optional code reference.  It receives a key/value list
##              (|type|, plus |line| and |column| supplied here).  When
##              it is false, errors are reported with |warn|.
##
## Dies if |$node| is of any other node type.
sub set_inner_html ($$$) {
  my ($class, $node) = @_[0, 1];
  my $s = \$_[2];
  my $onerror = $_[3];

  my $nt = $node->node_type;

  if ($nt == 9) {
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## Work on a snapshot of the child list while removing from it.
    my @old_children = @{$node->child_nodes};
    $node->remove_child ($_) for @old_children;

    ## Step 3, 4, 5 # MUST
    return $class->parse_string ($$s => $node, $onerror);
  }

  die "$0: |set_inner_html| is not defined for node of type $nt"
      unless $nt == 1;

  ## From here on |$node| is an element.

  ## TODO: If non-html element

  ## NOTE: Most of this code is copied from |parse_string|

  ## Step 1 # MUST
  ## The fragment is parsed into a scratch document created from the
  ## implementation of the element's owner document.
  my $this_doc = $node->owner_document;
  my $doc = $this_doc->implementation->create_document;
  $doc->manakai_is_html (1);
  my $parser = $class->new;
  $parser->{document} = $doc;

  ## Step 9 # MUST
  ## Input position, shared by the character reader and the error
  ## reporter below.
  my ($pos, $line, $column) = (0, 1, 0);
  $parser->{set_next_input_character} = sub {
    ## NOTE: The variable has to be called |$self|; the |!!!| macro
    ## below presumably expands to code that refers to it.
    my $self = shift;

    ## Shift the current character into the look-behind buffer.
    pop @{$self->{prev_input_character}};
    unshift @{$self->{prev_input_character}}, $self->{next_input_character};

    ## End of input is signalled by -1.
    if ($pos >= length $$s) {
      $self->{next_input_character} = -1;
      return;
    }

    $self->{next_input_character} = ord substr $$s, $pos++, 1;
    $column++;

    if ($self->{next_input_character} == 0x000A) { # LF
      $line++;
      $column = 0;
    } elsif ($self->{next_input_character} == 0x000D) { # CR
      ## CR and CR LF are both reported as a single LF.
      $pos++ if substr ($$s, $pos, 1) eq "\x0A";
      $self->{next_input_character} = 0x000A; # LF # MUST
      $line++;
      $column = 0;
    } elsif ($self->{next_input_character} > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($self->{next_input_character} == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $parser->{prev_input_character} = [-1, -1, -1];
  $parser->{next_input_character} = -1;

  ## Fall back to |warn| when the caller supplied no error handler.
  my $report = $onerror || sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  $parser->{parse_error} = sub {
    $report->(@_, line => $line, column => $column);
  };

  $parser->_initialize_tokenizer;
  $parser->_initialize_tree_constructor;

  ## Step 2
  ## The initial content model is chosen from the context element's
  ## local name; anything not listed is parsed as PCDATA.
  my $node_ln = $node->local_name;
  my %content_model_for = (
    (map { ($_ => RCDATA_CONTENT_MODEL) } qw(title textarea)),
    (map { ($_ => CDATA_CONTENT_MODEL) }
         qw(style script xmp iframe noembed noframes noscript)),
    plaintext => PLAINTEXT_CONTENT_MODEL,
  );
  my $content_model = $content_model_for{$node_ln};
  $parser->{content_model} = defined $content_model
      ? $content_model : PCDATA_CONTENT_MODEL;
  ## ISSUE: What is "the name of the element"? local name?

  $parser->{inner_html_node} = [$node, $node_ln];

  ## Step 4
  my $root = $doc->create_element_ns
      ('http://www.w3.org/1999/xhtml', [undef, 'html']);

  ## Step 5 # MUST
  $doc->append_child ($root);

  ## Step 6 # MUST
  push @{$parser->{open_elements}}, [$root, 'html'];

  undef $parser->{head_element};

  ## Step 7 # MUST
  $parser->_reset_insertion_mode;

  ## Step 8 # MUST
  ## The nearest XHTML-namespace |form| among the context element and
  ## its ancestors becomes the form element pointer.
  ANCESTOR: for (my $ancestor = $node;
                 defined $ancestor;
                 $ancestor = $ancestor->parent_node) {
    next ANCESTOR unless $ancestor->node_type == 1;
    my $nsuri = $ancestor->namespace_uri;
    next ANCESTOR
        unless defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml';
    next ANCESTOR unless $ancestor->local_name eq 'form'; ## TODO: case?
    $parser->{form_element} = $ancestor;
    last ANCESTOR;
  } # ANCESTOR

  ## Step 3 # MUST
  ## Step 10 # MUST
  {
    ## NOTE: |$self| is introduced only for the macro on the next line.
    my $self = $parser;
    !!!next-token;
  }
  $parser->_tree_construction_main;

  ## Step 11 # MUST
  my @old_children = @{$node->child_nodes};
  $node->remove_child ($_) for @old_children;
  ## ISSUE: mutation events? read-only?

  ## Step 12 # MUST
  ## Move the parsed nodes from the scratch root into |$node|.
  my @new_children = @{$root->child_nodes};
  for my $new_child (@new_children) {
    $this_doc->adopt_node ($new_child);
    $node->append_child ($new_child);
  }
  ## ISSUE: mutation events?

  return $parser->_terminate_tree_constructor;
} # set_inner_html
5363
5364 } # tree construction stage
5365
## Implements the |innerHTML| getter: serializes the children of
## |$node| (not |$node| itself) as an HTML fragment.
##
##   $class->get_inner_html ($node, $on_error)
##
##   $node     - The node whose children are serialized.  The node and
##               its ancestors are inspected only to decide whether
##               text is initially emitted literally.
##   $on_error - Optional code reference, called with each child node
##               whose node type is not handled here.
##
## Returns a reference to the serialized string.
sub get_inner_html ($$$) {
  my (undef, $node, $on_error) = @_;

  ## Elements whose text content is emitted literally (unescaped).
  ## The same table is used for the ancestors of |$node| and for the
  ## elements met while serializing, so that both agree.
  my $is_cdata_element = {
    style => 1, script => 1, xmp => 1, iframe => 1,
    noembed => 1, noframes => 1, noscript => 1,
    plaintext => 1,
  };

  ## Elements serialized as a start tag only (no content, no end tag).
  my $is_void_element = {
    area => 1, base => 1, basefont => 1, bgsound => 1,
    br => 1, col => 1, embed => 1, frame => 1, hr => 1,
    img => 1, input => 1, link => 1, meta => 1, param => 1,
    spacer => 1, wbr => 1,
  };

  ## Escaping shared by attribute values and ordinary text.
  my $escape = sub {
    my $value = shift;
    $value =~ s/&/&amp;/g;
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;
    $value =~ s/"/&quot;/g;
    return $value;
  };

  ## Step 1
  my $s = '';

  ## Text starts out literal if |$node| or one of its ancestors is an
  ## XHTML-namespace element listed in |$is_cdata_element|.
  my $in_cdata;
  my $parent = $node;
  ANCESTOR: while (defined $parent) {
    if ($parent->node_type == 1) {
      ## |namespace_uri| may be undefined (null namespace).
      my $nsuri = $parent->namespace_uri;
      if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml' and
          $is_cdata_element->{$parent->local_name}) { ## TODO: case thingy
        $in_cdata = 1;
        last ANCESTOR;
      }
    }
    $parent = $parent->parent_node;
  } # ANCESTOR

  ## Step 2
  ## |@node| is a work queue processed front to back.  Besides nodes it
  ## holds plain strings: a pending end tag to be appended, or the
  ## marker 'cdata-out', which ends literal-text mode.
  my @node = @{$node->child_nodes};
  C: while (@node) {
    my $child = shift @node;
    unless (ref $child) {
      if ($child eq 'cdata-out') {
        $in_cdata = 0;
      } else {
        $s .= $child; # end tag
      }
      next C;
    }

    my $nt = $child->node_type;
    if ($nt == 1) { # Element
      my $tag_name = $child->tag_name; ## TODO: manakai_tag_name
      $s .= '<' . $tag_name;
      ## NOTE: Non-HTML case:
      ## <http://permalink.gmane.org/gmane.org.w3c.whatwg.discuss/11191>

      my @attrs = @{$child->attributes}; # sort order MUST be stable
      for my $attr (@attrs) { # order is implementation dependent
        my $attr_name = $attr->name; ## TODO: manakai_name
        $s .= ' ' . $attr_name . '="' . $escape->($attr->value) . '"';
      }
      $s .= '>';

      next C if $is_void_element->{$tag_name};

      $s .= "\x0A" if $tag_name eq 'pre' or $tag_name eq 'textarea';

      if (not $in_cdata and $is_cdata_element->{$tag_name}) {
        ## Queued first so that it ends up just after the end tag.
        unshift @node, 'cdata-out';
        $in_cdata = 1;
      }

      ## The children and then the end tag are handled before anything
      ## that was already waiting in the queue.
      unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';
    } elsif ($nt == 3 or $nt == 4) { # Text, CDATA section
      $s .= $in_cdata ? $child->data : $escape->($child->data);
    } elsif ($nt == 8) { # Comment
      $s .= '<!--' . $child->data . '-->';
    } elsif ($nt == 10) { # Document type
      $s .= '<!DOCTYPE ' . $child->name . '>';
    } elsif ($nt == 5) { # entrefs
      ## The children take the place of the reference.  They must go to
      ## the FRONT of the queue; appending them to the end would emit
      ## them after the following siblings and pending end tags.
      unshift @node, @{$child->child_nodes};
    } else {
      $on_error->($child) if defined $on_error;
    }
    ## ISSUE: This code does not support PIs.
  } # C

  ## Step 3
  return \$s;
} # get_inner_html
5465
5466 1;
5467 # $Date: 2007/07/21 06:59:16 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24