/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.53 - (show annotations) (download) (as text)
Sat Jul 21 12:37:57 2007 UTC (18 years, 6 months ago) by wakaba
Branch: MAIN
Changes since 1.52: +57 -60 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/ChangeLog	21 Jul 2007 12:37:54 -0000
	* HTML.pm.src: |$in_body| is no longer a function.

2007-07-21  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.52 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 ## ISSUE:
6 ## var doc = implementation.createDocument (null, null, null);
7 ## doc.write ('');
8 ## alert (doc.compatMode);
9
10 ## ISSUE: HTML5 revision 967 says that the encoding layer MUST NOT
11 ## strip BOM and the HTML layer MUST ignore it. Whether we can do it
12 ## is not yet clear.
13 ## "{U+FEFF}..." in UTF-16BE/UTF-16LE is three or four characters?
14 ## "{U+FEFF}..." in GB18030?
15
## Start tags on which a "/" immediately before ">" is accepted without
## a |nestc| parse error.  The tokenizer looks the (lowercased) tag name
## up here whenever it meets "/" inside a tag.
my $permitted_slash_tag_name = {
  map { ($_ => 1) } qw(
    base link meta hr br img embed param area col input
  )
};
29
## Replacement code points for the C1 range 0x80..0x9F, keyed by the
## original code point.  Slots without a replacement map to U+FFFD.
## NOTE(review): the values appear to follow the Windows-1252 layout and
## are presumably applied to numeric character references; the consumer
## is outside this part of the file, so confirm there.
my $c1_entity_char = do {
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 80..87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 88..8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 90..97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 98..9F
  );
  my %char_for;
  @char_for{0x80 .. 0x9F} = @replacement;
  \%char_for;
}; # $c1_entity_char
64
## Element names belonging to the "special" category (name => 1).
my $special_category = {
  map { ($_ => 1) } qw(
    address area base basefont bgsound blockquote body br center col
    colgroup dd dir div dl dt embed fieldset form frame frameset
    h1 h2 h3 h4 h5 h6 head hr iframe image img input isindex li link
    listing menu meta noembed noframes noscript ol optgroup option p
    param plaintext pre script select spacer style tbody textarea
    tfoot thead title tr ul wbr
  )
};
## Element names belonging to the "scoping" category (name => 1).
my $scoping_category = {
  map { ($_ => 1) } qw(
    button caption html marquee object table td th
  )
};
## Element names belonging to the "formatting" category (name => 1).
## The |strike| entry used to be misspelled "strile", which left the
## real <strike> element out of this category.
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
85 # $phrasing_category: all other elements
86
## Parses a string of HTML and returns the document.
##
## Arguments (after the invocant):
##   1. the string to parse; it is read through a reference, not copied
##   2. the document object; stored in |$self->{document}| and returned
##   3. (optional) error handler, called with key/value pairs that
##      include |type|, |line| and |column|; the default handler |warn|s
##
## A fresh parser object is always obtained by calling |new| on the
## invocant.
sub parse_string ($$$;$) {
  my $self = shift->new;
  my $input = \$_[0];
  $self->{document} = $_[1];

  ## NOTE: |set_inner_html| copies most of this method's code

  ## Read position within the input, plus the position reported to the
  ## error handler.
  my ($offset, $line, $column) = (0, 1, 0);

  $self->{set_next_input_character} = sub {
    my $self = shift;

    ## Slide the look-behind window: forget the oldest character and
    ## put the character being left at the front.
    pop @{$self->{prev_input_character}};
    unshift @{$self->{prev_input_character}}, $self->{next_input_character};

    if ($offset >= length $$input) {
      ## End of input is reported as -1.
      $self->{next_input_character} = -1;
      return;
    }

    my $code = ord substr $$input, $offset++, 1;
    $self->{next_input_character} = $code;
    $column++;

    if ($code == 0x000A) { # LF
      $line++;
      $column = 0;
    } elsif ($code == 0x000D) { # CR
      ## CR LF and a lone CR are both delivered as a single LF.
      $offset++ if substr ($$input, $offset, 1) eq "\x0A";
      $self->{next_input_character} = 0x000A; # LF # MUST
      $line++;
      $column = 0;
    } elsif ($code > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($code == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $self->{prev_input_character} = [-1, -1, -1];
  $self->{next_input_character} = -1;

  my $onerror = $_[2];
  $onerror ||= sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  ## Every reported error is tagged with the current input position.
  $self->{parse_error} = sub {
    $onerror->(@_, line => $line, column => $column);
  };

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  return $self->{document};
} # parse_string
140
## Constructor.  Returns a new parser object blessed into the invoking
## class, with two placeholder callbacks installed:
##   set_next_input_character - always reports end of input (-1)
##   parse_error              - ignores the error
## |parse_string| replaces both callbacks with working ones.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;
  $self->{set_next_input_character} = sub {
    ## The parser is taken from the argument list, exactly as the
    ## callback installed by |parse_string| does.  Closing over the
    ## outer |$self| instead would store inside the object a closure
    ## that refers back to the object: a reference cycle that keeps
    ## the parser from ever being freed.
    my $self = shift;
    $self->{next_input_character} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  return $self;
} # new
152
## Content model flags: which kinds of markup are recognized in
## character data.  Each flag occupies its own bit.
sub CM_ENTITY () { 1 << 0 } # & markup in data
sub CM_LIMITED_MARKUP () { 1 << 1 } # < markup in data (limited)
sub CM_FULL_MARKUP () { 1 << 2 } # < markup in data (any)

## Content models, expressed as combinations of the flags above.
sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP | CM_ENTITY }
sub PCDATA_CONTENT_MODEL () { CM_FULL_MARKUP | CM_ENTITY }
161
162 ## Implementations MUST act as if they used the state machine described in the spec.
163
## Resets the tokenizer fields of the parser object: the state goes back
## to 'data', the content model to PCDATA, and the pending token,
## attribute and bookkeeping fields are cleared.
## NOTE(review): |!!!next-input-character| is a macro expanded by a
## preprocessor that is not part of this file; it is expected to load
## the first input character -- confirm against the macro definition.
sub _initialize_tokenizer ($) {
  my $self = shift;
  $self->{state} = 'data'; # MUST
  $self->{content_model} = PCDATA_CONTENT_MODEL; # be
  ## |current_token| is a start tag, end tag, comment, or DOCTYPE
  ## while one is being built; nothing is being built yet.
  $self->{$_} = undef for qw(
    current_token
    current_attribute
    last_emitted_start_tag_name
    last_attribute_value_state
  );
  $self->{char} = [];
  # $self->{next_input_character}
  !!!next-input-character;
  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
178
179 ## A token has:
180 ## ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
181 ## 'character', or 'end-of-file'
182 ## ->{name} (DOCTYPE, start tag (tag name), end tag (tag name))
183 ## ->{public_identifier} (DOCTYPE)
184 ## ->{system_identifier} (DOCTYPE)
185 ## ->{correct} == 1 or 0 (DOCTYPE)
186 ## ->{attributes} isa HASH (start tag, end tag)
187 ## ->{data} (comment, character)
188
189 ## Emitted token MUST immediately be handled by the tree construction state.
190
191 ## Before each step, UA MAY check to see if either one of the scripts in
192 ## "list of scripts that will execute as soon as possible" or the first
193 ## script in the "list of scripts that will execute asynchronously",
194 ## has completed loading. If one has, then it MUST be executed
195 ## and removed from the list.
196
197 sub _get_next_token ($) {
198 my $self = shift;
199 if (@{$self->{token}}) {
200 return shift @{$self->{token}};
201 }
202
203 A: {
204 if ($self->{state} eq 'data') {
205 if ($self->{next_input_character} == 0x0026) { # &
206 if ($self->{content_model} & CM_ENTITY) { # PCDATA | RCDATA
207 $self->{state} = 'entity data';
208 !!!next-input-character;
209 redo A;
210 } else {
211 #
212 }
213 } elsif ($self->{next_input_character} == 0x002D) { # -
214 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
215 unless ($self->{escape}) {
216 if ($self->{prev_input_character}->[0] == 0x002D and # -
217 $self->{prev_input_character}->[1] == 0x0021 and # !
218 $self->{prev_input_character}->[2] == 0x003C) { # <
219 $self->{escape} = 1;
220 }
221 }
222 }
223
224 #
225 } elsif ($self->{next_input_character} == 0x003C) { # <
226 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
227 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
228 not $self->{escape})) {
229 $self->{state} = 'tag open';
230 !!!next-input-character;
231 redo A;
232 } else {
233 #
234 }
235 } elsif ($self->{next_input_character} == 0x003E) { # >
236 if ($self->{escape} and
237 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
238 if ($self->{prev_input_character}->[0] == 0x002D and # -
239 $self->{prev_input_character}->[1] == 0x002D) { # -
240 delete $self->{escape};
241 }
242 }
243
244 #
245 } elsif ($self->{next_input_character} == -1) {
246 !!!emit ({type => 'end-of-file'});
247 last A; ## TODO: ok?
248 }
249 # Anything else
250 my $token = {type => 'character',
251 data => chr $self->{next_input_character}};
252 ## Stay in the data state
253 !!!next-input-character;
254
255 !!!emit ($token);
256
257 redo A;
258 } elsif ($self->{state} eq 'entity data') {
259 ## (cannot happen in CDATA state)
260
261 my $token = $self->_tokenize_attempt_to_consume_an_entity (0);
262
263 $self->{state} = 'data';
264 # next-input-character is already done
265
266 unless (defined $token) {
267 !!!emit ({type => 'character', data => '&'});
268 } else {
269 !!!emit ($token);
270 }
271
272 redo A;
273 } elsif ($self->{state} eq 'tag open') {
274 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
275 if ($self->{next_input_character} == 0x002F) { # /
276 !!!next-input-character;
277 $self->{state} = 'close tag open';
278 redo A;
279 } else {
280 ## reconsume
281 $self->{state} = 'data';
282
283 !!!emit ({type => 'character', data => '<'});
284
285 redo A;
286 }
287 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
288 if ($self->{next_input_character} == 0x0021) { # !
289 $self->{state} = 'markup declaration open';
290 !!!next-input-character;
291 redo A;
292 } elsif ($self->{next_input_character} == 0x002F) { # /
293 $self->{state} = 'close tag open';
294 !!!next-input-character;
295 redo A;
296 } elsif (0x0041 <= $self->{next_input_character} and
297 $self->{next_input_character} <= 0x005A) { # A..Z
298 $self->{current_token}
299 = {type => 'start tag',
300 tag_name => chr ($self->{next_input_character} + 0x0020)};
301 $self->{state} = 'tag name';
302 !!!next-input-character;
303 redo A;
304 } elsif (0x0061 <= $self->{next_input_character} and
305 $self->{next_input_character} <= 0x007A) { # a..z
306 $self->{current_token} = {type => 'start tag',
307 tag_name => chr ($self->{next_input_character})};
308 $self->{state} = 'tag name';
309 !!!next-input-character;
310 redo A;
311 } elsif ($self->{next_input_character} == 0x003E) { # >
312 !!!parse-error (type => 'empty start tag');
313 $self->{state} = 'data';
314 !!!next-input-character;
315
316 !!!emit ({type => 'character', data => '<>'});
317
318 redo A;
319 } elsif ($self->{next_input_character} == 0x003F) { # ?
320 !!!parse-error (type => 'pio');
321 $self->{state} = 'bogus comment';
322 ## $self->{next_input_character} is intentionally left as is
323 redo A;
324 } else {
325 !!!parse-error (type => 'bare stago');
326 $self->{state} = 'data';
327 ## reconsume
328
329 !!!emit ({type => 'character', data => '<'});
330
331 redo A;
332 }
333 } else {
334 die "$0: $self->{content_model} in tag open";
335 }
336 } elsif ($self->{state} eq 'close tag open') {
337 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
338 if (defined $self->{last_emitted_start_tag_name}) {
339 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
340 my @next_char;
341 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
342 push @next_char, $self->{next_input_character};
343 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
344 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
345 if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
346 !!!next-input-character;
347 next TAGNAME;
348 } else {
349 $self->{next_input_character} = shift @next_char; # reconsume
350 !!!back-next-input-character (@next_char);
351 $self->{state} = 'data';
352
353 !!!emit ({type => 'character', data => '</'});
354
355 redo A;
356 }
357 }
358 push @next_char, $self->{next_input_character};
359
360 unless ($self->{next_input_character} == 0x0009 or # HT
361 $self->{next_input_character} == 0x000A or # LF
362 $self->{next_input_character} == 0x000B or # VT
363 $self->{next_input_character} == 0x000C or # FF
364 $self->{next_input_character} == 0x0020 or # SP
365 $self->{next_input_character} == 0x003E or # >
366 $self->{next_input_character} == 0x002F or # /
367 $self->{next_input_character} == -1) {
368 $self->{next_input_character} = shift @next_char; # reconsume
369 !!!back-next-input-character (@next_char);
370 $self->{state} = 'data';
371 !!!emit ({type => 'character', data => '</'});
372 redo A;
373 } else {
374 $self->{next_input_character} = shift @next_char;
375 !!!back-next-input-character (@next_char);
376 # and consume...
377 }
378 } else {
379 ## No start tag token has ever been emitted
380 # next-input-character is already done
381 $self->{state} = 'data';
382 !!!emit ({type => 'character', data => '</'});
383 redo A;
384 }
385 }
386
387 if (0x0041 <= $self->{next_input_character} and
388 $self->{next_input_character} <= 0x005A) { # A..Z
389 $self->{current_token} = {type => 'end tag',
390 tag_name => chr ($self->{next_input_character} + 0x0020)};
391 $self->{state} = 'tag name';
392 !!!next-input-character;
393 redo A;
394 } elsif (0x0061 <= $self->{next_input_character} and
395 $self->{next_input_character} <= 0x007A) { # a..z
396 $self->{current_token} = {type => 'end tag',
397 tag_name => chr ($self->{next_input_character})};
398 $self->{state} = 'tag name';
399 !!!next-input-character;
400 redo A;
401 } elsif ($self->{next_input_character} == 0x003E) { # >
402 !!!parse-error (type => 'empty end tag');
403 $self->{state} = 'data';
404 !!!next-input-character;
405 redo A;
406 } elsif ($self->{next_input_character} == -1) {
407 !!!parse-error (type => 'bare etago');
408 $self->{state} = 'data';
409 # reconsume
410
411 !!!emit ({type => 'character', data => '</'});
412
413 redo A;
414 } else {
415 !!!parse-error (type => 'bogus end tag');
416 $self->{state} = 'bogus comment';
417 ## $self->{next_input_character} is intentionally left as is
418 redo A;
419 }
420 } elsif ($self->{state} eq 'tag name') {
421 if ($self->{next_input_character} == 0x0009 or # HT
422 $self->{next_input_character} == 0x000A or # LF
423 $self->{next_input_character} == 0x000B or # VT
424 $self->{next_input_character} == 0x000C or # FF
425 $self->{next_input_character} == 0x0020) { # SP
426 $self->{state} = 'before attribute name';
427 !!!next-input-character;
428 redo A;
429 } elsif ($self->{next_input_character} == 0x003E) { # >
430 if ($self->{current_token}->{type} eq 'start tag') {
431 $self->{current_token}->{first_start_tag}
432 = not defined $self->{last_emitted_start_tag_name};
433 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
434 } elsif ($self->{current_token}->{type} eq 'end tag') {
435 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
436 if ($self->{current_token}->{attributes}) {
437 !!!parse-error (type => 'end tag attribute');
438 }
439 } else {
440 die "$0: $self->{current_token}->{type}: Unknown token type";
441 }
442 $self->{state} = 'data';
443 !!!next-input-character;
444
445 !!!emit ($self->{current_token}); # start tag or end tag
446
447 redo A;
448 } elsif (0x0041 <= $self->{next_input_character} and
449 $self->{next_input_character} <= 0x005A) { # A..Z
450 $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
451 # start tag or end tag
452 ## Stay in this state
453 !!!next-input-character;
454 redo A;
455 } elsif ($self->{next_input_character} == -1) {
456 !!!parse-error (type => 'unclosed tag');
457 if ($self->{current_token}->{type} eq 'start tag') {
458 $self->{current_token}->{first_start_tag}
459 = not defined $self->{last_emitted_start_tag_name};
460 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
461 } elsif ($self->{current_token}->{type} eq 'end tag') {
462 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
463 if ($self->{current_token}->{attributes}) {
464 !!!parse-error (type => 'end tag attribute');
465 }
466 } else {
467 die "$0: $self->{current_token}->{type}: Unknown token type";
468 }
469 $self->{state} = 'data';
470 # reconsume
471
472 !!!emit ($self->{current_token}); # start tag or end tag
473
474 redo A;
475 } elsif ($self->{next_input_character} == 0x002F) { # /
476 !!!next-input-character;
477 if ($self->{next_input_character} == 0x003E and # >
478 $self->{current_token}->{type} eq 'start tag' and
479 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
480 # permitted slash
481 #
482 } else {
483 !!!parse-error (type => 'nestc');
484 }
485 $self->{state} = 'before attribute name';
486 # next-input-character is already done
487 redo A;
488 } else {
489 $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
490 # start tag or end tag
491 ## Stay in the state
492 !!!next-input-character;
493 redo A;
494 }
495 } elsif ($self->{state} eq 'before attribute name') {
496 if ($self->{next_input_character} == 0x0009 or # HT
497 $self->{next_input_character} == 0x000A or # LF
498 $self->{next_input_character} == 0x000B or # VT
499 $self->{next_input_character} == 0x000C or # FF
500 $self->{next_input_character} == 0x0020) { # SP
501 ## Stay in the state
502 !!!next-input-character;
503 redo A;
504 } elsif ($self->{next_input_character} == 0x003E) { # >
505 if ($self->{current_token}->{type} eq 'start tag') {
506 $self->{current_token}->{first_start_tag}
507 = not defined $self->{last_emitted_start_tag_name};
508 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
509 } elsif ($self->{current_token}->{type} eq 'end tag') {
510 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
511 if ($self->{current_token}->{attributes}) {
512 !!!parse-error (type => 'end tag attribute');
513 }
514 } else {
515 die "$0: $self->{current_token}->{type}: Unknown token type";
516 }
517 $self->{state} = 'data';
518 !!!next-input-character;
519
520 !!!emit ($self->{current_token}); # start tag or end tag
521
522 redo A;
523 } elsif (0x0041 <= $self->{next_input_character} and
524 $self->{next_input_character} <= 0x005A) { # A..Z
525 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
526 value => ''};
527 $self->{state} = 'attribute name';
528 !!!next-input-character;
529 redo A;
530 } elsif ($self->{next_input_character} == 0x002F) { # /
531 !!!next-input-character;
532 if ($self->{next_input_character} == 0x003E and # >
533 $self->{current_token}->{type} eq 'start tag' and
534 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
535 # permitted slash
536 #
537 } else {
538 !!!parse-error (type => 'nestc');
539 }
540 ## Stay in the state
541 # next-input-character is already done
542 redo A;
543 } elsif ($self->{next_input_character} == -1) {
544 !!!parse-error (type => 'unclosed tag');
545 if ($self->{current_token}->{type} eq 'start tag') {
546 $self->{current_token}->{first_start_tag}
547 = not defined $self->{last_emitted_start_tag_name};
548 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
549 } elsif ($self->{current_token}->{type} eq 'end tag') {
550 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
551 if ($self->{current_token}->{attributes}) {
552 !!!parse-error (type => 'end tag attribute');
553 }
554 } else {
555 die "$0: $self->{current_token}->{type}: Unknown token type";
556 }
557 $self->{state} = 'data';
558 # reconsume
559
560 !!!emit ($self->{current_token}); # start tag or end tag
561
562 redo A;
563 } else {
564 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
565 value => ''};
566 $self->{state} = 'attribute name';
567 !!!next-input-character;
568 redo A;
569 }
570 } elsif ($self->{state} eq 'attribute name') {
571 my $before_leave = sub {
572 if (exists $self->{current_token}->{attributes} # start tag or end tag
573 ->{$self->{current_attribute}->{name}}) { # MUST
574 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name});
575 ## Discard $self->{current_attribute} # MUST
576 } else {
577 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
578 = $self->{current_attribute};
579 }
580 }; # $before_leave
581
582 if ($self->{next_input_character} == 0x0009 or # HT
583 $self->{next_input_character} == 0x000A or # LF
584 $self->{next_input_character} == 0x000B or # VT
585 $self->{next_input_character} == 0x000C or # FF
586 $self->{next_input_character} == 0x0020) { # SP
587 $before_leave->();
588 $self->{state} = 'after attribute name';
589 !!!next-input-character;
590 redo A;
591 } elsif ($self->{next_input_character} == 0x003D) { # =
592 $before_leave->();
593 $self->{state} = 'before attribute value';
594 !!!next-input-character;
595 redo A;
596 } elsif ($self->{next_input_character} == 0x003E) { # >
597 $before_leave->();
598 if ($self->{current_token}->{type} eq 'start tag') {
599 $self->{current_token}->{first_start_tag}
600 = not defined $self->{last_emitted_start_tag_name};
601 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
602 } elsif ($self->{current_token}->{type} eq 'end tag') {
603 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
604 if ($self->{current_token}->{attributes}) {
605 !!!parse-error (type => 'end tag attribute');
606 }
607 } else {
608 die "$0: $self->{current_token}->{type}: Unknown token type";
609 }
610 $self->{state} = 'data';
611 !!!next-input-character;
612
613 !!!emit ($self->{current_token}); # start tag or end tag
614
615 redo A;
616 } elsif (0x0041 <= $self->{next_input_character} and
617 $self->{next_input_character} <= 0x005A) { # A..Z
618 $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
619 ## Stay in the state
620 !!!next-input-character;
621 redo A;
622 } elsif ($self->{next_input_character} == 0x002F) { # /
623 $before_leave->();
624 !!!next-input-character;
625 if ($self->{next_input_character} == 0x003E and # >
626 $self->{current_token}->{type} eq 'start tag' and
627 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
628 # permitted slash
629 #
630 } else {
631 !!!parse-error (type => 'nestc');
632 }
633 $self->{state} = 'before attribute name';
634 # next-input-character is already done
635 redo A;
636 } elsif ($self->{next_input_character} == -1) {
637 !!!parse-error (type => 'unclosed tag');
638 $before_leave->();
639 if ($self->{current_token}->{type} eq 'start tag') {
640 $self->{current_token}->{first_start_tag}
641 = not defined $self->{last_emitted_start_tag_name};
642 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
643 } elsif ($self->{current_token}->{type} eq 'end tag') {
644 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
645 if ($self->{current_token}->{attributes}) {
646 !!!parse-error (type => 'end tag attribute');
647 }
648 } else {
649 die "$0: $self->{current_token}->{type}: Unknown token type";
650 }
651 $self->{state} = 'data';
652 # reconsume
653
654 !!!emit ($self->{current_token}); # start tag or end tag
655
656 redo A;
657 } else {
658 $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
659 ## Stay in the state
660 !!!next-input-character;
661 redo A;
662 }
663 } elsif ($self->{state} eq 'after attribute name') {
664 if ($self->{next_input_character} == 0x0009 or # HT
665 $self->{next_input_character} == 0x000A or # LF
666 $self->{next_input_character} == 0x000B or # VT
667 $self->{next_input_character} == 0x000C or # FF
668 $self->{next_input_character} == 0x0020) { # SP
669 ## Stay in the state
670 !!!next-input-character;
671 redo A;
672 } elsif ($self->{next_input_character} == 0x003D) { # =
673 $self->{state} = 'before attribute value';
674 !!!next-input-character;
675 redo A;
676 } elsif ($self->{next_input_character} == 0x003E) { # >
677 if ($self->{current_token}->{type} eq 'start tag') {
678 $self->{current_token}->{first_start_tag}
679 = not defined $self->{last_emitted_start_tag_name};
680 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
681 } elsif ($self->{current_token}->{type} eq 'end tag') {
682 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
683 if ($self->{current_token}->{attributes}) {
684 !!!parse-error (type => 'end tag attribute');
685 }
686 } else {
687 die "$0: $self->{current_token}->{type}: Unknown token type";
688 }
689 $self->{state} = 'data';
690 !!!next-input-character;
691
692 !!!emit ($self->{current_token}); # start tag or end tag
693
694 redo A;
695 } elsif (0x0041 <= $self->{next_input_character} and
696 $self->{next_input_character} <= 0x005A) { # A..Z
697 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
698 value => ''};
699 $self->{state} = 'attribute name';
700 !!!next-input-character;
701 redo A;
702 } elsif ($self->{next_input_character} == 0x002F) { # /
703 !!!next-input-character;
704 if ($self->{next_input_character} == 0x003E and # >
705 $self->{current_token}->{type} eq 'start tag' and
706 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
707 # permitted slash
708 #
709 } else {
710 !!!parse-error (type => 'nestc');
711 ## TODO: Different error type for <aa / bb> than <aa/>
712 }
713 $self->{state} = 'before attribute name';
714 # next-input-character is already done
715 redo A;
716 } elsif ($self->{next_input_character} == -1) {
717 !!!parse-error (type => 'unclosed tag');
718 if ($self->{current_token}->{type} eq 'start tag') {
719 $self->{current_token}->{first_start_tag}
720 = not defined $self->{last_emitted_start_tag_name};
721 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
722 } elsif ($self->{current_token}->{type} eq 'end tag') {
723 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
724 if ($self->{current_token}->{attributes}) {
725 !!!parse-error (type => 'end tag attribute');
726 }
727 } else {
728 die "$0: $self->{current_token}->{type}: Unknown token type";
729 }
730 $self->{state} = 'data';
731 # reconsume
732
733 !!!emit ($self->{current_token}); # start tag or end tag
734
735 redo A;
736 } else {
737 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
738 value => ''};
739 $self->{state} = 'attribute name';
740 !!!next-input-character;
741 redo A;
742 }
743 } elsif ($self->{state} eq 'before attribute value') {
744 if ($self->{next_input_character} == 0x0009 or # HT
745 $self->{next_input_character} == 0x000A or # LF
746 $self->{next_input_character} == 0x000B or # VT
747 $self->{next_input_character} == 0x000C or # FF
748 $self->{next_input_character} == 0x0020) { # SP
749 ## Stay in the state
750 !!!next-input-character;
751 redo A;
752 } elsif ($self->{next_input_character} == 0x0022) { # "
753 $self->{state} = 'attribute value (double-quoted)';
754 !!!next-input-character;
755 redo A;
756 } elsif ($self->{next_input_character} == 0x0026) { # &
757 $self->{state} = 'attribute value (unquoted)';
758 ## reconsume
759 redo A;
760 } elsif ($self->{next_input_character} == 0x0027) { # '
761 $self->{state} = 'attribute value (single-quoted)';
762 !!!next-input-character;
763 redo A;
764 } elsif ($self->{next_input_character} == 0x003E) { # >
765 if ($self->{current_token}->{type} eq 'start tag') {
766 $self->{current_token}->{first_start_tag}
767 = not defined $self->{last_emitted_start_tag_name};
768 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
769 } elsif ($self->{current_token}->{type} eq 'end tag') {
770 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
771 if ($self->{current_token}->{attributes}) {
772 !!!parse-error (type => 'end tag attribute');
773 }
774 } else {
775 die "$0: $self->{current_token}->{type}: Unknown token type";
776 }
777 $self->{state} = 'data';
778 !!!next-input-character;
779
780 !!!emit ($self->{current_token}); # start tag or end tag
781
782 redo A;
783 } elsif ($self->{next_input_character} == -1) {
784 !!!parse-error (type => 'unclosed tag');
785 if ($self->{current_token}->{type} eq 'start tag') {
786 $self->{current_token}->{first_start_tag}
787 = not defined $self->{last_emitted_start_tag_name};
788 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
789 } elsif ($self->{current_token}->{type} eq 'end tag') {
790 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
791 if ($self->{current_token}->{attributes}) {
792 !!!parse-error (type => 'end tag attribute');
793 }
794 } else {
795 die "$0: $self->{current_token}->{type}: Unknown token type";
796 }
797 $self->{state} = 'data';
798 ## reconsume
799
800 !!!emit ($self->{current_token}); # start tag or end tag
801
802 redo A;
803 } else {
804 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
805 $self->{state} = 'attribute value (unquoted)';
806 !!!next-input-character;
807 redo A;
808 }
809 } elsif ($self->{state} eq 'attribute value (double-quoted)') {
810 if ($self->{next_input_character} == 0x0022) { # "
811 $self->{state} = 'before attribute name';
812 !!!next-input-character;
813 redo A;
814 } elsif ($self->{next_input_character} == 0x0026) { # &
815 $self->{last_attribute_value_state} = 'attribute value (double-quoted)';
816 $self->{state} = 'entity in attribute value';
817 !!!next-input-character;
818 redo A;
819 } elsif ($self->{next_input_character} == -1) {
820 !!!parse-error (type => 'unclosed attribute value');
821 if ($self->{current_token}->{type} eq 'start tag') {
822 $self->{current_token}->{first_start_tag}
823 = not defined $self->{last_emitted_start_tag_name};
824 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
825 } elsif ($self->{current_token}->{type} eq 'end tag') {
826 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
827 if ($self->{current_token}->{attributes}) {
828 !!!parse-error (type => 'end tag attribute');
829 }
830 } else {
831 die "$0: $self->{current_token}->{type}: Unknown token type";
832 }
833 $self->{state} = 'data';
834 ## reconsume
835
836 !!!emit ($self->{current_token}); # start tag or end tag
837
838 redo A;
839 } else {
840 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
841 ## Stay in the state
842 !!!next-input-character;
843 redo A;
844 }
845 } elsif ($self->{state} eq 'attribute value (single-quoted)') {
846 if ($self->{next_input_character} == 0x0027) { # '
847 $self->{state} = 'before attribute name';
848 !!!next-input-character;
849 redo A;
850 } elsif ($self->{next_input_character} == 0x0026) { # &
851 $self->{last_attribute_value_state} = 'attribute value (single-quoted)';
852 $self->{state} = 'entity in attribute value';
853 !!!next-input-character;
854 redo A;
855 } elsif ($self->{next_input_character} == -1) {
856 !!!parse-error (type => 'unclosed attribute value');
857 if ($self->{current_token}->{type} eq 'start tag') {
858 $self->{current_token}->{first_start_tag}
859 = not defined $self->{last_emitted_start_tag_name};
860 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
861 } elsif ($self->{current_token}->{type} eq 'end tag') {
862 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
863 if ($self->{current_token}->{attributes}) {
864 !!!parse-error (type => 'end tag attribute');
865 }
866 } else {
867 die "$0: $self->{current_token}->{type}: Unknown token type";
868 }
869 $self->{state} = 'data';
870 ## reconsume
871
872 !!!emit ($self->{current_token}); # start tag or end tag
873
874 redo A;
875 } else {
876 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
877 ## Stay in the state
878 !!!next-input-character;
879 redo A;
880 }
881 } elsif ($self->{state} eq 'attribute value (unquoted)') {
882 if ($self->{next_input_character} == 0x0009 or # HT
883 $self->{next_input_character} == 0x000A or # LF
884 $self->{next_input_character} == 0x000B or # HT
885 $self->{next_input_character} == 0x000C or # FF
886 $self->{next_input_character} == 0x0020) { # SP
887 $self->{state} = 'before attribute name';
888 !!!next-input-character;
889 redo A;
890 } elsif ($self->{next_input_character} == 0x0026) { # &
891 $self->{last_attribute_value_state} = 'attribute value (unquoted)';
892 $self->{state} = 'entity in attribute value';
893 !!!next-input-character;
894 redo A;
895 } elsif ($self->{next_input_character} == 0x003E) { # >
896 if ($self->{current_token}->{type} eq 'start tag') {
897 $self->{current_token}->{first_start_tag}
898 = not defined $self->{last_emitted_start_tag_name};
899 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
900 } elsif ($self->{current_token}->{type} eq 'end tag') {
901 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
902 if ($self->{current_token}->{attributes}) {
903 !!!parse-error (type => 'end tag attribute');
904 }
905 } else {
906 die "$0: $self->{current_token}->{type}: Unknown token type";
907 }
908 $self->{state} = 'data';
909 !!!next-input-character;
910
911 !!!emit ($self->{current_token}); # start tag or end tag
912
913 redo A;
914 } elsif ($self->{next_input_character} == -1) {
915 !!!parse-error (type => 'unclosed tag');
916 if ($self->{current_token}->{type} eq 'start tag') {
917 $self->{current_token}->{first_start_tag}
918 = not defined $self->{last_emitted_start_tag_name};
919 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
920 } elsif ($self->{current_token}->{type} eq 'end tag') {
921 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
922 if ($self->{current_token}->{attributes}) {
923 !!!parse-error (type => 'end tag attribute');
924 }
925 } else {
926 die "$0: $self->{current_token}->{type}: Unknown token type";
927 }
928 $self->{state} = 'data';
929 ## reconsume
930
931 !!!emit ($self->{current_token}); # start tag or end tag
932
933 redo A;
934 } else {
935 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
936 ## Stay in the state
937 !!!next-input-character;
938 redo A;
939 }
940 } elsif ($self->{state} eq 'entity in attribute value') {
941 my $token = $self->_tokenize_attempt_to_consume_an_entity (1);
942
943 unless (defined $token) {
944 $self->{current_attribute}->{value} .= '&';
945 } else {
946 $self->{current_attribute}->{value} .= $token->{data};
947 ## ISSUE: spec says "append the returned character token to the current attribute's value"
948 }
949
950 $self->{state} = $self->{last_attribute_value_state};
951 # next-input-character is already done
952 redo A;
953 } elsif ($self->{state} eq 'bogus comment') {
954 ## (only happen if PCDATA state)
955
956 my $token = {type => 'comment', data => ''};
957
958 BC: {
959 if ($self->{next_input_character} == 0x003E) { # >
960 $self->{state} = 'data';
961 !!!next-input-character;
962
963 !!!emit ($token);
964
965 redo A;
966 } elsif ($self->{next_input_character} == -1) {
967 $self->{state} = 'data';
968 ## reconsume
969
970 !!!emit ($token);
971
972 redo A;
973 } else {
974 $token->{data} .= chr ($self->{next_input_character});
975 !!!next-input-character;
976 redo BC;
977 }
978 } # BC
979 } elsif ($self->{state} eq 'markup declaration open') {
980 ## (only happen if PCDATA state)
981
982 my @next_char;
983 push @next_char, $self->{next_input_character};
984
985 if ($self->{next_input_character} == 0x002D) { # -
986 !!!next-input-character;
987 push @next_char, $self->{next_input_character};
988 if ($self->{next_input_character} == 0x002D) { # -
989 $self->{current_token} = {type => 'comment', data => ''};
990 $self->{state} = 'comment start';
991 !!!next-input-character;
992 redo A;
993 }
994 } elsif ($self->{next_input_character} == 0x0044 or # D
995 $self->{next_input_character} == 0x0064) { # d
996 !!!next-input-character;
997 push @next_char, $self->{next_input_character};
998 if ($self->{next_input_character} == 0x004F or # O
999 $self->{next_input_character} == 0x006F) { # o
1000 !!!next-input-character;
1001 push @next_char, $self->{next_input_character};
1002 if ($self->{next_input_character} == 0x0043 or # C
1003 $self->{next_input_character} == 0x0063) { # c
1004 !!!next-input-character;
1005 push @next_char, $self->{next_input_character};
1006 if ($self->{next_input_character} == 0x0054 or # T
1007 $self->{next_input_character} == 0x0074) { # t
1008 !!!next-input-character;
1009 push @next_char, $self->{next_input_character};
1010 if ($self->{next_input_character} == 0x0059 or # Y
1011 $self->{next_input_character} == 0x0079) { # y
1012 !!!next-input-character;
1013 push @next_char, $self->{next_input_character};
1014 if ($self->{next_input_character} == 0x0050 or # P
1015 $self->{next_input_character} == 0x0070) { # p
1016 !!!next-input-character;
1017 push @next_char, $self->{next_input_character};
1018 if ($self->{next_input_character} == 0x0045 or # E
1019 $self->{next_input_character} == 0x0065) { # e
1020 ## ISSUE: What a stupid code this is!
1021 $self->{state} = 'DOCTYPE';
1022 !!!next-input-character;
1023 redo A;
1024 }
1025 }
1026 }
1027 }
1028 }
1029 }
1030 }
1031
1032 !!!parse-error (type => 'bogus comment');
1033 $self->{next_input_character} = shift @next_char;
1034 !!!back-next-input-character (@next_char);
1035 $self->{state} = 'bogus comment';
1036 redo A;
1037
1038 ## ISSUE: typos in spec: chacacters, is is a parse error
1039 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1040 } elsif ($self->{state} eq 'comment start') {
1041 if ($self->{next_input_character} == 0x002D) { # -
1042 $self->{state} = 'comment start dash';
1043 !!!next-input-character;
1044 redo A;
1045 } elsif ($self->{next_input_character} == 0x003E) { # >
1046 !!!parse-error (type => 'bogus comment');
1047 $self->{state} = 'data';
1048 !!!next-input-character;
1049
1050 !!!emit ($self->{current_token}); # comment
1051
1052 redo A;
1053 } elsif ($self->{next_input_character} == -1) {
1054 !!!parse-error (type => 'unclosed comment');
1055 $self->{state} = 'data';
1056 ## reconsume
1057
1058 !!!emit ($self->{current_token}); # comment
1059
1060 redo A;
1061 } else {
1062 $self->{current_token}->{data} # comment
1063 .= chr ($self->{next_input_character});
1064 $self->{state} = 'comment';
1065 !!!next-input-character;
1066 redo A;
1067 }
1068 } elsif ($self->{state} eq 'comment start dash') {
1069 if ($self->{next_input_character} == 0x002D) { # -
1070 $self->{state} = 'comment end';
1071 !!!next-input-character;
1072 redo A;
1073 } elsif ($self->{next_input_character} == 0x003E) { # >
1074 !!!parse-error (type => 'bogus comment');
1075 $self->{state} = 'data';
1076 !!!next-input-character;
1077
1078 !!!emit ($self->{current_token}); # comment
1079
1080 redo A;
1081 } elsif ($self->{next_input_character} == -1) {
1082 !!!parse-error (type => 'unclosed comment');
1083 $self->{state} = 'data';
1084 ## reconsume
1085
1086 !!!emit ($self->{current_token}); # comment
1087
1088 redo A;
1089 } else {
1090 $self->{current_token}->{data} # comment
1091 .= '-' . chr ($self->{next_input_character});
1092 $self->{state} = 'comment';
1093 !!!next-input-character;
1094 redo A;
1095 }
1096 } elsif ($self->{state} eq 'comment') {
1097 if ($self->{next_input_character} == 0x002D) { # -
1098 $self->{state} = 'comment end dash';
1099 !!!next-input-character;
1100 redo A;
1101 } elsif ($self->{next_input_character} == -1) {
1102 !!!parse-error (type => 'unclosed comment');
1103 $self->{state} = 'data';
1104 ## reconsume
1105
1106 !!!emit ($self->{current_token}); # comment
1107
1108 redo A;
1109 } else {
1110 $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1111 ## Stay in the state
1112 !!!next-input-character;
1113 redo A;
1114 }
1115 } elsif ($self->{state} eq 'comment end dash') {
1116 if ($self->{next_input_character} == 0x002D) { # -
1117 $self->{state} = 'comment end';
1118 !!!next-input-character;
1119 redo A;
1120 } elsif ($self->{next_input_character} == -1) {
1121 !!!parse-error (type => 'unclosed comment');
1122 $self->{state} = 'data';
1123 ## reconsume
1124
1125 !!!emit ($self->{current_token}); # comment
1126
1127 redo A;
1128 } else {
1129 $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1130 $self->{state} = 'comment';
1131 !!!next-input-character;
1132 redo A;
1133 }
1134 } elsif ($self->{state} eq 'comment end') {
1135 if ($self->{next_input_character} == 0x003E) { # >
1136 $self->{state} = 'data';
1137 !!!next-input-character;
1138
1139 !!!emit ($self->{current_token}); # comment
1140
1141 redo A;
1142 } elsif ($self->{next_input_character} == 0x002D) { # -
1143 !!!parse-error (type => 'dash in comment');
1144 $self->{current_token}->{data} .= '-'; # comment
1145 ## Stay in the state
1146 !!!next-input-character;
1147 redo A;
1148 } elsif ($self->{next_input_character} == -1) {
1149 !!!parse-error (type => 'unclosed comment');
1150 $self->{state} = 'data';
1151 ## reconsume
1152
1153 !!!emit ($self->{current_token}); # comment
1154
1155 redo A;
1156 } else {
1157 !!!parse-error (type => 'dash in comment');
1158 $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1159 $self->{state} = 'comment';
1160 !!!next-input-character;
1161 redo A;
1162 }
1163 } elsif ($self->{state} eq 'DOCTYPE') {
1164 if ($self->{next_input_character} == 0x0009 or # HT
1165 $self->{next_input_character} == 0x000A or # LF
1166 $self->{next_input_character} == 0x000B or # VT
1167 $self->{next_input_character} == 0x000C or # FF
1168 $self->{next_input_character} == 0x0020) { # SP
1169 $self->{state} = 'before DOCTYPE name';
1170 !!!next-input-character;
1171 redo A;
1172 } else {
1173 !!!parse-error (type => 'no space before DOCTYPE name');
1174 $self->{state} = 'before DOCTYPE name';
1175 ## reconsume
1176 redo A;
1177 }
1178 } elsif ($self->{state} eq 'before DOCTYPE name') {
1179 if ($self->{next_input_character} == 0x0009 or # HT
1180 $self->{next_input_character} == 0x000A or # LF
1181 $self->{next_input_character} == 0x000B or # VT
1182 $self->{next_input_character} == 0x000C or # FF
1183 $self->{next_input_character} == 0x0020) { # SP
1184 ## Stay in the state
1185 !!!next-input-character;
1186 redo A;
1187 } elsif ($self->{next_input_character} == 0x003E) { # >
1188 !!!parse-error (type => 'no DOCTYPE name');
1189 $self->{state} = 'data';
1190 !!!next-input-character;
1191
1192 !!!emit ({type => 'DOCTYPE'}); # incorrect
1193
1194 redo A;
1195 } elsif ($self->{next_input_character} == -1) {
1196 !!!parse-error (type => 'no DOCTYPE name');
1197 $self->{state} = 'data';
1198 ## reconsume
1199
1200 !!!emit ({type => 'DOCTYPE'}); # incorrect
1201
1202 redo A;
1203 } else {
1204 $self->{current_token}
1205 = {type => 'DOCTYPE',
1206 name => chr ($self->{next_input_character}),
1207 correct => 1};
1208 ## ISSUE: "Set the token's name name to the" in the spec
1209 $self->{state} = 'DOCTYPE name';
1210 !!!next-input-character;
1211 redo A;
1212 }
1213 } elsif ($self->{state} eq 'DOCTYPE name') {
1214 ## ISSUE: Redundant "First," in the spec.
1215 if ($self->{next_input_character} == 0x0009 or # HT
1216 $self->{next_input_character} == 0x000A or # LF
1217 $self->{next_input_character} == 0x000B or # VT
1218 $self->{next_input_character} == 0x000C or # FF
1219 $self->{next_input_character} == 0x0020) { # SP
1220 $self->{state} = 'after DOCTYPE name';
1221 !!!next-input-character;
1222 redo A;
1223 } elsif ($self->{next_input_character} == 0x003E) { # >
1224 $self->{state} = 'data';
1225 !!!next-input-character;
1226
1227 !!!emit ($self->{current_token}); # DOCTYPE
1228
1229 redo A;
1230 } elsif ($self->{next_input_character} == -1) {
1231 !!!parse-error (type => 'unclosed DOCTYPE');
1232 $self->{state} = 'data';
1233 ## reconsume
1234
1235 delete $self->{current_token}->{correct};
1236 !!!emit ($self->{current_token}); # DOCTYPE
1237
1238 redo A;
1239 } else {
1240 $self->{current_token}->{name}
1241 .= chr ($self->{next_input_character}); # DOCTYPE
1242 ## Stay in the state
1243 !!!next-input-character;
1244 redo A;
1245 }
1246 } elsif ($self->{state} eq 'after DOCTYPE name') {
1247 if ($self->{next_input_character} == 0x0009 or # HT
1248 $self->{next_input_character} == 0x000A or # LF
1249 $self->{next_input_character} == 0x000B or # VT
1250 $self->{next_input_character} == 0x000C or # FF
1251 $self->{next_input_character} == 0x0020) { # SP
1252 ## Stay in the state
1253 !!!next-input-character;
1254 redo A;
1255 } elsif ($self->{next_input_character} == 0x003E) { # >
1256 $self->{state} = 'data';
1257 !!!next-input-character;
1258
1259 !!!emit ($self->{current_token}); # DOCTYPE
1260
1261 redo A;
1262 } elsif ($self->{next_input_character} == -1) {
1263 !!!parse-error (type => 'unclosed DOCTYPE');
1264 $self->{state} = 'data';
1265 ## reconsume
1266
1267 delete $self->{current_token}->{correct};
1268 !!!emit ($self->{current_token}); # DOCTYPE
1269
1270 redo A;
1271 } elsif ($self->{next_input_character} == 0x0050 or # P
1272 $self->{next_input_character} == 0x0070) { # p
1273 !!!next-input-character;
1274 if ($self->{next_input_character} == 0x0055 or # U
1275 $self->{next_input_character} == 0x0075) { # u
1276 !!!next-input-character;
1277 if ($self->{next_input_character} == 0x0042 or # B
1278 $self->{next_input_character} == 0x0062) { # b
1279 !!!next-input-character;
1280 if ($self->{next_input_character} == 0x004C or # L
1281 $self->{next_input_character} == 0x006C) { # l
1282 !!!next-input-character;
1283 if ($self->{next_input_character} == 0x0049 or # I
1284 $self->{next_input_character} == 0x0069) { # i
1285 !!!next-input-character;
1286 if ($self->{next_input_character} == 0x0043 or # C
1287 $self->{next_input_character} == 0x0063) { # c
1288 $self->{state} = 'before DOCTYPE public identifier';
1289 !!!next-input-character;
1290 redo A;
1291 }
1292 }
1293 }
1294 }
1295 }
1296
1297 #
1298 } elsif ($self->{next_input_character} == 0x0053 or # S
1299 $self->{next_input_character} == 0x0073) { # s
1300 !!!next-input-character;
1301 if ($self->{next_input_character} == 0x0059 or # Y
1302 $self->{next_input_character} == 0x0079) { # y
1303 !!!next-input-character;
1304 if ($self->{next_input_character} == 0x0053 or # S
1305 $self->{next_input_character} == 0x0073) { # s
1306 !!!next-input-character;
1307 if ($self->{next_input_character} == 0x0054 or # T
1308 $self->{next_input_character} == 0x0074) { # t
1309 !!!next-input-character;
1310 if ($self->{next_input_character} == 0x0045 or # E
1311 $self->{next_input_character} == 0x0065) { # e
1312 !!!next-input-character;
1313 if ($self->{next_input_character} == 0x004D or # M
1314 $self->{next_input_character} == 0x006D) { # m
1315 $self->{state} = 'before DOCTYPE system identifier';
1316 !!!next-input-character;
1317 redo A;
1318 }
1319 }
1320 }
1321 }
1322 }
1323
1324 #
1325 } else {
1326 !!!next-input-character;
1327 #
1328 }
1329
1330 !!!parse-error (type => 'string after DOCTYPE name');
1331 $self->{state} = 'bogus DOCTYPE';
1332 # next-input-character is already done
1333 redo A;
1334 } elsif ($self->{state} eq 'before DOCTYPE public identifier') {
1335 if ({
1336 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1337 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1338 }->{$self->{next_input_character}}) {
1339 ## Stay in the state
1340 !!!next-input-character;
1341 redo A;
1342 } elsif ($self->{next_input_character} eq 0x0022) { # "
1343 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1344 $self->{state} = 'DOCTYPE public identifier (double-quoted)';
1345 !!!next-input-character;
1346 redo A;
1347 } elsif ($self->{next_input_character} eq 0x0027) { # '
1348 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1349 $self->{state} = 'DOCTYPE public identifier (single-quoted)';
1350 !!!next-input-character;
1351 redo A;
1352 } elsif ($self->{next_input_character} eq 0x003E) { # >
1353 !!!parse-error (type => 'no PUBLIC literal');
1354
1355 $self->{state} = 'data';
1356 !!!next-input-character;
1357
1358 delete $self->{current_token}->{correct};
1359 !!!emit ($self->{current_token}); # DOCTYPE
1360
1361 redo A;
1362 } elsif ($self->{next_input_character} == -1) {
1363 !!!parse-error (type => 'unclosed DOCTYPE');
1364
1365 $self->{state} = 'data';
1366 ## reconsume
1367
1368 delete $self->{current_token}->{correct};
1369 !!!emit ($self->{current_token}); # DOCTYPE
1370
1371 redo A;
1372 } else {
1373 !!!parse-error (type => 'string after PUBLIC');
1374 $self->{state} = 'bogus DOCTYPE';
1375 !!!next-input-character;
1376 redo A;
1377 }
1378 } elsif ($self->{state} eq 'DOCTYPE public identifier (double-quoted)') {
1379 if ($self->{next_input_character} == 0x0022) { # "
1380 $self->{state} = 'after DOCTYPE public identifier';
1381 !!!next-input-character;
1382 redo A;
1383 } elsif ($self->{next_input_character} == -1) {
1384 !!!parse-error (type => 'unclosed PUBLIC literal');
1385
1386 $self->{state} = 'data';
1387 ## reconsume
1388
1389 delete $self->{current_token}->{correct};
1390 !!!emit ($self->{current_token}); # DOCTYPE
1391
1392 redo A;
1393 } else {
1394 $self->{current_token}->{public_identifier} # DOCTYPE
1395 .= chr $self->{next_input_character};
1396 ## Stay in the state
1397 !!!next-input-character;
1398 redo A;
1399 }
1400 } elsif ($self->{state} eq 'DOCTYPE public identifier (single-quoted)') {
1401 if ($self->{next_input_character} == 0x0027) { # '
1402 $self->{state} = 'after DOCTYPE public identifier';
1403 !!!next-input-character;
1404 redo A;
1405 } elsif ($self->{next_input_character} == -1) {
1406 !!!parse-error (type => 'unclosed PUBLIC literal');
1407
1408 $self->{state} = 'data';
1409 ## reconsume
1410
1411 delete $self->{current_token}->{correct};
1412 !!!emit ($self->{current_token}); # DOCTYPE
1413
1414 redo A;
1415 } else {
1416 $self->{current_token}->{public_identifier} # DOCTYPE
1417 .= chr $self->{next_input_character};
1418 ## Stay in the state
1419 !!!next-input-character;
1420 redo A;
1421 }
1422 } elsif ($self->{state} eq 'after DOCTYPE public identifier') {
1423 if ({
1424 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1425 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1426 }->{$self->{next_input_character}}) {
1427 ## Stay in the state
1428 !!!next-input-character;
1429 redo A;
1430 } elsif ($self->{next_input_character} == 0x0022) { # "
1431 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1432 $self->{state} = 'DOCTYPE system identifier (double-quoted)';
1433 !!!next-input-character;
1434 redo A;
1435 } elsif ($self->{next_input_character} == 0x0027) { # '
1436 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1437 $self->{state} = 'DOCTYPE system identifier (single-quoted)';
1438 !!!next-input-character;
1439 redo A;
1440 } elsif ($self->{next_input_character} == 0x003E) { # >
1441 $self->{state} = 'data';
1442 !!!next-input-character;
1443
1444 !!!emit ($self->{current_token}); # DOCTYPE
1445
1446 redo A;
1447 } elsif ($self->{next_input_character} == -1) {
1448 !!!parse-error (type => 'unclosed DOCTYPE');
1449
1450 $self->{state} = 'data';
1451 ## reconsume
1452
1453 delete $self->{current_token}->{correct};
1454 !!!emit ($self->{current_token}); # DOCTYPE
1455
1456 redo A;
1457 } else {
1458 !!!parse-error (type => 'string after PUBLIC literal');
1459 $self->{state} = 'bogus DOCTYPE';
1460 !!!next-input-character;
1461 redo A;
1462 }
1463 } elsif ($self->{state} eq 'before DOCTYPE system identifier') {
1464 if ({
1465 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1466 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1467 }->{$self->{next_input_character}}) {
1468 ## Stay in the state
1469 !!!next-input-character;
1470 redo A;
1471 } elsif ($self->{next_input_character} == 0x0022) { # "
1472 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1473 $self->{state} = 'DOCTYPE system identifier (double-quoted)';
1474 !!!next-input-character;
1475 redo A;
1476 } elsif ($self->{next_input_character} == 0x0027) { # '
1477 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1478 $self->{state} = 'DOCTYPE system identifier (single-quoted)';
1479 !!!next-input-character;
1480 redo A;
1481 } elsif ($self->{next_input_character} == 0x003E) { # >
1482 !!!parse-error (type => 'no SYSTEM literal');
1483 $self->{state} = 'data';
1484 !!!next-input-character;
1485
1486 delete $self->{current_token}->{correct};
1487 !!!emit ($self->{current_token}); # DOCTYPE
1488
1489 redo A;
1490 } elsif ($self->{next_input_character} == -1) {
1491 !!!parse-error (type => 'unclosed DOCTYPE');
1492
1493 $self->{state} = 'data';
1494 ## reconsume
1495
1496 delete $self->{current_token}->{correct};
1497 !!!emit ($self->{current_token}); # DOCTYPE
1498
1499 redo A;
1500 } else {
1501 !!!parse-error (type => 'string after SYSTEM');
1502 $self->{state} = 'bogus DOCTYPE';
1503 !!!next-input-character;
1504 redo A;
1505 }
1506 } elsif ($self->{state} eq 'DOCTYPE system identifier (double-quoted)') {
1507 if ($self->{next_input_character} == 0x0022) { # "
1508 $self->{state} = 'after DOCTYPE system identifier';
1509 !!!next-input-character;
1510 redo A;
1511 } elsif ($self->{next_input_character} == -1) {
1512 !!!parse-error (type => 'unclosed SYSTEM literal');
1513
1514 $self->{state} = 'data';
1515 ## reconsume
1516
1517 delete $self->{current_token}->{correct};
1518 !!!emit ($self->{current_token}); # DOCTYPE
1519
1520 redo A;
1521 } else {
1522 $self->{current_token}->{system_identifier} # DOCTYPE
1523 .= chr $self->{next_input_character};
1524 ## Stay in the state
1525 !!!next-input-character;
1526 redo A;
1527 }
1528 } elsif ($self->{state} eq 'DOCTYPE system identifier (single-quoted)') {
1529 if ($self->{next_input_character} == 0x0027) { # '
1530 $self->{state} = 'after DOCTYPE system identifier';
1531 !!!next-input-character;
1532 redo A;
1533 } elsif ($self->{next_input_character} == -1) {
1534 !!!parse-error (type => 'unclosed SYSTEM literal');
1535
1536 $self->{state} = 'data';
1537 ## reconsume
1538
1539 delete $self->{current_token}->{correct};
1540 !!!emit ($self->{current_token}); # DOCTYPE
1541
1542 redo A;
1543 } else {
1544 $self->{current_token}->{system_identifier} # DOCTYPE
1545 .= chr $self->{next_input_character};
1546 ## Stay in the state
1547 !!!next-input-character;
1548 redo A;
1549 }
1550 } elsif ($self->{state} eq 'after DOCTYPE system identifier') {
1551 if ({
1552 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1553 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1554 }->{$self->{next_input_character}}) {
1555 ## Stay in the state
1556 !!!next-input-character;
1557 redo A;
1558 } elsif ($self->{next_input_character} == 0x003E) { # >
1559 $self->{state} = 'data';
1560 !!!next-input-character;
1561
1562 !!!emit ($self->{current_token}); # DOCTYPE
1563
1564 redo A;
1565 } elsif ($self->{next_input_character} == -1) {
1566 !!!parse-error (type => 'unclosed DOCTYPE');
1567
1568 $self->{state} = 'data';
1569 ## reconsume
1570
1571 delete $self->{current_token}->{correct};
1572 !!!emit ($self->{current_token}); # DOCTYPE
1573
1574 redo A;
1575 } else {
1576 !!!parse-error (type => 'string after SYSTEM literal');
1577 $self->{state} = 'bogus DOCTYPE';
1578 !!!next-input-character;
1579 redo A;
1580 }
1581 } elsif ($self->{state} eq 'bogus DOCTYPE') {
1582 if ($self->{next_input_character} == 0x003E) { # >
1583 $self->{state} = 'data';
1584 !!!next-input-character;
1585
1586 delete $self->{current_token}->{correct};
1587 !!!emit ($self->{current_token}); # DOCTYPE
1588
1589 redo A;
1590 } elsif ($self->{next_input_character} == -1) {
1591 !!!parse-error (type => 'unclosed DOCTYPE');
1592 $self->{state} = 'data';
1593 ## reconsume
1594
1595 delete $self->{current_token}->{correct};
1596 !!!emit ($self->{current_token}); # DOCTYPE
1597
1598 redo A;
1599 } else {
1600 ## Stay in the state
1601 !!!next-input-character;
1602 redo A;
1603 }
1604 } else {
1605 die "$0: $self->{state}: Unknown state";
1606 }
1607 } # A
1608
1609 die "$0: _get_next_token: unexpected case";
1610 } # _get_next_token
1611
## Attempts to consume a character reference from the input stream.
##
## The only caller visible in this part of the file (the 'entity in
## attribute value' state) invokes this method after the "&" has been
## consumed, so |$self->{next_input_character}| is the character that
## follows the "&".
##
## Arguments:
##   $self    - The parser object.
##   $in_attr - True if the reference occurs in an attribute value.  It
##              only affects named references that lack the ";" and are
##              followed by further name characters (see below).
##
## Returns |undef| if no reference is recognized (the visible caller
## then appends a literal "&"), or a hash reference of the form
## |{type => 'character', data => ...}| holding the replacement text.
##
## NOTE(review): The |!!!...| lines are macros whose expansions are not
## visible here; comments on them describe their presumed effect and
## should be confirmed against the macro definitions.
1612 sub _tokenize_attempt_to_consume_an_entity ($$) {
1613 my ($self, $in_attr) = @_;
1614
## White space, "<", "&" and EOF (-1) cannot start a reference.
1615 if ({
1616 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
1617 0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, &, EOF # 0x000D # CR
1618 }->{$self->{next_input_character}}) {
1619 ## Don't consume
1620 ## No error
1621 return undef;
1622 } elsif ($self->{next_input_character} == 0x0023) { # #
## Numeric character reference ("&#...").
1623 !!!next-input-character;
1624 if ($self->{next_input_character} == 0x0078 or # x
1625 $self->{next_input_character} == 0x0058) { # X
## Hexadecimal form.  |$code| stays undefined until the first
## hexadecimal digit has been seen, which is how "no digit at all"
## is told apart from a reference whose value is zero.
1626 my $code;
1627 X: {
## On the first pass |$x_char| is the "x" or "X" itself; it is kept so
## that it can be given back if no digit follows.  |redo X| re-enters
## this block once for every digit consumed.
1628 my $x_char = $self->{next_input_character};
1629 !!!next-input-character;
1630 if (0x0030 <= $self->{next_input_character} and
1631 $self->{next_input_character} <= 0x0039) { # 0..9
1632 $code ||= 0;
1633 $code *= 0x10;
1634 $code += $self->{next_input_character} - 0x0030;
1635 redo X;
1636 } elsif (0x0061 <= $self->{next_input_character} and
1637 $self->{next_input_character} <= 0x0066) { # a..f
1638 $code ||= 0;
1639 $code *= 0x10;
1640 $code += $self->{next_input_character} - 0x0060 + 9;
1641 redo X;
1642 } elsif (0x0041 <= $self->{next_input_character} and
1643 $self->{next_input_character} <= 0x0046) { # A..F
1644 $code ||= 0;
1645 $code *= 0x10;
1646 $code += $self->{next_input_character} - 0x0040 + 9;
1647 redo X;
1648 } elsif (not defined $code) { # no hexadecimal digit
1649 !!!parse-error (type => 'bare hcro');
## Presumably un-reads the "x"/"X" and the character after it; "#" is
## then made the current character again, so that the caller sees the
## "&" followed by "#x..." as plain text.  TODO: confirm against the
## macro definition.
1650 !!!back-next-input-character ($x_char, $self->{next_input_character});
1651 $self->{next_input_character} = 0x0023; # #
1652 return undef;
1653 } elsif ($self->{next_input_character} == 0x003B) { # ;
1654 !!!next-input-character;
1655 } else {
## The terminating ";" is missing; the reference is still accepted.
1656 !!!parse-error (type => 'no refc');
1657 }
1658
## Replace code points that are not returned as-is: zero, surrogates
## and values above U+10FFFF become U+FFFD, CR becomes LF, and
## 0x80..0x9F are mapped through |$c1_entity_char|.
1659 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
1660 !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
1661 $code = 0xFFFD;
1662 } elsif ($code > 0x10FFFF) {
1663 !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
1664 $code = 0xFFFD;
1665 } elsif ($code == 0x000D) {
1666 !!!parse-error (type => 'CR character reference');
1667 $code = 0x000A;
1668 } elsif (0x80 <= $code and $code <= 0x9F) {
1669 !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
1670 $code = $c1_entity_char->{$code};
1671 }
1672
1673 return {type => 'character', data => chr $code};
1674 } # X
1675 } elsif (0x0030 <= $self->{next_input_character} and
1676 $self->{next_input_character} <= 0x0039) { # 0..9
## Decimal form: accumulate all consecutive digits.
1677 my $code = $self->{next_input_character} - 0x0030;
1678 !!!next-input-character;
1679
1680 while (0x0030 <= $self->{next_input_character} and
1681 $self->{next_input_character} <= 0x0039) { # 0..9
1682 $code *= 10;
1683 $code += $self->{next_input_character} - 0x0030;
1684
1685 !!!next-input-character;
1686 }
1687
1688 if ($self->{next_input_character} == 0x003B) { # ;
1689 !!!next-input-character;
1690 } else {
1691 !!!parse-error (type => 'no refc');
1692 }
1693
## Same code point replacement rules as in the hexadecimal branch.
1694 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
1695 !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
1696 $code = 0xFFFD;
1697 } elsif ($code > 0x10FFFF) {
1698 !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
1699 $code = 0xFFFD;
1700 } elsif ($code == 0x000D) {
1701 !!!parse-error (type => 'CR character reference');
1702 $code = 0x000A;
1703 } elsif (0x80 <= $code and $code <= 0x9F) {
1704 !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
1705 $code = $c1_entity_char->{$code};
1706 }
1707
1708 return {type => 'character', data => chr $code};
1709 } else {
## "&#" followed by neither "x"/"X" nor a digit: presumably un-read the
## current character and make "#" current again, so that "&#" ends up
## as plain text.  TODO: confirm against the macro definition.
1710 !!!parse-error (type => 'bare nero');
1711 !!!back-next-input-character ($self->{next_input_character});
1712 $self->{next_input_character} = 0x0023; # #
1713 return undef;
1714 }
1715 } elsif ((0x0041 <= $self->{next_input_character} and
1716 $self->{next_input_character} <= 0x005A) or
1717 (0x0061 <= $self->{next_input_character} and
1718 $self->{next_input_character} <= 0x007A)) {
## Named reference: the first character is an ASCII letter.
1719 my $entity_name = chr $self->{next_input_character};
1720 !!!next-input-character;
1721
## $entity_name - The raw characters consumed so far.
## $value       - The replacement text so far: the value of the most
##                recently matched name followed by any characters
##                consumed after that match.
## $match       - 0: no name matched yet; 1: a name ending with ";"
##                matched; -1: the last consumed character completed a
##                name without ";"; doubled (-2, -4, ...) for every
##                non-matching character consumed after such a match.
1722 my $value = $entity_name;
1723 my $match = 0;
1724 require Whatpm::_NamedEntityList;
1725 our $EntityChar;
1726
1727 while (length $entity_name < 10 and
1728 ## NOTE: Some number greater than the maximum length of entity name
1729 ((0x0041 <= $self->{next_input_character} and # A
1730 $self->{next_input_character} <= 0x005A) or # Z
1731 (0x0061 <= $self->{next_input_character} and # a
1732 $self->{next_input_character} <= 0x007A) or # z
1733 (0x0030 <= $self->{next_input_character} and # 0
1734 $self->{next_input_character} <= 0x0039) or # 9
1735 $self->{next_input_character} == 0x003B)) { # ;
1736 $entity_name .= chr $self->{next_input_character};
1737 if (defined $EntityChar->{$entity_name}) {
1738 if ($self->{next_input_character} == 0x003B) { # ;
## Properly terminated name: stop scanning.
1739 $value = $EntityChar->{$entity_name};
1740 $match = 1;
1741 !!!next-input-character;
1742 last;
1743 } else {
## Name matched without ";": remember it, but keep scanning because a
## longer name may still match.
1744 $value = $EntityChar->{$entity_name};
1745 $match = -1;
1746 !!!next-input-character;
1747 }
1748 } else {
1749 $value .= chr $self->{next_input_character};
1750 $match *= 2;
1751 !!!next-input-character;
1752 }
1753 }
1754
1755 if ($match > 0) {
1756 return {type => 'character', data => $value};
1757 } elsif ($match < 0) {
1758 !!!parse-error (type => 'no refc');
## In an attribute value, a ";"-less match that is followed by further
## name characters is not expanded; the consumed text is returned
## literally instead.
1759 if ($in_attr and $match < -1) {
1760 return {type => 'character', data => '&'.$entity_name};
1761 } else {
1762 return {type => 'character', data => $value};
1763 }
1764 } else {
## No name matched: the consumed characters are returned as literal
## text after the "&".
1765 !!!parse-error (type => 'bare ero');
1766 ## NOTE: No characters are consumed in the spec.
1767 return {type => 'character', data => '&'.$value};
1768 }
1769 } else {
1770 ## no characters are consumed
1771 !!!parse-error (type => 'bare ero');
1772 return undef;
1773 }
1774 } # _tokenize_attempt_to_consume_an_entity
1775
## Prepares the document for the tree construction stage.
##
## NOTE: The caller MUST set |$self->{document}| before invoking this
## method.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  ## Switch strict error checking off for the duration of the parse.
  $document->strict_error_checking (0);

  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on

  ## Flag the document as an HTML document. # MUST
  $document->manakai_is_html (1);
} # _initialize_tree_constructor
1784
## Restores the document settings once the tree construction stage has
## finished.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  ## Switch strict error checking back on.
  $document->strict_error_checking (1);

  ## TODO: Turn mutation events on
} # _terminate_tree_constructor
1790
1791 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
1792
1793 { # tree construction stage
1794 my $token;
1795
## Entry point of the tree construction stage: obtains the first token,
## resets the tree construction state kept on the parser object and
## then runs the initial, root element and main phases in that order.
sub _construct_tree ($) {
  my $self = shift;

  ## NOTE: It is not defined when an interactive UA makes
  ## |$self->{document}| available to the user, nor when it begins
  ## accepting user input.

  ## NOTE: "Append a character" means: collect that character and all
  ## subsequent consecutive characters, then insert a single Text node
  ## whose data is the concatenation of all of them. # MUST

  ## Presumably loads the first token into the file-scoped |$token|
  ## (macro expansion is not visible here).
  !!!next-token;

  ## Reset the tree construction state.
  $self->{insertion_mode} = 'before head';
  $self->{open_elements} = [];
  $self->{$_} = undef for qw(form_element head_element inner_html_node);

  ## Run the three phases.
  $self->_tree_construction_initial; # MUST
  $self->_tree_construction_root_element;
  $self->_tree_construction_main;
} # _construct_tree
1819
## Initial phase of tree construction.
##
## Processes tokens (read from the file-scoped |$token|) until the
## DOCTYPE, or the first token showing that there is none, is seen:
##
##   - DOCTYPE: report "not HTML5" where applicable, append a
##     DocumentType node to the document, select the compatibility
##     mode from the name / public identifier / system identifier,
##     fetch the next token and return.
##   - start tag, end tag, end-of-file, or character data that is not
##     entirely white space: report "no DOCTYPE", select quirks mode and
##     return WITHOUT fetching a new token, so that the caller's next
##     phase reprocesses it.
##   - white space only character data: ignored.
##   - comment: appended to the document.
##
## Dies on any other token type.
sub _tree_construction_initial ($) {
  my $self = shift;
  INITIAL: {
    if ($token->{type} eq 'DOCTYPE') {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!parse-error (type => 'not HTML5');
      } elsif ($doctype_name ne 'HTML') {
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5');
      }

      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if (not $token->{correct} or $doctype_name ne 'HTML') {
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## The public identifier is compared ASCII case-insensitively:
        ## it is upper-cased here, so EVERY literal it is compared with
        ## below has to be written in upper case.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        if ({
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
          "-//IETF//DTD HTML 2.0//EN" => 1,
          "-//IETF//DTD HTML 2.1E//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN//" => 1,
          "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
          "-//IETF//DTD HTML 3.2//EN" => 1,
          "-//IETF//DTD HTML 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT//EN" => 1,
          "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
          "-//IETF//DTD HTML//EN" => 1,
          "-//IETF//DTD HTML//EN//2.0" => 1,
          "-//IETF//DTD HTML//EN//3.0" => 1,
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
          "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
          "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
          "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
          "-//W3C//DTD HTML 3.2//EN" => 1,
          "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
          "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
          ## NOTE: The misspelled "EXPERIMETNAL" key is kept for
          ## backward compatibility; the correctly spelled identifier
          ## is the one documents can actually be expected to use.
          "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
          "-//W3C//DTD W3 HTML//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
          "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
          "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
          "HTML" => 1,
        }->{$pubid}) {
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
          ## HTML 4.01 Frameset/Transitional: quirks when the system
          ## identifier is MISSING, limited quirks when it is present.
          if (defined $token->{system_identifier}) {
            $self->{document}->manakai_compat_mode ('limited quirks');
          } else {
            $self->{document}->manakai_compat_mode ('quirks');
          }
        } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
          $self->{document}->manakai_compat_mode ('limited quirks');
        }
      }
      if (defined $token->{system_identifier}) {
        ## The system identifier is compared in lower case.  A match
        ## forces quirks mode whatever was selected above.
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          $self->{document}->manakai_compat_mode ('quirks');
        }
      }

      ## Go to the root element phase.
      !!!next-token;
      return;
    } elsif ({
              'start tag' => 1,
              'end tag' => 1,
              'end-of-file' => 1,
             }->{$token->{type}}) {
      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} eq 'character') {
      ## Leading white space is dropped from the token itself, so that
      ## only the remainder (if any) is reprocessed by the next phase.
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          ## Stay in the phase
          !!!next-token;
          redo INITIAL;
        }
      }

      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} eq 'comment') {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the phase.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token";
    }
  } # INITIAL
} # _tree_construction_initial
1987
## Root element phase of tree construction.
##
## Skips DOCTYPE tokens (reported as a parse error) and white space
## only character tokens, appends comments to the document, and on
## any other known token creates the |html| root element, appends it
## to the document and pushes it onto the stack of open elements.  The
## token that triggered the creation is left in |$token| for the main
## phase.  Dies on an unknown token type.
sub _tree_construction_root_element ($) {
  my $self = shift;

  B: {
    my $type = $token->{type};

    if ($type eq 'comment') {
      $self->{document}->append_child
          ($self->{document}->create_comment ($token->{data}));
      ## Stay in the phase
      !!!next-token;
      redo B;
    }

    if ($type eq 'DOCTYPE') {
      !!!parse-error (type => 'in html:#DOCTYPE');
      ## Ignore the token
      ## Stay in the phase
      !!!next-token;
      redo B;
    }

    if ($type eq 'character') {
      ## Leading white space is ignored; when nothing else is left,
      ## the whole token is.
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)// # \x0D
          and not length $token->{data}) {
        ## Stay in the phase
        !!!next-token;
        redo B;
      }
    } elsif (not ($type eq 'start tag' or
                  $type eq 'end tag' or
                  $type eq 'end-of-file')) {
      die "$0: $token->{type}: Unknown token";
    }
    ## ISSUE: There is an issue in the spec (start tag, end tag and
    ## end-of-file tokens).

    my $root_element;
    !!!create-element ($root_element, 'html');
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, 'html'];
    ## reprocess
    #redo B;
    return; ## Go to the main phase.
  } # B
} # _tree_construction_root_element
2033
## Reset the insertion mode appropriately: walk the stack of open
## elements from the current node downwards and set
## |$self->{insertion_mode}| from the first node whose local name
## determines a mode.  Each stack entry is |[$node, $local_name]|.
sub _reset_insertion_mode ($) {
  my ($self) = @_;

  ## Step 4..13: local name => insertion mode
  my %mode_for = (
    select => 'in select',
    td => 'in cell',
    th => 'in cell',
    tr => 'in row',
    tbody => 'in table body',
    thead => 'in table body',
    tfoot => 'in table body',
    caption => 'in caption',
    colgroup => 'in column group',
    table => 'in table',
    head => 'in body', # not in head!
    body => 'in body',
    frameset => 'in frameset',
  );

  ## Step 1
  my $is_last;

  ## Step 2
  my $depth = -1;
  my $node = $self->{open_elements}->[$depth];

  while (1) {
    ## Step 3
    ## ISSUE: Oops! "If node is the first node in the stack of open
    ## elements, then set last to true. If the context element of the
    ## HTML fragment parsing algorithm is neither a td element nor a
    ## th element, then set node to the context element. (fragment case)":
    ## The second "if" is in the scope of the first "if"!?
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $is_last = 1;
      my $context = $self->{inner_html_node};
      if (defined $context and
          not ($context->[1] eq 'td' or $context->[1] eq 'th')) {
        $node = $context;
      }
    }

    ## Step 4..13
    my $mode = $mode_for{$node->[1]};
    if (defined $mode) {
      $self->{insertion_mode} = $mode;
      return;
    }

    ## Step 14
    if ($node->[1] eq 'html') {
      $self->{insertion_mode}
          = defined $self->{head_element} ? 'after head' : 'before head';
      return;
    }

    ## Step 15
    if ($is_last) {
      $self->{insertion_mode} = 'in body';
      return;
    }

    ## Step 16
    $depth--;
    $node = $self->{open_elements}->[$depth];

    ## Step 17: continue with step 3
  }
} # _reset_insertion_mode
2102
2103 sub _tree_construction_main ($) {
2104 my $self = shift;
2105
2106 my $active_formatting_elements = [];
2107
## Reconstruct the active formatting elements.
##
## Starting from the last entry of the list of active formatting
## elements, rewinds to just after the nearest marker or entry that
## is still on the stack of open elements, then, for every entry from
## there to the end of the list, creates a shallow clone, hands it to
## the |$insert| callback (first argument), pushes it onto the stack
## of open elements and replaces the list entry by it.
my $reconstruct_active_formatting_elements = sub { # MUST
  my ($insert) = @_;
  my $list = $active_formatting_elements;

  ## True if the node of the given entry is on the stack of open
  ## elements.
  my $is_open = sub {
    my ($candidate) = @_;
    for my $open (@{$self->{open_elements}}) {
      return 1 if $candidate->[0] eq $open->[0];
    }
    return 0;
  };

  ## Step 1
  return unless @$list;

  ## Step 3
  ## NOTE: |$pos| is a negative index, counted from the end of the list.
  my $pos = -1;
  my $entry = $list->[$pos];

  ## Step 2
  return if $entry->[0] eq '#marker' or $is_open->($entry);

  REWIND: {
    ## Step 4
    unless ($list->[0]->[0] eq $entry->[0]) {
      ## Step 5
      $pos--;
      $entry = $list->[$pos];

      ## Step 6
      redo REWIND unless $entry->[0] eq '#marker' or $is_open->($entry);

      ## Step 7
      $pos++;
      $entry = $list->[$pos];
    }
  } # REWIND

  CREATE: while (1) {
    ## Step 8
    my $clone = [$entry->[0]->clone_node (0), $entry->[1]];

    ## Step 9
    $insert->($clone->[0]);
    push @{$self->{open_elements}}, $clone;

    ## Step 10
    $list->[$pos] = $self->{open_elements}->[-1];

    ## Step 11
    last CREATE if $clone->[0] eq $list->[-1]->[0];

    ## Step 7'
    $pos++;
    $entry = $list->[$pos];
  } # CREATE
}; # $reconstruct_active_formatting_elements
2178
## Clear the list of active formatting elements up to the last marker:
## the last '#marker' entry and every entry after it are removed.  The
## list is left untouched when it contains no marker.
my $clear_up_to_marker = sub {
  my $index = $#$active_formatting_elements;
  while ($index >= 0) {
    if ($active_formatting_elements->[$index]->[0] eq '#marker') {
      splice @$active_formatting_elements, $index;
      return;
    }
    $index--;
  }
}; # $clear_up_to_marker
2187
## Generic CDATA / RCDATA parsing for the current start tag token.
##
## Arguments: the content model flag to tokenize the element content
## with (CDATA_CONTENT_MODEL or RCDATA_CONTENT_MODEL) and a callback
## that inserts the newly created element into the tree.  All directly
## following character tokens become a single Text child of the
## element.  On return |$token| is the token after the one that ended
## the character run.
my $parse_rcdata = sub ($$) {
  my ($content_model_flag, $insert) = @_;

  ## Step 1
  my $start_tag_name = $token->{tag_name};
  my $element;
  !!!create-element ($element, $start_tag_name, $token->{attributes});

  ## Step 2
  $insert->($element); # /context node/->append_child ($element)

  ## Step 3
  $self->{content_model} = $content_model_flag; # CDATA or RCDATA
  delete $self->{escape}; # MUST

  ## Step 4
  my $buffer = '';
  !!!next-token;
  until ($token->{type} ne 'character') { # or until stop tokenizing
    $buffer .= $token->{data};
    !!!next-token;
  }

  ## Step 5
  $element->append_child ($self->{document}->create_text_node ($buffer))
      if length $buffer;

  ## Step 6
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## Step 7
  ## The matching end tag is silently ignored; anything else is a
  ## parse error whose type depends on the content model.
  unless ($token->{type} eq 'end tag' and
          $token->{tag_name} eq $start_tag_name) {
    if ($content_model_flag == CDATA_CONTENT_MODEL) {
      !!!parse-error (type => 'in CDATA:#'.$token->{type});
    } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
      !!!parse-error (type => 'in RCDATA:#'.$token->{type});
    } else {
      die "$0: $content_model_flag in parse_rcdata";
    }
  }
  !!!next-token;
}; # $parse_rcdata
2232
## Handle a |script| start tag token.
##
## Argument: a callback that inserts the |script| element into the
## tree.  The element content is tokenized with the CDATA content
## model and appended to the element as text.  The element is only
## inserted (through the callback) when |$self->{inner_html_node}| is
## not defined.  On return |$token| is the token after the one that
## ended the character run.
my $script_start_tag = sub ($) {
  my ($insert) = @_;

  my $script_el;
  !!!create-element ($script_el, 'script', $token->{attributes});
  ## TODO: mark as "parser-inserted"

  $self->{content_model} = CDATA_CONTENT_MODEL;
  delete $self->{escape}; # MUST

  ## Collect the script text: stop at the first non-character token
  ## (or when the tokenizer stops tokenising).
  my $source = '';
  !!!next-token;
  until ($token->{type} ne 'character') {
    $source .= $token->{data};
    !!!next-token;
  }
  $script_el->manakai_append_text ($source) if length $source;

  $self->{content_model} = PCDATA_CONTENT_MODEL;

  unless ($token->{type} eq 'end tag' and
          $token->{tag_name} eq 'script') {
    !!!parse-error (type => 'in CDATA:#'.$token->{type});
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  }
  ## Otherwise: the </script> end tag is ignored.

  unless (defined $self->{inner_html_node}) {
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    $insert->($script_el);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  }
  ## TODO: in the fragment case, mark as "already executed"

  !!!next-token;
}; # $script_start_tag
2278
## Handle the end tag of a formatting element (the "adoption agency"
## steps).
##
## Argument: the tag name of the end tag.  Entries of the list of
## active formatting elements and of the stack of open elements are
## |[$node, $local_name]| pairs; nodes are compared with |eq|.
##
## The closure returns (after fetching the next token) when
##   - no active formatting element with that tag name is found after
##     the last marker (parse error),
##   - the formatting element is on the stack of open elements but not
##     in scope (parse error),
##   - the formatting element is not on the stack of open elements
##     (parse error; its entry is removed from the active list), or
##   - there is no furthest block (the stack is popped up to and
##     including the formatting element and its active entry removed).
## Otherwise the tree is rearranged and the steps are run again.
my $formatting_end_tag = sub {
  my $tag_name = shift;

  FET: {
    ## Step 1
    ## Find the last active formatting element with the given tag
    ## name, not looking beyond the last marker.
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!parse-error (type => 'unmatched end tag:'.$tag_name);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ({
                table => 1, caption => 1, td => 1, th => 1,
                button => 1, marquee => 1, object => 1, html => 1,
               }->{$node->[1]}) {
        ## A scoping element lies between the current node and
        ## whatever is found further down the stack.
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
      ## Remove the formatting element itself from the list.  It is
      ## not necessarily the last entry, since step 1 skips later
      ## entries with other tag names, so |pop| would be wrong here.
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
    }

    ## Step 2
    ## The furthest block is the element nearest to the formatting
    ## element (above it in the stack) that is special or scoping
    ## and not formatting; scanning from the top of the stack, the
    ## last candidate seen before the formatting element wins.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not $formatting_category->{$node->[1]} and
          #not $phrasing_category->{$node->[1]} and
          ($special_category->{$node->[1]} or
           $scoping_category->{$node->[1]})) {
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## NOTE(review): when the formatting element is the first entry
    ## of the list, index -1 selects the LAST entry -- confirm that
    ## this is intended.
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
            ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      ## A node that is not an active formatting element is dropped
      ## from the stack of open elements and the next one is tried.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      ## A node that already has children is replaced by a shallow
      ## clone in both lists.
      if ($node->[0]->has_child_nodes ()) {
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    $common_ancestor_node->[0]->append_child ($last_node->[0]);

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## The child list is copied first, as the children are moved
    ## while being iterated over.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## Remove the formatting element from the list of active
    ## formatting elements and insert the clone after the bookmark.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$active_formatting_elements, $_, 1;
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    ## Same for the stack of open elements, relative to the furthest
    ## block.
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        splice @{$self->{open_elements}}, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        $i = $_;
      }
    } # OE
    splice @{$self->{open_elements}}, $i + 1, 1, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
2461
## Default insertion callback: append the given node to the current
## node, i.e. the node of the last entry of the stack of open elements.
my $insert_to_current = sub {
  my ($child) = @_;
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($child);
}; # $insert_to_current
2465
## Insertion callback with foster parenting.
##
## When the current node is a |table|, |tbody|, |tfoot|, |thead| or
## |tr| element, the given node is not appended to it.  Instead the
## last |table| on the stack of open elements is looked up: if its
## parent is an element (node type 1), the node is inserted into that
## parent just before the table; otherwise it is appended to the
## element below the table on the stack.  Without any |table| on the
## stack the first element of the stack is used.  In any other case
## the node is simply appended to the current node.
my $insert_to_foster = sub {
  my ($child) = @_;
  my $current = $self->{open_elements}->[-1];

  unless (grep { $current->[1] eq $_ } qw(table tbody tfoot thead tr)) {
    return $current->[0]->append_child ($child);
  }

  # MUST
  my ($foster_parent_element, $next_sibling);
  for my $index (reverse 0..$#{$self->{open_elements}}) {
    my $candidate = $self->{open_elements}->[$index];
    next unless $candidate->[1] eq 'table';

    my $parent = $candidate->[0]->parent_node;
    if (defined $parent and $parent->node_type == 1) {
      ($foster_parent_element, $next_sibling) = ($parent, $candidate->[0]);
    } else {
      $foster_parent_element = $self->{open_elements}->[$index - 1]->[0];
    }
    last;
  }
  $foster_parent_element = $self->{open_elements}->[0]->[0]
      unless defined $foster_parent_element;

  return $foster_parent_element->insert_before ($child, $next_sibling);
}; # $insert_to_foster
2496
2497 my $insert;
2498
2499 B: {
2500 if ($token->{type} eq 'DOCTYPE') {
2501 !!!parse-error (type => 'DOCTYPE in the middle');
2502 ## Ignore the token
2503 ## Stay in the phase
2504 !!!next-token;
2505 redo B;
2506 } elsif ($token->{type} eq 'end-of-file') {
2507 if ($self->{insertion_mode} eq 'after html body' or
2508 $self->{insertion_mode} eq 'after html frameset') {
2509 #
2510 } else {
2511 ## Generate implied end tags
2512 if ({
2513 dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
2514 tbody => 1, tfoot=> 1, thead => 1,
2515 }->{$self->{open_elements}->[-1]->[1]}) {
2516 !!!back-token;
2517 $token = {type => 'end tag', tag_name => $self->{open_elements}->[-1]->[1]};
2518 redo B;
2519 }
2520
2521 if (@{$self->{open_elements}} > 2 or
2522 (@{$self->{open_elements}} == 2 and $self->{open_elements}->[1]->[1] ne 'body')) {
2523 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2524 } elsif (defined $self->{inner_html_node} and
2525 @{$self->{open_elements}} > 1 and
2526 $self->{open_elements}->[1]->[1] ne 'body') {
2527 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2528 }
2529
2530 ## ISSUE: There is an issue in the spec.
2531 }
2532
2533 ## Stop parsing
2534 last B;
2535 } elsif ($token->{type} eq 'start tag' and
2536 $token->{tag_name} eq 'html') {
2537 if ($self->{insertion_mode} eq 'after html body') {
2538 ## Turn into the main phase
2539 !!!parse-error (type => 'after html:html');
2540 $self->{insertion_mode} = 'after body';
2541 } elsif ($self->{insertion_mode} eq 'after html frameset') {
2542 ## Turn into the main phase
2543 !!!parse-error (type => 'after html:html');
2544 $self->{insertion_mode} = 'after frameset';
2545 }
2546
2547 ## ISSUE: "aa<html>" is not a parse error.
2548 ## ISSUE: "<html>" in fragment is not a parse error.
2549 unless ($token->{first_start_tag}) {
2550 !!!parse-error (type => 'not first start tag');
2551 }
2552 my $top_el = $self->{open_elements}->[0]->[0];
2553 for my $attr_name (keys %{$token->{attributes}}) {
2554 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
2555 $top_el->set_attribute_ns
2556 (undef, [undef, $attr_name],
2557 $token->{attributes}->{$attr_name}->{value});
2558 }
2559 }
2560 !!!next-token;
2561 redo B;
2562 } elsif ($token->{type} eq 'comment') {
2563 my $comment = $self->{document}->create_comment ($token->{data});
2564 if ($self->{insertion_mode} eq 'after html body' or
2565 $self->{insertion_mode} eq 'after html frameset') {
2566 $self->{document}->append_child ($comment);
2567 } elsif ($self->{insertion_mode} eq 'after body') {
2568 $self->{open_elements}->[0]->[0]->append_child ($comment);
2569 } else {
2570 $self->{open_elements}->[-1]->[0]->append_child ($comment);
2571 }
2572 !!!next-token;
2573 redo B;
2574 } elsif ($self->{insertion_mode} eq 'in head' or
2575 $self->{insertion_mode} eq 'in head noscript' or
2576 $self->{insertion_mode} eq 'after head' or
2577 $self->{insertion_mode} eq 'before head') {
2578 if ($token->{type} eq 'character') {
2579 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2580 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
2581 unless (length $token->{data}) {
2582 !!!next-token;
2583 redo B;
2584 }
2585 }
2586
2587 if ($self->{insertion_mode} eq 'before head') {
2588 ## As if <head>
2589 !!!create-element ($self->{head_element}, 'head');
2590 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2591 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2592
2593 ## Reprocess in the "in head" insertion mode...
2594 pop @{$self->{open_elements}};
2595
2596 ## Reprocess in the "after head" insertion mode...
2597 } elsif ($self->{insertion_mode} eq 'in head noscript') {
2598 ## As if </noscript>
2599 pop @{$self->{open_elements}};
2600 !!!parse-error (type => 'in noscript:#character');
2601
2602 ## Reprocess in the "in head" insertion mode...
2603 ## As if </head>
2604 pop @{$self->{open_elements}};
2605
2606 ## Reprocess in the "after head" insertion mode...
2607 } elsif ($self->{insertion_mode} eq 'in head') {
2608 pop @{$self->{open_elements}};
2609
2610 ## Reprocess in the "after head" insertion mode...
2611 }
2612
2613 ## "after head" insertion mode
2614 ## As if <body>
2615 !!!insert-element ('body');
2616 $self->{insertion_mode} = 'in body';
2617 ## reprocess
2618 redo B;
2619 } elsif ($token->{type} eq 'start tag') {
2620 if ($token->{tag_name} eq 'head') {
2621 if ($self->{insertion_mode} eq 'before head') {
2622 !!!create-element ($self->{head_element}, $token->{tag_name}, $token->{attributes});
2623 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2624 push @{$self->{open_elements}}, [$self->{head_element}, $token->{tag_name}];
2625 $self->{insertion_mode} = 'in head';
2626 !!!next-token;
2627 redo B;
2628 } elsif ($self->{insertion_mode} ne 'after head') {
2629 !!!parse-error (type => 'in head:head'); # or in head noscript
2630 ## Ignore the token
2631 !!!next-token;
2632 redo B;
2633 } else {
2634 #
2635 }
2636 } elsif ($self->{insertion_mode} eq 'before head') {
2637 ## As if <head>
2638 !!!create-element ($self->{head_element}, 'head');
2639 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2640 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2641
2642 $self->{insertion_mode} = 'in head';
2643 ## Reprocess in the "in head" insertion mode...
2644 }
2645
2646 if ($token->{tag_name} eq 'base') {
2647 if ($self->{insertion_mode} eq 'in head noscript') {
2648 ## As if </noscript>
2649 pop @{$self->{open_elements}};
2650 !!!parse-error (type => 'in noscript:base');
2651
2652 $self->{insertion_mode} = 'in head';
2653 ## Reprocess in the "in head" insertion mode...
2654 }
2655
2656 ## NOTE: There is a "as if in head" code clone.
2657 if ($self->{insertion_mode} eq 'after head') {
2658 !!!parse-error (type => 'after head:'.$token->{tag_name});
2659 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2660 }
2661 !!!insert-element ($token->{tag_name}, $token->{attributes});
2662 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2663 pop @{$self->{open_elements}}
2664 if $self->{insertion_mode} eq 'after head';
2665 !!!next-token;
2666 redo B;
2667 } elsif ($token->{tag_name} eq 'link') {
2668 ## NOTE: There is a "as if in head" code clone.
2669 if ($self->{insertion_mode} eq 'after head') {
2670 !!!parse-error (type => 'after head:'.$token->{tag_name});
2671 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2672 }
2673 !!!insert-element ($token->{tag_name}, $token->{attributes});
2674 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2675 pop @{$self->{open_elements}}
2676 if $self->{insertion_mode} eq 'after head';
2677 !!!next-token;
2678 redo B;
2679 } elsif ($token->{tag_name} eq 'meta') {
2680 ## NOTE: There is a "as if in head" code clone.
2681 if ($self->{insertion_mode} eq 'after head') {
2682 !!!parse-error (type => 'after head:'.$token->{tag_name});
2683 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2684 }
2685 !!!insert-element ($token->{tag_name}, $token->{attributes});
2686 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2687
2688 unless ($self->{confident}) {
2689 my $charset;
2690 if ($token->{attributes}->{charset}) { ## TODO: And if supported
2691 $charset = $token->{attributes}->{charset}->{value};
2692 }
2693 if ($token->{attributes}->{'http-equiv'}) {
2694 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
2695 if ($token->{attributes}->{'http-equiv'}->{value}
2696 =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
2697 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
2698 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
2699 $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
2700 } ## TODO: And if supported
2701 }
2702 ## TODO: Change the encoding
2703 }
2704
2705 ## TODO: Extracting |charset| from |meta|.
2706 pop @{$self->{open_elements}}
2707 if $self->{insertion_mode} eq 'after head';
2708 !!!next-token;
2709 redo B;
2710 } elsif ($token->{tag_name} eq 'title') {
2711 if ($self->{insertion_mode} eq 'in head noscript') {
2712 ## As if </noscript>
2713 pop @{$self->{open_elements}};
2714 !!!parse-error (type => 'in noscript:title');
2715
2716 $self->{insertion_mode} = 'in head';
2717 ## Reprocess in the "in head" insertion mode...
2718 } elsif ($self->{insertion_mode} eq 'after head') {
2719 !!!parse-error (type => 'after head:'.$token->{tag_name});
2720 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2721 }
2722
2723 ## NOTE: There is a "as if in head" code clone.
2724 my $parent = defined $self->{head_element} ? $self->{head_element}
2725 : $self->{open_elements}->[-1]->[0];
2726 $parse_rcdata->(RCDATA_CONTENT_MODEL,
2727 sub { $parent->append_child ($_[0]) });
2728 pop @{$self->{open_elements}}
2729 if $self->{insertion_mode} eq 'after head';
2730 redo B;
2731 } elsif ($token->{tag_name} eq 'style') {
2732 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
2733 ## insertion mode 'in head')
2734 ## NOTE: There is a "as if in head" code clone.
2735 if ($self->{insertion_mode} eq 'after head') {
2736 !!!parse-error (type => 'after head:'.$token->{tag_name});
2737 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2738 }
2739 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
2740 pop @{$self->{open_elements}}
2741 if $self->{insertion_mode} eq 'after head';
2742 redo B;
2743 } elsif ($token->{tag_name} eq 'noscript') {
2744 if ($self->{insertion_mode} eq 'in head') {
2745 ## NOTE: and scripting is disalbed
2746 !!!insert-element ($token->{tag_name}, $token->{attributes});
2747 $self->{insertion_mode} = 'in head noscript';
2748 !!!next-token;
2749 redo B;
2750 } elsif ($self->{insertion_mode} eq 'in head noscript') {
2751 !!!parse-error (type => 'in noscript:noscript');
2752 ## Ignore the token
2753 !!!next-token;
2754 redo B;
2755 } else {
2756 #
2757 }
2758 } elsif ($token->{tag_name} eq 'script') {
2759 if ($self->{insertion_mode} eq 'in head noscript') {
2760 ## As if </noscript>
2761 pop @{$self->{open_elements}};
2762 !!!parse-error (type => 'in noscript:script');
2763
2764 $self->{insertion_mode} = 'in head';
2765 ## Reprocess in the "in head" insertion mode...
2766 } elsif ($self->{insertion_mode} eq 'after head') {
2767 !!!parse-error (type => 'after head:'.$token->{tag_name});
2768 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2769 }
2770
2771 ## NOTE: There is a "as if in head" code clone.
2772 $script_start_tag->($insert_to_current);
2773 pop @{$self->{open_elements}}
2774 if $self->{insertion_mode} eq 'after head';
2775 redo B;
2776 } elsif ($token->{tag_name} eq 'body' or
2777 $token->{tag_name} eq 'frameset') {
2778 if ($self->{insertion_mode} eq 'in head noscript') {
2779 ## As if </noscript>
2780 pop @{$self->{open_elements}};
2781 !!!parse-error (type => 'in noscript:'.$token->{tag_name});
2782
2783 ## Reprocess in the "in head" insertion mode...
2784 ## As if </head>
2785 pop @{$self->{open_elements}};
2786
2787 ## Reprocess in the "after head" insertion mode...
2788 } elsif ($self->{insertion_mode} eq 'in head') {
2789 pop @{$self->{open_elements}};
2790
2791 ## Reprocess in the "after head" insertion mode...
2792 }
2793
2794 ## "after head" insertion mode
2795 !!!insert-element ($token->{tag_name}, $token->{attributes});
2796 $self->{insertion_mode} = 'in '.$token->{tag_name};
2797 !!!next-token;
2798 redo B;
2799 } else {
2800 #
2801 }
2802
2803 if ($self->{insertion_mode} eq 'in head noscript') {
2804 ## As if </noscript>
2805 pop @{$self->{open_elements}};
2806 !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
2807
2808 ## Reprocess in the "in head" insertion mode...
2809 ## As if </head>
2810 pop @{$self->{open_elements}};
2811
2812 ## Reprocess in the "after head" insertion mode...
2813 } elsif ($self->{insertion_mode} eq 'in head') {
2814 ## As if </head>
2815 pop @{$self->{open_elements}};
2816
2817 ## Reprocess in the "after head" insertion mode...
2818 }
2819
2820 ## "after head" insertion mode
2821 ## As if <body>
2822 !!!insert-element ('body');
2823 $self->{insertion_mode} = 'in body';
2824 ## reprocess
2825 redo B;
2826 } elsif ($token->{type} eq 'end tag') {
2827 if ($token->{tag_name} eq 'head') {
2828 if ($self->{insertion_mode} eq 'before head') {
2829 ## As if <head>
2830 !!!create-element ($self->{head_element}, 'head');
2831 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2832 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2833
2834 ## Reprocess in the "in head" insertion mode...
2835 pop @{$self->{open_elements}};
2836 $self->{insertion_mode} = 'after head';
2837 !!!next-token;
2838 redo B;
2839 } elsif ($self->{insertion_mode} eq 'in head noscript') {
2840 ## As if </noscript>
2841 pop @{$self->{open_elements}};
2842 !!!parse-error (type => 'in noscript:script');
2843
2844 ## Reprocess in the "in head" insertion mode...
2845 pop @{$self->{open_elements}};
2846 $self->{insertion_mode} = 'after head';
2847 !!!next-token;
2848 redo B;
2849 } elsif ($self->{insertion_mode} eq 'in head') {
2850 pop @{$self->{open_elements}};
2851 $self->{insertion_mode} = 'after head';
2852 !!!next-token;
2853 redo B;
2854 } else {
2855 #
2856 }
2857 } elsif ($token->{tag_name} eq 'noscript') {
2858 if ($self->{insertion_mode} eq 'in head noscript') {
2859 pop @{$self->{open_elements}};
2860 $self->{insertion_mode} = 'in head';
2861 !!!next-token;
2862 redo B;
2863 } elsif ($self->{insertion_mode} eq 'before head') {
2864 !!!parse-error (type => 'unmatched end tag:noscript');
2865 ## Ignore the token ## ISSUE: An issue in the spec.
2866 !!!next-token;
2867 redo B;
2868 } else {
2869 #
2870 }
2871 } elsif ({
2872 body => 1, html => 1,
2873 }->{$token->{tag_name}}) {
2874 if ($self->{insertion_mode} eq 'before head') {
2875 ## As if <head>
2876 !!!create-element ($self->{head_element}, 'head');
2877 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2878 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2879
2880 $self->{insertion_mode} = 'in head';
2881 ## Reprocess in the "in head" insertion mode...
2882 } elsif ($self->{insertion_mode} eq 'in head noscript') {
2883 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
2884 ## Ignore the token
2885 !!!next-token;
2886 redo B;
2887 }
2888
2889 #
2890 } elsif ({
2891 p => 1, br => 1,
2892 }->{$token->{tag_name}}) {
2893 if ($self->{insertion_mode} eq 'before head') {
2894 ## As if <head>
2895 !!!create-element ($self->{head_element}, 'head');
2896 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2897 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2898
2899 $self->{insertion_mode} = 'in head';
2900 ## Reprocess in the "in head" insertion mode...
2901 }
2902
2903 #
2904 } else {
2905 if ($self->{insertion_mode} ne 'after head') {
2906 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
2907 ## Ignore the token
2908 !!!next-token;
2909 redo B;
2910 } else {
2911 #
2912 }
2913 }
2914
2915 if ($self->{insertion_mode} eq 'in head noscript') {
2916 ## As if </noscript>
2917 pop @{$self->{open_elements}};
2918 !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
2919
2920 ## Reprocess in the "in head" insertion mode...
2921 ## As if </head>
2922 pop @{$self->{open_elements}};
2923
2924 ## Reprocess in the "after head" insertion mode...
2925 } elsif ($self->{insertion_mode} eq 'in head') {
2926 ## As if </head>
2927 pop @{$self->{open_elements}};
2928
2929 ## Reprocess in the "after head" insertion mode...
2930 } elsif ($self->{insertion_mode} eq 'before head') {
2931 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
2932 ## Ignore the token ## ISSUE: An issue in the spec.
2933 !!!next-token;
2934 redo B;
2935 }
2936
2937 ## "after head" insertion mode
2938 ## As if <body>
2939 !!!insert-element ('body');
2940 $self->{insertion_mode} = 'in body';
2941 ## reprocess
2942 redo B;
2943 } else {
2944 die "$0: $token->{type}: Unknown token type";
2945 }
2946
2947 ## ISSUE: An issue in the spec.
2948 } elsif ($self->{insertion_mode} eq 'in body' or
2949 $self->{insertion_mode} eq 'in cell' or
2950 $self->{insertion_mode} eq 'in caption') {
2951 if ($token->{type} eq 'character') {
2952 ## NOTE: There is a code clone of "character in body".
2953 $reconstruct_active_formatting_elements->($insert_to_current);
2954
2955 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
2956
2957 !!!next-token;
2958 redo B;
2959 } elsif ($token->{type} eq 'start tag') {
2960 if ({
2961 caption => 1, col => 1, colgroup => 1, tbody => 1,
2962 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
2963 }->{$token->{tag_name}}) {
2964 if ($self->{insertion_mode} eq 'in cell') {
2965 ## have an element in table scope
2966 my $tn;
2967 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2968 my $node = $self->{open_elements}->[$_];
2969 if ($node->[1] eq 'td' or $node->[1] eq 'th') {
2970 $tn = $node->[1];
2971 last INSCOPE;
2972 } elsif ({
2973 table => 1, html => 1,
2974 }->{$node->[1]}) {
2975 last INSCOPE;
2976 }
2977 } # INSCOPE
2978 unless (defined $tn) {
2979 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
2980 ## Ignore the token
2981 !!!next-token;
2982 redo B;
2983 }
2984
2985 ## Close the cell
2986 !!!back-token; # <?>
2987 $token = {type => 'end tag', tag_name => $tn};
2988 redo B;
2989 } elsif ($self->{insertion_mode} eq 'in caption') {
2990 !!!parse-error (type => 'not closed:caption');
2991
2992 ## As if </caption>
2993 ## have a table element in table scope
2994 my $i;
2995 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2996 my $node = $self->{open_elements}->[$_];
2997 if ($node->[1] eq 'caption') {
2998 $i = $_;
2999 last INSCOPE;
3000 } elsif ({
3001 table => 1, html => 1,
3002 }->{$node->[1]}) {
3003 last INSCOPE;
3004 }
3005 } # INSCOPE
3006 unless (defined $i) {
3007 !!!parse-error (type => 'unmatched end tag:caption');
3008 ## Ignore the token
3009 !!!next-token;
3010 redo B;
3011 }
3012
3013 ## generate implied end tags
3014 if ({
3015 dd => 1, dt => 1, li => 1, p => 1,
3016 td => 1, th => 1, tr => 1,
3017 tbody => 1, tfoot=> 1, thead => 1,
3018 }->{$self->{open_elements}->[-1]->[1]}) {
3019 !!!back-token; # <?>
3020 $token = {type => 'end tag', tag_name => 'caption'};
3021 !!!back-token;
3022 $token = {type => 'end tag',
3023 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3024 redo B;
3025 }
3026
3027 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3028 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3029 }
3030
3031 splice @{$self->{open_elements}}, $i;
3032
3033 $clear_up_to_marker->();
3034
3035 $self->{insertion_mode} = 'in table';
3036
3037 ## reprocess
3038 redo B;
3039 } else {
3040 #
3041 }
3042 } else {
3043 #
3044 }
3045 } elsif ($token->{type} eq 'end tag') {
3046 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
3047 if ($self->{insertion_mode} eq 'in cell') {
3048 ## have an element in table scope
3049 my $i;
3050 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3051 my $node = $self->{open_elements}->[$_];
3052 if ($node->[1] eq $token->{tag_name}) {
3053 $i = $_;
3054 last INSCOPE;
3055 } elsif ({
3056 table => 1, html => 1,
3057 }->{$node->[1]}) {
3058 last INSCOPE;
3059 }
3060 } # INSCOPE
3061 unless (defined $i) {
3062 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3063 ## Ignore the token
3064 !!!next-token;
3065 redo B;
3066 }
3067
3068 ## generate implied end tags
3069 if ({
3070 dd => 1, dt => 1, li => 1, p => 1,
3071 td => ($token->{tag_name} eq 'th'),
3072 th => ($token->{tag_name} eq 'td'),
3073 tr => 1,
3074 tbody => 1, tfoot=> 1, thead => 1,
3075 }->{$self->{open_elements}->[-1]->[1]}) {
3076 !!!back-token;
3077 $token = {type => 'end tag',
3078 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3079 redo B;
3080 }
3081
3082 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3083 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3084 }
3085
3086 splice @{$self->{open_elements}}, $i;
3087
3088 $clear_up_to_marker->();
3089
3090 $self->{insertion_mode} = 'in row';
3091
3092 !!!next-token;
3093 redo B;
3094 } elsif ($self->{insertion_mode} eq 'in caption') {
3095 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3096 ## Ignore the token
3097 !!!next-token;
3098 redo B;
3099 } else {
3100 #
3101 }
3102 } elsif ($token->{tag_name} eq 'caption') {
3103 if ($self->{insertion_mode} eq 'in caption') {
3104 ## have a table element in table scope
3105 my $i;
3106 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3107 my $node = $self->{open_elements}->[$_];
3108 if ($node->[1] eq $token->{tag_name}) {
3109 $i = $_;
3110 last INSCOPE;
3111 } elsif ({
3112 table => 1, html => 1,
3113 }->{$node->[1]}) {
3114 last INSCOPE;
3115 }
3116 } # INSCOPE
3117 unless (defined $i) {
3118 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3119 ## Ignore the token
3120 !!!next-token;
3121 redo B;
3122 }
3123
3124 ## generate implied end tags
3125 if ({
3126 dd => 1, dt => 1, li => 1, p => 1,
3127 td => 1, th => 1, tr => 1,
3128 tbody => 1, tfoot=> 1, thead => 1,
3129 }->{$self->{open_elements}->[-1]->[1]}) {
3130 !!!back-token;
3131 $token = {type => 'end tag',
3132 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3133 redo B;
3134 }
3135
3136 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3137 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3138 }
3139
3140 splice @{$self->{open_elements}}, $i;
3141
3142 $clear_up_to_marker->();
3143
3144 $self->{insertion_mode} = 'in table';
3145
3146 !!!next-token;
3147 redo B;
3148 } elsif ($self->{insertion_mode} eq 'in cell') {
3149 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3150 ## Ignore the token
3151 !!!next-token;
3152 redo B;
3153 } else {
3154 #
3155 }
3156 } elsif ({
3157 table => 1, tbody => 1, tfoot => 1,
3158 thead => 1, tr => 1,
3159 }->{$token->{tag_name}} and
3160 $self->{insertion_mode} eq 'in cell') {
3161 ## have an element in table scope
3162 my $i;
3163 my $tn;
3164 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3165 my $node = $self->{open_elements}->[$_];
3166 if ($node->[1] eq $token->{tag_name}) {
3167 $i = $_;
3168 last INSCOPE;
3169 } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
3170 $tn = $node->[1];
3171 ## NOTE: There is exactly one |td| or |th| element
3172 ## in scope in the stack of open elements by definition.
3173 } elsif ({
3174 table => 1, html => 1,
3175 }->{$node->[1]}) {
3176 last INSCOPE;
3177 }
3178 } # INSCOPE
3179 unless (defined $i) {
3180 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3181 ## Ignore the token
3182 !!!next-token;
3183 redo B;
3184 }
3185
3186 ## Close the cell
3187 !!!back-token; # </?>
3188 $token = {type => 'end tag', tag_name => $tn};
3189 redo B;
3190 } elsif ($token->{tag_name} eq 'table' and
3191 $self->{insertion_mode} eq 'in caption') {
3192 !!!parse-error (type => 'not closed:caption');
3193
3194 ## As if </caption>
3195 ## have a table element in table scope
3196 my $i;
3197 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3198 my $node = $self->{open_elements}->[$_];
3199 if ($node->[1] eq 'caption') {
3200 $i = $_;
3201 last INSCOPE;
3202 } elsif ({
3203 table => 1, html => 1,
3204 }->{$node->[1]}) {
3205 last INSCOPE;
3206 }
3207 } # INSCOPE
3208 unless (defined $i) {
3209 !!!parse-error (type => 'unmatched end tag:caption');
3210 ## Ignore the token
3211 !!!next-token;
3212 redo B;
3213 }
3214
3215 ## generate implied end tags
3216 if ({
3217 dd => 1, dt => 1, li => 1, p => 1,
3218 td => 1, th => 1, tr => 1,
3219 tbody => 1, tfoot=> 1, thead => 1,
3220 }->{$self->{open_elements}->[-1]->[1]}) {
3221 !!!back-token; # </table>
3222 $token = {type => 'end tag', tag_name => 'caption'};
3223 !!!back-token;
3224 $token = {type => 'end tag',
3225 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3226 redo B;
3227 }
3228
3229 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3230 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3231 }
3232
3233 splice @{$self->{open_elements}}, $i;
3234
3235 $clear_up_to_marker->();
3236
3237 $self->{insertion_mode} = 'in table';
3238
3239 ## reprocess
3240 redo B;
3241 } elsif ({
3242 body => 1, col => 1, colgroup => 1, html => 1,
3243 }->{$token->{tag_name}}) {
3244 if ($self->{insertion_mode} eq 'in cell' or
3245 $self->{insertion_mode} eq 'in caption') {
3246 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3247 ## Ignore the token
3248 !!!next-token;
3249 redo B;
3250 } else {
3251 #
3252 }
3253 } elsif ({
3254 tbody => 1, tfoot => 1,
3255 thead => 1, tr => 1,
3256 }->{$token->{tag_name}} and
3257 $self->{insertion_mode} eq 'in caption') {
3258 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3259 ## Ignore the token
3260 !!!next-token;
3261 redo B;
3262 } else {
3263 #
3264 }
3265 } else {
3266 die "$0: $token->{type}: Unknown token type";
3267 }
3268
3269 $insert = $insert_to_current;
3270 #
3271 } elsif ($self->{insertion_mode} eq 'in row' or
3272 $self->{insertion_mode} eq 'in table body' or
3273 $self->{insertion_mode} eq 'in table') {
3274 if ($token->{type} eq 'character') {
3275 ## NOTE: There are "character in table" code clones.
3276 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3277 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3278
3279 unless (length $token->{data}) {
3280 !!!next-token;
3281 redo B;
3282 }
3283 }
3284
3285 !!!parse-error (type => 'in table:#character');
3286
3287 ## As if in body, but insert into foster parent element
3288 ## ISSUE: Spec says that "whenever a node would be inserted
3289 ## into the current node" while characters might not
3290 ## result in a new Text node.
3291 $reconstruct_active_formatting_elements->($insert_to_foster);
3292
3293 if ({
3294 table => 1, tbody => 1, tfoot => 1,
3295 thead => 1, tr => 1,
3296 }->{$self->{open_elements}->[-1]->[1]}) {
3297 # MUST
3298 my $foster_parent_element;
3299 my $next_sibling;
3300 my $prev_sibling;
3301 OE: for (reverse 0..$#{$self->{open_elements}}) {
3302 if ($self->{open_elements}->[$_]->[1] eq 'table') {
3303 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3304 if (defined $parent and $parent->node_type == 1) {
3305 $foster_parent_element = $parent;
3306 $next_sibling = $self->{open_elements}->[$_]->[0];
3307 $prev_sibling = $next_sibling->previous_sibling;
3308 } else {
3309 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
3310 $prev_sibling = $foster_parent_element->last_child;
3311 }
3312 last OE;
3313 }
3314 } # OE
3315 $foster_parent_element = $self->{open_elements}->[0]->[0] and
3316 $prev_sibling = $foster_parent_element->last_child
3317 unless defined $foster_parent_element;
3318 if (defined $prev_sibling and
3319 $prev_sibling->node_type == 3) {
3320 $prev_sibling->manakai_append_text ($token->{data});
3321 } else {
3322 $foster_parent_element->insert_before
3323 ($self->{document}->create_text_node ($token->{data}),
3324 $next_sibling);
3325 }
3326 } else {
3327 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3328 }
3329
3330 !!!next-token;
3331 redo B;
3332 } elsif ($token->{type} eq 'start tag') {
3333 if ({
3334 tr => ($self->{insertion_mode} ne 'in row'),
3335 th => 1, td => 1,
3336 }->{$token->{tag_name}}) {
3337 if ($self->{insertion_mode} eq 'in table') {
3338 ## Clear back to table context
3339 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3340 $self->{open_elements}->[-1]->[1] ne 'html') {
3341 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3342 pop @{$self->{open_elements}};
3343 }
3344
3345 !!!insert-element ('tbody');
3346 $self->{insertion_mode} = 'in table body';
3347 ## reprocess in the "in table body" insertion mode...
3348 }
3349
3350 if ($self->{insertion_mode} eq 'in table body') {
3351 unless ($token->{tag_name} eq 'tr') {
3352 !!!parse-error (type => 'missing start tag:tr');
3353 }
3354
3355 ## Clear back to table body context
3356 while (not {
3357 tbody => 1, tfoot => 1, thead => 1, html => 1,
3358 }->{$self->{open_elements}->[-1]->[1]}) {
3359 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3360 pop @{$self->{open_elements}};
3361 }
3362
3363 $self->{insertion_mode} = 'in row';
3364 if ($token->{tag_name} eq 'tr') {
3365 !!!insert-element ($token->{tag_name}, $token->{attributes});
3366 !!!next-token;
3367 redo B;
3368 } else {
3369 !!!insert-element ('tr');
3370 ## reprocess in the "in row" insertion mode
3371 }
3372 }
3373
3374 ## Clear back to table row context
3375 while (not {
3376 tr => 1, html => 1,
3377 }->{$self->{open_elements}->[-1]->[1]}) {
3378 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3379 pop @{$self->{open_elements}};
3380 }
3381
3382 !!!insert-element ($token->{tag_name}, $token->{attributes});
3383 $self->{insertion_mode} = 'in cell';
3384
3385 push @$active_formatting_elements, ['#marker', ''];
3386
3387 !!!next-token;
3388 redo B;
3389 } elsif ({
3390 caption => 1, col => 1, colgroup => 1,
3391 tbody => 1, tfoot => 1, thead => 1,
3392 tr => 1, # $self->{insertion_mode} eq 'in row'
3393 }->{$token->{tag_name}}) {
3394 if ($self->{insertion_mode} eq 'in row') {
3395 ## As if </tr>
3396 ## have an element in table scope
3397 my $i;
3398 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3399 my $node = $self->{open_elements}->[$_];
3400 if ($node->[1] eq 'tr') {
3401 $i = $_;
3402 last INSCOPE;
3403 } elsif ({
3404 table => 1, html => 1,
3405 }->{$node->[1]}) {
3406 last INSCOPE;
3407 }
3408 } # INSCOPE
3409 unless (defined $i) {
3410 !!!parse-error (type => 'unmacthed end tag:'.$token->{tag_name});
3411 ## Ignore the token
3412 !!!next-token;
3413 redo B;
3414 }
3415
3416 ## Clear back to table row context
3417 while (not {
3418 tr => 1, html => 1,
3419 }->{$self->{open_elements}->[-1]->[1]}) {
3420 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3421 pop @{$self->{open_elements}};
3422 }
3423
3424 pop @{$self->{open_elements}}; # tr
3425 $self->{insertion_mode} = 'in table body';
3426 if ($token->{tag_name} eq 'tr') {
3427 ## reprocess
3428 redo B;
3429 } else {
3430 ## reprocess in the "in table body" insertion mode...
3431 }
3432 }
3433
3434 if ($self->{insertion_mode} eq 'in table body') {
3435 ## have an element in table scope
3436 my $i;
3437 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3438 my $node = $self->{open_elements}->[$_];
3439 if ({
3440 tbody => 1, thead => 1, tfoot => 1,
3441 }->{$node->[1]}) {
3442 $i = $_;
3443 last INSCOPE;
3444 } elsif ({
3445 table => 1, html => 1,
3446 }->{$node->[1]}) {
3447 last INSCOPE;
3448 }
3449 } # INSCOPE
3450 unless (defined $i) {
3451 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3452 ## Ignore the token
3453 !!!next-token;
3454 redo B;
3455 }
3456
3457 ## Clear back to table body context
3458 while (not {
3459 tbody => 1, tfoot => 1, thead => 1, html => 1,
3460 }->{$self->{open_elements}->[-1]->[1]}) {
3461 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3462 pop @{$self->{open_elements}};
3463 }
3464
3465 ## As if </{current node}>
3466 ## have an element in table scope
3467 ## true by definition
3468
3469 ## Clear back to table body context
3470 ## nop by definition
3471
3472 pop @{$self->{open_elements}};
3473 $self->{insertion_mode} = 'in table';
3474 ## reprocess in "in table" insertion mode...
3475 }
3476
3477 if ($token->{tag_name} eq 'col') {
3478 ## Clear back to table context
3479 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3480 $self->{open_elements}->[-1]->[1] ne 'html') {
3481 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3482 pop @{$self->{open_elements}};
3483 }
3484
3485 !!!insert-element ('colgroup');
3486 $self->{insertion_mode} = 'in column group';
3487 ## reprocess
3488 redo B;
3489 } elsif ({
3490 caption => 1,
3491 colgroup => 1,
3492 tbody => 1, tfoot => 1, thead => 1,
3493 }->{$token->{tag_name}}) {
3494 ## Clear back to table context
3495 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3496 $self->{open_elements}->[-1]->[1] ne 'html') {
3497 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3498 pop @{$self->{open_elements}};
3499 }
3500
3501 push @$active_formatting_elements, ['#marker', '']
3502 if $token->{tag_name} eq 'caption';
3503
3504 !!!insert-element ($token->{tag_name}, $token->{attributes});
3505 $self->{insertion_mode} = {
3506 caption => 'in caption',
3507 colgroup => 'in column group',
3508 tbody => 'in table body',
3509 tfoot => 'in table body',
3510 thead => 'in table body',
3511 }->{$token->{tag_name}};
3512 !!!next-token;
3513 redo B;
3514 } else {
3515 die "$0: in table: <>: $token->{tag_name}";
3516 }
3517 } elsif ($token->{tag_name} eq 'table') {
3518 ## NOTE: There are code clones for this "table in table"
3519 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3520
3521 ## As if </table>
3522 ## have a table element in table scope
3523 my $i;
3524 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3525 my $node = $self->{open_elements}->[$_];
3526 if ($node->[1] eq 'table') {
3527 $i = $_;
3528 last INSCOPE;
3529 } elsif ({
3530 table => 1, html => 1,
3531 }->{$node->[1]}) {
3532 last INSCOPE;
3533 }
3534 } # INSCOPE
3535 unless (defined $i) {
3536 !!!parse-error (type => 'unmatched end tag:table');
3537 ## Ignore tokens </table><table>
3538 !!!next-token;
3539 redo B;
3540 }
3541
3542 ## generate implied end tags
3543 if ({
3544 dd => 1, dt => 1, li => 1, p => 1,
3545 td => 1, th => 1, tr => 1,
3546 tbody => 1, tfoot=> 1, thead => 1,
3547 }->{$self->{open_elements}->[-1]->[1]}) {
3548 !!!back-token; # <table>
3549 $token = {type => 'end tag', tag_name => 'table'};
3550 !!!back-token;
3551 $token = {type => 'end tag',
3552 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3553 redo B;
3554 }
3555
3556 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3557 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3558 }
3559
3560 splice @{$self->{open_elements}}, $i;
3561
3562 $self->_reset_insertion_mode;
3563
3564 ## reprocess
3565 redo B;
3566 } else {
3567 #
3568 }
3569 } elsif ($token->{type} eq 'end tag') {
3570 if ($token->{tag_name} eq 'tr' and
3571 $self->{insertion_mode} eq 'in row') {
3572 ## have an element in table scope
3573 my $i;
3574 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3575 my $node = $self->{open_elements}->[$_];
3576 if ($node->[1] eq $token->{tag_name}) {
3577 $i = $_;
3578 last INSCOPE;
3579 } elsif ({
3580 table => 1, html => 1,
3581 }->{$node->[1]}) {
3582 last INSCOPE;
3583 }
3584 } # INSCOPE
3585 unless (defined $i) {
3586 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3587 ## Ignore the token
3588 !!!next-token;
3589 redo B;
3590 }
3591
3592 ## Clear back to table row context
3593 while (not {
3594 tr => 1, html => 1,
3595 }->{$self->{open_elements}->[-1]->[1]}) {
3596 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3597 pop @{$self->{open_elements}};
3598 }
3599
3600 pop @{$self->{open_elements}}; # tr
3601 $self->{insertion_mode} = 'in table body';
3602 !!!next-token;
3603 redo B;
3604 } elsif ($token->{tag_name} eq 'table') {
3605 if ($self->{insertion_mode} eq 'in row') {
3606 ## As if </tr>
3607 ## have an element in table scope
3608 my $i;
3609 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3610 my $node = $self->{open_elements}->[$_];
3611 if ($node->[1] eq 'tr') {
3612 $i = $_;
3613 last INSCOPE;
3614 } elsif ({
3615 table => 1, html => 1,
3616 }->{$node->[1]}) {
3617 last INSCOPE;
3618 }
3619 } # INSCOPE
3620 unless (defined $i) {
3621 !!!parse-error (type => 'unmatched end tag:'.$token->{type});
3622 ## Ignore the token
3623 !!!next-token;
3624 redo B;
3625 }
3626
3627 ## Clear back to table row context
3628 while (not {
3629 tr => 1, html => 1,
3630 }->{$self->{open_elements}->[-1]->[1]}) {
3631 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3632 pop @{$self->{open_elements}};
3633 }
3634
3635 pop @{$self->{open_elements}}; # tr
3636 $self->{insertion_mode} = 'in table body';
3637 ## reprocess in the "in table body" insertion mode...
3638 }
3639
3640 if ($self->{insertion_mode} eq 'in table body') {
3641 ## have an element in table scope
3642 my $i;
3643 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3644 my $node = $self->{open_elements}->[$_];
3645 if ({
3646 tbody => 1, thead => 1, tfoot => 1,
3647 }->{$node->[1]}) {
3648 $i = $_;
3649 last INSCOPE;
3650 } elsif ({
3651 table => 1, html => 1,
3652 }->{$node->[1]}) {
3653 last INSCOPE;
3654 }
3655 } # INSCOPE
3656 unless (defined $i) {
3657 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3658 ## Ignore the token
3659 !!!next-token;
3660 redo B;
3661 }
3662
3663 ## Clear back to table body context
3664 while (not {
3665 tbody => 1, tfoot => 1, thead => 1, html => 1,
3666 }->{$self->{open_elements}->[-1]->[1]}) {
3667 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3668 pop @{$self->{open_elements}};
3669 }
3670
3671 ## As if </{current node}>
3672 ## have an element in table scope
3673 ## true by definition
3674
3675 ## Clear back to table body context
3676 ## nop by definition
3677
3678 pop @{$self->{open_elements}};
3679 $self->{insertion_mode} = 'in table';
3680 ## reprocess in the "in table" insertion mode...
3681 }
3682
3683 ## have a table element in table scope
3684 my $i;
3685 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3686 my $node = $self->{open_elements}->[$_];
3687 if ($node->[1] eq $token->{tag_name}) {
3688 $i = $_;
3689 last INSCOPE;
3690 } elsif ({
3691 table => 1, html => 1,
3692 }->{$node->[1]}) {
3693 last INSCOPE;
3694 }
3695 } # INSCOPE
3696 unless (defined $i) {
3697 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3698 ## Ignore the token
3699 !!!next-token;
3700 redo B;
3701 }
3702
3703 ## generate implied end tags
3704 if ({
3705 dd => 1, dt => 1, li => 1, p => 1,
3706 td => 1, th => 1, tr => 1,
3707 tbody => 1, tfoot=> 1, thead => 1,
3708 }->{$self->{open_elements}->[-1]->[1]}) {
3709 !!!back-token;
3710 $token = {type => 'end tag',
3711 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3712 redo B;
3713 }
3714
3715 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3716 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3717 }
3718
3719 splice @{$self->{open_elements}}, $i;
3720
3721 $self->_reset_insertion_mode;
3722
3723 !!!next-token;
3724 redo B;
3725 } elsif ({
3726 tbody => 1, tfoot => 1, thead => 1,
3727 }->{$token->{tag_name}} and
3728 ($self->{insertion_mode} eq 'in row' or
3729 $self->{insertion_mode} eq 'in table body')) {
3730 if ($self->{insertion_mode} eq 'in row') {
3731 ## have an element in table scope
3732 my $i;
3733 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3734 my $node = $self->{open_elements}->[$_];
3735 if ($node->[1] eq $token->{tag_name}) {
3736 $i = $_;
3737 last INSCOPE;
3738 } elsif ({
3739 table => 1, html => 1,
3740 }->{$node->[1]}) {
3741 last INSCOPE;
3742 }
3743 } # INSCOPE
3744 unless (defined $i) {
3745 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3746 ## Ignore the token
3747 !!!next-token;
3748 redo B;
3749 }
3750
3751 ## As if </tr>
3752 ## have an element in table scope
3753 my $i;
3754 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3755 my $node = $self->{open_elements}->[$_];
3756 if ($node->[1] eq 'tr') {
3757 $i = $_;
3758 last INSCOPE;
3759 } elsif ({
3760 table => 1, html => 1,
3761 }->{$node->[1]}) {
3762 last INSCOPE;
3763 }
3764 } # INSCOPE
3765 unless (defined $i) {
3766 !!!parse-error (type => 'unmatched end tag:tr');
3767 ## Ignore the token
3768 !!!next-token;
3769 redo B;
3770 }
3771
3772 ## Clear back to table row context
3773 while (not {
3774 tr => 1, html => 1,
3775 }->{$self->{open_elements}->[-1]->[1]}) {
3776 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3777 pop @{$self->{open_elements}};
3778 }
3779
3780 pop @{$self->{open_elements}}; # tr
3781 $self->{insertion_mode} = 'in table body';
3782 ## reprocess in the "in table body" insertion mode...
3783 }
3784
3785 ## have an element in table scope
3786 my $i;
3787 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3788 my $node = $self->{open_elements}->[$_];
3789 if ($node->[1] eq $token->{tag_name}) {
3790 $i = $_;
3791 last INSCOPE;
3792 } elsif ({
3793 table => 1, html => 1,
3794 }->{$node->[1]}) {
3795 last INSCOPE;
3796 }
3797 } # INSCOPE
3798 unless (defined $i) {
3799 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3800 ## Ignore the token
3801 !!!next-token;
3802 redo B;
3803 }
3804
3805 ## Clear back to table body context
3806 while (not {
3807 tbody => 1, tfoot => 1, thead => 1, html => 1,
3808 }->{$self->{open_elements}->[-1]->[1]}) {
3809 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3810 pop @{$self->{open_elements}};
3811 }
3812
3813 pop @{$self->{open_elements}};
3814 $self->{insertion_mode} = 'in table';
3815 !!!next-token;
3816 redo B;
3817 } elsif ({
3818 body => 1, caption => 1, col => 1, colgroup => 1,
3819 html => 1, td => 1, th => 1,
3820 tr => 1, # $self->{insertion_mode} eq 'in row'
3821 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} eq 'in table'
3822 }->{$token->{tag_name}}) {
3823 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3824 ## Ignore the token
3825 !!!next-token;
3826 redo B;
3827 } else {
3828 #
3829 }
3830 } else {
3831 die "$0: $token->{type}: Unknown token type";
3832 }
3833
3834 !!!parse-error (type => 'in table:'.$token->{tag_name});
3835
3836 $insert = $insert_to_foster;
3837 #
3838 } elsif ($self->{insertion_mode} eq 'in column group') {
3839 if ($token->{type} eq 'character') {
3840 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3841 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3842 unless (length $token->{data}) {
3843 !!!next-token;
3844 redo B;
3845 }
3846 }
3847
3848 #
3849 } elsif ($token->{type} eq 'start tag') {
3850 if ($token->{tag_name} eq 'col') {
3851 !!!insert-element ($token->{tag_name}, $token->{attributes});
3852 pop @{$self->{open_elements}};
3853 !!!next-token;
3854 redo B;
3855 } else {
3856 #
3857 }
3858 } elsif ($token->{type} eq 'end tag') {
3859 if ($token->{tag_name} eq 'colgroup') {
3860 if ($self->{open_elements}->[-1]->[1] eq 'html') {
3861 !!!parse-error (type => 'unmatched end tag:colgroup');
3862 ## Ignore the token
3863 !!!next-token;
3864 redo B;
3865 } else {
3866 pop @{$self->{open_elements}}; # colgroup
3867 $self->{insertion_mode} = 'in table';
3868 !!!next-token;
3869 redo B;
3870 }
3871 } elsif ($token->{tag_name} eq 'col') {
3872 !!!parse-error (type => 'unmatched end tag:col');
3873 ## Ignore the token
3874 !!!next-token;
3875 redo B;
3876 } else {
3877 #
3878 }
3879 } else {
3880 #
3881 }
3882
3883 ## As if </colgroup>
3884 if ($self->{open_elements}->[-1]->[1] eq 'html') {
3885 !!!parse-error (type => 'unmatched end tag:colgroup');
3886 ## Ignore the token
3887 !!!next-token;
3888 redo B;
3889 } else {
3890 pop @{$self->{open_elements}}; # colgroup
3891 $self->{insertion_mode} = 'in table';
3892 ## reprocess
3893 redo B;
3894 }
3895 } elsif ($self->{insertion_mode} eq 'in select') {
3896 if ($token->{type} eq 'character') {
3897 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3898 !!!next-token;
3899 redo B;
3900 } elsif ($token->{type} eq 'start tag') {
3901 if ($token->{tag_name} eq 'option') {
3902 if ($self->{open_elements}->[-1]->[1] eq 'option') {
3903 ## As if </option>
3904 pop @{$self->{open_elements}};
3905 }
3906
3907 !!!insert-element ($token->{tag_name}, $token->{attributes});
3908 !!!next-token;
3909 redo B;
3910 } elsif ($token->{tag_name} eq 'optgroup') {
3911 if ($self->{open_elements}->[-1]->[1] eq 'option') {
3912 ## As if </option>
3913 pop @{$self->{open_elements}};
3914 }
3915
3916 if ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
3917 ## As if </optgroup>
3918 pop @{$self->{open_elements}};
3919 }
3920
3921 !!!insert-element ($token->{tag_name}, $token->{attributes});
3922 !!!next-token;
3923 redo B;
3924 } elsif ($token->{tag_name} eq 'select') {
3925 !!!parse-error (type => 'not closed:select');
3926 ## As if </select> instead
3927 ## have an element in table scope
3928 my $i;
3929 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3930 my $node = $self->{open_elements}->[$_];
3931 if ($node->[1] eq $token->{tag_name}) {
3932 $i = $_;
3933 last INSCOPE;
3934 } elsif ({
3935 table => 1, html => 1,
3936 }->{$node->[1]}) {
3937 last INSCOPE;
3938 }
3939 } # INSCOPE
3940 unless (defined $i) {
3941 !!!parse-error (type => 'unmatched end tag:select');
3942 ## Ignore the token
3943 !!!next-token;
3944 redo B;
3945 }
3946
3947 splice @{$self->{open_elements}}, $i;
3948
3949 $self->_reset_insertion_mode;
3950
3951 !!!next-token;
3952 redo B;
3953 } else {
3954 #
3955 }
3956 } elsif ($token->{type} eq 'end tag') {
3957 if ($token->{tag_name} eq 'optgroup') {
3958 if ($self->{open_elements}->[-1]->[1] eq 'option' and
3959 $self->{open_elements}->[-2]->[1] eq 'optgroup') {
3960 ## As if </option>
3961 splice @{$self->{open_elements}}, -2;
3962 } elsif ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
3963 pop @{$self->{open_elements}};
3964 } else {
3965 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3966 ## Ignore the token
3967 }
3968 !!!next-token;
3969 redo B;
3970 } elsif ($token->{tag_name} eq 'option') {
3971 if ($self->{open_elements}->[-1]->[1] eq 'option') {
3972 pop @{$self->{open_elements}};
3973 } else {
3974 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3975 ## Ignore the token
3976 }
3977 !!!next-token;
3978 redo B;
3979 } elsif ($token->{tag_name} eq 'select') {
3980 ## have an element in table scope
3981 my $i;
3982 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3983 my $node = $self->{open_elements}->[$_];
3984 if ($node->[1] eq $token->{tag_name}) {
3985 $i = $_;
3986 last INSCOPE;
3987 } elsif ({
3988 table => 1, html => 1,
3989 }->{$node->[1]}) {
3990 last INSCOPE;
3991 }
3992 } # INSCOPE
3993 unless (defined $i) {
3994 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3995 ## Ignore the token
3996 !!!next-token;
3997 redo B;
3998 }
3999
4000 splice @{$self->{open_elements}}, $i;
4001
4002 $self->_reset_insertion_mode;
4003
4004 !!!next-token;
4005 redo B;
4006 } elsif ({
4007 caption => 1, table => 1, tbody => 1,
4008 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
4009 }->{$token->{tag_name}}) {
4010 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4011
4012 ## have an element in table scope
4013 my $i;
4014 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4015 my $node = $self->{open_elements}->[$_];
4016 if ($node->[1] eq $token->{tag_name}) {
4017 $i = $_;
4018 last INSCOPE;
4019 } elsif ({
4020 table => 1, html => 1,
4021 }->{$node->[1]}) {
4022 last INSCOPE;
4023 }
4024 } # INSCOPE
4025 unless (defined $i) {
4026 ## Ignore the token
4027 !!!next-token;
4028 redo B;
4029 }
4030
4031 ## As if </select>
4032 ## have an element in table scope
4033 undef $i;
4034 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4035 my $node = $self->{open_elements}->[$_];
4036 if ($node->[1] eq 'select') {
4037 $i = $_;
4038 last INSCOPE;
4039 } elsif ({
4040 table => 1, html => 1,
4041 }->{$node->[1]}) {
4042 last INSCOPE;
4043 }
4044 } # INSCOPE
4045 unless (defined $i) {
4046 !!!parse-error (type => 'unmatched end tag:select');
4047 ## Ignore the </select> token
4048 !!!next-token; ## TODO: ok?
4049 redo B;
4050 }
4051
4052 splice @{$self->{open_elements}}, $i;
4053
4054 $self->_reset_insertion_mode;
4055
4056 ## reprocess
4057 redo B;
4058 } else {
4059 #
4060 }
4061 } else {
4062 #
4063 }
4064
4065 !!!parse-error (type => 'in select:'.$token->{tag_name});
4066 ## Ignore the token
4067 !!!next-token;
4068 redo B;
4069 } elsif ($self->{insertion_mode} eq 'after body' or
4070 $self->{insertion_mode} eq 'after html body') {
4071 if ($token->{type} eq 'character') {
4072 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { ## Leading space characters are stripped off the token
4073 my $data = $1; ## Saved copy of the stripped space characters
4074 ## As if in body
4075 $reconstruct_active_formatting_elements->($insert_to_current);
4076
4077 $self->{open_elements}->[-1]->[0]->manakai_append_text ($data); ## Use the saved copy, not |$1|, after the call above
4078
4079 unless (length $token->{data}) { ## Nothing but space characters: token fully consumed
4080 !!!next-token;
4081 redo B;
4082 }
4083 }
4084
4085 if ($self->{insertion_mode} eq 'after html body') {
4086 !!!parse-error (type => 'after html:#character');
4087
4088 ## Reprocess in the "main" phase, "after body" insertion mode...
4089 }
4090
4091 ## "after body" insertion mode
4092 !!!parse-error (type => 'after body:#character');
4093
4094 $self->{insertion_mode} = 'in body';
4095 ## reprocess
4096 redo B;
4097 } elsif ($token->{type} eq 'start tag') {
4098 if ($self->{insertion_mode} eq 'after html body') {
4099 !!!parse-error (type => 'after html:'.$token->{tag_name});
4100
4101 ## Reprocess in the "main" phase, "after body" insertion mode...
4102 }
4103
4104 ## "after body" insertion mode
4105 !!!parse-error (type => 'after body:'.$token->{tag_name});
4106
4107 $self->{insertion_mode} = 'in body';
4108 ## reprocess
4109 redo B;
4110 } elsif ($token->{type} eq 'end tag') {
4111 if ($self->{insertion_mode} eq 'after html body') {
4112 !!!parse-error (type => 'after html:/'.$token->{tag_name});
4113
4114 $self->{insertion_mode} = 'after body';
4115 ## Reprocess in the "main" phase, "after body" insertion mode...
4116 }
4117
4118 ## "after body" insertion mode
4119 if ($token->{tag_name} eq 'html') {
4120 if (defined $self->{inner_html_node}) {
4121 !!!parse-error (type => 'unmatched end tag:html');
4122 ## Ignore the token
4123 !!!next-token;
4124 redo B;
4125 } else {
4126 $self->{insertion_mode} = 'after html body';
4127 !!!next-token;
4128 redo B;
4129 }
4130 } else {
4131 !!!parse-error (type => 'after body:/'.$token->{tag_name});
4132
4133 $self->{insertion_mode} = 'in body';
4134 ## reprocess
4135 redo B;
4136 }
4137 } else {
4138 die "$0: $token->{type}: Unknown token type";
4139 }
4140 } elsif ($self->{insertion_mode} eq 'in frameset' or
4141 $self->{insertion_mode} eq 'after frameset' or
4142 $self->{insertion_mode} eq 'after html frameset') {
4143 if ($token->{type} eq 'character') {
4144 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4145 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4146
4147 unless (length $token->{data}) {
4148 !!!next-token;
4149 redo B;
4150 }
4151 }
4152
4153 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
4154 if ($self->{insertion_mode} eq 'in frameset') {
4155 !!!parse-error (type => 'in frameset:#character');
4156 } elsif ($self->{insertion_mode} eq 'after frameset') {
4157 !!!parse-error (type => 'after frameset:#character');
4158 } else { # "after html frameset"
4159 !!!parse-error (type => 'after html:#character');
4160
4161 $self->{insertion_mode} = 'after frameset';
4162 ## Reprocess in the "main" phase, "after frameset"...
4163 !!!parse-error (type => 'after frameset:#character');
4164 }
4165
4166 ## Ignore the token.
4167 if (length $token->{data}) {
4168 ## reprocess the rest of characters
4169 } else {
4170 !!!next-token;
4171 }
4172 redo B;
4173 }
4174
4175 die qq[$0: Character "$token->{data}"];
4176 } elsif ($token->{type} eq 'start tag') {
4177 if ($self->{insertion_mode} eq 'after html frameset') {
4178 !!!parse-error (type => 'after html:'.$token->{tag_name});
4179
4180 $self->{insertion_mode} = 'after frameset';
4181 ## Process in the "main" phase, "after frameset" insertion mode...
4182 }
4183
4184 if ($token->{tag_name} eq 'frameset' and
4185 $self->{insertion_mode} eq 'in frameset') {
4186 !!!insert-element ($token->{tag_name}, $token->{attributes});
4187 !!!next-token;
4188 redo B;
4189 } elsif ($token->{tag_name} eq 'frame' and
4190 $self->{insertion_mode} eq 'in frameset') {
4191 !!!insert-element ($token->{tag_name}, $token->{attributes});
4192 pop @{$self->{open_elements}};
4193 !!!next-token;
4194 redo B;
4195 } elsif ($token->{tag_name} eq 'noframes') {
4196 ## NOTE: As if in body.
4197 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
4198 redo B;
4199 } else {
4200 if ($self->{insertion_mode} eq 'in frameset') {
4201 !!!parse-error (type => 'in frameset:'.$token->{tag_name});
4202 } else {
4203 !!!parse-error (type => 'after frameset:'.$token->{tag_name});
4204 }
4205 ## Ignore the token
4206 !!!next-token;
4207 redo B;
4208 }
4209 } elsif ($token->{type} eq 'end tag') {
4210 if ($self->{insertion_mode} eq 'after html frameset') {
4211 !!!parse-error (type => 'after html:/'.$token->{tag_name});
4212
4213 $self->{insertion_mode} = 'after frameset';
4214 ## Process in the "main" phase, "after frameset" insertion mode...
4215 }
4216
4217 if ($token->{tag_name} eq 'frameset' and
4218 $self->{insertion_mode} eq 'in frameset') {
4219 if ($self->{open_elements}->[-1]->[1] eq 'html' and
4220 @{$self->{open_elements}} == 1) {
4221 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4222 ## Ignore the token
4223 !!!next-token;
4224 } else {
4225 pop @{$self->{open_elements}};
4226 !!!next-token;
4227 }
4228
4229 if (not defined $self->{inner_html_node} and
4230 $self->{open_elements}->[-1]->[1] ne 'frameset') {
4231 $self->{insertion_mode} = 'after frameset';
4232 }
4233 redo B;
4234 } elsif ($token->{tag_name} eq 'html' and
4235 $self->{insertion_mode} eq 'after frameset') {
4236 $self->{insertion_mode} = 'after html frameset';
4237 !!!next-token;
4238 redo B;
4239 } else {
4240 if ($self->{insertion_mode} eq 'in frameset') {
4241 !!!parse-error (type => 'in frameset:/'.$token->{tag_name});
4242 } else {
4243 !!!parse-error (type => 'after frameset:/'.$token->{tag_name});
4244 }
4245 ## Ignore the token
4246 !!!next-token;
4247 redo B;
4248 }
4249 } else {
4250 die "$0: $token->{type}: Unknown token type";
4251 }
4252
4253 ## ISSUE: An issue in spec here
4254 } else {
4255 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4256 }
4257
4258 ## "in body" insertion mode
4259 if ($token->{type} eq 'start tag') {
4260 if ($token->{tag_name} eq 'script') {
4261 ## NOTE: This is an "as if in head" code clone
4262 $script_start_tag->($insert);
4263 redo B;
4264 } elsif ($token->{tag_name} eq 'style') {
4265 ## NOTE: This is an "as if in head" code clone
4266 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4267 redo B;
4268 } elsif ({
4269 base => 1, link => 1,
4270 }->{$token->{tag_name}}) {
4271 ## NOTE: This is an "as if in head" code clone, only "-t" differs
4272 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4273 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4274 !!!next-token;
4275 redo B;
4276 } elsif ($token->{tag_name} eq 'meta') {
4277 ## NOTE: This is an "as if in head" code clone, only "-t" differs
4278 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4279 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4280
4281 unless ($self->{confident}) {
4282 my $charset;
4283 if ($token->{attributes}->{charset}) { ## TODO: And if supported
4284 $charset = $token->{attributes}->{charset}->{value};
4285 }
4286 if ($token->{attributes}->{'http-equiv'}) {
4287 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
4288 if ($token->{attributes}->{'http-equiv'}->{value}
4289 =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
4290 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4291 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
4292 $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
4293 } ## TODO: And if supported
4294 }
4295 ## TODO: Change the encoding
4296 }
4297
4298 !!!next-token;
4299 redo B;
4300 } elsif ($token->{tag_name} eq 'title') {
4301 !!!parse-error (type => 'in body:title');
4302 ## NOTE: This is an "as if in head" code clone
4303 $parse_rcdata->(RCDATA_CONTENT_MODEL, sub {
4304 if (defined $self->{head_element}) {
4305 $self->{head_element}->append_child ($_[0]);
4306 } else {
4307 $insert->($_[0]);
4308 }
4309 });
4310 redo B;
4311 } elsif ($token->{tag_name} eq 'body') {
4312 !!!parse-error (type => 'in body:body');
4313
4314 if (@{$self->{open_elements}} == 1 or
4315 $self->{open_elements}->[1]->[1] ne 'body') {
4316 ## Ignore the token
4317 } else {
4318 my $body_el = $self->{open_elements}->[1]->[0];
4319 for my $attr_name (keys %{$token->{attributes}}) {
4320 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
4321 $body_el->set_attribute_ns
4322 (undef, [undef, $attr_name],
4323 $token->{attributes}->{$attr_name}->{value});
4324 }
4325 }
4326 }
4327 !!!next-token;
4328 redo B;
4329 } elsif ({
4330 address => 1, blockquote => 1, center => 1, dir => 1,
4331 div => 1, dl => 1, fieldset => 1, listing => 1,
4332 menu => 1, ol => 1, p => 1, ul => 1,
4333 pre => 1,
4334 }->{$token->{tag_name}}) {
4335 ## has a p element in scope
4336 INSCOPE: for (reverse @{$self->{open_elements}}) {
4337 if ($_->[1] eq 'p') {
4338 !!!back-token;
4339 $token = {type => 'end tag', tag_name => 'p'};
4340 redo B;
4341 } elsif ({
4342 table => 1, caption => 1, td => 1, th => 1,
4343 button => 1, marquee => 1, object => 1, html => 1,
4344 }->{$_->[1]}) {
4345 last INSCOPE;
4346 }
4347 } # INSCOPE
4348
4349 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4350 if ($token->{tag_name} eq 'pre') {
4351 !!!next-token;
4352 if ($token->{type} eq 'character') {
4353 $token->{data} =~ s/^\x0A//;
4354 unless (length $token->{data}) {
4355 !!!next-token;
4356 }
4357 }
4358 } else {
4359 !!!next-token;
4360 }
4361 redo B;
4362 } elsif ($token->{tag_name} eq 'form') {
4363 if (defined $self->{form_element}) {
4364 !!!parse-error (type => 'in form:form');
4365 ## Ignore the token
4366 !!!next-token;
4367 redo B;
4368 } else {
4369 ## has a p element in scope
4370 INSCOPE: for (reverse @{$self->{open_elements}}) {
4371 if ($_->[1] eq 'p') {
4372 !!!back-token;
4373 $token = {type => 'end tag', tag_name => 'p'};
4374 redo B;
4375 } elsif ({
4376 table => 1, caption => 1, td => 1, th => 1,
4377 button => 1, marquee => 1, object => 1, html => 1,
4378 }->{$_->[1]}) {
4379 last INSCOPE;
4380 }
4381 } # INSCOPE
4382
4383 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4384 $self->{form_element} = $self->{open_elements}->[-1]->[0];
4385 !!!next-token;
4386 redo B;
4387 }
4388 } elsif ($token->{tag_name} eq 'li') {
4389 ## has a p element in scope
4390 INSCOPE: for (reverse @{$self->{open_elements}}) {
4391 if ($_->[1] eq 'p') {
4392 !!!back-token;
4393 $token = {type => 'end tag', tag_name => 'p'};
4394 redo B;
4395 } elsif ({
4396 table => 1, caption => 1, td => 1, th => 1,
4397 button => 1, marquee => 1, object => 1, html => 1,
4398 }->{$_->[1]}) {
4399 last INSCOPE;
4400 }
4401 } # INSCOPE
4402
4403 ## Step 1
4404 my $i = -1;
4405 my $node = $self->{open_elements}->[$i];
4406 LI: {
4407 ## Step 2
4408 if ($node->[1] eq 'li') {
4409 if ($i != -1) {
4410 !!!parse-error (type => 'end tag missing:'.
4411 $self->{open_elements}->[-1]->[1]);
4412 }
4413 splice @{$self->{open_elements}}, $i;
4414 last LI;
4415 }
4416
4417 ## Step 3
4418 if (not $formatting_category->{$node->[1]} and
4419 #not $phrasing_category->{$node->[1]} and
4420 ($special_category->{$node->[1]} or
4421 $scoping_category->{$node->[1]}) and
4422 $node->[1] ne 'address' and $node->[1] ne 'div') {
4423 last LI;
4424 }
4425
4426 ## Step 4
4427 $i--;
4428 $node = $self->{open_elements}->[$i];
4429 redo LI;
4430 } # LI
4431
4432 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4433 !!!next-token;
4434 redo B;
4435 } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
4436 ## has a p element in scope
4437 INSCOPE: for (reverse @{$self->{open_elements}}) {
4438 if ($_->[1] eq 'p') {
4439 !!!back-token;
4440 $token = {type => 'end tag', tag_name => 'p'};
4441 redo B;
4442 } elsif ({
4443 table => 1, caption => 1, td => 1, th => 1,
4444 button => 1, marquee => 1, object => 1, html => 1,
4445 }->{$_->[1]}) {
4446 last INSCOPE;
4447 }
4448 } # INSCOPE
4449
4450 ## Step 1
4451 my $i = -1;
4452 my $node = $self->{open_elements}->[$i];
4453 LI: {
4454 ## Step 2
4455 if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
4456 if ($i != -1) {
4457 !!!parse-error (type => 'end tag missing:'.
4458 $self->{open_elements}->[-1]->[1]);
4459 }
4460 splice @{$self->{open_elements}}, $i;
4461 last LI;
4462 }
4463
4464 ## Step 3
4465 if (not $formatting_category->{$node->[1]} and
4466 #not $phrasing_category->{$node->[1]} and
4467 ($special_category->{$node->[1]} or
4468 $scoping_category->{$node->[1]}) and
4469 $node->[1] ne 'address' and $node->[1] ne 'div') {
4470 last LI;
4471 }
4472
4473 ## Step 4
4474 $i--;
4475 $node = $self->{open_elements}->[$i];
4476 redo LI;
4477 } # LI
4478
4479 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4480 !!!next-token;
4481 redo B;
4482 } elsif ($token->{tag_name} eq 'plaintext') {
4483 ## has a p element in scope
4484 INSCOPE: for (reverse @{$self->{open_elements}}) {
4485 if ($_->[1] eq 'p') {
4486 !!!back-token;
4487 $token = {type => 'end tag', tag_name => 'p'};
4488 redo B;
4489 } elsif ({
4490 table => 1, caption => 1, td => 1, th => 1,
4491 button => 1, marquee => 1, object => 1, html => 1,
4492 }->{$_->[1]}) {
4493 last INSCOPE;
4494 }
4495 } # INSCOPE
4496
4497 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4498
4499 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
4500
4501 !!!next-token;
4502 redo B;
4503 } elsif ({
4504 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
4505 }->{$token->{tag_name}}) {
4506 ## has a p element in scope
4507 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4508 my $node = $self->{open_elements}->[$_];
4509 if ($node->[1] eq 'p') {
4510 !!!back-token;
4511 $token = {type => 'end tag', tag_name => 'p'};
4512 redo B;
4513 } elsif ({
4514 table => 1, caption => 1, td => 1, th => 1,
4515 button => 1, marquee => 1, object => 1, html => 1,
4516 }->{$node->[1]}) {
4517 last INSCOPE;
4518 }
4519 } # INSCOPE
4520
4521 ## NOTE: See <http://html5.org/tools/web-apps-tracker?from=925&to=926>
4522 ## has an element in scope
4523 #my $i;
4524 #INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4525 # my $node = $self->{open_elements}->[$_];
4526 # if ({
4527 # h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
4528 # }->{$node->[1]}) {
4529 # $i = $_;
4530 # last INSCOPE;
4531 # } elsif ({
4532 # table => 1, caption => 1, td => 1, th => 1,
4533 # button => 1, marquee => 1, object => 1, html => 1,
4534 # }->{$node->[1]}) {
4535 # last INSCOPE;
4536 # }
4537 #} # INSCOPE
4538 #
4539 #if (defined $i) {
4540 # !!! parse-error (type => 'in hn:hn');
4541 # splice @{$self->{open_elements}}, $i;
4542 #}
4543
4544 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4545
4546 !!!next-token;
4547 redo B;
4548 } elsif ($token->{tag_name} eq 'a') {
4549 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
4550 my $node = $active_formatting_elements->[$i];
4551 if ($node->[1] eq 'a') {
4552 !!!parse-error (type => 'in a:a');
4553
4554 !!!back-token;
4555 $token = {type => 'end tag', tag_name => 'a'};
4556 $formatting_end_tag->($token->{tag_name});
4557
4558 AFE2: for (reverse 0..$#$active_formatting_elements) {
4559 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
4560 splice @$active_formatting_elements, $_, 1;
4561 last AFE2;
4562 }
4563 } # AFE2
4564 OE: for (reverse 0..$#{$self->{open_elements}}) {
4565 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
4566 splice @{$self->{open_elements}}, $_, 1;
4567 last OE;
4568 }
4569 } # OE
4570 last AFE;
4571 } elsif ($node->[0] eq '#marker') {
4572 last AFE;
4573 }
4574 } # AFE
4575
4576 $reconstruct_active_formatting_elements->($insert_to_current);
4577
4578 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4579 push @$active_formatting_elements, $self->{open_elements}->[-1];
4580
4581 !!!next-token;
4582 redo B;
4583 } elsif ({ ## Start tags of formatting elements (other than |a| and |nobr|)
4584 b => 1, big => 1, em => 1, font => 1, i => 1,
4585 s => 1, small => 1, strike => 1, ## |strike| element
4586 strong => 1, tt => 1, u => 1,
4587 }->{$token->{tag_name}}) {
4588 $reconstruct_active_formatting_elements->($insert_to_current);
4589
4590 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4591 push @$active_formatting_elements, $self->{open_elements}->[-1]; ## Record the new current node as an active formatting element
4592
4593 !!!next-token;
4594 redo B;
4595 } elsif ($token->{tag_name} eq 'nobr') {
4596 $reconstruct_active_formatting_elements->($insert_to_current);
4597
4598 ## has a |nobr| element in scope
4599 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4600 my $node = $self->{open_elements}->[$_];
4601 if ($node->[1] eq 'nobr') {
4602 !!!parse-error (type => 'not closed:nobr');
4603 !!!back-token;
4604 $token = {type => 'end tag', tag_name => 'nobr'};
4605 redo B;
4606 } elsif ({
4607 table => 1, caption => 1, td => 1, th => 1,
4608 button => 1, marquee => 1, object => 1, html => 1,
4609 }->{$node->[1]}) {
4610 last INSCOPE;
4611 }
4612 } # INSCOPE
4613
4614 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4615 push @$active_formatting_elements, $self->{open_elements}->[-1];
4616
4617 !!!next-token;
4618 redo B;
4619 } elsif ($token->{tag_name} eq 'button') {
4620 ## has a button element in scope
4621 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4622 my $node = $self->{open_elements}->[$_];
4623 if ($node->[1] eq 'button') {
4624 !!!parse-error (type => 'in button:button');
4625 !!!back-token;
4626 $token = {type => 'end tag', tag_name => 'button'};
4627 redo B;
4628 } elsif ({
4629 table => 1, caption => 1, td => 1, th => 1,
4630 button => 1, marquee => 1, object => 1, html => 1,
4631 }->{$node->[1]}) {
4632 last INSCOPE;
4633 }
4634 } # INSCOPE
4635
4636 $reconstruct_active_formatting_elements->($insert_to_current);
4637
4638 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4639 push @$active_formatting_elements, ['#marker', ''];
4640
4641 !!!next-token;
4642 redo B;
4643 } elsif ($token->{tag_name} eq 'marquee' or
4644 $token->{tag_name} eq 'object') {
4645 $reconstruct_active_formatting_elements->($insert_to_current);
4646
4647 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4648 push @$active_formatting_elements, ['#marker', ''];
4649
4650 !!!next-token;
4651 redo B;
4652 } elsif ($token->{tag_name} eq 'xmp') {
4653 $reconstruct_active_formatting_elements->($insert_to_current);
4654 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4655 redo B;
4656 } elsif ($token->{tag_name} eq 'table') {
4657 ## has a p element in scope
4658 INSCOPE: for (reverse @{$self->{open_elements}}) {
4659 if ($_->[1] eq 'p') {
4660 !!!back-token;
4661 $token = {type => 'end tag', tag_name => 'p'};
4662 redo B;
4663 } elsif ({
4664 table => 1, caption => 1, td => 1, th => 1,
4665 button => 1, marquee => 1, object => 1, html => 1,
4666 }->{$_->[1]}) {
4667 last INSCOPE;
4668 }
4669 } # INSCOPE
4670
4671 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4672
4673 $self->{insertion_mode} = 'in table';
4674
4675 !!!next-token;
4676 redo B;
4677 } elsif ({
4678 area => 1, basefont => 1, bgsound => 1, br => 1,
4679 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
4680 image => 1,
4681 }->{$token->{tag_name}}) {
4682 if ($token->{tag_name} eq 'image') {
4683 !!!parse-error (type => 'image');
4684 $token->{tag_name} = 'img';
4685 }
4686
4687 ## NOTE: There is an "as if <br>" code clone.
4688 $reconstruct_active_formatting_elements->($insert_to_current);
4689
4690 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4691 pop @{$self->{open_elements}};
4692
4693 !!!next-token;
4694 redo B;
4695 } elsif ($token->{tag_name} eq 'hr') {
4696 ## has a p element in scope
4697 INSCOPE: for (reverse @{$self->{open_elements}}) {
4698 if ($_->[1] eq 'p') {
4699 !!!back-token;
4700 $token = {type => 'end tag', tag_name => 'p'};
4701 redo B;
4702 } elsif ({
4703 table => 1, caption => 1, td => 1, th => 1,
4704 button => 1, marquee => 1, object => 1, html => 1,
4705 }->{$_->[1]}) {
4706 last INSCOPE;
4707 }
4708 } # INSCOPE
4709
4710 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4711 pop @{$self->{open_elements}};
4712
4713 !!!next-token;
4714 redo B;
4715 } elsif ($token->{tag_name} eq 'input') {
4716 $reconstruct_active_formatting_elements->($insert_to_current);
4717
4718 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4719 ## TODO: associate with $self->{form_element} if defined
4720 pop @{$self->{open_elements}};
4721
4722 !!!next-token;
4723 redo B;
4724 } elsif ($token->{tag_name} eq 'isindex') {
4725 !!!parse-error (type => 'isindex');
4726
4727 if (defined $self->{form_element}) {
4728 ## Ignore the token
4729 !!!next-token;
4730 redo B;
4731 } else {
4732 my $at = $token->{attributes};
4733 my $form_attrs;
4734 $form_attrs->{action} = $at->{action} if $at->{action};
4735 my $prompt_attr = $at->{prompt};
4736 $at->{name} = {name => 'name', value => 'isindex'};
4737 delete $at->{action};
4738 delete $at->{prompt};
4739 my @tokens = (
4740 {type => 'start tag', tag_name => 'form',
4741 attributes => $form_attrs},
4742 {type => 'start tag', tag_name => 'hr'},
4743 {type => 'start tag', tag_name => 'p'},
4744 {type => 'start tag', tag_name => 'label'},
4745 );
4746 if ($prompt_attr) {
4747 push @tokens, {type => 'character', data => $prompt_attr->{value}};
4748 } else {
4749 push @tokens, {type => 'character',
4750 data => 'This is a searchable index. Insert your search keywords here: '}; # SHOULD
4751 ## TODO: make this configurable
4752 }
4753 push @tokens,
4754 {type => 'start tag', tag_name => 'input', attributes => $at},
4755 #{type => 'character', data => ''}, # SHOULD
4756 {type => 'end tag', tag_name => 'label'},
4757 {type => 'end tag', tag_name => 'p'},
4758 {type => 'start tag', tag_name => 'hr'},
4759 {type => 'end tag', tag_name => 'form'};
4760 $token = shift @tokens;
4761 !!!back-token (@tokens);
4762 redo B;
4763 }
4764 } elsif ($token->{tag_name} eq 'textarea') {
4765 my $tag_name = $token->{tag_name};
4766 my $el;
4767 !!!create-element ($el, $token->{tag_name}, $token->{attributes});
4768
4769 ## TODO: $self->{form_element} if defined
4770 $self->{content_model} = RCDATA_CONTENT_MODEL;
4771 delete $self->{escape}; # MUST
4772
4773 $insert->($el);
4774
4775 my $text = '';
4776 !!!next-token;
4777 if ($token->{type} eq 'character') {
4778 $token->{data} =~ s/^\x0A//;
4779 unless (length $token->{data}) {
4780 !!!next-token;
4781 }
4782 }
4783 while ($token->{type} eq 'character') {
4784 $text .= $token->{data};
4785 !!!next-token;
4786 }
4787 if (length $text) {
4788 $el->manakai_append_text ($text);
4789 }
4790
4791 $self->{content_model} = PCDATA_CONTENT_MODEL;
4792
4793 if ($token->{type} eq 'end tag' and
4794 $token->{tag_name} eq $tag_name) {
4795 ## Ignore the token
4796 } else {
4797 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
4798 }
4799 !!!next-token;
4800 redo B;
4801 } elsif ({
4802 iframe => 1,
4803 noembed => 1,
4804 noframes => 1,
4805 noscript => 0, ## TODO: 1 if scripting is enabled
4806 }->{$token->{tag_name}}) {
4807 ## NOTE: There are two "as if in body" code clones.
4808 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4809 redo B;
4810 } elsif ($token->{tag_name} eq 'select') {
4811 $reconstruct_active_formatting_elements->($insert_to_current);
4812
4813 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4814
4815 $self->{insertion_mode} = 'in select';
4816 !!!next-token;
4817 redo B;
4818 } elsif ({
4819 caption => 1, col => 1, colgroup => 1, frame => 1,
4820 frameset => 1, head => 1, option => 1, optgroup => 1,
4821 tbody => 1, td => 1, tfoot => 1, th => 1,
4822 thead => 1, tr => 1,
4823 }->{$token->{tag_name}}) {
4824 !!!parse-error (type => 'in body:'.$token->{tag_name});
4825 ## Ignore the token
4826 !!!next-token;
4827 redo B;
4828
4829 ## ISSUE: An issue on HTML5 new elements in the spec.
4830 } else {
4831 $reconstruct_active_formatting_elements->($insert_to_current);
4832
4833 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4834
4835 !!!next-token;
4836 redo B;
4837 }
4838 } elsif ($token->{type} eq 'end tag') {
4839 if ($token->{tag_name} eq 'body') {
4840 if (@{$self->{open_elements}} > 1 and
4841 $self->{open_elements}->[1]->[1] eq 'body') {
4842 for (@{$self->{open_elements}}) {
4843 unless ({
4844 dd => 1, dt => 1, li => 1, p => 1, td => 1,
4845 th => 1, tr => 1, body => 1, html => 1,
4846 tbody => 1, tfoot => 1, thead => 1,
4847 }->{$_->[1]}) {
4848 !!!parse-error (type => 'not closed:'.$_->[1]);
4849 }
4850 }
4851
4852 $self->{insertion_mode} = 'after body';
4853 !!!next-token;
4854 redo B;
4855 } else {
4856 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4857 ## Ignore the token
4858 !!!next-token;
4859 redo B;
4860 }
4861 } elsif ($token->{tag_name} eq 'html') {
4862 if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
4863 ## ISSUE: There is an issue in the spec.
4864 if ($self->{open_elements}->[-1]->[1] ne 'body') { ## Current node is not |body|: report the current node, which is still open
4865 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4866 }
4867 $self->{insertion_mode} = 'after body'; ## The same </html> token is then handled in "after body"
4868 ## reprocess
4869 redo B;
4870 } else { ## No |body| element as the second entry of the stack of open elements
4871 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4872 ## Ignore the token
4873 !!!next-token;
4874 redo B;
4875 }
4876 } elsif ({
4877 address => 1, blockquote => 1, center => 1, dir => 1,
4878 div => 1, dl => 1, fieldset => 1, listing => 1,
4879 menu => 1, ol => 1, pre => 1, ul => 1,
4880 p => 1,
4881 dd => 1, dt => 1, li => 1,
4882 button => 1, marquee => 1, object => 1,
4883 }->{$token->{tag_name}}) {
4884 ## has an element in scope
4885 my $i;
4886 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4887 my $node = $self->{open_elements}->[$_];
4888 if ($node->[1] eq $token->{tag_name}) {
4889 ## generate implied end tags
4890 if ({
4891 dd => ($token->{tag_name} ne 'dd'),
4892 dt => ($token->{tag_name} ne 'dt'),
4893 li => ($token->{tag_name} ne 'li'),
4894 p => ($token->{tag_name} ne 'p'),
4895 td => 1, th => 1, tr => 1,
4896 tbody => 1, tfoot=> 1, thead => 1,
4897 }->{$self->{open_elements}->[-1]->[1]}) {
4898 !!!back-token;
4899 $token = {type => 'end tag',
4900 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4901 redo B;
4902 }
4903 $i = $_;
4904 last INSCOPE unless $token->{tag_name} eq 'p';
4905 } elsif ({
4906 table => 1, caption => 1, td => 1, th => 1,
4907 button => 1, marquee => 1, object => 1, html => 1,
4908 }->{$node->[1]}) {
4909 last INSCOPE;
4910 }
4911 } # INSCOPE
4912
4913 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
4914 if (defined $i) {
4915 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4916 } else {
4917 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4918 }
4919 }
4920
4921 if (defined $i) {
4922 splice @{$self->{open_elements}}, $i;
4923 } elsif ($token->{tag_name} eq 'p') {
4924 ## As if <p>, then reprocess the current token
4925 my $el;
4926 !!!create-element ($el, 'p');
4927 $insert->($el);
4928 }
4929 $clear_up_to_marker->()
4930 if {
4931 button => 1, marquee => 1, object => 1,
4932 }->{$token->{tag_name}};
4933 !!!next-token;
4934 redo B;
4935 } elsif ($token->{tag_name} eq 'form') {
4936 ## has an element in scope
4937 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4938 my $node = $self->{open_elements}->[$_];
4939 if ($node->[1] eq $token->{tag_name}) {
4940 ## generate implied end tags
4941 if ({
4942 dd => 1, dt => 1, li => 1, p => 1,
4943 td => 1, th => 1, tr => 1,
4944 tbody => 1, tfoot=> 1, thead => 1,
4945 }->{$self->{open_elements}->[-1]->[1]}) {
4946 !!!back-token;
4947 $token = {type => 'end tag',
4948 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4949 redo B;
4950 }
4951 last INSCOPE;
4952 } elsif ({
4953 table => 1, caption => 1, td => 1, th => 1,
4954 button => 1, marquee => 1, object => 1, html => 1,
4955 }->{$node->[1]}) {
4956 last INSCOPE;
4957 }
4958 } # INSCOPE
4959
4960 if ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
4961 pop @{$self->{open_elements}};
4962 } else {
4963 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4964 }
4965
4966 undef $self->{form_element};
4967 !!!next-token;
4968 redo B;
4969 } elsif ({
4970 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
4971 }->{$token->{tag_name}}) {
4972 ## has an element in scope
4973 my $i;
4974 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4975 my $node = $self->{open_elements}->[$_];
4976 if ({
4977 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
4978 }->{$node->[1]}) {
4979 ## generate implied end tags
4980 if ({
4981 dd => 1, dt => 1, li => 1, p => 1,
4982 td => 1, th => 1, tr => 1,
4983 tbody => 1, tfoot=> 1, thead => 1,
4984 }->{$self->{open_elements}->[-1]->[1]}) {
4985 !!!back-token;
4986 $token = {type => 'end tag',
4987 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4988 redo B;
4989 }
4990 $i = $_;
4991 last INSCOPE;
4992 } elsif ({
4993 table => 1, caption => 1, td => 1, th => 1,
4994 button => 1, marquee => 1, object => 1, html => 1,
4995 }->{$node->[1]}) {
4996 last INSCOPE;
4997 }
4998 } # INSCOPE
4999
5000 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
5001 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
5002 }
5003
5004 splice @{$self->{open_elements}}, $i if defined $i;
5005 !!!next-token;
5006 redo B;
5007 } elsif ({
5008 a => 1,
5009 b => 1, big => 1, em => 1, font => 1, i => 1,
5010 nobr => 1, s => 1, small => 1, strile => 1,
5011 strong => 1, tt => 1, u => 1,
5012 }->{$token->{tag_name}}) {
5013 $formatting_end_tag->($token->{tag_name});
5014 redo B;
5015 } elsif ($token->{tag_name} eq 'br') {
5016 !!!parse-error (type => 'unmatched end tag:br');
5017
5018 ## As if <br>
5019 $reconstruct_active_formatting_elements->($insert_to_current);
5020
5021 my $el;
5022 !!!create-element ($el, 'br');
5023 $insert->($el);
5024
5025 ## Ignore the token.
5026 !!!next-token;
5027 redo B;
5028 } elsif ({
5029 caption => 1, col => 1, colgroup => 1, frame => 1,
5030 frameset => 1, head => 1, option => 1, optgroup => 1,
5031 tbody => 1, td => 1, tfoot => 1, th => 1,
5032 thead => 1, tr => 1,
5033 area => 1, basefont => 1, bgsound => 1,
5034 embed => 1, hr => 1, iframe => 1, image => 1,
5035 img => 1, input => 1, isindex => 1, noembed => 1,
5036 noframes => 1, param => 1, select => 1, spacer => 1,
5037 table => 1, textarea => 1, wbr => 1,
5038 noscript => 0, ## TODO: if scripting is enabled
5039 }->{$token->{tag_name}}) {
5040 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5041 ## Ignore the token
5042 !!!next-token;
5043 redo B;
5044
5045 ## ISSUE: Issue on HTML5 new elements in spec
5046
5047 } else {
5048 ## Step 1
5049 my $node_i = -1;
5050 my $node = $self->{open_elements}->[$node_i];
5051
5052 ## Step 2
5053 S2: {
5054 if ($node->[1] eq $token->{tag_name}) {
5055 ## Step 1
5056 ## generate implied end tags
5057 if ({
5058 dd => 1, dt => 1, li => 1, p => 1,
5059 td => 1, th => 1, tr => 1,
5060 tbody => 1, tfoot=> 1, thead => 1,
5061 }->{$self->{open_elements}->[-1]->[1]}) {
5062 !!!back-token;
5063 $token = {type => 'end tag',
5064 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5065 redo B;
5066 }
5067
5068 ## Step 2
5069 if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
5070 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
5071 }
5072
5073 ## Step 3
5074 splice @{$self->{open_elements}}, $node_i;
5075
5076 !!!next-token;
5077 last S2;
5078 } else {
5079 ## Step 3
5080 if (not $formatting_category->{$node->[1]} and
5081 #not $phrasing_category->{$node->[1]} and
5082 ($special_category->{$node->[1]} or
5083 $scoping_category->{$node->[1]})) {
5084 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5085 ## Ignore the token
5086 !!!next-token;
5087 last S2;
5088 }
5089 }
5090
5091 ## Step 4
5092 $node_i--;
5093 $node = $self->{open_elements}->[$node_i];
5094
5095 ## Step 5;
5096 redo S2;
5097 } # S2
5098 redo B;
5099 }
5100 }
5101 redo B;
5102 } # B
5103
5104 ## NOTE: The "trailing end" phase in HTML5 is split into
5105 ## two insertion modes: "after html body" and "after html frameset".
5106 ## NOTE: States in the main stage is preserved while
5107 ## the parser stays in the trailing end phase. # MUST
5108
5109 ## Stop parsing # MUST
5110
5111 ## TODO: script stuffs
5112 } # _tree_construct_main
5113
## Replace the content of |$node| with the result of parsing an HTML
## source string (the |innerHTML| setter).
##
## Arguments (invoked as a class method):
##   $class   Parser class; |new| and |parse_string| are called on it.
##   $node    Target node.  Node type 9 (Document) and node type 1
##            (Element) are supported.
##   $_[0]    HTML source text, accessed through a reference to the
##            caller's argument.
##   $_[1]    Optional parse error handler (code reference).  For an
##            element target, a handler that |warn|s is used when omitted.
##
## Dies if |$node| has any other node type.
sub set_inner_html ($$$) {
  my ($class, $node) = (shift, shift);
  my $src = \$_[0];
  my $onerror = $_[1];

  my $nt = $node->node_type;
  unless ($nt == 9 or $nt == 1) {
    die "$0: |set_inner_html| is not defined for node of type $nt";
  }

  if ($nt == 9) { # Document
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## Snapshot the child list before removing anything from it.
    my @old_children = @{$node->child_nodes};
    $node->remove_child ($_) for @old_children;

    ## Step 3, 4, 5 # MUST
    return $class->parse_string ($$src => $node, $onerror);
  }

  ## From here on |$node| is an element.
  ## TODO: If non-html element

  ## NOTE: Most of this code is copied from |parse_string|

  ## Step 1 # MUST
  ## Parse into a scratch HTML document owned by a fresh parser.
  my $owner_doc = $node->owner_document;
  my $scratch_doc = $owner_doc->implementation->create_document;
  $scratch_doc->manakai_is_html (1);
  my $parser = $class->new;
  $parser->{document} = $scratch_doc;

  ## Step 9 # MUST
  ## Input stream state shared by the closures below.
  my $pos = 0;
  my $line = 1;
  my $column = 0;
  $parser->{set_next_input_character} = sub {
    ## NOTE: The variable must be named |$self|; the |!!!parse-error|
    ## macro below is used with it in scope.
    my $self = shift;

    ## Rotate the record of previous input characters.
    pop @{$self->{prev_input_character}};
    unshift @{$self->{prev_input_character}}, $self->{next_input_character};

    if ($pos >= length $$src) {
      $self->{next_input_character} = -1; # end of input
      return;
    }
    $self->{next_input_character} = ord substr $$src, $pos++, 1;
    $column++;

    if ($self->{next_input_character} == 0x000A) { # LF
      $line++;
      $column = 0;
    } elsif ($self->{next_input_character} == 0x000D) { # CR
      ## CR and CR LF are both reported as a single LF.
      $pos++ if substr ($$src, $pos, 1) eq "\x0A";
      $self->{next_input_character} = 0x000A; # LF # MUST
      $line++;
      $column = 0;
    } elsif ($self->{next_input_character} > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($self->{next_input_character} == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };
  $parser->{prev_input_character} = [-1, -1, -1];
  $parser->{next_input_character} = -1;

  ## Errors are reported with the current line and column appended.
  my $ponerror = $onerror || sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  $parser->{parse_error} = sub {
    $ponerror->(@_, line => $line, column => $column);
  };

  $parser->_initialize_tokenizer;
  $parser->_initialize_tree_constructor;

  ## Step 2
  ## The initial content model is selected by the context element's
  ## local name; anything not listed is parsed as PCDATA.
  my $node_ln = $node->local_name;
  my %content_model_of = (
    title     => RCDATA_CONTENT_MODEL,
    textarea  => RCDATA_CONTENT_MODEL,
    style     => CDATA_CONTENT_MODEL,
    script    => CDATA_CONTENT_MODEL,
    xmp       => CDATA_CONTENT_MODEL,
    iframe    => CDATA_CONTENT_MODEL,
    noembed   => CDATA_CONTENT_MODEL,
    noframes  => CDATA_CONTENT_MODEL,
    noscript  => CDATA_CONTENT_MODEL,
    plaintext => PLAINTEXT_CONTENT_MODEL,
  );
  my $content_model = $content_model_of{$node_ln};
  $parser->{content_model} = defined $content_model
      ? $content_model : PCDATA_CONTENT_MODEL;
  ## ISSUE: What is "the name of the element"? local name?

  $parser->{inner_html_node} = [$node, $node_ln];

  ## Step 4
  my $root = $scratch_doc->create_element_ns
      ('http://www.w3.org/1999/xhtml', [undef, 'html']);

  ## Step 5 # MUST
  $scratch_doc->append_child ($root);

  ## Step 6 # MUST
  push @{$parser->{open_elements}}, [$root, 'html'];

  $parser->{head_element} = undef;

  ## Step 7 # MUST
  $parser->_reset_insertion_mode;

  ## Step 8 # MUST
  ## The nearest XHTML |form| element among the context node and its
  ## ancestors, if any, becomes the parser's form element.
  for (my $anode = $node; defined $anode; $anode = $anode->parent_node) {
    next unless $anode->node_type == 1;
    my $nsuri = $anode->namespace_uri;
    next unless defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml';
    if ($anode->local_name eq 'form') { ## TODO: case?
      $parser->{form_element} = $anode;
      last;
    }
  }

  ## Step 3 # MUST
  ## Step 10 # MUST
  {
    ## NOTE: |!!!next-token| is used with |$self| in scope.
    my $self = $parser;
    !!!next-token;
  }
  $parser->_tree_construction_main;

  ## Step 11 # MUST
  my @old_children = @{$node->child_nodes};
  $node->remove_child ($_) for @old_children;
  ## ISSUE: mutation events? read-only?

  ## Step 12 # MUST
  ## Move the parsed nodes from the scratch root into the target.
  my @new_children = @{$root->child_nodes};
  for my $child (@new_children) {
    $owner_doc->adopt_node ($child);
    $node->append_child ($child);
  }
  ## ISSUE: mutation events?

  $parser->_terminate_tree_constructor;
} # set_inner_html
5269
5270 } # tree construction stage
5271
## Serialize the children of |$node| as an HTML fragment (the |innerHTML|
## getter).
##
## Arguments:
##   (first)     Ignored, so the sub can be invoked as a class method.
##   $node       Node whose children are serialized.
##   $on_error   Optional code reference; called with the child node for
##               every node type this serializer does not handle.
##
## Returns a reference to the serialized string.
sub get_inner_html ($$$) {
  my (undef, $node, $on_error) = @_;

  ## Escaping applied to attribute values and to ordinary text.
  my $escape = sub {
    my $value = shift;
    $value =~ s/&/&amp;/g;
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;
    $value =~ s/"/&quot;/g;
    return $value;
  };

  ## Step 1
  my $s = '';

  ## Text is emitted unescaped when |$node| or one of its ancestors is
  ## one of these XHTML elements.
  my $in_cdata;
  my $parent = $node;
  while (defined $parent) {
    if ($parent->node_type == 1) {
      ## The namespace URI is undefined for null-namespace elements.
      my $nsuri = $parent->namespace_uri;
      if (defined $nsuri and
          $nsuri eq 'http://www.w3.org/1999/xhtml' and
          {
            style => 1, script => 1, xmp => 1, iframe => 1,
            noembed => 1, noframes => 1, noscript => 1,
          }->{$parent->local_name}) { ## TODO: case thingy
        $in_cdata = 1;
        last;
      }
    }
    $parent = $parent->parent_node;
  }

  ## Step 2
  ## |@node| is a work queue processed from the front.  Besides node
  ## objects it holds plain strings: end tags to be appended verbatim,
  ## and the marker 'cdata-out', which ends unescaped text mode.
  my @node = @{$node->child_nodes};
  C: while (@node) {
    my $child = shift @node;
    unless (ref $child) {
      if ($child eq 'cdata-out') {
        $in_cdata = 0;
      } else {
        $s .= $child; # end tag
      }
      next C;
    }

    my $nt = $child->node_type;
    if ($nt == 1) { # Element
      my $tag_name = $child->tag_name; ## TODO: manakai_tag_name
      $s .= '<' . $tag_name;
      ## NOTE: Non-HTML case:
      ## <http://permalink.gmane.org/gmane.org.w3c.whatwg.discuss/11191>

      my @attrs = @{$child->attributes}; # sort order MUST be stable
      for my $attr (@attrs) { # order is implementation dependent
        my $attr_name = $attr->name; ## TODO: manakai_name
        $s .= ' ' . $attr_name . '="' . $escape->($attr->value) . '"';
      }
      $s .= '>';

      ## These elements get neither content nor an end tag.
      next C if {
        area => 1, base => 1, basefont => 1, bgsound => 1,
        br => 1, col => 1, embed => 1, frame => 1, hr => 1,
        img => 1, input => 1, link => 1, meta => 1, param => 1,
        spacer => 1, wbr => 1,
      }->{$tag_name};

      $s .= "\x0A" if $tag_name eq 'pre' or $tag_name eq 'textarea';

      if (not $in_cdata and {
        style => 1, script => 1, xmp => 1, iframe => 1,
        noembed => 1, noframes => 1, noscript => 1,
        plaintext => 1,
      }->{$tag_name}) {
        ## The marker ends up just after this element's end tag.
        unshift @node, 'cdata-out';
        $in_cdata = 1;
      }

      unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';
    } elsif ($nt == 3 or $nt == 4) { # Text, CDATA section
      if ($in_cdata) {
        $s .= $child->data;
      } else {
        $s .= $escape->($child->data);
      }
    } elsif ($nt == 8) { # Comment
      $s .= '<!--' . $child->data . '-->';
    } elsif ($nt == 10) { # Document type
      $s .= '<!DOCTYPE ' . $child->name . '>';
    } elsif ($nt == 5) { # entrefs
      ## The children replace the reference in place, at the front of
      ## the queue.  Appending them to the end would serialize them
      ## after all following siblings and enclosing end tags.
      unshift @node, @{$child->child_nodes};
    } else {
      $on_error->($child) if defined $on_error;
    }
    ## ISSUE: This code does not support PIs.
  } # C

  ## Step 3
  return \$s;
} # get_inner_html
5371
5372 1;
5373 # $Date: 2007/07/21 12:27:22 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24