/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.19 - (show annotations) (download) (as text)
Sat Jun 23 13:05:16 2007 UTC (18 years, 7 months ago) by wakaba
Branch: MAIN
Changes since 1.18: +26 -5 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	23 Jun 2007 12:56:46 -0000
	* tree-test-1.dat: Always use uppercase document type name.

2007-06-23  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	23 Jun 2007 13:05:07 -0000
	* NanoDOM.pm (manakai_is_html): Setting to false did not work.

	* HTML.pm.src: HTML5 revision 914 (</ in CDATA, RCDATA).
	HTML5 revision 915 (<nobr>).

2007-06-23  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.18 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 ## ISSUE:
6 ## var doc = implementation.createDocument (null, null, null);
7 ## doc.write ('');
8 ## alert (doc.compatMode);
9
## Start tags that may carry a trailing "/" directly before ">" (as in
## |<br/>|).  The tokenizer looks a start tag's name up here when it
## meets such a slash; names that are absent get a |nestc| parse error.
my $permitted_slash_tag_name = {
  map { ($_ => 1) } qw(
    base link meta hr br img embed param area col input
  ),
};
23
## Replacement table for the code points 0x80..0x9F: each key is a code
## point in that range and its value is the code point to use instead.
## Positions without a dedicated replacement map to U+FFFD REPLACEMENT
## CHARACTER.
## NOTE(review): Presumably consulted when numeric character references
## are resolved; the code that reads this table is not in this part of
## the file.
my $c1_entity_char = do {
  ## Replacements in key order; the entry at index N belongs to 0x80 + N.
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..0x8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..0x9F
  );
  +{ map { (0x80 + $_, $replacement[$_]) } 0 .. $#replacement };
}; # $c1_entity_char
58
## Tag names that belong to the "special" element category.  A true
## value marks membership.
## NOTE(review): Presumably read by the tree construction code; no use
## of this table is visible in this part of the file.
my $special_category = {
  map { ($_ => 1) } qw(
    address area base basefont bgsound blockquote body br center col
    colgroup dd dir div dl dt embed fieldset form frame frameset
    h1 h2 h3 h4 h5 h6 head hr iframe image img input isindex li link
    listing menu meta noembed noframes noscript ol optgroup option p
    param plaintext pre script select spacer style tbody textarea
    tfoot thead title tr ul wbr
  ),
};
## Tag names that belong to the "scoping" element category.  A true
## value marks membership.
my $scoping_category = {
  map { ($_ => 1) } qw(button caption html marquee object table td th),
};
## Tag names that belong to the "formatting" element category.  A true
## value marks membership.
## NOTE: The entry between |small| and |strong| is |strike|, the HTML
## element name; there is no element called |strile|.
my $formatting_category = {
  a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
  s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
};
79 # $phrasing_category: all other elements
80
## Parse a character string into a document object and return that
## document.
##
## The invocant is only used to call |new|; all work is done on the
## freshly created parser.  Arguments after the invocant:
##   $_[0] - The input string.  It is read through a reference and is
##           never copied or modified here.
##   $_[1] - The document object; stored as |$self->{document}| and
##           returned at the end.
##   $_[2] - Optional error callback.  It is called with key/value
##           pairs: those supplied by the reporter (e.g. |type|)
##           followed by |line| and |column|.  When it is omitted (or
##           false) a default handler reports the error with |warn|.
sub parse_string ($$$;$) {
  my $self = shift->new;
  my $input = \$_[0];
  $self->{document} = $_[1];

  ## NOTE: |set_inner_html| copies most of this method's code

  ## Reader position; shared by the two callbacks installed below.
  my $offset = 0; # index of the next unread character of |$$input|
  my $line = 1;
  my $column = 0;

  ## Input callback: advance |$self->{next_input_character}| by one
  ## character.  The parser object is passed as the only argument.
  $self->{set_next_input_character} = sub {
    my $self = shift;

    ## Slide the look-behind window: forget the oldest entry and put
    ## the character that is being left behind at the front.
    pop @{$self->{prev_input_character}};
    unshift @{$self->{prev_input_character}}, $self->{next_input_character};

    ## End of input is represented by -1.
    if ($offset >= length $$input) {
      $self->{next_input_character} = -1;
      return;
    }

    my $code = ord substr $$input, $offset++, 1;
    $self->{next_input_character} = $code;
    $column++;

    if ($code == 0x000A or $code == 0x000D) { # LF or CR
      if ($code == 0x000D) { # CR
        ## A CR, with or without a directly following LF, is delivered
        ## as one LF.
        $offset++ if substr ($$input, $offset, 1) eq "\x0A";
        $self->{next_input_character} = 0x000A; # LF # MUST
      }
      $line++;
      $column = 0;
    } elsif ($code > 0x10FFFF) {
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($code == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };

  ## Nothing has been read yet: three empty look-behind slots and no
  ## current character.
  $self->{prev_input_character} = [-1, -1, -1];
  $self->{next_input_character} = -1;

  ## Error callback: add the current reader position to every report.
  my $onerror = $_[2] || sub {
    my %opt = @_;
    warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
  };
  $self->{parse_error} = sub {
    $onerror->(@_, line => $line, column => $column);
  };

  ## Tokenizer set-up, then the three tree construction phases.
  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  return $self->{document};
} # parse_string
134
## Create a parser object blessed into |$class|.
##
## The object starts out with two placeholder callbacks:
##   set_next_input_character - Reports end of input: it sets
##       |next_input_character| to -1 on the parser it is given.
##   parse_error - Ignores every report.
## |parse_string| replaces both with real implementations.
##
## The input callback receives the parser as its first argument, in
## the same way as the callback installed by |parse_string|.  It must
## not refer to the lexical |$self| of this constructor: the object
## would then own a closure that owns the object, and such a reference
## cycle is never released by Perl's reference counting.
##
## Returns the new object.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;
  $self->{set_next_input_character} = sub {
    my $parser = shift;
    $parser->{next_input_character} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  return $self;
} # new
146
147 ## Implementations MUST act as if they used the state machine described in the spec
148
## Reset the tokenizer fields of the parser and read the first input
## character.
##
## After this call the tokenizer is in the |data| state with the
## |PCDATA| content model, no token or attribute is under construction,
## and the queue of tokens waiting to be returned by |_get_next_token|
## is empty.
sub _initialize_tokenizer ($) {
  my $self = shift;

  $self->{state} = 'data'; # MUST
  $self->{content_model_flag} = 'PCDATA';

  ## Set to |undef| (the keys themselves are kept):
  ##   current_token - start tag, end tag, comment, or DOCTYPE
  ##   current_attribute
  ##   last_emitted_start_tag_name
  ##   last_attribute_value_state
  $self->{$_} = undef for qw(
    current_token current_attribute
    last_emitted_start_tag_name last_attribute_value_state
  );

  ## |char| has to be emptied before the first character is fetched.
  ## NOTE(review): It looks like the buffer of pushed-back input
  ## characters; confirm against the macro definitions.
  $self->{char} = [];
  # $self->{next_input_character}
  !!!next-input-character;

  ## Tokens that were emitted but not yet handed out.
  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
163
164 ## A token has:
165 ## ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
166 ## 'character', or 'end-of-file'
167 ## ->{name} (DOCTYPE, start tag (tag name), end tag (tag name))
168 ## ->{public_identifier} (DOCTYPE)
169 ## ->{system_identifier} (DOCTYPE)
170 ## ->{correct} == 1 or 0 (DOCTYPE)
171 ## ->{attributes} isa HASH (start tag, end tag)
172 ## ->{data} (comment, character)
173
174 ## Emitted token MUST immediately be handled by the tree construction state.
175
176 ## Before each step, UA MAY check to see if either one of the scripts in
177 ## "list of scripts that will execute as soon as possible" or the first
178 ## script in the "list of scripts that will execute asynchronously",
179 ## has completed loading. If one has, then it MUST be executed
180 ## and removed from the list.
181
182 sub _get_next_token ($) {
183 my $self = shift;
184 if (@{$self->{token}}) {
185 return shift @{$self->{token}};
186 }
187
188 A: {
189 if ($self->{state} eq 'data') {
190 if ($self->{next_input_character} == 0x0026) { # &
191 if ($self->{content_model_flag} eq 'PCDATA' or
192 $self->{content_model_flag} eq 'RCDATA') {
193 $self->{state} = 'entity data';
194 !!!next-input-character;
195 redo A;
196 } else {
197 #
198 }
199 } elsif ($self->{next_input_character} == 0x002D) { # -
200 if ($self->{content_model_flag} eq 'RCDATA' or
201 $self->{content_model_flag} eq 'CDATA') {
202 unless ($self->{escape}) {
203 if ($self->{prev_input_character}->[0] == 0x002D and # -
204 $self->{prev_input_character}->[1] == 0x0021 and # !
205 $self->{prev_input_character}->[2] == 0x003C) { # <
206 $self->{escape} = 1;
207 }
208 }
209 }
210
211 #
212 } elsif ($self->{next_input_character} == 0x003C) { # <
213 if ($self->{content_model_flag} eq 'PCDATA' or
214 (($self->{content_model_flag} eq 'CDATA' or
215 $self->{content_model_flag} eq 'RCDATA') and
216 not $self->{escape})) {
217 $self->{state} = 'tag open';
218 !!!next-input-character;
219 redo A;
220 } else {
221 #
222 }
223 } elsif ($self->{next_input_character} == 0x003E) { # >
224 if ($self->{escape} and
225 ($self->{content_model_flag} eq 'RCDATA' or
226 $self->{content_model_flag} eq 'CDATA')) {
227 if ($self->{prev_input_character}->[0] == 0x002D and # -
228 $self->{prev_input_character}->[1] == 0x002D) { # -
229 delete $self->{escape};
230 }
231 }
232
233 #
234 } elsif ($self->{next_input_character} == -1) {
235 !!!emit ({type => 'end-of-file'});
236 last A; ## TODO: ok?
237 }
238 # Anything else
239 my $token = {type => 'character',
240 data => chr $self->{next_input_character}};
241 ## Stay in the data state
242 !!!next-input-character;
243
244 !!!emit ($token);
245
246 redo A;
247 } elsif ($self->{state} eq 'entity data') {
248 ## (cannot happen in CDATA state)
249
250 my $token = $self->_tokenize_attempt_to_consume_an_entity;
251
252 $self->{state} = 'data';
253 # next-input-character is already done
254
255 unless (defined $token) {
256 !!!emit ({type => 'character', data => '&'});
257 } else {
258 !!!emit ($token);
259 }
260
261 redo A;
262 } elsif ($self->{state} eq 'tag open') {
263 if ($self->{content_model_flag} eq 'RCDATA' or
264 $self->{content_model_flag} eq 'CDATA') {
265 if ($self->{next_input_character} == 0x002F) { # /
266 !!!next-input-character;
267 $self->{state} = 'close tag open';
268 redo A;
269 } else {
270 ## reconsume
271 $self->{state} = 'data';
272
273 !!!emit ({type => 'character', data => '<'});
274
275 redo A;
276 }
277 } elsif ($self->{content_model_flag} eq 'PCDATA') {
278 if ($self->{next_input_character} == 0x0021) { # !
279 $self->{state} = 'markup declaration open';
280 !!!next-input-character;
281 redo A;
282 } elsif ($self->{next_input_character} == 0x002F) { # /
283 $self->{state} = 'close tag open';
284 !!!next-input-character;
285 redo A;
286 } elsif (0x0041 <= $self->{next_input_character} and
287 $self->{next_input_character} <= 0x005A) { # A..Z
288 $self->{current_token}
289 = {type => 'start tag',
290 tag_name => chr ($self->{next_input_character} + 0x0020)};
291 $self->{state} = 'tag name';
292 !!!next-input-character;
293 redo A;
294 } elsif (0x0061 <= $self->{next_input_character} and
295 $self->{next_input_character} <= 0x007A) { # a..z
296 $self->{current_token} = {type => 'start tag',
297 tag_name => chr ($self->{next_input_character})};
298 $self->{state} = 'tag name';
299 !!!next-input-character;
300 redo A;
301 } elsif ($self->{next_input_character} == 0x003E) { # >
302 !!!parse-error (type => 'empty start tag');
303 $self->{state} = 'data';
304 !!!next-input-character;
305
306 !!!emit ({type => 'character', data => '<>'});
307
308 redo A;
309 } elsif ($self->{next_input_character} == 0x003F) { # ?
310 !!!parse-error (type => 'pio');
311 $self->{state} = 'bogus comment';
312 ## $self->{next_input_character} is intentionally left as is
313 redo A;
314 } else {
315 !!!parse-error (type => 'bare stago');
316 $self->{state} = 'data';
317 ## reconsume
318
319 !!!emit ({type => 'character', data => '<'});
320
321 redo A;
322 }
323 } else {
324 die "$0: $self->{content_model_flag}: Unknown content model flag";
325 }
326 } elsif ($self->{state} eq 'close tag open') {
327 if ($self->{content_model_flag} eq 'RCDATA' or
328 $self->{content_model_flag} eq 'CDATA') {
329 my @next_char;
330 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
331 push @next_char, $self->{next_input_character};
332 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
333 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
334 if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
335 !!!next-input-character;
336 next TAGNAME;
337 } else {
338 $self->{next_input_character} = shift @next_char; # reconsume
339 !!!back-next-input-character (@next_char);
340 $self->{state} = 'data';
341
342 !!!emit ({type => 'character', data => '</'});
343
344 redo A;
345 }
346 }
347 push @next_char, $self->{next_input_character};
348
349 unless ($self->{next_input_character} == 0x0009 or # HT
350 $self->{next_input_character} == 0x000A or # LF
351 $self->{next_input_character} == 0x000B or # VT
352 $self->{next_input_character} == 0x000C or # FF
353 $self->{next_input_character} == 0x0020 or # SP
354 $self->{next_input_character} == 0x003E or # >
355 $self->{next_input_character} == 0x002F or # /
356 $self->{next_input_character} == -1) {
357 $self->{next_input_character} = shift @next_char; # reconsume
358 !!!back-next-input-character (@next_char);
359 $self->{state} = 'data';
360
361 !!!emit ({type => 'character', data => '</'});
362
363 redo A;
364 } else {
365 $self->{next_input_character} = shift @next_char;
366 !!!back-next-input-character (@next_char);
367 # and consume...
368 }
369 }
370
371 if (0x0041 <= $self->{next_input_character} and
372 $self->{next_input_character} <= 0x005A) { # A..Z
373 $self->{current_token} = {type => 'end tag',
374 tag_name => chr ($self->{next_input_character} + 0x0020)};
375 $self->{state} = 'tag name';
376 !!!next-input-character;
377 redo A;
378 } elsif (0x0061 <= $self->{next_input_character} and
379 $self->{next_input_character} <= 0x007A) { # a..z
380 $self->{current_token} = {type => 'end tag',
381 tag_name => chr ($self->{next_input_character})};
382 $self->{state} = 'tag name';
383 !!!next-input-character;
384 redo A;
385 } elsif ($self->{next_input_character} == 0x003E) { # >
386 !!!parse-error (type => 'empty end tag');
387 $self->{state} = 'data';
388 !!!next-input-character;
389 redo A;
390 } elsif ($self->{next_input_character} == -1) {
391 !!!parse-error (type => 'bare etago');
392 $self->{state} = 'data';
393 # reconsume
394
395 !!!emit ({type => 'character', data => '</'});
396
397 redo A;
398 } else {
399 !!!parse-error (type => 'bogus end tag');
400 $self->{state} = 'bogus comment';
401 ## $self->{next_input_character} is intentionally left as is
402 redo A;
403 }
404 } elsif ($self->{state} eq 'tag name') {
405 if ($self->{next_input_character} == 0x0009 or # HT
406 $self->{next_input_character} == 0x000A or # LF
407 $self->{next_input_character} == 0x000B or # VT
408 $self->{next_input_character} == 0x000C or # FF
409 $self->{next_input_character} == 0x0020) { # SP
410 $self->{state} = 'before attribute name';
411 !!!next-input-character;
412 redo A;
413 } elsif ($self->{next_input_character} == 0x003E) { # >
414 if ($self->{current_token}->{type} eq 'start tag') {
415 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
416 } elsif ($self->{current_token}->{type} eq 'end tag') {
417 $self->{content_model_flag} = 'PCDATA'; # MUST
418 if ($self->{current_token}->{attributes}) {
419 !!!parse-error (type => 'end tag attribute');
420 }
421 } else {
422 die "$0: $self->{current_token}->{type}: Unknown token type";
423 }
424 $self->{state} = 'data';
425 !!!next-input-character;
426
427 !!!emit ($self->{current_token}); # start tag or end tag
428 undef $self->{current_token};
429
430 redo A;
431 } elsif (0x0041 <= $self->{next_input_character} and
432 $self->{next_input_character} <= 0x005A) { # A..Z
433 $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
434 # start tag or end tag
435 ## Stay in this state
436 !!!next-input-character;
437 redo A;
438 } elsif ($self->{next_input_character} == -1) {
439 !!!parse-error (type => 'unclosed tag');
440 if ($self->{current_token}->{type} eq 'start tag') {
441 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
442 } elsif ($self->{current_token}->{type} eq 'end tag') {
443 $self->{content_model_flag} = 'PCDATA'; # MUST
444 if ($self->{current_token}->{attributes}) {
445 !!!parse-error (type => 'end tag attribute');
446 }
447 } else {
448 die "$0: $self->{current_token}->{type}: Unknown token type";
449 }
450 $self->{state} = 'data';
451 # reconsume
452
453 !!!emit ($self->{current_token}); # start tag or end tag
454 undef $self->{current_token};
455
456 redo A;
457 } elsif ($self->{next_input_character} == 0x002F) { # /
458 !!!next-input-character;
459 if ($self->{next_input_character} == 0x003E and # >
460 $self->{current_token}->{type} eq 'start tag' and
461 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
462 # permitted slash
463 #
464 } else {
465 !!!parse-error (type => 'nestc');
466 }
467 $self->{state} = 'before attribute name';
468 # next-input-character is already done
469 redo A;
470 } else {
471 $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
472 # start tag or end tag
473 ## Stay in the state
474 !!!next-input-character;
475 redo A;
476 }
477 } elsif ($self->{state} eq 'before attribute name') {
478 if ($self->{next_input_character} == 0x0009 or # HT
479 $self->{next_input_character} == 0x000A or # LF
480 $self->{next_input_character} == 0x000B or # VT
481 $self->{next_input_character} == 0x000C or # FF
482 $self->{next_input_character} == 0x0020) { # SP
483 ## Stay in the state
484 !!!next-input-character;
485 redo A;
486 } elsif ($self->{next_input_character} == 0x003E) { # >
487 if ($self->{current_token}->{type} eq 'start tag') {
488 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
489 } elsif ($self->{current_token}->{type} eq 'end tag') {
490 $self->{content_model_flag} = 'PCDATA'; # MUST
491 if ($self->{current_token}->{attributes}) {
492 !!!parse-error (type => 'end tag attribute');
493 }
494 } else {
495 die "$0: $self->{current_token}->{type}: Unknown token type";
496 }
497 $self->{state} = 'data';
498 !!!next-input-character;
499
500 !!!emit ($self->{current_token}); # start tag or end tag
501 undef $self->{current_token};
502
503 redo A;
504 } elsif (0x0041 <= $self->{next_input_character} and
505 $self->{next_input_character} <= 0x005A) { # A..Z
506 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
507 value => ''};
508 $self->{state} = 'attribute name';
509 !!!next-input-character;
510 redo A;
511 } elsif ($self->{next_input_character} == 0x002F) { # /
512 !!!next-input-character;
513 if ($self->{next_input_character} == 0x003E and # >
514 $self->{current_token}->{type} eq 'start tag' and
515 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
516 # permitted slash
517 #
518 } else {
519 !!!parse-error (type => 'nestc');
520 }
521 ## Stay in the state
522 # next-input-character is already done
523 redo A;
524 } elsif ($self->{next_input_character} == -1) {
525 !!!parse-error (type => 'unclosed tag');
526 if ($self->{current_token}->{type} eq 'start tag') {
527 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
528 } elsif ($self->{current_token}->{type} eq 'end tag') {
529 $self->{content_model_flag} = 'PCDATA'; # MUST
530 if ($self->{current_token}->{attributes}) {
531 !!!parse-error (type => 'end tag attribute');
532 }
533 } else {
534 die "$0: $self->{current_token}->{type}: Unknown token type";
535 }
536 $self->{state} = 'data';
537 # reconsume
538
539 !!!emit ($self->{current_token}); # start tag or end tag
540 undef $self->{current_token};
541
542 redo A;
543 } else {
544 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
545 value => ''};
546 $self->{state} = 'attribute name';
547 !!!next-input-character;
548 redo A;
549 }
550 } elsif ($self->{state} eq 'attribute name') {
551 my $before_leave = sub {
552 if (exists $self->{current_token}->{attributes} # start tag or end tag
553 ->{$self->{current_attribute}->{name}}) { # MUST
554 !!!parse-error (type => 'dupulicate attribute');
555 ## Discard $self->{current_attribute} # MUST
556 } else {
557 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
558 = $self->{current_attribute};
559 }
560 }; # $before_leave
561
562 if ($self->{next_input_character} == 0x0009 or # HT
563 $self->{next_input_character} == 0x000A or # LF
564 $self->{next_input_character} == 0x000B or # VT
565 $self->{next_input_character} == 0x000C or # FF
566 $self->{next_input_character} == 0x0020) { # SP
567 $before_leave->();
568 $self->{state} = 'after attribute name';
569 !!!next-input-character;
570 redo A;
571 } elsif ($self->{next_input_character} == 0x003D) { # =
572 $before_leave->();
573 $self->{state} = 'before attribute value';
574 !!!next-input-character;
575 redo A;
576 } elsif ($self->{next_input_character} == 0x003E) { # >
577 $before_leave->();
578 if ($self->{current_token}->{type} eq 'start tag') {
579 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
580 } elsif ($self->{current_token}->{type} eq 'end tag') {
581 $self->{content_model_flag} = 'PCDATA'; # MUST
582 if ($self->{current_token}->{attributes}) {
583 !!!parse-error (type => 'end tag attribute');
584 }
585 } else {
586 die "$0: $self->{current_token}->{type}: Unknown token type";
587 }
588 $self->{state} = 'data';
589 !!!next-input-character;
590
591 !!!emit ($self->{current_token}); # start tag or end tag
592 undef $self->{current_token};
593
594 redo A;
595 } elsif (0x0041 <= $self->{next_input_character} and
596 $self->{next_input_character} <= 0x005A) { # A..Z
597 $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
598 ## Stay in the state
599 !!!next-input-character;
600 redo A;
601 } elsif ($self->{next_input_character} == 0x002F) { # /
602 $before_leave->();
603 !!!next-input-character;
604 if ($self->{next_input_character} == 0x003E and # >
605 $self->{current_token}->{type} eq 'start tag' and
606 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
607 # permitted slash
608 #
609 } else {
610 !!!parse-error (type => 'nestc');
611 }
612 $self->{state} = 'before attribute name';
613 # next-input-character is already done
614 redo A;
615 } elsif ($self->{next_input_character} == -1) {
616 !!!parse-error (type => 'unclosed tag');
617 $before_leave->();
618 if ($self->{current_token}->{type} eq 'start tag') {
619 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
620 } elsif ($self->{current_token}->{type} eq 'end tag') {
621 $self->{content_model_flag} = 'PCDATA'; # MUST
622 if ($self->{current_token}->{attributes}) {
623 !!!parse-error (type => 'end tag attribute');
624 }
625 } else {
626 die "$0: $self->{current_token}->{type}: Unknown token type";
627 }
628 $self->{state} = 'data';
629 # reconsume
630
631 !!!emit ($self->{current_token}); # start tag or end tag
632 undef $self->{current_token};
633
634 redo A;
635 } else {
636 $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
637 ## Stay in the state
638 !!!next-input-character;
639 redo A;
640 }
641 } elsif ($self->{state} eq 'after attribute name') {
642 if ($self->{next_input_character} == 0x0009 or # HT
643 $self->{next_input_character} == 0x000A or # LF
644 $self->{next_input_character} == 0x000B or # VT
645 $self->{next_input_character} == 0x000C or # FF
646 $self->{next_input_character} == 0x0020) { # SP
647 ## Stay in the state
648 !!!next-input-character;
649 redo A;
650 } elsif ($self->{next_input_character} == 0x003D) { # =
651 $self->{state} = 'before attribute value';
652 !!!next-input-character;
653 redo A;
654 } elsif ($self->{next_input_character} == 0x003E) { # >
655 if ($self->{current_token}->{type} eq 'start tag') {
656 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
657 } elsif ($self->{current_token}->{type} eq 'end tag') {
658 $self->{content_model_flag} = 'PCDATA'; # MUST
659 if ($self->{current_token}->{attributes}) {
660 !!!parse-error (type => 'end tag attribute');
661 }
662 } else {
663 die "$0: $self->{current_token}->{type}: Unknown token type";
664 }
665 $self->{state} = 'data';
666 !!!next-input-character;
667
668 !!!emit ($self->{current_token}); # start tag or end tag
669 undef $self->{current_token};
670
671 redo A;
672 } elsif (0x0041 <= $self->{next_input_character} and
673 $self->{next_input_character} <= 0x005A) { # A..Z
674 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
675 value => ''};
676 $self->{state} = 'attribute name';
677 !!!next-input-character;
678 redo A;
679 } elsif ($self->{next_input_character} == 0x002F) { # /
680 !!!next-input-character;
681 if ($self->{next_input_character} == 0x003E and # >
682 $self->{current_token}->{type} eq 'start tag' and
683 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
684 # permitted slash
685 #
686 } else {
687 !!!parse-error (type => 'nestc');
688 }
689 $self->{state} = 'before attribute name';
690 # next-input-character is already done
691 redo A;
692 } elsif ($self->{next_input_character} == -1) {
693 !!!parse-error (type => 'unclosed tag');
694 if ($self->{current_token}->{type} eq 'start tag') {
695 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
696 } elsif ($self->{current_token}->{type} eq 'end tag') {
697 $self->{content_model_flag} = 'PCDATA'; # MUST
698 if ($self->{current_token}->{attributes}) {
699 !!!parse-error (type => 'end tag attribute');
700 }
701 } else {
702 die "$0: $self->{current_token}->{type}: Unknown token type";
703 }
704 $self->{state} = 'data';
705 # reconsume
706
707 !!!emit ($self->{current_token}); # start tag or end tag
708 undef $self->{current_token};
709
710 redo A;
711 } else {
712 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
713 value => ''};
714 $self->{state} = 'attribute name';
715 !!!next-input-character;
716 redo A;
717 }
718 } elsif ($self->{state} eq 'before attribute value') {
719 if ($self->{next_input_character} == 0x0009 or # HT
720 $self->{next_input_character} == 0x000A or # LF
721 $self->{next_input_character} == 0x000B or # VT
722 $self->{next_input_character} == 0x000C or # FF
723 $self->{next_input_character} == 0x0020) { # SP
724 ## Stay in the state
725 !!!next-input-character;
726 redo A;
727 } elsif ($self->{next_input_character} == 0x0022) { # "
728 $self->{state} = 'attribute value (double-quoted)';
729 !!!next-input-character;
730 redo A;
731 } elsif ($self->{next_input_character} == 0x0026) { # &
732 $self->{state} = 'attribute value (unquoted)';
733 ## reconsume
734 redo A;
735 } elsif ($self->{next_input_character} == 0x0027) { # '
736 $self->{state} = 'attribute value (single-quoted)';
737 !!!next-input-character;
738 redo A;
739 } elsif ($self->{next_input_character} == 0x003E) { # >
740 if ($self->{current_token}->{type} eq 'start tag') {
741 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
742 } elsif ($self->{current_token}->{type} eq 'end tag') {
743 $self->{content_model_flag} = 'PCDATA'; # MUST
744 if ($self->{current_token}->{attributes}) {
745 !!!parse-error (type => 'end tag attribute');
746 }
747 } else {
748 die "$0: $self->{current_token}->{type}: Unknown token type";
749 }
750 $self->{state} = 'data';
751 !!!next-input-character;
752
753 !!!emit ($self->{current_token}); # start tag or end tag
754 undef $self->{current_token};
755
756 redo A;
757 } elsif ($self->{next_input_character} == -1) {
758 !!!parse-error (type => 'unclosed tag');
759 if ($self->{current_token}->{type} eq 'start tag') {
760 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
761 } elsif ($self->{current_token}->{type} eq 'end tag') {
762 $self->{content_model_flag} = 'PCDATA'; # MUST
763 if ($self->{current_token}->{attributes}) {
764 !!!parse-error (type => 'end tag attribute');
765 }
766 } else {
767 die "$0: $self->{current_token}->{type}: Unknown token type";
768 }
769 $self->{state} = 'data';
770 ## reconsume
771
772 !!!emit ($self->{current_token}); # start tag or end tag
773 undef $self->{current_token};
774
775 redo A;
776 } else {
777 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
778 $self->{state} = 'attribute value (unquoted)';
779 !!!next-input-character;
780 redo A;
781 }
782 } elsif ($self->{state} eq 'attribute value (double-quoted)') {
783 if ($self->{next_input_character} == 0x0022) { # "
784 $self->{state} = 'before attribute name';
785 !!!next-input-character;
786 redo A;
787 } elsif ($self->{next_input_character} == 0x0026) { # &
788 $self->{last_attribute_value_state} = 'attribute value (double-quoted)';
789 $self->{state} = 'entity in attribute value';
790 !!!next-input-character;
791 redo A;
792 } elsif ($self->{next_input_character} == -1) {
793 !!!parse-error (type => 'unclosed attribute value');
794 if ($self->{current_token}->{type} eq 'start tag') {
795 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
796 } elsif ($self->{current_token}->{type} eq 'end tag') {
797 $self->{content_model_flag} = 'PCDATA'; # MUST
798 if ($self->{current_token}->{attributes}) {
799 !!!parse-error (type => 'end tag attribute');
800 }
801 } else {
802 die "$0: $self->{current_token}->{type}: Unknown token type";
803 }
804 $self->{state} = 'data';
805 ## reconsume
806
807 !!!emit ($self->{current_token}); # start tag or end tag
808 undef $self->{current_token};
809
810 redo A;
811 } else {
812 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
813 ## Stay in the state
814 !!!next-input-character;
815 redo A;
816 }
817 } elsif ($self->{state} eq 'attribute value (single-quoted)') {
818 if ($self->{next_input_character} == 0x0027) { # '
819 $self->{state} = 'before attribute name';
820 !!!next-input-character;
821 redo A;
822 } elsif ($self->{next_input_character} == 0x0026) { # &
823 $self->{last_attribute_value_state} = 'attribute value (single-quoted)';
824 $self->{state} = 'entity in attribute value';
825 !!!next-input-character;
826 redo A;
827 } elsif ($self->{next_input_character} == -1) {
828 !!!parse-error (type => 'unclosed attribute value');
829 if ($self->{current_token}->{type} eq 'start tag') {
830 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
831 } elsif ($self->{current_token}->{type} eq 'end tag') {
832 $self->{content_model_flag} = 'PCDATA'; # MUST
833 if ($self->{current_token}->{attributes}) {
834 !!!parse-error (type => 'end tag attribute');
835 }
836 } else {
837 die "$0: $self->{current_token}->{type}: Unknown token type";
838 }
839 $self->{state} = 'data';
840 ## reconsume
841
842 !!!emit ($self->{current_token}); # start tag or end tag
843 undef $self->{current_token};
844
845 redo A;
846 } else {
847 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
848 ## Stay in the state
849 !!!next-input-character;
850 redo A;
851 }
852 } elsif ($self->{state} eq 'attribute value (unquoted)') {
853 if ($self->{next_input_character} == 0x0009 or # HT
854 $self->{next_input_character} == 0x000A or # LF
855 $self->{next_input_character} == 0x000B or # HT
856 $self->{next_input_character} == 0x000C or # FF
857 $self->{next_input_character} == 0x0020) { # SP
858 $self->{state} = 'before attribute name';
859 !!!next-input-character;
860 redo A;
861 } elsif ($self->{next_input_character} == 0x0026) { # &
862 $self->{last_attribute_value_state} = 'attribute value (unquoted)';
863 $self->{state} = 'entity in attribute value';
864 !!!next-input-character;
865 redo A;
866 } elsif ($self->{next_input_character} == 0x003E) { # >
867 if ($self->{current_token}->{type} eq 'start tag') {
868 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
869 } elsif ($self->{current_token}->{type} eq 'end tag') {
870 $self->{content_model_flag} = 'PCDATA'; # MUST
871 if ($self->{current_token}->{attributes}) {
872 !!!parse-error (type => 'end tag attribute');
873 }
874 } else {
875 die "$0: $self->{current_token}->{type}: Unknown token type";
876 }
877 $self->{state} = 'data';
878 !!!next-input-character;
879
880 !!!emit ($self->{current_token}); # start tag or end tag
881 undef $self->{current_token};
882
883 redo A;
884 } elsif ($self->{next_input_character} == -1) {
885 !!!parse-error (type => 'unclosed tag');
886 if ($self->{current_token}->{type} eq 'start tag') {
887 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
888 } elsif ($self->{current_token}->{type} eq 'end tag') {
889 $self->{content_model_flag} = 'PCDATA'; # MUST
890 if ($self->{current_token}->{attributes}) {
891 !!!parse-error (type => 'end tag attribute');
892 }
893 } else {
894 die "$0: $self->{current_token}->{type}: Unknown token type";
895 }
896 $self->{state} = 'data';
897 ## reconsume
898
899 !!!emit ($self->{current_token}); # start tag or end tag
900 undef $self->{current_token};
901
902 redo A;
903 } else {
904 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
905 ## Stay in the state
906 !!!next-input-character;
907 redo A;
908 }
909 } elsif ($self->{state} eq 'entity in attribute value') {
910 my $token = $self->_tokenize_attempt_to_consume_an_entity;
911
912 unless (defined $token) {
913 $self->{current_attribute}->{value} .= '&';
914 } else {
915 $self->{current_attribute}->{value} .= $token->{data};
916 ## ISSUE: spec says "append the returned character token to the current attribute's value"
917 }
918
919 $self->{state} = $self->{last_attribute_value_state};
920 # next-input-character is already done
921 redo A;
922 } elsif ($self->{state} eq 'bogus comment') {
923 ## (only happen if PCDATA state)
924
925 my $token = {type => 'comment', data => ''};
926
927 BC: {
928 if ($self->{next_input_character} == 0x003E) { # >
929 $self->{state} = 'data';
930 !!!next-input-character;
931
932 !!!emit ($token);
933
934 redo A;
935 } elsif ($self->{next_input_character} == -1) {
936 $self->{state} = 'data';
937 ## reconsume
938
939 !!!emit ($token);
940
941 redo A;
942 } else {
943 $token->{data} .= chr ($self->{next_input_character});
944 !!!next-input-character;
945 redo BC;
946 }
947 } # BC
948 } elsif ($self->{state} eq 'markup declaration open') {
949 ## (only happen if PCDATA state)
950
951 my @next_char;
952 push @next_char, $self->{next_input_character};
953
954 if ($self->{next_input_character} == 0x002D) { # -
955 !!!next-input-character;
956 push @next_char, $self->{next_input_character};
957 if ($self->{next_input_character} == 0x002D) { # -
958 $self->{current_token} = {type => 'comment', data => ''};
959 $self->{state} = 'comment';
960 !!!next-input-character;
961 redo A;
962 }
963 } elsif ($self->{next_input_character} == 0x0044 or # D
964 $self->{next_input_character} == 0x0064) { # d
965 !!!next-input-character;
966 push @next_char, $self->{next_input_character};
967 if ($self->{next_input_character} == 0x004F or # O
968 $self->{next_input_character} == 0x006F) { # o
969 !!!next-input-character;
970 push @next_char, $self->{next_input_character};
971 if ($self->{next_input_character} == 0x0043 or # C
972 $self->{next_input_character} == 0x0063) { # c
973 !!!next-input-character;
974 push @next_char, $self->{next_input_character};
975 if ($self->{next_input_character} == 0x0054 or # T
976 $self->{next_input_character} == 0x0074) { # t
977 !!!next-input-character;
978 push @next_char, $self->{next_input_character};
979 if ($self->{next_input_character} == 0x0059 or # Y
980 $self->{next_input_character} == 0x0079) { # y
981 !!!next-input-character;
982 push @next_char, $self->{next_input_character};
983 if ($self->{next_input_character} == 0x0050 or # P
984 $self->{next_input_character} == 0x0070) { # p
985 !!!next-input-character;
986 push @next_char, $self->{next_input_character};
987 if ($self->{next_input_character} == 0x0045 or # E
988 $self->{next_input_character} == 0x0065) { # e
989 ## ISSUE: What a stupid code this is!
990 $self->{state} = 'DOCTYPE';
991 !!!next-input-character;
992 redo A;
993 }
994 }
995 }
996 }
997 }
998 }
999 }
1000
1001 !!!parse-error (type => 'bogus comment open');
1002 $self->{next_input_character} = shift @next_char;
1003 !!!back-next-input-character (@next_char);
1004 $self->{state} = 'bogus comment';
1005 redo A;
1006
1007 ## ISSUE: typos in spec: chacacters, is is a parse error
1008 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1009 } elsif ($self->{state} eq 'comment') {
1010 if ($self->{next_input_character} == 0x002D) { # -
1011 $self->{state} = 'comment dash';
1012 !!!next-input-character;
1013 redo A;
1014 } elsif ($self->{next_input_character} == -1) {
1015 !!!parse-error (type => 'unclosed comment');
1016 $self->{state} = 'data';
1017 ## reconsume
1018
1019 !!!emit ($self->{current_token}); # comment
1020 undef $self->{current_token};
1021
1022 redo A;
1023 } else {
1024 $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1025 ## Stay in the state
1026 !!!next-input-character;
1027 redo A;
1028 }
1029 } elsif ($self->{state} eq 'comment dash') {
1030 if ($self->{next_input_character} == 0x002D) { # -
1031 $self->{state} = 'comment end';
1032 !!!next-input-character;
1033 redo A;
1034 } elsif ($self->{next_input_character} == -1) {
1035 !!!parse-error (type => 'unclosed comment');
1036 $self->{state} = 'data';
1037 ## reconsume
1038
1039 !!!emit ($self->{current_token}); # comment
1040 undef $self->{current_token};
1041
1042 redo A;
1043 } else {
1044 $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1045 $self->{state} = 'comment';
1046 !!!next-input-character;
1047 redo A;
1048 }
1049 } elsif ($self->{state} eq 'comment end') {
1050 if ($self->{next_input_character} == 0x003E) { # >
1051 $self->{state} = 'data';
1052 !!!next-input-character;
1053
1054 !!!emit ($self->{current_token}); # comment
1055 undef $self->{current_token};
1056
1057 redo A;
1058 } elsif ($self->{next_input_character} == 0x002D) { # -
1059 !!!parse-error (type => 'dash in comment');
1060 $self->{current_token}->{data} .= '-'; # comment
1061 ## Stay in the state
1062 !!!next-input-character;
1063 redo A;
1064 } elsif ($self->{next_input_character} == -1) {
1065 !!!parse-error (type => 'unclosed comment');
1066 $self->{state} = 'data';
1067 ## reconsume
1068
1069 !!!emit ($self->{current_token}); # comment
1070 undef $self->{current_token};
1071
1072 redo A;
1073 } else {
1074 !!!parse-error (type => 'dash in comment');
1075 $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1076 $self->{state} = 'comment';
1077 !!!next-input-character;
1078 redo A;
1079 }
1080 } elsif ($self->{state} eq 'DOCTYPE') {
1081 if ($self->{next_input_character} == 0x0009 or # HT
1082 $self->{next_input_character} == 0x000A or # LF
1083 $self->{next_input_character} == 0x000B or # VT
1084 $self->{next_input_character} == 0x000C or # FF
1085 $self->{next_input_character} == 0x0020) { # SP
1086 $self->{state} = 'before DOCTYPE name';
1087 !!!next-input-character;
1088 redo A;
1089 } else {
1090 !!!parse-error (type => 'no space before DOCTYPE name');
1091 $self->{state} = 'before DOCTYPE name';
1092 ## reconsume
1093 redo A;
1094 }
1095 } elsif ($self->{state} eq 'before DOCTYPE name') {
1096 if ($self->{next_input_character} == 0x0009 or # HT
1097 $self->{next_input_character} == 0x000A or # LF
1098 $self->{next_input_character} == 0x000B or # VT
1099 $self->{next_input_character} == 0x000C or # FF
1100 $self->{next_input_character} == 0x0020) { # SP
1101 ## Stay in the state
1102 !!!next-input-character;
1103 redo A;
1104 } elsif ($self->{next_input_character} == 0x003E) { # >
1105 !!!parse-error (type => 'no DOCTYPE name');
1106 $self->{state} = 'data';
1107 !!!next-input-character;
1108
1109 !!!emit ({type => 'DOCTYPE'}); # incorrect
1110
1111 redo A;
1112 } elsif ($self->{next_input_character} == -1) {
1113 !!!parse-error (type => 'no DOCTYPE name');
1114 $self->{state} = 'data';
1115 ## reconsume
1116
1117 !!!emit ({type => 'DOCTYPE'}); # incorrect
1118
1119 redo A;
1120 } else {
1121 $self->{current_token}
1122 = {type => 'DOCTYPE',
1123 name => chr ($self->{next_input_character}),
1124 correct => 1};
1125 ## ISSUE: "Set the token's name name to the" in the spec
1126 $self->{state} = 'DOCTYPE name';
1127 !!!next-input-character;
1128 redo A;
1129 }
1130 } elsif ($self->{state} eq 'DOCTYPE name') {
1131 ## ISSUE: Redundant "First," in the spec.
1132 if ($self->{next_input_character} == 0x0009 or # HT
1133 $self->{next_input_character} == 0x000A or # LF
1134 $self->{next_input_character} == 0x000B or # VT
1135 $self->{next_input_character} == 0x000C or # FF
1136 $self->{next_input_character} == 0x0020) { # SP
1137 $self->{state} = 'after DOCTYPE name';
1138 !!!next-input-character;
1139 redo A;
1140 } elsif ($self->{next_input_character} == 0x003E) { # >
1141 $self->{state} = 'data';
1142 !!!next-input-character;
1143
1144 !!!emit ($self->{current_token}); # DOCTYPE
1145 undef $self->{current_token};
1146
1147 redo A;
1148 } elsif ($self->{next_input_character} == -1) {
1149 !!!parse-error (type => 'unclosed DOCTYPE');
1150 $self->{state} = 'data';
1151 ## reconsume
1152
1153 delete $self->{current_token}->{correct};
1154 !!!emit ($self->{current_token}); # DOCTYPE
1155 undef $self->{current_token};
1156
1157 redo A;
1158 } else {
1159 $self->{current_token}->{name}
1160 .= chr ($self->{next_input_character}); # DOCTYPE
1161 ## Stay in the state
1162 !!!next-input-character;
1163 redo A;
1164 }
1165 } elsif ($self->{state} eq 'after DOCTYPE name') {
1166 if ($self->{next_input_character} == 0x0009 or # HT
1167 $self->{next_input_character} == 0x000A or # LF
1168 $self->{next_input_character} == 0x000B or # VT
1169 $self->{next_input_character} == 0x000C or # FF
1170 $self->{next_input_character} == 0x0020) { # SP
1171 ## Stay in the state
1172 !!!next-input-character;
1173 redo A;
1174 } elsif ($self->{next_input_character} == 0x003E) { # >
1175 $self->{state} = 'data';
1176 !!!next-input-character;
1177
1178 !!!emit ($self->{current_token}); # DOCTYPE
1179 undef $self->{current_token};
1180
1181 redo A;
1182 } elsif ($self->{next_input_character} == -1) {
1183 !!!parse-error (type => 'unclosed DOCTYPE');
1184 $self->{state} = 'data';
1185 ## reconsume
1186
1187 delete $self->{current_token}->{correct};
1188 !!!emit ($self->{current_token}); # DOCTYPE
1189 undef $self->{current_token};
1190
1191 redo A;
1192 } elsif ($self->{next_input_character} == 0x0050 or # P
1193 $self->{next_input_character} == 0x0070) { # p
1194 !!!next-input-character;
1195 if ($self->{next_input_character} == 0x0055 or # U
1196 $self->{next_input_character} == 0x0075) { # u
1197 !!!next-input-character;
1198 if ($self->{next_input_character} == 0x0042 or # B
1199 $self->{next_input_character} == 0x0062) { # b
1200 !!!next-input-character;
1201 if ($self->{next_input_character} == 0x004C or # L
1202 $self->{next_input_character} == 0x006C) { # l
1203 !!!next-input-character;
1204 if ($self->{next_input_character} == 0x0049 or # I
1205 $self->{next_input_character} == 0x0069) { # i
1206 !!!next-input-character;
1207 if ($self->{next_input_character} == 0x0043 or # C
1208 $self->{next_input_character} == 0x0063) { # c
1209 $self->{state} = 'before DOCTYPE public identifier';
1210 !!!next-input-character;
1211 redo A;
1212 }
1213 }
1214 }
1215 }
1216 }
1217
1218 #
1219 } elsif ($self->{next_input_character} == 0x0053 or # S
1220 $self->{next_input_character} == 0x0073) { # s
1221 !!!next-input-character;
1222 if ($self->{next_input_character} == 0x0059 or # Y
1223 $self->{next_input_character} == 0x0079) { # y
1224 !!!next-input-character;
1225 if ($self->{next_input_character} == 0x0053 or # S
1226 $self->{next_input_character} == 0x0073) { # s
1227 !!!next-input-character;
1228 if ($self->{next_input_character} == 0x0054 or # T
1229 $self->{next_input_character} == 0x0074) { # t
1230 !!!next-input-character;
1231 if ($self->{next_input_character} == 0x0045 or # E
1232 $self->{next_input_character} == 0x0065) { # e
1233 !!!next-input-character;
1234 if ($self->{next_input_character} == 0x004D or # M
1235 $self->{next_input_character} == 0x006D) { # m
1236 $self->{state} = 'before DOCTYPE system identifier';
1237 !!!next-input-character;
1238 redo A;
1239 }
1240 }
1241 }
1242 }
1243 }
1244
1245 #
1246 } else {
1247 !!!next-input-character;
1248 #
1249 }
1250
1251 !!!parse-error (type => 'string after DOCTYPE name');
1252 $self->{state} = 'bogus DOCTYPE';
1253 # next-input-character is already done
1254 redo A;
1255 } elsif ($self->{state} eq 'before DOCTYPE public identifier') {
1256 if ({
1257 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1258 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1259 }->{$self->{next_input_character}}) {
1260 ## Stay in the state
1261 !!!next-input-character;
1262 redo A;
1263 } elsif ($self->{next_input_character} eq 0x0022) { # "
1264 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1265 $self->{state} = 'DOCTYPE public identifier (double-quoted)';
1266 !!!next-input-character;
1267 redo A;
1268 } elsif ($self->{next_input_character} eq 0x0027) { # '
1269 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1270 $self->{state} = 'DOCTYPE public identifier (single-quoted)';
1271 !!!next-input-character;
1272 redo A;
1273 } elsif ($self->{next_input_character} eq 0x003E) { # >
1274 !!!parse-error (type => 'no PUBLIC literal');
1275
1276 $self->{state} = 'data';
1277 !!!next-input-character;
1278
1279 delete $self->{current_token}->{correct};
1280 !!!emit ($self->{current_token}); # DOCTYPE
1281 undef $self->{current_token};
1282
1283 redo A;
1284 } elsif ($self->{next_input_character} == -1) {
1285 !!!parse-error (type => 'unclosed DOCTYPE');
1286
1287 $self->{state} = 'data';
1288 ## reconsume
1289
1290 delete $self->{current_token}->{correct};
1291 !!!emit ($self->{current_token}); # DOCTYPE
1292 undef $self->{current_token};
1293
1294 redo A;
1295 } else {
1296 !!!parse-error (type => 'string after PUBLIC');
1297 $self->{state} = 'bogus DOCTYPE';
1298 !!!next-input-character;
1299 redo A;
1300 }
1301 } elsif ($self->{state} eq 'DOCTYPE public identifier (double-quoted)') {
1302 if ($self->{next_input_character} == 0x0022) { # "
1303 $self->{state} = 'after DOCTYPE public identifier';
1304 !!!next-input-character;
1305 redo A;
1306 } elsif ($self->{next_input_character} == -1) {
1307 !!!parse-error (type => 'unclosed PUBLIC literal');
1308
1309 $self->{state} = 'data';
1310 ## reconsume
1311
1312 delete $self->{current_token}->{correct};
1313 !!!emit ($self->{current_token}); # DOCTYPE
1314 undef $self->{current_token};
1315
1316 redo A;
1317 } else {
1318 $self->{current_token}->{public_identifier} # DOCTYPE
1319 .= chr $self->{next_input_character};
1320 ## Stay in the state
1321 !!!next-input-character;
1322 redo A;
1323 }
1324 } elsif ($self->{state} eq 'DOCTYPE public identifier (single-quoted)') {
1325 if ($self->{next_input_character} == 0x0027) { # '
1326 $self->{state} = 'after DOCTYPE public identifier';
1327 !!!next-input-character;
1328 redo A;
1329 } elsif ($self->{next_input_character} == -1) {
1330 !!!parse-error (type => 'unclosed PUBLIC literal');
1331
1332 $self->{state} = 'data';
1333 ## reconsume
1334
1335 delete $self->{current_token}->{correct};
1336 !!!emit ($self->{current_token}); # DOCTYPE
1337 undef $self->{current_token};
1338
1339 redo A;
1340 } else {
1341 $self->{current_token}->{public_identifier} # DOCTYPE
1342 .= chr $self->{next_input_character};
1343 ## Stay in the state
1344 !!!next-input-character;
1345 redo A;
1346 }
1347 } elsif ($self->{state} eq 'after DOCTYPE public identifier') {
1348 if ({
1349 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1350 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1351 }->{$self->{next_input_character}}) {
1352 ## Stay in the state
1353 !!!next-input-character;
1354 redo A;
1355 } elsif ($self->{next_input_character} == 0x0022) { # "
1356 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1357 $self->{state} = 'DOCTYPE system identifier (double-quoted)';
1358 !!!next-input-character;
1359 redo A;
1360 } elsif ($self->{next_input_character} == 0x0027) { # '
1361 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1362 $self->{state} = 'DOCTYPE system identifier (single-quoted)';
1363 !!!next-input-character;
1364 redo A;
1365 } elsif ($self->{next_input_character} == 0x003E) { # >
1366 $self->{state} = 'data';
1367 !!!next-input-character;
1368
1369 !!!emit ($self->{current_token}); # DOCTYPE
1370 undef $self->{current_token};
1371
1372 redo A;
1373 } elsif ($self->{next_input_character} == -1) {
1374 !!!parse-error (type => 'unclosed DOCTYPE');
1375
1376 $self->{state} = 'data';
1377 ## reconsume
1378
1379 delete $self->{current_token}->{correct};
1380 !!!emit ($self->{current_token}); # DOCTYPE
1381 undef $self->{current_token};
1382
1383 redo A;
1384 } else {
1385 !!!parse-error (type => 'string after PUBLIC literal');
1386 $self->{state} = 'bogus DOCTYPE';
1387 !!!next-input-character;
1388 redo A;
1389 }
1390 } elsif ($self->{state} eq 'before DOCTYPE system identifier') {
1391 if ({
1392 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1393 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1394 }->{$self->{next_input_character}}) {
1395 ## Stay in the state
1396 !!!next-input-character;
1397 redo A;
1398 } elsif ($self->{next_input_character} == 0x0022) { # "
1399 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1400 $self->{state} = 'DOCTYPE system identifier (double-quoted)';
1401 !!!next-input-character;
1402 redo A;
1403 } elsif ($self->{next_input_character} == 0x0027) { # '
1404 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1405 $self->{state} = 'DOCTYPE system identifier (single-quoted)';
1406 !!!next-input-character;
1407 redo A;
1408 } elsif ($self->{next_input_character} == 0x003E) { # >
1409 !!!parse-error (type => 'no SYSTEM literal');
1410 $self->{state} = 'data';
1411 !!!next-input-character;
1412
1413 delete $self->{current_token}->{correct};
1414 !!!emit ($self->{current_token}); # DOCTYPE
1415 undef $self->{current_token};
1416
1417 redo A;
1418 } elsif ($self->{next_input_character} == -1) {
1419 !!!parse-error (type => 'unclosed DOCTYPE');
1420
1421 $self->{state} = 'data';
1422 ## reconsume
1423
1424 delete $self->{current_token}->{correct};
1425 !!!emit ($self->{current_token}); # DOCTYPE
1426 undef $self->{current_token};
1427
1428 redo A;
1429 } else {
1430 !!!parse-error (type => 'string after PUBLIC literal');
1431 $self->{state} = 'bogus DOCTYPE';
1432 !!!next-input-character;
1433 redo A;
1434 }
1435 } elsif ($self->{state} eq 'DOCTYPE system identifier (double-quoted)') {
1436 if ($self->{next_input_character} == 0x0022) { # "
1437 $self->{state} = 'after DOCTYPE system identifier';
1438 !!!next-input-character;
1439 redo A;
1440 } elsif ($self->{next_input_character} == -1) {
1441 !!!parse-error (type => 'unclosed SYSTEM literal');
1442
1443 $self->{state} = 'data';
1444 ## reconsume
1445
1446 delete $self->{current_token}->{correct};
1447 !!!emit ($self->{current_token}); # DOCTYPE
1448 undef $self->{current_token};
1449
1450 redo A;
1451 } else {
1452 $self->{current_token}->{system_identifier} # DOCTYPE
1453 .= chr $self->{next_input_character};
1454 ## Stay in the state
1455 !!!next-input-character;
1456 redo A;
1457 }
1458 } elsif ($self->{state} eq 'DOCTYPE system identifier (single-quoted)') {
1459 if ($self->{next_input_character} == 0x0027) { # '
1460 $self->{state} = 'after DOCTYPE system identifier';
1461 !!!next-input-character;
1462 redo A;
1463 } elsif ($self->{next_input_character} == -1) {
1464 !!!parse-error (type => 'unclosed SYSTEM literal');
1465
1466 $self->{state} = 'data';
1467 ## reconsume
1468
1469 delete $self->{current_token}->{correct};
1470 !!!emit ($self->{current_token}); # DOCTYPE
1471 undef $self->{current_token};
1472
1473 redo A;
1474 } else {
1475 $self->{current_token}->{system_identifier} # DOCTYPE
1476 .= chr $self->{next_input_character};
1477 ## Stay in the state
1478 !!!next-input-character;
1479 redo A;
1480 }
1481 } elsif ($self->{state} eq 'after DOCTYPE system identifier') {
1482 if ({
1483 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1484 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1485 }->{$self->{next_input_character}}) {
1486 ## Stay in the state
1487 !!!next-input-character;
1488 redo A;
1489 } elsif ($self->{next_input_character} == 0x003E) { # >
1490 $self->{state} = 'data';
1491 !!!next-input-character;
1492
1493 !!!emit ($self->{current_token}); # DOCTYPE
1494 undef $self->{current_token};
1495
1496 redo A;
1497 } elsif ($self->{next_input_character} == -1) {
1498 !!!parse-error (type => 'unclosed DOCTYPE');
1499
1500 $self->{state} = 'data';
1501 ## reconsume
1502
1503 delete $self->{current_token}->{correct};
1504 !!!emit ($self->{current_token}); # DOCTYPE
1505 undef $self->{current_token};
1506
1507 redo A;
1508 } else {
1509 !!!parse-error (type => 'string after SYSTEM literal');
1510 $self->{state} = 'bogus DOCTYPE';
1511 !!!next-input-character;
1512 redo A;
1513 }
1514 } elsif ($self->{state} eq 'bogus DOCTYPE') {
1515 if ($self->{next_input_character} == 0x003E) { # >
1516 $self->{state} = 'data';
1517 !!!next-input-character;
1518
1519 delete $self->{current_token}->{correct};
1520 !!!emit ($self->{current_token}); # DOCTYPE
1521 undef $self->{current_token};
1522
1523 redo A;
1524 } elsif ($self->{next_input_character} == -1) {
1525 !!!parse-error (type => 'unclosed DOCTYPE');
1526 $self->{state} = 'data';
1527 ## reconsume
1528
1529 delete $self->{current_token}->{correct};
1530 !!!emit ($self->{current_token}); # DOCTYPE
1531 undef $self->{current_token};
1532
1533 redo A;
1534 } else {
1535 ## Stay in the state
1536 !!!next-input-character;
1537 redo A;
1538 }
1539 } else {
1540 die "$0: $self->{state}: Unknown state";
1541 }
1542 } # A
1543
1544 die "$0: _get_next_token: unexpected case";
1545 } # _get_next_token
1546
## Attempt to consume an entity (character reference).
##
## This method is called after the "&" has been consumed;
## |$self->{next_input_character}| is the character following it.
##
## Returns a hash reference |{type => 'character', data => $text}| if
## a reference is recognized, or |undef| otherwise.  In either case
## |$self->{next_input_character}| is left at the first character that
## has not been consumed, so the caller does not have to advance the
## input.
##
## Recognized forms:
##   - "#x" or "#X" followed by hexadecimal digits,
##   - "#" followed by decimal digits,
##   - a letter followed by letters, digits, or ";" (named entity,
##     looked up in |$EntityChar|).
##
## For the numeric forms a missing ";" is a parse error, but the
## reference is still resolved.  Code point 0 and code points greater
## than U+10FFFF are replaced by U+FFFD; code points 0x80..0x9F are
## remapped through |$c1_entity_char| with a parse error.
sub _tokenize_attempt_to_consume_an_entity ($) {
  my $self = shift;

  if ($self->{next_input_character} == 0x0023) { # #
    !!!next-input-character;
    if ($self->{next_input_character} == 0x0078 or # x
        $self->{next_input_character} == 0x0058) { # X
      ## Remember which of "x" and "X" was used so that it can be put
      ## back unchanged when no hexadecimal digit follows.
      my $x_char = $self->{next_input_character};
      !!!next-input-character;

      my $num; # stays undefined until the first digit is seen
      X: {
        my $digit;
        if (0x0030 <= $self->{next_input_character} and
            $self->{next_input_character} <= 0x0039) { # 0..9
          $digit = $self->{next_input_character} - 0x0030;
        } elsif (0x0061 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x0066) { # a..f
          ## ISSUE: the spec says U+0078, which is apparently incorrect
          $digit = $self->{next_input_character} - 0x0061 + 10;
        } elsif (0x0041 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x0046) { # A..F
          ## ISSUE: the spec says U+0058, which is apparently incorrect
          $digit = $self->{next_input_character} - 0x0041 + 10;
        }

        if (defined $digit) {
          $num = ($num || 0) * 0x10 + $digit;
          !!!next-input-character;
          redo X;
        }
      } # X

      unless (defined $num) { # no hexadecimal digit
        !!!parse-error (type => 'bare hcro');
        ## Put back everything consumed after the "&": the "x"/"X" and
        ## the (non-digit) character read after it go back to the
        ## queue, in input order, and "#" becomes the next input
        ## character again.  The character after "x" must be included
        ## here, otherwise it would be lost from the input.
        !!!back-next-input-character ($x_char, $self->{next_input_character});
        $self->{next_input_character} = 0x0023; # #
        return undef;
      }

      if ($self->{next_input_character} == 0x003B) { # ;
        !!!next-input-character;
      } else {
        !!!parse-error (type => 'no refc');
      }

      ## TODO: check the definition for |a valid Unicode character|.
      ## <http://lists.whatwg.org/pipermail/whatwg-whatwg.org/2006-December/thread.html#8189>
      if ($num > 0x10FFFF or $num == 0) {
        $num = 0xFFFD; # REPLACEMENT CHARACTER
        ## ISSUE: Why this is not an error?
      } elsif (0x80 <= $num and $num <= 0x9F) {
        !!!parse-error (type => sprintf 'c1 entity:U+%04X', $num);
        $num = $c1_entity_char->{$num};
      }

      return {type => 'character', data => chr $num};
    } elsif (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
      my $code = $self->{next_input_character} - 0x0030;
      !!!next-input-character;

      while (0x0030 <= $self->{next_input_character} and
             $self->{next_input_character} <= 0x0039) { # 0..9
        $code *= 10;
        $code += $self->{next_input_character} - 0x0030;

        !!!next-input-character;
      }

      if ($self->{next_input_character} == 0x003B) { # ;
        !!!next-input-character;
      } else {
        !!!parse-error (type => 'no refc');
      }

      ## TODO: check the definition for |a valid Unicode character|.
      if ($code > 0x10FFFF or $code == 0) {
        $code = 0xFFFD; # REPLACEMENT CHARACTER
        ## ISSUE: Why this is not an error?
      } elsif (0x80 <= $code and $code <= 0x9F) {
        !!!parse-error (type => sprintf 'c1 entity:U+%04X', $code);
        $code = $c1_entity_char->{$code};
      }

      return {type => 'character', data => chr $code};
    } else {
      ## "#" is followed by neither "x"/"X" nor a decimal digit: put
      ## the character after "#" back and restore "#" as the next
      ## input character.
      !!!parse-error (type => 'bare nero');
      !!!back-next-input-character ($self->{next_input_character});
      $self->{next_input_character} = 0x0023; # #
      return undef;
    }
  } elsif ((0x0041 <= $self->{next_input_character} and # A
            $self->{next_input_character} <= 0x005A) or # Z
           (0x0061 <= $self->{next_input_character} and # a
            $self->{next_input_character} <= 0x007A)) { # z
    my $entity_name = chr $self->{next_input_character};
    !!!next-input-character;

    ## |$value| is the text to return: the replacement of the longest
    ## entity name matched so far followed by any characters consumed
    ## after it (or simply the consumed characters if nothing matched).
    my $value = $entity_name;
    ## |$match|: 1 = matched a name ending with ";", -1 = matched a
    ## name without ";", 0 = no match.
    my $match = 0;
    require Whatpm::_NamedEntityList;
    our $EntityChar;

    while (length ($entity_name) < 10 and
           ## NOTE: Some number greater than the maximum length of entity name
           ((0x0041 <= $self->{next_input_character} and # A
             $self->{next_input_character} <= 0x005A) or # Z
            (0x0061 <= $self->{next_input_character} and # a
             $self->{next_input_character} <= 0x007A) or # z
            (0x0030 <= $self->{next_input_character} and # 0
             $self->{next_input_character} <= 0x0039) or # 9
            $self->{next_input_character} == 0x003B)) { # ;
      $entity_name .= chr $self->{next_input_character};
      if (defined $EntityChar->{$entity_name}) {
        $value = $EntityChar->{$entity_name};
        if ($self->{next_input_character} == 0x003B) { # ;
          $match = 1;
          !!!next-input-character;
          last;
        } else {
          $match = -1;
        }
      } else {
        $value .= chr $self->{next_input_character};
      }
      !!!next-input-character;
    }

    if ($match > 0) {
      return {type => 'character', data => $value};
    } elsif ($match < 0) {
      ## NOTE(review): the numeric branches above report a missing ";"
      ## as 'no refc' while this branch reports 'refc'; confirm which
      ## error type is intended before unifying them.
      !!!parse-error (type => 'refc');
      return {type => 'character', data => $value};
    } else {
      !!!parse-error (type => 'bare ero');
      ## NOTE: No characters are consumed in the spec.
      ## The characters consumed here are handed to |!!!back-token| as
      ## a character token.
      !!!back-token ({type => 'character', data => $value});
      return undef;
    }
  } else {
    ## no characters are consumed
    !!!parse-error (type => 'bare ero');
    return undef;
  }
} # _tokenize_attempt_to_consume_an_entity
1690
## Prepare |$self->{document}| for the tree construction stage:
## strict error checking is switched off and the document is flagged
## as an HTML document.  Returns whatever |manakai_is_html| returns.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;

  ## NOTE: The caller MUST set |$self->{document}| before calling this
  ## method.
  my $document = $self->{document};

  $document->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on

  $document->manakai_is_html (1); # MUST
} # _initialize_tree_constructor
1699
## Counterpart of |_initialize_tree_constructor|: switches strict
## error checking on |$self->{document}| back on.  Returns whatever
## |strict_error_checking| returns.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  ## TODO: Turn mutation events on
  $document->strict_error_checking (1);
} # _terminate_tree_constructor
1705
1706 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
1707
1708 { # tree construction stage
1709 my $token;
1710
## Entry point of the tree construction stage: resets the per-parse
## tree construction state on |$self| and then runs the initial, root
## element, and main phases in that order.  Returns whatever
## |_tree_construction_main| returns.
sub _construct_tree ($) {
  my $self = shift;

  ## It is not defined when an interactive UA makes $self->{document}
  ## available to the user, or when it begins accepting user input.

  ## Append a character: collect it and all subsequent consecutive
  ## characters and insert one Text node whose data is concatenation
  ## of all those characters. # MUST

  ## NOTE(review): |!!!next-token| is a preprocessor macro; presumably
  ## it loads the first token into the shared |$token| read by the
  ## phase methods -- confirm against the macro definition.
  !!!next-token;

  ## Start from a clean tree construction state.
  $self->{insertion_mode} = 'before head';
  $self->{open_elements} = [];
  undef $self->{$_}
      for qw(form_element head_element inner_html_node);

  $self->_tree_construction_initial; # MUST
  $self->_tree_construction_root_element;
  $self->_tree_construction_main;
} # _construct_tree
1734
1735 sub _tree_construction_initial ($) {
1736 my $self = shift;
1737 INITIAL: {
1738 if ($token->{type} eq 'DOCTYPE') {
1739 ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
1740 ## error, switch to a conformance checking mode for another
1741 ## language.
1742 my $doctype_name = $token->{name};
1743 $doctype_name = '' unless defined $doctype_name;
1744 $doctype_name =~ tr/a-z/A-Z/;
1745 if (not defined $token->{name} or # <!DOCTYPE>
1746 defined $token->{public_identifier} or
1747 defined $token->{system_identifier}) {
1748 !!!parse-error (type => 'not HTML5');
1749 } elsif ($doctype_name ne 'HTML') {
1750 ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
1751 !!!parse-error (type => 'not HTML5');
1752 }
1753
1754 my $doctype = $self->{document}->create_document_type_definition
1755 ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
1756 $doctype->public_id ($token->{public_identifier})
1757 if defined $token->{public_identifier};
1758 $doctype->system_id ($token->{system_identifier})
1759 if defined $token->{system_identifier};
1760 ## NOTE: Other DocumentType attributes are null or empty lists.
1761 ## ISSUE: internalSubset = null??
1762 $self->{document}->append_child ($doctype);
1763
1764 if (not $token->{correct} or $doctype_name ne 'HTML') {
1765 $self->{document}->manakai_compat_mode ('quirks');
1766 } elsif (defined $token->{public_identifier}) {
1767 my $pubid = $token->{public_identifier};
1768 $pubid =~ tr/a-z/A-z/;
1769 if ({
1770 "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
1771 "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
1772 "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
1773 "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
1774 "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
1775 "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
1776 "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
1777 "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
1778 "-//IETF//DTD HTML 2.0//EN" => 1,
1779 "-//IETF//DTD HTML 2.1E//EN" => 1,
1780 "-//IETF//DTD HTML 3.0//EN" => 1,
1781 "-//IETF//DTD HTML 3.0//EN//" => 1,
1782 "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
1783 "-//IETF//DTD HTML 3.2//EN" => 1,
1784 "-//IETF//DTD HTML 3//EN" => 1,
1785 "-//IETF//DTD HTML LEVEL 0//EN" => 1,
1786 "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
1787 "-//IETF//DTD HTML LEVEL 1//EN" => 1,
1788 "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
1789 "-//IETF//DTD HTML LEVEL 2//EN" => 1,
1790 "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
1791 "-//IETF//DTD HTML LEVEL 3//EN" => 1,
1792 "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
1793 "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
1794 "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
1795 "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
1796 "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
1797 "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
1798 "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
1799 "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
1800 "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
1801 "-//IETF//DTD HTML STRICT//EN" => 1,
1802 "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
1803 "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
1804 "-//IETF//DTD HTML//EN" => 1,
1805 "-//IETF//DTD HTML//EN//2.0" => 1,
1806 "-//IETF//DTD HTML//EN//3.0" => 1,
1807 "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
1808 "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
1809 "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
1810 "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
1811 "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
1812 "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
1813 "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
1814 "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
1815 "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
1816 "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
1817 "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
1818 "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
1819 "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
1820 "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
1821 "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
1822 "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
1823 "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
1824 "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
1825 "-//W3C//DTD HTML 3.2//EN" => 1,
1826 "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
1827 "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
1828 "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
1829 "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
1830 "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
1831 "-//W3C//DTD W3 HTML//EN" => 1,
1832 "-//W3O//DTD W3 HTML 3.0//EN" => 1,
1833 "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
1834 "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
1835 "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
1836 "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
1837 "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
1838 "HTML" => 1,
1839 }->{$pubid}) {
1840 $self->{document}->manakai_compat_mode ('quirks');
1841 } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
1842 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
1843 if (defined $token->{system_identifier}) {
1844 $self->{document}->manakai_compat_mode ('quirks');
1845 } else {
1846 $self->{document}->manakai_compat_mode ('limited quirks');
1847 }
1848 } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 Frameset//EN" or
1849 $pubid eq "-//W3C//DTD XHTML 1.0 Transitional//EN") {
1850 $self->{document}->manakai_compat_mode ('limited quirks');
1851 }
1852 }
1853 if (defined $token->{system_identifier}) {
1854 my $sysid = $token->{system_identifier};
1855 $sysid =~ tr/A-Z/a-z/;
1856 if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
1857 $self->{document}->manakai_compat_mode ('quirks');
1858 }
1859 }
1860
1861 ## Go to the root element phase.
1862 !!!next-token;
1863 return;
1864 } elsif ({
1865 'start tag' => 1,
1866 'end tag' => 1,
1867 'end-of-file' => 1,
1868 }->{$token->{type}}) {
1869 !!!parse-error (type => 'no DOCTYPE');
1870 $self->{document}->manakai_compat_mode ('quirks');
1871 ## Go to the root element phase
1872 ## reprocess
1873 return;
1874 } elsif ($token->{type} eq 'character') {
1875 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
1876 ## Ignore the token
1877 unless (length $token->{data}) {
1878 ## Stay in the phase
1879 !!!next-token;
1880 redo INITIAL;
1881 }
1882 }
1883
1884 !!!parse-error (type => 'no DOCTYPE');
1885 $self->{document}->manakai_compat_mode ('quirks');
1886 ## Go to the root element phase
1887 ## reprocess
1888 return;
1889 } elsif ($token->{type} eq 'comment') {
1890 my $comment = $self->{document}->create_comment ($token->{data});
1891 $self->{document}->append_child ($comment);
1892
1893 ## Stay in the phase.
1894 !!!next-token;
1895 redo INITIAL;
1896 } else {
1897 die "$0: $token->{type}: Unknown token";
1898 }
1899 } # INITIAL
1900 } # _tree_construction_initial
1901
## Root element phase of tree construction.
##
## Consumes tokens (through the file-level |$token|) until one arrives
## that requires the root element to exist:
##   - DOCTYPE tokens are reported as parse errors and dropped;
##   - comment tokens are appended to the document as comment nodes;
##   - leading white space of a character token is appended to the
##     document as text, and the token is dropped if nothing else is left.
## Any other start tag / end tag / end-of-file / character token causes
## an |html| element to be created, appended to the document and pushed
## onto the stack of open elements.  The triggering token is left in
## |$token| for the caller to reprocess.
sub _tree_construction_root_element ($) {
  my $self = shift;
  my $doc = $self->{document};

  B: while (1) {
    my $type = $token->{type};

    if ($type eq 'DOCTYPE') {
      !!!parse-error (type => 'in html:#DOCTYPE');
      ## Ignore the token and stay in the phase
      !!!next-token;
      next B;
    }

    if ($type eq 'comment') {
      my $comment = $doc->create_comment ($token->{data});
      $doc->append_child ($comment);
      ## Stay in the phase
      !!!next-token;
      next B;
    }

    if ($type eq 'character') {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
        $doc->manakai_append_text ($1);
        ## ISSUE: DOM3 Core does not allow Document > Text
        if (not length $token->{data}) {
          ## Only white space: stay in the phase
          !!!next-token;
          next B;
        }
      }
      ## Remaining (non-space) data falls through to root creation.
    } elsif ($type ne 'start tag' and
             $type ne 'end tag' and
             $type ne 'end-of-file') {
      die "$0: $token->{type}: Unknown token";
    }
    ## ISSUE: There is an issue in the spec (start tag, end tag and
    ## end-of-file tokens all fall through to here).

    my $root_element;
    !!!create-element ($root_element, 'html');
    $doc->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, 'html'];
    #$phase = 'main';
    ## The current token is reprocessed by the caller.
    return;
  } # B
} # _tree_construction_root_element
1948
## Reset the insertion mode appropriately.
##
## Walks the stack of open elements (|$self->{open_elements}|, a list of
## |[node, local name]| pairs) from the current node towards the root
## and sets |$self->{insertion_mode}| according to the first element
## name that has a dedicated insertion mode.
##
## Reads:  $self->{open_elements}, $self->{inner_html_node} (the
##         |[node, local name]| context pair, if defined),
##         $self->{head_element}.
## Writes: $self->{insertion_mode}.
##
## The context-element substitution in step 3 is applied only when the
## node being examined is the first node in the stack, so that
## elements pushed while parsing a fragment are still taken into
## account before falling back to the context element.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      ## Fragment case: the first node in the stack stands in for the
      ## context element, unless that is a table cell.
      if (defined $self->{inner_html_node}) {
        if ($self->{inner_html_node}->[1] eq 'td' or
            $self->{inner_html_node}->[1] eq 'th') {
          #
        } else {
          $node = $self->{inner_html_node};
        }
      }
    }

    ## Step 4..13
    my $new_mode = {
                    select => 'in select',
                    td => 'in cell',
                    th => 'in cell',
                    tr => 'in row',
                    tbody => 'in table body',
                    thead => 'in table head',
                    tfoot => 'in table foot',
                    caption => 'in caption',
                    colgroup => 'in column group',
                    table => 'in table',
                    head => 'in body', # not in head!
                    body => 'in body',
                    frameset => 'in frameset',
                   }->{$node->[1]};
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 14
    if ($node->[1] eq 'html') {
      unless (defined $self->{head_element}) {
        $self->{insertion_mode} = 'before head';
      } else {
        $self->{insertion_mode} = 'after head';
      }
      return;
    }

    ## Step 15
    if ($last) {
      $self->{insertion_mode} = 'in body';
      return;
    }

    ## Step 16
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 17
    redo S3;
  } # S3
} # _reset_insertion_mode
2010
2011 sub _tree_construction_main ($) {
2012 my $self = shift;
2013
2014 my $phase = 'main';
2015
2016 my $active_formatting_elements = [];
2017
## Reconstruct the active formatting elements.
##
## Argument: a code reference that is called with each newly cloned
## node and is expected to insert it into the tree.
##
## Does nothing if the list of active formatting elements is empty, or
## if its last entry is a marker or is already in the stack of open
## elements.  Otherwise rewinds through the list to the entry after the
## nearest marker / still-open entry (or to the first entry), then
## walks forward again: each entry is shallow-cloned, inserted through
## the callback, pushed onto the stack of open elements and replaced in
## the list by its clone.
my $reconstruct_active_formatting_elements = sub { # MUST
  my $insert = shift;
  my $afe = $active_formatting_elements;

  ## True if the given node is in the stack of open elements.
  my $is_open = sub {
    my $el = shift;
    for my $open (@{$self->{open_elements}}) {
      return 1 if $el eq $open->[0];
    }
    return 0;
  };

  ## Step 1
  return unless @$afe;

  ## Step 3
  my $pos = -1;
  my $entry = $afe->[$pos];

  ## Step 2
  return if $entry->[0] eq '#marker';
  return if $is_open->($entry->[0]);

  ## Step 4: stop rewinding once the first entry has been reached.
  until ($afe->[0]->[0] eq $entry->[0]) {
    ## Step 5
    $pos--;
    $entry = $afe->[$pos];

    ## Step 6: keep rewinding while the entry is neither a marker nor
    ## an open element.
    if ($entry->[0] eq '#marker' or $is_open->($entry->[0])) {
      ## Step 7
      $pos++;
      $entry = $afe->[$pos];
      last;
    }
  }

  while (1) {
    ## Step 8
    my $clone = [$entry->[0]->clone_node (0), $entry->[1]];

    ## Step 9
    $insert->($clone->[0]);
    push @{$self->{open_elements}}, $clone;

    ## Step 10
    $afe->[$pos] = $self->{open_elements}->[-1];

    ## Step 11: done once the last entry of the list has been replaced.
    last if $clone->[0] eq $afe->[-1]->[0];

    ## Step 7'
    $pos++;
    $entry = $afe->[$pos];
  }
  return;
}; # $reconstruct_active_formatting_elements
2088
## Clear the list of active formatting elements up to the last marker:
## the last '#marker' entry and everything after it are removed.
## The list is left untouched if it contains no marker.
my $clear_up_to_marker = sub {
  my $idx = $#$active_formatting_elements;
  while ($idx >= 0) {
    if ($active_formatting_elements->[$idx]->[0] eq '#marker') {
      splice @$active_formatting_elements, $idx;
      return;
    }
    $idx--;
  }
}; # $clear_up_to_marker
2097
## Process a |style| start tag held in |$token|.
##
## Creates the element, appends it to the |head| element (when the
## insertion mode is 'in head' and a head element is known) or to the
## current node, switches the tokenizer to CDATA, gathers the following
## character tokens into a single text node, switches back to PCDATA
## and finally consumes the token that ended the text.  A parse error
## is reported if that token is not a |style| end tag.
my $style_start_tag = sub {
  my $style_el;
  !!!create-element ($style_el, 'style', $token->{attributes});

  ## $self->{insertion_mode} eq 'in head' and ... (always true)
  my $container;
  if ($self->{insertion_mode} eq 'in head' and
      defined $self->{head_element}) {
    $container = $self->{head_element};
  } else {
    $container = $self->{open_elements}->[-1]->[0];
  }
  $container->append_child ($style_el);

  $self->{content_model_flag} = 'CDATA';
  delete $self->{escape}; # MUST

  ## Collect character data; stops at the first non-character token
  ## or when the tokenizer stops tokenising.
  my @pieces;
  !!!next-token;
  while ($token->{type} eq 'character') {
    push @pieces, $token->{data};
    !!!next-token;
  }
  my $content = join '', @pieces;
  $style_el->manakai_append_text ($content) if length $content;

  $self->{content_model_flag} = 'PCDATA';

  ## A matching end tag is silently consumed below.
  unless ($token->{type} eq 'end tag' and $token->{tag_name} eq 'style') {
    !!!parse-error (type => 'in CDATA:#'.$token->{type});
    ## ISSUE: And ignore?
  }
  !!!next-token;
}; # $style_start_tag
2127
## Process a |script| start tag held in |$token|.
##
## Creates the element, switches the tokenizer to CDATA, gathers the
## following character tokens into a single text node and switches
## back to PCDATA.  A parse error is reported if the token that ended
## the text is not a |script| end tag.  Unless a fragment is being
## parsed (|$self->{inner_html_node}| defined), the element is then
## appended to the |head| element (when the insertion mode is 'in head'
## and a head element is known) or to the current node.  Finally the
## terminating token is consumed.
my $script_start_tag = sub {
  my $script_el;
  !!!create-element ($script_el, 'script', $token->{attributes});
  ## TODO: mark as "parser-inserted"

  $self->{content_model_flag} = 'CDATA';
  delete $self->{escape}; # MUST

  ## Collect character data; stops at the first non-character token
  ## or when the tokenizer stops tokenising.
  my @pieces;
  !!!next-token;
  while ($token->{type} eq 'character') {
    push @pieces, $token->{data};
    !!!next-token;
  }
  my $content = join '', @pieces;
  $script_el->manakai_append_text ($content) if length $content;

  $self->{content_model_flag} = 'PCDATA';

  ## A matching end tag is silently consumed below.
  unless ($token->{type} eq 'end tag' and
          $token->{tag_name} eq 'script') {
    !!!parse-error (type => 'in CDATA:#'.$token->{type});
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  }

  unless (defined $self->{inner_html_node}) {
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    my $container;
    if ($self->{insertion_mode} eq 'in head' and
        defined $self->{head_element}) {
      $container = $self->{head_element};
    } else {
      $container = $self->{open_elements}->[-1]->[0];
    }
    $container->append_child ($script_el);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  }
  ## (Fragment case) TODO: mark as "already executed"

  !!!next-token;
}; # $script_start_tag
2173
## Process an end tag for a formatting element.
##
## Argument: the tag name of the end tag.
##
## Each pass of the |FET| block:
##   Step 1   finds the last entry with that tag name after the last
##            marker in the list of active formatting elements and
##            checks that it is in the stack of open elements and in
##            scope; otherwise a parse error is reported and the token
##            is consumed.
##   Step 2-3 looks for the "furthest block"; if there is none, the
##            stack is popped down to and including the formatting
##            element, its entry is removed from the list, the token is
##            consumed and the closure returns.
##   Step 4-13 re-parent the furthest block and the nodes between it
##            and the formatting element, and replace the formatting
##            element by a clone in both the list and the stack.
##   Step 14  starts over.
##
## Nodes are |[node, local name]| pairs; identity is tested with |eq|
## on the node slot.
my $formatting_end_tag = sub {
  my $tag_name = shift;

  FET: {
    ## Step 1
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!parse-error (type => 'unmatched end tag:'.$tag_name);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ({
                table => 1, caption => 1, td => 1, th => 1,
                button => 1, marquee => 1, object => 1, html => 1,
               }->{$node->[1]}) {
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
      ## Remove the formatting element itself from the list; it is not
      ## necessarily the last entry, so |pop| would be wrong here.
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
    }

    ## Step 2
    ## Scanning from the current node down to the formatting element,
    ## the last candidate seen (i.e. the one nearest to the formatting
    ## element) is kept.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not $formatting_category->{$node->[1]} and
          #not $phrasing_category->{$node->[1]} and
          ($special_category->{$node->[1]} or
           $scoping_category->{$node->[1]})) {
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## NOTE(review): when the formatting element is the first entry of
    ## the list the index below is -1, i.e. the last entry -- confirm
    ## that this is intended.
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
            ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      ## A node that is not in the list of active formatting elements
      ## is dropped from the stack and the next node is examined.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      if ($node->[0]->has_child_nodes ()) {
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    $common_ancestor_node->[0]->append_child ($last_node->[0]);

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## The child list is copied first because the children are moved
    ## while iterating.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## |$i| tracks the index of the bookmark entry; it is decremented
    ## when the formatting element is removed from below it.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$active_formatting_elements, $_, 1;
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        splice @{$self->{open_elements}}, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        $i = $_;
      }
    } # OE
    ## Insert (length 0) the clone immediately below the furthest
    ## block; a non-zero length would drop the element that currently
    ## follows the furthest block from the stack.
    splice @{$self->{open_elements}}, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
2356
## Insertion callback: append the given node to the current node (the
## last entry of the stack of open elements).
my $insert_to_current = sub {
  my $child = shift;
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($child);
}; # $insert_to_current
2360
## Insertion callback with foster parenting.
##
## If the current node is not a |table|, |tbody|, |tfoot|, |thead| or
## |tr| element, the node is simply appended to the current node.
## Otherwise the last |table| in the stack of open elements is looked
## up: if it has a parent whose |node_type| is 1, the node is inserted
## into that parent just before the table; if not, it is appended to
## the element preceding the table in the stack.  When the stack holds
## no |table| at all, the node is appended to the first element of the
## stack.
my $insert_to_foster = sub {
  my $child = shift;
  my $stack = $self->{open_elements};

  my $in_table_context = {
    table => 1, tbody => 1, tfoot => 1,
    thead => 1, tr => 1,
  }->{$stack->[-1]->[1]};
  unless ($in_table_context) {
    return $stack->[-1]->[0]->append_child ($child);
  }

  # MUST
  my $foster_parent_element;
  my $next_sibling;
  my $idx = $#$stack;
  while ($idx >= 0) {
    if ($stack->[$idx]->[1] eq 'table') {
      my $table_el = $stack->[$idx]->[0];
      my $parent = $table_el->parent_node;
      if (defined $parent and $parent->node_type == 1) {
        $foster_parent_element = $parent;
        $next_sibling = $table_el;
      } else {
        $foster_parent_element = $stack->[$idx - 1]->[0];
      }
      last;
    }
    $idx--;
  }
  unless (defined $foster_parent_element) {
    $foster_parent_element = $stack->[0]->[0];
  }
  $foster_parent_element->insert_before ($child, $next_sibling);
}; # $insert_to_foster
2391
2392 my $in_body = sub {
2393 my $insert = shift;
2394 if ($token->{type} eq 'start tag') {
2395 if ($token->{tag_name} eq 'script') {
2396 $script_start_tag->();
2397 return;
2398 } elsif ($token->{tag_name} eq 'style') {
2399 $style_start_tag->();
2400 return;
2401 } elsif ({
2402 base => 1, link => 1, meta => 1,
2403 }->{$token->{tag_name}}) {
2404 !!!parse-error (type => 'in body:'.$token->{tag_name});
2405 ## NOTE: This is an "as if in head" code clone
2406 my $el;
2407 !!!create-element ($el, $token->{tag_name}, $token->{attributes});
2408 if (defined $self->{head_element}) {
2409 $self->{head_element}->append_child ($el);
2410 } else {
2411 $insert->($el);
2412 }
2413
2414 !!!next-token;
2415 return;
2416 } elsif ($token->{tag_name} eq 'title') {
2417 !!!parse-error (type => 'in body:title');
2418 ## NOTE: There is an "as if in head" code clone
2419 my $title_el;
2420 !!!create-element ($title_el, 'title', $token->{attributes});
2421 (defined $self->{head_element} ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
2422 ->append_child ($title_el);
2423 $self->{content_model_flag} = 'RCDATA';
2424 delete $self->{escape}; # MUST
2425
2426 my $text = '';
2427 !!!next-token;
2428 while ($token->{type} eq 'character') {
2429 $text .= $token->{data};
2430 !!!next-token;
2431 }
2432 if (length $text) {
2433 $title_el->manakai_append_text ($text);
2434 }
2435
2436 $self->{content_model_flag} = 'PCDATA';
2437
2438 if ($token->{type} eq 'end tag' and
2439 $token->{tag_name} eq 'title') {
2440 ## Ignore the token
2441 } else {
2442 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
2443 ## ISSUE: And ignore?
2444 }
2445 !!!next-token;
2446 return;
2447 } elsif ($token->{tag_name} eq 'body') {
2448 !!!parse-error (type => 'in body:body');
2449
2450 if (@{$self->{open_elements}} == 1 or
2451 $self->{open_elements}->[1]->[1] ne 'body') {
2452 ## Ignore the token
2453 } else {
2454 my $body_el = $self->{open_elements}->[1]->[0];
2455 for my $attr_name (keys %{$token->{attributes}}) {
2456 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
2457 $body_el->set_attribute_ns
2458 (undef, [undef, $attr_name],
2459 $token->{attributes}->{$attr_name}->{value});
2460 }
2461 }
2462 }
2463 !!!next-token;
2464 return;
2465 } elsif ({
2466 address => 1, blockquote => 1, center => 1, dir => 1,
2467 div => 1, dl => 1, fieldset => 1, listing => 1,
2468 menu => 1, ol => 1, p => 1, ul => 1,
2469 pre => 1,
2470 }->{$token->{tag_name}}) {
2471 ## has a p element in scope
2472 INSCOPE: for (reverse @{$self->{open_elements}}) {
2473 if ($_->[1] eq 'p') {
2474 !!!back-token;
2475 $token = {type => 'end tag', tag_name => 'p'};
2476 return;
2477 } elsif ({
2478 table => 1, caption => 1, td => 1, th => 1,
2479 button => 1, marquee => 1, object => 1, html => 1,
2480 }->{$_->[1]}) {
2481 last INSCOPE;
2482 }
2483 } # INSCOPE
2484
2485 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2486 if ($token->{tag_name} eq 'pre') {
2487 !!!next-token;
2488 if ($token->{type} eq 'character') {
2489 $token->{data} =~ s/^\x0A//;
2490 unless (length $token->{data}) {
2491 !!!next-token;
2492 }
2493 }
2494 } else {
2495 !!!next-token;
2496 }
2497 return;
2498 } elsif ($token->{tag_name} eq 'form') {
2499 if (defined $self->{form_element}) {
2500 !!!parse-error (type => 'in form:form');
2501 ## Ignore the token
2502 !!!next-token;
2503 return;
2504 } else {
2505 ## has a p element in scope
2506 INSCOPE: for (reverse @{$self->{open_elements}}) {
2507 if ($_->[1] eq 'p') {
2508 !!!back-token;
2509 $token = {type => 'end tag', tag_name => 'p'};
2510 return;
2511 } elsif ({
2512 table => 1, caption => 1, td => 1, th => 1,
2513 button => 1, marquee => 1, object => 1, html => 1,
2514 }->{$_->[1]}) {
2515 last INSCOPE;
2516 }
2517 } # INSCOPE
2518
2519 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2520 $self->{form_element} = $self->{open_elements}->[-1]->[0];
2521 !!!next-token;
2522 return;
2523 }
2524 } elsif ($token->{tag_name} eq 'li') {
2525 ## has a p element in scope
2526 INSCOPE: for (reverse @{$self->{open_elements}}) {
2527 if ($_->[1] eq 'p') {
2528 !!!back-token;
2529 $token = {type => 'end tag', tag_name => 'p'};
2530 return;
2531 } elsif ({
2532 table => 1, caption => 1, td => 1, th => 1,
2533 button => 1, marquee => 1, object => 1, html => 1,
2534 }->{$_->[1]}) {
2535 last INSCOPE;
2536 }
2537 } # INSCOPE
2538
2539 ## Step 1
2540 my $i = -1;
2541 my $node = $self->{open_elements}->[$i];
2542 LI: {
2543 ## Step 2
2544 if ($node->[1] eq 'li') {
2545 if ($i != -1) {
2546 !!!parse-error (type => 'end tag missing:'.
2547 $self->{open_elements}->[-1]->[1]);
2548 ## TODO: test
2549 }
2550 splice @{$self->{open_elements}}, $i;
2551 last LI;
2552 }
2553
2554 ## Step 3
2555 if (not $formatting_category->{$node->[1]} and
2556 #not $phrasing_category->{$node->[1]} and
2557 ($special_category->{$node->[1]} or
2558 $scoping_category->{$node->[1]}) and
2559 $node->[1] ne 'address' and $node->[1] ne 'div') {
2560 last LI;
2561 }
2562
2563 ## Step 4
2564 $i--;
2565 $node = $self->{open_elements}->[$i];
2566 redo LI;
2567 } # LI
2568
2569 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2570 !!!next-token;
2571 return;
2572 } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
2573 ## has a p element in scope
2574 INSCOPE: for (reverse @{$self->{open_elements}}) {
2575 if ($_->[1] eq 'p') {
2576 !!!back-token;
2577 $token = {type => 'end tag', tag_name => 'p'};
2578 return;
2579 } elsif ({
2580 table => 1, caption => 1, td => 1, th => 1,
2581 button => 1, marquee => 1, object => 1, html => 1,
2582 }->{$_->[1]}) {
2583 last INSCOPE;
2584 }
2585 } # INSCOPE
2586
2587 ## Step 1
2588 my $i = -1;
2589 my $node = $self->{open_elements}->[$i];
2590 LI: {
2591 ## Step 2
2592 if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
2593 if ($i != -1) {
2594 !!!parse-error (type => 'end tag missing:'.
2595 $self->{open_elements}->[-1]->[1]);
2596 ## TODO: test
2597 }
2598 splice @{$self->{open_elements}}, $i;
2599 last LI;
2600 }
2601
2602 ## Step 3
2603 if (not $formatting_category->{$node->[1]} and
2604 #not $phrasing_category->{$node->[1]} and
2605 ($special_category->{$node->[1]} or
2606 $scoping_category->{$node->[1]}) and
2607 $node->[1] ne 'address' and $node->[1] ne 'div') {
2608 last LI;
2609 }
2610
2611 ## Step 4
2612 $i--;
2613 $node = $self->{open_elements}->[$i];
2614 redo LI;
2615 } # LI
2616
2617 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2618 !!!next-token;
2619 return;
2620 } elsif ($token->{tag_name} eq 'plaintext') {
2621 ## has a p element in scope
2622 INSCOPE: for (reverse @{$self->{open_elements}}) {
2623 if ($_->[1] eq 'p') {
2624 !!!back-token;
2625 $token = {type => 'end tag', tag_name => 'p'};
2626 return;
2627 } elsif ({
2628 table => 1, caption => 1, td => 1, th => 1,
2629 button => 1, marquee => 1, object => 1, html => 1,
2630 }->{$_->[1]}) {
2631 last INSCOPE;
2632 }
2633 } # INSCOPE
2634
2635 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2636
2637 $self->{content_model_flag} = 'PLAINTEXT';
2638
2639 !!!next-token;
2640 return;
2641 } elsif ({
2642 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
2643 }->{$token->{tag_name}}) {
2644 ## has a p element in scope
2645 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2646 my $node = $self->{open_elements}->[$_];
2647 if ($node->[1] eq 'p') {
2648 !!!back-token;
2649 $token = {type => 'end tag', tag_name => 'p'};
2650 return;
2651 } elsif ({
2652 table => 1, caption => 1, td => 1, th => 1,
2653 button => 1, marquee => 1, object => 1, html => 1,
2654 }->{$node->[1]}) {
2655 last INSCOPE;
2656 }
2657 } # INSCOPE
2658
2659 ## has an element in scope
2660 my $i;
2661 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2662 my $node = $self->{open_elements}->[$_];
2663 if ({
2664 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
2665 }->{$node->[1]}) {
2666 $i = $_;
2667 last INSCOPE;
2668 } elsif ({
2669 table => 1, caption => 1, td => 1, th => 1,
2670 button => 1, marquee => 1, object => 1, html => 1,
2671 }->{$node->[1]}) {
2672 last INSCOPE;
2673 }
2674 } # INSCOPE
2675
2676 if (defined $i) {
2677 !!!parse-error (type => 'in hn:hn');
2678 splice @{$self->{open_elements}}, $i;
2679 }
2680
2681 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2682
2683 !!!next-token;
2684 return;
2685 } elsif ($token->{tag_name} eq 'a') {
2686 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
2687 my $node = $active_formatting_elements->[$i];
2688 if ($node->[1] eq 'a') {
2689 !!!parse-error (type => 'in a:a');
2690
2691 !!!back-token;
2692 $token = {type => 'end tag', tag_name => 'a'};
2693 $formatting_end_tag->($token->{tag_name});
2694
2695 AFE2: for (reverse 0..$#$active_formatting_elements) {
2696 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
2697 splice @$active_formatting_elements, $_, 1;
2698 last AFE2;
2699 }
2700 } # AFE2
2701 OE: for (reverse 0..$#{$self->{open_elements}}) {
2702 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
2703 splice @{$self->{open_elements}}, $_, 1;
2704 last OE;
2705 }
2706 } # OE
2707 last AFE;
2708 } elsif ($node->[0] eq '#marker') {
2709 last AFE;
2710 }
2711 } # AFE
2712
2713 $reconstruct_active_formatting_elements->($insert_to_current);
2714
2715 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2716 push @$active_formatting_elements, $self->{open_elements}->[-1];
2717
2718 !!!next-token;
2719 return;
2720 } elsif ({
2721 b => 1, big => 1, em => 1, font => 1, i => 1,
2722 s => 1, small => 1, strile => 1,
2723 strong => 1, tt => 1, u => 1,
2724 }->{$token->{tag_name}}) {
2725 $reconstruct_active_formatting_elements->($insert_to_current);
2726
2727 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2728 push @$active_formatting_elements, $self->{open_elements}->[-1];
2729
2730 !!!next-token;
2731 return;
2732 } elsif ($token->{tag_name} eq 'nobr') {
2733 $reconstruct_active_formatting_elements->($insert_to_current);
2734
2735 ## has a |nobr| element in scope
2736 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2737 my $node = $self->{open_elements}->[$_];
2738 if ($node->[1] eq 'nobr') {
2739 !!!back-token;
2740 $token = {type => 'end tag', tag_name => 'nobr'};
2741 return;
2742 } elsif ({
2743 table => 1, caption => 1, td => 1, th => 1,
2744 button => 1, marquee => 1, object => 1, html => 1,
2745 }->{$node->[1]}) {
2746 last INSCOPE;
2747 }
2748 } # INSCOPE
2749
2750 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2751 push @$active_formatting_elements, $self->{open_elements}->[-1];
2752
2753 !!!next-token;
2754 return;
2755 } elsif ($token->{tag_name} eq 'button') {
2756 ## has a button element in scope
2757 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
2758 my $node = $self->{open_elements}->[$_];
2759 if ($node->[1] eq 'button') {
2760 !!!parse-error (type => 'in button:button');
2761 !!!back-token;
2762 $token = {type => 'end tag', tag_name => 'button'};
2763 return;
2764 } elsif ({
2765 table => 1, caption => 1, td => 1, th => 1,
2766 button => 1, marquee => 1, object => 1, html => 1,
2767 }->{$node->[1]}) {
2768 last INSCOPE;
2769 }
2770 } # INSCOPE
2771
2772 $reconstruct_active_formatting_elements->($insert_to_current);
2773
2774 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2775 push @$active_formatting_elements, ['#marker', ''];
2776
2777 !!!next-token;
2778 return;
2779 } elsif ($token->{tag_name} eq 'marquee' or
2780 $token->{tag_name} eq 'object') {
2781 $reconstruct_active_formatting_elements->($insert_to_current);
2782
2783 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2784 push @$active_formatting_elements, ['#marker', ''];
2785
2786 !!!next-token;
2787 return;
2788 } elsif ($token->{tag_name} eq 'xmp') {
2789 $reconstruct_active_formatting_elements->($insert_to_current);
2790
2791 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2792
2793 $self->{content_model_flag} = 'CDATA';
2794 delete $self->{escape}; # MUST
2795
2796 !!!next-token;
2797 return;
2798 } elsif ($token->{tag_name} eq 'table') {
2799 ## has a p element in scope
2800 INSCOPE: for (reverse @{$self->{open_elements}}) {
2801 if ($_->[1] eq 'p') {
2802 !!!back-token;
2803 $token = {type => 'end tag', tag_name => 'p'};
2804 return;
2805 } elsif ({
2806 table => 1, caption => 1, td => 1, th => 1,
2807 button => 1, marquee => 1, object => 1, html => 1,
2808 }->{$_->[1]}) {
2809 last INSCOPE;
2810 }
2811 } # INSCOPE
2812
2813 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2814
2815 $self->{insertion_mode} = 'in table';
2816
2817 !!!next-token;
2818 return;
2819 } elsif ({
2820 area => 1, basefont => 1, bgsound => 1, br => 1,
2821 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
2822 image => 1,
2823 }->{$token->{tag_name}}) {
2824 if ($token->{tag_name} eq 'image') {
2825 !!!parse-error (type => 'image');
2826 $token->{tag_name} = 'img';
2827 }
2828
2829 $reconstruct_active_formatting_elements->($insert_to_current);
2830
2831 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2832 pop @{$self->{open_elements}};
2833
2834 !!!next-token;
2835 return;
2836 } elsif ($token->{tag_name} eq 'hr') {
2837 ## has a p element in scope
2838 INSCOPE: for (reverse @{$self->{open_elements}}) {
2839 if ($_->[1] eq 'p') {
2840 !!!back-token;
2841 $token = {type => 'end tag', tag_name => 'p'};
2842 return;
2843 } elsif ({
2844 table => 1, caption => 1, td => 1, th => 1,
2845 button => 1, marquee => 1, object => 1, html => 1,
2846 }->{$_->[1]}) {
2847 last INSCOPE;
2848 }
2849 } # INSCOPE
2850
2851 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2852 pop @{$self->{open_elements}};
2853
2854 !!!next-token;
2855 return;
2856 } elsif ($token->{tag_name} eq 'input') {
2857 $reconstruct_active_formatting_elements->($insert_to_current);
2858
2859 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2860 ## TODO: associate with $self->{form_element} if defined
2861 pop @{$self->{open_elements}};
2862
2863 !!!next-token;
2864 return;
2865 } elsif ($token->{tag_name} eq 'isindex') {
2866 !!!parse-error (type => 'isindex');
2867
2868 if (defined $self->{form_element}) {
2869 ## Ignore the token
2870 !!!next-token;
2871 return;
2872 } else {
2873 my $at = $token->{attributes};
2874 $at->{name} = {name => 'name', value => 'isindex'};
2875 my @tokens = (
2876 {type => 'start tag', tag_name => 'form'},
2877 {type => 'start tag', tag_name => 'hr'},
2878 {type => 'start tag', tag_name => 'p'},
2879 {type => 'start tag', tag_name => 'label'},
2880 {type => 'character',
2881 data => 'This is a searchable index. Insert your search keywords here: '}, # SHOULD
2882 ## TODO: make this configurable
2883 {type => 'start tag', tag_name => 'input', attributes => $at},
2884 #{type => 'character', data => ''}, # SHOULD
2885 {type => 'end tag', tag_name => 'label'},
2886 {type => 'end tag', tag_name => 'p'},
2887 {type => 'start tag', tag_name => 'hr'},
2888 {type => 'end tag', tag_name => 'form'},
2889 );
2890 $token = shift @tokens;
2891 !!!back-token (@tokens);
2892 return;
2893 }
2894 } elsif ({
2895 textarea => 1,
2896 iframe => 1,
2897 noembed => 1,
2898 noframes => 1,
2899 noscript => 0, ## TODO: 1 if scripting is enabled
2900 }->{$token->{tag_name}}) {
2901 my $tag_name = $token->{tag_name};
2902 my $el;
2903 !!!create-element ($el, $token->{tag_name}, $token->{attributes});
2904
2905 if ($token->{tag_name} eq 'textarea') {
2906 ## TODO: $self->{form_element} if defined
2907 $self->{content_model_flag} = 'RCDATA';
2908 } else {
2909 $self->{content_model_flag} = 'CDATA';
2910 }
2911 delete $self->{escape}; # MUST
2912
2913 $insert->($el);
2914
2915 my $text = '';
2916 if ($token->{tag_name} eq 'textarea') {
2917 !!!next-token;
2918 if ($token->{type} eq 'character') {
2919 $token->{data} =~ s/^\x0A//;
2920 unless (length $token->{data}) {
2921 !!!next-token;
2922 }
2923 }
2924 } else {
2925 !!!next-token;
2926 }
2927 while ($token->{type} eq 'character') {
2928 $text .= $token->{data};
2929 !!!next-token;
2930 }
2931 if (length $text) {
2932 $el->manakai_append_text ($text);
2933 }
2934
2935 $self->{content_model_flag} = 'PCDATA';
2936
2937 if ($token->{type} eq 'end tag' and
2938 $token->{tag_name} eq $tag_name) { ## expected case: end tag matching the start tag saved in $tag_name
2939 ## Ignore the token
2940 } else {
2941 if ($tag_name eq 'textarea') { ## test the element being parsed; $token is now the token that followed the text
2942 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
2943 } else {
2944 !!!parse-error (type => 'in CDATA:#'.$token->{type});
2945 }
2946 ## ISSUE: And ignore?
2947 }
2948 !!!next-token;
2949 return;
2950 } elsif ($token->{tag_name} eq 'select') {
2951 $reconstruct_active_formatting_elements->($insert_to_current);
2952
2953 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2954
2955 $self->{insertion_mode} = 'in select';
2956 !!!next-token;
2957 return;
2958 } elsif ({
2959 caption => 1, col => 1, colgroup => 1, frame => 1,
2960 frameset => 1, head => 1, option => 1, optgroup => 1,
2961 tbody => 1, td => 1, tfoot => 1, th => 1,
2962 thead => 1, tr => 1,
2963 }->{$token->{tag_name}}) {
2964 !!!parse-error (type => 'in body:'.$token->{tag_name});
2965 ## Ignore the token
2966 !!!next-token;
2967 return;
2968
2969 ## ISSUE: An issue on HTML5 new elements in the spec.
2970 } else {
2971 $reconstruct_active_formatting_elements->($insert_to_current);
2972
2973 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
2974
2975 !!!next-token;
2976 return;
2977 }
2978 } elsif ($token->{type} eq 'end tag') {
2979 if ($token->{tag_name} eq 'body') {
2980 if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
2981 ## ISSUE: There is an issue in the spec.
2982 if ($self->{open_elements}->[-1]->[1] ne 'body') {
2983 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2984 }
2985 $self->{insertion_mode} = 'after body';
2986 !!!next-token;
2987 return;
2988 } else {
2989 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
2990 ## Ignore the token
2991 !!!next-token;
2992 return;
2993 }
2994 } elsif ($token->{tag_name} eq 'html') {
2995 if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
2996 ## ISSUE: There is an issue in the spec.
2997 if ($self->{open_elements}->[-1]->[1] ne 'body') {
2998 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]); ## report the current node, as the </body> branch does
2999 }
3000 $self->{insertion_mode} = 'after body';
3001 ## reprocess
3002 return;
3003 } else {
3004 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3005 ## Ignore the token
3006 !!!next-token;
3007 return;
3008 }
3009 } elsif ({
3010 address => 1, blockquote => 1, center => 1, dir => 1,
3011 div => 1, dl => 1, fieldset => 1, listing => 1,
3012 menu => 1, ol => 1, pre => 1, ul => 1,
3013 p => 1,
3014 dd => 1, dt => 1, li => 1,
3015 button => 1, marquee => 1, object => 1,
3016 }->{$token->{tag_name}}) {
3017 ## has an element in scope
3018 my $i;
3019 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3020 my $node = $self->{open_elements}->[$_];
3021 if ($node->[1] eq $token->{tag_name}) {
3022 ## generate implied end tags
3023 if ({
3024 dd => ($token->{tag_name} ne 'dd'),
3025 dt => ($token->{tag_name} ne 'dt'),
3026 li => ($token->{tag_name} ne 'li'),
3027 p => ($token->{tag_name} ne 'p'),
3028 td => 1, th => 1, tr => 1,
3029 }->{$self->{open_elements}->[-1]->[1]}) {
3030 !!!back-token;
3031 $token = {type => 'end tag',
3032 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3033 return;
3034 }
3035 $i = $_;
3036 last INSCOPE unless $token->{tag_name} eq 'p';
3037 } elsif ({
3038 table => 1, caption => 1, td => 1, th => 1,
3039 button => 1, marquee => 1, object => 1, html => 1,
3040 }->{$node->[1]}) {
3041 last INSCOPE;
3042 }
3043 } # INSCOPE
3044
3045 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3046 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3047 }
3048
3049 splice @{$self->{open_elements}}, $i if defined $i;
3050 $clear_up_to_marker->()
3051 if {
3052 button => 1, marquee => 1, object => 1,
3053 }->{$token->{tag_name}};
3054 !!!next-token;
3055 return;
3056 } elsif ($token->{tag_name} eq 'form') {
3057 ## has an element in scope
3058 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3059 my $node = $self->{open_elements}->[$_];
3060 if ($node->[1] eq $token->{tag_name}) {
3061 ## generate implied end tags
3062 if ({
3063 dd => 1, dt => 1, li => 1, p => 1,
3064 td => 1, th => 1, tr => 1,
3065 }->{$self->{open_elements}->[-1]->[1]}) {
3066 !!!back-token;
3067 $token = {type => 'end tag',
3068 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3069 return;
3070 }
3071 last INSCOPE;
3072 } elsif ({
3073 table => 1, caption => 1, td => 1, th => 1,
3074 button => 1, marquee => 1, object => 1, html => 1,
3075 }->{$node->[1]}) {
3076 last INSCOPE;
3077 }
3078 } # INSCOPE
3079
3080 if ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
3081 pop @{$self->{open_elements}};
3082 } else {
3083 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3084 }
3085
3086 undef $self->{form_element};
3087 !!!next-token;
3088 return;
3089 } elsif ({
3090 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
3091 }->{$token->{tag_name}}) {
3092 ## has an element in scope
3093 my $i;
3094 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3095 my $node = $self->{open_elements}->[$_];
3096 if ({
3097 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
3098 }->{$node->[1]}) {
3099 ## generate implied end tags
3100 if ({
3101 dd => 1, dt => 1, li => 1, p => 1,
3102 td => 1, th => 1, tr => 1,
3103 }->{$self->{open_elements}->[-1]->[1]}) {
3104 !!!back-token;
3105 $token = {type => 'end tag',
3106 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3107 return;
3108 }
3109 $i = $_;
3110 last INSCOPE;
3111 } elsif ({
3112 table => 1, caption => 1, td => 1, th => 1,
3113 button => 1, marquee => 1, object => 1, html => 1,
3114 }->{$node->[1]}) {
3115 last INSCOPE;
3116 }
3117 } # INSCOPE
3118
3119 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3120 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3121 }
3122
3123 splice @{$self->{open_elements}}, $i if defined $i;
3124 !!!next-token;
3125 return;
3126 } elsif ({
3127 a => 1,
3128 b => 1, big => 1, em => 1, font => 1, i => 1,
3129 nobr => 1, s => 1, small => 1, strike => 1,
3130 strong => 1, tt => 1, u => 1,
3131 }->{$token->{tag_name}}) { ## end tags of these formatting elements are delegated to $formatting_end_tag
3132 $formatting_end_tag->($token->{tag_name});
3133 ## TODO: <http://html5.org/tools/web-apps-tracker?from=883&to=884>
3134 return;
3135 } elsif ({
3136 caption => 1, col => 1, colgroup => 1, frame => 1,
3137 frameset => 1, head => 1, option => 1, optgroup => 1,
3138 tbody => 1, td => 1, tfoot => 1, th => 1,
3139 thead => 1, tr => 1,
3140 area => 1, basefont => 1, bgsound => 1, br => 1,
3141 embed => 1, hr => 1, iframe => 1, image => 1,
3142 img => 1, input => 1, isindex => 1, noembed => 1,
3143 noframes => 1, param => 1, select => 1, spacer => 1,
3144 table => 1, textarea => 1, wbr => 1,
3145 noscript => 0, ## TODO: if scripting is enabled
3146 }->{$token->{tag_name}}) {
3147 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3148 ## Ignore the token
3149 !!!next-token;
3150 return;
3151
3152 ## ISSUE: Issue on HTML5 new elements in spec
3153
3154 } else {
3155 ## Step 1
3156 my $node_i = -1;
3157 my $node = $self->{open_elements}->[$node_i];
3158
3159 ## Step 2
3160 S2: {
3161 if ($node->[1] eq $token->{tag_name}) {
3162 ## Step 1
3163 ## generate implied end tags
3164 if ({
3165 dd => 1, dt => 1, li => 1, p => 1,
3166 td => 1, th => 1, tr => 1,
3167 }->{$self->{open_elements}->[-1]->[1]}) {
3168 !!!back-token;
3169 $token = {type => 'end tag',
3170 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3171 return;
3172 }
3173
3174 ## Step 2
3175 if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
3176 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3177 }
3178
3179 ## Step 3
3180 splice @{$self->{open_elements}}, $node_i;
3181
3182 !!!next-token;
3183 last S2;
3184 } else {
3185 ## Step 3
3186 if (not $formatting_category->{$node->[1]} and
3187 #not $phrasing_category->{$node->[1]} and
3188 ($special_category->{$node->[1]} or
3189 $scoping_category->{$node->[1]})) {
3190 !!!parse-error (type => 'not closed:'.$node->[1]);
3191 ## Ignore the token
3192 !!!next-token;
3193 last S2;
3194 }
3195 }
3196
3197 ## Step 4
3198 $node_i--;
3199 $node = $self->{open_elements}->[$node_i];
3200
3201 ## Step 5;
3202 redo S2;
3203 } # S2
3204 return;
3205 }
3206 }
3207 }; # $in_body
3208
3209 B: {
3210 if ($phase eq 'main') {
3211 if ($token->{type} eq 'DOCTYPE') {
3212 !!!parse-error (type => 'in html:#DOCTYPE');
3213 ## Ignore the token
3214 ## Stay in the phase
3215 !!!next-token;
3216 redo B;
3217 } elsif ($token->{type} eq 'start tag' and
3218 $token->{tag_name} eq 'html') {
3219 ## TODO: unless it is the first start tag token, parse-error
3220 my $top_el = $self->{open_elements}->[0]->[0];
3221 for my $attr_name (keys %{$token->{attributes}}) {
3222 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
3223 $top_el->set_attribute_ns
3224 (undef, [undef, $attr_name],
3225 $token->{attributes}->{$attr_name}->{value});
3226 }
3227 }
3228 !!!next-token;
3229 redo B;
3230 } elsif ($token->{type} eq 'end-of-file') {
3231 ## Generate implied end tags
3232 if ({
3233 dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
3234 }->{$self->{open_elements}->[-1]->[1]}) {
3235 !!!back-token;
3236 $token = {type => 'end tag', tag_name => $self->{open_elements}->[-1]->[1]};
3237 redo B;
3238 }
3239
3240 if (@{$self->{open_elements}} > 2 or
3241 (@{$self->{open_elements}} == 2 and $self->{open_elements}->[1]->[1] ne 'body')) {
3242 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3243 } elsif (defined $self->{inner_html_node} and
3244 @{$self->{open_elements}} > 1 and
3245 $self->{open_elements}->[1]->[1] ne 'body') {
3246 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3247 }
3248
3249 ## Stop parsing
3250 last B;
3251
3252 ## ISSUE: There is an issue in the spec.
3253 } else {
3254 if ($self->{insertion_mode} eq 'before head') {
3255 if ($token->{type} eq 'character') {
3256 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3257 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3258 unless (length $token->{data}) {
3259 !!!next-token;
3260 redo B;
3261 }
3262 }
3263 ## As if <head>
3264 !!!create-element ($self->{head_element}, 'head');
3265 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3266 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3267 $self->{insertion_mode} = 'in head';
3268 ## reprocess
3269 redo B;
3270 } elsif ($token->{type} eq 'comment') {
3271 my $comment = $self->{document}->create_comment ($token->{data});
3272 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3273 !!!next-token;
3274 redo B;
3275 } elsif ($token->{type} eq 'start tag') {
3276 my $attr = $token->{tag_name} eq 'head' ? $token->{attributes} : {};
3277 !!!create-element ($self->{head_element}, 'head', $attr);
3278 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3279 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3280 $self->{insertion_mode} = 'in head';
3281 if ($token->{tag_name} eq 'head') {
3282 !!!next-token;
3283 #} elsif ({
3284 # base => 1, link => 1, meta => 1,
3285 # script => 1, style => 1, title => 1,
3286 # }->{$token->{tag_name}}) {
3287 # ## reprocess
3288 } else {
3289 ## reprocess
3290 }
3291 redo B;
3292 } elsif ($token->{type} eq 'end tag') {
3293 if ($token->{tag_name} eq 'html') {
3294 ## As if <head>
3295 !!!create-element ($self->{head_element}, 'head');
3296 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3297 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3298 $self->{insertion_mode} = 'in head';
3299 ## reprocess
3300 redo B;
3301 } else {
3302 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3303 ## Ignore the token
3304 !!!next-token;
3305 redo B;
3306 }
3307 } else {
3308 die "$0: $token->{type}: Unknown type";
3309 }
3310 } elsif ($self->{insertion_mode} eq 'in head') {
3311 if ($token->{type} eq 'character') {
3312 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3313 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3314 unless (length $token->{data}) {
3315 !!!next-token;
3316 redo B;
3317 }
3318 }
3319
3320 #
3321 } elsif ($token->{type} eq 'comment') {
3322 my $comment = $self->{document}->create_comment ($token->{data});
3323 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3324 !!!next-token;
3325 redo B;
3326 } elsif ($token->{type} eq 'start tag') {
3327 if ($token->{tag_name} eq 'title') {
3328 ## NOTE: There is an "as if in head" code clone
3329 my $title_el;
3330 !!!create-element ($title_el, 'title', $token->{attributes});
3331 (defined $self->{head_element} ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
3332 ->append_child ($title_el);
3333 $self->{content_model_flag} = 'RCDATA';
3334 delete $self->{escape}; # MUST
3335
3336 my $text = '';
3337 !!!next-token;
3338 while ($token->{type} eq 'character') {
3339 $text .= $token->{data};
3340 !!!next-token;
3341 }
3342 if (length $text) {
3343 $title_el->manakai_append_text ($text);
3344 }
3345
3346 $self->{content_model_flag} = 'PCDATA';
3347
3348 if ($token->{type} eq 'end tag' and
3349 $token->{tag_name} eq 'title') {
3350 ## Ignore the token
3351 } else {
3352 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
3353 ## ISSUE: And ignore?
3354 }
3355 !!!next-token;
3356 redo B;
3357 } elsif ($token->{tag_name} eq 'style') {
3358 $style_start_tag->();
3359 redo B;
3360 } elsif ($token->{tag_name} eq 'script') {
3361 $script_start_tag->();
3362 redo B;
3363 } elsif ({base => 1, link => 1, meta => 1}->{$token->{tag_name}}) {
3364 ## NOTE: There are "as if in head" code clones
3365 my $el;
3366 !!!create-element ($el, $token->{tag_name}, $token->{attributes});
3367 (defined $self->{head_element} ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
3368 ->append_child ($el);
3369
3370 !!!next-token;
3371 redo B;
3372 } elsif ($token->{tag_name} eq 'head') {
3373 !!!parse-error (type => 'in head:head');
3374 ## Ignore the token
3375 !!!next-token;
3376 redo B;
3377 } else {
3378 #
3379 }
3380 } elsif ($token->{type} eq 'end tag') {
3381 if ($token->{tag_name} eq 'head') {
3382 if ($self->{open_elements}->[-1]->[1] eq 'head') {
3383 pop @{$self->{open_elements}};
3384 } else {
3385 !!!parse-error (type => 'unmatched end tag:head');
3386 }
3387 $self->{insertion_mode} = 'after head';
3388 !!!next-token;
3389 redo B;
3390 } elsif ($token->{tag_name} eq 'html') {
3391 #
3392 } else {
3393 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3394 ## Ignore the token
3395 !!!next-token;
3396 redo B;
3397 }
3398 } else {
3399 #
3400 }
3401
3402 if ($self->{open_elements}->[-1]->[1] eq 'head') {
3403 ## As if </head>
3404 pop @{$self->{open_elements}};
3405 }
3406 $self->{insertion_mode} = 'after head';
3407 ## reprocess
3408 redo B;
3409
3410 ## ISSUE: An issue in the spec.
3411 } elsif ($self->{insertion_mode} eq 'after head') {
3412 if ($token->{type} eq 'character') {
3413 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3414 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3415 unless (length $token->{data}) {
3416 !!!next-token;
3417 redo B;
3418 }
3419 }
3420
3421 #
3422 } elsif ($token->{type} eq 'comment') {
3423 my $comment = $self->{document}->create_comment ($token->{data});
3424 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3425 !!!next-token;
3426 redo B;
3427 } elsif ($token->{type} eq 'start tag') {
3428 if ($token->{tag_name} eq 'body') {
3429 !!!insert-element ('body', $token->{attributes});
3430 $self->{insertion_mode} = 'in body';
3431 !!!next-token;
3432 redo B;
3433 } elsif ($token->{tag_name} eq 'frameset') {
3434 !!!insert-element ('frameset', $token->{attributes});
3435 $self->{insertion_mode} = 'in frameset';
3436 !!!next-token;
3437 redo B;
3438 } elsif ({
3439 base => 1, link => 1, meta => 1,
3440 script => 1, style => 1, title => 1,
3441 }->{$token->{tag_name}}) {
3442 !!!parse-error (type => 'after head:'.$token->{tag_name});
3443 $self->{insertion_mode} = 'in head';
3444 ## reprocess
3445 redo B;
3446 } else {
3447 #
3448 }
3449 } else {
3450 #
3451 }
3452
3453 ## As if <body>
3454 !!!insert-element ('body');
3455 $self->{insertion_mode} = 'in body';
3456 ## reprocess
3457 redo B;
3458 } elsif ($self->{insertion_mode} eq 'in body') {
3459 if ($token->{type} eq 'character') {
3460 ## NOTE: There is a code clone of "character in body".
3461 $reconstruct_active_formatting_elements->($insert_to_current);
3462
3463 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3464
3465 !!!next-token;
3466 redo B;
3467 } elsif ($token->{type} eq 'comment') {
3468 ## NOTE: There is a code clone of "comment in body".
3469 my $comment = $self->{document}->create_comment ($token->{data});
3470 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3471 !!!next-token;
3472 redo B;
3473 } else {
3474 $in_body->($insert_to_current);
3475 redo B;
3476 }
3477 } elsif ($self->{insertion_mode} eq 'in table') {
3478 if ($token->{type} eq 'character') {
3479 ## NOTE: There are "character in table" code clones.
3480 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3481 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3482
3483 unless (length $token->{data}) {
3484 !!!next-token;
3485 redo B;
3486 }
3487 }
3488
3489 !!!parse-error (type => 'in table:#character');
3490
3491 ## As if in body, but insert into foster parent element
3492 ## ISSUE: Spec says that "whenever a node would be inserted
3493 ## into the current node" while characters might not be
3494 ## result in a new Text node.
3495 $reconstruct_active_formatting_elements->($insert_to_foster);
3496
3497 if ({
3498 table => 1, tbody => 1, tfoot => 1,
3499 thead => 1, tr => 1,
3500 }->{$self->{open_elements}->[-1]->[1]}) {
3501 # MUST
3502 my $foster_parent_element;
3503 my $next_sibling;
3504 my $prev_sibling;
3505 OE: for (reverse 0..$#{$self->{open_elements}}) {
3506 if ($self->{open_elements}->[$_]->[1] eq 'table') {
3507 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3508 if (defined $parent and $parent->node_type == 1) {
3509 $foster_parent_element = $parent;
3510 $next_sibling = $self->{open_elements}->[$_]->[0];
3511 $prev_sibling = $next_sibling->previous_sibling;
3512 } else {
3513 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
3514 $prev_sibling = $foster_parent_element->last_child;
3515 }
3516 last OE;
3517 }
3518 } # OE
3519 $foster_parent_element = $self->{open_elements}->[0]->[0] and
3520 $prev_sibling = $foster_parent_element->last_child
3521 unless defined $foster_parent_element;
3522 if (defined $prev_sibling and
3523 $prev_sibling->node_type == 3) {
3524 $prev_sibling->manakai_append_text ($token->{data});
3525 } else {
3526 $foster_parent_element->insert_before
3527 ($self->{document}->create_text_node ($token->{data}),
3528 $next_sibling);
3529 }
3530 } else {
3531 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3532 }
3533
3534 !!!next-token;
3535 redo B;
3536 } elsif ($token->{type} eq 'comment') {
3537 my $comment = $self->{document}->create_comment ($token->{data});
3538 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3539 !!!next-token;
3540 redo B;
3541 } elsif ($token->{type} eq 'start tag') {
3542 if ({
3543 caption => 1,
3544 colgroup => 1,
3545 tbody => 1, tfoot => 1, thead => 1,
3546 }->{$token->{tag_name}}) {
3547 ## Clear back to table context
3548 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3549 $self->{open_elements}->[-1]->[1] ne 'html') {
3550 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3551 pop @{$self->{open_elements}};
3552 }
3553
3554 push @$active_formatting_elements, ['#marker', '']
3555 if $token->{tag_name} eq 'caption';
3556
3557 !!!insert-element ($token->{tag_name}, $token->{attributes});
3558 $self->{insertion_mode} = {
3559 caption => 'in caption',
3560 colgroup => 'in column group',
3561 tbody => 'in table body',
3562 tfoot => 'in table body',
3563 thead => 'in table body',
3564 }->{$token->{tag_name}};
3565 !!!next-token;
3566 redo B;
3567 } elsif ({
3568 col => 1,
3569 td => 1, th => 1, tr => 1,
3570 }->{$token->{tag_name}}) {
3571 ## Clear back to table context
3572 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3573 $self->{open_elements}->[-1]->[1] ne 'html') {
3574 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3575 pop @{$self->{open_elements}};
3576 }
3577
3578 !!!insert-element ($token->{tag_name} eq 'col' ? 'colgroup' : 'tbody');
3579 $self->{insertion_mode} = $token->{tag_name} eq 'col'
3580 ? 'in column group' : 'in table body';
3581 ## reprocess
3582 redo B;
3583 } elsif ($token->{tag_name} eq 'table') {
3584 ## NOTE: There are code clones for this "table in table"
3585 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3586
3587 ## As if </table>
3588 ## have a table element in table scope
3589 my $i;
3590 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3591 my $node = $self->{open_elements}->[$_];
3592 if ($node->[1] eq 'table') {
3593 $i = $_;
3594 last INSCOPE;
3595 } elsif ({
3596 table => 1, html => 1,
3597 }->{$node->[1]}) {
3598 last INSCOPE;
3599 }
3600 } # INSCOPE
3601 unless (defined $i) {
3602 !!!parse-error (type => 'unmatched end tag:table');
3603 ## Ignore tokens </table><table>
3604 !!!next-token;
3605 redo B;
3606 }
3607
3608 ## generate implied end tags
3609 if ({
3610 dd => 1, dt => 1, li => 1, p => 1,
3611 td => 1, th => 1, tr => 1,
3612 }->{$self->{open_elements}->[-1]->[1]}) {
3613 !!!back-token; # <table>
3614 $token = {type => 'end tag', tag_name => 'table'};
3615 !!!back-token;
3616 $token = {type => 'end tag',
3617 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3618 redo B;
3619 }
3620
3621 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3622 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3623 }
3624
3625 splice @{$self->{open_elements}}, $i;
3626
3627 $self->_reset_insertion_mode;
3628
3629 ## reprocess
3630 redo B;
3631 } else {
3632 #
3633 }
3634 } elsif ($token->{type} eq 'end tag') {
3635 if ($token->{tag_name} eq 'table') {
3636 ## have a table element in table scope
3637 my $i;
3638 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3639 my $node = $self->{open_elements}->[$_];
3640 if ($node->[1] eq $token->{tag_name}) {
3641 $i = $_;
3642 last INSCOPE;
3643 } elsif ({
3644 table => 1, html => 1,
3645 }->{$node->[1]}) {
3646 last INSCOPE;
3647 }
3648 } # INSCOPE
3649 unless (defined $i) {
3650 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3651 ## Ignore the token
3652 !!!next-token;
3653 redo B;
3654 }
3655
3656 ## generate implied end tags
3657 if ({
3658 dd => 1, dt => 1, li => 1, p => 1,
3659 td => 1, th => 1, tr => 1,
3660 }->{$self->{open_elements}->[-1]->[1]}) {
3661 !!!back-token;
3662 $token = {type => 'end tag',
3663 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3664 redo B;
3665 }
3666
3667 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3668 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3669 }
3670
3671 splice @{$self->{open_elements}}, $i;
3672
3673 $self->_reset_insertion_mode;
3674
3675 !!!next-token;
3676 redo B;
3677 } elsif ({
3678 body => 1, caption => 1, col => 1, colgroup => 1,
3679 html => 1, tbody => 1, td => 1, tfoot => 1, th => 1,
3680 thead => 1, tr => 1,
3681 }->{$token->{tag_name}}) {
3682 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3683 ## Ignore the token
3684 !!!next-token;
3685 redo B;
3686 } else {
3687 #
3688 }
3689 } else {
3690 #
3691 }
3692
3693 !!!parse-error (type => 'in table:'.$token->{tag_name});
3694 $in_body->($insert_to_foster);
3695 redo B;
3696 } elsif ($self->{insertion_mode} eq 'in caption') {
3697 if ($token->{type} eq 'character') {
3698 ## NOTE: This is a code clone of "character in body".
3699 $reconstruct_active_formatting_elements->($insert_to_current);
3700
3701 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3702
3703 !!!next-token;
3704 redo B;
3705 } elsif ($token->{type} eq 'comment') {
3706 ## NOTE: This is a code clone of "comment in body".
3707 my $comment = $self->{document}->create_comment ($token->{data});
3708 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3709 !!!next-token;
3710 redo B;
3711 } elsif ($token->{type} eq 'start tag') {
3712 if ({
3713 caption => 1, col => 1, colgroup => 1, tbody => 1,
3714 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3715 }->{$token->{tag_name}}) {
3716 !!!parse-error (type => 'not closed:caption');
3717
3718 ## As if </caption>
3719 ## have a table element in table scope
3720 my $i;
3721 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3722 my $node = $self->{open_elements}->[$_];
3723 if ($node->[1] eq 'caption') {
3724 $i = $_;
3725 last INSCOPE;
3726 } elsif ({
3727 table => 1, html => 1,
3728 }->{$node->[1]}) {
3729 last INSCOPE;
3730 }
3731 } # INSCOPE
3732 unless (defined $i) {
3733 !!!parse-error (type => 'unmatched end tag:caption');
3734 ## Ignore the token
3735 !!!next-token;
3736 redo B;
3737 }
3738
3739 ## generate implied end tags
3740 if ({
3741 dd => 1, dt => 1, li => 1, p => 1,
3742 td => 1, th => 1, tr => 1,
3743 }->{$self->{open_elements}->[-1]->[1]}) {
3744 !!!back-token; # <?>
3745 $token = {type => 'end tag', tag_name => 'caption'};
3746 !!!back-token;
3747 $token = {type => 'end tag',
3748 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3749 redo B;
3750 }
3751
3752 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3753 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3754 }
3755
3756 splice @{$self->{open_elements}}, $i;
3757
3758 $clear_up_to_marker->();
3759
3760 $self->{insertion_mode} = 'in table';
3761
3762 ## reprocess
3763 redo B;
3764 } else {
3765 #
3766 }
3767 } elsif ($token->{type} eq 'end tag') {
3768 if ($token->{tag_name} eq 'caption') {
3769 ## have a table element in table scope
3770 my $i;
3771 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3772 my $node = $self->{open_elements}->[$_];
3773 if ($node->[1] eq $token->{tag_name}) {
3774 $i = $_;
3775 last INSCOPE;
3776 } elsif ({
3777 table => 1, html => 1,
3778 }->{$node->[1]}) {
3779 last INSCOPE;
3780 }
3781 } # INSCOPE
3782 unless (defined $i) {
3783 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3784 ## Ignore the token
3785 !!!next-token;
3786 redo B;
3787 }
3788
3789 ## generate implied end tags
3790 if ({
3791 dd => 1, dt => 1, li => 1, p => 1,
3792 td => 1, th => 1, tr => 1,
3793 }->{$self->{open_elements}->[-1]->[1]}) {
3794 !!!back-token;
3795 $token = {type => 'end tag',
3796 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3797 redo B;
3798 }
3799
3800 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3801 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3802 }
3803
3804 splice @{$self->{open_elements}}, $i;
3805
3806 $clear_up_to_marker->();
3807
3808 $self->{insertion_mode} = 'in table';
3809
3810 !!!next-token;
3811 redo B;
3812 } elsif ($token->{tag_name} eq 'table') {
3813 !!!parse-error (type => 'not closed:caption');
3814
3815 ## As if </caption>
3816 ## have a table element in table scope
3817 my $i;
3818 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3819 my $node = $self->{open_elements}->[$_];
3820 if ($node->[1] eq 'caption') {
3821 $i = $_;
3822 last INSCOPE;
3823 } elsif ({
3824 table => 1, html => 1,
3825 }->{$node->[1]}) {
3826 last INSCOPE;
3827 }
3828 } # INSCOPE
3829 unless (defined $i) {
3830 !!!parse-error (type => 'unmatched end tag:caption');
3831 ## Ignore the token
3832 !!!next-token;
3833 redo B;
3834 }
3835
3836 ## generate implied end tags
3837 if ({
3838 dd => 1, dt => 1, li => 1, p => 1,
3839 td => 1, th => 1, tr => 1,
3840 }->{$self->{open_elements}->[-1]->[1]}) {
3841 !!!back-token; # </table>
3842 $token = {type => 'end tag', tag_name => 'caption'};
3843 !!!back-token;
3844 $token = {type => 'end tag',
3845 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3846 redo B;
3847 }
3848
3849 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3850 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3851 }
3852
3853 splice @{$self->{open_elements}}, $i;
3854
3855 $clear_up_to_marker->();
3856
3857 $self->{insertion_mode} = 'in table';
3858
3859 ## reprocess
3860 redo B;
3861 } elsif ({
3862 body => 1, col => 1, colgroup => 1,
3863 html => 1, tbody => 1, td => 1, tfoot => 1,
3864 th => 1, thead => 1, tr => 1,
3865 }->{$token->{tag_name}}) { ## end tags that cannot be matched while in a caption
3866 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3867 !!!next-token; ## Ignore the token: it must be consumed here, or "redo B" would process the same token again
3868 redo B;
3869 } else {
3870 #
3871 }
3872 } else {
3873 #
3874 }
3875
3876 $in_body->($insert_to_current);
3877 redo B;
3878 } elsif ($self->{insertion_mode} eq 'in column group') {
3879 if ($token->{type} eq 'character') {
3880 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3881 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3882 unless (length $token->{data}) {
3883 !!!next-token;
3884 redo B;
3885 }
3886 }
3887
3888 #
3889 } elsif ($token->{type} eq 'comment') {
3890 my $comment = $self->{document}->create_comment ($token->{data});
3891 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3892 !!!next-token;
3893 redo B;
3894 } elsif ($token->{type} eq 'start tag') {
3895 if ($token->{tag_name} eq 'col') {
3896 !!!insert-element ($token->{tag_name}, $token->{attributes});
3897 pop @{$self->{open_elements}};
3898 !!!next-token;
3899 redo B;
3900 } else {
3901 #
3902 }
3903 } elsif ($token->{type} eq 'end tag') {
3904 if ($token->{tag_name} eq 'colgroup') {
3905 if ($self->{open_elements}->[-1]->[1] eq 'html') {
3906 !!!parse-error (type => 'unmatched end tag:colgroup');
3907 ## Ignore the token
3908 !!!next-token;
3909 redo B;
3910 } else {
3911 pop @{$self->{open_elements}}; # colgroup
3912 $self->{insertion_mode} = 'in table';
3913 !!!next-token;
3914 redo B;
3915 }
3916 } elsif ($token->{tag_name} eq 'col') {
3917 !!!parse-error (type => 'unmatched end tag:col');
3918 ## Ignore the token
3919 !!!next-token;
3920 redo B;
3921 } else {
3922 #
3923 }
3924 } else {
3925 #
3926 }
3927
3928 ## As if </colgroup>
3929 if ($self->{open_elements}->[-1]->[1] eq 'html') {
3930 !!!parse-error (type => 'unmatched end tag:colgroup');
3931 ## Ignore the token
3932 !!!next-token;
3933 redo B;
3934 } else {
3935 pop @{$self->{open_elements}}; # colgroup
3936 $self->{insertion_mode} = 'in table';
3937 ## reprocess
3938 redo B;
3939 }
3940 } elsif ($self->{insertion_mode} eq 'in table body') {
3941 if ($token->{type} eq 'character') {
3942 ## NOTE: This is a "character in table" code clone.
3943 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3944 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3945
3946 unless (length $token->{data}) {
3947 !!!next-token;
3948 redo B;
3949 }
3950 }
3951
3952 !!!parse-error (type => 'in table:#character');
3953
3954 ## As if in body, but insert into foster parent element
3955 ## ISSUE: Spec says that "whenever a node would be inserted
3956 ## into the current node" while characters might not
3957 ## result in a new Text node.
3958 $reconstruct_active_formatting_elements->($insert_to_foster);
3959
3960 if ({
3961 table => 1, tbody => 1, tfoot => 1,
3962 thead => 1, tr => 1,
3963 }->{$self->{open_elements}->[-1]->[1]}) {
3964 # MUST
3965 my $foster_parent_element;
3966 my $next_sibling;
3967 my $prev_sibling;
3968 OE: for (reverse 0..$#{$self->{open_elements}}) {
3969 if ($self->{open_elements}->[$_]->[1] eq 'table') {
3970 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3971 if (defined $parent and $parent->node_type == 1) {
3972 $foster_parent_element = $parent;
3973 $next_sibling = $self->{open_elements}->[$_]->[0];
3974 $prev_sibling = $next_sibling->previous_sibling;
3975 } else {
3976 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
3977 $prev_sibling = $foster_parent_element->last_child;
3978 }
3979 last OE;
3980 }
3981 } # OE
3982 $foster_parent_element = $self->{open_elements}->[0]->[0] and
3983 $prev_sibling = $foster_parent_element->last_child
3984 unless defined $foster_parent_element;
3985 if (defined $prev_sibling and
3986 $prev_sibling->node_type == 3) {
3987 $prev_sibling->manakai_append_text ($token->{data});
3988 } else {
3989 $foster_parent_element->insert_before
3990 ($self->{document}->create_text_node ($token->{data}),
3991 $next_sibling);
3992 }
3993 } else {
3994 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3995 }
3996
3997 !!!next-token;
3998 redo B;
3999 } elsif ($token->{type} eq 'comment') {
4000 ## Copied from 'in table'
4001 my $comment = $self->{document}->create_comment ($token->{data});
4002 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4003 !!!next-token;
4004 redo B;
4005 } elsif ($token->{type} eq 'start tag') {
4006 if ({
4007 tr => 1,
4008 th => 1, td => 1,
4009 }->{$token->{tag_name}}) {
4010 unless ($token->{tag_name} eq 'tr') {
4011 !!!parse-error (type => 'missing start tag:tr');
4012 }
4013
4014 ## Clear back to table body context
4015 while (not {
4016 tbody => 1, tfoot => 1, thead => 1, html => 1,
4017 }->{$self->{open_elements}->[-1]->[1]}) {
4018 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4019 pop @{$self->{open_elements}};
4020 }
4021
4022 $self->{insertion_mode} = 'in row';
4023 if ($token->{tag_name} eq 'tr') {
4024 !!!insert-element ($token->{tag_name}, $token->{attributes});
4025 !!!next-token;
4026 } else {
4027 !!!insert-element ('tr');
4028 ## reprocess
4029 }
4030 redo B;
4031 } elsif ({
4032 caption => 1, col => 1, colgroup => 1,
4033 tbody => 1, tfoot => 1, thead => 1,
4034 }->{$token->{tag_name}}) {
4035 ## have an element in table scope
4036 my $i;
4037 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4038 my $node = $self->{open_elements}->[$_];
4039 if ({
4040 tbody => 1, thead => 1, tfoot => 1,
4041 }->{$node->[1]}) {
4042 $i = $_;
4043 last INSCOPE;
4044 } elsif ({
4045 table => 1, html => 1,
4046 }->{$node->[1]}) {
4047 last INSCOPE;
4048 }
4049 } # INSCOPE
4050 unless (defined $i) {
4051 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4052 ## Ignore the token
4053 !!!next-token;
4054 redo B;
4055 }
4056
4057 ## Clear back to table body context
4058 while (not {
4059 tbody => 1, tfoot => 1, thead => 1, html => 1,
4060 }->{$self->{open_elements}->[-1]->[1]}) {
4061 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4062 pop @{$self->{open_elements}};
4063 }
4064
4065 ## As if </{current node}>
4066 ## have an element in table scope
4067 ## true by definition
4068
4069 ## Clear back to table body context
4070 ## nop by definition
4071
4072 pop @{$self->{open_elements}};
4073 $self->{insertion_mode} = 'in table';
4074 ## reprocess
4075 redo B;
4076 } elsif ($token->{tag_name} eq 'table') {
4077 ## NOTE: This is a code clone of "table in table"
4078 !!!parse-error (type => 'not closed:table');
4079
4080 ## As if </table>
4081 ## have a table element in table scope
4082 my $i;
4083 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4084 my $node = $self->{open_elements}->[$_];
4085 if ($node->[1] eq 'table') {
4086 $i = $_;
4087 last INSCOPE;
4088 } elsif ({
4089 table => 1, html => 1,
4090 }->{$node->[1]}) {
4091 last INSCOPE;
4092 }
4093 } # INSCOPE
4094 unless (defined $i) {
4095 !!!parse-error (type => 'unmatched end tag:table');
4096 ## Ignore tokens </table><table>
4097 !!!next-token;
4098 redo B;
4099 }
4100
4101 ## generate implied end tags
4102 if ({
4103 dd => 1, dt => 1, li => 1, p => 1,
4104 td => 1, th => 1, tr => 1,
4105 }->{$self->{open_elements}->[-1]->[1]}) {
4106 !!!back-token; # <table>
4107 $token = {type => 'end tag', tag_name => 'table'};
4108 !!!back-token;
4109 $token = {type => 'end tag',
4110 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4111 redo B;
4112 }
4113
4114 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4115 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4116 }
4117
4118 splice @{$self->{open_elements}}, $i;
4119
4120 $self->_reset_insertion_mode;
4121
4122 ## reprocess
4123 redo B;
4124 } else {
4125 #
4126 }
4127 } elsif ($token->{type} eq 'end tag') {
4128 if ({
4129 tbody => 1, tfoot => 1, thead => 1,
4130 }->{$token->{tag_name}}) {
4131 ## have an element in table scope
4132 my $i;
4133 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4134 my $node = $self->{open_elements}->[$_];
4135 if ($node->[1] eq $token->{tag_name}) {
4136 $i = $_;
4137 last INSCOPE;
4138 } elsif ({
4139 table => 1, html => 1,
4140 }->{$node->[1]}) {
4141 last INSCOPE;
4142 }
4143 } # INSCOPE
4144 unless (defined $i) {
4145 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4146 ## Ignore the token
4147 !!!next-token;
4148 redo B;
4149 }
4150
4151 ## Clear back to table body context
4152 while (not {
4153 tbody => 1, tfoot => 1, thead => 1, html => 1,
4154 }->{$self->{open_elements}->[-1]->[1]}) {
4155 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4156 pop @{$self->{open_elements}};
4157 }
4158
4159 pop @{$self->{open_elements}};
4160 $self->{insertion_mode} = 'in table';
4161 !!!next-token;
4162 redo B;
4163 } elsif ($token->{tag_name} eq 'table') {
4164 ## have an element in table scope
4165 my $i;
4166 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4167 my $node = $self->{open_elements}->[$_];
4168 if ({
4169 tbody => 1, thead => 1, tfoot => 1,
4170 }->{$node->[1]}) {
4171 $i = $_;
4172 last INSCOPE;
4173 } elsif ({
4174 table => 1, html => 1,
4175 }->{$node->[1]}) {
4176 last INSCOPE;
4177 }
4178 } # INSCOPE
4179 unless (defined $i) {
4180 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4181 ## Ignore the token
4182 !!!next-token;
4183 redo B;
4184 }
4185
4186 ## Clear back to table body context
4187 while (not {
4188 tbody => 1, tfoot => 1, thead => 1, html => 1,
4189 }->{$self->{open_elements}->[-1]->[1]}) {
4190 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4191 pop @{$self->{open_elements}};
4192 }
4193
4194 ## As if </{current node}>
4195 ## have an element in table scope
4196 ## true by definition
4197
4198 ## Clear back to table body context
4199 ## nop by definition
4200
4201 pop @{$self->{open_elements}};
4202 $self->{insertion_mode} = 'in table';
4203 ## reprocess
4204 redo B;
4205 } elsif ({
4206 body => 1, caption => 1, col => 1, colgroup => 1,
4207 html => 1, td => 1, th => 1, tr => 1,
4208 }->{$token->{tag_name}}) {
4209 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4210 ## Ignore the token
4211 !!!next-token;
4212 redo B;
4213 } else {
4214 #
4215 }
4216 } else {
4217 #
4218 }
4219
4220 ## As if in table
4221 !!!parse-error (type => 'in table:'.$token->{tag_name});
4222 $in_body->($insert_to_foster);
4223 redo B;
4224 } elsif ($self->{insertion_mode} eq 'in row') {
4225 if ($token->{type} eq 'character') {
4226 ## NOTE: This is a "character in table" code clone.
4227 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4228 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4229
4230 unless (length $token->{data}) {
4231 !!!next-token;
4232 redo B;
4233 }
4234 }
4235
4236 !!!parse-error (type => 'in table:#character');
4237
4238 ## As if in body, but insert into foster parent element
4239 ## ISSUE: Spec says that "whenever a node would be inserted
4240 ## into the current node" while characters might not
4241 ## result in a new Text node.
4242 $reconstruct_active_formatting_elements->($insert_to_foster);
4243
4244 if ({
4245 table => 1, tbody => 1, tfoot => 1,
4246 thead => 1, tr => 1,
4247 }->{$self->{open_elements}->[-1]->[1]}) {
4248 # MUST
4249 my $foster_parent_element;
4250 my $next_sibling;
4251 my $prev_sibling;
4252 OE: for (reverse 0..$#{$self->{open_elements}}) {
4253 if ($self->{open_elements}->[$_]->[1] eq 'table') {
4254 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4255 if (defined $parent and $parent->node_type == 1) {
4256 $foster_parent_element = $parent;
4257 $next_sibling = $self->{open_elements}->[$_]->[0];
4258 $prev_sibling = $next_sibling->previous_sibling;
4259 } else {
4260 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
4261 $prev_sibling = $foster_parent_element->last_child;
4262 }
4263 last OE;
4264 }
4265 } # OE
4266 $foster_parent_element = $self->{open_elements}->[0]->[0] and
4267 $prev_sibling = $foster_parent_element->last_child
4268 unless defined $foster_parent_element;
4269 if (defined $prev_sibling and
4270 $prev_sibling->node_type == 3) {
4271 $prev_sibling->manakai_append_text ($token->{data});
4272 } else {
4273 $foster_parent_element->insert_before
4274 ($self->{document}->create_text_node ($token->{data}),
4275 $next_sibling);
4276 }
4277 } else {
4278 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4279 }
4280
4281 !!!next-token;
4282 redo B;
4283 } elsif ($token->{type} eq 'comment') {
4284 ## Copied from 'in table'
4285 my $comment = $self->{document}->create_comment ($token->{data});
4286 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4287 !!!next-token;
4288 redo B;
4289 } elsif ($token->{type} eq 'start tag') {
4290 if ($token->{tag_name} eq 'th' or
4291 $token->{tag_name} eq 'td') {
4292 ## Clear back to table row context
4293 while (not {
4294 tr => 1, html => 1,
4295 }->{$self->{open_elements}->[-1]->[1]}) {
4296 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4297 pop @{$self->{open_elements}};
4298 }
4299
4300 !!!insert-element ($token->{tag_name}, $token->{attributes});
4301 $self->{insertion_mode} = 'in cell';
4302
4303 push @$active_formatting_elements, ['#marker', ''];
4304
4305 !!!next-token;
4306 redo B;
4307 } elsif ({
4308 caption => 1, col => 1, colgroup => 1,
4309 tbody => 1, tfoot => 1, thead => 1, tr => 1,
4310 }->{$token->{tag_name}}) {
4311 ## As if </tr>
4312 ## have an element in table scope
4313 my $i;
4314 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4315 my $node = $self->{open_elements}->[$_];
4316 if ($node->[1] eq 'tr') {
4317 $i = $_;
4318 last INSCOPE;
4319 } elsif ({
4320 table => 1, html => 1,
4321 }->{$node->[1]}) {
4322 last INSCOPE;
4323 }
4324 } # INSCOPE
4325 unless (defined $i) {
4326 !!!parse-error (type => 'unmacthed end tag:'.$token->{tag_name});
4327 ## Ignore the token
4328 !!!next-token;
4329 redo B;
4330 }
4331
4332 ## Clear back to table row context
4333 while (not {
4334 tr => 1, html => 1,
4335 }->{$self->{open_elements}->[-1]->[1]}) {
4336 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4337 pop @{$self->{open_elements}};
4338 }
4339
4340 pop @{$self->{open_elements}}; # tr
4341 $self->{insertion_mode} = 'in table body';
4342 ## reprocess
4343 redo B;
4344 } elsif ($token->{tag_name} eq 'table') {
4345 ## NOTE: This is a code clone of "table in table"
4346 !!!parse-error (type => 'not closed:table');
4347
4348 ## As if </table>
4349 ## have a table element in table scope
4350 my $i;
4351 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4352 my $node = $self->{open_elements}->[$_];
4353 if ($node->[1] eq 'table') {
4354 $i = $_;
4355 last INSCOPE;
4356 } elsif ({
4357 table => 1, html => 1,
4358 }->{$node->[1]}) {
4359 last INSCOPE;
4360 }
4361 } # INSCOPE
4362 unless (defined $i) {
4363 !!!parse-error (type => 'unmatched end tag:table');
4364 ## Ignore tokens </table><table>
4365 !!!next-token;
4366 redo B;
4367 }
4368
4369 ## generate implied end tags
4370 if ({
4371 dd => 1, dt => 1, li => 1, p => 1,
4372 td => 1, th => 1, tr => 1,
4373 }->{$self->{open_elements}->[-1]->[1]}) {
4374 !!!back-token; # <table>
4375 $token = {type => 'end tag', tag_name => 'table'};
4376 !!!back-token;
4377 $token = {type => 'end tag',
4378 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4379 redo B;
4380 }
4381
4382 if ($self->{open_elements}->[-1]->[1] ne 'table') {
4383 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4384 }
4385
4386 splice @{$self->{open_elements}}, $i;
4387
4388 $self->_reset_insertion_mode;
4389
4390 ## reprocess
4391 redo B;
4392 } else {
4393 #
4394 }
4395 } elsif ($token->{type} eq 'end tag') {
4396 if ($token->{tag_name} eq 'tr') {
4397 ## have an element in table scope
4398 my $i;
4399 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4400 my $node = $self->{open_elements}->[$_];
4401 if ($node->[1] eq $token->{tag_name}) {
4402 $i = $_;
4403 last INSCOPE;
4404 } elsif ({
4405 table => 1, html => 1,
4406 }->{$node->[1]}) {
4407 last INSCOPE;
4408 }
4409 } # INSCOPE
4410 unless (defined $i) {
4411 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4412 ## Ignore the token
4413 !!!next-token;
4414 redo B;
4415 }
4416
4417 ## Clear back to table row context
4418 while (not {
4419 tr => 1, html => 1,
4420 }->{$self->{open_elements}->[-1]->[1]}) {
4421 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4422 pop @{$self->{open_elements}};
4423 }
4424
4425 pop @{$self->{open_elements}}; # tr
4426 $self->{insertion_mode} = 'in table body';
4427 !!!next-token;
4428 redo B;
4429 } elsif ($token->{tag_name} eq 'table') {
4430 ## As if </tr>
4431 ## have an element in table scope
4432 my $i;
4433 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4434 my $node = $self->{open_elements}->[$_];
4435 if ($node->[1] eq 'tr') {
4436 $i = $_;
4437 last INSCOPE;
4438 } elsif ({
4439 table => 1, html => 1,
4440 }->{$node->[1]}) {
4441 last INSCOPE;
4442 }
4443 } # INSCOPE
4444 unless (defined $i) {
4445 !!!parse-error (type => 'unmatched end tag:'.$token->{type});
4446 ## Ignore the token
4447 !!!next-token;
4448 redo B;
4449 }
4450
4451 ## Clear back to table row context
4452 while (not {
4453 tr => 1, html => 1,
4454 }->{$self->{open_elements}->[-1]->[1]}) {
4455 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4456 pop @{$self->{open_elements}};
4457 }
4458
4459 pop @{$self->{open_elements}}; # tr
4460 $self->{insertion_mode} = 'in table body';
4461 ## reprocess
4462 redo B;
4463 } elsif ({
4464 tbody => 1, tfoot => 1, thead => 1,
4465 }->{$token->{tag_name}}) {
4466 ## have an element in table scope
4467 my $i;
4468 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4469 my $node = $self->{open_elements}->[$_];
4470 if ($node->[1] eq $token->{tag_name}) {
4471 $i = $_;
4472 last INSCOPE;
4473 } elsif ({
4474 table => 1, html => 1,
4475 }->{$node->[1]}) {
4476 last INSCOPE;
4477 }
4478 } # INSCOPE
4479 unless (defined $i) {
4480 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4481 ## Ignore the token
4482 !!!next-token;
4483 redo B;
4484 }
4485
4486 ## As if </tr>
4487 ## have an element in table scope
4488 my $i;
4489 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4490 my $node = $self->{open_elements}->[$_];
4491 if ($node->[1] eq 'tr') {
4492 $i = $_;
4493 last INSCOPE;
4494 } elsif ({
4495 table => 1, html => 1,
4496 }->{$node->[1]}) {
4497 last INSCOPE;
4498 }
4499 } # INSCOPE
4500 unless (defined $i) {
4501 !!!parse-error (type => 'unmatched end tag:tr');
4502 ## Ignore the token
4503 !!!next-token;
4504 redo B;
4505 }
4506
4507 ## Clear back to table row context
4508 while (not {
4509 tr => 1, html => 1,
4510 }->{$self->{open_elements}->[-1]->[1]}) {
4511 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4512 pop @{$self->{open_elements}};
4513 }
4514
4515 pop @{$self->{open_elements}}; # tr
4516 $self->{insertion_mode} = 'in table body';
4517 ## reprocess
4518 redo B;
4519 } elsif ({
4520 body => 1, caption => 1, col => 1,
4521 colgroup => 1, html => 1, td => 1, th => 1,
4522 }->{$token->{tag_name}}) {
4523 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4524 ## Ignore the token
4525 !!!next-token;
4526 redo B;
4527 } else {
4528 #
4529 }
4530 } else {
4531 #
4532 }
4533
4534 ## As if in table
4535 !!!parse-error (type => 'in table:'.$token->{tag_name});
4536 $in_body->($insert_to_foster);
4537 redo B;
4538 } elsif ($self->{insertion_mode} eq 'in cell') {
4539 if ($token->{type} eq 'character') {
4540 ## NOTE: This is a code clone of "character in body".
4541 $reconstruct_active_formatting_elements->($insert_to_current);
4542
4543 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4544
4545 !!!next-token;
4546 redo B;
4547 } elsif ($token->{type} eq 'comment') {
4548 ## NOTE: This is a code clone of "comment in body".
4549 my $comment = $self->{document}->create_comment ($token->{data});
4550 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4551 !!!next-token;
4552 redo B;
4553 } elsif ($token->{type} eq 'start tag') {
4554 if ({
4555 caption => 1, col => 1, colgroup => 1,
4556 tbody => 1, td => 1, tfoot => 1, th => 1,
4557 thead => 1, tr => 1,
4558 }->{$token->{tag_name}}) {
4559 ## have an element in table scope
4560 my $tn;
4561 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4562 my $node = $self->{open_elements}->[$_];
4563 if ($node->[1] eq 'td' or $node->[1] eq 'th') {
4564 $tn = $node->[1];
4565 last INSCOPE;
4566 } elsif ({
4567 table => 1, html => 1,
4568 }->{$node->[1]}) {
4569 last INSCOPE;
4570 }
4571 } # INSCOPE
4572 unless (defined $tn) {
4573 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4574 ## Ignore the token
4575 !!!next-token;
4576 redo B;
4577 }
4578
4579 ## Close the cell
4580 !!!back-token; # <?>
4581 $token = {type => 'end tag', tag_name => $tn};
4582 redo B;
4583 } else {
4584 #
4585 }
4586 } elsif ($token->{type} eq 'end tag') {
4587 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
4588 ## have an element in table scope
4589 my $i;
4590 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4591 my $node = $self->{open_elements}->[$_];
4592 if ($node->[1] eq $token->{tag_name}) {
4593 $i = $_;
4594 last INSCOPE;
4595 } elsif ({
4596 table => 1, html => 1,
4597 }->{$node->[1]}) {
4598 last INSCOPE;
4599 }
4600 } # INSCOPE
4601 unless (defined $i) {
4602 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4603 ## Ignore the token
4604 !!!next-token;
4605 redo B;
4606 }
4607
4608 ## generate implied end tags
4609 if ({
4610 dd => 1, dt => 1, li => 1, p => 1,
4611 td => ($token->{tag_name} eq 'th'),
4612 th => ($token->{tag_name} eq 'td'),
4613 tr => 1,
4614 }->{$self->{open_elements}->[-1]->[1]}) {
4615 !!!back-token;
4616 $token = {type => 'end tag',
4617 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4618 redo B;
4619 }
4620
4621 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
4622 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
4623 }
4624
4625 splice @{$self->{open_elements}}, $i;
4626
4627 $clear_up_to_marker->();
4628
4629 $self->{insertion_mode} = 'in row';
4630
4631 !!!next-token;
4632 redo B;
4633 } elsif ({
4634 body => 1, caption => 1, col => 1,
4635 colgroup => 1, html => 1,
4636 }->{$token->{tag_name}}) {
4637 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4638 ## Ignore the token
4639 !!!next-token;
4640 redo B;
4641 } elsif ({
4642 table => 1, tbody => 1, tfoot => 1,
4643 thead => 1, tr => 1,
4644 }->{$token->{tag_name}}) {
4645 ## have an element in table scope
4646 my $i;
4647 my $tn;
4648 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4649 my $node = $self->{open_elements}->[$_];
4650 if ($node->[1] eq $token->{tag_name}) {
4651 $i = $_;
4652 last INSCOPE;
4653 } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
4654 $tn = $node->[1];
4655 ## NOTE: There is exactly one |td| or |th| element
4656 ## in scope in the stack of open elements by definition.
4657 } elsif ({
4658 table => 1, html => 1,
4659 }->{$node->[1]}) {
4660 last INSCOPE;
4661 }
4662 } # INSCOPE
4663 unless (defined $i) {
4664 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4665 ## Ignore the token
4666 !!!next-token;
4667 redo B;
4668 }
4669
4670 ## Close the cell
4671 !!!back-token; # </?>
4672 $token = {type => 'end tag', tag_name => $tn};
4673 redo B;
4674 } else {
4675 #
4676 }
4677 } else {
4678 #
4679 }
4680
4681 $in_body->($insert_to_current);
4682 redo B;
4683 } elsif ($self->{insertion_mode} eq 'in select') {
4684 if ($token->{type} eq 'character') {
4685 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4686 !!!next-token;
4687 redo B;
4688 } elsif ($token->{type} eq 'comment') {
4689 my $comment = $self->{document}->create_comment ($token->{data});
4690 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4691 !!!next-token;
4692 redo B;
4693 } elsif ($token->{type} eq 'start tag') {
4694 if ($token->{tag_name} eq 'option') {
4695 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4696 ## As if </option>
4697 pop @{$self->{open_elements}};
4698 }
4699
4700 !!!insert-element ($token->{tag_name}, $token->{attributes});
4701 !!!next-token;
4702 redo B;
4703 } elsif ($token->{tag_name} eq 'optgroup') {
4704 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4705 ## As if </option>
4706 pop @{$self->{open_elements}};
4707 }
4708
4709 if ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4710 ## As if </optgroup>
4711 pop @{$self->{open_elements}};
4712 }
4713
4714 !!!insert-element ($token->{tag_name}, $token->{attributes});
4715 !!!next-token;
4716 redo B;
4717 } elsif ($token->{tag_name} eq 'select') {
4718 !!!parse-error (type => 'not closed:select');
4719 ## As if </select> instead
4720 ## have an element in table scope
4721 my $i;
4722 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4723 my $node = $self->{open_elements}->[$_];
4724 if ($node->[1] eq $token->{tag_name}) {
4725 $i = $_;
4726 last INSCOPE;
4727 } elsif ({
4728 table => 1, html => 1,
4729 }->{$node->[1]}) {
4730 last INSCOPE;
4731 }
4732 } # INSCOPE
4733 unless (defined $i) {
4734 !!!parse-error (type => 'unmatched end tag:select');
4735 ## Ignore the token
4736 !!!next-token;
4737 redo B;
4738 }
4739
4740 splice @{$self->{open_elements}}, $i;
4741
4742 $self->_reset_insertion_mode;
4743
4744 !!!next-token;
4745 redo B;
4746 } else {
4747 #
4748 }
4749 } elsif ($token->{type} eq 'end tag') {
4750 if ($token->{tag_name} eq 'optgroup') {
4751 if ($self->{open_elements}->[-1]->[1] eq 'option' and
4752 $self->{open_elements}->[-2]->[1] eq 'optgroup') {
4753 ## As if </option>
4754 splice @{$self->{open_elements}}, -2;
4755 } elsif ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4756 pop @{$self->{open_elements}};
4757 } else {
4758 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4759 ## Ignore the token
4760 }
4761 !!!next-token;
4762 redo B;
4763 } elsif ($token->{tag_name} eq 'option') {
4764 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4765 pop @{$self->{open_elements}};
4766 } else {
4767 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4768 ## Ignore the token
4769 }
4770 !!!next-token;
4771 redo B;
4772 } elsif ($token->{tag_name} eq 'select') {
4773 ## have an element in table scope
4774 my $i;
4775 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4776 my $node = $self->{open_elements}->[$_];
4777 if ($node->[1] eq $token->{tag_name}) {
4778 $i = $_;
4779 last INSCOPE;
4780 } elsif ({
4781 table => 1, html => 1,
4782 }->{$node->[1]}) {
4783 last INSCOPE;
4784 }
4785 } # INSCOPE
4786 unless (defined $i) {
4787 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4788 ## Ignore the token
4789 !!!next-token;
4790 redo B;
4791 }
4792
4793 splice @{$self->{open_elements}}, $i;
4794
4795 $self->_reset_insertion_mode;
4796
4797 !!!next-token;
4798 redo B;
4799 } elsif ({
4800 caption => 1, table => 1, tbody => 1,
4801 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
4802 }->{$token->{tag_name}}) {
4803 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4804
4805 ## have an element in table scope
4806 my $i;
4807 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4808 my $node = $self->{open_elements}->[$_];
4809 if ($node->[1] eq $token->{tag_name}) {
4810 $i = $_;
4811 last INSCOPE;
4812 } elsif ({
4813 table => 1, html => 1,
4814 }->{$node->[1]}) {
4815 last INSCOPE;
4816 }
4817 } # INSCOPE
4818 unless (defined $i) {
4819 ## Ignore the token
4820 !!!next-token;
4821 redo B;
4822 }
4823
4824 ## As if </select>
4825 ## have an element in table scope
4826 undef $i;
4827 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4828 my $node = $self->{open_elements}->[$_];
4829 if ($node->[1] eq 'select') {
4830 $i = $_;
4831 last INSCOPE;
4832 } elsif ({
4833 table => 1, html => 1,
4834 }->{$node->[1]}) {
4835 last INSCOPE;
4836 }
4837 } # INSCOPE
4838 unless (defined $i) {
4839 !!!parse-error (type => 'unmatched end tag:select');
4840 ## Ignore the </select> token
4841 !!!next-token; ## TODO: ok?
4842 redo B;
4843 }
4844
4845 splice @{$self->{open_elements}}, $i;
4846
4847 $self->_reset_insertion_mode;
4848
4849 ## reprocess
4850 redo B;
4851 } else {
4852 #
4853 }
4854 } else {
4855 #
4856 }
4857
4858 !!!parse-error (type => 'in select:'.$token->{tag_name});
4859 ## Ignore the token
4860 !!!next-token;
4861 redo B;
4862 } elsif ($self->{insertion_mode} eq 'after body') {
4863 if ($token->{type} eq 'character') {
4864 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4865 ## As if in body
4866 $reconstruct_active_formatting_elements->($insert_to_current);
4867
4868 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4869
4870 unless (length $token->{data}) {
4871 !!!next-token;
4872 redo B;
4873 }
4874 }
4875
4876 #
4877 !!!parse-error (type => 'after body:#'.$token->{type});
4878 } elsif ($token->{type} eq 'comment') {
4879 my $comment = $self->{document}->create_comment ($token->{data});
4880 $self->{open_elements}->[0]->[0]->append_child ($comment);
4881 !!!next-token;
4882 redo B;
4883 } elsif ($token->{type} eq 'start tag') {
4884 !!!parse-error (type => 'after body:'.$token->{tag_name});
4885 #
4886 } elsif ($token->{type} eq 'end tag') {
4887 if ($token->{tag_name} eq 'html') {
4888 if (defined $self->{inner_html_node}) {
4889 !!!parse-error (type => 'unmatched end tag:html');
4890 ## Ignore the token
4891 !!!next-token;
4892 redo B;
4893 } else {
4894 $phase = 'trailing end';
4895 !!!next-token;
4896 redo B;
4897 }
4898 } else {
4899 !!!parse-error (type => 'after body:/'.$token->{tag_name});
4900 }
4901 } else {
4902 !!!parse-error (type => 'after body:#'.$token->{type});
4903 }
4904
4905 $self->{insertion_mode} = 'in body';
4906 ## reprocess
4907 redo B;
4908 } elsif ($self->{insertion_mode} eq 'in frameset') {
4909 if ($token->{type} eq 'character') {
4910 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4911 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4912
4913 unless (length $token->{data}) {
4914 !!!next-token;
4915 redo B;
4916 }
4917 }
4918
4919 #
4920 } elsif ($token->{type} eq 'comment') {
4921 my $comment = $self->{document}->create_comment ($token->{data});
4922 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4923 !!!next-token;
4924 redo B;
4925 } elsif ($token->{type} eq 'start tag') {
4926 if ($token->{tag_name} eq 'frameset') {
4927 !!!insert-element ($token->{tag_name}, $token->{attributes});
4928 !!!next-token;
4929 redo B;
4930 } elsif ($token->{tag_name} eq 'frame') {
4931 !!!insert-element ($token->{tag_name}, $token->{attributes});
4932 pop @{$self->{open_elements}};
4933 !!!next-token;
4934 redo B;
4935 } elsif ($token->{tag_name} eq 'noframes') {
4936 $in_body->($insert_to_current);
4937 redo B;
4938 } else {
4939 #
4940 }
4941 } elsif ($token->{type} eq 'end tag') {
4942 if ($token->{tag_name} eq 'frameset') {
4943 if ($self->{open_elements}->[-1]->[1] eq 'html' and
4944 @{$self->{open_elements}} == 1) {
4945 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4946 ## Ignore the token
4947 !!!next-token;
4948 } else {
4949 pop @{$self->{open_elements}};
4950 !!!next-token;
4951 }
4952
4953 ## if not inner_html and
4954 if ($self->{open_elements}->[-1]->[1] ne 'frameset') {
4955 $self->{insertion_mode} = 'after frameset';
4956 }
4957 redo B;
4958 } else {
4959 #
4960 }
4961 } else {
4962 #
4963 }
4964
4965 if (defined $token->{tag_name}) {
4966 !!!parse-error (type => 'in frameset:'.$token->{tag_name});
4967 } else {
4968 !!!parse-error (type => 'in frameset:#'.$token->{type});
4969 }
4970 ## Ignore the token
4971 !!!next-token;
4972 redo B;
4973 } elsif ($self->{insertion_mode} eq 'after frameset') {
4974 if ($token->{type} eq 'character') {
4975 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4976 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4977
4978 unless (length $token->{data}) {
4979 !!!next-token;
4980 redo B;
4981 }
4982 }
4983
4984 #
4985 } elsif ($token->{type} eq 'comment') {
4986 my $comment = $self->{document}->create_comment ($token->{data});
4987 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4988 !!!next-token;
4989 redo B;
4990 } elsif ($token->{type} eq 'start tag') {
4991 if ($token->{tag_name} eq 'noframes') {
4992 $in_body->($insert_to_current);
4993 redo B;
4994 } else {
4995 #
4996 }
4997 } elsif ($token->{type} eq 'end tag') {
4998 if ($token->{tag_name} eq 'html') {
4999 $phase = 'trailing end';
5000 !!!next-token;
5001 redo B;
5002 } else {
5003 #
5004 }
5005 } else {
5006 #
5007 }
5008
5009 if (defined $token->{tag_name}) {
5010 !!!parse-error (type => 'after frameset:'.$token->{tag_name});
5011 } else {
5012 !!!parse-error (type => 'after frameset:#'.$token->{type});
5013 }
5014 ## Ignore the token
5015 !!!next-token;
5016 redo B;
5017
5018 ## ISSUE: An issue in spec there
5019 } else {
5020 die "$0: $self->{insertion_mode}: Unknown insertion mode";
5021 }
5022 }
5023 } elsif ($phase eq 'trailing end') {
5024 ## The states of the main phase are still preserved # MUST
5025
5026 if ($token->{type} eq 'DOCTYPE') {
5027 !!!parse-error (type => 'after html:#DOCTYPE');
5028 ## Ignore the token
5029 !!!next-token;
5030 redo B;
5031 } elsif ($token->{type} eq 'comment') {
5032 my $comment = $self->{document}->create_comment ($token->{data});
5033 $self->{document}->append_child ($comment);
5034 !!!next-token;
5035 redo B;
5036 } elsif ($token->{type} eq 'character') {
5037 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5038 my $data = $1;
5039 ## As if in the main phase.
5040 ## NOTE: The insertion mode in the main phase
5041 ## just before the phase has been changed to the trailing
5042 ## end phase is either "after body" or "after frameset".
5043 $reconstruct_active_formatting_elements->($insert_to_current)
5044 if $phase eq 'main';
5045
5046 $self->{open_elements}->[-1]->[0]->manakai_append_text ($data);
5047
5048 unless (length $token->{data}) {
5049 !!!next-token;
5050 redo B;
5051 }
5052 }
5053
5054 !!!parse-error (type => 'after html:#character');
5055 $phase = 'main';
5056 ## reprocess
5057 redo B;
5058 } elsif ($token->{type} eq 'start tag' or
5059 $token->{type} eq 'end tag') {
5060 !!!parse-error (type => 'after html:'.$token->{tag_name});
5061 $phase = 'main';
5062 ## reprocess
5063 redo B;
5064 } elsif ($token->{type} eq 'end-of-file') {
5065 ## Stop parsing
5066 last B;
5067 } else {
5068 die "$0: $token->{type}: Unknown token";
5069 }
5070 }
5071 } # B
5072
5073 ## Stop parsing # MUST
5074
5075 ## TODO: script stuffs
5076 } # _tree_construct_main
5077
## Replaces the content of a node with the result of parsing a string
## as HTML (the |innerHTML| setter).
##
## Invoked as |$class->set_inner_html ($node, $string, $onerror)|.
##
##   $node    - A Document (node type 9) or an Element (node type 1).
##   $string  - The markup to parse.  It is read through a reference to
##              the caller's argument, so it is not copied.
##   $onerror - Optional code reference that receives parse errors.  In
##              the element case it is called with name/value pairs
##              including |line| and |column|; when it is omitted in
##              that case, errors are reported by |warn|.
##
## Dies if |$node| is neither a Document nor an Element.
sub set_inner_html ($$$) {
  my ($class, $node) = @_;
  my $s = \$_[2];        # alias of the caller's string argument
  my $onerror = $_[3];

  my $nt = $node->node_type;
  unless ($nt == 9 or $nt == 1) {
    die "$0: |set_inner_html| is not defined for node of type $nt";
  }

  if ($nt == 9) { # Document
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## Take a snapshot of the child list first, since it is modified
    ## while the children are being removed.
    my @old_children = @{$node->child_nodes};
    $node->remove_child ($_) for @old_children;

    ## Step 3, 4, 5 # MUST
    $class->parse_string ($$s => $node, $onerror);
  } else { # Element
    ## TODO: If non-html element

    ## NOTE: Most of this code is copied from |parse_string|

    ## Step 1 # MUST
    ## The fragment is parsed into a scratch HTML document.
    my $this_doc = $node->owner_document;
    my $doc = $this_doc->implementation->create_document;
    $doc->manakai_is_html (1);
    my $p = $class->new;
    $p->{document} = $doc;

    ## Step 9 # MUST
    ## Input stream state shared by the two closures below.
    my $i = 0;
    my $line = 1;
    my $column = 0;
    $p->{set_next_input_character} = sub {
      my $self = shift;

      ## Keep the history of previous input characters at a fixed
      ## length: drop the oldest entry, then record the current one.
      pop @{$self->{prev_input_character}};
      unshift @{$self->{prev_input_character}}, $self->{next_input_character};

      ## End of input is represented by -1.
      if ($i >= length $$s) {
        $self->{next_input_character} = -1;
        return;
      }

      $self->{next_input_character} = ord substr ($$s, $i, 1);
      $i++;
      $column++;

      if ($self->{next_input_character} == 0x000A) { # LF
        $line++;
        $column = 0;
      } elsif ($self->{next_input_character} == 0x000D) { # CR
        ## CR LF and a lone CR are both reported as a single LF.
        $i++ if substr ($$s, $i, 1) eq "\x0A";
        $self->{next_input_character} = 0x000A; # LF # MUST
        $line++;
        $column = 0;
      } elsif ($self->{next_input_character} > 0x10FFFF) {
        $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
      } elsif ($self->{next_input_character} == 0x0000) { # NULL
        !!!parse-error (type => 'NULL');
        $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
      }
    };
    $p->{prev_input_character} = [-1, -1, -1];
    $p->{next_input_character} = -1;

    ## Error reporting: use the caller's handler if there is one,
    ## otherwise warn.  The current position is appended to each report.
    my $ponerror = $onerror || sub {
      my (%opt) = @_;
      warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
    };
    $p->{parse_error} = sub {
      $ponerror->(@_, line => $line, column => $column);
    };

    $p->_initialize_tokenizer;
    $p->_initialize_tree_constructor;

    ## Step 2
    ## The content model flag is chosen by the context element's name.
    my $node_ln = $node->local_name;
    my %content_model_of = (
      title => 'RCDATA',
      textarea => 'RCDATA',
      style => 'CDATA',
      script => 'CDATA',
      xmp => 'CDATA',
      iframe => 'CDATA',
      noembed => 'CDATA',
      noframes => 'CDATA',
      noscript => 'CDATA',
      plaintext => 'PLAINTEXT',
    );
    $p->{content_model_flag} = $content_model_of{$node_ln} || 'PCDATA';
    ## ISSUE: What is "the name of the element"? local name?

    $p->{inner_html_node} = [$node, $node_ln];

    ## Step 4
    my $root = $doc->create_element_ns
        ('http://www.w3.org/1999/xhtml', [undef, 'html']);

    ## Step 5 # MUST
    $doc->append_child ($root);

    ## Step 6 # MUST
    push @{$p->{open_elements}}, [$root, 'html'];

    $p->{head_element} = undef;

    ## Step 7 # MUST
    $p->_reset_insertion_mode;

    ## Step 8 # MUST
    ## Look for the nearest XHTML |form| element, starting from the
    ## context node itself and walking up through its ancestors.
    for (my $anode = $node; defined $anode; $anode = $anode->parent_node) {
      next unless $anode->node_type == 1;
      my $nsuri = $anode->namespace_uri;
      next unless defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml';
      next unless $anode->local_name eq 'form'; ## TODO: case?
      $p->{form_element} = $anode;
      last;
    }

    ## Step 3 # MUST
    ## Step 10 # MUST
    {
      ## The macro below is written in terms of |$self|.
      my $self = $p;
      !!!next-token;
    }
    $p->_tree_construction_main;

    ## Step 11 # MUST
    my @old_children = @{$node->child_nodes};
    $node->remove_child ($_) for @old_children;
    ## ISSUE: mutation events? read-only?

    ## Step 12 # MUST
    ## Move the parsed nodes from the scratch root into the context node.
    my @new_children = @{$root->child_nodes};
    for my $new_child (@new_children) {
      $this_doc->adopt_node ($new_child);
      $node->append_child ($new_child);
    }
    ## ISSUE: mutation events?

    $p->_terminate_tree_constructor;
  }
} # set_inner_html
5231
5232 } # tree construction stage
5233
## Serializes the descendants of a node as an HTML fragment (the
## |innerHTML| getter).
##
## Invoked as |$class->get_inner_html ($node, $on_error)|.
##
##   $node     - The node whose children are serialized.  The node
##               itself is not part of the output.
##   $on_error - Optional code reference.  It is called with each
##               descendant node whose node type is not handled here.
##
## Returns a reference to the serialized string.
sub get_inner_html ($$$) {
  my (undef, $node, $on_error) = @_;

  ## Elements whose text content is written out without escaping.
  my %cdata_element = (
    style => 1, script => 1, xmp => 1, iframe => 1,
    noembed => 1, noframes => 1, noscript => 1,
  );

  ## Elements that are written as a start tag only: no children and no
  ## end tag are output for them.
  my %void_element = (
    area => 1, base => 1, basefont => 1, bgsound => 1,
    br => 1, col => 1, embed => 1, frame => 1, hr => 1,
    img => 1, input => 1, link => 1, meta => 1, param => 1,
    spacer => 1, wbr => 1,
  );

  ## Step 1
  my $s = '';

  ## Text is output unescaped if |$node| or one of its ancestors is an
  ## XHTML element listed in |%cdata_element|.
  my $in_cdata;
  my $parent = $node;
  while (defined $parent) {
    if ($parent->node_type == 1) {
      ## The namespace URI is undefined for elements in no namespace.
      my $nsuri = $parent->namespace_uri;
      if (defined $nsuri and
          $nsuri eq 'http://www.w3.org/1999/xhtml' and
          $cdata_element{$parent->local_name}) { ## TODO: case thingy
        $in_cdata = 1;
      }
    }
    $parent = $parent->parent_node;
  }

  ## Step 2
  ## |@node| is a work queue processed from the front.  It holds nodes
  ## and plain strings; a string is either an end tag to output or the
  ## marker |cdata-out|, which ends unescaped output.
  my @node = @{$node->child_nodes};
  C: while (@node) {
    my $child = shift @node;
    unless (ref $child) {
      if ($child eq 'cdata-out') {
        $in_cdata = 0;
      } else {
        $s .= $child; # end tag
      }
      next C;
    }

    my $nt = $child->node_type;
    if ($nt == 1) { # Element
      my $tag_name = lc $child->tag_name; ## ISSUE: Definition of "lowercase"
      $s .= '<' . $tag_name;

      ## ISSUE: Non-html elements

      my @attrs = @{$child->attributes}; # sort order MUST be stable
      for my $attr (@attrs) { # order is implementation dependent
        my $attr_name = lc $attr->name; ## ISSUE: Definition of "lowercase"
        $s .= ' ' . $attr_name . '="';
        my $attr_value = $attr->value;
        ## escape
        $attr_value =~ s/&/&amp;/g;
        $attr_value =~ s/</&lt;/g;
        $attr_value =~ s/>/&gt;/g;
        $attr_value =~ s/"/&quot;/g;
        $s .= $attr_value . '"';
      }
      $s .= '>';

      next C if $void_element{$tag_name};

      ## The marker is queued behind the element's children and end
      ## tag, so escaping resumes once the element has been written.
      if (not $in_cdata and $cdata_element{$tag_name}) {
        unshift @node, 'cdata-out';
        $in_cdata = 1;
      }

      unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';
    } elsif ($nt == 3 or $nt == 4) { # Text or CDATASection
      if ($in_cdata) {
        $s .= $child->data;
      } else {
        my $value = $child->data;
        $value =~ s/&/&amp;/g;
        $value =~ s/</&lt;/g;
        $value =~ s/>/&gt;/g;
        $value =~ s/"/&quot;/g;
        $s .= $value;
      }
    } elsif ($nt == 8) { # Comment
      $s .= '<!--' . $child->data . '-->';
    } elsif ($nt == 10) { # DocumentType
      $s .= '<!DOCTYPE ' . $child->name . '>';
    } elsif ($nt == 5) { # entrefs
      ## The children are put at the front of the queue so that they
      ## are written where the entity reference occurs.  Appending them
      ## to the end would output them after the following siblings and
      ## after the end tags of the enclosing elements.
      unshift @node, @{$child->child_nodes};
    } else {
      $on_error->($child) if defined $on_error;
    }
    ## ISSUE: This code does not support PIs.
  } # C

  ## Step 3
  return \$s;
} # get_inner_html
5330
5331 1;
5332 # $Date: 2007/06/23 12:21:01 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24