/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.61 - (show annotations) (download) (as text)
Sun Nov 4 04:15:06 2007 UTC (17 years ago) by wakaba
Branch: MAIN
Changes since 1.60: +23 -4 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/ChangeLog	4 Nov 2007 04:14:45 -0000
	* HTML.pm.src: Support for application cache selection algorithm
	callback.

2007-11-04  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.60 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 ## ISSUE:
6 ## var doc = implementation.createDocument (null, null, null);
7 ## doc.write ('');
8 ## alert (doc.compatMode);
9
10 ## ISSUE: HTML5 revision 967 says that the encoding layer MUST NOT
11 ## strip BOM and the HTML layer MUST ignore it. Whether we can do it
12 ## is not yet clear.
13 ## "{U+FEFF}..." in UTF-16BE/UTF-16LE is three or four characters?
14 ## "{U+FEFF}..." in GB18030?
15
## Start-tag names for which a "/" immediately followed by ">" is accepted
## silently.  The tokenizer's "/" branches look the current tag name up
## here and raise a 'nestc' parse error when it is absent.
16 my $permitted_slash_tag_name = {
17 base => 1,
18 link => 1,
19 meta => 1,
20 hr => 1,
21 br => 1,
22 img=> 1,
23 embed => 1,
24 param => 1,
25 area => 1,
26 col => 1,
27 input => 1,
28 };
29
## Replacement table for code points 0x80..0x9F.  Each key maps to the
## code point to use instead; the values follow the Windows-1252
## assignments, with U+FFFD REPLACEMENT CHARACTER for the five positions
## that Windows-1252 leaves unassigned (0x81, 0x8D, 0x8F, 0x90, 0x9D).
## NOTE(review): presumably consulted when resolving numeric character
## references; the consumer is not in this part of the file.
30 my $c1_entity_char = {
31 0x80 => 0x20AC,
32 0x81 => 0xFFFD,
33 0x82 => 0x201A,
34 0x83 => 0x0192,
35 0x84 => 0x201E,
36 0x85 => 0x2026,
37 0x86 => 0x2020,
38 0x87 => 0x2021,
39 0x88 => 0x02C6,
40 0x89 => 0x2030,
41 0x8A => 0x0160,
42 0x8B => 0x2039,
43 0x8C => 0x0152,
44 0x8D => 0xFFFD,
45 0x8E => 0x017D,
46 0x8F => 0xFFFD,
47 0x90 => 0xFFFD,
48 0x91 => 0x2018,
49 0x92 => 0x2019,
50 0x93 => 0x201C,
51 0x94 => 0x201D,
52 0x95 => 0x2022,
53 0x96 => 0x2013,
54 0x97 => 0x2014,
55 0x98 => 0x02DC,
56 0x99 => 0x2122,
57 0x9A => 0x0161,
58 0x9B => 0x203A,
59 0x9C => 0x0153,
60 0x9D => 0xFFFD,
61 0x9E => 0x017E,
62 0x9F => 0x0178,
63 }; # $c1_entity_char
64
## Membership table (lowercase element name => 1) for the element
## category this file calls "special".
## NOTE(review): the code that consults this table is outside this part
## of the file; presumably the tree constructor -- confirm.
65 my $special_category = {
66 address => 1, area => 1, base => 1, basefont => 1, bgsound => 1,
67 blockquote => 1, body => 1, br => 1, center => 1, col => 1, colgroup => 1,
68 dd => 1, dir => 1, div => 1, dl => 1, dt => 1, embed => 1, fieldset => 1,
69 form => 1, frame => 1, frameset => 1, h1 => 1, h2 => 1, h3 => 1,
70 h4 => 1, h5 => 1, h6 => 1, head => 1, hr => 1, iframe => 1, image => 1,
71 img => 1, input => 1, isindex => 1, li => 1, link => 1, listing => 1,
72 menu => 1, meta => 1, noembed => 1, noframes => 1, noscript => 1,
73 ol => 1, optgroup => 1, option => 1, p => 1, param => 1, plaintext => 1,
74 pre => 1, script => 1, select => 1, spacer => 1, style => 1, tbody => 1,
75 textarea => 1, tfoot => 1, thead => 1, title => 1, tr => 1, ul => 1, wbr => 1,
76 };
## Membership table (lowercase element name => 1) for the element
## category this file calls "scoping".
## NOTE(review): the consumer is outside this part of the file.
77 my $scoping_category = {
78 button => 1, caption => 1, html => 1, marquee => 1, object => 1,
79 table => 1, td => 1, th => 1,
80 };
## Membership table (lowercase element name => 1) for the element
## category this file calls "formatting".  The key is |strike| (the
## HTML element name); it was previously misspelled, so |strike|
## elements were never found in this table.
## NOTE(review): the consumer is outside this part of the file.
81 my $formatting_category = {
82 a => 1, b => 1, big => 1, em => 1, font => 1, i => 1, nobr => 1,
83 s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1,
84 };
85 # $phrasing_category: all other elements
86
## parse_string ($invocant, $string, $document[, $onerror])
##
## Creates a fresh parser via |new|, installs an input reader and an
## error reporter for $string, runs the tokenizer / tree constructor
## steps, and returns the document object.
##
##   $string   - text to parse; accessed through a reference, not copied.
##   $document - stored as $self->{document} and returned at the end.
##   $onerror  - optional code reference called with the error's
##               key/value pairs plus |line| and |column|.  When
##               omitted, errors are reported with |warn|.
87 sub parse_string ($$$;$) {
88 my $self = shift->new;
89 my $s = \$_[0];
90 $self->{document} = $_[1];
91
92 ## NOTE: |set_inner_html| copies most of this method's code
93
94 my $i = 0; # offset of the next unread character in $$s
95 my $line = 1;
96 my $column = 0;
## Input reader: moves the current character into the history list and
## loads the next one (as a code point) into {next_input_character}.
97 $self->{set_next_input_character} = sub {
98 my $self = shift;
99
## Keep the history at a fixed length, most recent character first.
100 pop @{$self->{prev_input_character}};
101 unshift @{$self->{prev_input_character}}, $self->{next_input_character};
102
## End of input is signalled by -1.  The assignment yields -1, which is
## true, so the |return| after |and| is always reached.
103 $self->{next_input_character} = -1 and return if $i >= length $$s;
104 $self->{next_input_character} = ord substr $$s, $i++, 1;
105 $column++;
106
107 if ($self->{next_input_character} == 0x000A) { # LF
108 $line++;
109 $column = 0;
110 } elsif ($self->{next_input_character} == 0x000D) { # CR
## A CR, or a CR LF pair, is delivered as a single LF.
111 $i++ if substr ($$s, $i, 1) eq "\x0A";
112 $self->{next_input_character} = 0x000A; # LF # MUST
113 $line++;
114 $column = 0;
115 } elsif ($self->{next_input_character} > 0x10FFFF) {
116 $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
117 } elsif ($self->{next_input_character} == 0x0000) { # NULL
118 !!!parse-error (type => 'NULL');
119 $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
120 }
121 };
## Three characters of history; -1 means "no character".
122 $self->{prev_input_character} = [-1, -1, -1];
123 $self->{next_input_character} = -1;
124
## Default error handler: print the error type and position with |warn|.
125 my $onerror = $_[2] || sub {
126 my (%opt) = @_;
127 warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
128 };
## Wrapper that appends the current line/column to every reported error.
129 $self->{parse_error} = sub {
130 $onerror->(@_, line => $line, column => $column);
131 };
132
133 $self->_initialize_tokenizer;
134 $self->_initialize_tree_constructor;
135 $self->_construct_tree;
136 $self->_terminate_tree_constructor;
137
138 return $self->{document};
139 } # parse_string
140
## new ($class)
##
## Returns a new parser object blessed into $class with default
## callbacks installed: an input reader that always reports end of
## input (-1), and no-op |parse_error| and
## |application_cache_selection| handlers.
141 sub new ($) {
142 my $class = shift;
143 my $self = bless {}, $class;
## NOTE(review): this closure refers to the outer $self and is stored
## inside the hash $self points to, which forms a reference cycle until
## the slot is overwritten (|parse_string| replaces it).  Confirm
## whether objects created by |new| alone are expected to be freed.
144 $self->{set_next_input_character} = sub {
145 $self->{next_input_character} = -1;
146 };
147 $self->{parse_error} = sub {
148 #
149 };
150 $self->{application_cache_selection} = sub {
151 #
152 };
153 return $self;
154 } # new
155
## Content model flags.  The CM_* values are single bits; the
## *_CONTENT_MODEL values are the combinations stored in
## $self->{content_model} and tested with |&| by the tokenizer.
156 sub CM_ENTITY () { 0b001 } # & markup in data
157 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
158 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
159
160 sub PLAINTEXT_CONTENT_MODEL () { 0 }
161 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
162 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
163 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
164
## Tokenizer state identifiers.  $self->{state} holds one of these
## values and |_get_next_token| compares it with |==|, so every value
## must be distinct.
165 sub DATA_STATE () { 0 }
166 sub ENTITY_DATA_STATE () { 1 }
167 sub TAG_OPEN_STATE () { 2 }
168 sub CLOSE_TAG_OPEN_STATE () { 3 }
169 sub TAG_NAME_STATE () { 4 }
170 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
171 sub ATTRIBUTE_NAME_STATE () { 6 }
172 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
173 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
174 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
175 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
176 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
177 sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
178 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
179 sub COMMENT_START_STATE () { 14 }
180 sub COMMENT_START_DASH_STATE () { 15 }
181 sub COMMENT_STATE () { 16 }
182 sub COMMENT_END_STATE () { 17 }
183 sub COMMENT_END_DASH_STATE () { 18 }
184 sub BOGUS_COMMENT_STATE () { 19 }
185 sub DOCTYPE_STATE () { 20 }
186 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
187 sub DOCTYPE_NAME_STATE () { 22 }
188 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
189 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
190 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
191 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
192 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
193 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
194 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
195 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
196 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
197 sub BOGUS_DOCTYPE_STATE () { 32 }
198
## Token type identifiers, stored in a token's {type} member and
## compared with |==|.
199 sub DOCTYPE_TOKEN () { 1 }
200 sub COMMENT_TOKEN () { 2 }
201 sub START_TAG_TOKEN () { 3 }
202 sub END_TAG_TOKEN () { 4 }
203 sub END_OF_FILE_TOKEN () { 5 }
204 sub CHARACTER_TOKEN () { 6 }
205
## Insertion mode constants.  The *_IMS values are single-bit group
## flags (bit 2 and above); each *_IM value is built from zero or more
## group flags OR-ed with a value in the two lowest bits, so a mode can
## be tested for group membership with |&|.
## NOTE(review): the code that uses these is outside this part of the
## file.
206 sub AFTER_HTML_IMS () { 0b100 }
207 sub HEAD_IMS () { 0b1000 }
208 sub BODY_IMS () { 0b10000 }
209 sub BODY_TABLE_IMS () { 0b100000 }
210 sub TABLE_IMS () { 0b1000000 }
211 sub ROW_IMS () { 0b10000000 }
212 sub BODY_AFTER_IMS () { 0b100000000 }
213 sub FRAME_IMS () { 0b1000000000 }
214
215 sub AFTER_HTML_BODY_IM () { AFTER_HTML_IMS | BODY_AFTER_IMS }
216 sub AFTER_HTML_FRAMESET_IM () { AFTER_HTML_IMS | FRAME_IMS }
217 sub IN_HEAD_IM () { HEAD_IMS | 0b00 }
218 sub IN_HEAD_NOSCRIPT_IM () { HEAD_IMS | 0b01 }
219 sub AFTER_HEAD_IM () { HEAD_IMS | 0b10 }
220 sub BEFORE_HEAD_IM () { HEAD_IMS | 0b11 }
221 sub IN_BODY_IM () { BODY_IMS }
222 sub IN_CELL_IM () { BODY_IMS | BODY_TABLE_IMS | 0b01 }
223 sub IN_CAPTION_IM () { BODY_IMS | BODY_TABLE_IMS | 0b10 }
224 sub IN_ROW_IM () { TABLE_IMS | ROW_IMS | 0b01 }
225 sub IN_TABLE_BODY_IM () { TABLE_IMS | ROW_IMS | 0b10 }
226 sub IN_TABLE_IM () { TABLE_IMS }
227 sub AFTER_BODY_IM () { BODY_AFTER_IMS }
228 sub IN_FRAMESET_IM () { FRAME_IMS | 0b01 }
229 sub AFTER_FRAMESET_IM () { FRAME_IMS | 0b10 }
## The two modes below carry no group flag.
230 sub IN_SELECT_IM () { 0b01 }
231 sub IN_COLUMN_GROUP_IM () { 0b10 }
232
233 ## Implementations MUST act as if they used the state machine in the spec.
234
## _initialize_tokenizer ($self)
##
## Resets the tokenizer: data state, PCDATA content model, no token or
## attribute under construction, empty token queue.  The
## |!!!next-input-character| line is a preprocessor macro.
## NOTE(review): presumably it expands to a call of
## $self->{set_next_input_character} that loads the first character;
## the macro definition is not in this file.
235 sub _initialize_tokenizer ($) {
236 my $self = shift;
237 $self->{state} = DATA_STATE; # MUST
238 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
239 undef $self->{current_token}; # start tag, end tag, comment, or DOCTYPE
240 undef $self->{current_attribute};
241 undef $self->{last_emitted_start_tag_name};
242 undef $self->{last_attribute_value_state};
243 $self->{char} = [];
244 # $self->{next_input_character}
245 !!!next-input-character;
246 $self->{token} = []; # queue that |_get_next_token| shifts from first
247 # $self->{escape}
248 } # _initialize_tokenizer
249
250 ## A token has:
251 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
252 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
253 ## ->{name} (DOCTYPE_TOKEN)
254 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
255 ## ->{public_identifier} (DOCTYPE_TOKEN)
256 ## ->{system_identifier} (DOCTYPE_TOKEN)
257 ## ->{correct} == 1 or 0 (DOCTYPE_TOKEN)
258 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
259 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
260
261 ## Emitted token MUST immediately be handled by the tree construction state.
262
263 ## Before each step, UA MAY check to see if either one of the scripts in
264 ## "list of scripts that will execute as soon as possible" or the first
265 ## script in the "list of scripts that will execute asynchronously",
266 ## has completed loading. If one has, then it MUST be executed
267 ## and removed from the list.
268
269 ## NOTE: HTML5 "Writing HTML documents" section, applied to
270 ## documents and not to user agents and conformance checkers,
271 ## contains some requirements that are not detected by the
272 ## parsing algorithm:
273 ## - Some requirements on character encoding declarations. ## TODO
274 ## - "Elements MUST NOT contain content that their content model disallows."
275 ## ... Some are parse error, some are not (will be reported by c.c.).
276 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
277 ## - Text (in elements, attributes, and comments) SHOULD NOT contain
278 ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
279
280 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
281 ## be detected by the HTML5 parsing algorithm:
282 ## - Text,
283
284 sub _get_next_token ($) {
285 my $self = shift;
286 if (@{$self->{token}}) {
287 return shift @{$self->{token}};
288 }
289
290 A: {
291 if ($self->{state} == DATA_STATE) {
292 if ($self->{next_input_character} == 0x0026) { # &
293 if ($self->{content_model} & CM_ENTITY) { # PCDATA | RCDATA
294 $self->{state} = ENTITY_DATA_STATE;
295 !!!next-input-character;
296 redo A;
297 } else {
298 #
299 }
300 } elsif ($self->{next_input_character} == 0x002D) { # -
301 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
302 unless ($self->{escape}) {
303 if ($self->{prev_input_character}->[0] == 0x002D and # -
304 $self->{prev_input_character}->[1] == 0x0021 and # !
305 $self->{prev_input_character}->[2] == 0x003C) { # <
306 $self->{escape} = 1;
307 }
308 }
309 }
310
311 #
312 } elsif ($self->{next_input_character} == 0x003C) { # <
313 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
314 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
315 not $self->{escape})) {
316 $self->{state} = TAG_OPEN_STATE;
317 !!!next-input-character;
318 redo A;
319 } else {
320 #
321 }
322 } elsif ($self->{next_input_character} == 0x003E) { # >
323 if ($self->{escape} and
324 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
325 if ($self->{prev_input_character}->[0] == 0x002D and # -
326 $self->{prev_input_character}->[1] == 0x002D) { # -
327 delete $self->{escape};
328 }
329 }
330
331 #
332 } elsif ($self->{next_input_character} == -1) {
333 !!!emit ({type => END_OF_FILE_TOKEN});
334 last A; ## TODO: ok?
335 }
336 # Anything else
337 my $token = {type => CHARACTER_TOKEN,
338 data => chr $self->{next_input_character}};
339 ## Stay in the data state
340 !!!next-input-character;
341
342 !!!emit ($token);
343
344 redo A;
345 } elsif ($self->{state} == ENTITY_DATA_STATE) {
346 ## (cannot happen in CDATA state)
347
348 my $token = $self->_tokenize_attempt_to_consume_an_entity (0);
349
350 $self->{state} = DATA_STATE;
351 # next-input-character is already done
352
353 unless (defined $token) {
354 !!!emit ({type => CHARACTER_TOKEN, data => '&'});
355 } else {
356 !!!emit ($token);
357 }
358
359 redo A;
360 } elsif ($self->{state} == TAG_OPEN_STATE) {
361 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
362 if ($self->{next_input_character} == 0x002F) { # /
363 !!!next-input-character;
364 $self->{state} = CLOSE_TAG_OPEN_STATE;
365 redo A;
366 } else {
367 ## reconsume
368 $self->{state} = DATA_STATE;
369
370 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
371
372 redo A;
373 }
374 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
375 if ($self->{next_input_character} == 0x0021) { # !
376 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
377 !!!next-input-character;
378 redo A;
379 } elsif ($self->{next_input_character} == 0x002F) { # /
380 $self->{state} = CLOSE_TAG_OPEN_STATE;
381 !!!next-input-character;
382 redo A;
383 } elsif (0x0041 <= $self->{next_input_character} and
384 $self->{next_input_character} <= 0x005A) { # A..Z
385 $self->{current_token}
386 = {type => START_TAG_TOKEN,
387 tag_name => chr ($self->{next_input_character} + 0x0020)};
388 $self->{state} = TAG_NAME_STATE;
389 !!!next-input-character;
390 redo A;
391 } elsif (0x0061 <= $self->{next_input_character} and
392 $self->{next_input_character} <= 0x007A) { # a..z
393 $self->{current_token} = {type => START_TAG_TOKEN,
394 tag_name => chr ($self->{next_input_character})};
395 $self->{state} = TAG_NAME_STATE;
396 !!!next-input-character;
397 redo A;
398 } elsif ($self->{next_input_character} == 0x003E) { # >
399 !!!parse-error (type => 'empty start tag');
400 $self->{state} = DATA_STATE;
401 !!!next-input-character;
402
403 !!!emit ({type => CHARACTER_TOKEN, data => '<>'});
404
405 redo A;
406 } elsif ($self->{next_input_character} == 0x003F) { # ?
407 !!!parse-error (type => 'pio');
408 $self->{state} = BOGUS_COMMENT_STATE;
409 ## $self->{next_input_character} is intentionally left as is
410 redo A;
411 } else {
412 !!!parse-error (type => 'bare stago');
413 $self->{state} = DATA_STATE;
414 ## reconsume
415
416 !!!emit ({type => CHARACTER_TOKEN, data => '<'});
417
418 redo A;
419 }
420 } else {
421 die "$0: $self->{content_model} in tag open";
422 }
423 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
424 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
425 if (defined $self->{last_emitted_start_tag_name}) {
426 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
427 my @next_char;
428 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
429 push @next_char, $self->{next_input_character};
430 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
431 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
432 if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
433 !!!next-input-character;
434 next TAGNAME;
435 } else {
436 $self->{next_input_character} = shift @next_char; # reconsume
437 !!!back-next-input-character (@next_char);
438 $self->{state} = DATA_STATE;
439
440 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
441
442 redo A;
443 }
444 }
445 push @next_char, $self->{next_input_character};
446
447 unless ($self->{next_input_character} == 0x0009 or # HT
448 $self->{next_input_character} == 0x000A or # LF
449 $self->{next_input_character} == 0x000B or # VT
450 $self->{next_input_character} == 0x000C or # FF
451 $self->{next_input_character} == 0x0020 or # SP
452 $self->{next_input_character} == 0x003E or # >
453 $self->{next_input_character} == 0x002F or # /
454 $self->{next_input_character} == -1) {
455 $self->{next_input_character} = shift @next_char; # reconsume
456 !!!back-next-input-character (@next_char);
457 $self->{state} = DATA_STATE;
458 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
459 redo A;
460 } else {
461 $self->{next_input_character} = shift @next_char;
462 !!!back-next-input-character (@next_char);
463 # and consume...
464 }
465 } else {
466 ## No start tag token has ever been emitted
467 # next-input-character is already done
468 $self->{state} = DATA_STATE;
469 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
470 redo A;
471 }
472 }
473
474 if (0x0041 <= $self->{next_input_character} and
475 $self->{next_input_character} <= 0x005A) { # A..Z
476 $self->{current_token} = {type => END_TAG_TOKEN,
477 tag_name => chr ($self->{next_input_character} + 0x0020)};
478 $self->{state} = TAG_NAME_STATE;
479 !!!next-input-character;
480 redo A;
481 } elsif (0x0061 <= $self->{next_input_character} and
482 $self->{next_input_character} <= 0x007A) { # a..z
483 $self->{current_token} = {type => END_TAG_TOKEN,
484 tag_name => chr ($self->{next_input_character})};
485 $self->{state} = TAG_NAME_STATE;
486 !!!next-input-character;
487 redo A;
488 } elsif ($self->{next_input_character} == 0x003E) { # >
489 !!!parse-error (type => 'empty end tag');
490 $self->{state} = DATA_STATE;
491 !!!next-input-character;
492 redo A;
493 } elsif ($self->{next_input_character} == -1) {
494 !!!parse-error (type => 'bare etago');
495 $self->{state} = DATA_STATE;
496 # reconsume
497
498 !!!emit ({type => CHARACTER_TOKEN, data => '</'});
499
500 redo A;
501 } else {
502 !!!parse-error (type => 'bogus end tag');
503 $self->{state} = BOGUS_COMMENT_STATE;
504 ## $self->{next_input_character} is intentionally left as is
505 redo A;
506 }
507 } elsif ($self->{state} == TAG_NAME_STATE) {
508 if ($self->{next_input_character} == 0x0009 or # HT
509 $self->{next_input_character} == 0x000A or # LF
510 $self->{next_input_character} == 0x000B or # VT
511 $self->{next_input_character} == 0x000C or # FF
512 $self->{next_input_character} == 0x0020) { # SP
513 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
514 !!!next-input-character;
515 redo A;
516 } elsif ($self->{next_input_character} == 0x003E) { # >
517 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
518 $self->{current_token}->{first_start_tag}
519 = not defined $self->{last_emitted_start_tag_name};
520 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
521 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
522 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
523 if ($self->{current_token}->{attributes}) {
524 !!!parse-error (type => 'end tag attribute');
525 }
526 } else {
527 die "$0: $self->{current_token}->{type}: Unknown token type";
528 }
529 $self->{state} = DATA_STATE;
530 !!!next-input-character;
531
532 !!!emit ($self->{current_token}); # start tag or end tag
533
534 redo A;
535 } elsif (0x0041 <= $self->{next_input_character} and
536 $self->{next_input_character} <= 0x005A) { # A..Z
537 $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
538 # start tag or end tag
539 ## Stay in this state
540 !!!next-input-character;
541 redo A;
542 } elsif ($self->{next_input_character} == -1) {
543 !!!parse-error (type => 'unclosed tag');
544 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
545 $self->{current_token}->{first_start_tag}
546 = not defined $self->{last_emitted_start_tag_name};
547 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
548 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
549 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
550 if ($self->{current_token}->{attributes}) {
551 !!!parse-error (type => 'end tag attribute');
552 }
553 } else {
554 die "$0: $self->{current_token}->{type}: Unknown token type";
555 }
556 $self->{state} = DATA_STATE;
557 # reconsume
558
559 !!!emit ($self->{current_token}); # start tag or end tag
560
561 redo A;
562 } elsif ($self->{next_input_character} == 0x002F) { # /
563 !!!next-input-character;
564 if ($self->{next_input_character} == 0x003E and # >
565 $self->{current_token}->{type} == START_TAG_TOKEN and
566 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
567 # permitted slash
568 #
569 } else {
570 !!!parse-error (type => 'nestc');
571 }
572 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
573 # next-input-character is already done
574 redo A;
575 } else {
576 $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
577 # start tag or end tag
578 ## Stay in the state
579 !!!next-input-character;
580 redo A;
581 }
582 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
583 if ($self->{next_input_character} == 0x0009 or # HT
584 $self->{next_input_character} == 0x000A or # LF
585 $self->{next_input_character} == 0x000B or # VT
586 $self->{next_input_character} == 0x000C or # FF
587 $self->{next_input_character} == 0x0020) { # SP
588 ## Stay in the state
589 !!!next-input-character;
590 redo A;
591 } elsif ($self->{next_input_character} == 0x003E) { # >
592 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
593 $self->{current_token}->{first_start_tag}
594 = not defined $self->{last_emitted_start_tag_name};
595 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
596 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
597 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
598 if ($self->{current_token}->{attributes}) {
599 !!!parse-error (type => 'end tag attribute');
600 }
601 } else {
602 die "$0: $self->{current_token}->{type}: Unknown token type";
603 }
604 $self->{state} = DATA_STATE;
605 !!!next-input-character;
606
607 !!!emit ($self->{current_token}); # start tag or end tag
608
609 redo A;
610 } elsif (0x0041 <= $self->{next_input_character} and
611 $self->{next_input_character} <= 0x005A) { # A..Z
612 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
613 value => ''};
614 $self->{state} = ATTRIBUTE_NAME_STATE;
615 !!!next-input-character;
616 redo A;
617 } elsif ($self->{next_input_character} == 0x002F) { # /
618 !!!next-input-character;
619 if ($self->{next_input_character} == 0x003E and # >
620 $self->{current_token}->{type} == START_TAG_TOKEN and
621 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
622 # permitted slash
623 #
624 } else {
625 !!!parse-error (type => 'nestc');
626 }
627 ## Stay in the state
628 # next-input-character is already done
629 redo A;
630 } elsif ($self->{next_input_character} == -1) {
631 !!!parse-error (type => 'unclosed tag');
632 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
633 $self->{current_token}->{first_start_tag}
634 = not defined $self->{last_emitted_start_tag_name};
635 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
636 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
637 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
638 if ($self->{current_token}->{attributes}) {
639 !!!parse-error (type => 'end tag attribute');
640 }
641 } else {
642 die "$0: $self->{current_token}->{type}: Unknown token type";
643 }
644 $self->{state} = DATA_STATE;
645 # reconsume
646
647 !!!emit ($self->{current_token}); # start tag or end tag
648
649 redo A;
650 } else {
651 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
652 value => ''};
653 $self->{state} = ATTRIBUTE_NAME_STATE;
654 !!!next-input-character;
655 redo A;
656 }
657 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
658 my $before_leave = sub {
659 if (exists $self->{current_token}->{attributes} # start tag or end tag
660 ->{$self->{current_attribute}->{name}}) { # MUST
661 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name});
662 ## Discard $self->{current_attribute} # MUST
663 } else {
664 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
665 = $self->{current_attribute};
666 }
667 }; # $before_leave
668
669 if ($self->{next_input_character} == 0x0009 or # HT
670 $self->{next_input_character} == 0x000A or # LF
671 $self->{next_input_character} == 0x000B or # VT
672 $self->{next_input_character} == 0x000C or # FF
673 $self->{next_input_character} == 0x0020) { # SP
674 $before_leave->();
675 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
676 !!!next-input-character;
677 redo A;
678 } elsif ($self->{next_input_character} == 0x003D) { # =
679 $before_leave->();
680 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
681 !!!next-input-character;
682 redo A;
683 } elsif ($self->{next_input_character} == 0x003E) { # >
684 $before_leave->();
685 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
686 $self->{current_token}->{first_start_tag}
687 = not defined $self->{last_emitted_start_tag_name};
688 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
689 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
690 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
691 if ($self->{current_token}->{attributes}) {
692 !!!parse-error (type => 'end tag attribute');
693 }
694 } else {
695 die "$0: $self->{current_token}->{type}: Unknown token type";
696 }
697 $self->{state} = DATA_STATE;
698 !!!next-input-character;
699
700 !!!emit ($self->{current_token}); # start tag or end tag
701
702 redo A;
703 } elsif (0x0041 <= $self->{next_input_character} and
704 $self->{next_input_character} <= 0x005A) { # A..Z
705 $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
706 ## Stay in the state
707 !!!next-input-character;
708 redo A;
709 } elsif ($self->{next_input_character} == 0x002F) { # /
710 $before_leave->();
711 !!!next-input-character;
712 if ($self->{next_input_character} == 0x003E and # >
713 $self->{current_token}->{type} == START_TAG_TOKEN and
714 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
715 # permitted slash
716 #
717 } else {
718 !!!parse-error (type => 'nestc');
719 }
720 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
721 # next-input-character is already done
722 redo A;
723 } elsif ($self->{next_input_character} == -1) {
724 !!!parse-error (type => 'unclosed tag');
725 $before_leave->();
726 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
727 $self->{current_token}->{first_start_tag}
728 = not defined $self->{last_emitted_start_tag_name};
729 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
730 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
731 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
732 if ($self->{current_token}->{attributes}) {
733 !!!parse-error (type => 'end tag attribute');
734 }
735 } else {
736 die "$0: $self->{current_token}->{type}: Unknown token type";
737 }
738 $self->{state} = DATA_STATE;
739 # reconsume
740
741 !!!emit ($self->{current_token}); # start tag or end tag
742
743 redo A;
744 } else {
745 $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
746 ## Stay in the state
747 !!!next-input-character;
748 redo A;
749 }
750 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
751 if ($self->{next_input_character} == 0x0009 or # HT
752 $self->{next_input_character} == 0x000A or # LF
753 $self->{next_input_character} == 0x000B or # VT
754 $self->{next_input_character} == 0x000C or # FF
755 $self->{next_input_character} == 0x0020) { # SP
756 ## Stay in the state
757 !!!next-input-character;
758 redo A;
759 } elsif ($self->{next_input_character} == 0x003D) { # =
760 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
761 !!!next-input-character;
762 redo A;
763 } elsif ($self->{next_input_character} == 0x003E) { # >
764 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
765 $self->{current_token}->{first_start_tag}
766 = not defined $self->{last_emitted_start_tag_name};
767 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
768 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
769 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
770 if ($self->{current_token}->{attributes}) {
771 !!!parse-error (type => 'end tag attribute');
772 }
773 } else {
774 die "$0: $self->{current_token}->{type}: Unknown token type";
775 }
776 $self->{state} = DATA_STATE;
777 !!!next-input-character;
778
779 !!!emit ($self->{current_token}); # start tag or end tag
780
781 redo A;
782 } elsif (0x0041 <= $self->{next_input_character} and
783 $self->{next_input_character} <= 0x005A) { # A..Z
784 $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
785 value => ''};
786 $self->{state} = ATTRIBUTE_NAME_STATE;
787 !!!next-input-character;
788 redo A;
789 } elsif ($self->{next_input_character} == 0x002F) { # /
790 !!!next-input-character;
791 if ($self->{next_input_character} == 0x003E and # >
792 $self->{current_token}->{type} == START_TAG_TOKEN and
793 $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
794 # permitted slash
795 #
796 } else {
797 !!!parse-error (type => 'nestc');
798 ## TODO: Different error type for <aa / bb> than <aa/>
799 }
800 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
801 # next-input-character is already done
802 redo A;
803 } elsif ($self->{next_input_character} == -1) {
804 !!!parse-error (type => 'unclosed tag');
805 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
806 $self->{current_token}->{first_start_tag}
807 = not defined $self->{last_emitted_start_tag_name};
808 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
809 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
810 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
811 if ($self->{current_token}->{attributes}) {
812 !!!parse-error (type => 'end tag attribute');
813 }
814 } else {
815 die "$0: $self->{current_token}->{type}: Unknown token type";
816 }
817 $self->{state} = DATA_STATE;
818 # reconsume
819
820 !!!emit ($self->{current_token}); # start tag or end tag
821
822 redo A;
823 } else {
824 $self->{current_attribute} = {name => chr ($self->{next_input_character}),
825 value => ''};
826 $self->{state} = ATTRIBUTE_NAME_STATE;
827 !!!next-input-character;
828 redo A;
829 }
830 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
831 if ($self->{next_input_character} == 0x0009 or # HT
832 $self->{next_input_character} == 0x000A or # LF
833 $self->{next_input_character} == 0x000B or # VT
834 $self->{next_input_character} == 0x000C or # FF
835 $self->{next_input_character} == 0x0020) { # SP
836 ## Stay in the state
837 !!!next-input-character;
838 redo A;
839 } elsif ($self->{next_input_character} == 0x0022) { # "
840 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
841 !!!next-input-character;
842 redo A;
843 } elsif ($self->{next_input_character} == 0x0026) { # &
844 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
845 ## reconsume
846 redo A;
847 } elsif ($self->{next_input_character} == 0x0027) { # '
848 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
849 !!!next-input-character;
850 redo A;
851 } elsif ($self->{next_input_character} == 0x003E) { # >
852 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
853 $self->{current_token}->{first_start_tag}
854 = not defined $self->{last_emitted_start_tag_name};
855 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
856 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
857 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
858 if ($self->{current_token}->{attributes}) {
859 !!!parse-error (type => 'end tag attribute');
860 }
861 } else {
862 die "$0: $self->{current_token}->{type}: Unknown token type";
863 }
864 $self->{state} = DATA_STATE;
865 !!!next-input-character;
866
867 !!!emit ($self->{current_token}); # start tag or end tag
868
869 redo A;
870 } elsif ($self->{next_input_character} == -1) {
871 !!!parse-error (type => 'unclosed tag');
872 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
873 $self->{current_token}->{first_start_tag}
874 = not defined $self->{last_emitted_start_tag_name};
875 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
876 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
877 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
878 if ($self->{current_token}->{attributes}) {
879 !!!parse-error (type => 'end tag attribute');
880 }
881 } else {
882 die "$0: $self->{current_token}->{type}: Unknown token type";
883 }
884 $self->{state} = DATA_STATE;
885 ## reconsume
886
887 !!!emit ($self->{current_token}); # start tag or end tag
888
889 redo A;
890 } else {
891 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
892 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
893 !!!next-input-character;
894 redo A;
895 }
896 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
897 if ($self->{next_input_character} == 0x0022) { # "
898 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
899 !!!next-input-character;
900 redo A;
901 } elsif ($self->{next_input_character} == 0x0026) { # &
902 $self->{last_attribute_value_state} = $self->{state};
903 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
904 !!!next-input-character;
905 redo A;
906 } elsif ($self->{next_input_character} == -1) {
907 !!!parse-error (type => 'unclosed attribute value');
908 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
909 $self->{current_token}->{first_start_tag}
910 = not defined $self->{last_emitted_start_tag_name};
911 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
912 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
913 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
914 if ($self->{current_token}->{attributes}) {
915 !!!parse-error (type => 'end tag attribute');
916 }
917 } else {
918 die "$0: $self->{current_token}->{type}: Unknown token type";
919 }
920 $self->{state} = DATA_STATE;
921 ## reconsume
922
923 !!!emit ($self->{current_token}); # start tag or end tag
924
925 redo A;
926 } else {
927 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
928 ## Stay in the state
929 !!!next-input-character;
930 redo A;
931 }
932 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
933 if ($self->{next_input_character} == 0x0027) { # '
934 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
935 !!!next-input-character;
936 redo A;
937 } elsif ($self->{next_input_character} == 0x0026) { # &
938 $self->{last_attribute_value_state} = $self->{state};
939 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
940 !!!next-input-character;
941 redo A;
942 } elsif ($self->{next_input_character} == -1) {
943 !!!parse-error (type => 'unclosed attribute value');
944 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
945 $self->{current_token}->{first_start_tag}
946 = not defined $self->{last_emitted_start_tag_name};
947 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
948 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
949 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
950 if ($self->{current_token}->{attributes}) {
951 !!!parse-error (type => 'end tag attribute');
952 }
953 } else {
954 die "$0: $self->{current_token}->{type}: Unknown token type";
955 }
956 $self->{state} = DATA_STATE;
957 ## reconsume
958
959 !!!emit ($self->{current_token}); # start tag or end tag
960
961 redo A;
962 } else {
963 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
964 ## Stay in the state
965 !!!next-input-character;
966 redo A;
967 }
968 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
969 if ($self->{next_input_character} == 0x0009 or # HT
970 $self->{next_input_character} == 0x000A or # LF
971 $self->{next_input_character} == 0x000B or # VT
972 $self->{next_input_character} == 0x000C or # FF
973 $self->{next_input_character} == 0x0020) { # SP
974 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
975 !!!next-input-character;
976 redo A;
977 } elsif ($self->{next_input_character} == 0x0026) { # &
978 $self->{last_attribute_value_state} = $self->{state};
979 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
980 !!!next-input-character;
981 redo A;
982 } elsif ($self->{next_input_character} == 0x003E) { # >
983 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
984 $self->{current_token}->{first_start_tag}
985 = not defined $self->{last_emitted_start_tag_name};
986 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
987 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
988 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
989 if ($self->{current_token}->{attributes}) {
990 !!!parse-error (type => 'end tag attribute');
991 }
992 } else {
993 die "$0: $self->{current_token}->{type}: Unknown token type";
994 }
995 $self->{state} = DATA_STATE;
996 !!!next-input-character;
997
998 !!!emit ($self->{current_token}); # start tag or end tag
999
1000 redo A;
1001 } elsif ($self->{next_input_character} == -1) {
1002 !!!parse-error (type => 'unclosed tag');
1003 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1004 $self->{current_token}->{first_start_tag}
1005 = not defined $self->{last_emitted_start_tag_name};
1006 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1007 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1008 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1009 if ($self->{current_token}->{attributes}) {
1010 !!!parse-error (type => 'end tag attribute');
1011 }
1012 } else {
1013 die "$0: $self->{current_token}->{type}: Unknown token type";
1014 }
1015 $self->{state} = DATA_STATE;
1016 ## reconsume
1017
1018 !!!emit ($self->{current_token}); # start tag or end tag
1019
1020 redo A;
1021 } else {
1022 $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
1023 ## Stay in the state
1024 !!!next-input-character;
1025 redo A;
1026 }
1027 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1028 my $token = $self->_tokenize_attempt_to_consume_an_entity (1);
1029
1030 unless (defined $token) {
1031 $self->{current_attribute}->{value} .= '&';
1032 } else {
1033 $self->{current_attribute}->{value} .= $token->{data};
1034 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1035 }
1036
1037 $self->{state} = $self->{last_attribute_value_state};
1038 # next-input-character is already done
1039 redo A;
1040 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1041 ## (only happen if PCDATA state)
1042
1043 my $token = {type => COMMENT_TOKEN, data => ''};
1044
1045 BC: {
1046 if ($self->{next_input_character} == 0x003E) { # >
1047 $self->{state} = DATA_STATE;
1048 !!!next-input-character;
1049
1050 !!!emit ($token);
1051
1052 redo A;
1053 } elsif ($self->{next_input_character} == -1) {
1054 $self->{state} = DATA_STATE;
1055 ## reconsume
1056
1057 !!!emit ($token);
1058
1059 redo A;
1060 } else {
1061 $token->{data} .= chr ($self->{next_input_character});
1062 !!!next-input-character;
1063 redo BC;
1064 }
1065 } # BC
1066 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1067 ## (only happen if PCDATA state)
1068
1069 my @next_char;
1070 push @next_char, $self->{next_input_character};
1071
1072 if ($self->{next_input_character} == 0x002D) { # -
1073 !!!next-input-character;
1074 push @next_char, $self->{next_input_character};
1075 if ($self->{next_input_character} == 0x002D) { # -
1076 $self->{current_token} = {type => COMMENT_TOKEN, data => ''};
1077 $self->{state} = COMMENT_START_STATE;
1078 !!!next-input-character;
1079 redo A;
1080 }
1081 } elsif ($self->{next_input_character} == 0x0044 or # D
1082 $self->{next_input_character} == 0x0064) { # d
1083 !!!next-input-character;
1084 push @next_char, $self->{next_input_character};
1085 if ($self->{next_input_character} == 0x004F or # O
1086 $self->{next_input_character} == 0x006F) { # o
1087 !!!next-input-character;
1088 push @next_char, $self->{next_input_character};
1089 if ($self->{next_input_character} == 0x0043 or # C
1090 $self->{next_input_character} == 0x0063) { # c
1091 !!!next-input-character;
1092 push @next_char, $self->{next_input_character};
1093 if ($self->{next_input_character} == 0x0054 or # T
1094 $self->{next_input_character} == 0x0074) { # t
1095 !!!next-input-character;
1096 push @next_char, $self->{next_input_character};
1097 if ($self->{next_input_character} == 0x0059 or # Y
1098 $self->{next_input_character} == 0x0079) { # y
1099 !!!next-input-character;
1100 push @next_char, $self->{next_input_character};
1101 if ($self->{next_input_character} == 0x0050 or # P
1102 $self->{next_input_character} == 0x0070) { # p
1103 !!!next-input-character;
1104 push @next_char, $self->{next_input_character};
1105 if ($self->{next_input_character} == 0x0045 or # E
1106 $self->{next_input_character} == 0x0065) { # e
1107 ## ISSUE: What a stupid code this is!
1108 $self->{state} = DOCTYPE_STATE;
1109 !!!next-input-character;
1110 redo A;
1111 }
1112 }
1113 }
1114 }
1115 }
1116 }
1117 }
1118
1119 !!!parse-error (type => 'bogus comment');
1120 $self->{next_input_character} = shift @next_char;
1121 !!!back-next-input-character (@next_char);
1122 $self->{state} = BOGUS_COMMENT_STATE;
1123 redo A;
1124
1125 ## ISSUE: typos in spec: chacacters, is is a parse error
1126 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1127 } elsif ($self->{state} == COMMENT_START_STATE) {
1128 if ($self->{next_input_character} == 0x002D) { # -
1129 $self->{state} = COMMENT_START_DASH_STATE;
1130 !!!next-input-character;
1131 redo A;
1132 } elsif ($self->{next_input_character} == 0x003E) { # >
1133 !!!parse-error (type => 'bogus comment');
1134 $self->{state} = DATA_STATE;
1135 !!!next-input-character;
1136
1137 !!!emit ($self->{current_token}); # comment
1138
1139 redo A;
1140 } elsif ($self->{next_input_character} == -1) {
1141 !!!parse-error (type => 'unclosed comment');
1142 $self->{state} = DATA_STATE;
1143 ## reconsume
1144
1145 !!!emit ($self->{current_token}); # comment
1146
1147 redo A;
1148 } else {
1149 $self->{current_token}->{data} # comment
1150 .= chr ($self->{next_input_character});
1151 $self->{state} = COMMENT_STATE;
1152 !!!next-input-character;
1153 redo A;
1154 }
1155 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1156 if ($self->{next_input_character} == 0x002D) { # -
1157 $self->{state} = COMMENT_END_STATE;
1158 !!!next-input-character;
1159 redo A;
1160 } elsif ($self->{next_input_character} == 0x003E) { # >
1161 !!!parse-error (type => 'bogus comment');
1162 $self->{state} = DATA_STATE;
1163 !!!next-input-character;
1164
1165 !!!emit ($self->{current_token}); # comment
1166
1167 redo A;
1168 } elsif ($self->{next_input_character} == -1) {
1169 !!!parse-error (type => 'unclosed comment');
1170 $self->{state} = DATA_STATE;
1171 ## reconsume
1172
1173 !!!emit ($self->{current_token}); # comment
1174
1175 redo A;
1176 } else {
1177 $self->{current_token}->{data} # comment
1178 .= '-' . chr ($self->{next_input_character});
1179 $self->{state} = COMMENT_STATE;
1180 !!!next-input-character;
1181 redo A;
1182 }
1183 } elsif ($self->{state} == COMMENT_STATE) {
1184 if ($self->{next_input_character} == 0x002D) { # -
1185 $self->{state} = COMMENT_END_DASH_STATE;
1186 !!!next-input-character;
1187 redo A;
1188 } elsif ($self->{next_input_character} == -1) {
1189 !!!parse-error (type => 'unclosed comment');
1190 $self->{state} = DATA_STATE;
1191 ## reconsume
1192
1193 !!!emit ($self->{current_token}); # comment
1194
1195 redo A;
1196 } else {
1197 $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
1198 ## Stay in the state
1199 !!!next-input-character;
1200 redo A;
1201 }
1202 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1203 if ($self->{next_input_character} == 0x002D) { # -
1204 $self->{state} = COMMENT_END_STATE;
1205 !!!next-input-character;
1206 redo A;
1207 } elsif ($self->{next_input_character} == -1) {
1208 !!!parse-error (type => 'unclosed comment');
1209 $self->{state} = DATA_STATE;
1210 ## reconsume
1211
1212 !!!emit ($self->{current_token}); # comment
1213
1214 redo A;
1215 } else {
1216 $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1217 $self->{state} = COMMENT_STATE;
1218 !!!next-input-character;
1219 redo A;
1220 }
1221 } elsif ($self->{state} == COMMENT_END_STATE) {
1222 if ($self->{next_input_character} == 0x003E) { # >
1223 $self->{state} = DATA_STATE;
1224 !!!next-input-character;
1225
1226 !!!emit ($self->{current_token}); # comment
1227
1228 redo A;
1229 } elsif ($self->{next_input_character} == 0x002D) { # -
1230 !!!parse-error (type => 'dash in comment');
1231 $self->{current_token}->{data} .= '-'; # comment
1232 ## Stay in the state
1233 !!!next-input-character;
1234 redo A;
1235 } elsif ($self->{next_input_character} == -1) {
1236 !!!parse-error (type => 'unclosed comment');
1237 $self->{state} = DATA_STATE;
1238 ## reconsume
1239
1240 !!!emit ($self->{current_token}); # comment
1241
1242 redo A;
1243 } else {
1244 !!!parse-error (type => 'dash in comment');
1245 $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1246 $self->{state} = COMMENT_STATE;
1247 !!!next-input-character;
1248 redo A;
1249 }
1250 } elsif ($self->{state} == DOCTYPE_STATE) {
1251 if ($self->{next_input_character} == 0x0009 or # HT
1252 $self->{next_input_character} == 0x000A or # LF
1253 $self->{next_input_character} == 0x000B or # VT
1254 $self->{next_input_character} == 0x000C or # FF
1255 $self->{next_input_character} == 0x0020) { # SP
1256 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1257 !!!next-input-character;
1258 redo A;
1259 } else {
1260 !!!parse-error (type => 'no space before DOCTYPE name');
1261 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1262 ## reconsume
1263 redo A;
1264 }
1265 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1266 if ($self->{next_input_character} == 0x0009 or # HT
1267 $self->{next_input_character} == 0x000A or # LF
1268 $self->{next_input_character} == 0x000B or # VT
1269 $self->{next_input_character} == 0x000C or # FF
1270 $self->{next_input_character} == 0x0020) { # SP
1271 ## Stay in the state
1272 !!!next-input-character;
1273 redo A;
1274 } elsif ($self->{next_input_character} == 0x003E) { # >
1275 !!!parse-error (type => 'no DOCTYPE name');
1276 $self->{state} = DATA_STATE;
1277 !!!next-input-character;
1278
1279 !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
1280
1281 redo A;
1282 } elsif ($self->{next_input_character} == -1) {
1283 !!!parse-error (type => 'no DOCTYPE name');
1284 $self->{state} = DATA_STATE;
1285 ## reconsume
1286
1287 !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
1288
1289 redo A;
1290 } else {
1291 $self->{current_token}
1292 = {type => DOCTYPE_TOKEN,
1293 name => chr ($self->{next_input_character}),
1294 correct => 1};
1295 ## ISSUE: "Set the token's name name to the" in the spec
1296 $self->{state} = DOCTYPE_NAME_STATE;
1297 !!!next-input-character;
1298 redo A;
1299 }
1300 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1301 ## ISSUE: Redundant "First," in the spec.
1302 if ($self->{next_input_character} == 0x0009 or # HT
1303 $self->{next_input_character} == 0x000A or # LF
1304 $self->{next_input_character} == 0x000B or # VT
1305 $self->{next_input_character} == 0x000C or # FF
1306 $self->{next_input_character} == 0x0020) { # SP
1307 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1308 !!!next-input-character;
1309 redo A;
1310 } elsif ($self->{next_input_character} == 0x003E) { # >
1311 $self->{state} = DATA_STATE;
1312 !!!next-input-character;
1313
1314 !!!emit ($self->{current_token}); # DOCTYPE
1315
1316 redo A;
1317 } elsif ($self->{next_input_character} == -1) {
1318 !!!parse-error (type => 'unclosed DOCTYPE');
1319 $self->{state} = DATA_STATE;
1320 ## reconsume
1321
1322 delete $self->{current_token}->{correct};
1323 !!!emit ($self->{current_token}); # DOCTYPE
1324
1325 redo A;
1326 } else {
1327 $self->{current_token}->{name}
1328 .= chr ($self->{next_input_character}); # DOCTYPE
1329 ## Stay in the state
1330 !!!next-input-character;
1331 redo A;
1332 }
1333 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1334 if ($self->{next_input_character} == 0x0009 or # HT
1335 $self->{next_input_character} == 0x000A or # LF
1336 $self->{next_input_character} == 0x000B or # VT
1337 $self->{next_input_character} == 0x000C or # FF
1338 $self->{next_input_character} == 0x0020) { # SP
1339 ## Stay in the state
1340 !!!next-input-character;
1341 redo A;
1342 } elsif ($self->{next_input_character} == 0x003E) { # >
1343 $self->{state} = DATA_STATE;
1344 !!!next-input-character;
1345
1346 !!!emit ($self->{current_token}); # DOCTYPE
1347
1348 redo A;
1349 } elsif ($self->{next_input_character} == -1) {
1350 !!!parse-error (type => 'unclosed DOCTYPE');
1351 $self->{state} = DATA_STATE;
1352 ## reconsume
1353
1354 delete $self->{current_token}->{correct};
1355 !!!emit ($self->{current_token}); # DOCTYPE
1356
1357 redo A;
1358 } elsif ($self->{next_input_character} == 0x0050 or # P
1359 $self->{next_input_character} == 0x0070) { # p
1360 !!!next-input-character;
1361 if ($self->{next_input_character} == 0x0055 or # U
1362 $self->{next_input_character} == 0x0075) { # u
1363 !!!next-input-character;
1364 if ($self->{next_input_character} == 0x0042 or # B
1365 $self->{next_input_character} == 0x0062) { # b
1366 !!!next-input-character;
1367 if ($self->{next_input_character} == 0x004C or # L
1368 $self->{next_input_character} == 0x006C) { # l
1369 !!!next-input-character;
1370 if ($self->{next_input_character} == 0x0049 or # I
1371 $self->{next_input_character} == 0x0069) { # i
1372 !!!next-input-character;
1373 if ($self->{next_input_character} == 0x0043 or # C
1374 $self->{next_input_character} == 0x0063) { # c
1375 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1376 !!!next-input-character;
1377 redo A;
1378 }
1379 }
1380 }
1381 }
1382 }
1383
1384 #
1385 } elsif ($self->{next_input_character} == 0x0053 or # S
1386 $self->{next_input_character} == 0x0073) { # s
1387 !!!next-input-character;
1388 if ($self->{next_input_character} == 0x0059 or # Y
1389 $self->{next_input_character} == 0x0079) { # y
1390 !!!next-input-character;
1391 if ($self->{next_input_character} == 0x0053 or # S
1392 $self->{next_input_character} == 0x0073) { # s
1393 !!!next-input-character;
1394 if ($self->{next_input_character} == 0x0054 or # T
1395 $self->{next_input_character} == 0x0074) { # t
1396 !!!next-input-character;
1397 if ($self->{next_input_character} == 0x0045 or # E
1398 $self->{next_input_character} == 0x0065) { # e
1399 !!!next-input-character;
1400 if ($self->{next_input_character} == 0x004D or # M
1401 $self->{next_input_character} == 0x006D) { # m
1402 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1403 !!!next-input-character;
1404 redo A;
1405 }
1406 }
1407 }
1408 }
1409 }
1410
1411 #
1412 } else {
1413 !!!next-input-character;
1414 #
1415 }
1416
1417 !!!parse-error (type => 'string after DOCTYPE name');
1418 $self->{state} = BOGUS_DOCTYPE_STATE;
1419 # next-input-character is already done
1420 redo A;
1421 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1422 if ({
1423 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1424 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1425 }->{$self->{next_input_character}}) {
1426 ## Stay in the state
1427 !!!next-input-character;
1428 redo A;
1429 } elsif ($self->{next_input_character} eq 0x0022) { # "
1430 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1431 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1432 !!!next-input-character;
1433 redo A;
1434 } elsif ($self->{next_input_character} eq 0x0027) { # '
1435 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1436 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1437 !!!next-input-character;
1438 redo A;
1439 } elsif ($self->{next_input_character} eq 0x003E) { # >
1440 !!!parse-error (type => 'no PUBLIC literal');
1441
1442 $self->{state} = DATA_STATE;
1443 !!!next-input-character;
1444
1445 delete $self->{current_token}->{correct};
1446 !!!emit ($self->{current_token}); # DOCTYPE
1447
1448 redo A;
1449 } elsif ($self->{next_input_character} == -1) {
1450 !!!parse-error (type => 'unclosed DOCTYPE');
1451
1452 $self->{state} = DATA_STATE;
1453 ## reconsume
1454
1455 delete $self->{current_token}->{correct};
1456 !!!emit ($self->{current_token}); # DOCTYPE
1457
1458 redo A;
1459 } else {
1460 !!!parse-error (type => 'string after PUBLIC');
1461 $self->{state} = BOGUS_DOCTYPE_STATE;
1462 !!!next-input-character;
1463 redo A;
1464 }
1465 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1466 if ($self->{next_input_character} == 0x0022) { # "
1467 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1468 !!!next-input-character;
1469 redo A;
1470 } elsif ($self->{next_input_character} == -1) {
1471 !!!parse-error (type => 'unclosed PUBLIC literal');
1472
1473 $self->{state} = DATA_STATE;
1474 ## reconsume
1475
1476 delete $self->{current_token}->{correct};
1477 !!!emit ($self->{current_token}); # DOCTYPE
1478
1479 redo A;
1480 } else {
1481 $self->{current_token}->{public_identifier} # DOCTYPE
1482 .= chr $self->{next_input_character};
1483 ## Stay in the state
1484 !!!next-input-character;
1485 redo A;
1486 }
1487 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
1488 if ($self->{next_input_character} == 0x0027) { # '
1489 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1490 !!!next-input-character;
1491 redo A;
1492 } elsif ($self->{next_input_character} == -1) {
1493 !!!parse-error (type => 'unclosed PUBLIC literal');
1494
1495 $self->{state} = DATA_STATE;
1496 ## reconsume
1497
1498 delete $self->{current_token}->{correct};
1499 !!!emit ($self->{current_token}); # DOCTYPE
1500
1501 redo A;
1502 } else {
1503 $self->{current_token}->{public_identifier} # DOCTYPE
1504 .= chr $self->{next_input_character};
1505 ## Stay in the state
1506 !!!next-input-character;
1507 redo A;
1508 }
1509 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1510 if ({
1511 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1512 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1513 }->{$self->{next_input_character}}) {
1514 ## Stay in the state
1515 !!!next-input-character;
1516 redo A;
1517 } elsif ($self->{next_input_character} == 0x0022) { # "
1518 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1519 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1520 !!!next-input-character;
1521 redo A;
1522 } elsif ($self->{next_input_character} == 0x0027) { # '
1523 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1524 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1525 !!!next-input-character;
1526 redo A;
1527 } elsif ($self->{next_input_character} == 0x003E) { # >
1528 $self->{state} = DATA_STATE;
1529 !!!next-input-character;
1530
1531 !!!emit ($self->{current_token}); # DOCTYPE
1532
1533 redo A;
1534 } elsif ($self->{next_input_character} == -1) {
1535 !!!parse-error (type => 'unclosed DOCTYPE');
1536
1537 $self->{state} = DATA_STATE;
1538 ## reconsume
1539
1540 delete $self->{current_token}->{correct};
1541 !!!emit ($self->{current_token}); # DOCTYPE
1542
1543 redo A;
1544 } else {
1545 !!!parse-error (type => 'string after PUBLIC literal');
1546 $self->{state} = BOGUS_DOCTYPE_STATE;
1547 !!!next-input-character;
1548 redo A;
1549 }
1550 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
1551 if ({
1552 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1553 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1554 }->{$self->{next_input_character}}) {
1555 ## Stay in the state
1556 !!!next-input-character;
1557 redo A;
1558 } elsif ($self->{next_input_character} == 0x0022) { # "
1559 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1560 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1561 !!!next-input-character;
1562 redo A;
1563 } elsif ($self->{next_input_character} == 0x0027) { # '
1564 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1565 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1566 !!!next-input-character;
1567 redo A;
1568 } elsif ($self->{next_input_character} == 0x003E) { # >
1569 !!!parse-error (type => 'no SYSTEM literal');
1570 $self->{state} = DATA_STATE;
1571 !!!next-input-character;
1572
1573 delete $self->{current_token}->{correct};
1574 !!!emit ($self->{current_token}); # DOCTYPE
1575
1576 redo A;
1577 } elsif ($self->{next_input_character} == -1) {
1578 !!!parse-error (type => 'unclosed DOCTYPE');
1579
1580 $self->{state} = DATA_STATE;
1581 ## reconsume
1582
1583 delete $self->{current_token}->{correct};
1584 !!!emit ($self->{current_token}); # DOCTYPE
1585
1586 redo A;
1587 } else {
1588 !!!parse-error (type => 'string after SYSTEM');
1589 $self->{state} = BOGUS_DOCTYPE_STATE;
1590 !!!next-input-character;
1591 redo A;
1592 }
1593 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1594 if ($self->{next_input_character} == 0x0022) { # "
1595 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1596 !!!next-input-character;
1597 redo A;
1598 } elsif ($self->{next_input_character} == -1) {
1599 !!!parse-error (type => 'unclosed SYSTEM literal');
1600
1601 $self->{state} = DATA_STATE;
1602 ## reconsume
1603
1604 delete $self->{current_token}->{correct};
1605 !!!emit ($self->{current_token}); # DOCTYPE
1606
1607 redo A;
1608 } else {
1609 $self->{current_token}->{system_identifier} # DOCTYPE
1610 .= chr $self->{next_input_character};
1611 ## Stay in the state
1612 !!!next-input-character;
1613 redo A;
1614 }
1615 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
1616 if ($self->{next_input_character} == 0x0027) { # '
1617 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1618 !!!next-input-character;
1619 redo A;
1620 } elsif ($self->{next_input_character} == -1) {
1621 !!!parse-error (type => 'unclosed SYSTEM literal');
1622
1623 $self->{state} = DATA_STATE;
1624 ## reconsume
1625
1626 delete $self->{current_token}->{correct};
1627 !!!emit ($self->{current_token}); # DOCTYPE
1628
1629 redo A;
1630 } else {
1631 $self->{current_token}->{system_identifier} # DOCTYPE
1632 .= chr $self->{next_input_character};
1633 ## Stay in the state
1634 !!!next-input-character;
1635 redo A;
1636 }
1637 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
1638 if ({
1639 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1640 #0x000D => 1, # HT, LF, VT, FF, SP, CR
1641 }->{$self->{next_input_character}}) {
1642 ## Stay in the state
1643 !!!next-input-character;
1644 redo A;
1645 } elsif ($self->{next_input_character} == 0x003E) { # >
1646 $self->{state} = DATA_STATE;
1647 !!!next-input-character;
1648
1649 !!!emit ($self->{current_token}); # DOCTYPE
1650
1651 redo A;
1652 } elsif ($self->{next_input_character} == -1) {
1653 !!!parse-error (type => 'unclosed DOCTYPE');
1654
1655 $self->{state} = DATA_STATE;
1656 ## reconsume
1657
1658 delete $self->{current_token}->{correct};
1659 !!!emit ($self->{current_token}); # DOCTYPE
1660
1661 redo A;
1662 } else {
1663 !!!parse-error (type => 'string after SYSTEM literal');
1664 $self->{state} = BOGUS_DOCTYPE_STATE;
1665 !!!next-input-character;
1666 redo A;
1667 }
1668 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
1669 if ($self->{next_input_character} == 0x003E) { # >
1670 $self->{state} = DATA_STATE;
1671 !!!next-input-character;
1672
1673 delete $self->{current_token}->{correct};
1674 !!!emit ($self->{current_token}); # DOCTYPE
1675
1676 redo A;
1677 } elsif ($self->{next_input_character} == -1) {
1678 !!!parse-error (type => 'unclosed DOCTYPE');
1679 $self->{state} = DATA_STATE;
1680 ## reconsume
1681
1682 delete $self->{current_token}->{correct};
1683 !!!emit ($self->{current_token}); # DOCTYPE
1684
1685 redo A;
1686 } else {
1687 ## Stay in the state
1688 !!!next-input-character;
1689 redo A;
1690 }
1691 } else {
1692 die "$0: $self->{state}: Unknown state";
1693 }
1694 } # A
1695
1696 die "$0: _get_next_token: unexpected case";
1697 } # _get_next_token
1698
1699 sub _tokenize_attempt_to_consume_an_entity ($$) {
1700 my ($self, $in_attr) = @_;
1701
1702 if ({
1703 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
1704 0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
1705 }->{$self->{next_input_character}}) {
1706 ## Don't consume
1707 ## No error
1708 return undef;
1709 } elsif ($self->{next_input_character} == 0x0023) { # #
1710 !!!next-input-character;
1711 if ($self->{next_input_character} == 0x0078 or # x
1712 $self->{next_input_character} == 0x0058) { # X
1713 my $code;
1714 X: {
1715 my $x_char = $self->{next_input_character};
1716 !!!next-input-character;
1717 if (0x0030 <= $self->{next_input_character} and
1718 $self->{next_input_character} <= 0x0039) { # 0..9
1719 $code ||= 0;
1720 $code *= 0x10;
1721 $code += $self->{next_input_character} - 0x0030;
1722 redo X;
1723 } elsif (0x0061 <= $self->{next_input_character} and
1724 $self->{next_input_character} <= 0x0066) { # a..f
1725 $code ||= 0;
1726 $code *= 0x10;
1727 $code += $self->{next_input_character} - 0x0060 + 9;
1728 redo X;
1729 } elsif (0x0041 <= $self->{next_input_character} and
1730 $self->{next_input_character} <= 0x0046) { # A..F
1731 $code ||= 0;
1732 $code *= 0x10;
1733 $code += $self->{next_input_character} - 0x0040 + 9;
1734 redo X;
1735 } elsif (not defined $code) { # no hexadecimal digit
1736 !!!parse-error (type => 'bare hcro');
1737 !!!back-next-input-character ($x_char, $self->{next_input_character});
1738 $self->{next_input_character} = 0x0023; # #
1739 return undef;
1740 } elsif ($self->{next_input_character} == 0x003B) { # ;
1741 !!!next-input-character;
1742 } else {
1743 !!!parse-error (type => 'no refc');
1744 }
1745
1746 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
1747 !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
1748 $code = 0xFFFD;
1749 } elsif ($code > 0x10FFFF) {
1750 !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
1751 $code = 0xFFFD;
1752 } elsif ($code == 0x000D) {
1753 !!!parse-error (type => 'CR character reference');
1754 $code = 0x000A;
1755 } elsif (0x80 <= $code and $code <= 0x9F) {
1756 !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
1757 $code = $c1_entity_char->{$code};
1758 }
1759
1760 return {type => CHARACTER_TOKEN, data => chr $code};
1761 } # X
1762 } elsif (0x0030 <= $self->{next_input_character} and
1763 $self->{next_input_character} <= 0x0039) { # 0..9
1764 my $code = $self->{next_input_character} - 0x0030;
1765 !!!next-input-character;
1766
1767 while (0x0030 <= $self->{next_input_character} and
1768 $self->{next_input_character} <= 0x0039) { # 0..9
1769 $code *= 10;
1770 $code += $self->{next_input_character} - 0x0030;
1771
1772 !!!next-input-character;
1773 }
1774
1775 if ($self->{next_input_character} == 0x003B) { # ;
1776 !!!next-input-character;
1777 } else {
1778 !!!parse-error (type => 'no refc');
1779 }
1780
1781 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
1782 !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
1783 $code = 0xFFFD;
1784 } elsif ($code > 0x10FFFF) {
1785 !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
1786 $code = 0xFFFD;
1787 } elsif ($code == 0x000D) {
1788 !!!parse-error (type => 'CR character reference');
1789 $code = 0x000A;
1790 } elsif (0x80 <= $code and $code <= 0x9F) {
1791 !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
1792 $code = $c1_entity_char->{$code};
1793 }
1794
1795 return {type => CHARACTER_TOKEN, data => chr $code};
1796 } else {
1797 !!!parse-error (type => 'bare nero');
1798 !!!back-next-input-character ($self->{next_input_character});
1799 $self->{next_input_character} = 0x0023; # #
1800 return undef;
1801 }
1802 } elsif ((0x0041 <= $self->{next_input_character} and
1803 $self->{next_input_character} <= 0x005A) or
1804 (0x0061 <= $self->{next_input_character} and
1805 $self->{next_input_character} <= 0x007A)) {
1806 my $entity_name = chr $self->{next_input_character};
1807 !!!next-input-character;
1808
1809 my $value = $entity_name;
1810 my $match = 0;
1811 require Whatpm::_NamedEntityList;
1812 our $EntityChar;
1813
1814 while (length $entity_name < 10 and
1815 ## NOTE: Some number greater than the maximum length of entity name
1816 ((0x0041 <= $self->{next_input_character} and # a
1817 $self->{next_input_character} <= 0x005A) or # x
1818 (0x0061 <= $self->{next_input_character} and # a
1819 $self->{next_input_character} <= 0x007A) or # z
1820 (0x0030 <= $self->{next_input_character} and # 0
1821 $self->{next_input_character} <= 0x0039) or # 9
1822 $self->{next_input_character} == 0x003B)) { # ;
1823 $entity_name .= chr $self->{next_input_character};
1824 if (defined $EntityChar->{$entity_name}) {
1825 if ($self->{next_input_character} == 0x003B) { # ;
1826 $value = $EntityChar->{$entity_name};
1827 $match = 1;
1828 !!!next-input-character;
1829 last;
1830 } else {
1831 $value = $EntityChar->{$entity_name};
1832 $match = -1;
1833 !!!next-input-character;
1834 }
1835 } else {
1836 $value .= chr $self->{next_input_character};
1837 $match *= 2;
1838 !!!next-input-character;
1839 }
1840 }
1841
1842 if ($match > 0) {
1843 return {type => CHARACTER_TOKEN, data => $value};
1844 } elsif ($match < 0) {
1845 !!!parse-error (type => 'no refc');
1846 if ($in_attr and $match < -1) {
1847 return {type => CHARACTER_TOKEN, data => '&'.$entity_name};
1848 } else {
1849 return {type => CHARACTER_TOKEN, data => $value};
1850 }
1851 } else {
1852 !!!parse-error (type => 'bare ero');
1853 ## NOTE: No characters are consumed in the spec.
1854 return {type => CHARACTER_TOKEN, data => '&'.$value};
1855 }
1856 } else {
1857 ## no characters are consumed
1858 !!!parse-error (type => 'bare ero');
1859 return undef;
1860 }
1861 } # _tokenize_attempt_to_consume_an_entity
1862
## Prepare |$self->{document}| for the tree construction stage: DOM
## strict error checking is switched off and the document is flagged
## as an HTML document.
##
## NOTE: |$self->{document}| MUST be set before this method is invoked.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  $document->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $document->manakai_is_html (1); # MUST
} # _initialize_tree_constructor
1871
## Counterpart of |_initialize_tree_constructor|: switch DOM strict
## error checking of |$self->{document}| back on.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  ## TODO: Turn mutation events on
  return $self->{document}->strict_error_checking (1);
} # _terminate_tree_constructor
1877
1878 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
1879
1880 { # tree construction stage
1881 my $token;
1882
## Entry point of the tree construction stage.  Fetches the first
## token, resets the per-parse tree construction state kept on |$self|
## and then runs the initial, root element and main phases in order.
sub _construct_tree ($) {
  my $self = shift;

  ## It is not defined when an interactive UA renders the
  ## $self->{document} available to the user, or when it begins
  ## accepting user input.

  ## Append a character: collect it and all subsequent consecutive
  ## characters and insert one Text node whose data is concatenation
  ## of all those characters. # MUST

  !!!next-token;

  ## Tree construction state
  $self->{insertion_mode} = BEFORE_HEAD_IM;
  $self->{form_element} = undef;
  $self->{head_element} = undef;
  $self->{open_elements} = [];
  $self->{inner_html_node} = undef;

  ## Phases
  $self->_tree_construction_initial; # MUST
  $self->_tree_construction_root_element;
  $self->_tree_construction_main;
} # _construct_tree
1906
## The "initial" phase of tree construction.
##
## Consumes tokens until the root element phase has to start:
##
##   - a DOCTYPE token is turned into a document type node appended to
##     |$self->{document}| and is used to choose the compatibility mode
##     ("quirks" / "limited quirks"; otherwise the mode is not touched);
##   - comments are appended to the document;
##   - leading white space characters are dropped;
##   - any other token means there is no DOCTYPE: a parse error is
##     reported and the document is put into "quirks" mode.
##
## The token to be processed by the next phase is left in the shared
## lexical |$token|.  Dies on a token of unknown type.
sub _tree_construction_initial ($) {
  my $self = shift;
  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!parse-error (type => 'not HTML5');
      } elsif ($doctype_name ne 'HTML') {
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5');
      }

      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if (not $token->{correct} or $doctype_name ne 'HTML') {
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## Public identifiers are compared ASCII case-insensitively:
        ## the identifier is upper-cased here, so every literal it is
        ## compared with below MUST be written in upper case as well.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        if ({
          ## NOTE(review): "EXPERIMETNAL" below is spelled as found in
          ## this file; confirm against the specification's list
          ## before "correcting" it.
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
          "-//IETF//DTD HTML 2.0//EN" => 1,
          "-//IETF//DTD HTML 2.1E//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN//" => 1,
          "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
          "-//IETF//DTD HTML 3.2//EN" => 1,
          "-//IETF//DTD HTML 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT//EN" => 1,
          "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
          "-//IETF//DTD HTML//EN" => 1,
          "-//IETF//DTD HTML//EN//2.0" => 1,
          "-//IETF//DTD HTML//EN//3.0" => 1,
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
          "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
          "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
          "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
          "-//W3C//DTD HTML 3.2//EN" => 1,
          "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
          "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
          "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
          "-//W3C//DTD W3 HTML//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
          "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
          "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
          "HTML" => 1,
        }->{$pubid}) {
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
          ## HTML 4.01 Frameset/Transitional: quirks only when the
          ## system identifier is missing; with a system identifier
          ## the document is in limited quirks mode.  (The two modes
          ## used to be assigned the other way round.)
          if (defined $token->{system_identifier}) {
            $self->{document}->manakai_compat_mode ('limited quirks');
          } else {
            $self->{document}->manakai_compat_mode ('quirks');
          }
        } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
          ## Upper-case literals: |$pubid| has been upper-cased above,
          ## so mixed-case literals could never match.
          $self->{document}->manakai_compat_mode ('limited quirks');
        }
      }
      if (defined $token->{system_identifier}) {
        ## System identifiers are compared in lower case.
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          $self->{document}->manakai_compat_mode ('quirks');
        }
      }

      ## Go to the root element phase.
      !!!next-token;
      return;
    } elsif ({
      START_TAG_TOKEN, 1,
      END_TAG_TOKEN, 1,
      END_OF_FILE_TOKEN, 1,
    }->{$token->{type}}) {
      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          ## Nothing but white space: stay in the phase
          !!!next-token;
          redo INITIAL;
        }
      }

      ## Non-white-space character data before any DOCTYPE
      !!!parse-error (type => 'no DOCTYPE');
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the root element phase
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the phase.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL
} # _tree_construction_initial
2074
## The "root element" phase of tree construction.
##
## DOCTYPE tokens are reported and dropped, comments are appended to
## the document and leading white space is discarded.  The first other
## token triggers the application cache selection callback
## (|$self->{application_cache_selection}|, called with the value of
## the |manifest| attribute of an |html| start tag, or with |undef|),
## after which an |html| element is created, appended to the document
## and pushed onto the stack of open elements.  That token itself is
## left in the shared |$token| for the main phase.
sub _tree_construction_root_element ($) {
  my ($self) = @_;

  TOKEN: while (1) {
    if ($token->{type} == DOCTYPE_TOKEN) {
      !!!parse-error (type => 'in html:#DOCTYPE');
      ## The token is dropped and the phase continues.
      !!!next-token;
      next TOKEN;
    }

    if ($token->{type} == COMMENT_TOKEN) {
      ## Comments become children of the document; the phase continues.
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      !!!next-token;
      next TOKEN;
    }

    ## Argument of the application cache selection callback; remains
    ## |undef| unless the token is <html manifest="...">.
    my $manifest;

    if ($token->{type} == CHARACTER_TOKEN) {
      ## Leading white space is ignored. # \x0D
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)// and
          not length $token->{data}) {
        ## Nothing else in the token: stay in the phase
        !!!next-token;
        next TOKEN;
      }
    } elsif ($token->{type} == START_TAG_TOKEN) {
      ## ISSUE: Spec spells as "application"
      ## ISSUE: No relative reference resolution?
      ## ISSUE: There is an issue in the spec
      if ($token->{tag_name} eq 'html' and
          $token->{attributes}->{manifest}) {
        $manifest = $token->{attributes}->{manifest}->{value};
      }
    } elsif ({
      END_TAG_TOKEN, 1,
      END_OF_FILE_TOKEN, 1,
    }->{$token->{type}}) {
      ## ISSUE: There is an issue in the spec
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    $self->{application_cache_selection}->($manifest);

    my $root_element;
    !!!create-element ($root_element, 'html');
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, 'html'];

    ## The current token is reprocessed in the main phase.
    return;
  } # TOKEN
} # _tree_construction_root_element
2137
## Reset the insertion mode appropriately.
##
## Walks the stack of open elements from the current node towards the
## first entry and sets |$self->{insertion_mode}| according to the
## first element whose name selects a mode.  When the first entry of
## the stack is reached and |$self->{inner_html_node}| is defined
## (fragment case), that node is examined instead, unless it is a
## |td| or |th| element.  If nothing selects a mode, the insertion
## mode becomes |IN_BODY_IM|.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Element name -> insertion mode (Step 4..13).  Built once per call
  ## instead of once per examined node.
  my $mode_for = {
    select => IN_SELECT_IM,
    td => IN_CELL_IM,
    th => IN_CELL_IM,
    tr => IN_ROW_IM,
    tbody => IN_TABLE_BODY_IM,
    thead => IN_TABLE_BODY_IM,
    tfoot => IN_TABLE_BODY_IM,
    caption => IN_CAPTION_IM,
    colgroup => IN_COLUMN_GROUP_IM,
    table => IN_TABLE_IM,
    head => IN_BODY_IM, # not in head!
    body => IN_BODY_IM,
    frameset => IN_FRAMESET_IM,
  };

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    ## ISSUE: Oops! "If node is the first node in the stack of open
    ## elements, then set last to true. If the context element of the
    ## HTML fragment parsing algorithm is neither a td element nor a
    ## th element, then set node to the context element. (fragment case)":
    ## The second "if" is in the scope of the first "if"!?
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        if ($self->{inner_html_node}->[1] eq 'td' or
            $self->{inner_html_node}->[1] eq 'th') {
          #
        } else {
          $node = $self->{inner_html_node};
        }
      }
    }

    ## Step 4..13
    ## NOTE: An explicit block is used rather than
    ## "$x = $mode and return if ...", which would silently skip the
    ## |return| if the mode constant were ever a false value.
    my $new_mode = $mode_for->{$node->[1]};
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 14
    if ($node->[1] eq 'html') {
      unless (defined $self->{head_element}) {
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    }

    ## Step 15
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 16
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 17
    redo S3;
  } # S3
} # _reset_insertion_mode
2206
2207 sub _tree_construction_main ($) {
2208 my $self = shift;
2209
2210 my $active_formatting_elements = [];
2211
## Reconstruct the active formatting elements.
##
## Argument: a code reference that is called with each newly cloned
## node and is expected to put it into the tree.
##
## Every entry at the end of |$active_formatting_elements| that comes
## after the last marker / the last entry still found in
## |$self->{open_elements}| is re-opened: its node is shallow-cloned,
## the clone is handed to the code reference, pushed onto the stack of
## open elements and stored in the list in place of the old entry.
## Nothing is done when the list is empty or when its last entry is a
## marker or is still open.
##
## NOTE: |$i| is used as a negative (counted from the end) index into
## |$active_formatting_elements| throughout.
2212 my $reconstruct_active_formatting_elements = sub { # MUST
2213 my $insert = shift;
2214
2215 ## Step 1
2216 return unless @$active_formatting_elements;
2217
2218 ## Step 3
2219 my $i = -1;
2220 my $entry = $active_formatting_elements->[$i];
2221
2222 ## Step 2
2223 return if $entry->[0] eq '#marker';
2224 for (@{$self->{open_elements}}) {
2225 if ($entry->[0] eq $_->[0]) {
2226 return;
2227 }
2228 }
2229
## Rewind: move |$entry| towards the start of the list until a marker
## or a still-open entry is found (then step forward once again), or
## until the first entry of the list has been reached.
2230 S4: {
2231 ## Step 4
2232 last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
2233
2234 ## Step 5
2235 $i--;
2236 $entry = $active_formatting_elements->[$i];
2237
2238 ## Step 6
2239 if ($entry->[0] eq '#marker') {
2240 #
2241 } else {
2242 my $in_open_elements;
2243 OE: for (@{$self->{open_elements}}) {
2244 if ($entry->[0] eq $_->[0]) {
2245 $in_open_elements = 1;
2246 last OE;
2247 }
2248 }
2249 if ($in_open_elements) {
2250 #
2251 } else {
2252 redo S4;
2253 }
2254 }
2255
2256 ## Step 7
2257 $i++;
2258 $entry = $active_formatting_elements->[$i];
2259 } # S4
2260
## Advance: clone and re-open entries one by one until the entry just
## replaced is the last entry of the list.
2261 S7: {
2262 ## Step 8
2263 my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
2264
2265 ## Step 9
2266 $insert->($clone->[0]);
2267 push @{$self->{open_elements}}, $clone;
2268
2269 ## Step 10
2270 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
2271
2272 ## Step 11
2273 unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
2274 ## Step 7'
2275 $i++;
2276 $entry = $active_formatting_elements->[$i];
2277
2278 redo S7;
2279 }
2280 } # S7
2281 }; # $reconstruct_active_formatting_elements
2282
## Clear the list of active formatting elements up to the last marker:
## entries are removed from the end of |$active_formatting_elements|
## until a marker has been removed.
##
## If the list contains no marker at all it is emptied, which is what
## the "remove the last entry; stop if it was a marker; repeat" steps
## of the specification amount to.  (Previously the list was left
## untouched in that case.)
my $clear_up_to_marker = sub {
  while (@$active_formatting_elements) {
    my $entry = pop @$active_formatting_elements;
    last if $entry->[0] eq '#marker';
  }
  return;
}; # $clear_up_to_marker
2291
## Generic CDATA / RCDATA element parsing for the current start tag
## token.
##
## Arguments: the content model flag to tokenize the element content
## with (|CDATA_CONTENT_MODEL| or |RCDATA_CONTENT_MODEL|; anything
## else is fatal), and a code reference that inserts the new element
## into the tree.
##
## All character tokens that follow are concatenated into a single
## text node child of the element.  A parse error is reported unless
## the token that follows them is the matching end tag; that token is
## consumed in either case.
my $parse_rcdata = sub ($$) {
  my $content_model_flag = shift;
  my $insert = shift;

  ## Step 1
  my $element_name = $token->{tag_name};
  my $element;
  !!!create-element ($element, $element_name, $token->{attributes});

  ## Step 2
  $insert->($element); # /context node/->append_child ($element)

  ## Step 3
  $self->{content_model} = $content_model_flag; # CDATA or RCDATA
  delete $self->{escape}; # MUST

  ## Step 4
  my $data = '';
  !!!next-token;
  until ($token->{type} != CHARACTER_TOKEN) { # or until stop tokenizing
    $data .= $token->{data};
    !!!next-token;
  }

  ## Step 5
  $element->append_child ($self->{document}->create_text_node ($data))
      if length $data;

  ## Step 6
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## Step 7
  my $is_matching_end_tag = ($token->{type} == END_TAG_TOKEN and
                             $token->{tag_name} eq $element_name);
  unless ($is_matching_end_tag) {
    if ($content_model_flag == CDATA_CONTENT_MODEL) {
      !!!parse-error (type => 'in CDATA:#'.$token->{type});
    } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
      !!!parse-error (type => 'in RCDATA:#'.$token->{type});
    } else {
      die "$0: $content_model_flag in parse_rcdata";
    }
  }
  ## The token (matching end tag or not) is consumed here.
  !!!next-token;
}; # $parse_rcdata
2336
## Process a |script| start tag token.
##
## Argument: a code reference that inserts the |script| element into
## the tree.
##
## The element is created from the current token, the character tokens
## that follow (tokenized with the CDATA content model) become its
## text, and a parse error is reported unless they are followed by a
## |script| end tag.  The element is handed to the code reference only
## when |$self->{inner_html_node}| is not defined.
my $script_start_tag = sub ($) {
  my ($insert) = @_;

  my $script;
  !!!create-element ($script, 'script', $token->{attributes});
  ## TODO: mark as "parser-inserted"

  $self->{content_model} = CDATA_CONTENT_MODEL;
  delete $self->{escape}; # MUST

  ## Collect the script text: stop at the first non-character token
  ## (or when the tokenizer stops tokenising).
  my $source = '';
  !!!next-token;
  until ($token->{type} != CHARACTER_TOKEN) {
    $source .= $token->{data};
    !!!next-token;
  }
  $script->manakai_append_text ($source) if length $source;

  $self->{content_model} = PCDATA_CONTENT_MODEL;

  unless ($token->{type} == END_TAG_TOKEN and
          $token->{tag_name} eq 'script') {
    !!!parse-error (type => 'in CDATA:#'.$token->{type});
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  } ## Otherwise, the end tag token is simply ignored.

  unless (defined $self->{inner_html_node}) {
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    $insert->($script);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  } else {
    ## TODO: mark as "already executed"
  }

  !!!next-token;
}; # $script_start_tag
2382
## Process an end tag of a formatting element.  The numbered steps
## below appear to follow the "adoption agency algorithm" of the HTML5
## tree construction rules.
##
## Argument: the tag name of the end tag token.
##
## The most recent entry for that tag name after the last marker in
## |$active_formatting_elements| is the formatting element.  When no
## "furthest block" follows it on the stack of open elements, the
## stack is simply popped up to and including the formatting element.
## Otherwise the nodes between the two are re-parented so that the
## furthest block ends up outside the formatting element, a clone of
## the formatting element is put inside the furthest block, and the
## whole procedure is repeated.
##
## Changes from the previous revision of this closure:
##   - when the formatting element is not on the stack of open
##     elements, that very entry is removed from the list of active
##     formatting elements (formerly the last entry of the list was
##     popped, whatever it was);
##   - in Step 13 the clone is inserted after the furthest block
##     (formerly it overwrote the stack entry at that position).
my $formatting_end_tag = sub {
  my $tag_name = shift;

  FET: {
    ## Step 1
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[1] eq $tag_name) {
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0] eq '#marker') {
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!parse-error (type => 'unmatched end tag:'.$tag_name);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ({
                table => 1, caption => 1, td => 1, th => 1,
                button => 1, marquee => 1, object => 1, html => 1,
               }->{$node->[1]}) {
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      ## The formatting element is not on the stack of open elements
      ## at all: drop exactly that entry from the list.
      !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
    }

    ## Step 2
    ## The loop runs from the current node upwards and keeps
    ## overwriting the candidate, so the candidate closest to the
    ## formatting element wins.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not $formatting_category->{$node->[1]} and
          #not $phrasing_category->{$node->[1]} and
          ($special_category->{$node->[1]} or
           $scoping_category->{$node->[1]})) {
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## NOTE(review): if the formatting element is the first entry of
    ## the list, the index is -1 here and selects the *last* entry --
    ## confirm that this is intended.
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
            ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      ## A node that is not in the list of active formatting elements
      ## is removed from the stack and the node before it is tried.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      if ($node->[0]->has_child_nodes ()) {
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    $common_ancestor_node->[0]->append_child ($last_node->[0]);

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## The child list is copied first, since the children are moved
    ## away while it is being walked.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## |$i| becomes the index of the bookmark entry, corrected by one
    ## if the formatting element is removed from a position before it.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        splice @$active_formatting_elements, $_, 1;
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        splice @{$self->{open_elements}}, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        $i = $_;
      }
    } # OE
    ## Insert (length 0), do not overwrite: any element that follows
    ## the furthest block on the stack has to stay on the stack.
    splice @{$self->{open_elements}}, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
2565
## Default insertion: append the given node to the current node, i.e.
## to the node of the last entry of the stack of open elements.
my $insert_to_current = sub {
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($_[0]);
}; # $insert_to_current
2569
## Insertion with foster parenting.
##
## Unless the current node is a |table|, |tbody|, |tfoot|, |thead| or
## |tr| element, the given node is simply appended to the current
## node.  Otherwise it is given to a foster parent:
##
##   - the parent of the last |table| element on the stack of open
##     elements, if that parent is an element node; the node is then
##     inserted just before the table;
##   - otherwise the element preceding that |table| on the stack;
##   - or, when there is no |table| on the stack, the first element of
##     the stack.
my $insert_to_foster = sub {
  my $child = shift;
  my $open_elements = $self->{open_elements};

  my $is_table_part = {
    table => 1, tbody => 1, tfoot => 1,
    thead => 1, tr => 1,
  }->{$open_elements->[-1]->[1]};
  return $open_elements->[-1]->[0]->append_child ($child)
      unless $is_table_part;

  # MUST
  my ($foster_parent_element, $next_sibling);
  for my $i (reverse 0..$#$open_elements) {
    next unless $open_elements->[$i]->[1] eq 'table';

    ## The last |table| element on the stack
    my $table = $open_elements->[$i]->[0];
    my $parent = $table->parent_node;
    if (defined $parent and $parent->node_type == 1) {
      ($foster_parent_element, $next_sibling) = ($parent, $table);
    } else {
      $foster_parent_element = $open_elements->[$i - 1]->[0];
    }
    last;
  }
  $foster_parent_element = $open_elements->[0]->[0]
      unless defined $foster_parent_element;

  ## NOTE: |$next_sibling| is |undef| unless the foster parent is the
  ## table's own parent.
  return $foster_parent_element->insert_before ($child, $next_sibling);
}; # $insert_to_foster
2600
2601 my $insert;
2602
2603 B: {
2604 if ($token->{type} == DOCTYPE_TOKEN) {
2605 !!!parse-error (type => 'DOCTYPE in the middle');
2606 ## Ignore the token
2607 ## Stay in the phase
2608 !!!next-token;
2609 redo B;
2610 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
2611 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
2612 #
2613 } else {
2614 ## Generate implied end tags
2615 if ({
2616 dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
2617 tbody => 1, tfoot=> 1, thead => 1,
2618 }->{$self->{open_elements}->[-1]->[1]}) {
2619 !!!back-token;
2620 $token = {type => END_TAG_TOKEN, tag_name => $self->{open_elements}->[-1]->[1]};
2621 redo B;
2622 }
2623
2624 if (@{$self->{open_elements}} > 2 or
2625 (@{$self->{open_elements}} == 2 and $self->{open_elements}->[1]->[1] ne 'body')) {
2626 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2627 } elsif (defined $self->{inner_html_node} and
2628 @{$self->{open_elements}} > 1 and
2629 $self->{open_elements}->[1]->[1] ne 'body') {
2630 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
2631 }
2632
2633 ## ISSUE: There is an issue in the spec.
2634 }
2635
2636 ## Stop parsing
2637 last B;
2638 } elsif ($token->{type} == START_TAG_TOKEN and
2639 $token->{tag_name} eq 'html') {
2640 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
2641 ## Turn into the main phase
2642 !!!parse-error (type => 'after html:html');
2643 $self->{insertion_mode} = AFTER_BODY_IM;
2644 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
2645 ## Turn into the main phase
2646 !!!parse-error (type => 'after html:html');
2647 $self->{insertion_mode} = AFTER_FRAMESET_IM;
2648 }
2649
2650 ## ISSUE: "aa<html>" is not a parse error.
2651 ## ISSUE: "<html>" in fragment is not a parse error.
2652 unless ($token->{first_start_tag}) {
2653 !!!parse-error (type => 'not first start tag');
2654 }
2655 my $top_el = $self->{open_elements}->[0]->[0];
2656 for my $attr_name (keys %{$token->{attributes}}) {
2657 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
2658 $top_el->set_attribute_ns
2659 (undef, [undef, $attr_name],
2660 $token->{attributes}->{$attr_name}->{value});
2661 }
2662 }
2663 !!!next-token;
2664 redo B;
2665 } elsif ($token->{type} == COMMENT_TOKEN) {
2666 my $comment = $self->{document}->create_comment ($token->{data});
2667 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
2668 $self->{document}->append_child ($comment);
2669 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
2670 $self->{open_elements}->[0]->[0]->append_child ($comment);
2671 } else {
2672 $self->{open_elements}->[-1]->[0]->append_child ($comment);
2673 }
2674 !!!next-token;
2675 redo B;
2676 } elsif ($self->{insertion_mode} & HEAD_IMS) {
2677 if ($token->{type} == CHARACTER_TOKEN) {
2678 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
2679 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
2680 unless (length $token->{data}) {
2681 !!!next-token;
2682 redo B;
2683 }
2684 }
2685
2686 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2687 ## As if <head>
2688 !!!create-element ($self->{head_element}, 'head');
2689 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2690 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2691
2692 ## Reprocess in the "in head" insertion mode...
2693 pop @{$self->{open_elements}};
2694
2695 ## Reprocess in the "after head" insertion mode...
2696 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2697 ## As if </noscript>
2698 pop @{$self->{open_elements}};
2699 !!!parse-error (type => 'in noscript:#character');
2700
2701 ## Reprocess in the "in head" insertion mode...
2702 ## As if </head>
2703 pop @{$self->{open_elements}};
2704
2705 ## Reprocess in the "after head" insertion mode...
2706 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
2707 pop @{$self->{open_elements}};
2708
2709 ## Reprocess in the "after head" insertion mode...
2710 }
2711
2712 ## "after head" insertion mode
2713 ## As if <body>
2714 !!!insert-element ('body');
2715 $self->{insertion_mode} = IN_BODY_IM;
2716 ## reprocess
2717 redo B;
2718 } elsif ($token->{type} == START_TAG_TOKEN) {
2719 if ($token->{tag_name} eq 'head') {
2720 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2721 !!!create-element ($self->{head_element}, $token->{tag_name}, $token->{attributes});
2722 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2723 push @{$self->{open_elements}}, [$self->{head_element}, $token->{tag_name}];
2724 $self->{insertion_mode} = IN_HEAD_IM;
2725 !!!next-token;
2726 redo B;
2727 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
2728 #
2729 } else {
2730 !!!parse-error (type => 'in head:head'); # or in head noscript
2731 ## Ignore the token
2732 !!!next-token;
2733 redo B;
2734 }
2735 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2736 ## As if <head>
2737 !!!create-element ($self->{head_element}, 'head');
2738 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2739 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2740
2741 $self->{insertion_mode} = IN_HEAD_IM;
2742 ## Reprocess in the "in head" insertion mode...
2743 }
2744
2745 if ($token->{tag_name} eq 'base') {
2746 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2747 ## As if </noscript>
2748 pop @{$self->{open_elements}};
2749 !!!parse-error (type => 'in noscript:base');
2750
2751 $self->{insertion_mode} = IN_HEAD_IM;
2752 ## Reprocess in the "in head" insertion mode...
2753 }
2754
2755 ## NOTE: There is a "as if in head" code clone.
2756 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2757 !!!parse-error (type => 'after head:'.$token->{tag_name});
2758 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2759 }
2760 !!!insert-element ($token->{tag_name}, $token->{attributes});
2761 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2762 pop @{$self->{open_elements}}
2763 if $self->{insertion_mode} == AFTER_HEAD_IM;
2764 !!!next-token;
2765 redo B;
2766 } elsif ($token->{tag_name} eq 'link') {
2767 ## NOTE: There is a "as if in head" code clone.
2768 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2769 !!!parse-error (type => 'after head:'.$token->{tag_name});
2770 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2771 }
2772 !!!insert-element ($token->{tag_name}, $token->{attributes});
2773 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2774 pop @{$self->{open_elements}}
2775 if $self->{insertion_mode} == AFTER_HEAD_IM;
2776 !!!next-token;
2777 redo B;
2778 } elsif ($token->{tag_name} eq 'meta') {
2779 ## NOTE: There is a "as if in head" code clone.
2780 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2781 !!!parse-error (type => 'after head:'.$token->{tag_name});
2782 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2783 }
2784 !!!insert-element ($token->{tag_name}, $token->{attributes});
2785 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2786
2787 unless ($self->{confident}) {
2788 my $charset;
2789 if ($token->{attributes}->{charset}) { ## TODO: And if supported
2790 $charset = $token->{attributes}->{charset}->{value};
2791 }
2792 if ($token->{attributes}->{'http-equiv'}) {
2793 ## ISSUE: The algorithm name in the spec was incorrect, so it was not linked to the definition.
2794 if ($token->{attributes}->{'http-equiv'}->{value}
2795 =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
2796 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
2797 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
2798 $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
2799 } ## TODO: And if supported
2800 }
2801 ## TODO: Change the encoding
2802 }
2803
2804 ## TODO: Extracting |charset| from |meta|.
2805 pop @{$self->{open_elements}}
2806 if $self->{insertion_mode} == AFTER_HEAD_IM;
2807 !!!next-token;
2808 redo B;
2809 } elsif ($token->{tag_name} eq 'title') {
2810 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2811 ## As if </noscript>
2812 pop @{$self->{open_elements}};
2813 !!!parse-error (type => 'in noscript:title');
2814
2815 $self->{insertion_mode} = IN_HEAD_IM;
2816 ## Reprocess in the "in head" insertion mode...
2817 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
2818 !!!parse-error (type => 'after head:'.$token->{tag_name});
2819 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2820 }
2821
2822 ## NOTE: There is a "as if in head" code clone.
2823 my $parent = defined $self->{head_element} ? $self->{head_element}
2824 : $self->{open_elements}->[-1]->[0];
2825 $parse_rcdata->(RCDATA_CONTENT_MODEL,
2826 sub { $parent->append_child ($_[0]) });
2827 pop @{$self->{open_elements}}
2828 if $self->{insertion_mode} == AFTER_HEAD_IM;
2829 redo B;
2830 } elsif ($token->{tag_name} eq 'style') {
2831 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
2832 ## insertion mode IN_HEAD_IM)
2833 ## NOTE: There is a "as if in head" code clone.
2834 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
2835 !!!parse-error (type => 'after head:'.$token->{tag_name});
2836 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2837 }
2838 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
2839 pop @{$self->{open_elements}}
2840 if $self->{insertion_mode} == AFTER_HEAD_IM;
2841 redo B;
2842 } elsif ($token->{tag_name} eq 'noscript') {
2843 if ($self->{insertion_mode} == IN_HEAD_IM) {
2844 ## NOTE: and scripting is disabled
2845 !!!insert-element ($token->{tag_name}, $token->{attributes});
2846 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
2847 !!!next-token;
2848 redo B;
2849 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2850 !!!parse-error (type => 'in noscript:noscript');
2851 ## Ignore the token
2852 !!!next-token;
2853 redo B;
2854 } else {
2855 #
2856 }
2857 } elsif ($token->{tag_name} eq 'script') {
2858 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2859 ## As if </noscript>
2860 pop @{$self->{open_elements}};
2861 !!!parse-error (type => 'in noscript:script');
2862
2863 $self->{insertion_mode} = IN_HEAD_IM;
2864 ## Reprocess in the "in head" insertion mode...
2865 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
2866 !!!parse-error (type => 'after head:'.$token->{tag_name});
2867 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2868 }
2869
2870 ## NOTE: There is a "as if in head" code clone.
2871 $script_start_tag->($insert_to_current);
2872 pop @{$self->{open_elements}}
2873 if $self->{insertion_mode} == AFTER_HEAD_IM;
2874 redo B;
2875 } elsif ($token->{tag_name} eq 'body' or
2876 $token->{tag_name} eq 'frameset') {
2877 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2878 ## As if </noscript>
2879 pop @{$self->{open_elements}};
2880 !!!parse-error (type => 'in noscript:'.$token->{tag_name});
2881
2882 ## Reprocess in the "in head" insertion mode...
2883 ## As if </head>
2884 pop @{$self->{open_elements}};
2885
2886 ## Reprocess in the "after head" insertion mode...
2887 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
2888 pop @{$self->{open_elements}};
2889
2890 ## Reprocess in the "after head" insertion mode...
2891 }
2892
2893 ## "after head" insertion mode
2894 !!!insert-element ($token->{tag_name}, $token->{attributes});
2895 if ($token->{tag_name} eq 'body') {
2896 $self->{insertion_mode} = IN_BODY_IM;
2897 } elsif ($token->{tag_name} eq 'frameset') {
2898 $self->{insertion_mode} = IN_FRAMESET_IM;
2899 } else {
2900 die "$0: tag name: $self->{tag_name}";
2901 }
2902 !!!next-token;
2903 redo B;
2904 } else {
2905 #
2906 }
2907
2908 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2909 ## As if </noscript>
2910 pop @{$self->{open_elements}};
2911 !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
2912
2913 ## Reprocess in the "in head" insertion mode...
2914 ## As if </head>
2915 pop @{$self->{open_elements}};
2916
2917 ## Reprocess in the "after head" insertion mode...
2918 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
2919 ## As if </head>
2920 pop @{$self->{open_elements}};
2921
2922 ## Reprocess in the "after head" insertion mode...
2923 }
2924
2925 ## "after head" insertion mode
2926 ## As if <body>
2927 !!!insert-element ('body');
2928 $self->{insertion_mode} = IN_BODY_IM;
2929 ## reprocess
2930 redo B;
2931 } elsif ($token->{type} == END_TAG_TOKEN) {
2932 if ($token->{tag_name} eq 'head') {
2933 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2934 ## As if <head>
2935 !!!create-element ($self->{head_element}, 'head');
2936 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2937 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2938
2939 ## Reprocess in the "in head" insertion mode...
2940 pop @{$self->{open_elements}};
2941 $self->{insertion_mode} = AFTER_HEAD_IM;
2942 !!!next-token;
2943 redo B;
2944 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2945 ## As if </noscript>
2946 pop @{$self->{open_elements}};
2947 !!!parse-error (type => 'in noscript:script');
2948
2949 ## Reprocess in the "in head" insertion mode...
2950 pop @{$self->{open_elements}};
2951 $self->{insertion_mode} = AFTER_HEAD_IM;
2952 !!!next-token;
2953 redo B;
2954 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
2955 pop @{$self->{open_elements}};
2956 $self->{insertion_mode} = AFTER_HEAD_IM;
2957 !!!next-token;
2958 redo B;
2959 } else {
2960 #
2961 }
2962 } elsif ($token->{tag_name} eq 'noscript') {
2963 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2964 pop @{$self->{open_elements}};
2965 $self->{insertion_mode} = IN_HEAD_IM;
2966 !!!next-token;
2967 redo B;
2968 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2969 !!!parse-error (type => 'unmatched end tag:noscript');
2970 ## Ignore the token ## ISSUE: An issue in the spec.
2971 !!!next-token;
2972 redo B;
2973 } else {
2974 #
2975 }
2976 } elsif ({
2977 body => 1, html => 1,
2978 }->{$token->{tag_name}}) {
2979 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2980 ## As if <head>
2981 !!!create-element ($self->{head_element}, 'head');
2982 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
2983 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2984
2985 $self->{insertion_mode} = IN_HEAD_IM;
2986 ## Reprocess in the "in head" insertion mode...
2987 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
2988 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
2989 ## Ignore the token
2990 !!!next-token;
2991 redo B;
2992 }
2993
2994 #
2995 } elsif ({
2996 p => 1, br => 1,
2997 }->{$token->{tag_name}}) {
2998 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
2999 ## As if <head>
3000 !!!create-element ($self->{head_element}, 'head');
3001 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3002 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
3003
3004 $self->{insertion_mode} = IN_HEAD_IM;
3005 ## Reprocess in the "in head" insertion mode...
3006 }
3007
3008 #
3009 } else {
3010 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3011 #
3012 } else {
3013 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3014 ## Ignore the token
3015 !!!next-token;
3016 redo B;
3017 }
3018 }
3019
3020 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3021 ## As if </noscript>
3022 pop @{$self->{open_elements}};
3023 !!!parse-error (type => 'in noscript:/'.$token->{tag_name});
3024
3025 ## Reprocess in the "in head" insertion mode...
3026 ## As if </head>
3027 pop @{$self->{open_elements}};
3028
3029 ## Reprocess in the "after head" insertion mode...
3030 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3031 ## As if </head>
3032 pop @{$self->{open_elements}};
3033
3034 ## Reprocess in the "after head" insertion mode...
3035 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3036 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3037 ## Ignore the token ## ISSUE: An issue in the spec.
3038 !!!next-token;
3039 redo B;
3040 }
3041
3042 ## "after head" insertion mode
3043 ## As if <body>
3044 !!!insert-element ('body');
3045 $self->{insertion_mode} = IN_BODY_IM;
3046 ## reprocess
3047 redo B;
3048 } else {
3049 die "$0: $token->{type}: Unknown token type";
3050 }
3051
3052 ## ISSUE: An issue in the spec.
3053 } elsif ($self->{insertion_mode} & BODY_IMS) {
3054 if ($token->{type} == CHARACTER_TOKEN) {
3055 ## NOTE: There is a code clone of "character in body".
3056 $reconstruct_active_formatting_elements->($insert_to_current);
3057
3058 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3059
3060 !!!next-token;
3061 redo B;
3062 } elsif ($token->{type} == START_TAG_TOKEN) {
3063 if ({
3064 caption => 1, col => 1, colgroup => 1, tbody => 1,
3065 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
3066 }->{$token->{tag_name}}) {
3067 if ($self->{insertion_mode} == IN_CELL_IM) {
3068 ## have an element in table scope
3069 my $tn;
3070 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3071 my $node = $self->{open_elements}->[$_];
3072 if ($node->[1] eq 'td' or $node->[1] eq 'th') {
3073 $tn = $node->[1];
3074 last INSCOPE;
3075 } elsif ({
3076 table => 1, html => 1,
3077 }->{$node->[1]}) {
3078 last INSCOPE;
3079 }
3080 } # INSCOPE
3081 unless (defined $tn) {
3082 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3083 ## Ignore the token
3084 !!!next-token;
3085 redo B;
3086 }
3087
3088 ## Close the cell
3089 !!!back-token; # <?>
3090 $token = {type => END_TAG_TOKEN, tag_name => $tn};
3091 redo B;
3092 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
3093 !!!parse-error (type => 'not closed:caption');
3094
3095 ## As if </caption>
3096 ## have a table element in table scope
3097 my $i;
3098 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3099 my $node = $self->{open_elements}->[$_];
3100 if ($node->[1] eq 'caption') {
3101 $i = $_;
3102 last INSCOPE;
3103 } elsif ({
3104 table => 1, html => 1,
3105 }->{$node->[1]}) {
3106 last INSCOPE;
3107 }
3108 } # INSCOPE
3109 unless (defined $i) {
3110 !!!parse-error (type => 'unmatched end tag:caption');
3111 ## Ignore the token
3112 !!!next-token;
3113 redo B;
3114 }
3115
3116 ## generate implied end tags
3117 if ({
3118 dd => 1, dt => 1, li => 1, p => 1,
3119 td => 1, th => 1, tr => 1,
3120 tbody => 1, tfoot=> 1, thead => 1,
3121 }->{$self->{open_elements}->[-1]->[1]}) {
3122 !!!back-token; # <?>
3123 $token = {type => END_TAG_TOKEN, tag_name => 'caption'};
3124 !!!back-token;
3125 $token = {type => END_TAG_TOKEN,
3126 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3127 redo B;
3128 }
3129
3130 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3131 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3132 }
3133
3134 splice @{$self->{open_elements}}, $i;
3135
3136 $clear_up_to_marker->();
3137
3138 $self->{insertion_mode} = IN_TABLE_IM;
3139
3140 ## reprocess
3141 redo B;
3142 } else {
3143 #
3144 }
3145 } else {
3146 #
3147 }
3148 } elsif ($token->{type} == END_TAG_TOKEN) {
3149 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
3150 if ($self->{insertion_mode} == IN_CELL_IM) {
3151 ## have an element in table scope
3152 my $i;
3153 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3154 my $node = $self->{open_elements}->[$_];
3155 if ($node->[1] eq $token->{tag_name}) {
3156 $i = $_;
3157 last INSCOPE;
3158 } elsif ({
3159 table => 1, html => 1,
3160 }->{$node->[1]}) {
3161 last INSCOPE;
3162 }
3163 } # INSCOPE
3164 unless (defined $i) {
3165 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3166 ## Ignore the token
3167 !!!next-token;
3168 redo B;
3169 }
3170
3171 ## generate implied end tags
3172 if ({
3173 dd => 1, dt => 1, li => 1, p => 1,
3174 td => ($token->{tag_name} eq 'th'),
3175 th => ($token->{tag_name} eq 'td'),
3176 tr => 1,
3177 tbody => 1, tfoot=> 1, thead => 1,
3178 }->{$self->{open_elements}->[-1]->[1]}) {
3179 !!!back-token;
3180 $token = {type => END_TAG_TOKEN,
3181 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3182 redo B;
3183 }
3184
3185 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
3186 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3187 }
3188
3189 splice @{$self->{open_elements}}, $i;
3190
3191 $clear_up_to_marker->();
3192
3193 $self->{insertion_mode} = IN_ROW_IM;
3194
3195 !!!next-token;
3196 redo B;
3197 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
3198 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3199 ## Ignore the token
3200 !!!next-token;
3201 redo B;
3202 } else {
3203 #
3204 }
3205 } elsif ($token->{tag_name} eq 'caption') {
3206 if ($self->{insertion_mode} == IN_CAPTION_IM) {
3207 ## have a table element in table scope
3208 my $i;
3209 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3210 my $node = $self->{open_elements}->[$_];
3211 if ($node->[1] eq $token->{tag_name}) {
3212 $i = $_;
3213 last INSCOPE;
3214 } elsif ({
3215 table => 1, html => 1,
3216 }->{$node->[1]}) {
3217 last INSCOPE;
3218 }
3219 } # INSCOPE
3220 unless (defined $i) {
3221 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3222 ## Ignore the token
3223 !!!next-token;
3224 redo B;
3225 }
3226
3227 ## generate implied end tags
3228 if ({
3229 dd => 1, dt => 1, li => 1, p => 1,
3230 td => 1, th => 1, tr => 1,
3231 tbody => 1, tfoot=> 1, thead => 1,
3232 }->{$self->{open_elements}->[-1]->[1]}) {
3233 !!!back-token;
3234 $token = {type => END_TAG_TOKEN,
3235 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3236 redo B;
3237 }
3238
3239 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3240 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3241 }
3242
3243 splice @{$self->{open_elements}}, $i;
3244
3245 $clear_up_to_marker->();
3246
3247 $self->{insertion_mode} = IN_TABLE_IM;
3248
3249 !!!next-token;
3250 redo B;
3251 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
3252 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3253 ## Ignore the token
3254 !!!next-token;
3255 redo B;
3256 } else {
3257 #
3258 }
3259 } elsif ({
3260 table => 1, tbody => 1, tfoot => 1,
3261 thead => 1, tr => 1,
3262 }->{$token->{tag_name}} and
3263 $self->{insertion_mode} == IN_CELL_IM) {
3264 ## have an element in table scope
3265 my $i;
3266 my $tn;
3267 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3268 my $node = $self->{open_elements}->[$_];
3269 if ($node->[1] eq $token->{tag_name}) {
3270 $i = $_;
3271 last INSCOPE;
3272 } elsif ($node->[1] eq 'td' or $node->[1] eq 'th') {
3273 $tn = $node->[1];
3274 ## NOTE: There is exactly one |td| or |th| element
3275 ## in scope in the stack of open elements by definition.
3276 } elsif ({
3277 table => 1, html => 1,
3278 }->{$node->[1]}) {
3279 last INSCOPE;
3280 }
3281 } # INSCOPE
3282 unless (defined $i) {
3283 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3284 ## Ignore the token
3285 !!!next-token;
3286 redo B;
3287 }
3288
3289 ## Close the cell
3290 !!!back-token; # </?>
3291 $token = {type => END_TAG_TOKEN, tag_name => $tn};
3292 redo B;
3293 } elsif ($token->{tag_name} eq 'table' and
3294 $self->{insertion_mode} == IN_CAPTION_IM) {
3295 !!!parse-error (type => 'not closed:caption');
3296
3297 ## As if </caption>
3298 ## have a table element in table scope
3299 my $i;
3300 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3301 my $node = $self->{open_elements}->[$_];
3302 if ($node->[1] eq 'caption') {
3303 $i = $_;
3304 last INSCOPE;
3305 } elsif ({
3306 table => 1, html => 1,
3307 }->{$node->[1]}) {
3308 last INSCOPE;
3309 }
3310 } # INSCOPE
3311 unless (defined $i) {
3312 !!!parse-error (type => 'unmatched end tag:caption');
3313 ## Ignore the token
3314 !!!next-token;
3315 redo B;
3316 }
3317
3318 ## generate implied end tags
3319 if ({
3320 dd => 1, dt => 1, li => 1, p => 1,
3321 td => 1, th => 1, tr => 1,
3322 tbody => 1, tfoot=> 1, thead => 1,
3323 }->{$self->{open_elements}->[-1]->[1]}) {
3324 !!!back-token; # </table>
3325 $token = {type => END_TAG_TOKEN, tag_name => 'caption'};
3326 !!!back-token;
3327 $token = {type => END_TAG_TOKEN,
3328 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3329 redo B;
3330 }
3331
3332 if ($self->{open_elements}->[-1]->[1] ne 'caption') {
3333 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3334 }
3335
3336 splice @{$self->{open_elements}}, $i;
3337
3338 $clear_up_to_marker->();
3339
3340 $self->{insertion_mode} = IN_TABLE_IM;
3341
3342 ## reprocess
3343 redo B;
3344 } elsif ({
3345 body => 1, col => 1, colgroup => 1, html => 1,
3346 }->{$token->{tag_name}}) {
3347 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
3348 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3349 ## Ignore the token
3350 !!!next-token;
3351 redo B;
3352 } else {
3353 #
3354 }
3355 } elsif ({
3356 tbody => 1, tfoot => 1,
3357 thead => 1, tr => 1,
3358 }->{$token->{tag_name}} and
3359 $self->{insertion_mode} == IN_CAPTION_IM) {
3360 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3361 ## Ignore the token
3362 !!!next-token;
3363 redo B;
3364 } else {
3365 #
3366 }
3367 } else {
3368 die "$0: $token->{type}: Unknown token type";
3369 }
3370
3371 $insert = $insert_to_current;
3372 #
3373 } elsif ($self->{insertion_mode} & TABLE_IMS) {
3374 if ($token->{type} == CHARACTER_TOKEN) {
3375 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3376 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3377
3378 unless (length $token->{data}) {
3379 !!!next-token;
3380 redo B;
3381 }
3382 }
3383
3384 !!!parse-error (type => 'in table:#character');
3385
3386 ## As if in body, but insert into foster parent element
3387 ## ISSUE: Spec says that "whenever a node would be inserted
3388 ## into the current node" while characters might not be
3389 ## result in a new Text node.
3390 $reconstruct_active_formatting_elements->($insert_to_foster);
3391
3392 if ({
3393 table => 1, tbody => 1, tfoot => 1,
3394 thead => 1, tr => 1,
3395 }->{$self->{open_elements}->[-1]->[1]}) {
3396 # MUST
3397 my $foster_parent_element;
3398 my $next_sibling;
3399 my $prev_sibling;
3400 OE: for (reverse 0..$#{$self->{open_elements}}) {
3401 if ($self->{open_elements}->[$_]->[1] eq 'table') {
3402 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3403 if (defined $parent and $parent->node_type == 1) {
3404 $foster_parent_element = $parent;
3405 $next_sibling = $self->{open_elements}->[$_]->[0];
3406 $prev_sibling = $next_sibling->previous_sibling;
3407 } else {
3408 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
3409 $prev_sibling = $foster_parent_element->last_child;
3410 }
3411 last OE;
3412 }
3413 } # OE
3414 $foster_parent_element = $self->{open_elements}->[0]->[0] and
3415 $prev_sibling = $foster_parent_element->last_child
3416 unless defined $foster_parent_element;
3417 if (defined $prev_sibling and
3418 $prev_sibling->node_type == 3) {
3419 $prev_sibling->manakai_append_text ($token->{data});
3420 } else {
3421 $foster_parent_element->insert_before
3422 ($self->{document}->create_text_node ($token->{data}),
3423 $next_sibling);
3424 }
3425 } else {
3426 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3427 }
3428
3429 !!!next-token;
3430 redo B;
3431 } elsif ($token->{type} == START_TAG_TOKEN) {
3432 if ({
3433 tr => ($self->{insertion_mode} != IN_ROW_IM),
3434 th => 1, td => 1,
3435 }->{$token->{tag_name}}) {
3436 if ($self->{insertion_mode} == IN_TABLE_IM) {
3437 ## Clear back to table context
3438 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3439 $self->{open_elements}->[-1]->[1] ne 'html') {
3440 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3441 pop @{$self->{open_elements}};
3442 }
3443
3444 !!!insert-element ('tbody');
3445 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3446 ## reprocess in the "in table body" insertion mode...
3447 }
3448
3449 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3450 unless ($token->{tag_name} eq 'tr') {
3451 !!!parse-error (type => 'missing start tag:tr');
3452 }
3453
3454 ## Clear back to table body context
3455 while (not {
3456 tbody => 1, tfoot => 1, thead => 1, html => 1,
3457 }->{$self->{open_elements}->[-1]->[1]}) {
3458 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3459 pop @{$self->{open_elements}};
3460 }
3461
3462 $self->{insertion_mode} = IN_ROW_IM;
3463 if ($token->{tag_name} eq 'tr') {
3464 !!!insert-element ($token->{tag_name}, $token->{attributes});
3465 !!!next-token;
3466 redo B;
3467 } else {
3468 !!!insert-element ('tr');
3469 ## reprocess in the "in row" insertion mode
3470 }
3471 }
3472
3473 ## Clear back to table row context
3474 while (not {
3475 tr => 1, html => 1,
3476 }->{$self->{open_elements}->[-1]->[1]}) {
3477 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3478 pop @{$self->{open_elements}};
3479 }
3480
3481 !!!insert-element ($token->{tag_name}, $token->{attributes});
3482 $self->{insertion_mode} = IN_CELL_IM;
3483
3484 push @$active_formatting_elements, ['#marker', ''];
3485
3486 !!!next-token;
3487 redo B;
3488 } elsif ({
3489 caption => 1, col => 1, colgroup => 1,
3490 tbody => 1, tfoot => 1, thead => 1,
3491 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
3492 }->{$token->{tag_name}}) {
3493 if ($self->{insertion_mode} == IN_ROW_IM) {
3494 ## As if </tr>
3495 ## have an element in table scope
3496 my $i;
3497 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3498 my $node = $self->{open_elements}->[$_];
3499 if ($node->[1] eq 'tr') {
3500 $i = $_;
3501 last INSCOPE;
3502 } elsif ({
3503 table => 1, html => 1,
3504 }->{$node->[1]}) {
3505 last INSCOPE;
3506 }
3507 } # INSCOPE
3508 unless (defined $i) {
3509 !!!parse-error (type => 'unmacthed end tag:'.$token->{tag_name});
3510 ## Ignore the token
3511 !!!next-token;
3512 redo B;
3513 }
3514
3515 ## Clear back to table row context
3516 while (not {
3517 tr => 1, html => 1,
3518 }->{$self->{open_elements}->[-1]->[1]}) {
3519 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3520 pop @{$self->{open_elements}};
3521 }
3522
3523 pop @{$self->{open_elements}}; # tr
3524 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3525 if ($token->{tag_name} eq 'tr') {
3526 ## reprocess
3527 redo B;
3528 } else {
3529 ## reprocess in the "in table body" insertion mode...
3530 }
3531 }
3532
3533 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3534 ## have an element in table scope
3535 my $i;
3536 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3537 my $node = $self->{open_elements}->[$_];
3538 if ({
3539 tbody => 1, thead => 1, tfoot => 1,
3540 }->{$node->[1]}) {
3541 $i = $_;
3542 last INSCOPE;
3543 } elsif ({
3544 table => 1, html => 1,
3545 }->{$node->[1]}) {
3546 last INSCOPE;
3547 }
3548 } # INSCOPE
3549 unless (defined $i) {
3550 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3551 ## Ignore the token
3552 !!!next-token;
3553 redo B;
3554 }
3555
3556 ## Clear back to table body context
3557 while (not {
3558 tbody => 1, tfoot => 1, thead => 1, html => 1,
3559 }->{$self->{open_elements}->[-1]->[1]}) {
3560 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3561 pop @{$self->{open_elements}};
3562 }
3563
3564 ## As if <{current node}>
3565 ## have an element in table scope
3566 ## true by definition
3567
3568 ## Clear back to table body context
3569 ## nop by definition
3570
3571 pop @{$self->{open_elements}};
3572 $self->{insertion_mode} = IN_TABLE_IM;
3573 ## reprocess in "in table" insertion mode...
3574 }
3575
3576 if ($token->{tag_name} eq 'col') {
3577 ## Clear back to table context
3578 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3579 $self->{open_elements}->[-1]->[1] ne 'html') {
3580 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3581 pop @{$self->{open_elements}};
3582 }
3583
3584 !!!insert-element ('colgroup');
3585 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
3586 ## reprocess
3587 redo B;
3588 } elsif ({
3589 caption => 1,
3590 colgroup => 1,
3591 tbody => 1, tfoot => 1, thead => 1,
3592 }->{$token->{tag_name}}) {
3593 ## Clear back to table context
3594 while ($self->{open_elements}->[-1]->[1] ne 'table' and
3595 $self->{open_elements}->[-1]->[1] ne 'html') {
3596 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3597 pop @{$self->{open_elements}};
3598 }
3599
3600 push @$active_formatting_elements, ['#marker', '']
3601 if $token->{tag_name} eq 'caption';
3602
3603 !!!insert-element ($token->{tag_name}, $token->{attributes});
3604 $self->{insertion_mode} = {
3605 caption => IN_CAPTION_IM,
3606 colgroup => IN_COLUMN_GROUP_IM,
3607 tbody => IN_TABLE_BODY_IM,
3608 tfoot => IN_TABLE_BODY_IM,
3609 thead => IN_TABLE_BODY_IM,
3610 }->{$token->{tag_name}};
3611 !!!next-token;
3612 redo B;
3613 } else {
3614 die "$0: in table: <>: $token->{tag_name}";
3615 }
3616 } elsif ($token->{tag_name} eq 'table') {
3617 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3618
3619 ## As if </table>
3620 ## have a table element in table scope
3621 my $i;
3622 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3623 my $node = $self->{open_elements}->[$_];
3624 if ($node->[1] eq 'table') {
3625 $i = $_;
3626 last INSCOPE;
3627 } elsif ({
3628 table => 1, html => 1,
3629 }->{$node->[1]}) {
3630 last INSCOPE;
3631 }
3632 } # INSCOPE
3633 unless (defined $i) {
3634 !!!parse-error (type => 'unmatched end tag:table');
3635 ## Ignore tokens </table><table>
3636 !!!next-token;
3637 redo B;
3638 }
3639
3640 ## generate implied end tags
3641 if ({
3642 dd => 1, dt => 1, li => 1, p => 1,
3643 td => 1, th => 1, tr => 1,
3644 tbody => 1, tfoot=> 1, thead => 1,
3645 }->{$self->{open_elements}->[-1]->[1]}) {
3646 !!!back-token; # <table>
3647 $token = {type => END_TAG_TOKEN, tag_name => 'table'};
3648 !!!back-token;
3649 $token = {type => END_TAG_TOKEN,
3650 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3651 redo B;
3652 }
3653
3654 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3655 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3656 }
3657
3658 splice @{$self->{open_elements}}, $i;
3659
3660 $self->_reset_insertion_mode;
3661
3662 ## reprocess
3663 redo B;
3664 } else {
3665 !!!parse-error (type => 'in table:'.$token->{tag_name});
3666
3667 $insert = $insert_to_foster;
3668 #
3669 }
3670 } elsif ($token->{type} == END_TAG_TOKEN) {
3671 if ($token->{tag_name} eq 'tr' and
3672 $self->{insertion_mode} == IN_ROW_IM) {
3673 ## have an element in table scope
3674 my $i;
3675 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3676 my $node = $self->{open_elements}->[$_];
3677 if ($node->[1] eq $token->{tag_name}) {
3678 $i = $_;
3679 last INSCOPE;
3680 } elsif ({
3681 table => 1, html => 1,
3682 }->{$node->[1]}) {
3683 last INSCOPE;
3684 }
3685 } # INSCOPE
3686 unless (defined $i) {
3687 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3688 ## Ignore the token
3689 !!!next-token;
3690 redo B;
3691 }
3692
3693 ## Clear back to table row context
3694 while (not {
3695 tr => 1, html => 1,
3696 }->{$self->{open_elements}->[-1]->[1]}) {
3697 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3698 pop @{$self->{open_elements}};
3699 }
3700
3701 pop @{$self->{open_elements}}; # tr
3702 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3703 !!!next-token;
3704 redo B;
3705 } elsif ($token->{tag_name} eq 'table') {
3706 if ($self->{insertion_mode} == IN_ROW_IM) {
3707 ## As if </tr>
3708 ## have an element in table scope
3709 my $i;
3710 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3711 my $node = $self->{open_elements}->[$_];
3712 if ($node->[1] eq 'tr') {
3713 $i = $_;
3714 last INSCOPE;
3715 } elsif ({
3716 table => 1, html => 1,
3717 }->{$node->[1]}) {
3718 last INSCOPE;
3719 }
3720 } # INSCOPE
3721 unless (defined $i) {
3722 !!!parse-error (type => 'unmatched end tag:'.$token->{type});
3723 ## Ignore the token
3724 !!!next-token;
3725 redo B;
3726 }
3727
3728 ## Clear back to table row context
3729 while (not {
3730 tr => 1, html => 1,
3731 }->{$self->{open_elements}->[-1]->[1]}) {
3732 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3733 pop @{$self->{open_elements}};
3734 }
3735
3736 pop @{$self->{open_elements}}; # tr
3737 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3738 ## reprocess in the "in table body" insertion mode...
3739 }
3740
3741 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
3742 ## have an element in table scope
3743 my $i;
3744 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3745 my $node = $self->{open_elements}->[$_];
3746 if ({
3747 tbody => 1, thead => 1, tfoot => 1,
3748 }->{$node->[1]}) {
3749 $i = $_;
3750 last INSCOPE;
3751 } elsif ({
3752 table => 1, html => 1,
3753 }->{$node->[1]}) {
3754 last INSCOPE;
3755 }
3756 } # INSCOPE
3757 unless (defined $i) {
3758 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3759 ## Ignore the token
3760 !!!next-token;
3761 redo B;
3762 }
3763
3764 ## Clear back to table body context
3765 while (not {
3766 tbody => 1, tfoot => 1, thead => 1, html => 1,
3767 }->{$self->{open_elements}->[-1]->[1]}) {
3768 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3769 pop @{$self->{open_elements}};
3770 }
3771
3772 ## As if <{current node}>
3773 ## have an element in table scope
3774 ## true by definition
3775
3776 ## Clear back to table body context
3777 ## nop by definition
3778
3779 pop @{$self->{open_elements}};
3780 $self->{insertion_mode} = IN_TABLE_IM;
3781 ## reprocess in the "in table" insertion mode...
3782 }
3783
3784 ## have a table element in table scope
3785 my $i;
3786 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3787 my $node = $self->{open_elements}->[$_];
3788 if ($node->[1] eq $token->{tag_name}) {
3789 $i = $_;
3790 last INSCOPE;
3791 } elsif ({
3792 table => 1, html => 1,
3793 }->{$node->[1]}) {
3794 last INSCOPE;
3795 }
3796 } # INSCOPE
3797 unless (defined $i) {
3798 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3799 ## Ignore the token
3800 !!!next-token;
3801 redo B;
3802 }
3803
3804 ## generate implied end tags
3805 if ({
3806 dd => 1, dt => 1, li => 1, p => 1,
3807 td => 1, th => 1, tr => 1,
3808 tbody => 1, tfoot=> 1, thead => 1,
3809 }->{$self->{open_elements}->[-1]->[1]}) {
3810 !!!back-token;
3811 $token = {type => END_TAG_TOKEN,
3812 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
3813 redo B;
3814 }
3815
3816 if ($self->{open_elements}->[-1]->[1] ne 'table') {
3817 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3818 }
3819
3820 splice @{$self->{open_elements}}, $i;
3821
3822 $self->_reset_insertion_mode;
3823
3824 !!!next-token;
3825 redo B;
3826 } elsif ({
3827 tbody => 1, tfoot => 1, thead => 1,
3828 }->{$token->{tag_name}} and
3829 $self->{insertion_mode} & ROW_IMS) {
3830 if ($self->{insertion_mode} == IN_ROW_IM) {
3831 ## have an element in table scope
3832 my $i;
3833 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3834 my $node = $self->{open_elements}->[$_];
3835 if ($node->[1] eq $token->{tag_name}) {
3836 $i = $_;
3837 last INSCOPE;
3838 } elsif ({
3839 table => 1, html => 1,
3840 }->{$node->[1]}) {
3841 last INSCOPE;
3842 }
3843 } # INSCOPE
3844 unless (defined $i) {
3845 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3846 ## Ignore the token
3847 !!!next-token;
3848 redo B;
3849 }
3850
3851 ## As if </tr>
3852 ## have an element in table scope (reset and reuse the $i declared for the check above)
3853 undef $i;
3854 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3855 my $node = $self->{open_elements}->[$_];
3856 if ($node->[1] eq 'tr') {
3857 $i = $_;
3858 last INSCOPE;
3859 } elsif ({
3860 table => 1, html => 1,
3861 }->{$node->[1]}) {
3862 last INSCOPE;
3863 }
3864 } # INSCOPE
3865 unless (defined $i) {
3866 !!!parse-error (type => 'unmatched end tag:tr');
3867 ## Ignore the token
3868 !!!next-token;
3869 redo B;
3870 }
3871
3872 ## Clear back to table row context
3873 while (not {
3874 tr => 1, html => 1,
3875 }->{$self->{open_elements}->[-1]->[1]}) {
3876 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3877 pop @{$self->{open_elements}};
3878 }
3879
3880 pop @{$self->{open_elements}}; # tr
3881 $self->{insertion_mode} = IN_TABLE_BODY_IM;
3882 ## reprocess in the "in table body" insertion mode...
3883 }
3884
3885 ## have an element in table scope
3886 my $i;
3887 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3888 my $node = $self->{open_elements}->[$_];
3889 if ($node->[1] eq $token->{tag_name}) {
3890 $i = $_;
3891 last INSCOPE;
3892 } elsif ({
3893 table => 1, html => 1,
3894 }->{$node->[1]}) {
3895 last INSCOPE;
3896 }
3897 } # INSCOPE
3898 unless (defined $i) {
3899 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3900 ## Ignore the token
3901 !!!next-token;
3902 redo B;
3903 }
3904
3905 ## Clear back to table body context
3906 while (not {
3907 tbody => 1, tfoot => 1, thead => 1, html => 1,
3908 }->{$self->{open_elements}->[-1]->[1]}) {
3909 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3910 pop @{$self->{open_elements}};
3911 }
3912
3913 pop @{$self->{open_elements}};
3914 $self->{insertion_mode} = IN_TABLE_IM;
3915 !!!next-token;
3916 redo B;
3917 } elsif ({
3918 body => 1, caption => 1, col => 1, colgroup => 1,
3919 html => 1, td => 1, th => 1,
3920 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
3921 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
3922 }->{$token->{tag_name}}) {
3923 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
3924 ## Ignore the token
3925 !!!next-token;
3926 redo B;
3927 } else {
3928 !!!parse-error (type => 'in table:/'.$token->{tag_name});
3929
3930 $insert = $insert_to_foster;
3931 #
3932 }
3933 } else {
3934 die "$0: $token->{type}: Unknown token type";
3935 }
3936 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
3937 if ($token->{type} == CHARACTER_TOKEN) {
3938 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3939 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3940 unless (length $token->{data}) {
3941 !!!next-token;
3942 redo B;
3943 }
3944 }
3945
3946 #
3947 } elsif ($token->{type} == START_TAG_TOKEN) {
3948 if ($token->{tag_name} eq 'col') {
3949 !!!insert-element ($token->{tag_name}, $token->{attributes});
3950 pop @{$self->{open_elements}};
3951 !!!next-token;
3952 redo B;
3953 } else {
3954 #
3955 }
3956 } elsif ($token->{type} == END_TAG_TOKEN) {
3957 if ($token->{tag_name} eq 'colgroup') {
3958 if ($self->{open_elements}->[-1]->[1] eq 'html') {
3959 !!!parse-error (type => 'unmatched end tag:colgroup');
3960 ## Ignore the token
3961 !!!next-token;
3962 redo B;
3963 } else {
3964 pop @{$self->{open_elements}}; # colgroup
3965 $self->{insertion_mode} = IN_TABLE_IM;
3966 !!!next-token;
3967 redo B;
3968 }
3969 } elsif ($token->{tag_name} eq 'col') {
3970 !!!parse-error (type => 'unmatched end tag:col');
3971 ## Ignore the token
3972 !!!next-token;
3973 redo B;
3974 } else {
3975 #
3976 }
3977 } else {
3978 #
3979 }
3980
3981 ## As if </colgroup>
3982 if ($self->{open_elements}->[-1]->[1] eq 'html') {
3983 !!!parse-error (type => 'unmatched end tag:colgroup');
3984 ## Ignore the token
3985 !!!next-token;
3986 redo B;
3987 } else {
3988 pop @{$self->{open_elements}}; # colgroup
3989 $self->{insertion_mode} = IN_TABLE_IM;
3990 ## reprocess
3991 redo B;
3992 }
3993 } elsif ($self->{insertion_mode} == IN_SELECT_IM) {
3994 if ($token->{type} == CHARACTER_TOKEN) {
3995 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3996 !!!next-token;
3997 redo B;
3998 } elsif ($token->{type} == START_TAG_TOKEN) {
3999 if ($token->{tag_name} eq 'option') {
4000 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4001 ## As if </option>
4002 pop @{$self->{open_elements}};
4003 }
4004
4005 !!!insert-element ($token->{tag_name}, $token->{attributes});
4006 !!!next-token;
4007 redo B;
4008 } elsif ($token->{tag_name} eq 'optgroup') {
4009 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4010 ## As if </option>
4011 pop @{$self->{open_elements}};
4012 }
4013
4014 if ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4015 ## As if </optgroup>
4016 pop @{$self->{open_elements}};
4017 }
4018
4019 !!!insert-element ($token->{tag_name}, $token->{attributes});
4020 !!!next-token;
4021 redo B;
4022 } elsif ($token->{tag_name} eq 'select') {
4023 !!!parse-error (type => 'not closed:select');
4024 ## As if </select> instead
4025 ## have an element in table scope
4026 my $i;
4027 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4028 my $node = $self->{open_elements}->[$_];
4029 if ($node->[1] eq $token->{tag_name}) {
4030 $i = $_;
4031 last INSCOPE;
4032 } elsif ({
4033 table => 1, html => 1,
4034 }->{$node->[1]}) {
4035 last INSCOPE;
4036 }
4037 } # INSCOPE
4038 unless (defined $i) {
4039 !!!parse-error (type => 'unmatched end tag:select');
4040 ## Ignore the token
4041 !!!next-token;
4042 redo B;
4043 }
4044
4045 splice @{$self->{open_elements}}, $i;
4046
4047 $self->_reset_insertion_mode;
4048
4049 !!!next-token;
4050 redo B;
4051 } else {
4052 !!!parse-error (type => 'in select:'.$token->{tag_name});
4053 ## Ignore the token
4054 !!!next-token;
4055 redo B;
4056 }
4057 } elsif ($token->{type} == END_TAG_TOKEN) {
4058 if ($token->{tag_name} eq 'optgroup') {
4059 if ($self->{open_elements}->[-1]->[1] eq 'option' and
4060 $self->{open_elements}->[-2]->[1] eq 'optgroup') {
4061 ## As if </option>
4062 splice @{$self->{open_elements}}, -2;
4063 } elsif ($self->{open_elements}->[-1]->[1] eq 'optgroup') {
4064 pop @{$self->{open_elements}};
4065 } else {
4066 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4067 ## Ignore the token
4068 }
4069 !!!next-token;
4070 redo B;
4071 } elsif ($token->{tag_name} eq 'option') {
4072 if ($self->{open_elements}->[-1]->[1] eq 'option') {
4073 pop @{$self->{open_elements}};
4074 } else {
4075 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4076 ## Ignore the token
4077 }
4078 !!!next-token;
4079 redo B;
4080 } elsif ($token->{tag_name} eq 'select') {
4081 ## have an element in table scope
4082 my $i;
4083 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4084 my $node = $self->{open_elements}->[$_];
4085 if ($node->[1] eq $token->{tag_name}) {
4086 $i = $_;
4087 last INSCOPE;
4088 } elsif ({
4089 table => 1, html => 1,
4090 }->{$node->[1]}) {
4091 last INSCOPE;
4092 }
4093 } # INSCOPE
4094 unless (defined $i) {
4095 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4096 ## Ignore the token
4097 !!!next-token;
4098 redo B;
4099 }
4100
4101 splice @{$self->{open_elements}}, $i;
4102
4103 $self->_reset_insertion_mode;
4104
4105 !!!next-token;
4106 redo B;
4107 } elsif ({
4108 caption => 1, table => 1, tbody => 1,
4109 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
4110 }->{$token->{tag_name}}) {
4111 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4112
4113 ## have an element in table scope
4114 my $i;
4115 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4116 my $node = $self->{open_elements}->[$_];
4117 if ($node->[1] eq $token->{tag_name}) {
4118 $i = $_;
4119 last INSCOPE;
4120 } elsif ({
4121 table => 1, html => 1,
4122 }->{$node->[1]}) {
4123 last INSCOPE;
4124 }
4125 } # INSCOPE
4126 unless (defined $i) {
4127 ## Ignore the token
4128 !!!next-token;
4129 redo B;
4130 }
4131
4132 ## As if </select>
4133 ## have an element in table scope
4134 undef $i;
4135 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4136 my $node = $self->{open_elements}->[$_];
4137 if ($node->[1] eq 'select') {
4138 $i = $_;
4139 last INSCOPE;
4140 } elsif ({
4141 table => 1, html => 1,
4142 }->{$node->[1]}) {
4143 last INSCOPE;
4144 }
4145 } # INSCOPE
4146 unless (defined $i) {
4147 !!!parse-error (type => 'unmatched end tag:select');
4148 ## Ignore the </select> token
4149 !!!next-token; ## TODO: ok?
4150 redo B;
4151 }
4152
4153 splice @{$self->{open_elements}}, $i;
4154
4155 $self->_reset_insertion_mode;
4156
4157 ## reprocess
4158 redo B;
4159 } else {
4160 !!!parse-error (type => 'in select:/'.$token->{tag_name});
4161 ## Ignore the token
4162 !!!next-token;
4163 redo B;
4164 }
4165 } else {
4166 die "$0: $token->{type}: Unknown token type";
4167 }
4168 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
4169 if ($token->{type} == CHARACTER_TOKEN) {
4170 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4171 my $data = $1;
4172 ## As if in body: reconstruct formatting elements, then append the saved leading white space
4173 $reconstruct_active_formatting_elements->($insert_to_current);
4174
4175 $self->{open_elements}->[-1]->[0]->manakai_append_text ($data);
4176
4177 unless (length $token->{data}) {
4178 !!!next-token;
4179 redo B;
4180 }
4181 }
4182
4183 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4184 !!!parse-error (type => 'after html:#character');
4185
4186 ## Reprocess in the "main" phase, "after body" insertion mode...
4187 }
4188
4189 ## "after body" insertion mode
4190 !!!parse-error (type => 'after body:#character');
4191
4192 $self->{insertion_mode} = IN_BODY_IM;
4193 ## reprocess
4194 redo B;
4195 } elsif ($token->{type} == START_TAG_TOKEN) {
4196 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4197 !!!parse-error (type => 'after html:'.$token->{tag_name});
4198
4199 ## Reprocess in the "main" phase, "after body" insertion mode...
4200 }
4201
4202 ## "after body" insertion mode
4203 !!!parse-error (type => 'after body:'.$token->{tag_name});
4204
4205 $self->{insertion_mode} = IN_BODY_IM;
4206 ## reprocess
4207 redo B;
4208 } elsif ($token->{type} == END_TAG_TOKEN) {
4209 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4210 !!!parse-error (type => 'after html:/'.$token->{tag_name});
4211
4212 $self->{insertion_mode} = AFTER_BODY_IM;
4213 ## Reprocess in the "main" phase, "after body" insertion mode...
4214 }
4215
4216 ## "after body" insertion mode
4217 if ($token->{tag_name} eq 'html') {
4218 if (defined $self->{inner_html_node}) {
4219 !!!parse-error (type => 'unmatched end tag:html');
4220 ## Ignore the token
4221 !!!next-token;
4222 redo B;
4223 } else {
4224 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
4225 !!!next-token;
4226 redo B;
4227 }
4228 } else {
4229 !!!parse-error (type => 'after body:/'.$token->{tag_name});
4230
4231 $self->{insertion_mode} = IN_BODY_IM;
4232 ## reprocess
4233 redo B;
4234 }
4235 } else {
4236 die "$0: $token->{type}: Unknown token type";
4237 }
4238 } elsif ($self->{insertion_mode} & FRAME_IMS) {
4239 if ($token->{type} == CHARACTER_TOKEN) {
4240 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4241 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4242
4243 unless (length $token->{data}) {
4244 !!!next-token;
4245 redo B;
4246 }
4247 }
4248
4249 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
4250 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4251 !!!parse-error (type => 'in frameset:#character');
4252 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
4253 !!!parse-error (type => 'after frameset:#character');
4254 } else { # "after html frameset"
4255 !!!parse-error (type => 'after html:#character');
4256
4257 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4258 ## Reprocess in the "main" phase, "after frameset"...
4259 !!!parse-error (type => 'after frameset:#character');
4260 }
4261
4262 ## Ignore the token.
4263 if (length $token->{data}) {
4264 ## reprocess the rest of characters
4265 } else {
4266 !!!next-token;
4267 }
4268 redo B;
4269 }
4270
4271 die qq[$0: Character "$token->{data}"];
4272 } elsif ($token->{type} == START_TAG_TOKEN) {
4273 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4274 !!!parse-error (type => 'after html:'.$token->{tag_name});
4275
4276 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4277 ## Process in the "main" phase, "after frameset" insertion mode...
4278 }
4279
4280 if ($token->{tag_name} eq 'frameset' and
4281 $self->{insertion_mode} == IN_FRAMESET_IM) {
4282 !!!insert-element ($token->{tag_name}, $token->{attributes});
4283 !!!next-token;
4284 redo B;
4285 } elsif ($token->{tag_name} eq 'frame' and
4286 $self->{insertion_mode} == IN_FRAMESET_IM) {
4287 !!!insert-element ($token->{tag_name}, $token->{attributes});
4288 pop @{$self->{open_elements}};
4289 !!!next-token;
4290 redo B;
4291 } elsif ($token->{tag_name} eq 'noframes') {
4292 ## NOTE: As if in body.
4293 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
4294 redo B;
4295 } else {
4296 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4297 !!!parse-error (type => 'in frameset:'.$token->{tag_name});
4298 } else {
4299 !!!parse-error (type => 'after frameset:'.$token->{tag_name});
4300 }
4301 ## Ignore the token
4302 !!!next-token;
4303 redo B;
4304 }
4305 } elsif ($token->{type} == END_TAG_TOKEN) {
4306 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4307 !!!parse-error (type => 'after html:/'.$token->{tag_name});
4308
4309 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4310 ## Process in the "main" phase, "after frameset" insertion mode...
4311 }
4312
4313 if ($token->{tag_name} eq 'frameset' and
4314 $self->{insertion_mode} == IN_FRAMESET_IM) {
4315 if ($self->{open_elements}->[-1]->[1] eq 'html' and
4316 @{$self->{open_elements}} == 1) {
4317 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4318 ## Ignore the token
4319 !!!next-token;
4320 } else {
4321 pop @{$self->{open_elements}};
4322 !!!next-token;
4323 }
4324
4325 if (not defined $self->{inner_html_node} and
4326 $self->{open_elements}->[-1]->[1] ne 'frameset') {
4327 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4328 }
4329 redo B;
4330 } elsif ($token->{tag_name} eq 'html' and
4331 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
4332 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
4333 !!!next-token;
4334 redo B;
4335 } else {
4336 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
4337 !!!parse-error (type => 'in frameset:/'.$token->{tag_name});
4338 } else {
4339 !!!parse-error (type => 'after frameset:/'.$token->{tag_name});
4340 }
4341 ## Ignore the token
4342 !!!next-token;
4343 redo B;
4344 }
4345 } else {
4346 die "$0: $token->{type}: Unknown token type";
4347 }
4348
4349 ## ISSUE: An issue in spec here
4350 } else {
4351 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4352 }
4353
4354 ## "in body" insertion mode
4355 if ($token->{type} == START_TAG_TOKEN) {
4356 if ($token->{tag_name} eq 'script') {
4357 ## NOTE: This is an "as if in head" code clone
4358 $script_start_tag->($insert);
4359 redo B;
4360 } elsif ($token->{tag_name} eq 'style') {
4361 ## NOTE: This is an "as if in head" code clone
4362 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4363 redo B;
4364 } elsif ({
4365 base => 1, link => 1,
4366 }->{$token->{tag_name}}) {
4367 ## NOTE: This is an "as if in head" code clone, only "-t" differs
4368 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4369 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4370 !!!next-token;
4371 redo B;
4372 } elsif ($token->{tag_name} eq 'meta') {
4373 ## NOTE: This is an "as if in head" code clone, only "-t" differs
4374 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4375 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4376
4377 unless ($self->{confident}) {
4378 my $charset;
4379 if ($token->{attributes}->{charset}) { ## TODO: And if supported
4380 $charset = $token->{attributes}->{charset}->{value};
4381 }
4382 if ($token->{attributes}->{'http-equiv'}) {
4383 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
4384 if ($token->{attributes}->{'http-equiv'}->{value}
4385 =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
4386 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4387 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
4388 $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
4389 } ## TODO: And if supported
4390 }
4391 ## TODO: Change the encoding
4392 }
4393
4394 !!!next-token;
4395 redo B;
4396 } elsif ($token->{tag_name} eq 'title') {
4397 !!!parse-error (type => 'in body:title');
4398 ## NOTE: This is an "as if in head" code clone
4399 $parse_rcdata->(RCDATA_CONTENT_MODEL, sub {
4400 if (defined $self->{head_element}) {
4401 $self->{head_element}->append_child ($_[0]);
4402 } else {
4403 $insert->($_[0]);
4404 }
4405 });
4406 redo B;
4407 } elsif ($token->{tag_name} eq 'body') {
4408 !!!parse-error (type => 'in body:body');
4409
4410 if (@{$self->{open_elements}} == 1 or
4411 $self->{open_elements}->[1]->[1] ne 'body') {
4412 ## Ignore the token
4413 } else {
4414 my $body_el = $self->{open_elements}->[1]->[0];
4415 for my $attr_name (keys %{$token->{attributes}}) {
4416 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
4417 $body_el->set_attribute_ns
4418 (undef, [undef, $attr_name],
4419 $token->{attributes}->{$attr_name}->{value});
4420 }
4421 }
4422 }
4423 !!!next-token;
4424 redo B;
4425 } elsif ({
4426 address => 1, blockquote => 1, center => 1, dir => 1,
4427 div => 1, dl => 1, fieldset => 1, listing => 1,
4428 menu => 1, ol => 1, p => 1, ul => 1,
4429 pre => 1,
4430 }->{$token->{tag_name}}) {
4431 ## has a p element in scope
4432 INSCOPE: for (reverse @{$self->{open_elements}}) {
4433 if ($_->[1] eq 'p') {
4434 !!!back-token;
4435 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4436 redo B;
4437 } elsif ({
4438 table => 1, caption => 1, td => 1, th => 1,
4439 button => 1, marquee => 1, object => 1, html => 1,
4440 }->{$_->[1]}) {
4441 last INSCOPE;
4442 }
4443 } # INSCOPE
4444
4445 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4446 if ($token->{tag_name} eq 'pre') {
4447 !!!next-token;
4448 if ($token->{type} == CHARACTER_TOKEN) {
4449 $token->{data} =~ s/^\x0A//;
4450 unless (length $token->{data}) {
4451 !!!next-token;
4452 }
4453 }
4454 } else {
4455 !!!next-token;
4456 }
4457 redo B;
4458 } elsif ($token->{tag_name} eq 'form') {
4459 if (defined $self->{form_element}) {
4460 !!!parse-error (type => 'in form:form');
4461 ## Ignore the token
4462 !!!next-token;
4463 redo B;
4464 } else {
4465 ## has a p element in scope
4466 INSCOPE: for (reverse @{$self->{open_elements}}) {
4467 if ($_->[1] eq 'p') {
4468 !!!back-token;
4469 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4470 redo B;
4471 } elsif ({
4472 table => 1, caption => 1, td => 1, th => 1,
4473 button => 1, marquee => 1, object => 1, html => 1,
4474 }->{$_->[1]}) {
4475 last INSCOPE;
4476 }
4477 } # INSCOPE
4478
4479 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4480 $self->{form_element} = $self->{open_elements}->[-1]->[0];
4481 !!!next-token;
4482 redo B;
4483 }
4484 } elsif ($token->{tag_name} eq 'li') {
4485 ## has a p element in scope
4486 INSCOPE: for (reverse @{$self->{open_elements}}) {
4487 if ($_->[1] eq 'p') {
4488 !!!back-token;
4489 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4490 redo B;
4491 } elsif ({
4492 table => 1, caption => 1, td => 1, th => 1,
4493 button => 1, marquee => 1, object => 1, html => 1,
4494 }->{$_->[1]}) {
4495 last INSCOPE;
4496 }
4497 } # INSCOPE
4498
4499 ## Step 1
4500 my $i = -1;
4501 my $node = $self->{open_elements}->[$i];
4502 LI: {
4503 ## Step 2
4504 if ($node->[1] eq 'li') {
4505 if ($i != -1) {
4506 !!!parse-error (type => 'end tag missing:'.
4507 $self->{open_elements}->[-1]->[1]);
4508 }
4509 splice @{$self->{open_elements}}, $i;
4510 last LI;
4511 }
4512
4513 ## Step 3
4514 if (not $formatting_category->{$node->[1]} and
4515 #not $phrasing_category->{$node->[1]} and
4516 ($special_category->{$node->[1]} or
4517 $scoping_category->{$node->[1]}) and
4518 $node->[1] ne 'address' and $node->[1] ne 'div') {
4519 last LI;
4520 }
4521
4522 ## Step 4
4523 $i--;
4524 $node = $self->{open_elements}->[$i];
4525 redo LI;
4526 } # LI
4527
4528 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4529 !!!next-token;
4530 redo B;
4531 } elsif ($token->{tag_name} eq 'dd' or $token->{tag_name} eq 'dt') {
4532 ## has a p element in scope
4533 INSCOPE: for (reverse @{$self->{open_elements}}) {
4534 if ($_->[1] eq 'p') {
4535 !!!back-token;
4536 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4537 redo B;
4538 } elsif ({
4539 table => 1, caption => 1, td => 1, th => 1,
4540 button => 1, marquee => 1, object => 1, html => 1,
4541 }->{$_->[1]}) {
4542 last INSCOPE;
4543 }
4544 } # INSCOPE
4545
4546 ## Step 1
4547 my $i = -1;
4548 my $node = $self->{open_elements}->[$i];
4549 LI: {
4550 ## Step 2
4551 if ($node->[1] eq 'dt' or $node->[1] eq 'dd') {
4552 if ($i != -1) {
4553 !!!parse-error (type => 'end tag missing:'.
4554 $self->{open_elements}->[-1]->[1]);
4555 }
4556 splice @{$self->{open_elements}}, $i;
4557 last LI;
4558 }
4559
4560 ## Step 3
4561 if (not $formatting_category->{$node->[1]} and
4562 #not $phrasing_category->{$node->[1]} and
4563 ($special_category->{$node->[1]} or
4564 $scoping_category->{$node->[1]}) and
4565 $node->[1] ne 'address' and $node->[1] ne 'div') {
4566 last LI;
4567 }
4568
4569 ## Step 4
4570 $i--;
4571 $node = $self->{open_elements}->[$i];
4572 redo LI;
4573 } # LI
4574
4575 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4576 !!!next-token;
4577 redo B;
4578 } elsif ($token->{tag_name} eq 'plaintext') {
4579 ## has a p element in scope
4580 INSCOPE: for (reverse @{$self->{open_elements}}) {
4581 if ($_->[1] eq 'p') {
4582 !!!back-token;
4583 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4584 redo B;
4585 } elsif ({
4586 table => 1, caption => 1, td => 1, th => 1,
4587 button => 1, marquee => 1, object => 1, html => 1,
4588 }->{$_->[1]}) {
4589 last INSCOPE;
4590 }
4591 } # INSCOPE
4592
4593 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4594
4595 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
4596
4597 !!!next-token;
4598 redo B;
4599 } elsif ({
4600 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
4601 }->{$token->{tag_name}}) {
4602 ## has a p element in scope
4603 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4604 my $node = $self->{open_elements}->[$_];
4605 if ($node->[1] eq 'p') {
4606 !!!back-token;
4607 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4608 redo B;
4609 } elsif ({
4610 table => 1, caption => 1, td => 1, th => 1,
4611 button => 1, marquee => 1, object => 1, html => 1,
4612 }->{$node->[1]}) {
4613 last INSCOPE;
4614 }
4615 } # INSCOPE
4616
4617 ## NOTE: See <http://html5.org/tools/web-apps-tracker?from=925&to=926>
4618 ## has an element in scope
4619 #my $i;
4620 #INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4621 # my $node = $self->{open_elements}->[$_];
4622 # if ({
4623 # h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
4624 # }->{$node->[1]}) {
4625 # $i = $_;
4626 # last INSCOPE;
4627 # } elsif ({
4628 # table => 1, caption => 1, td => 1, th => 1,
4629 # button => 1, marquee => 1, object => 1, html => 1,
4630 # }->{$node->[1]}) {
4631 # last INSCOPE;
4632 # }
4633 #} # INSCOPE
4634 #
4635 #if (defined $i) {
4636 # !!! parse-error (type => 'in hn:hn');
4637 # splice @{$self->{open_elements}}, $i;
4638 #}
4639
4640 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4641
4642 !!!next-token;
4643 redo B;
4644 } elsif ($token->{tag_name} eq 'a') {
4645 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
4646 my $node = $active_formatting_elements->[$i];
4647 if ($node->[1] eq 'a') {
4648 !!!parse-error (type => 'in a:a');
4649
4650 !!!back-token;
4651 $token = {type => END_TAG_TOKEN, tag_name => 'a'};
4652 $formatting_end_tag->($token->{tag_name});
4653
4654 AFE2: for (reverse 0..$#$active_formatting_elements) {
4655 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
4656 splice @$active_formatting_elements, $_, 1;
4657 last AFE2;
4658 }
4659 } # AFE2
4660 OE: for (reverse 0..$#{$self->{open_elements}}) {
4661 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
4662 splice @{$self->{open_elements}}, $_, 1;
4663 last OE;
4664 }
4665 } # OE
4666 last AFE;
4667 } elsif ($node->[0] eq '#marker') {
4668 last AFE;
4669 }
4670 } # AFE
4671
4672 $reconstruct_active_formatting_elements->($insert_to_current);
4673
4674 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4675 push @$active_formatting_elements, $self->{open_elements}->[-1];
4676
4677 !!!next-token;
4678 redo B;
4679 } elsif ({
4680 b => 1, big => 1, em => 1, font => 1, i => 1,
4681 s => 1, small => 1, strike => 1,
4682 strong => 1, tt => 1, u => 1,
4683 }->{$token->{tag_name}}) { ## formatting element start tag: insert it and record it as active
4684 $reconstruct_active_formatting_elements->($insert_to_current);
4685
4686 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4687 push @$active_formatting_elements, $self->{open_elements}->[-1];
4688
4689 !!!next-token;
4690 redo B;
4691 } elsif ($token->{tag_name} eq 'nobr') {
4692 $reconstruct_active_formatting_elements->($insert_to_current);
4693
4694 ## has a |nobr| element in scope
4695 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4696 my $node = $self->{open_elements}->[$_];
4697 if ($node->[1] eq 'nobr') {
4698 !!!parse-error (type => 'in nobr:nobr');
4699 !!!back-token;
4700 $token = {type => END_TAG_TOKEN, tag_name => 'nobr'};
4701 redo B;
4702 } elsif ({
4703 table => 1, caption => 1, td => 1, th => 1,
4704 button => 1, marquee => 1, object => 1, html => 1,
4705 }->{$node->[1]}) {
4706 last INSCOPE;
4707 }
4708 } # INSCOPE
4709
4710 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4711 push @$active_formatting_elements, $self->{open_elements}->[-1];
4712
4713 !!!next-token;
4714 redo B;
4715 } elsif ($token->{tag_name} eq 'button') {
4716 ## has a button element in scope
4717 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4718 my $node = $self->{open_elements}->[$_];
4719 if ($node->[1] eq 'button') {
4720 !!!parse-error (type => 'in button:button');
4721 !!!back-token;
4722 $token = {type => END_TAG_TOKEN, tag_name => 'button'};
4723 redo B;
4724 } elsif ({
4725 table => 1, caption => 1, td => 1, th => 1,
4726 button => 1, marquee => 1, object => 1, html => 1,
4727 }->{$node->[1]}) {
4728 last INSCOPE;
4729 }
4730 } # INSCOPE
4731
4732 $reconstruct_active_formatting_elements->($insert_to_current);
4733
4734 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4735 push @$active_formatting_elements, ['#marker', ''];
4736
4737 !!!next-token;
4738 redo B;
4739 } elsif ($token->{tag_name} eq 'marquee' or
4740 $token->{tag_name} eq 'object') {
4741 $reconstruct_active_formatting_elements->($insert_to_current);
4742
4743 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4744 push @$active_formatting_elements, ['#marker', ''];
4745
4746 !!!next-token;
4747 redo B;
4748 } elsif ($token->{tag_name} eq 'xmp') {
4749 $reconstruct_active_formatting_elements->($insert_to_current);
4750 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4751 redo B;
4752 } elsif ($token->{tag_name} eq 'table') {
4753 ## has a p element in scope
4754 INSCOPE: for (reverse @{$self->{open_elements}}) {
4755 if ($_->[1] eq 'p') {
4756 !!!back-token;
4757 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4758 redo B;
4759 } elsif ({
4760 table => 1, caption => 1, td => 1, th => 1,
4761 button => 1, marquee => 1, object => 1, html => 1,
4762 }->{$_->[1]}) {
4763 last INSCOPE;
4764 }
4765 } # INSCOPE
4766
4767 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4768
4769 $self->{insertion_mode} = IN_TABLE_IM;
4770
4771 !!!next-token;
4772 redo B;
4773 } elsif ({
4774 area => 1, basefont => 1, bgsound => 1, br => 1,
4775 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
4776 image => 1,
4777 }->{$token->{tag_name}}) {
4778 if ($token->{tag_name} eq 'image') {
4779 !!!parse-error (type => 'image');
4780 $token->{tag_name} = 'img';
4781 }
4782
4783 ## NOTE: There is an "as if <br>" code clone.
4784 $reconstruct_active_formatting_elements->($insert_to_current);
4785
4786 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4787 pop @{$self->{open_elements}};
4788
4789 !!!next-token;
4790 redo B;
4791 } elsif ($token->{tag_name} eq 'hr') {
4792 ## has a p element in scope
4793 INSCOPE: for (reverse @{$self->{open_elements}}) {
4794 if ($_->[1] eq 'p') {
4795 !!!back-token;
4796 $token = {type => END_TAG_TOKEN, tag_name => 'p'};
4797 redo B;
4798 } elsif ({
4799 table => 1, caption => 1, td => 1, th => 1,
4800 button => 1, marquee => 1, object => 1, html => 1,
4801 }->{$_->[1]}) {
4802 last INSCOPE;
4803 }
4804 } # INSCOPE
4805
4806 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4807 pop @{$self->{open_elements}};
4808
4809 !!!next-token;
4810 redo B;
4811 } elsif ($token->{tag_name} eq 'input') {
4812 $reconstruct_active_formatting_elements->($insert_to_current);
4813
4814 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4815 ## TODO: associate with $self->{form_element} if defined
4816 pop @{$self->{open_elements}};
4817
4818 !!!next-token;
4819 redo B;
4820 } elsif ($token->{tag_name} eq 'isindex') {
4821 !!!parse-error (type => 'isindex');
4822
4823 if (defined $self->{form_element}) {
4824 ## Ignore the token
4825 !!!next-token;
4826 redo B;
4827 } else {
4828 my $at = $token->{attributes};
4829 my $form_attrs;
4830 $form_attrs->{action} = $at->{action} if $at->{action};
4831 my $prompt_attr = $at->{prompt};
4832 $at->{name} = {name => 'name', value => 'isindex'};
4833 delete $at->{action};
4834 delete $at->{prompt};
4835 my @tokens = (
4836 {type => START_TAG_TOKEN, tag_name => 'form',
4837 attributes => $form_attrs},
4838 {type => START_TAG_TOKEN, tag_name => 'hr'},
4839 {type => START_TAG_TOKEN, tag_name => 'p'},
4840 {type => START_TAG_TOKEN, tag_name => 'label'},
4841 );
4842 if ($prompt_attr) {
4843 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value}};
4844 } else {
4845 push @tokens, {type => CHARACTER_TOKEN,
4846 data => 'This is a searchable index. Insert your search keywords here: '}; # SHOULD
4847 ## TODO: make this configurable
4848 }
4849 push @tokens,
4850 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at},
4851 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
4852 {type => END_TAG_TOKEN, tag_name => 'label'},
4853 {type => END_TAG_TOKEN, tag_name => 'p'},
4854 {type => START_TAG_TOKEN, tag_name => 'hr'},
4855 {type => END_TAG_TOKEN, tag_name => 'form'};
4856 $token = shift @tokens;
4857 !!!back-token (@tokens);
4858 redo B;
4859 }
4860 } elsif ($token->{tag_name} eq 'textarea') {
4861 my $tag_name = $token->{tag_name};
4862 my $el;
4863 !!!create-element ($el, $token->{tag_name}, $token->{attributes});
4864
4865 ## TODO: $self->{form_element} if defined
4866 $self->{content_model} = RCDATA_CONTENT_MODEL;
4867 delete $self->{escape}; # MUST
4868
4869 $insert->($el);
4870
4871 my $text = '';
4872 !!!next-token;
4873 if ($token->{type} == CHARACTER_TOKEN) {
4874 $token->{data} =~ s/^\x0A//;
4875 unless (length $token->{data}) {
4876 !!!next-token;
4877 }
4878 }
4879 while ($token->{type} == CHARACTER_TOKEN) {
4880 $text .= $token->{data};
4881 !!!next-token;
4882 }
4883 if (length $text) {
4884 $el->manakai_append_text ($text);
4885 }
4886
4887 $self->{content_model} = PCDATA_CONTENT_MODEL;
4888
4889 if ($token->{type} == END_TAG_TOKEN and
4890 $token->{tag_name} eq $tag_name) {
4891 ## Ignore the token
4892 } else {
4893 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
4894 }
4895 !!!next-token;
4896 redo B;
4897 } elsif ({
4898 iframe => 1,
4899 noembed => 1,
4900 noframes => 1,
4901 noscript => 0, ## TODO: 1 if scripting is enabled
4902 }->{$token->{tag_name}}) {
4903 ## NOTE: There is an "as if in body" code clone.
4904 $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
4905 redo B;
4906 } elsif ($token->{tag_name} eq 'select') {
4907 $reconstruct_active_formatting_elements->($insert_to_current);
4908
4909 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4910
4911 $self->{insertion_mode} = IN_SELECT_IM;
4912 !!!next-token;
4913 redo B;
4914 } elsif ({
4915 caption => 1, col => 1, colgroup => 1, frame => 1,
4916 frameset => 1, head => 1, option => 1, optgroup => 1,
4917 tbody => 1, td => 1, tfoot => 1, th => 1,
4918 thead => 1, tr => 1,
4919 }->{$token->{tag_name}}) {
4920 !!!parse-error (type => 'in body:'.$token->{tag_name});
4921 ## Ignore the token
4922 !!!next-token;
4923 redo B;
4924
4925 ## ISSUE: An issue on HTML5 new elements in the spec.
4926 } else {
4927 $reconstruct_active_formatting_elements->($insert_to_current);
4928
4929 !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4930
4931 !!!next-token;
4932 redo B;
4933 }
4934 } elsif ($token->{type} == END_TAG_TOKEN) {
4935 if ($token->{tag_name} eq 'body') {
4936 if (@{$self->{open_elements}} > 1 and
4937 $self->{open_elements}->[1]->[1] eq 'body') {
4938 for (@{$self->{open_elements}}) {
4939 unless ({
4940 dd => 1, dt => 1, li => 1, p => 1, td => 1,
4941 th => 1, tr => 1, body => 1, html => 1,
4942 tbody => 1, tfoot => 1, thead => 1,
4943 }->{$_->[1]}) {
4944 !!!parse-error (type => 'not closed:'.$_->[1]);
4945 }
4946 }
4947
4948 $self->{insertion_mode} = AFTER_BODY_IM;
4949 !!!next-token;
4950 redo B;
4951 } else {
4952 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4953 ## Ignore the token
4954 !!!next-token;
4955 redo B;
4956 }
4957 } elsif ($token->{tag_name} eq 'html') {
4958 if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
4959 ## ISSUE: There is an issue in the spec.
4960 if ($self->{open_elements}->[-1]->[1] ne 'body') {
4961 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[1]->[1]);
4962 }
4963 $self->{insertion_mode} = AFTER_BODY_IM;
4964 ## reprocess
4965 redo B;
4966 } else {
4967 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
4968 ## Ignore the token
4969 !!!next-token;
4970 redo B;
4971 }
4972 } elsif ({
4973 address => 1, blockquote => 1, center => 1, dir => 1,
4974 div => 1, dl => 1, fieldset => 1, listing => 1,
4975 menu => 1, ol => 1, pre => 1, ul => 1,
4976 p => 1,
4977 dd => 1, dt => 1, li => 1,
4978 button => 1, marquee => 1, object => 1,
4979 }->{$token->{tag_name}}) {
4980 ## has an element in scope
4981 my $i;
4982 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4983 my $node = $self->{open_elements}->[$_];
4984 if ($node->[1] eq $token->{tag_name}) {
4985 ## generate implied end tags
4986 if ({
4987 dd => ($token->{tag_name} ne 'dd'),
4988 dt => ($token->{tag_name} ne 'dt'),
4989 li => ($token->{tag_name} ne 'li'),
4990 p => ($token->{tag_name} ne 'p'),
4991 td => 1, th => 1, tr => 1,
4992 tbody => 1, tfoot=> 1, thead => 1,
4993 }->{$self->{open_elements}->[-1]->[1]}) {
4994 !!!back-token;
4995 $token = {type => END_TAG_TOKEN,
4996 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
4997 redo B;
4998 }
4999 $i = $_;
5000 last INSCOPE unless $token->{tag_name} eq 'p';
5001 } elsif ({
5002 table => 1, caption => 1, td => 1, th => 1,
5003 button => 1, marquee => 1, object => 1, html => 1,
5004 }->{$node->[1]}) {
5005 last INSCOPE;
5006 }
5007 } # INSCOPE
5008
5009 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
5010 if (defined $i) {
5011 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
5012 } else {
5013 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5014 }
5015 }
5016
5017 if (defined $i) {
5018 splice @{$self->{open_elements}}, $i;
5019 } elsif ($token->{tag_name} eq 'p') {
5020 ## As if <p>, then reprocess the current token
5021 my $el;
5022 !!!create-element ($el, 'p');
5023 $insert->($el);
5024 }
5025 $clear_up_to_marker->()
5026 if {
5027 button => 1, marquee => 1, object => 1,
5028 }->{$token->{tag_name}};
5029 !!!next-token;
5030 redo B;
5031 } elsif ($token->{tag_name} eq 'form') {
5032 ## has an element in scope
5033 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5034 my $node = $self->{open_elements}->[$_];
5035 if ($node->[1] eq $token->{tag_name}) {
5036 ## generate implied end tags
5037 if ({
5038 dd => 1, dt => 1, li => 1, p => 1,
5039 td => 1, th => 1, tr => 1,
5040 tbody => 1, tfoot=> 1, thead => 1,
5041 }->{$self->{open_elements}->[-1]->[1]}) {
5042 !!!back-token;
5043 $token = {type => END_TAG_TOKEN,
5044 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5045 redo B;
5046 }
5047 last INSCOPE;
5048 } elsif ({
5049 table => 1, caption => 1, td => 1, th => 1,
5050 button => 1, marquee => 1, object => 1, html => 1,
5051 }->{$node->[1]}) {
5052 last INSCOPE;
5053 }
5054 } # INSCOPE
5055
5056 if ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
5057 pop @{$self->{open_elements}};
5058 } else {
5059 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5060 }
5061
5062 undef $self->{form_element};
5063 !!!next-token;
5064 redo B;
5065 } elsif ({
5066 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5067 }->{$token->{tag_name}}) {
5068 ## has an element in scope
5069 my $i;
5070 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5071 my $node = $self->{open_elements}->[$_];
5072 if ({
5073 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
5074 }->{$node->[1]}) {
5075 ## generate implied end tags
5076 if ({
5077 dd => 1, dt => 1, li => 1, p => 1,
5078 td => 1, th => 1, tr => 1,
5079 tbody => 1, tfoot=> 1, thead => 1,
5080 }->{$self->{open_elements}->[-1]->[1]}) {
5081 !!!back-token;
5082 $token = {type => END_TAG_TOKEN,
5083 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5084 redo B;
5085 }
5086 $i = $_;
5087 last INSCOPE;
5088 } elsif ({
5089 table => 1, caption => 1, td => 1, th => 1,
5090 button => 1, marquee => 1, object => 1, html => 1,
5091 }->{$node->[1]}) {
5092 last INSCOPE;
5093 }
5094 } # INSCOPE
5095
5096 if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
5097 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5098 }
5099
5100 splice @{$self->{open_elements}}, $i if defined $i;
5101 !!!next-token;
5102 redo B;
5103 } elsif ({
5104 a => 1,
5105 b => 1, big => 1, em => 1, font => 1, i => 1,
5106 nobr => 1, s => 1, small => 1, strile => 1,
5107 strong => 1, tt => 1, u => 1,
5108 }->{$token->{tag_name}}) {
5109 $formatting_end_tag->($token->{tag_name});
5110 redo B;
5111 } elsif ($token->{tag_name} eq 'br') {
5112 !!!parse-error (type => 'unmatched end tag:br');
5113
5114 ## As if <br>
5115 $reconstruct_active_formatting_elements->($insert_to_current);
5116
5117 my $el;
5118 !!!create-element ($el, 'br');
5119 $insert->($el);
5120
5121 ## Ignore the token.
5122 !!!next-token;
5123 redo B;
5124 } elsif ({
5125 caption => 1, col => 1, colgroup => 1, frame => 1,
5126 frameset => 1, head => 1, option => 1, optgroup => 1,
5127 tbody => 1, td => 1, tfoot => 1, th => 1,
5128 thead => 1, tr => 1,
5129 area => 1, basefont => 1, bgsound => 1,
5130 embed => 1, hr => 1, iframe => 1, image => 1,
5131 img => 1, input => 1, isindex => 1, noembed => 1,
5132 noframes => 1, param => 1, select => 1, spacer => 1,
5133 table => 1, textarea => 1, wbr => 1,
5134 noscript => 0, ## TODO: if scripting is enabled
5135 }->{$token->{tag_name}}) {
5136 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5137 ## Ignore the token
5138 !!!next-token;
5139 redo B;
5140
5141 ## ISSUE: Issue on HTML5 new elements in spec
5142
5143 } else {
5144 ## Step 1
5145 my $node_i = -1;
5146 my $node = $self->{open_elements}->[$node_i];
5147
5148 ## Step 2
5149 S2: {
5150 if ($node->[1] eq $token->{tag_name}) {
5151 ## Step 1
5152 ## generate implied end tags
5153 if ({
5154 dd => 1, dt => 1, li => 1, p => 1,
5155 td => 1, th => 1, tr => 1,
5156 tbody => 1, tfoot => 1, thead => 1,
5157 }->{$self->{open_elements}->[-1]->[1]}) {
5158 !!!back-token;
5159 $token = {type => END_TAG_TOKEN,
5160 tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
5161 redo B;
5162 }
5163
5164 ## Step 2
5165 if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
5166 ## NOTE: <x><y></x>
5167 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
5168 }
5169
5170 ## Step 3
5171 splice @{$self->{open_elements}}, $node_i;
5172
5173 !!!next-token;
5174 last S2;
5175 } else {
5176 ## Step 3
5177 if (not $formatting_category->{$node->[1]} and
5178 #not $phrasing_category->{$node->[1]} and
5179 ($special_category->{$node->[1]} or
5180 $scoping_category->{$node->[1]})) {
5181 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5182 ## Ignore the token
5183 !!!next-token;
5184 last S2;
5185 }
5186 }
5187
5188 ## Step 4
5189 $node_i--;
5190 $node = $self->{open_elements}->[$node_i];
5191
5192 ## Step 5;
5193 redo S2;
5194 } # S2
5195 redo B;
5196 }
5197 }
5198 redo B;
5199 } # B
5200
5201 ## NOTE: The "trailing end" phase in HTML5 is split into
5202 ## two insertion modes: "after html body" and "after html frameset".
5203 ## NOTE: States in the main stage are preserved while
5204 ## the parser stays in the trailing end phase. # MUST
5205
5206 ## Stop parsing # MUST
5207
5208 ## TODO: script stuffs
5209 } # _tree_construct_main
5210
## Implements the |innerHTML| setter: replaces the children of |$node|
## with the result of parsing a string as HTML.
##
## Arguments:
##   $class  - the invocant; |parse_string| and |new| are called on it.
##   $node   - the target node; a Document (node type 9) or an
##             Element (node type 1).
##   string  - the argument following |$node|.  It is accessed through
##             a reference (|$s|) rather than copied.
##   onerror - optional argument following the string; an error
##             callback (|$onerror|).
##
## Dies if |$node| is neither a Document nor an Element.
##
## NOTE: |!!!parse-error| and |!!!next-token| are preprocessor macros of
## this .src file.  Presumably they expand to code that refers to a
## lexical named |$self|, which would explain the |my $self| declarations
## below -- TODO confirm against the macro definitions.
5211 sub set_inner_html ($$$) {
5212 my $class = shift;
5213 my $node = shift;
## After the two |shift|s, |$_[0]| is the string and |$_[1]| the callback.
5214 my $s = \$_[0];
5215 my $onerror = $_[1];
5216
5217 my $nt = $node->node_type;
## Document node: empty it and run the ordinary parser on it.
5218 if ($nt == 9) {
5219 # MUST
5220
5221 ## Step 1 # MUST
5222 ## TODO: If the document has an active parser, ...
5223 ## ISSUE: There is an issue in the spec.
5224
5225 ## Step 2 # MUST
## The child list is copied into |@cn| before any child is removed.
5226 my @cn = @{$node->child_nodes};
5227 for (@cn) {
5228 $node->remove_child ($_);
5229 }
5230
5231 ## Step 3, 4, 5 # MUST
5232 $class->parse_string ($$s => $node, $onerror);
## Element node: parse into a scratch document, then move the result.
5233 } elsif ($nt == 1) {
5234 ## TODO: If non-html element
5235
5236 ## NOTE: Most of this code is copied from |parse_string|
5237
5238 ## Step 1 # MUST
## A new document is created from the owner document's implementation,
## flagged as HTML, and attached to a fresh parser instance |$p|.
5239 my $this_doc = $node->owner_document;
5240 my $doc = $this_doc->implementation->create_document;
5241 $doc->manakai_is_html (1);
5242 my $p = $class->new;
5243 $p->{document} = $doc;
5244
5245 ## Step 9 # MUST
## |$i| is the read offset into the string; |$line| and |$column| are
## the position passed to the error callback.
5246 my $i = 0;
5247 my $line = 1;
5248 my $column = 0;
## Input callback: stores the next character's code point in
## |$self->{next_input_character}| (-1 at end of input).
5249 $p->{set_next_input_character} = sub {
5250 my $self = shift;
5251
## Shift the character history: drop the last entry and put the current
## character at the front, so the list keeps its length.
5252 pop @{$self->{prev_input_character}};
5253 unshift @{$self->{prev_input_character}}, $self->{next_input_character};
5254
## End of input.  The assignment yields -1, which is true, so the
## |and return| always runs when the condition holds.
5255 $self->{next_input_character} = -1 and return if $i >= length $$s;
5256 $self->{next_input_character} = ord substr $$s, $i++, 1;
5257 $column++;
5258
5259 if ($self->{next_input_character} == 0x000A) { # LF
5260 $line++;
5261 $column = 0;
5262 } elsif ($self->{next_input_character} == 0x000D) { # CR
## CR and CR LF are both reported as a single LF; the LF of a CR LF pair
## is skipped here.
5263 $i++ if substr ($$s, $i, 1) eq "\x0A";
5264 $self->{next_input_character} = 0x000A; # LF # MUST
5265 $line++;
5266 $column = 0;
5267 } elsif ($self->{next_input_character} > 0x10FFFF) {
5268 $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
5269 } elsif ($self->{next_input_character} == 0x0000) { # NULL
5270 !!!parse-error (type => 'NULL');
5271 $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
5272 }
5273 };
## Initial state: a three-entry history and no current character.
5274 $p->{prev_input_character} = [-1, -1, -1];
5275 $p->{next_input_character} = -1;
5276
## Without a caller-supplied callback, parse errors are reported by |warn|.
5277 my $ponerror = $onerror || sub {
5278 my (%opt) = @_;
5279 warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";
5280 };
## The parser's error hook appends the current line and column to the
## arguments before forwarding them.
5281 $p->{parse_error} = sub {
5282 $ponerror->(@_, line => $line, column => $column);
5283 };
5284
5285 $p->_initialize_tokenizer;
5286 $p->_initialize_tree_constructor;
5287
5288 ## Step 2
## The tokenizer's initial content model depends on the local name of
## the context element; names not listed fall back to PCDATA.
## NOTE(review): |noscript| is unconditionally CDATA here, while the
## "in body" start tag handling in this file marks |noscript| with
## "TODO: 1 if scripting is enabled" -- confirm this is intended.
5289 my $node_ln = $node->local_name;
5290 $p->{content_model} = {
5291 title => RCDATA_CONTENT_MODEL,
5292 textarea => RCDATA_CONTENT_MODEL,
5293 style => CDATA_CONTENT_MODEL,
5294 script => CDATA_CONTENT_MODEL,
5295 xmp => CDATA_CONTENT_MODEL,
5296 iframe => CDATA_CONTENT_MODEL,
5297 noembed => CDATA_CONTENT_MODEL,
5298 noframes => CDATA_CONTENT_MODEL,
5299 noscript => CDATA_CONTENT_MODEL,
5300 plaintext => PLAINTEXT_CONTENT_MODEL,
5301 }->{$node_ln};
5302 $p->{content_model} = PCDATA_CONTENT_MODEL
5303 unless defined $p->{content_model};
5304 ## ISSUE: What is "the name of the element"? local name?
5305
## Record the context element and its local name on the parser.
5306 $p->{inner_html_node} = [$node, $node_ln];
5307
5308 ## Step 4
5309 my $root = $doc->create_element_ns
5310 ('http://www.w3.org/1999/xhtml', [undef, 'html']);
5311
5312 ## Step 5 # MUST
5313 $doc->append_child ($root);
5314
5315 ## Step 6 # MUST
## Stack entries are [node, name] pairs.
5316 push @{$p->{open_elements}}, [$root, 'html'];
5317
5318 undef $p->{head_element};
5319
5320 ## Step 7 # MUST
5321 $p->_reset_insertion_mode;
5322
5323 ## Step 8 # MUST
## Walk from the context element up through its ancestors and remember
## the first XHTML-namespace |form| element as the form element pointer.
5324 my $anode = $node;
5325 AN: while (defined $anode) {
5326 if ($anode->node_type == 1) {
5327 my $nsuri = $anode->namespace_uri;
5328 if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
5329 if ($anode->local_name eq 'form') { ## TODO: case?
5330 $p->{form_element} = $anode;
5331 last AN;
5332 }
5333 }
5334 }
5335 $anode = $anode->parent_node;
5336 } # AN
5337
5338 ## Step 3 # MUST
5339 ## Step 10 # MUST
## Fetch the first token, then run tree construction.  |$self| is
## declared only for the macro (see the NOTE at the top of this sub).
5340 {
5341 my $self = $p;
5342 !!!next-token;
5343 }
5344 $p->_tree_construction_main;
5345
5346 ## Step 11 # MUST
## Remove the existing children of the context element (iterating over
## a copy of the child list).
5347 my @cn = @{$node->child_nodes};
5348 for (@cn) {
5349 $node->remove_child ($_);
5350 }
5351 ## ISSUE: mutation events? read-only?
5352
5353 ## Step 12 # MUST
## Move the parsed children of the scratch root element into the
## context element, adopting each into the original document first.
5354 @cn = @{$root->child_nodes};
5355 for (@cn) {
5356 $this_doc->adopt_node ($_);
5357 $node->append_child ($_);
5358 }
5359 ## ISSUE: mutation events?
5360
5361 $p->_terminate_tree_constructor;
5362 } else {
## Any other node type is rejected.
5363 die "$0: |set_inner_html| is not defined for node of type $nt";
5364 }
5365 } # set_inner_html
5366
5367 } # tree construction stage
5368
## Serializes the children of |$node| as an HTML fragment (the
## |innerHTML| getter).
##
## Arguments:
##   (invocant) - ignored, so the method can be called on the class.
##   $node      - the DOM node whose children are serialized.
##   $on_error  - optional code reference; called with each child node
##                whose node type cannot be serialized.
##
## Returns a reference to the serialized string.
sub get_inner_html ($$$) {
  my (undef, $node, $on_error) = @_;

  ## Elements whose text content is emitted without escaping.
  my $cdata_ancestor = {
    style => 1, script => 1, xmp => 1, iframe => 1,
    noembed => 1, noframes => 1, noscript => 1,
  };
  ## Elements that have no content and no end tag.
  my $void_element = {
    area => 1, base => 1, basefont => 1, bgsound => 1,
    br => 1, col => 1, embed => 1, frame => 1, hr => 1,
    img => 1, input => 1, link => 1, meta => 1, param => 1,
    spacer => 1, wbr => 1,
  };
  ## Elements that switch the serializer into unescaped mode for
  ## their descendants.
  my $cdata_element = {
    style => 1, script => 1, xmp => 1, iframe => 1,
    noembed => 1, noframes => 1, noscript => 1,
    plaintext => 1,
  };

  ## Step 1
  my $s = '';

  ## Text is emitted unescaped if |$node| itself or any of its
  ## ancestors is an XHTML-namespace element listed above.
  my $in_cdata;
  my $parent = $node;
  while (defined $parent) {
    if ($parent->node_type == 1) {
      ## The namespace URI may be undefined (null namespace), so test
      ## definedness before the string comparison.
      my $nsuri = $parent->namespace_uri;
      if (defined $nsuri and
          $nsuri eq 'http://www.w3.org/1999/xhtml' and
          $cdata_ancestor->{$parent->local_name}) { ## TODO: case thingy
        $in_cdata = 1;
      }
    }
    $parent = $parent->parent_node;
  }

  ## Step 2
  ## |@node| is a work queue processed from the front.  Besides node
  ## objects it holds plain strings: end tags to be emitted verbatim
  ## and the marker 'cdata-out', which switches escaping back on.
  my @node = @{$node->child_nodes};
  C: while (@node) {
    my $child = shift @node;
    unless (ref $child) {
      if ($child eq 'cdata-out') {
        $in_cdata = 0;
      } else {
        $s .= $child; # end tag
      }
      next C;
    }

    my $nt = $child->node_type;
    if ($nt == 1) { # Element
      my $tag_name = $child->tag_name; ## TODO: manakai_tag_name
      $s .= '<' . $tag_name;
      ## NOTE: Non-HTML case:
      ## <http://permalink.gmane.org/gmane.org.w3c.whatwg.discuss/11191>

      my @attrs = @{$child->attributes}; # sort order MUST be stable
      for my $attr (@attrs) { # order is implementation dependent
        my $attr_name = $attr->name; ## TODO: manakai_name
        $s .= ' ' . $attr_name . '="';
        my $attr_value = $attr->value;
        ## escape (|&| first, so that the entities added by the later
        ## substitutions are not escaped again)
        $attr_value =~ s/&/&amp;/g;
        $attr_value =~ s/</&lt;/g;
        $attr_value =~ s/>/&gt;/g;
        $attr_value =~ s/"/&quot;/g;
        $s .= $attr_value . '"';
      }
      $s .= '>';

      next C if $void_element->{$tag_name};

      $s .= "\x0A" if $tag_name eq 'pre' or $tag_name eq 'textarea';

      if (not $in_cdata and $cdata_element->{$tag_name}) {
        ## Queued first so that it ends up after the end tag pushed
        ## in front of it just below.
        unshift @node, 'cdata-out';
        $in_cdata = 1;
      }

      unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';
    } elsif ($nt == 3 or $nt == 4) { # Text or CDATA section
      if ($in_cdata) {
        $s .= $child->data;
      } else {
        my $value = $child->data;
        $value =~ s/&/&amp;/g;
        $value =~ s/</&lt;/g;
        $value =~ s/>/&gt;/g;
        $value =~ s/"/&quot;/g;
        $s .= $value;
      }
    } elsif ($nt == 8) { # Comment
      $s .= '<!--' . $child->data . '-->';
    } elsif ($nt == 10) { # Document type
      $s .= '<!DOCTYPE ' . $child->name . '>';
    } elsif ($nt == 5) { # entrefs
      ## The children take the place of the entity reference, so they
      ## go to the FRONT of the queue.  Appending them to the end would
      ## serialize them after the following siblings and after the end
      ## tags of the enclosing elements.
      unshift @node, @{$child->child_nodes};
    } else {
      $on_error->($child) if defined $on_error;
    }
    ## ISSUE: This code does not support PIs.
  } # C

  ## Step 3
  return \$s;
} # get_inner_html
5468
5469 1;
5470 # $Date: 2007/10/14 09:21:46 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24