/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.130 - (show annotations) (download) (as text)
Sat Apr 12 15:47:13 2008 UTC (17 years, 9 months ago) by wakaba
Branch: MAIN
Changes since 1.129: +11 -4 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/ChangeLog	12 Apr 2008 15:45:43 -0000
	* HTML.pm.src: List of element names that close foreign content
	insertion mode is added (HTML5 revisions 1412 and 1418).

2008-04-13  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.129 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 use Error qw(:try);
5
6 ## ISSUE:
7 ## var doc = implementation.createDocument (null, null, null);
8 ## doc.write ('');
9 ## alert (doc.compatMode);
10
11 ## TODO: Control characters and noncharacters are not allowed (HTML5 revision 1263)
12 ## TODO: 1252 parse error (revision 1264)
13 ## TODO: 8859-11 = 874 (revision 1271)
14
15 my $HTML_NS = q<http://www.w3.org/1999/xhtml>;
16 my $MML_NS = q<http://www.w3.org/1998/Math/MathML>;
17 my $SVG_NS = q<http://www.w3.org/2000/svg>;
18 my $XLINK_NS = q<http://www.w3.org/1999/xlink>;
19 my $XML_NS = q<http://www.w3.org/XML/1998/namespace>;
20 my $XMLNS_NS = q<http://www.w3.org/2000/xmlns/>;
21
## Element category flags.  Every flag owns exactly one bit, so that
## categories can be combined with |\|| and tested with |&|.
sub A_EL                    () { 1 <<  0 }
sub ADDRESS_EL              () { 1 <<  1 }
sub BODY_EL                 () { 1 <<  2 }
sub BUTTON_EL               () { 1 <<  3 }
sub CAPTION_EL              () { 1 <<  4 }
sub DD_EL                   () { 1 <<  5 }
sub DIV_EL                  () { 1 <<  6 }
sub DT_EL                   () { 1 <<  7 }
sub FORM_EL                 () { 1 <<  8 }
sub FORMATTING_EL           () { 1 <<  9 }
sub FRAMESET_EL             () { 1 << 10 }
sub HEADING_EL              () { 1 << 11 }
sub HTML_EL                 () { 1 << 12 }
sub LI_EL                   () { 1 << 13 }
sub NOBR_EL                 () { 1 << 14 }
sub OPTION_EL               () { 1 << 15 }
sub OPTGROUP_EL             () { 1 << 16 }
sub P_EL                    () { 1 << 17 }
sub SELECT_EL               () { 1 << 18 }
sub TABLE_EL                () { 1 << 19 }
sub TABLE_CELL_EL           () { 1 << 20 }
sub TABLE_ROW_EL            () { 1 << 21 }
sub TABLE_ROW_GROUP_EL      () { 1 << 22 }
sub MISC_SCOPING_EL         () { 1 << 23 }
sub MISC_SPECIAL_EL         () { 1 << 24 }
sub FOREIGN_EL              () { 1 << 25 }
sub FOREIGN_FLOW_CONTENT_EL () { 1 << 26 }
sub MML_AXML_EL             () { 1 << 27 }
50
## Composite element categories.  Each one is the bitwise OR of the
## single-bit category flags it is made of.

## The table element together with its row and row group elements.
sub TABLE_ROWS_EL () { TABLE_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL }

## dd, dt, li and p.
sub END_TAG_OPTIONAL_EL () { DD_EL | DT_EL | LI_EL | P_EL }

## |END_TAG_OPTIONAL_EL| plus body, html and the table cell, row and
## row group elements.
sub ALL_END_TAG_OPTIONAL_EL () {
  END_TAG_OPTIONAL_EL | BODY_EL | HTML_EL
      | TABLE_CELL_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL
}

sub SCOPING_EL () {
  BUTTON_EL | CAPTION_EL | HTML_EL | TABLE_EL | TABLE_CELL_EL
      | MISC_SCOPING_EL
}

sub TABLE_SCOPING_EL () { HTML_EL | TABLE_EL }

sub TABLE_ROWS_SCOPING_EL () { HTML_EL | TABLE_ROW_GROUP_EL }

sub TABLE_ROW_SCOPING_EL () { HTML_EL | TABLE_ROW_EL }

sub SPECIAL_EL () {
  ADDRESS_EL | BODY_EL | DIV_EL | END_TAG_OPTIONAL_EL | FORM_EL
      | FRAMESET_EL | HEADING_EL | OPTION_EL | OPTGROUP_EL | SELECT_EL
      | TABLE_ROW_EL | TABLE_ROW_GROUP_EL | MISC_SPECIAL_EL
}
112
## Maps an HTML element name to its category flags.  Names that share a
## category are listed together; names with a category of their own
## follow in alphabetical order.
my $el_category = {
  (map { ($_ => MISC_SPECIAL_EL) } qw(
    area base basefont bgsound blockquote br center col colgroup dir dl
    embed fieldset frame head hr iframe img input isindex link listing
    menu meta noembed noframes noscript ol param plaintext pre script
    spacer style textarea title ul wbr
  )),
  (map { ($_ => MISC_SCOPING_EL) } qw(applet marquee object)),
  (map { ($_ => FORMATTING_EL) } qw(
    b big em font i s small strike strong tt u
  )),
  (map { ($_ => HEADING_EL) } qw(h1 h2 h3 h4 h5 h6)),
  (map { ($_ => TABLE_ROW_GROUP_EL) } qw(tbody tfoot thead)),
  (map { ($_ => TABLE_CELL_EL) } qw(td th)),

  'a'        => A_EL | FORMATTING_EL,
  'address'  => ADDRESS_EL,
  'body'     => BODY_EL,
  'button'   => BUTTON_EL,
  'caption'  => CAPTION_EL,
  'dd'       => DD_EL,
  'div'      => DIV_EL,
  'dt'       => DT_EL,
  'form'     => FORM_EL,
  'frameset' => FRAMESET_EL,
  'html'     => HTML_EL,
  'li'       => LI_EL,
  'nobr'     => NOBR_EL | FORMATTING_EL,
  'optgroup' => OPTGROUP_EL,
  'option'   => OPTION_EL,
  'p'        => P_EL,
  'select'   => SELECT_EL,
  'table'    => TABLE_EL,
  'tr'       => TABLE_ROW_EL,
};
197
## Category flags of elements outside the HTML namespace, keyed by
## namespace URI and then by element name.
my $el_category_f = {
  $MML_NS => {
    'annotation-xml' => MML_AXML_EL,
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(mi mo mn ms mtext)),
  },
  $SVG_NS => {
    ## TODO: case (of |foreignobject|)
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(foreignobject desc title)),
  },
  ## NOTE: In addition, FOREIGN_EL is set to non-HTML elements.
};
214
## Replacement code points for the code points 0x80 to 0x9F.  The list
## below is in key order: its first item belongs to 0x80 and its last
## item to 0x9F.
my $c1_entity_char = do {
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 80-87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 88-8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 90-97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 98-9F
  );
  my %map;
  @map{0x80 .. 0x9F} = @replacement;
  \%map;
}; # $c1_entity_char
249
## Parse a string of bytes as an HTML document.
##
## Arguments (after the invocant, which is either a parser object or a
## class name; in the latter case a new parser is created):
##   1. The character encoding name, or |undef| to let the parser
##      guess the encoding from the first 1024 bytes (falling back to
##      |windows-1252|).
##   2. The input, either a byte string or a reference to one.
##   3. The document object, handed over to |parse_char_string|.
##   4. (Optional) the error handler, handed over to
##      |parse_char_string|.
##
## The bytes are decoded and parsed by |parse_char_string|.  If the
## |change_encoding| callback installed here throws
## |Whatpm::HTML::RestartParser|, the bytes are decoded again with the
## requested encoding and parsed a second time.
##
## Returns what |parse_char_string| returns.
sub parse_byte_string ($$$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $charset = shift;
  my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);
  my $s;

  ## |Encode::decode| is called by both branches below and by the
  ## restart handler, so the module is loaded unconditionally.
  require Encode; ## TODO: decode(utf8) don't delete BOM

  if (defined $charset) {
    $s = \ (Encode::decode ($charset, $$bytes_s));
    $self->{input_encoding} = lc $charset; ## TODO: normalize name
    $self->{confident} = 1;
  } else {
    ## TODO: Implement HTML5 detection algorithm
    require Whatpm::Charset::UniversalCharDet;
    $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string
        (substr ($$bytes_s, 0, 1024));
    $charset ||= 'windows-1252';
    $s = \ (Encode::decode ($charset, $$bytes_s));
    ## Lowercased as in the branch above, because |change_encoding|
    ## compares this value with a lowercased name.
    $self->{input_encoding} = lc $charset; ## TODO: normalize name
    $self->{confident} = 0;
  }

  $self->{change_encoding} = sub {
    my $self = shift;
    my $charset = lc shift;
    my $token = shift;
    ## TODO: if $charset is supported
    ## TODO: normalize charset name

    ## "Change the encoding" algorithm:

    ## Step 1
    if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
      $charset = 'utf-8';
    }

    ## Step 2
    if (defined $self->{input_encoding} and
        $self->{input_encoding} eq $charset) {
      $self->{confident} = 1;
      return;
    }

    !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
        ':'.$charset, level => 'w', token => $token);

    ## Step 3
    # if (can) {
      ## change the encoding on the fly.
      #$self->{confident} = 1;
      #return;
    # }

    ## Step 4
    throw Whatpm::HTML::RestartParser (charset => $charset);
  }; # $self->{change_encoding}

  ## Everything after the input string is passed through unchanged.
  my @args = @_; shift @args; # $s
  my $return;
  try {
    $return = $self->parse_char_string ($s, @args);
  } catch Whatpm::HTML::RestartParser with {
    ## Decode the same bytes again, this time with the requested
    ## encoding, and parse from the beginning.
    my $charset = shift->{charset};
    $s = \ (Encode::decode ($charset, $$bytes_s));
    $self->{input_encoding} = $charset; ## TODO: normalize
    $self->{confident} = 1;
    $return = $self->parse_char_string ($s, @args);
  };
  return $return;
} # parse_byte_string
320
321 ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
322 ## and the HTML layer MUST ignore it. However, we do strip the BOM in
323 ## the encoding layer and the HTML layer does not ignore any U+FEFF,
324 ## because the core part of our HTML parser expects a string of characters,
325 ## not a string of bytes or code units or anything which might contain a BOM.
326 ## Therefore, any parser interface that accepts a string of bytes,
327 ## such as |parse_byte_string| in this module, must ensure that it does
328 ## strip the BOM and never strips any ZWNBSP.
329
## |parse_char_string| is another name for |parse_string|.
*parse_char_string = \&parse_string;

## Parse a string of characters as an HTML document.
##
## Arguments (after the invocant, which is either a parser object or a
## class name; in the latter case a new parser is created):
##   1. The input, either a string or a reference to one.
##   2. The document object.  Its child node list is emptied before
##      the tree is constructed.
##   3. (Optional) the error handler.  It is called with key/value
##      pairs, starting with |line| and |column|.  Without it, parse
##      errors are reported by |warn|.
##
## Returns the document object.
sub parse_string ($$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $s = ref $_[0] ? $_[0] : \($_[0]);
  my (undef, $doc, $onerror) = @_;

  $self->{document} = $doc;
  @{$doc->child_nodes} = ();

  ## NOTE: |set_inner_html| copies most of this method's code

  unless (exists $self->{confident}) {
    $self->{confident} = 1;
  }
  if (defined $self->{input_encoding}) {
    $doc->input_encoding ($self->{input_encoding});
  }

  ## Position bookkeeping.
  $self->{line} = 1;
  $self->{line_prev} = 1;
  $self->{column} = 0;
  $self->{column_prev} = 0;

  ## The last three characters read, most recent first, and the
  ## character to be examined next (-1: none).
  $self->{prev_char} = [-1, -1, -1];
  $self->{next_char} = -1;

  my $pos = 0; # offset in |$$s| of the next character to read
  $self->{set_next_char} = sub {
    my $self = shift;

    ## Move the current character into the history.
    pop @{$self->{prev_char}};
    unshift @{$self->{prev_char}}, $self->{next_char};

    ## End of input.
    if ($pos >= length $$s) {
      $self->{next_char} = -1;
      return;
    }

    $self->{next_char} = ord substr $$s, $pos++, 1;

    $self->{line_prev} = $self->{line};
    $self->{column_prev} = $self->{column};
    $self->{column}++;

    if ($self->{next_char} == 0x000A) { # LF
      $self->{line}++;
      $self->{column} = 0;
    } elsif ($self->{next_char} == 0x000D) { # CR
      ## A CR, with the LF that follows it if any, is read as one LF.
      $pos++ if substr ($$s, $pos, 1) eq "\x0A";
      $self->{next_char} = 0x000A; # LF # MUST
      $self->{line}++;
      $self->{column} = 0;
    } elsif ($self->{next_char} > 0x10FFFF) {
      $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($self->{next_char} == 0x0000) { # NULL
      !!!parse-error (type => 'NULL');
      $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };

  ## Default error handler: the position is taken from the token if
  ## there is one, from the |line| and |column| arguments otherwise.
  $onerror ||= sub {
    my %opt = @_;
    my $where = $opt{token} ? $opt{token} : \%opt;
    my ($line, $column) = @$where{qw(line column)};
    warn "Parse error ($opt{type}) at line $line column $column\n";
  };
  $self->{parse_error} = sub {
    $onerror->(line => $self->{line}, column => $self->{column}, @_);
  };

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  ## The |parse_error| closure refers to |$self|; removing it breaks
  ## the reference loop.
  delete $self->{parse_error}; # remove loop

  return $self->{document};
} # parse_string
397
## Create a new parser object.
##
## The object is a blessed hash with four default callbacks:
## |set_next_char| (sets |next_char| to -1), |parse_error|,
## |change_encoding| and |application_cache_selection| (the last three
## do nothing).
##
## Returns the new object.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;
  $self->{set_next_char} = sub {
    ## The parser is taken from the argument list, as the closure
    ## installed by |parse_string| does.  Referring to the outer |$self|
    ## here would make the object hold a reference to itself, so that
    ## it would never be freed.
    my $self = shift;
    $self->{next_char} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new
418
## Content model flags: the kinds of markup recognized in data.
sub CM_ENTITY         () { 1 << 0 } # & markup in data
sub CM_LIMITED_MARKUP () { 1 << 1 } # < markup in data (limited)
sub CM_FULL_MARKUP    () { 1 << 2 } # < markup in data (any)

## Content models, as combinations of the flags above.
sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL     () { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL    () { CM_LIMITED_MARKUP | CM_ENTITY }
sub PCDATA_CONTENT_MODEL    () { CM_FULL_MARKUP | CM_ENTITY }
427
## Tokenizer states (the values of |$self->{state}|).
sub DATA_STATE                                    () {  0 }
sub ENTITY_DATA_STATE                             () {  1 }
sub TAG_OPEN_STATE                                () {  2 }
sub CLOSE_TAG_OPEN_STATE                          () {  3 }
sub TAG_NAME_STATE                                () {  4 }
sub BEFORE_ATTRIBUTE_NAME_STATE                   () {  5 }
sub ATTRIBUTE_NAME_STATE                          () {  6 }
sub AFTER_ATTRIBUTE_NAME_STATE                    () {  7 }
sub BEFORE_ATTRIBUTE_VALUE_STATE                  () {  8 }
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           () {  9 }
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           () { 10 }
sub ATTRIBUTE_VALUE_UNQUOTED_STATE                () { 11 }
sub ENTITY_IN_ATTRIBUTE_VALUE_STATE               () { 12 }
sub MARKUP_DECLARATION_OPEN_STATE                 () { 13 }
sub COMMENT_START_STATE                           () { 14 }
sub COMMENT_START_DASH_STATE                      () { 15 }
sub COMMENT_STATE                                 () { 16 }
sub COMMENT_END_STATE                             () { 17 }
sub COMMENT_END_DASH_STATE                        () { 18 }
sub BOGUS_COMMENT_STATE                           () { 19 }
sub DOCTYPE_STATE                                 () { 20 }
sub BEFORE_DOCTYPE_NAME_STATE                     () { 21 }
sub DOCTYPE_NAME_STATE                            () { 22 }
sub AFTER_DOCTYPE_NAME_STATE                      () { 23 }
sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        () { 24 }
sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         () { 27 }
sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        () { 28 }
sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         () { 31 }
sub BOGUS_DOCTYPE_STATE                           () { 32 }
sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE            () { 33 }
sub SELF_CLOSING_START_TAG_STATE                  () { 34 }
sub CDATA_BLOCK_STATE                             () { 35 }
464
## Token types (the values of |->{type}| of a token).
sub DOCTYPE_TOKEN     () { 1 }
sub COMMENT_TOKEN     () { 2 }
sub START_TAG_TOKEN   () { 3 }
sub END_TAG_TOKEN     () { 4 }
sub END_OF_FILE_TOKEN () { 5 }
sub CHARACTER_TOKEN   () { 6 }
471
## Insertion mode group flags.  Each flag owns one bit; bits 0 and 1
## are left for telling apart the insertion modes of one group.
sub AFTER_HTML_IMS        () { 1 <<  2 }
sub HEAD_IMS              () { 1 <<  3 }
sub BODY_IMS              () { 1 <<  4 }
sub BODY_TABLE_IMS        () { 1 <<  5 }
sub TABLE_IMS             () { 1 <<  6 }
sub ROW_IMS               () { 1 <<  7 }
sub BODY_AFTER_IMS        () { 1 <<  8 }
sub FRAME_IMS             () { 1 <<  9 }
sub SELECT_IMS            () { 1 << 10 }
sub IN_FOREIGN_CONTENT_IM () { 1 << 11 }
  ## NOTE: "in foreign content" insertion mode is special; it is combined
  ## with the secondary insertion mode.  In this parser, they are stored
  ## together in the bit-or'ed form.

## NOTE: "initial" and "before html" insertion modes have no constants.

## NOTE: "after after body" insertion mode.
sub AFTER_HTML_BODY_IM () { BODY_AFTER_IMS | AFTER_HTML_IMS }

## NOTE: "after after frameset" insertion mode.
sub AFTER_HTML_FRAMESET_IM () { FRAME_IMS | AFTER_HTML_IMS }

## Insertion modes: group flags, plus a number in the two lowest bits.
sub IN_HEAD_IM            () { HEAD_IMS | 0 }
sub IN_HEAD_NOSCRIPT_IM   () { HEAD_IMS | 1 }
sub AFTER_HEAD_IM         () { HEAD_IMS | 2 }
sub BEFORE_HEAD_IM        () { HEAD_IMS | 3 }
sub IN_BODY_IM            () { BODY_IMS }
sub IN_CELL_IM            () { BODY_IMS | BODY_TABLE_IMS | 1 }
sub IN_CAPTION_IM         () { BODY_IMS | BODY_TABLE_IMS | 2 }
sub IN_ROW_IM             () { TABLE_IMS | ROW_IMS | 1 }
sub IN_TABLE_BODY_IM      () { TABLE_IMS | ROW_IMS | 2 }
sub IN_TABLE_IM           () { TABLE_IMS }
sub AFTER_BODY_IM         () { BODY_AFTER_IMS }
sub IN_FRAMESET_IM        () { FRAME_IMS | 1 }
sub AFTER_FRAMESET_IM     () { FRAME_IMS | 2 }
sub IN_SELECT_IM          () { SELECT_IMS | 1 }
sub IN_SELECT_IN_TABLE_IM () { SELECT_IMS | 2 }
sub IN_COLUMN_GROUP_IM    () { 2 }
510
511 ## Implementations MUST act as if they used the state machine in the spec.
512
## Put the tokenizer into its initial condition: data state, PCDATA
## content model, no token or attribute under construction, no
## self-closing flag, and empty |char| and |token| lists.  The first
## input character is read on the way.
sub _initialize_tokenizer ($) {
  my $self = shift;

  $self->{state} = DATA_STATE; # MUST
  $self->{content_model} = PCDATA_CONTENT_MODEL; # be

  ## |current_token| is a start tag, end tag, comment, or DOCTYPE.
  $self->{$_} = undef for qw(
    current_token
    current_attribute
    last_emitted_start_tag_name
    last_attribute_value_state
  );
  delete $self->{self_closing};

  ## |char| is reset before, and |token| after, the first character is
  ## read, in the same order as before this rewrite.
  $self->{char} = [];
  # $self->{next_char}
  !!!next-input-character;
  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
528
529 ## A token has:
530 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
531 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
532 ## ->{name} (DOCTYPE_TOKEN)
533 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
534 ## ->{public_identifier} (DOCTYPE_TOKEN)
535 ## ->{system_identifier} (DOCTYPE_TOKEN)
536 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
537 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
538 ## ->{name}
539 ## ->{value}
540 ## ->{has_reference} == 1 or 0
541 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
542 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
543 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
544 ## while the token is pushed back to the stack.
545
546 ## ISSUE: "When a DOCTYPE token is created, its
547 ## <i>self-closing flag</i> must be unset (its other state is that it
548 ## be set), and its attributes list must be empty.": Wrong subject?
549
550 ## Emitted token MUST immediately be handled by the tree construction state.
551
552 ## Before each step, UA MAY check to see if either one of the scripts in
553 ## "list of scripts that will execute as soon as possible" or the first
554 ## script in the "list of scripts that will execute asynchronously",
555 ## has completed loading. If one has, then it MUST be executed
556 ## and removed from the list.
557
558 ## NOTE: HTML5 "Writing HTML documents" section, applied to
559 ## documents and not to user agents and conformance checkers,
560 ## contains some requirements that are not detected by the
561 ## parsing algorithm:
562 ## - Some requirements on character encoding declarations. ## TODO
563 ## - "Elements MUST NOT contain content that their content model disallows."
564 ## ... Some are parse error, some are not (will be reported by c.c.).
565 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
566 ## - Text (in elements, attributes, and comments) SHOULD NOT contain
567 ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
568
569 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
570 ## be detected by the HTML5 parsing algorithm:
571 ## - Text,
572
573 sub _get_next_token ($) {
574 my $self = shift;
575
576 if ($self->{self_closing}) {
577 !!!parse-error (type => 'nestc', token => $self->{current_token});
578 ## NOTE: The |self_closing| flag is only set by start tag token.
579 ## In addition, when a start tag token is emitted, it is always set to
580 ## |current_token|.
581 delete $self->{self_closing};
582 }
583
584 if (@{$self->{token}}) {
585 $self->{self_closing} = $self->{token}->[0]->{self_closing};
586 return shift @{$self->{token}};
587 }
588
589 A: {
590 if ($self->{state} == DATA_STATE) {
591 if ($self->{next_char} == 0x0026) { # &
592 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
593 not $self->{escape}) {
594 !!!cp (1);
595 $self->{state} = ENTITY_DATA_STATE;
596 !!!next-input-character;
597 redo A;
598 } else {
599 !!!cp (2);
600 #
601 }
602 } elsif ($self->{next_char} == 0x002D) { # -
603 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
604 unless ($self->{escape}) {
605 if ($self->{prev_char}->[0] == 0x002D and # -
606 $self->{prev_char}->[1] == 0x0021 and # !
607 $self->{prev_char}->[2] == 0x003C) { # <
608 !!!cp (3);
609 $self->{escape} = 1;
610 } else {
611 !!!cp (4);
612 }
613 } else {
614 !!!cp (5);
615 }
616 }
617
618 #
619 } elsif ($self->{next_char} == 0x003C) { # <
620 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
621 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
622 not $self->{escape})) {
623 !!!cp (6);
624 $self->{state} = TAG_OPEN_STATE;
625 !!!next-input-character;
626 redo A;
627 } else {
628 !!!cp (7);
629 #
630 }
631 } elsif ($self->{next_char} == 0x003E) { # >
632 if ($self->{escape} and
633 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
634 if ($self->{prev_char}->[0] == 0x002D and # -
635 $self->{prev_char}->[1] == 0x002D) { # -
636 !!!cp (8);
637 delete $self->{escape};
638 } else {
639 !!!cp (9);
640 }
641 } else {
642 !!!cp (10);
643 }
644
645 #
646 } elsif ($self->{next_char} == -1) {
647 !!!cp (11);
648 !!!emit ({type => END_OF_FILE_TOKEN,
649 line => $self->{line}, column => $self->{column}});
650 last A; ## TODO: ok?
651 } else {
652 !!!cp (12);
653 }
654 # Anything else
655 my $token = {type => CHARACTER_TOKEN,
656 data => chr $self->{next_char},
657 line => $self->{line}, column => $self->{column},
658 };
659 ## Stay in the data state
660 !!!next-input-character;
661
662 !!!emit ($token);
663
664 redo A;
665 } elsif ($self->{state} == ENTITY_DATA_STATE) {
666 ## (cannot happen in CDATA state)
667
668 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
669
670 my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
671
672 $self->{state} = DATA_STATE;
673 # next-input-character is already done
674
675 unless (defined $token) {
676 !!!cp (13);
677 !!!emit ({type => CHARACTER_TOKEN, data => '&',
678 line => $l, column => $c,
679 });
680 } else {
681 !!!cp (14);
682 !!!emit ($token);
683 }
684
685 redo A;
686 } elsif ($self->{state} == TAG_OPEN_STATE) {
687 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
688 if ($self->{next_char} == 0x002F) { # /
689 !!!cp (15);
690 !!!next-input-character;
691 $self->{state} = CLOSE_TAG_OPEN_STATE;
692 redo A;
693 } else {
694 !!!cp (16);
695 ## reconsume
696 $self->{state} = DATA_STATE;
697
698 !!!emit ({type => CHARACTER_TOKEN, data => '<',
699 line => $self->{line_prev},
700 column => $self->{column_prev},
701 });
702
703 redo A;
704 }
705 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
706 if ($self->{next_char} == 0x0021) { # !
707 !!!cp (17);
708 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
709 !!!next-input-character;
710 redo A;
711 } elsif ($self->{next_char} == 0x002F) { # /
712 !!!cp (18);
713 $self->{state} = CLOSE_TAG_OPEN_STATE;
714 !!!next-input-character;
715 redo A;
716 } elsif (0x0041 <= $self->{next_char} and
717 $self->{next_char} <= 0x005A) { # A..Z
718 !!!cp (19);
719 $self->{current_token}
720 = {type => START_TAG_TOKEN,
721 tag_name => chr ($self->{next_char} + 0x0020),
722 line => $self->{line_prev},
723 column => $self->{column_prev}};
724 $self->{state} = TAG_NAME_STATE;
725 !!!next-input-character;
726 redo A;
727 } elsif (0x0061 <= $self->{next_char} and
728 $self->{next_char} <= 0x007A) { # a..z
729 !!!cp (20);
730 $self->{current_token} = {type => START_TAG_TOKEN,
731 tag_name => chr ($self->{next_char}),
732 line => $self->{line_prev},
733 column => $self->{column_prev}};
734 $self->{state} = TAG_NAME_STATE;
735 !!!next-input-character;
736 redo A;
737 } elsif ($self->{next_char} == 0x003E) { # >
738 !!!cp (21);
739 !!!parse-error (type => 'empty start tag',
740 line => $self->{line_prev},
741 column => $self->{column_prev});
742 $self->{state} = DATA_STATE;
743 !!!next-input-character;
744
745 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
746 line => $self->{line_prev},
747 column => $self->{column_prev},
748 });
749
750 redo A;
751 } elsif ($self->{next_char} == 0x003F) { # ?
752 !!!cp (22);
753 !!!parse-error (type => 'pio',
754 line => $self->{line_prev},
755 column => $self->{column_prev});
756 $self->{state} = BOGUS_COMMENT_STATE;
757 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
758 line => $self->{line_prev},
759 column => $self->{column_prev},
760 };
761 ## $self->{next_char} is intentionally left as is
762 redo A;
763 } else {
764 !!!cp (23);
765 !!!parse-error (type => 'bare stago');
766 $self->{state} = DATA_STATE;
767 ## reconsume
768
769 !!!emit ({type => CHARACTER_TOKEN, data => '<',
770 line => $self->{line_prev},
771 column => $self->{column_prev},
772 });
773
774 redo A;
775 }
776 } else {
777 die "$0: $self->{content_model} in tag open";
778 }
779 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
780 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
781 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
782 if (defined $self->{last_emitted_start_tag_name}) {
783
784 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
785 my @next_char;
786 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
787 push @next_char, $self->{next_char};
788 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
789 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
790 if ($self->{next_char} == $c or $self->{next_char} == $C) {
791 !!!cp (24);
792 !!!next-input-character;
793 next TAGNAME;
794 } else {
795 !!!cp (25);
796 $self->{next_char} = shift @next_char; # reconsume
797 !!!back-next-input-character (@next_char);
798 $self->{state} = DATA_STATE;
799
800 !!!emit ({type => CHARACTER_TOKEN, data => '</',
801 line => $l, column => $c,
802 });
803
804 redo A;
805 }
806 }
807 push @next_char, $self->{next_char};
808
809 unless ($self->{next_char} == 0x0009 or # HT
810 $self->{next_char} == 0x000A or # LF
811 $self->{next_char} == 0x000B or # VT
812 $self->{next_char} == 0x000C or # FF
813 $self->{next_char} == 0x0020 or # SP
814 $self->{next_char} == 0x003E or # >
815 $self->{next_char} == 0x002F or # /
816 $self->{next_char} == -1) {
817 !!!cp (26);
818 $self->{next_char} = shift @next_char; # reconsume
819 !!!back-next-input-character (@next_char);
820 $self->{state} = DATA_STATE;
821 !!!emit ({type => CHARACTER_TOKEN, data => '</',
822 line => $l, column => $c,
823 });
824 redo A;
825 } else {
826 !!!cp (27);
827 $self->{next_char} = shift @next_char;
828 !!!back-next-input-character (@next_char);
829 # and consume...
830 }
831 } else {
832 ## No start tag token has ever been emitted
833 !!!cp (28);
834 # next-input-character is already done
835 $self->{state} = DATA_STATE;
836 !!!emit ({type => CHARACTER_TOKEN, data => '</',
837 line => $l, column => $c,
838 });
839 redo A;
840 }
841 }
842
843 if (0x0041 <= $self->{next_char} and
844 $self->{next_char} <= 0x005A) { # A..Z
845 !!!cp (29);
846 $self->{current_token}
847 = {type => END_TAG_TOKEN,
848 tag_name => chr ($self->{next_char} + 0x0020),
849 line => $l, column => $c};
850 $self->{state} = TAG_NAME_STATE;
851 !!!next-input-character;
852 redo A;
853 } elsif (0x0061 <= $self->{next_char} and
854 $self->{next_char} <= 0x007A) { # a..z
855 !!!cp (30);
856 $self->{current_token} = {type => END_TAG_TOKEN,
857 tag_name => chr ($self->{next_char}),
858 line => $l, column => $c};
859 $self->{state} = TAG_NAME_STATE;
860 !!!next-input-character;
861 redo A;
862 } elsif ($self->{next_char} == 0x003E) { # >
863 !!!cp (31);
864 !!!parse-error (type => 'empty end tag',
865 line => $self->{line_prev}, ## "<" in "</>"
866 column => $self->{column_prev} - 1);
867 $self->{state} = DATA_STATE;
868 !!!next-input-character;
869 redo A;
870 } elsif ($self->{next_char} == -1) {
871 !!!cp (32);
872 !!!parse-error (type => 'bare etago');
873 $self->{state} = DATA_STATE;
874 # reconsume
875
876 !!!emit ({type => CHARACTER_TOKEN, data => '</',
877 line => $l, column => $c,
878 });
879
880 redo A;
881 } else {
882 !!!cp (33);
883 !!!parse-error (type => 'bogus end tag');
884 $self->{state} = BOGUS_COMMENT_STATE;
885 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
886 line => $self->{line_prev}, # "<" of "</"
887 column => $self->{column_prev} - 1,
888 };
889 ## $self->{next_char} is intentionally left as is
890 redo A;
891 }
892 } elsif ($self->{state} == TAG_NAME_STATE) {
893 if ($self->{next_char} == 0x0009 or # HT
894 $self->{next_char} == 0x000A or # LF
895 $self->{next_char} == 0x000B or # VT
896 $self->{next_char} == 0x000C or # FF
897 $self->{next_char} == 0x0020) { # SP
898 !!!cp (34);
899 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
900 !!!next-input-character;
901 redo A;
902 } elsif ($self->{next_char} == 0x003E) { # >
903 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
904 !!!cp (35);
905 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
906 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
907 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
908 #if ($self->{current_token}->{attributes}) {
909 # ## NOTE: This should never be reached.
910 # !!! cp (36);
911 # !!! parse-error (type => 'end tag attribute');
912 #} else {
913 !!!cp (37);
914 #}
915 } else {
916 die "$0: $self->{current_token}->{type}: Unknown token type";
917 }
918 $self->{state} = DATA_STATE;
919 !!!next-input-character;
920
921 !!!emit ($self->{current_token}); # start tag or end tag
922
923 redo A;
924 } elsif (0x0041 <= $self->{next_char} and
925 $self->{next_char} <= 0x005A) { # A..Z
926 !!!cp (38);
927 $self->{current_token}->{tag_name} .= chr ($self->{next_char} + 0x0020);
928 # start tag or end tag
929 ## Stay in this state
930 !!!next-input-character;
931 redo A;
932 } elsif ($self->{next_char} == -1) {
933 !!!parse-error (type => 'unclosed tag');
934 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
935 !!!cp (39);
936 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
937 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
938 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
939 #if ($self->{current_token}->{attributes}) {
940 # ## NOTE: This state should never be reached.
941 # !!! cp (40);
942 # !!! parse-error (type => 'end tag attribute');
943 #} else {
944 !!!cp (41);
945 #}
946 } else {
947 die "$0: $self->{current_token}->{type}: Unknown token type";
948 }
949 $self->{state} = DATA_STATE;
950 # reconsume
951
952 !!!emit ($self->{current_token}); # start tag or end tag
953
954 redo A;
955 } elsif ($self->{next_char} == 0x002F) { # /
956 !!!cp (42);
957 $self->{state} = SELF_CLOSING_START_TAG_STATE;
958 !!!next-input-character;
959 redo A;
960 } else {
961 !!!cp (44);
962 $self->{current_token}->{tag_name} .= chr $self->{next_char};
963 # start tag or end tag
964 ## Stay in the state
965 !!!next-input-character;
966 redo A;
967 }
968 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
969 if ($self->{next_char} == 0x0009 or # HT
970 $self->{next_char} == 0x000A or # LF
971 $self->{next_char} == 0x000B or # VT
972 $self->{next_char} == 0x000C or # FF
973 $self->{next_char} == 0x0020) { # SP
974 !!!cp (45);
975 ## Stay in the state
976 !!!next-input-character;
977 redo A;
978 } elsif ($self->{next_char} == 0x003E) { # >
979 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
980 !!!cp (46);
981 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
982 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
983 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
984 if ($self->{current_token}->{attributes}) {
985 !!!cp (47);
986 !!!parse-error (type => 'end tag attribute');
987 } else {
988 !!!cp (48);
989 }
990 } else {
991 die "$0: $self->{current_token}->{type}: Unknown token type";
992 }
993 $self->{state} = DATA_STATE;
994 !!!next-input-character;
995
996 !!!emit ($self->{current_token}); # start tag or end tag
997
998 redo A;
999 } elsif (0x0041 <= $self->{next_char} and
1000 $self->{next_char} <= 0x005A) { # A..Z
1001 !!!cp (49);
1002 $self->{current_attribute}
1003 = {name => chr ($self->{next_char} + 0x0020),
1004 value => '',
1005 line => $self->{line}, column => $self->{column}};
1006 $self->{state} = ATTRIBUTE_NAME_STATE;
1007 !!!next-input-character;
1008 redo A;
1009 } elsif ($self->{next_char} == 0x002F) { # /
1010 !!!cp (50);
1011 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1012 !!!next-input-character;
1013 redo A;
1014 } elsif ($self->{next_char} == -1) {
1015 !!!parse-error (type => 'unclosed tag');
1016 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1017 !!!cp (52);
1018 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1019 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1020 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1021 if ($self->{current_token}->{attributes}) {
1022 !!!cp (53);
1023 !!!parse-error (type => 'end tag attribute');
1024 } else {
1025 !!!cp (54);
1026 }
1027 } else {
1028 die "$0: $self->{current_token}->{type}: Unknown token type";
1029 }
1030 $self->{state} = DATA_STATE;
1031 # reconsume
1032
1033 !!!emit ($self->{current_token}); # start tag or end tag
1034
1035 redo A;
1036 } else {
1037 if ({
1038 0x0022 => 1, # "
1039 0x0027 => 1, # '
1040 0x003D => 1, # =
1041 }->{$self->{next_char}}) {
1042 !!!cp (55);
1043 !!!parse-error (type => 'bad attribute name');
1044 } else {
1045 !!!cp (56);
1046 }
1047 $self->{current_attribute}
1048 = {name => chr ($self->{next_char}),
1049 value => '',
1050 line => $self->{line}, column => $self->{column}};
1051 $self->{state} = ATTRIBUTE_NAME_STATE;
1052 !!!next-input-character;
1053 redo A;
1054 }
1055 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1056 my $before_leave = sub {
1057 if (exists $self->{current_token}->{attributes} # start tag or end tag
1058 ->{$self->{current_attribute}->{name}}) { # MUST
1059 !!!cp (57);
1060 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name}, line => $self->{current_attribute}->{line}, column => $self->{current_attribute}->{column});
1061 ## Discard $self->{current_attribute} # MUST
1062 } else {
1063 !!!cp (58);
1064 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
1065 = $self->{current_attribute};
1066 }
1067 }; # $before_leave
1068
1069 if ($self->{next_char} == 0x0009 or # HT
1070 $self->{next_char} == 0x000A or # LF
1071 $self->{next_char} == 0x000B or # VT
1072 $self->{next_char} == 0x000C or # FF
1073 $self->{next_char} == 0x0020) { # SP
1074 !!!cp (59);
1075 $before_leave->();
1076 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1077 !!!next-input-character;
1078 redo A;
1079 } elsif ($self->{next_char} == 0x003D) { # =
1080 !!!cp (60);
1081 $before_leave->();
1082 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1083 !!!next-input-character;
1084 redo A;
1085 } elsif ($self->{next_char} == 0x003E) { # >
1086 $before_leave->();
1087 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1088 !!!cp (61);
1089 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1090 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1091 !!!cp (62);
1092 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1093 if ($self->{current_token}->{attributes}) {
1094 !!!parse-error (type => 'end tag attribute');
1095 }
1096 } else {
1097 die "$0: $self->{current_token}->{type}: Unknown token type";
1098 }
1099 $self->{state} = DATA_STATE;
1100 !!!next-input-character;
1101
1102 !!!emit ($self->{current_token}); # start tag or end tag
1103
1104 redo A;
1105 } elsif (0x0041 <= $self->{next_char} and
1106 $self->{next_char} <= 0x005A) { # A..Z
1107 !!!cp (63);
1108 $self->{current_attribute}->{name} .= chr ($self->{next_char} + 0x0020);
1109 ## Stay in the state
1110 !!!next-input-character;
1111 redo A;
1112 } elsif ($self->{next_char} == 0x002F) { # /
1113 !!!cp (64);
1114 $before_leave->();
1115 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1116 !!!next-input-character;
1117 redo A;
1118 } elsif ($self->{next_char} == -1) {
1119 !!!parse-error (type => 'unclosed tag');
1120 $before_leave->();
1121 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1122 !!!cp (66);
1123 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1124 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1125 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1126 if ($self->{current_token}->{attributes}) {
1127 !!!cp (67);
1128 !!!parse-error (type => 'end tag attribute');
1129 } else {
1130 ## NOTE: This state should never be reached.
1131 !!!cp (68);
1132 }
1133 } else {
1134 die "$0: $self->{current_token}->{type}: Unknown token type";
1135 }
1136 $self->{state} = DATA_STATE;
1137 # reconsume
1138
1139 !!!emit ($self->{current_token}); # start tag or end tag
1140
1141 redo A;
1142 } else {
1143 if ($self->{next_char} == 0x0022 or # "
1144 $self->{next_char} == 0x0027) { # '
1145 !!!cp (69);
1146 !!!parse-error (type => 'bad attribute name');
1147 } else {
1148 !!!cp (70);
1149 }
1150 $self->{current_attribute}->{name} .= chr ($self->{next_char});
1151 ## Stay in the state
1152 !!!next-input-character;
1153 redo A;
1154 }
1155 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1156 if ($self->{next_char} == 0x0009 or # HT
1157 $self->{next_char} == 0x000A or # LF
1158 $self->{next_char} == 0x000B or # VT
1159 $self->{next_char} == 0x000C or # FF
1160 $self->{next_char} == 0x0020) { # SP
1161 !!!cp (71);
1162 ## Stay in the state
1163 !!!next-input-character;
1164 redo A;
1165 } elsif ($self->{next_char} == 0x003D) { # =
1166 !!!cp (72);
1167 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1168 !!!next-input-character;
1169 redo A;
1170 } elsif ($self->{next_char} == 0x003E) { # >
1171 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1172 !!!cp (73);
1173 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1174 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1175 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1176 if ($self->{current_token}->{attributes}) {
1177 !!!cp (74);
1178 !!!parse-error (type => 'end tag attribute');
1179 } else {
1180 ## NOTE: This state should never be reached.
1181 !!!cp (75);
1182 }
1183 } else {
1184 die "$0: $self->{current_token}->{type}: Unknown token type";
1185 }
1186 $self->{state} = DATA_STATE;
1187 !!!next-input-character;
1188
1189 !!!emit ($self->{current_token}); # start tag or end tag
1190
1191 redo A;
1192 } elsif (0x0041 <= $self->{next_char} and
1193 $self->{next_char} <= 0x005A) { # A..Z
1194 !!!cp (76);
1195 $self->{current_attribute}
1196 = {name => chr ($self->{next_char} + 0x0020),
1197 value => '',
1198 line => $self->{line}, column => $self->{column}};
1199 $self->{state} = ATTRIBUTE_NAME_STATE;
1200 !!!next-input-character;
1201 redo A;
1202 } elsif ($self->{next_char} == 0x002F) { # /
1203 !!!cp (77);
1204 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1205 !!!next-input-character;
1206 redo A;
1207 } elsif ($self->{next_char} == -1) {
1208 !!!parse-error (type => 'unclosed tag');
1209 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1210 !!!cp (79);
1211 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1212 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1213 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1214 if ($self->{current_token}->{attributes}) {
1215 !!!cp (80);
1216 !!!parse-error (type => 'end tag attribute');
1217 } else {
1218 ## NOTE: This state should never be reached.
1219 !!!cp (81);
1220 }
1221 } else {
1222 die "$0: $self->{current_token}->{type}: Unknown token type";
1223 }
1224 $self->{state} = DATA_STATE;
1225 # reconsume
1226
1227 !!!emit ($self->{current_token}); # start tag or end tag
1228
1229 redo A;
1230 } else {
1231 !!!cp (82);
1232 $self->{current_attribute}
1233 = {name => chr ($self->{next_char}),
1234 value => '',
1235 line => $self->{line}, column => $self->{column}};
1236 $self->{state} = ATTRIBUTE_NAME_STATE;
1237 !!!next-input-character;
1238 redo A;
1239 }
1240 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1241 if ($self->{next_char} == 0x0009 or # HT
1242 $self->{next_char} == 0x000A or # LF
1243 $self->{next_char} == 0x000B or # VT
1244 $self->{next_char} == 0x000C or # FF
1245 $self->{next_char} == 0x0020) { # SP
1246 !!!cp (83);
1247 ## Stay in the state
1248 !!!next-input-character;
1249 redo A;
1250 } elsif ($self->{next_char} == 0x0022) { # "
1251 !!!cp (84);
1252 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1253 !!!next-input-character;
1254 redo A;
1255 } elsif ($self->{next_char} == 0x0026) { # &
1256 !!!cp (85);
1257 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1258 ## reconsume
1259 redo A;
1260 } elsif ($self->{next_char} == 0x0027) { # '
1261 !!!cp (86);
1262 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1263 !!!next-input-character;
1264 redo A;
1265 } elsif ($self->{next_char} == 0x003E) { # >
1266 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1267 !!!cp (87);
1268 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1269 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1270 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1271 if ($self->{current_token}->{attributes}) {
1272 !!!cp (88);
1273 !!!parse-error (type => 'end tag attribute');
1274 } else {
1275 ## NOTE: This state should never be reached.
1276 !!!cp (89);
1277 }
1278 } else {
1279 die "$0: $self->{current_token}->{type}: Unknown token type";
1280 }
1281 $self->{state} = DATA_STATE;
1282 !!!next-input-character;
1283
1284 !!!emit ($self->{current_token}); # start tag or end tag
1285
1286 redo A;
1287 } elsif ($self->{next_char} == -1) {
1288 !!!parse-error (type => 'unclosed tag');
1289 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1290 !!!cp (90);
1291 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1292 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1293 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1294 if ($self->{current_token}->{attributes}) {
1295 !!!cp (91);
1296 !!!parse-error (type => 'end tag attribute');
1297 } else {
1298 ## NOTE: This state should never be reached.
1299 !!!cp (92);
1300 }
1301 } else {
1302 die "$0: $self->{current_token}->{type}: Unknown token type";
1303 }
1304 $self->{state} = DATA_STATE;
1305 ## reconsume
1306
1307 !!!emit ($self->{current_token}); # start tag or end tag
1308
1309 redo A;
1310 } else {
1311 if ($self->{next_char} == 0x003D) { # =
1312 !!!cp (93);
1313 !!!parse-error (type => 'bad attribute value');
1314 } else {
1315 !!!cp (94);
1316 }
1317 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1318 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1319 !!!next-input-character;
1320 redo A;
1321 }
1322 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1323 if ($self->{next_char} == 0x0022) { # "
1324 !!!cp (95);
1325 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1326 !!!next-input-character;
1327 redo A;
1328 } elsif ($self->{next_char} == 0x0026) { # &
1329 !!!cp (96);
1330 $self->{last_attribute_value_state} = $self->{state};
1331 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1332 !!!next-input-character;
1333 redo A;
1334 } elsif ($self->{next_char} == -1) {
1335 !!!parse-error (type => 'unclosed attribute value');
1336 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1337 !!!cp (97);
1338 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1339 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1340 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1341 if ($self->{current_token}->{attributes}) {
1342 !!!cp (98);
1343 !!!parse-error (type => 'end tag attribute');
1344 } else {
1345 ## NOTE: This state should never be reached.
1346 !!!cp (99);
1347 }
1348 } else {
1349 die "$0: $self->{current_token}->{type}: Unknown token type";
1350 }
1351 $self->{state} = DATA_STATE;
1352 ## reconsume
1353
1354 !!!emit ($self->{current_token}); # start tag or end tag
1355
1356 redo A;
1357 } else {
1358 !!!cp (100);
1359 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1360 ## Stay in the state
1361 !!!next-input-character;
1362 redo A;
1363 }
1364 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1365 if ($self->{next_char} == 0x0027) { # '
1366 !!!cp (101);
1367 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1368 !!!next-input-character;
1369 redo A;
1370 } elsif ($self->{next_char} == 0x0026) { # &
1371 !!!cp (102);
1372 $self->{last_attribute_value_state} = $self->{state};
1373 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1374 !!!next-input-character;
1375 redo A;
1376 } elsif ($self->{next_char} == -1) {
1377 !!!parse-error (type => 'unclosed attribute value');
1378 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1379 !!!cp (103);
1380 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1381 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1382 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1383 if ($self->{current_token}->{attributes}) {
1384 !!!cp (104);
1385 !!!parse-error (type => 'end tag attribute');
1386 } else {
1387 ## NOTE: This state should never be reached.
1388 !!!cp (105);
1389 }
1390 } else {
1391 die "$0: $self->{current_token}->{type}: Unknown token type";
1392 }
1393 $self->{state} = DATA_STATE;
1394 ## reconsume
1395
1396 !!!emit ($self->{current_token}); # start tag or end tag
1397
1398 redo A;
1399 } else {
1400 !!!cp (106);
1401 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1402 ## Stay in the state
1403 !!!next-input-character;
1404 redo A;
1405 }
1406 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
## Attribute value (unquoted) state.  Characters are appended to
## $self->{current_attribute}->{value} until one of the following:
##   - white space: switch to BEFORE_ATTRIBUTE_NAME_STATE;
##   - "&": remember this state in {last_attribute_value_state} and
##     switch to ENTITY_IN_ATTRIBUTE_VALUE_STATE;
##   - ">": emit the current tag token and switch to DATA_STATE;
##   - EOF (-1): report 'unclosed tag', emit the current tag token and
##     let DATA_STATE reconsume the EOF.
1407 if ($self->{next_char} == 0x0009 or # HT
1408 $self->{next_char} == 0x000A or # LF
1409 $self->{next_char} == 0x000B or # VT
1410 $self->{next_char} == 0x000C or # FF
1411 $self->{next_char} == 0x0020) { # SP
1412 !!!cp (107);
1413 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1414 !!!next-input-character;
1415 redo A;
1416 } elsif ($self->{next_char} == 0x0026) { # &
1417 !!!cp (108);
1418 $self->{last_attribute_value_state} = $self->{state};
1419 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1420 !!!next-input-character;
1421 redo A;
1422 } elsif ($self->{next_char} == 0x003E) { # >
## End of the tag.  A start tag records its name as the last emitted
## start tag name; an end tag resets the content model to PCDATA and is
## reported if it carries attributes.
1423 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1424 !!!cp (109);
1425 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1426 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1427 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1428 if ($self->{current_token}->{attributes}) {
1429 !!!cp (110);
1430 !!!parse-error (type => 'end tag attribute');
1431 } else {
1432 ## NOTE: This state should never be reached.
1433 !!!cp (111);
1434 }
1435 } else {
1436 die "$0: $self->{current_token}->{type}: Unknown token type";
1437 }
1438 $self->{state} = DATA_STATE;
1439 !!!next-input-character;
1440
1441 !!!emit ($self->{current_token}); # start tag or end tag
1442
1443 redo A;
1444 } elsif ($self->{next_char} == -1) {
## EOF inside the tag: same bookkeeping as for ">", but the EOF is not
## consumed here.
1445 !!!parse-error (type => 'unclosed tag');
1446 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1447 !!!cp (112);
1448 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1449 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1450 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1451 if ($self->{current_token}->{attributes}) {
1452 !!!cp (113);
1453 !!!parse-error (type => 'end tag attribute');
1454 } else {
1455 ## NOTE: This state should never be reached.
1456 !!!cp (114);
1457 }
1458 } else {
1459 die "$0: $self->{current_token}->{type}: Unknown token type";
1460 }
1461 $self->{state} = DATA_STATE;
1462 ## reconsume
1463
1464 !!!emit ($self->{current_token}); # start tag or end tag
1465
1466 redo A;
1467 } else {
## Any other character becomes part of the value.  The characters
## ", ' and = are reported as 'bad attribute value' but are still
## appended.
1468 if ({
1469 0x0022 => 1, # "
1470 0x0027 => 1, # '
1471 0x003D => 1, # =
1472 }->{$self->{next_char}}) {
1473 !!!cp (115);
1474 !!!parse-error (type => 'bad attribute value');
1475 } else {
1476 !!!cp (116);
1477 }
1478 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1479 ## Stay in the state
1480 !!!next-input-character;
1481 redo A;
1482 }
1483 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
## Entity in attribute value state: entered from one of the three
## attribute value states after a "&", with the originating state saved
## in $self->{last_attribute_value_state}.
##
## The second argument passed to the helper below is 0x0022 (") when the
## value is double-quoted, 0x0027 (') when it is single-quoted, and -1
## otherwise (i.e. unquoted).
## NOTE(review): the first argument (1) presumably flags "in attribute"
## and the second presumably is the additional allowed character; the
## helper is defined outside this part of the file - confirm there.
1484 my $token = $self->_tokenize_attempt_to_consume_an_entity
1485 (1,
1486 $self->{last_attribute_value_state}
1487 == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
1488 $self->{last_attribute_value_state}
1489 == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
1490 -1);
1491
1492 unless (defined $token) {
## Nothing was returned: keep the "&" as literal text.
1493 !!!cp (117);
1494 $self->{current_attribute}->{value} .= '&';
1495 } else {
## NOTE(review): checkpoint id 118 is also used by the first branch of
## AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; confirm whether ids must be unique.
1496 !!!cp (118);
1497 $self->{current_attribute}->{value} .= $token->{data};
## NOTE(review): this assignment overwrites any {has_reference} value
## set by an earlier reference in the same attribute value - confirm
## that the helper always sets the flag on the tokens it returns.
1498 $self->{current_attribute}->{has_reference} = $token->{has_reference};
1499 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1500 }
1501
## Resume the attribute value state that was active before the "&".
1502 $self->{state} = $self->{last_attribute_value_state};
1503 # next-input-character is already done
1504 redo A;
1505 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1506 if ($self->{next_char} == 0x0009 or # HT
1507 $self->{next_char} == 0x000A or # LF
1508 $self->{next_char} == 0x000B or # VT
1509 $self->{next_char} == 0x000C or # FF
1510 $self->{next_char} == 0x0020) { # SP
1511 !!!cp (118);
1512 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1513 !!!next-input-character;
1514 redo A;
1515 } elsif ($self->{next_char} == 0x003E) { # >
1516 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1517 !!!cp (119);
1518 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1519 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1520 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1521 if ($self->{current_token}->{attributes}) {
1522 !!!cp (120);
1523 !!!parse-error (type => 'end tag attribute');
1524 } else {
1525 ## NOTE: This state should never be reached.
1526 !!!cp (121);
1527 }
1528 } else {
1529 die "$0: $self->{current_token}->{type}: Unknown token type";
1530 }
1531 $self->{state} = DATA_STATE;
1532 !!!next-input-character;
1533
1534 !!!emit ($self->{current_token}); # start tag or end tag
1535
1536 redo A;
1537 } elsif ($self->{next_char} == 0x002F) { # /
1538 !!!cp (122);
1539 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1540 !!!next-input-character;
1541 redo A;
1542 } else {
1543 !!!cp ('124.1');
1544 !!!parse-error (type => 'no space between attributes');
1545 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1546 ## reconsume
1547 redo A;
1548 }
1549 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
## Self-closing start tag state: entered after a "/" was seen inside a
## tag.  If the next character is ">", the tag token is emitted and the
## tokenizer returns to DATA_STATE; otherwise the "/" is reported as an
## error and the character is reconsumed in BEFORE_ATTRIBUTE_NAME_STATE
## (which also covers EOF).
1550 if ($self->{next_char} == 0x003E) { # >
1551 if ($self->{current_token}->{type} == END_TAG_TOKEN) {
## "/>" on an end tag is always an error.  As for any other end tag,
## the content model is reset and attributes are reported.
1552 !!!cp ('124.2');
1553 !!!parse-error (type => 'nestc', token => $self->{current_token});
1554 ## TODO: Different type than slash in start tag
1555 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1556 if ($self->{current_token}->{attributes}) {
1557 !!!cp ('124.4');
1558 !!!parse-error (type => 'end tag attribute');
1559 } else {
1560 !!!cp ('124.5');
1561 }
1562 ## TODO: Test |<title></title/>|
1563 } else {
1564 !!!cp ('124.3');
1565 $self->{self_closing} = 1;
## Record the tag name, as every other place that emits a start tag
## does.  Without this, a start tag written as |<title/>| leaves the
## name of the previously emitted start tag in place (presumably
## breaking the matching of a following |</title>| - the consumer of
## this field is outside this part of the file).
$self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1566 }
1567
1568 $self->{state} = DATA_STATE;
1569 !!!next-input-character;
1570
1571 !!!emit ($self->{current_token}); # start tag or end tag
1572
1573 redo A;
1574 } else {
## NOTE(review): checkpoint id '124.4' is also used above for the
## "end tag with attributes" case; confirm whether ids must be unique.
1575 !!!cp ('124.4');
1576 !!!parse-error (type => 'nestc');
1577 ## TODO: This error type is wrong.
1578 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1579 ## Reconsume.
1580 redo A;
1581 }
1582 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1583 ## (only happen if PCDATA state)
1584
## Bogus comment state: everything up to the next ">" (or EOF) is
## collected into the data of the current comment token, which is then
## emitted.
1585 ## NOTE: Set by the previous state
1586 #my $token = {type => COMMENT_TOKEN, data => ''};
1587
## The inner block BC loops over the input characters itself instead of
## going back through the outer state dispatch (A) for each character.
1588 BC: {
1589 if ($self->{next_char} == 0x003E) { # >
1590 !!!cp (124);
1591 $self->{state} = DATA_STATE;
1592 !!!next-input-character;
1593
1594 !!!emit ($self->{current_token}); # comment
1595
1596 redo A;
1597 } elsif ($self->{next_char} == -1) {
## EOF ends the comment as well; the EOF itself is left for DATA_STATE.
1598 !!!cp (125);
1599 $self->{state} = DATA_STATE;
1600 ## reconsume
1601
1602 !!!emit ($self->{current_token}); # comment
1603
1604 redo A;
1605 } else {
1606 !!!cp (126);
1607 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
1608 !!!next-input-character;
1609 redo BC;
1610 }
1611 } # BC
1612
## Every branch of BC ends in |redo A| or |redo BC|, so this line is only
## a guard against future edits that let control fall out of the block.
1613 die "$0: _get_next_token: unexpected case [BC]";
1614 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1615 ## (only happen if PCDATA state)
1616
1617 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1);
1618
1619 my @next_char;
1620 push @next_char, $self->{next_char};
1621
1622 if ($self->{next_char} == 0x002D) { # -
1623 !!!next-input-character;
1624 push @next_char, $self->{next_char};
1625 if ($self->{next_char} == 0x002D) { # -
1626 !!!cp (127);
1627 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1628 line => $l, column => $c,
1629 };
1630 $self->{state} = COMMENT_START_STATE;
1631 !!!next-input-character;
1632 redo A;
1633 } else {
1634 !!!cp (128);
1635 }
1636 } elsif ($self->{next_char} == 0x0044 or # D
1637 $self->{next_char} == 0x0064) { # d
1638 !!!next-input-character;
1639 push @next_char, $self->{next_char};
1640 if ($self->{next_char} == 0x004F or # O
1641 $self->{next_char} == 0x006F) { # o
1642 !!!next-input-character;
1643 push @next_char, $self->{next_char};
1644 if ($self->{next_char} == 0x0043 or # C
1645 $self->{next_char} == 0x0063) { # c
1646 !!!next-input-character;
1647 push @next_char, $self->{next_char};
1648 if ($self->{next_char} == 0x0054 or # T
1649 $self->{next_char} == 0x0074) { # t
1650 !!!next-input-character;
1651 push @next_char, $self->{next_char};
1652 if ($self->{next_char} == 0x0059 or # Y
1653 $self->{next_char} == 0x0079) { # y
1654 !!!next-input-character;
1655 push @next_char, $self->{next_char};
1656 if ($self->{next_char} == 0x0050 or # P
1657 $self->{next_char} == 0x0070) { # p
1658 !!!next-input-character;
1659 push @next_char, $self->{next_char};
1660 if ($self->{next_char} == 0x0045 or # E
1661 $self->{next_char} == 0x0065) { # e
1662 !!!cp (129);
1663 ## TODO: What a stupid code this is!
1664 $self->{state} = DOCTYPE_STATE;
1665 $self->{current_token} = {type => DOCTYPE_TOKEN,
1666 quirks => 1,
1667 line => $l, column => $c,
1668 };
1669 !!!next-input-character;
1670 redo A;
1671 } else {
1672 !!!cp (130);
1673 }
1674 } else {
1675 !!!cp (131);
1676 }
1677 } else {
1678 !!!cp (132);
1679 }
1680 } else {
1681 !!!cp (133);
1682 }
1683 } else {
1684 !!!cp (134);
1685 }
1686 } else {
1687 !!!cp (135);
1688 }
1689 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1690 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
1691 $self->{next_char} == 0x005B) { # [
1692 !!!next-input-character;
1693 push @next_char, $self->{next_char};
1694 if ($self->{next_char} == 0x0043) { # C
1695 !!!next-input-character;
1696 push @next_char, $self->{next_char};
1697 if ($self->{next_char} == 0x0044) { # D
1698 !!!next-input-character;
1699 push @next_char, $self->{next_char};
1700 if ($self->{next_char} == 0x0041) { # A
1701 !!!next-input-character;
1702 push @next_char, $self->{next_char};
1703 if ($self->{next_char} == 0x0054) { # T
1704 !!!next-input-character;
1705 push @next_char, $self->{next_char};
1706 if ($self->{next_char} == 0x0041) { # A
1707 !!!next-input-character;
1708 push @next_char, $self->{next_char};
1709 if ($self->{next_char} == 0x005B) { # [
1710 !!!cp (135.1);
1711 $self->{state} = CDATA_BLOCK_STATE;
1712 !!!next-input-character;
1713 redo A;
1714 } else {
1715 !!!cp (135.2);
1716 }
1717 } else {
1718 !!!cp (135.3);
1719 }
1720 } else {
1721 !!!cp (135.4);
1722 }
1723 } else {
1724 !!!cp (135.5);
1725 }
1726 } else {
1727 !!!cp (135.6);
1728 }
1729 } else {
1730 !!!cp (135.7);
1731 }
1732 } else {
1733 !!!cp (136);
1734 }
1735
1736 !!!parse-error (type => 'bogus comment');
1737 $self->{next_char} = shift @next_char;
1738 !!!back-next-input-character (@next_char);
1739 $self->{state} = BOGUS_COMMENT_STATE;
1740 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1741 line => $l, column => $c,
1742 };
1743 redo A;
1744
1745 ## ISSUE: typos in spec: chacacters, is is a parse error
1746 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1747 } elsif ($self->{state} == COMMENT_START_STATE) {
1748 if ($self->{next_char} == 0x002D) { # -
1749 !!!cp (137);
1750 $self->{state} = COMMENT_START_DASH_STATE;
1751 !!!next-input-character;
1752 redo A;
1753 } elsif ($self->{next_char} == 0x003E) { # >
1754 !!!cp (138);
1755 !!!parse-error (type => 'bogus comment');
1756 $self->{state} = DATA_STATE;
1757 !!!next-input-character;
1758
1759 !!!emit ($self->{current_token}); # comment
1760
1761 redo A;
1762 } elsif ($self->{next_char} == -1) {
1763 !!!cp (139);
1764 !!!parse-error (type => 'unclosed comment');
1765 $self->{state} = DATA_STATE;
1766 ## reconsume
1767
1768 !!!emit ($self->{current_token}); # comment
1769
1770 redo A;
1771 } else {
1772 !!!cp (140);
1773 $self->{current_token}->{data} # comment
1774 .= chr ($self->{next_char});
1775 $self->{state} = COMMENT_STATE;
1776 !!!next-input-character;
1777 redo A;
1778 }
1779 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1780 if ($self->{next_char} == 0x002D) { # -
1781 !!!cp (141);
1782 $self->{state} = COMMENT_END_STATE;
1783 !!!next-input-character;
1784 redo A;
1785 } elsif ($self->{next_char} == 0x003E) { # >
1786 !!!cp (142);
1787 !!!parse-error (type => 'bogus comment');
1788 $self->{state} = DATA_STATE;
1789 !!!next-input-character;
1790
1791 !!!emit ($self->{current_token}); # comment
1792
1793 redo A;
1794 } elsif ($self->{next_char} == -1) {
1795 !!!cp (143);
1796 !!!parse-error (type => 'unclosed comment');
1797 $self->{state} = DATA_STATE;
1798 ## reconsume
1799
1800 !!!emit ($self->{current_token}); # comment
1801
1802 redo A;
1803 } else {
1804 !!!cp (144);
1805 $self->{current_token}->{data} # comment
1806 .= '-' . chr ($self->{next_char});
1807 $self->{state} = COMMENT_STATE;
1808 !!!next-input-character;
1809 redo A;
1810 }
1811 } elsif ($self->{state} == COMMENT_STATE) {
1812 if ($self->{next_char} == 0x002D) { # -
1813 !!!cp (145);
1814 $self->{state} = COMMENT_END_DASH_STATE;
1815 !!!next-input-character;
1816 redo A;
1817 } elsif ($self->{next_char} == -1) {
1818 !!!cp (146);
1819 !!!parse-error (type => 'unclosed comment');
1820 $self->{state} = DATA_STATE;
1821 ## reconsume
1822
1823 !!!emit ($self->{current_token}); # comment
1824
1825 redo A;
1826 } else {
1827 !!!cp (147);
1828 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
1829 ## Stay in the state
1830 !!!next-input-character;
1831 redo A;
1832 }
1833 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1834 if ($self->{next_char} == 0x002D) { # -
1835 !!!cp (148);
1836 $self->{state} = COMMENT_END_STATE;
1837 !!!next-input-character;
1838 redo A;
1839 } elsif ($self->{next_char} == -1) {
1840 !!!cp (149);
1841 !!!parse-error (type => 'unclosed comment');
1842 $self->{state} = DATA_STATE;
1843 ## reconsume
1844
1845 !!!emit ($self->{current_token}); # comment
1846
1847 redo A;
1848 } else {
1849 !!!cp (150);
1850 $self->{current_token}->{data} .= '-' . chr ($self->{next_char}); # comment
1851 $self->{state} = COMMENT_STATE;
1852 !!!next-input-character;
1853 redo A;
1854 }
1855 } elsif ($self->{state} == COMMENT_END_STATE) {
## Comment end state: entered after "--" has been seen inside a comment
## (from COMMENT_START_DASH_STATE or COMMENT_END_DASH_STATE).  Those two
## hyphens have not been appended to the comment data yet; they are only
## added if the comment turns out not to end here.
1856 if ($self->{next_char} == 0x003E) { # >
## "-->": the comment is complete; the pending "--" is dropped.
1857 !!!cp (151);
1858 $self->{state} = DATA_STATE;
1859 !!!next-input-character;
1860
1861 !!!emit ($self->{current_token}); # comment
1862
1863 redo A;
1864 } elsif ($self->{next_char} == 0x002D) { # -
## A third (or later) hyphen: one "-" is moved into the data so that
## exactly two remain pending.  The error is reported at the position of
## the previous character.
1865 !!!cp (152);
1866 !!!parse-error (type => 'dash in comment',
1867 line => $self->{line_prev},
1868 column => $self->{column_prev});
1869 $self->{current_token}->{data} .= '-'; # comment
1870 ## Stay in the state
1871 !!!next-input-character;
1872 redo A;
1873 } elsif ($self->{next_char} == -1) {
## EOF: emit what has been collected (without the pending "--") and let
## DATA_STATE reconsume the EOF.
1874 !!!cp (153);
1875 !!!parse-error (type => 'unclosed comment');
1876 $self->{state} = DATA_STATE;
1877 ## reconsume
1878
1879 !!!emit ($self->{current_token}); # comment
1880
1881 redo A;
1882 } else {
## "--" followed by anything else is an error; the two hyphens and the
## current character become comment data and the comment continues.
1883 !!!cp (154);
1884 !!!parse-error (type => 'dash in comment',
1885 line => $self->{line_prev},
1886 column => $self->{column_prev});
1887 $self->{current_token}->{data} .= '--' . chr ($self->{next_char}); # comment
1888 $self->{state} = COMMENT_STATE;
1889 !!!next-input-character;
1890 redo A;
1891 }
1892 } elsif ($self->{state} == DOCTYPE_STATE) {
1893 if ($self->{next_char} == 0x0009 or # HT
1894 $self->{next_char} == 0x000A or # LF
1895 $self->{next_char} == 0x000B or # VT
1896 $self->{next_char} == 0x000C or # FF
1897 $self->{next_char} == 0x0020) { # SP
1898 !!!cp (155);
1899 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1900 !!!next-input-character;
1901 redo A;
1902 } else {
1903 !!!cp (156);
1904 !!!parse-error (type => 'no space before DOCTYPE name');
1905 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1906 ## reconsume
1907 redo A;
1908 }
1909 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1910 if ($self->{next_char} == 0x0009 or # HT
1911 $self->{next_char} == 0x000A or # LF
1912 $self->{next_char} == 0x000B or # VT
1913 $self->{next_char} == 0x000C or # FF
1914 $self->{next_char} == 0x0020) { # SP
1915 !!!cp (157);
1916 ## Stay in the state
1917 !!!next-input-character;
1918 redo A;
1919 } elsif ($self->{next_char} == 0x003E) { # >
1920 !!!cp (158);
1921 !!!parse-error (type => 'no DOCTYPE name');
1922 $self->{state} = DATA_STATE;
1923 !!!next-input-character;
1924
1925 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
1926
1927 redo A;
1928 } elsif ($self->{next_char} == -1) {
1929 !!!cp (159);
1930 !!!parse-error (type => 'no DOCTYPE name');
1931 $self->{state} = DATA_STATE;
1932 ## reconsume
1933
1934 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
1935
1936 redo A;
1937 } else {
1938 !!!cp (160);
1939 $self->{current_token}->{name} = chr $self->{next_char};
1940 delete $self->{current_token}->{quirks};
1941 ## ISSUE: "Set the token's name name to the" in the spec
1942 $self->{state} = DOCTYPE_NAME_STATE;
1943 !!!next-input-character;
1944 redo A;
1945 }
1946 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1947 ## ISSUE: Redundant "First," in the spec.
1948 if ($self->{next_char} == 0x0009 or # HT
1949 $self->{next_char} == 0x000A or # LF
1950 $self->{next_char} == 0x000B or # VT
1951 $self->{next_char} == 0x000C or # FF
1952 $self->{next_char} == 0x0020) { # SP
1953 !!!cp (161);
1954 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1955 !!!next-input-character;
1956 redo A;
1957 } elsif ($self->{next_char} == 0x003E) { # >
1958 !!!cp (162);
1959 $self->{state} = DATA_STATE;
1960 !!!next-input-character;
1961
1962 !!!emit ($self->{current_token}); # DOCTYPE
1963
1964 redo A;
1965 } elsif ($self->{next_char} == -1) {
1966 !!!cp (163);
1967 !!!parse-error (type => 'unclosed DOCTYPE');
1968 $self->{state} = DATA_STATE;
1969 ## reconsume
1970
1971 $self->{current_token}->{quirks} = 1;
1972 !!!emit ($self->{current_token}); # DOCTYPE
1973
1974 redo A;
1975 } else {
1976 !!!cp (164);
1977 $self->{current_token}->{name}
1978 .= chr ($self->{next_char}); # DOCTYPE
1979 ## Stay in the state
1980 !!!next-input-character;
1981 redo A;
1982 }
1983 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1984 if ($self->{next_char} == 0x0009 or # HT
1985 $self->{next_char} == 0x000A or # LF
1986 $self->{next_char} == 0x000B or # VT
1987 $self->{next_char} == 0x000C or # FF
1988 $self->{next_char} == 0x0020) { # SP
1989 !!!cp (165);
1990 ## Stay in the state
1991 !!!next-input-character;
1992 redo A;
1993 } elsif ($self->{next_char} == 0x003E) { # >
1994 !!!cp (166);
1995 $self->{state} = DATA_STATE;
1996 !!!next-input-character;
1997
1998 !!!emit ($self->{current_token}); # DOCTYPE
1999
2000 redo A;
2001 } elsif ($self->{next_char} == -1) {
2002 !!!cp (167);
2003 !!!parse-error (type => 'unclosed DOCTYPE');
2004 $self->{state} = DATA_STATE;
2005 ## reconsume
2006
2007 $self->{current_token}->{quirks} = 1;
2008 !!!emit ($self->{current_token}); # DOCTYPE
2009
2010 redo A;
2011 } elsif ($self->{next_char} == 0x0050 or # P
2012 $self->{next_char} == 0x0070) { # p
2013 !!!next-input-character;
2014 if ($self->{next_char} == 0x0055 or # U
2015 $self->{next_char} == 0x0075) { # u
2016 !!!next-input-character;
2017 if ($self->{next_char} == 0x0042 or # B
2018 $self->{next_char} == 0x0062) { # b
2019 !!!next-input-character;
2020 if ($self->{next_char} == 0x004C or # L
2021 $self->{next_char} == 0x006C) { # l
2022 !!!next-input-character;
2023 if ($self->{next_char} == 0x0049 or # I
2024 $self->{next_char} == 0x0069) { # i
2025 !!!next-input-character;
2026 if ($self->{next_char} == 0x0043 or # C
2027 $self->{next_char} == 0x0063) { # c
2028 !!!cp (168);
2029 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2030 !!!next-input-character;
2031 redo A;
2032 } else {
2033 !!!cp (169);
2034 }
2035 } else {
2036 !!!cp (170);
2037 }
2038 } else {
2039 !!!cp (171);
2040 }
2041 } else {
2042 !!!cp (172);
2043 }
2044 } else {
2045 !!!cp (173);
2046 }
2047
2048 #
2049 } elsif ($self->{next_char} == 0x0053 or # S
2050 $self->{next_char} == 0x0073) { # s
2051 !!!next-input-character;
2052 if ($self->{next_char} == 0x0059 or # Y
2053 $self->{next_char} == 0x0079) { # y
2054 !!!next-input-character;
2055 if ($self->{next_char} == 0x0053 or # S
2056 $self->{next_char} == 0x0073) { # s
2057 !!!next-input-character;
2058 if ($self->{next_char} == 0x0054 or # T
2059 $self->{next_char} == 0x0074) { # t
2060 !!!next-input-character;
2061 if ($self->{next_char} == 0x0045 or # E
2062 $self->{next_char} == 0x0065) { # e
2063 !!!next-input-character;
2064 if ($self->{next_char} == 0x004D or # M
2065 $self->{next_char} == 0x006D) { # m
2066 !!!cp (174);
2067 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2068 !!!next-input-character;
2069 redo A;
2070 } else {
2071 !!!cp (175);
2072 }
2073 } else {
2074 !!!cp (176);
2075 }
2076 } else {
2077 !!!cp (177);
2078 }
2079 } else {
2080 !!!cp (178);
2081 }
2082 } else {
2083 !!!cp (179);
2084 }
2085
2086 #
2087 } else {
2088 !!!cp (180);
2089 !!!next-input-character;
2090 #
2091 }
2092
2093 !!!parse-error (type => 'string after DOCTYPE name');
2094 $self->{current_token}->{quirks} = 1;
2095
2096 $self->{state} = BOGUS_DOCTYPE_STATE;
2097 # next-input-character is already done
2098 redo A;
2099 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2100 if ({
2101 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2102 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2103 }->{$self->{next_char}}) {
2104 !!!cp (181);
2105 ## Stay in the state
2106 !!!next-input-character;
2107 redo A;
2108 } elsif ($self->{next_char} eq 0x0022) { # "
2109 !!!cp (182);
2110 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2111 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2112 !!!next-input-character;
2113 redo A;
2114 } elsif ($self->{next_char} eq 0x0027) { # '
2115 !!!cp (183);
2116 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2117 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2118 !!!next-input-character;
2119 redo A;
2120 } elsif ($self->{next_char} eq 0x003E) { # >
2121 !!!cp (184);
2122 !!!parse-error (type => 'no PUBLIC literal');
2123
2124 $self->{state} = DATA_STATE;
2125 !!!next-input-character;
2126
2127 $self->{current_token}->{quirks} = 1;
2128 !!!emit ($self->{current_token}); # DOCTYPE
2129
2130 redo A;
2131 } elsif ($self->{next_char} == -1) {
2132 !!!cp (185);
2133 !!!parse-error (type => 'unclosed DOCTYPE');
2134
2135 $self->{state} = DATA_STATE;
2136 ## reconsume
2137
2138 $self->{current_token}->{quirks} = 1;
2139 !!!emit ($self->{current_token}); # DOCTYPE
2140
2141 redo A;
2142 } else {
2143 !!!cp (186);
2144 !!!parse-error (type => 'string after PUBLIC');
2145 $self->{current_token}->{quirks} = 1;
2146
2147 $self->{state} = BOGUS_DOCTYPE_STATE;
2148 !!!next-input-character;
2149 redo A;
2150 }
2151 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2152 if ($self->{next_char} == 0x0022) { # "
2153 !!!cp (187);
2154 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2155 !!!next-input-character;
2156 redo A;
2157 } elsif ($self->{next_char} == 0x003E) { # >
2158 !!!cp (188);
2159 !!!parse-error (type => 'unclosed PUBLIC literal');
2160
2161 $self->{state} = DATA_STATE;
2162 !!!next-input-character;
2163
2164 $self->{current_token}->{quirks} = 1;
2165 !!!emit ($self->{current_token}); # DOCTYPE
2166
2167 redo A;
2168 } elsif ($self->{next_char} == -1) {
2169 !!!cp (189);
2170 !!!parse-error (type => 'unclosed PUBLIC literal');
2171
2172 $self->{state} = DATA_STATE;
2173 ## reconsume
2174
2175 $self->{current_token}->{quirks} = 1;
2176 !!!emit ($self->{current_token}); # DOCTYPE
2177
2178 redo A;
2179 } else {
2180 !!!cp (190);
2181 $self->{current_token}->{public_identifier} # DOCTYPE
2182 .= chr $self->{next_char};
2183 ## Stay in the state
2184 !!!next-input-character;
2185 redo A;
2186 }
2187 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2188 if ($self->{next_char} == 0x0027) { # '
2189 !!!cp (191);
2190 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2191 !!!next-input-character;
2192 redo A;
2193 } elsif ($self->{next_char} == 0x003E) { # >
2194 !!!cp (192);
2195 !!!parse-error (type => 'unclosed PUBLIC literal');
2196
2197 $self->{state} = DATA_STATE;
2198 !!!next-input-character;
2199
2200 $self->{current_token}->{quirks} = 1;
2201 !!!emit ($self->{current_token}); # DOCTYPE
2202
2203 redo A;
2204 } elsif ($self->{next_char} == -1) {
2205 !!!cp (193);
2206 !!!parse-error (type => 'unclosed PUBLIC literal');
2207
2208 $self->{state} = DATA_STATE;
2209 ## reconsume
2210
2211 $self->{current_token}->{quirks} = 1;
2212 !!!emit ($self->{current_token}); # DOCTYPE
2213
2214 redo A;
2215 } else {
2216 !!!cp (194);
2217 $self->{current_token}->{public_identifier} # DOCTYPE
2218 .= chr $self->{next_char};
2219 ## Stay in the state
2220 !!!next-input-character;
2221 redo A;
2222 }
2223 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2224 if ({
2225 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2226 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2227 }->{$self->{next_char}}) {
2228 !!!cp (195);
2229 ## Stay in the state
2230 !!!next-input-character;
2231 redo A;
2232 } elsif ($self->{next_char} == 0x0022) { # "
2233 !!!cp (196);
2234 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2235 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2236 !!!next-input-character;
2237 redo A;
2238 } elsif ($self->{next_char} == 0x0027) { # '
2239 !!!cp (197);
2240 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2241 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2242 !!!next-input-character;
2243 redo A;
2244 } elsif ($self->{next_char} == 0x003E) { # >
2245 !!!cp (198);
2246 $self->{state} = DATA_STATE;
2247 !!!next-input-character;
2248
2249 !!!emit ($self->{current_token}); # DOCTYPE
2250
2251 redo A;
2252 } elsif ($self->{next_char} == -1) {
2253 !!!cp (199);
2254 !!!parse-error (type => 'unclosed DOCTYPE');
2255
2256 $self->{state} = DATA_STATE;
2257 ## reconsume
2258
2259 $self->{current_token}->{quirks} = 1;
2260 !!!emit ($self->{current_token}); # DOCTYPE
2261
2262 redo A;
2263 } else {
2264 !!!cp (200);
2265 !!!parse-error (type => 'string after PUBLIC literal');
2266 $self->{current_token}->{quirks} = 1;
2267
2268 $self->{state} = BOGUS_DOCTYPE_STATE;
2269 !!!next-input-character;
2270 redo A;
2271 }
2272 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2273 if ({
2274 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2275 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2276 }->{$self->{next_char}}) {
2277 !!!cp (201);
2278 ## Stay in the state
2279 !!!next-input-character;
2280 redo A;
2281 } elsif ($self->{next_char} == 0x0022) { # "
2282 !!!cp (202);
2283 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2284 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2285 !!!next-input-character;
2286 redo A;
2287 } elsif ($self->{next_char} == 0x0027) { # '
2288 !!!cp (203);
2289 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2290 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2291 !!!next-input-character;
2292 redo A;
2293 } elsif ($self->{next_char} == 0x003E) { # >
2294 !!!cp (204);
2295 !!!parse-error (type => 'no SYSTEM literal');
2296 $self->{state} = DATA_STATE;
2297 !!!next-input-character;
2298
2299 $self->{current_token}->{quirks} = 1;
2300 !!!emit ($self->{current_token}); # DOCTYPE
2301
2302 redo A;
2303 } elsif ($self->{next_char} == -1) {
2304 !!!cp (205);
2305 !!!parse-error (type => 'unclosed DOCTYPE');
2306
2307 $self->{state} = DATA_STATE;
2308 ## reconsume
2309
2310 $self->{current_token}->{quirks} = 1;
2311 !!!emit ($self->{current_token}); # DOCTYPE
2312
2313 redo A;
2314 } else {
2315 !!!cp (206);
2316 !!!parse-error (type => 'string after SYSTEM');
2317 $self->{current_token}->{quirks} = 1;
2318
2319 $self->{state} = BOGUS_DOCTYPE_STATE;
2320 !!!next-input-character;
2321 redo A;
2322 }
2323 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2324 if ($self->{next_char} == 0x0022) { # "
2325 !!!cp (207);
2326 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2327 !!!next-input-character;
2328 redo A;
2329 } elsif ($self->{next_char} == 0x003E) { # >
2330 !!!cp (208);
2331 !!!parse-error (type => 'unclosed PUBLIC literal');
2332
2333 $self->{state} = DATA_STATE;
2334 !!!next-input-character;
2335
2336 $self->{current_token}->{quirks} = 1;
2337 !!!emit ($self->{current_token}); # DOCTYPE
2338
2339 redo A;
2340 } elsif ($self->{next_char} == -1) {
2341 !!!cp (209);
2342 !!!parse-error (type => 'unclosed SYSTEM literal');
2343
2344 $self->{state} = DATA_STATE;
2345 ## reconsume
2346
2347 $self->{current_token}->{quirks} = 1;
2348 !!!emit ($self->{current_token}); # DOCTYPE
2349
2350 redo A;
2351 } else {
2352 !!!cp (210);
2353 $self->{current_token}->{system_identifier} # DOCTYPE
2354 .= chr $self->{next_char};
2355 ## Stay in the state
2356 !!!next-input-character;
2357 redo A;
2358 }
2359 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2360 if ($self->{next_char} == 0x0027) { # '
2361 !!!cp (211);
2362 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2363 !!!next-input-character;
2364 redo A;
2365 } elsif ($self->{next_char} == 0x003E) { # >
2366 !!!cp (212);
2367 !!!parse-error (type => 'unclosed PUBLIC literal');
2368
2369 $self->{state} = DATA_STATE;
2370 !!!next-input-character;
2371
2372 $self->{current_token}->{quirks} = 1;
2373 !!!emit ($self->{current_token}); # DOCTYPE
2374
2375 redo A;
2376 } elsif ($self->{next_char} == -1) {
2377 !!!cp (213);
2378 !!!parse-error (type => 'unclosed SYSTEM literal');
2379
2380 $self->{state} = DATA_STATE;
2381 ## reconsume
2382
2383 $self->{current_token}->{quirks} = 1;
2384 !!!emit ($self->{current_token}); # DOCTYPE
2385
2386 redo A;
2387 } else {
2388 !!!cp (214);
2389 $self->{current_token}->{system_identifier} # DOCTYPE
2390 .= chr $self->{next_char};
2391 ## Stay in the state
2392 !!!next-input-character;
2393 redo A;
2394 }
2395 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2396 if ({
2397 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2398 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2399 }->{$self->{next_char}}) {
2400 !!!cp (215);
2401 ## Stay in the state
2402 !!!next-input-character;
2403 redo A;
2404 } elsif ($self->{next_char} == 0x003E) { # >
2405 !!!cp (216);
2406 $self->{state} = DATA_STATE;
2407 !!!next-input-character;
2408
2409 !!!emit ($self->{current_token}); # DOCTYPE
2410
2411 redo A;
2412 } elsif ($self->{next_char} == -1) {
2413 !!!cp (217);
2414 !!!parse-error (type => 'unclosed DOCTYPE');
2415
2416 $self->{state} = DATA_STATE;
2417 ## reconsume
2418
2419 $self->{current_token}->{quirks} = 1;
2420 !!!emit ($self->{current_token}); # DOCTYPE
2421
2422 redo A;
2423 } else {
2424 !!!cp (218);
2425 !!!parse-error (type => 'string after SYSTEM literal');
2426 #$self->{current_token}->{quirks} = 1;
2427
2428 $self->{state} = BOGUS_DOCTYPE_STATE;
2429 !!!next-input-character;
2430 redo A;
2431 }
2432 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2433 if ($self->{next_char} == 0x003E) { # >
2434 !!!cp (219);
2435 $self->{state} = DATA_STATE;
2436 !!!next-input-character;
2437
2438 !!!emit ($self->{current_token}); # DOCTYPE
2439
2440 redo A;
2441 } elsif ($self->{next_char} == -1) {
2442 !!!cp (220);
2443 !!!parse-error (type => 'unclosed DOCTYPE');
2444 $self->{state} = DATA_STATE;
2445 ## reconsume
2446
2447 !!!emit ($self->{current_token}); # DOCTYPE
2448
2449 redo A;
2450 } else {
2451 !!!cp (221);
2452 ## Stay in the state
2453 !!!next-input-character;
2454 redo A;
2455 }
2456 } elsif ($self->{state} == CDATA_BLOCK_STATE) {
2457 my $s = '';
2458
2459 my ($l, $c) = ($self->{line}, $self->{column});
2460
2461 CS: while ($self->{next_char} != -1) {
2462 if ($self->{next_char} == 0x005D) { # ]
2463 !!!next-input-character;
2464 if ($self->{next_char} == 0x005D) { # ]
2465 !!!next-input-character;
2466 MDC: {
2467 if ($self->{next_char} == 0x003E) { # >
2468 !!!cp (221.1);
2469 !!!next-input-character;
2470 last CS;
2471 } elsif ($self->{next_char} == 0x005D) { # ]
2472 !!!cp (221.2);
2473 $s .= ']';
2474 !!!next-input-character;
2475 redo MDC;
2476 } else {
2477 !!!cp (221.3);
2478 $s .= ']]';
2479 #
2480 }
2481 } # MDC
2482 } else {
2483 !!!cp (221.4);
2484 $s .= ']';
2485 #
2486 }
2487 } else {
2488 !!!cp (221.5);
2489 #
2490 }
2491 $s .= chr $self->{next_char};
2492 !!!next-input-character;
2493 } # CS
2494
2495 $self->{state} = DATA_STATE;
2496 ## next-input-character done or EOF, which is reconsumed.
2497
2498 if (length $s) {
2499 !!!cp (221.6);
2500 !!!emit ({type => CHARACTER_TOKEN, data => $s,
2501 line => $l, column => $c});
2502 } else {
2503 !!!cp (221.7);
2504 }
2505
2506 redo A;
2507
2508 ## ISSUE: "text tokens" in spec.
2509 ## TODO: Streaming support
2510 } else {
2511 die "$0: $self->{state}: Unknown state";
2512 }
2513 } # A
2514
2515 die "$0: _get_next_token: unexpected case";
2516 } # _get_next_token
2517
## Try to consume a character reference at the current input position.
##
## Arguments:
##   $self       - the parser object; reads |next_char|, |line_prev| and
##                 |column_prev|.
##   $in_attr    - if true, a named reference that matched only without
##                 a terminating |;| and is then followed by further name
##                 characters is returned as literal text.
##   $additional - one extra character code that, like white space, |<|,
##                 |&| and -1, makes this method consume nothing.
##
## Returns |undef| when no token is produced, otherwise a hash reference
## of type |CHARACTER_TOKEN| with |data|, |line| and |column| members.
## |has_reference| is set on the token only when a reference was
## actually resolved.
## NOTE(review): presumably the caller has already consumed the |&|
## character - confirm against the call sites.
2518 sub _tokenize_attempt_to_consume_an_entity ($$$) {
2519 my ($self, $in_attr, $additional) = @_;
2520
## Position reported in parse errors and stored in the returned token.
2521 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
2522
2523 if ({
2524 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
2525 0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, &, -1 # 0x000D # CR
2526 $additional => 1,
2527 }->{$self->{next_char}}) {
2528 !!!cp (1001);
2529 ## Don't consume
2530 ## No error
2531 return undef;
2532 } elsif ($self->{next_char} == 0x0023) { # #
## Numeric character reference.
2533 !!!next-input-character;
2534 if ($self->{next_char} == 0x0078 or # x
2535 $self->{next_char} == 0x0058) { # X
## Hexadecimal form.  |$code| stays undefined until the first digit
## has been seen.
2536 my $code;
2537 X: {
## |$x_char| is only used by the no-digit branch below, which can only
## be taken on the first pass (|$code| is defined afterwards), i.e.
## while it still holds the |x| or |X| character.
2538 my $x_char = $self->{next_char};
2539 !!!next-input-character;
2540 if (0x0030 <= $self->{next_char} and
2541 $self->{next_char} <= 0x0039) { # 0..9
2542 !!!cp (1002);
2543 $code ||= 0;
2544 $code *= 0x10;
2545 $code += $self->{next_char} - 0x0030;
2546 redo X;
2547 } elsif (0x0061 <= $self->{next_char} and
2548 $self->{next_char} <= 0x0066) { # a..f
2549 !!!cp (1003);
2550 $code ||= 0;
2551 $code *= 0x10;
## Maps a..f to 10..15.
2552 $code += $self->{next_char} - 0x0060 + 9;
2553 redo X;
2554 } elsif (0x0041 <= $self->{next_char} and
2555 $self->{next_char} <= 0x0046) { # A..F
2556 !!!cp (1004);
2557 $code ||= 0;
2558 $code *= 0x10;
## Maps A..F to 10..15.
2559 $code += $self->{next_char} - 0x0040 + 9;
2560 redo X;
2561 } elsif (not defined $code) { # no hexadecimal digit
2562 !!!cp (1005);
2563 !!!parse-error (type => 'bare hcro', line => $l, column => $c);
## Push the looked-ahead characters back and make |#| the current
## character again.
2564 !!!back-next-input-character ($x_char, $self->{next_char});
2565 $self->{next_char} = 0x0023; # #
2566 return undef;
2567 } elsif ($self->{next_char} == 0x003B) { # ;
2568 !!!cp (1006);
2569 !!!next-input-character;
2570 } else {
2571 !!!cp (1007);
2572 !!!parse-error (type => 'no refc', line => $l, column => $c);
2573 }
2574
## Replace code points that must not be produced as is: zero and
## surrogates (U+FFFD), values above U+10FFFF (U+FFFD), CR (LF) and
## 0x80..0x9F (looked up in |$c1_entity_char|).
2575 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2576 !!!cp (1008);
2577 !!!parse-error (type => (sprintf 'invalid character reference:U+%04X', $code), line => $l, column => $c);
2578 $code = 0xFFFD;
2579 } elsif ($code > 0x10FFFF) {
2580 !!!cp (1009);
2581 !!!parse-error (type => (sprintf 'invalid character reference:U-%08X', $code), line => $l, column => $c);
2582 $code = 0xFFFD;
2583 } elsif ($code == 0x000D) {
2584 !!!cp (1010);
2585 !!!parse-error (type => 'CR character reference', line => $l, column => $c);
2586 $code = 0x000A;
2587 } elsif (0x80 <= $code and $code <= 0x9F) {
2588 !!!cp (1011);
2589 !!!parse-error (type => (sprintf 'C1 character reference:U+%04X', $code), line => $l, column => $c);
2590 $code = $c1_entity_char->{$code};
2591 }
2592
2593 return {type => CHARACTER_TOKEN, data => chr $code,
2594 has_reference => 1,
2595 line => $l, column => $c,
2596 };
2597 } # X
2598 } elsif (0x0030 <= $self->{next_char} and
2599 $self->{next_char} <= 0x0039) { # 0..9
## Decimal form.
2600 my $code = $self->{next_char} - 0x0030;
2601 !!!next-input-character;
2602
2603 while (0x0030 <= $self->{next_char} and
2604 $self->{next_char} <= 0x0039) { # 0..9
2605 !!!cp (1012);
2606 $code *= 10;
2607 $code += $self->{next_char} - 0x0030;
2608
2609 !!!next-input-character;
2610 }
2611
2612 if ($self->{next_char} == 0x003B) { # ;
2613 !!!cp (1013);
2614 !!!next-input-character;
2615 } else {
2616 !!!cp (1014);
2617 !!!parse-error (type => 'no refc', line => $l, column => $c);
2618 }
2619
## Same code point replacements as in the hexadecimal branch.
2620 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2621 !!!cp (1015);
2622 !!!parse-error (type => (sprintf 'invalid character reference:U+%04X', $code), line => $l, column => $c);
2623 $code = 0xFFFD;
2624 } elsif ($code > 0x10FFFF) {
2625 !!!cp (1016);
2626 !!!parse-error (type => (sprintf 'invalid character reference:U-%08X', $code), line => $l, column => $c);
2627 $code = 0xFFFD;
2628 } elsif ($code == 0x000D) {
2629 !!!cp (1017);
2630 !!!parse-error (type => 'CR character reference', line => $l, column => $c);
2631 $code = 0x000A;
2632 } elsif (0x80 <= $code and $code <= 0x9F) {
2633 !!!cp (1018);
2634 !!!parse-error (type => (sprintf 'C1 character reference:U+%04X', $code), line => $l, column => $c);
2635 $code = $c1_entity_char->{$code};
2636 }
2637
2638 return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1,
2639 line => $l, column => $c,
2640 };
2641 } else {
## |#| followed by neither |x|, |X| nor a decimal digit.
2642 !!!cp (1019);
2643 !!!parse-error (type => 'bare nero', line => $l, column => $c);
2644 !!!back-next-input-character ($self->{next_char});
2645 $self->{next_char} = 0x0023; # #
2646 return undef;
2647 }
2648 } elsif ((0x0041 <= $self->{next_char} and
2649 $self->{next_char} <= 0x005A) or
2650 (0x0061 <= $self->{next_char} and
2651 $self->{next_char} <= 0x007A)) {
## Named character reference: starts with an ASCII letter.
2652 my $entity_name = chr $self->{next_char};
2653 !!!next-input-character;
2654
## |$value| is the text to emit so far: the replacement of the most
## recently matched name followed by any characters read after it, or
## the raw name while nothing has matched.
## |$match| is 1 for a match terminated by |;|, -1 right after a match
## without |;|, doubled (so below -1) for each further character that
## does not give a match, and 0 while nothing has matched.
2655 my $value = $entity_name;
2656 my $match = 0;
2657 require Whatpm::_NamedEntityList;
2658 our $EntityChar;
2659
2660 while (length $entity_name < 30 and
2661 ## NOTE: Some number greater than the maximum length of entity name
2662 ((0x0041 <= $self->{next_char} and # A
2663 $self->{next_char} <= 0x005A) or # Z
2664 (0x0061 <= $self->{next_char} and # a
2665 $self->{next_char} <= 0x007A) or # z
2666 (0x0030 <= $self->{next_char} and # 0
2667 $self->{next_char} <= 0x0039) or # 9
2668 $self->{next_char} == 0x003B)) { # ;
2669 $entity_name .= chr $self->{next_char};
2670 if (defined $EntityChar->{$entity_name}) {
2671 if ($self->{next_char} == 0x003B) { # ;
2672 !!!cp (1020);
2673 $value = $EntityChar->{$entity_name};
2674 $match = 1;
2675 !!!next-input-character;
2676 last;
2677 } else {
2678 !!!cp (1021);
2679 $value = $EntityChar->{$entity_name};
2680 $match = -1;
2681 !!!next-input-character;
2682 }
2683 } else {
2684 !!!cp (1022);
2685 $value .= chr $self->{next_char};
2686 $match *= 2;
2687 !!!next-input-character;
2688 }
2689 }
2690
2691 if ($match > 0) {
2692 !!!cp (1023);
2693 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
2694 line => $l, column => $c,
2695 };
2696 } elsif ($match < 0) {
2697 !!!parse-error (type => 'no refc', line => $l, column => $c);
2698 if ($in_attr and $match < -1) {
## In an attribute, a semicolon-less match followed by more name
## characters is not expanded: the text read is returned literally.
2699 !!!cp (1024);
2700 return {type => CHARACTER_TOKEN, data => '&'.$entity_name,
2701 line => $l, column => $c,
2702 };
2703 } else {
2704 !!!cp (1025);
2705 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
2706 line => $l, column => $c,
2707 };
2708 }
2709 } else {
2710 !!!cp (1026);
2711 !!!parse-error (type => 'bare ero', line => $l, column => $c);
2712 ## NOTE: "No characters are consumed" in the spec.
## The characters read are instead returned as literal text after |&|.
2713 return {type => CHARACTER_TOKEN, data => '&'.$value,
2714 line => $l, column => $c,
2715 };
2716 }
2717 } else {
2718 !!!cp (1027);
2719 ## no characters are consumed
2720 !!!parse-error (type => 'bare ero', line => $l, column => $c);
2721 return undef;
2722 }
2723 } # _tokenize_attempt_to_consume_an_entity
2724
## Put the document into the state used while the tree is constructed.
##
## NOTE: |$self->{document}| MUST be set before this method is invoked.
## The return value is whatever |manakai_is_html| returns.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  ## Switch the document's |strict_error_checking| off.
  $document->strict_error_checking (0);

  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on

  ## Mark the document as an HTML document. # MUST
  return $document->manakai_is_html (1);
} # _initialize_tree_constructor
2733
## Undo |_initialize_tree_constructor|: switch the document's
## |strict_error_checking| back on.  The return value is whatever
## |strict_error_checking| returns.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  ## TODO: Turn mutation events on
  return $document->strict_error_checking (1);
} # _terminate_tree_constructor
2739
2740 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
2741
2742 { # tree construction stage
2743 my $token;
2744
## Entry point of the tree construction stage: reset the per-parse
## state, then run the "initial", "before html" and remaining insertion
## modes in that order.  The return value is that of
## |_tree_construction_main|.
sub _construct_tree ($) {
  my $self = shift;

  ## It is not defined when an interactive UA makes |$self->{document}|
  ## available to the user, nor when it begins accepting user input.

  ## Appending a character means: collect it together with all
  ## consecutive characters that follow and insert a single Text node
  ## whose data is the concatenation of those characters. # MUST

  !!!next-token;

  ## Reset the tree construction state.
  $self->{form_element} = undef;
  $self->{head_element} = undef;
  $self->{open_elements} = [];
  $self->{inner_html_node} = undef;

  ## NOTE: The "initial" insertion mode.
  $self->_tree_construction_initial; # MUST

  ## NOTE: The "before html" insertion mode.
  $self->_tree_construction_root_element;

  ## NOTE: The "before head" insertion mode and so on.
  $self->{insertion_mode} = BEFORE_HEAD_IM;
  return $self->_tree_construction_main;
} # _construct_tree
2773
## Process tokens in the "initial" insertion mode.
##
## Works on the |$token| variable shared by the tree construction
## subs.  Leading white space and comments are consumed here.  A
## DOCTYPE token is turned into a DocumentType node appended to
## |$self->{document}| and is used to select the document's compat mode
## ("quirks", "limited quirks", or left untouched).  Any other token
## reports a |no DOCTYPE| parse error and selects quirks mode.
##
## Returns (with no meaningful value) as soon as the "before html"
## insertion mode has to take over; |$token| then holds the token to be
## processed there.  Dies on an unknown token type.
sub _tree_construction_initial ($) {
  my $self = shift;

  ## NOTE: "initial" insertion mode

  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!cp ('t1');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif ($doctype_name ne 'HTML') {
        !!!cp ('t2');
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5', token => $token);
      } else {
        !!!cp ('t3');
      }

      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      ## NOTE: Default value for both |public_id| and |system_id| attributes
      ## are empty strings, so that we don't set any value in missing cases.
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if ($token->{quirks} or $doctype_name ne 'HTML') {
        !!!cp ('t4');
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## Public identifiers are compared case-insensitively, so the
        ## identifier is upper-cased and the literals below are written
        ## in upper case.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        if ({
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
          "-//IETF//DTD HTML 2.0//EN" => 1,
          "-//IETF//DTD HTML 2.1E//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN//" => 1,
          "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
          "-//IETF//DTD HTML 3.2//EN" => 1,
          "-//IETF//DTD HTML 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT//EN" => 1,
          "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
          "-//IETF//DTD HTML//EN" => 1,
          "-//IETF//DTD HTML//EN//2.0" => 1,
          "-//IETF//DTD HTML//EN//3.0" => 1,
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//EN" => 1,
          "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//EN" => 1,
          "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//EN" => 1,
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
          "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
          "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
          "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
          "-//W3C//DTD HTML 3.2//EN" => 1,
          "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
          "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
          ## Correctly spelt identifier; the misspelt variant on the next
          ## line is retained so that input matched before still matches.
          "-//W3C//DTD HTML EXPERIMENTAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
          "-//W3C//DTD W3 HTML//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
          "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
          "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
          "HTML" => 1,
        }->{$pubid}) {
          !!!cp ('t5');
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
          ## For these two identifiers the mode depends on the system
          ## identifier: present means limited quirks, missing means
          ## quirks.
          if (defined $token->{system_identifier}) {
            !!!cp ('t6');
            $self->{document}->manakai_compat_mode ('limited quirks');
          } else {
            !!!cp ('t7');
            $self->{document}->manakai_compat_mode ('quirks');
          }
        } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
          !!!cp ('t8');
          $self->{document}->manakai_compat_mode ('limited quirks');
        } else {
          !!!cp ('t9');
        }
      } else {
        !!!cp ('t10');
      }

      ## This system identifier forces quirks mode whatever was selected
      ## from the public identifier above.
      if (defined $token->{system_identifier}) {
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          ## TODO: Check the spec: PUBLIC "(limited quirks)" "(quirks)"
          $self->{document}->manakai_compat_mode ('quirks');
          !!!cp ('t11');
        } else {
          !!!cp ('t12');
        }
      } else {
        !!!cp ('t13');
      }

      ## Go to the "before html" insertion mode.
      !!!next-token;
      return;
    } elsif ({
      START_TAG_TOKEN, 1,
      END_TAG_TOKEN, 1,
      END_OF_FILE_TOKEN, 1,
    }->{$token->{type}}) {
      !!!cp ('t14');
      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      !!!ack-later;
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      ## Leading white space is dropped; anything left in the token is
      ## handled as a missing DOCTYPE.
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          !!!cp ('t15');
          ## Stay in the insertion mode.
          !!!next-token;
          redo INITIAL;
        } else {
          !!!cp ('t16');
        }
      } else {
        !!!cp ('t17');
      }

      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t18');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the insertion mode.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL

  die "$0: _tree_construction_initial: This should be never reached";
} # _tree_construction_initial
2978
## Tree construction for the "before html" insertion mode.
##
## Reads the current token from |$token| (not declared in this
## subroutine) and loops until a root |html| element has been created:
##
##   - DOCTYPE tokens raise a parse error and are ignored.
##   - Comment tokens are appended to the document node.
##   - Leading white space of a character token is dropped; a token
##     consisting only of white space is consumed entirely.
##   - An |html| start tag creates the root element from the token.
##   - Anything else creates an implied |html| element and leaves the
##     token to be reprocessed by the caller.
##
## In every case that creates the root element, the element is appended
## to the document, pushed onto the stack of open elements, and the
## |application_cache_selection| callback is invoked exactly once (with
## the |manifest| attribute value, or |undef| when there is none).
##
## Returns nothing.  Returning means "go to the 'before head' insertion
## mode".
sub _tree_construction_root_element ($) {
  my $self = shift;

  ## NOTE: "before html" insertion mode.

  B: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      !!!cp ('t19');
      !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
      ## Ignore the token
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t20');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token.

        unless (length $token->{data}) {
          !!!cp ('t21');
          ## Stay in the insertion mode.
          !!!next-token;
          redo B;
        } else {
          !!!cp ('t22');
        }
      } else {
        !!!cp ('t23');
      }

      ## NOTE: The application cache selection callback is NOT invoked
      ## here.  The shared code below invokes it once, after the implied
      ## root element has been created; invoking it here as well made
      ## the callback run twice for a single character token.

      #
    } elsif ($token->{type} == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html') {
        my $root_element;
        !!!create-element ($root_element, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}},
            [$root_element, $el_category->{html}];

        if ($token->{attributes}->{manifest}) {
          !!!cp ('t24');
          $self->{application_cache_selection}
              ->($token->{attributes}->{manifest}->{value});
          ## ISSUE: Spec is unclear on relative references.
          ## According to Hixie (#whatwg 2008-03-19), it should be
          ## resolved against the base URI of the document in HTML
          ## or xml:base of the element in XHTML.
        } else {
          !!!cp ('t25');
          $self->{application_cache_selection}->(undef);
        }

        !!!nack ('t25c');

        !!!next-token;
        return; ## Go to the "before head" insertion mode.
      } else {
        !!!cp ('t25.1');
        #
      }
    } elsif ({
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t26');
      #
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## Anything else: create an implied |html| element.  The empty
    ## fourth macro argument means "no attributes".
    my $root_element;
    !!!create-element ($root_element, $HTML_NS, 'html',, $token);
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, $el_category->{html}];

    $self->{application_cache_selection}->(undef);

    ## NOTE: Reprocess the token.
    !!!ack-later;
    return; ## Go to the "before head" insertion mode.

    ## ISSUE: There is an issue in the spec
  } # B

  die "$0: _tree_construction_root_element: This should never be reached";
} # _tree_construction_root_element
3073
## Reset the insertion mode according to the stack of open elements.
##
## Walks |$self->{open_elements}| from the last entry toward the first.
## For each node it picks a new insertion mode either from the node's
## category flags (foreign elements, |html|) or from its local name, and
## stores the result in |$self->{insertion_mode}|.  When the first entry
## of the stack is reached without a match, the mode becomes "in body".
##
## If |$self->{inner_html_node}| is defined and is not a table cell, it
## is used in place of the first entry of the stack.
##
## Returns nothing.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        if ($self->{inner_html_node}->[1] & TABLE_CELL_EL) {
          !!!cp ('t27');
          #
        } else {
          !!!cp ('t28');
          $node = $self->{inner_html_node};
        }
      }
    }

    ## Step 4..14
    my $new_mode;
    if ($node->[1] & FOREIGN_EL) {
      ## NOTE: Strictly speaking, the line below only applies to MathML and
      ## SVG elements.  Currently the HTML syntax supports only MathML and
      ## SVG elements as foreigners.
      $new_mode = $self->{insertion_mode} | IN_FOREIGN_CONTENT_IM;
      ## ISSUE: What is set as the secondary insertion mode?
    } else {
      $new_mode = {
                    select => IN_SELECT_IM,
                    ## NOTE: |option| and |optgroup| do not set
                    ## insertion mode to "in select" by themselves.
                    td => IN_CELL_IM,
                    th => IN_CELL_IM,
                    tr => IN_ROW_IM,
                    tbody => IN_TABLE_BODY_IM,
                    thead => IN_TABLE_BODY_IM,
                    tfoot => IN_TABLE_BODY_IM,
                    caption => IN_CAPTION_IM,
                    colgroup => IN_COLUMN_GROUP_IM,
                    table => IN_TABLE_IM,
                    head => IN_BODY_IM, # not in head!
                    body => IN_BODY_IM,
                    frameset => IN_FRAMESET_IM,
                   }->{$node->[0]->manakai_local_name};
    }
    ## Assign and return as two separate statements so that returning
    ## does not depend on the numeric value of the mode being true.
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 15
    if ($node->[1] & HTML_EL) {
      unless (defined $self->{head_element}) {
        !!!cp ('t29');
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        ## ISSUE: Can this state be reached?
        !!!cp ('t30');
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    } else {
      !!!cp ('t31');
    }

    ## Step 16
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 17
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 18
    redo S3;
  } # S3

  die "$0: _reset_insertion_mode: This line should never be reached";
} # _reset_insertion_mode
3156
3157 sub _tree_construction_main ($) {
3158 my $self = shift;
3159
3160 my $active_formatting_elements = [];
3161
## Reconstruct the active formatting elements.
##
## Re-creates, as shallow clones, the trailing entries of
## |$active_formatting_elements| that are not in the stack of open
## elements.  Each clone is handed to the insertion callback, pushed onto
## |$self->{open_elements}|, and stored in the list in place of the entry
## it was cloned from.
##
## Argument: a code reference that is called with each cloned node.
## Does nothing when the list is empty, or when its last entry is a
## marker or is already in the stack of open elements.
3162 my $reconstruct_active_formatting_elements = sub { # MUST
3163 my $insert = shift;
3164
3165 ## Step 1
3166 return unless @$active_formatting_elements;
3167
3168 ## Step 3
## |$i| is a negative index, counted from the end of the list.
3169 my $i = -1;
3170 my $entry = $active_formatting_elements->[$i];
3171
3172 ## Step 2
3173 return if $entry->[0] eq '#marker';
3174 for (@{$self->{open_elements}}) {
3175 if ($entry->[0] eq $_->[0]) {
3176 !!!cp ('t32');
3177 return;
3178 }
3179 }
3180
## Rewind: move toward the start of the list until the entry is a marker
## or is in the stack of open elements, or the first entry is reached.
3181 S4: {
3182 ## Step 4
## At the first entry there is nothing earlier to examine; cloning
## starts from this entry.
3183 last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
3184
3185 ## Step 5
3186 $i--;
3187 $entry = $active_formatting_elements->[$i];
3188
3189 ## Step 6
3190 if ($entry->[0] eq '#marker') {
3191 !!!cp ('t33_1');
3192 #
3193 } else {
3194 my $in_open_elements;
3195 OE: for (@{$self->{open_elements}}) {
3196 if ($entry->[0] eq $_->[0]) {
3197 !!!cp ('t33');
3198 $in_open_elements = 1;
3199 last OE;
3200 }
3201 }
3202 if ($in_open_elements) {
3203 !!!cp ('t34');
3204 #
3205 } else {
3206 ## NOTE: <!DOCTYPE HTML><p><b><i><u></p> <p>X
3207 !!!cp ('t35');
3208 redo S4;
3209 }
3210 }
3211
3212 ## Step 7
## The entry just examined is a marker or is still open, so step forward
## again to the first entry that has to be re-created.
3213 $i++;
3214 $entry = $active_formatting_elements->[$i];
3215 } # S4
3216
## Advance: clone each entry from |$i| through the end of the list.
3217 S7: {
3218 ## Step 8
## Shallow clone of the node; the second member of the entry is copied
## unchanged.
3219 my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
3220
3221 ## Step 9
3222 $insert->($clone->[0]);
3223 push @{$self->{open_elements}}, $clone;
3224
3225 ## Step 10
3226 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
3227
3228 ## Step 11
## Continue until the clone just stored is the last entry of the list.
3229 unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
3230 !!!cp ('t36');
3231 ## Step 7'
3232 $i++;
3233 $entry = $active_formatting_elements->[$i];
3234
3235 redo S7;
3236 }
3237
3238 !!!cp ('t37');
3239 } # S7
3240 }; # $reconstruct_active_formatting_elements
3241
## Clear the list of active formatting elements up to the last marker:
## truncate |$active_formatting_elements| at the marker nearest to the
## end of the list, removing the marker itself and every entry after it.
## The list is left untouched when it contains no marker.
my $clear_up_to_marker = sub {
  my $idx = $#$active_formatting_elements;
  while ($idx >= 0) {
    if ($active_formatting_elements->[$idx]->[0] eq '#marker') {
      !!!cp ('t38');
      splice @$active_formatting_elements, $idx;
      return;
    }
    $idx--;
  }

  !!!cp ('t39');
}; # $clear_up_to_marker
3253
3254 my $insert;
3255
## Generic CDATA/RCDATA element parsing.
##
## Argument: the content model flag to tokenize the element content
## with (|CDATA_CONTENT_MODEL| or |RCDATA_CONTENT_MODEL|; any other
## value dies once the end of the content is reached without the
## matching end tag).
##
## Creates an element for the current start tag token, inserts it
## through |$insert|, collects the data of the character tokens that
## follow into a single text node child, restores the PCDATA content
## model, and then consumes the token that ended the character run.  A
## parse error is raised when that token is not the matching end tag.
my $parse_rcdata = sub ($) {
  my ($content_model_flag) = @_;

  ## Step 1: create the element for the start tag token.
  my $start_tag_name = $token->{tag_name};
  my $element;
  !!!create-element ($element, $HTML_NS, $start_tag_name, $token->{attributes}, $token);

  ## Step 2: insert it.
  $insert->($element);

  ## Step 3: switch the tokenizer's content model (CDATA or RCDATA).
  $self->{content_model} = $content_model_flag;
  delete $self->{escape}; # MUST

  ## Step 4: gather character data, up to the first non-character
  ## token (or until the tokenizer stops tokenizing).
  my $buffer = '';
  !!!nack ('t40.1');
  !!!next-token;
  until ($token->{type} != CHARACTER_TOKEN) {
    !!!cp ('t40');
    $buffer .= $token->{data};
    !!!next-token;
  }

  ## Step 5: attach the gathered data, if any, as one text node.
  if ($buffer ne '') {
    !!!cp ('t41');
    my $text_node = $self->{document}->create_text_node ($buffer);
    $element->append_child ($text_node);
  }

  ## Step 6: back to the PCDATA content model.
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## Step 7: the current token should be the matching end tag.
  if ($token->{type} == END_TAG_TOKEN and
      $token->{tag_name} eq $start_tag_name) {
    !!!cp ('t42');
    ## Ignore the token
  } elsif ($content_model_flag == CDATA_CONTENT_MODEL) {
    ## NOTE: An end-of-file token.
    !!!cp ('t43');
    !!!parse-error (type => 'in CDATA:#'.$token->{type}, token => $token);
  } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
    ## NOTE: An end-of-file token.
    !!!cp ('t44');
    !!!parse-error (type => 'in RCDATA:#'.$token->{type}, token => $token);
  } else {
    die "$0: $content_model_flag in parse_rcdata";
  }
  !!!next-token;
}; # $parse_rcdata
3310
## Handle a |script| start tag.
##
## Creates a |script| element from the current token, tokenizes its
## content with the CDATA content model, appends the collected character
## data to the element as text, and restores the PCDATA content model.
## A parse error is raised when the token that ended the character run
## is not a |script| end tag.  The element is inserted through |$insert|
## unless |$self->{inner_html_node}| is defined.  The token that ended
## the character run is consumed in every case.
my $script_start_tag = sub () {
  my $script_node;
  !!!create-element ($script_node, $HTML_NS, 'script', $token->{attributes}, $token);
  ## TODO: mark as "parser-inserted"

  $self->{content_model} = CDATA_CONTENT_MODEL;
  delete $self->{escape}; # MUST

  ## Gather character data until a non-character token appears or the
  ## tokenizer stops tokenising.
  my $source_text = '';
  !!!nack ('t45.1');
  !!!next-token;
  until ($token->{type} != CHARACTER_TOKEN) {
    !!!cp ('t45');
    $source_text .= $token->{data};
    !!!next-token;
  }
  if ($source_text ne '') {
    !!!cp ('t46');
    $script_node->manakai_append_text ($source_text);
  }

  $self->{content_model} = PCDATA_CONTENT_MODEL;

  my $properly_closed = ($token->{type} == END_TAG_TOKEN and
                         $token->{tag_name} eq 'script');
  unless ($properly_closed) {
    !!!cp ('t48');
    !!!parse-error (type => 'in CDATA:#'.$token->{type}, token => $token);
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  } else {
    !!!cp ('t47');
    ## Ignore the token
  }

  unless (defined $self->{inner_html_node}) {
    !!!cp ('t50');
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    $insert->($script_node);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  } else {
    !!!cp ('t49');
    ## TODO: mark as "already executed"
  }

  !!!next-token;
}; # $script_start_tag
3362
3363 ## NOTE: $open_tables->[-1]->[0] is the "current table" element node.
3364 ## NOTE: $open_tables->[-1]->[1] is the "tainted" flag.
3365 my $open_tables = [[$self->{open_elements}->[0]->[0]]];
3366
## Handle an end tag for a formatting element: the adoption agency
## algorithm (AAA).
##
## Argument: the end tag token (a hash reference; its |tag_name| member
## selects the formatting element and it is attached to any parse error
## reported here).
##
## Each pass of the |FET| block looks, after the last marker of
## |$active_formatting_elements|, for the entry nearest the end whose
## local name equals the tag name.  The closure returns, after consuming
## the current token, when
##
##   - there is no such entry (parse error),
##   - the entry is in the stack of open elements but not in scope
##     (parse error),
##   - the entry is not in the stack of open elements at all (parse
##     error; the entry is removed from the list), or
##   - there is no "furthest block" below the entry in the stack (the
##     stack is truncated at the entry and the entry is removed from the
##     list).
##
## Otherwise the subtree is rearranged around the furthest block, the
## formatting element is replaced by a clone in both lists, and the
## whole procedure is repeated.
my $formatting_end_tag = sub {
  my $end_tag_token = shift;
  my $tag_name = $end_tag_token->{tag_name};

  ## NOTE: The adoption agency algorithm (AAA).

  FET: {
    ## Step 1
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq '#marker') {
        !!!cp ('t52');
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0]->manakai_local_name
                   eq $tag_name) {
        !!!cp ('t51');
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!cp ('t53');
      !!!parse-error (type => 'unmatched end tag:'.$tag_name, token => $end_tag_token);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          !!!cp ('t54');
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!cp ('t55');
          ## Report the name of the end tag token that was passed in,
          ## as the 't53' branch above does.
          !!!parse-error (type => 'unmatched end tag:'.$tag_name,
                          token => $end_tag_token);
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ($node->[1] & SCOPING_EL) {
        !!!cp ('t56');
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!cp ('t57');
      !!!parse-error (type => 'unmatched end tag:'.$tag_name,
                      token => $end_tag_token);
      ## Remove the formatting element itself.  It is not necessarily
      ## the last entry of the list, so |pop| would remove the wrong
      ## entry whenever other entries follow it.
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!cp ('t58');
      !!!parse-error (type => 'not closed',
                      value => $self->{open_elements}->[-1]->[0]
                          ->manakai_local_name,
                      token => $end_tag_token);
    }

    ## Step 2
    ## The loop runs from the end of the stack toward the formatting
    ## element and overwrites the candidate each time, so the result is
    ## the matching node nearest to the formatting element.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not ($node->[1] & FORMATTING_EL) and
          #not $phrasing_category->{$node->[1]} and
          ($node->[1] & SPECIAL_EL or
           $node->[1] & SCOPING_EL)) { ## Scoping is redundant, maybe
        !!!cp ('t59');
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        !!!cp ('t60');
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      !!!cp ('t61');
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      !!!cp ('t62');
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    my $bookmark_prev_el
      = $active_formatting_elements->[$formatting_element_i_in_active - 1]
      ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      ## A node that is not in the list of active formatting elements is
      ## dropped from the stack and the next node up is examined.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            !!!cp ('t63');
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        !!!cp ('t64');
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      if ($node->[0]->has_child_nodes ()) {
        !!!cp ('t65');
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    if ($common_ancestor_node->[1] & TABLE_ROWS_EL) {
      ## Foster parenting: place the node before the nearest open
      ## |table| when that table has an element parent; otherwise append
      ## it to the element preceding the table in the stack.
      my $foster_parent_element;
      my $next_sibling;
      OE: for (reverse 0..$#{$self->{open_elements}}) {
        if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
          my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
          if (defined $parent and $parent->node_type == 1) {
            !!!cp ('t65.1');
            $foster_parent_element = $parent;
            $next_sibling = $self->{open_elements}->[$_]->[0];
          } else {
            !!!cp ('t65.2');
            $foster_parent_element
              = $self->{open_elements}->[$_ - 1]->[0];
          }
          last OE;
        }
      } # OE
      $foster_parent_element = $self->{open_elements}->[0]->[0]
        unless defined $foster_parent_element;
      $foster_parent_element->insert_before ($last_node->[0], $next_sibling);
      $open_tables->[-1]->[1] = 1; # tainted
    } else {
      !!!cp ('t65.3');
      $common_ancestor_node->[0]->append_child ($last_node->[0]);
    }

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## Copy the child list first: appending a child to the clone removes
    ## it from the list that would otherwise be iterated.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t66');
        splice @$active_formatting_elements, $_, 1;
        ## The bookmark was found at a higher index, which has just
        ## shifted down by one.
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        !!!cp ('t67');
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t68');
        splice @{$self->{open_elements}}, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        !!!cp ('t69');
        $i = $_;
      }
    } # OE
    ## Insert the clone immediately after the furthest block.  The
    ## replaced-length argument must be 0, as in Step 12: a length of 1
    ## overwrites the entry that follows the furthest block and so
    ## silently drops an open element from the stack.
    splice @{$self->{open_elements}}, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
3599
## Default insertion callback: append the given node to the current
## node, i.e. to the node of the last entry of the stack of open
## elements.  The same code reference is also stored in |$insert|.
$insert = my $insert_to_current = sub {
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($_[0]);
}; # $insert_to_current
3603
## Insertion callback with foster parenting.
##
## Argument: the node to insert.
##
## When the current node is not flagged |TABLE_ROWS_EL|, the node is
## simply appended to the current node.  Otherwise the nearest open
## |table| is looked up in the stack of open elements:
##
##   - if that table has a parent whose |node_type| is 1, the node is
##     inserted into that parent immediately before the table;
##   - if it has no such parent, the node is appended to the element
##     that precedes the table in the stack;
##   - if there is no open table, the node is appended to the first
##     element of the stack.
##
## A foster-parented insertion also sets the "tainted" flag of the
## current table.
my $insert_to_foster = sub {
  my ($new_node) = @_;
  my $stack = $self->{open_elements};

  unless ($stack->[-1]->[1] & TABLE_ROWS_EL) {
    !!!cp ('t72');
    return $stack->[-1]->[0]->append_child ($new_node);
  }

  # MUST
  my ($foster_parent, $reference_node);
  my $idx = $#$stack;
  STACK: while ($idx >= 0) {
    my $candidate = $stack->[$idx];
    if ($candidate->[1] & TABLE_EL) {
      my $table_parent = $candidate->[0]->parent_node;
      if (defined $table_parent and $table_parent->node_type == 1) {
        !!!cp ('t70');
        $foster_parent = $table_parent;
        $reference_node = $candidate->[0];
      } else {
        !!!cp ('t71');
        $foster_parent = $stack->[$idx - 1]->[0];
      }
      last STACK;
    }
    $idx--;
  } # STACK

  $foster_parent = $stack->[0]->[0] unless defined $foster_parent;
  ## With an undefined reference node this appends at the end.
  $foster_parent->insert_before ($new_node, $reference_node);
  $open_tables->[-1]->[1] = 1; # tainted
}; # $insert_to_foster
3635
3636 B: while (1) {
3637 if ($token->{type} == DOCTYPE_TOKEN) {
3638 !!!cp ('t73');
3639 !!!parse-error (type => 'DOCTYPE in the middle', token => $token);
3640 ## Ignore the token
3641 ## Stay in the phase
3642 !!!next-token;
3643 next B;
3644 } elsif ($token->{type} == START_TAG_TOKEN and
3645 $token->{tag_name} eq 'html') {
3646 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
3647 !!!cp ('t79');
3648 !!!parse-error (type => 'after html:html', token => $token);
3649 $self->{insertion_mode} = AFTER_BODY_IM;
3650 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
3651 !!!cp ('t80');
3652 !!!parse-error (type => 'after html:html', token => $token);
3653 $self->{insertion_mode} = AFTER_FRAMESET_IM;
3654 } else {
3655 !!!cp ('t81');
3656 }
3657
3658 !!!cp ('t82');
3659 !!!parse-error (type => 'not first start tag', token => $token);
3660 my $top_el = $self->{open_elements}->[0]->[0];
3661 for my $attr_name (keys %{$token->{attributes}}) {
3662 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
3663 !!!cp ('t84');
3664 $top_el->set_attribute_ns
3665 (undef, [undef, $attr_name],
3666 $token->{attributes}->{$attr_name}->{value});
3667 }
3668 }
3669 !!!nack ('t84.1');
3670 !!!next-token;
3671 next B;
3672 } elsif ($token->{type} == COMMENT_TOKEN) {
3673 my $comment = $self->{document}->create_comment ($token->{data});
3674 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
3675 !!!cp ('t85');
3676 $self->{document}->append_child ($comment);
3677 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
3678 !!!cp ('t86');
3679 $self->{open_elements}->[0]->[0]->append_child ($comment);
3680 } else {
3681 !!!cp ('t87');
3682 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3683 }
3684 !!!next-token;
3685 next B;
3686 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
3687 if ($token->{type} == CHARACTER_TOKEN) {
3688 !!!cp ('t87.1');
3689 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3690 !!!next-token;
3691 next B;
3692 } elsif ($token->{type} == START_TAG_TOKEN) {
3693 if ((not {mglyph => 1, malignmark => 1}->{$token->{tag_name}} and
3694 $self->{open_elements}->[-1]->[1] & FOREIGN_FLOW_CONTENT_EL) or
3695 not ($self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
3696 ($token->{tag_name} eq 'svg' and
3697 $self->{open_elements}->[-1]->[1] & MML_AXML_EL)) {
3698 ## NOTE: "using the rules for secondary insertion mode" then "continue"
3699 !!!cp ('t87.2');
3700 #
3701 } elsif ({
3702 b => 1, big => 1, blockquote => 1, body => 1, br => 1,
3703 center => 1, code => 1, dd => 1, div => 1, dl => 1, em => 1,
3704 embed => 1, font => 1, h1 => 1, h2 => 1, h3 => 1, ## No h4!
3705 h5 => 1, h6 => 1, head => 1, hr => 1, i => 1, img => 1,
3706 li => 1, menu => 1, meta => 1, nobr => 1, p => 1, pre => 1,
3707 ruby => 1, s => 1, small => 1, span => 1, strong => 1,
3708 sub => 1, sup => 1, table => 1, tt => 1, u => 1, ul => 1,
3709 var => 1,
3710 }->{$token->{tag_name}}) {
3711 !!!cp ('t87.2');
3712 !!!parse-error (type => 'not closed',
3713 value => $self->{open_elements}->[-1]->[0]
3714 ->manakai_local_name,
3715 token => $token);
3716
3717 pop @{$self->{open_elements}}
3718 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
3719
3720 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
3721 ## Reprocess.
3722 next B;
3723 } else {
3724 ## TODO: case fixup
3725
3726 !!!insert-element-f ($self->{open_elements}->[-1]->[0]->namespace_uri, $token);
3727
3728 if ($self->{self_closing}) {
3729 pop @{$self->{open_elements}};
3730 !!!ack ('t87.3');
3731 } else {
3732 !!!cp ('t87.4');
3733 }
3734
3735 !!!next-token;
3736 next B;
3737 }
3738 } elsif ($token->{type} == END_TAG_TOKEN) {
3739 ## NOTE: "using the rules for secondary insertion mode" then "continue"
3740 !!!cp ('t87.5');
3741 #
3742 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
3743 ## NOTE: "using the rules for secondary insertion mode" then "continue"
3744 !!!cp ('t87.6');
3745 #
3746 ## TODO: ...
3747 } else {
3748 die "$0: $token->{type}: Unknown token type";
3749 }
3750 }
3751
3752 if ($self->{insertion_mode} & HEAD_IMS) {
3753 if ($token->{type} == CHARACTER_TOKEN) {
3754 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3755 unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3756 !!!cp ('t88.2');
3757 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3758 } else {
3759 !!!cp ('t88.1');
3760 ## Ignore the token.
3761 !!!next-token;
3762 next B;
3763 }
3764 unless (length $token->{data}) {
3765 !!!cp ('t88');
3766 !!!next-token;
3767 next B;
3768 }
3769 }
3770
3771 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3772 !!!cp ('t89');
3773 ## As if <head>
3774 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
3775 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3776 push @{$self->{open_elements}},
3777 [$self->{head_element}, $el_category->{head}];
3778
3779 ## Reprocess in the "in head" insertion mode...
3780 pop @{$self->{open_elements}};
3781
3782 ## Reprocess in the "after head" insertion mode...
3783 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3784 !!!cp ('t90');
3785 ## As if </noscript>
3786 pop @{$self->{open_elements}};
3787 !!!parse-error (type => 'in noscript:#character', token => $token);
3788
3789 ## Reprocess in the "in head" insertion mode...
3790 ## As if </head>
3791 pop @{$self->{open_elements}};
3792
3793 ## Reprocess in the "after head" insertion mode...
3794 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3795 !!!cp ('t91');
3796 pop @{$self->{open_elements}};
3797
3798 ## Reprocess in the "after head" insertion mode...
3799 } else {
3800 !!!cp ('t92');
3801 }
3802
3803 ## "after head" insertion mode
3804 ## As if <body>
3805 !!!insert-element ('body',, $token);
3806 $self->{insertion_mode} = IN_BODY_IM;
3807 ## reprocess
3808 next B;
3809 } elsif ($token->{type} == START_TAG_TOKEN) {
3810 if ($token->{tag_name} eq 'head') {
3811 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3812 !!!cp ('t93');
3813 !!!create-element ($self->{head_element}, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
3814 $self->{open_elements}->[-1]->[0]->append_child
3815 ($self->{head_element});
3816 push @{$self->{open_elements}},
3817 [$self->{head_element}, $el_category->{head}];
3818 $self->{insertion_mode} = IN_HEAD_IM;
3819 !!!nack ('t93.1');
3820 !!!next-token;
3821 next B;
3822 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3823 !!!cp ('t94');
3824 #
3825 } else {
3826 !!!cp ('t95');
3827 !!!parse-error (type => 'in head:head', token => $token); # or in head noscript
3828 ## Ignore the token
3829 !!!nack ('t95.1');
3830 !!!next-token;
3831 next B;
3832 }
3833 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3834 !!!cp ('t96');
3835 ## As if <head>
3836 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
3837 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3838 push @{$self->{open_elements}},
3839 [$self->{head_element}, $el_category->{head}];
3840
3841 $self->{insertion_mode} = IN_HEAD_IM;
3842 ## Reprocess in the "in head" insertion mode...
3843 } else {
3844 !!!cp ('t97');
3845 }
3846
3847 if ($token->{tag_name} eq 'base') {
3848 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3849 !!!cp ('t98');
3850 ## As if </noscript>
3851 pop @{$self->{open_elements}};
3852 !!!parse-error (type => 'in noscript:base', token => $token);
3853
3854 $self->{insertion_mode} = IN_HEAD_IM;
3855 ## Reprocess in the "in head" insertion mode...
3856 } else {
3857 !!!cp ('t99');
3858 }
3859
3860 ## NOTE: There is a "as if in head" code clone.
3861 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3862 !!!cp ('t100');
3863 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
3864 push @{$self->{open_elements}},
3865 [$self->{head_element}, $el_category->{head}];
3866 } else {
3867 !!!cp ('t101');
3868 }
3869 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
3870 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3871 pop @{$self->{open_elements}} # <head>
3872 if $self->{insertion_mode} == AFTER_HEAD_IM;
3873 !!!nack ('t101.1');
3874 !!!next-token;
3875 next B;
3876 } elsif ($token->{tag_name} eq 'link') {
3877 ## NOTE: There is a "as if in head" code clone.
3878 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3879 !!!cp ('t102');
3880 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
3881 push @{$self->{open_elements}},
3882 [$self->{head_element}, $el_category->{head}];
3883 } else {
3884 !!!cp ('t103');
3885 }
3886 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
3887 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3888 pop @{$self->{open_elements}} # <head>
3889 if $self->{insertion_mode} == AFTER_HEAD_IM;
3890 !!!ack ('t103.1');
3891 !!!next-token;
3892 next B;
3893 } elsif ($token->{tag_name} eq 'meta') {
3894 ## NOTE: There is a "as if in head" code clone.
3895 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3896 !!!cp ('t104');
3897 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
3898 push @{$self->{open_elements}},
3899 [$self->{head_element}, $el_category->{head}];
3900 } else {
3901 !!!cp ('t105');
3902 }
3903 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
3904 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3905
3906 unless ($self->{confident}) {
3907 if ($token->{attributes}->{charset}) { ## TODO: And if supported
3908 !!!cp ('t106');
3909 $self->{change_encoding}
3910 ->($self, $token->{attributes}->{charset}->{value},
3911 $token);
3912
3913 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
3914 ->set_user_data (manakai_has_reference =>
3915 $token->{attributes}->{charset}
3916 ->{has_reference});
3917 } elsif ($token->{attributes}->{content}) {
3918 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
3919 if ($token->{attributes}->{content}->{value}
3920 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
3921 [\x09-\x0D\x20]*=
3922 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
3923 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
3924 !!!cp ('t107');
3925 $self->{change_encoding}
3926 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
3927 $token);
3928 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
3929 ->set_user_data (manakai_has_reference =>
3930 $token->{attributes}->{content}
3931 ->{has_reference});
3932 } else {
3933 !!!cp ('t108');
3934 }
3935 }
3936 } else {
3937 if ($token->{attributes}->{charset}) {
3938 !!!cp ('t109');
3939 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
3940 ->set_user_data (manakai_has_reference =>
3941 $token->{attributes}->{charset}
3942 ->{has_reference});
3943 }
3944 if ($token->{attributes}->{content}) {
3945 !!!cp ('t110');
3946 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
3947 ->set_user_data (manakai_has_reference =>
3948 $token->{attributes}->{content}
3949 ->{has_reference});
3950 }
3951 }
3952
3953 pop @{$self->{open_elements}} # <head>
3954 if $self->{insertion_mode} == AFTER_HEAD_IM;
3955 !!!ack ('t110.1');
3956 !!!next-token;
3957 next B;
3958 } elsif ($token->{tag_name} eq 'title') {
3959 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3960 !!!cp ('t111');
3961 ## As if </noscript>
3962 pop @{$self->{open_elements}};
3963 !!!parse-error (type => 'in noscript:title', token => $token);
3964
3965 $self->{insertion_mode} = IN_HEAD_IM;
3966 ## Reprocess in the "in head" insertion mode...
3967 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3968 !!!cp ('t112');
3969 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
3970 push @{$self->{open_elements}},
3971 [$self->{head_element}, $el_category->{head}];
3972 } else {
3973 !!!cp ('t113');
3974 }
3975
3976 ## NOTE: There is a "as if in head" code clone.
3977 my $parent = defined $self->{head_element} ? $self->{head_element}
3978 : $self->{open_elements}->[-1]->[0];
3979 $parse_rcdata->(RCDATA_CONTENT_MODEL);
3980 pop @{$self->{open_elements}} # <head>
3981 if $self->{insertion_mode} == AFTER_HEAD_IM;
3982 next B;
3983 } elsif ($token->{tag_name} eq 'style') {
3984 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
3985 ## insertion mode IN_HEAD_IM)
3986 ## NOTE: There is a "as if in head" code clone.
3987 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3988 !!!cp ('t114');
3989 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
3990 push @{$self->{open_elements}},
3991 [$self->{head_element}, $el_category->{head}];
3992 } else {
3993 !!!cp ('t115');
3994 }
3995 $parse_rcdata->(CDATA_CONTENT_MODEL);
3996 pop @{$self->{open_elements}} # <head>
3997 if $self->{insertion_mode} == AFTER_HEAD_IM;
3998 next B;
3999 } elsif ($token->{tag_name} eq 'noscript') {
4000 if ($self->{insertion_mode} == IN_HEAD_IM) {
4001 !!!cp ('t116');
4002 ## NOTE: and scripting is disabled
4003 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4004 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
4005 !!!nack ('t116.1');
4006 !!!next-token;
4007 next B;
4008 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4009 !!!cp ('t117');
4010 !!!parse-error (type => 'in noscript:noscript', token => $token);
4011 ## Ignore the token
4012 !!!nack ('t117.1');
4013 !!!next-token;
4014 next B;
4015 } else {
4016 !!!cp ('t118');
4017 #
4018 }
4019 } elsif ($token->{tag_name} eq 'script') {
4020 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4021 !!!cp ('t119');
4022 ## As if </noscript>
4023 pop @{$self->{open_elements}};
4024 !!!parse-error (type => 'in noscript:script', token => $token);
4025
4026 $self->{insertion_mode} = IN_HEAD_IM;
4027 ## Reprocess in the "in head" insertion mode...
4028 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4029 !!!cp ('t120');
4030 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4031 push @{$self->{open_elements}},
4032 [$self->{head_element}, $el_category->{head}];
4033 } else {
4034 !!!cp ('t121');
4035 }
4036
4037 ## NOTE: There is a "as if in head" code clone.
4038 $script_start_tag->();
4039 pop @{$self->{open_elements}} # <head>
4040 if $self->{insertion_mode} == AFTER_HEAD_IM;
4041 next B;
4042 } elsif ($token->{tag_name} eq 'body' or
4043 $token->{tag_name} eq 'frameset') {
4044 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4045 !!!cp ('t122');
4046 ## As if </noscript>
4047 pop @{$self->{open_elements}};
4048 !!!parse-error (type => 'in noscript:'.$token->{tag_name}, token => $token);
4049
4050 ## Reprocess in the "in head" insertion mode...
4051 ## As if </head>
4052 pop @{$self->{open_elements}};
4053
4054 ## Reprocess in the "after head" insertion mode...
4055 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4056 !!!cp ('t124');
4057 pop @{$self->{open_elements}};
4058
4059 ## Reprocess in the "after head" insertion mode...
4060 } else {
4061 !!!cp ('t125');
4062 }
4063
4064 ## "after head" insertion mode
4065 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4066 if ($token->{tag_name} eq 'body') { ## Switch to the insertion mode matching the element just inserted.
4067 !!!cp ('t126');
4068 $self->{insertion_mode} = IN_BODY_IM;
4069 } elsif ($token->{tag_name} eq 'frameset') {
4070 !!!cp ('t127');
4071 $self->{insertion_mode} = IN_FRAMESET_IM;
4072 } else { ## NOTE: Should be unreachable; the enclosing branch only matches |body| and |frameset|.
4073 die "$0: tag name: $token->{tag_name}";
4074 }
4075 !!!nack ('t127.1');
4076 !!!next-token;
4077 next B;
4078 } else {
4079 !!!cp ('t128');
4080 #
4081 }
4082
4083 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4084 !!!cp ('t129');
4085 ## As if </noscript>
4086 pop @{$self->{open_elements}};
4087 !!!parse-error (type => 'in noscript:/'.$token->{tag_name}, token => $token);
4088
4089 ## Reprocess in the "in head" insertion mode...
4090 ## As if </head>
4091 pop @{$self->{open_elements}};
4092
4093 ## Reprocess in the "after head" insertion mode...
4094 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4095 !!!cp ('t130');
4096 ## As if </head>
4097 pop @{$self->{open_elements}};
4098
4099 ## Reprocess in the "after head" insertion mode...
4100 } else {
4101 !!!cp ('t131');
4102 }
4103
4104 ## "after head" insertion mode
4105 ## As if <body>
4106 !!!insert-element ('body',, $token);
4107 $self->{insertion_mode} = IN_BODY_IM;
4108 ## reprocess
4109 !!!ack-later;
4110 next B;
4111 } elsif ($token->{type} == END_TAG_TOKEN) {
4112 if ($token->{tag_name} eq 'head') {
4113 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4114 !!!cp ('t132');
4115 ## As if <head>
4116 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4117 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4118 push @{$self->{open_elements}},
4119 [$self->{head_element}, $el_category->{head}];
4120
4121 ## Reprocess in the "in head" insertion mode...
4122 pop @{$self->{open_elements}};
4123 $self->{insertion_mode} = AFTER_HEAD_IM;
4124 !!!next-token;
4125 next B;
4126 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4127 !!!cp ('t133');
4128 ## As if </noscript>
4129 pop @{$self->{open_elements}};
4130 !!!parse-error (type => 'in noscript:/head', token => $token);
4131
4132 ## Reprocess in the "in head" insertion mode...
4133 pop @{$self->{open_elements}};
4134 $self->{insertion_mode} = AFTER_HEAD_IM;
4135 !!!next-token;
4136 next B;
4137 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4138 !!!cp ('t134');
4139 pop @{$self->{open_elements}};
4140 $self->{insertion_mode} = AFTER_HEAD_IM;
4141 !!!next-token;
4142 next B;
4143 } else {
4144 !!!cp ('t135');
4145 #
4146 }
4147 } elsif ($token->{tag_name} eq 'noscript') {
4148 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4149 !!!cp ('t136');
4150 pop @{$self->{open_elements}};
4151 $self->{insertion_mode} = IN_HEAD_IM;
4152 !!!next-token;
4153 next B;
4154 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4155 !!!cp ('t137');
4156 !!!parse-error (type => 'unmatched end tag:noscript', token => $token);
4157 ## Ignore the token ## ISSUE: An issue in the spec.
4158 !!!next-token;
4159 next B;
4160 } else {
4161 !!!cp ('t138');
4162 #
4163 }
4164 } elsif ({
4165 body => 1, html => 1,
4166 }->{$token->{tag_name}}) {
4167 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4168 !!!cp ('t139');
4169 ## As if <head>
4170 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4171 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4172 push @{$self->{open_elements}},
4173 [$self->{head_element}, $el_category->{head}];
4174
4175 $self->{insertion_mode} = IN_HEAD_IM;
4176 ## Reprocess in the "in head" insertion mode...
4177 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4178 !!!cp ('t140');
4179 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4180 ## Ignore the token
4181 !!!next-token;
4182 next B;
4183 } else {
4184 !!!cp ('t141');
4185 }
4186
4187 #
4188 } elsif ({
4189 p => 1, br => 1,
4190 }->{$token->{tag_name}}) {
4191 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4192 !!!cp ('t142');
4193 ## As if <head>
4194 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4195 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4196 push @{$self->{open_elements}},
4197 [$self->{head_element}, $el_category->{head}];
4198
4199 $self->{insertion_mode} = IN_HEAD_IM;
4200 ## Reprocess in the "in head" insertion mode...
4201 } else {
4202 !!!cp ('t143');
4203 }
4204
4205 #
4206 } else {
4207 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4208 !!!cp ('t144');
4209 #
4210 } else {
4211 !!!cp ('t145');
4212 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4213 ## Ignore the token
4214 !!!next-token;
4215 next B;
4216 }
4217 }
4218
4219 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4220 !!!cp ('t146');
4221 ## As if </noscript>
4222 pop @{$self->{open_elements}};
4223 !!!parse-error (type => 'in noscript:/'.$token->{tag_name}, token => $token);
4224
4225 ## Reprocess in the "in head" insertion mode...
4226 ## As if </head>
4227 pop @{$self->{open_elements}};
4228
4229 ## Reprocess in the "after head" insertion mode...
4230 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4231 !!!cp ('t147');
4232 ## As if </head>
4233 pop @{$self->{open_elements}};
4234
4235 ## Reprocess in the "after head" insertion mode...
4236 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4237 ## ISSUE: This case cannot be reached?
4238 !!!cp ('t148');
4239 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4240 ## Ignore the token ## ISSUE: An issue in the spec.
4241 !!!next-token;
4242 next B;
4243 } else {
4244 !!!cp ('t149');
4245 }
4246
4247 ## "after head" insertion mode
4248 ## As if <body>
4249 !!!insert-element ('body',, $token);
4250 $self->{insertion_mode} = IN_BODY_IM;
4251 ## reprocess
4252 next B;
4253 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4254 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4255 !!!cp ('t149.1');
4256
4257 ## NOTE: As if <head>
4258 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4259 $self->{open_elements}->[-1]->[0]->append_child
4260 ($self->{head_element});
4261 #push @{$self->{open_elements}},
4262 # [$self->{head_element}, $el_category->{head}];
4263 #$self->{insertion_mode} = IN_HEAD_IM;
4264 ## NOTE: Reprocess.
4265
4266 ## NOTE: As if </head>
4267 #pop @{$self->{open_elements}};
4268 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4269 ## NOTE: Reprocess.
4270
4271 #
4272 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4273 !!!cp ('t149.2');
4274
4275 ## NOTE: As if </head>
4276 pop @{$self->{open_elements}};
4277 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4278 ## NOTE: Reprocess.
4279
4280 #
4281 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4282 !!!cp ('t149.3');
4283
4284 !!!parse-error (type => 'in noscript:#eof', token => $token);
4285
4286 ## As if </noscript>
4287 pop @{$self->{open_elements}};
4288 #$self->{insertion_mode} = IN_HEAD_IM;
4289 ## NOTE: Reprocess.
4290
4291 ## NOTE: As if </head>
4292 pop @{$self->{open_elements}};
4293 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4294 ## NOTE: Reprocess.
4295
4296 #
4297 } else {
4298 !!!cp ('t149.4');
4299 #
4300 }
4301
4302 ## NOTE: As if <body>
4303 !!!insert-element ('body',, $token);
4304 $self->{insertion_mode} = IN_BODY_IM;
4305 ## NOTE: Reprocess.
4306 next B;
4307 } else {
4308 die "$0: $token->{type}: Unknown token type";
4309 }
4310
4311 ## ISSUE: An issue in the spec.
4312 } elsif ($self->{insertion_mode} & BODY_IMS) {
4313 if ($token->{type} == CHARACTER_TOKEN) {
4314 !!!cp ('t150');
4315 ## NOTE: There is a code clone of "character in body".
4316 $reconstruct_active_formatting_elements->($insert_to_current);
4317
4318 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4319
4320 !!!next-token;
4321 next B;
4322 } elsif ($token->{type} == START_TAG_TOKEN) {
4323 if ({
4324 caption => 1, col => 1, colgroup => 1, tbody => 1,
4325 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
4326 }->{$token->{tag_name}}) {
4327 if ($self->{insertion_mode} == IN_CELL_IM) {
4328 ## have an element in table scope
4329 for (reverse 0..$#{$self->{open_elements}}) {
4330 my $node = $self->{open_elements}->[$_];
4331 if ($node->[1] & TABLE_CELL_EL) {
4332 !!!cp ('t151');
4333
4334 ## Close the cell
4335 !!!back-token; # <x>
4336 $token = {type => END_TAG_TOKEN,
4337 tag_name => $node->[0]->manakai_local_name,
4338 line => $token->{line},
4339 column => $token->{column}};
4340 next B;
4341 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4342 !!!cp ('t152');
4343 ## ISSUE: This case can never be reached, maybe.
4344 last;
4345 }
4346 }
4347
4348 !!!cp ('t153');
4349 !!!parse-error (type => 'start tag not allowed',
4350 value => $token->{tag_name}, token => $token);
4351 ## Ignore the token
4352 !!!nack ('t153.1');
4353 !!!next-token;
4354 next B;
4355 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4356 !!!parse-error (type => 'not closed:caption', token => $token);
4357
4358 ## NOTE: As if </caption>.
4359 ## have a table element in table scope
4360 my $i;
4361 INSCOPE: {
4362 for (reverse 0..$#{$self->{open_elements}}) {
4363 my $node = $self->{open_elements}->[$_];
4364 if ($node->[1] & CAPTION_EL) {
4365 !!!cp ('t155');
4366 $i = $_;
4367 last INSCOPE;
4368 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4369 !!!cp ('t156');
4370 last;
4371 }
4372 }
4373
4374 !!!cp ('t157');
4375 !!!parse-error (type => 'start tag not allowed',
4376 value => $token->{tag_name}, token => $token);
4377 ## Ignore the token
4378 !!!nack ('t157.1');
4379 !!!next-token;
4380 next B;
4381 } # INSCOPE
4382
4383 ## generate implied end tags
4384 while ($self->{open_elements}->[-1]->[1]
4385 & END_TAG_OPTIONAL_EL) {
4386 !!!cp ('t158');
4387 pop @{$self->{open_elements}};
4388 }
4389
4390 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4391 !!!cp ('t159');
4392 !!!parse-error (type => 'not closed',
4393 value => $self->{open_elements}->[-1]->[0]
4394 ->manakai_local_name,
4395 token => $token);
4396 } else {
4397 !!!cp ('t160');
4398 }
4399
4400 splice @{$self->{open_elements}}, $i;
4401
4402 $clear_up_to_marker->();
4403
4404 $self->{insertion_mode} = IN_TABLE_IM;
4405
4406 ## reprocess
4407 !!!ack-later;
4408 next B;
4409 } else {
4410 !!!cp ('t161');
4411 #
4412 }
4413 } else {
4414 !!!cp ('t162');
4415 #
4416 }
4417 } elsif ($token->{type} == END_TAG_TOKEN) {
4418 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
4419 if ($self->{insertion_mode} == IN_CELL_IM) {
4420 ## have an element in table scope
4421 my $i;
4422 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4423 my $node = $self->{open_elements}->[$_];
4424 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
4425 !!!cp ('t163');
4426 $i = $_;
4427 last INSCOPE;
4428 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4429 !!!cp ('t164');
4430 last INSCOPE;
4431 }
4432 } # INSCOPE
4433 unless (defined $i) {
4434 !!!cp ('t165');
4435 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4436 ## Ignore the token
4437 !!!next-token;
4438 next B;
4439 }
4440
4441 ## generate implied end tags
4442 while ($self->{open_elements}->[-1]->[1]
4443 & END_TAG_OPTIONAL_EL) {
4444 !!!cp ('t166');
4445 pop @{$self->{open_elements}};
4446 }
4447
4448 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
4449 ne $token->{tag_name}) {
4450 !!!cp ('t167');
4451 !!!parse-error (type => 'not closed',
4452 value => $self->{open_elements}->[-1]->[0]
4453 ->manakai_local_name,
4454 token => $token);
4455 } else {
4456 !!!cp ('t168');
4457 }
4458
4459 splice @{$self->{open_elements}}, $i;
4460
4461 $clear_up_to_marker->();
4462
4463 $self->{insertion_mode} = IN_ROW_IM;
4464
4465 !!!next-token;
4466 next B;
4467 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4468 !!!cp ('t169');
4469 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4470 ## Ignore the token
4471 !!!next-token;
4472 next B;
4473 } else {
4474 !!!cp ('t170');
4475 #
4476 }
4477 } elsif ($token->{tag_name} eq 'caption') {
4478 if ($self->{insertion_mode} == IN_CAPTION_IM) {
4479 ## have a table element in table scope
4480 my $i;
4481 INSCOPE: {
4482 for (reverse 0..$#{$self->{open_elements}}) {
4483 my $node = $self->{open_elements}->[$_];
4484 if ($node->[1] & CAPTION_EL) {
4485 !!!cp ('t171');
4486 $i = $_;
4487 last INSCOPE;
4488 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4489 !!!cp ('t172');
4490 last;
4491 }
4492 }
4493
4494 !!!cp ('t173');
4495 !!!parse-error (type => 'unmatched end tag',
4496 value => $token->{tag_name}, token => $token);
4497 ## Ignore the token
4498 !!!next-token;
4499 next B;
4500 } # INSCOPE
4501
4502 ## generate implied end tags
4503 while ($self->{open_elements}->[-1]->[1]
4504 & END_TAG_OPTIONAL_EL) {
4505 !!!cp ('t174');
4506 pop @{$self->{open_elements}};
4507 }
4508
4509 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4510 !!!cp ('t175');
4511 !!!parse-error (type => 'not closed',
4512 value => $self->{open_elements}->[-1]->[0]
4513 ->manakai_local_name,
4514 token => $token);
4515 } else {
4516 !!!cp ('t176');
4517 }
4518
4519 splice @{$self->{open_elements}}, $i;
4520
4521 $clear_up_to_marker->();
4522
4523 $self->{insertion_mode} = IN_TABLE_IM;
4524
4525 !!!next-token;
4526 next B;
4527 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
4528 !!!cp ('t177');
4529 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4530 ## Ignore the token
4531 !!!next-token;
4532 next B;
4533 } else {
4534 !!!cp ('t178');
4535 #
4536 }
4537 } elsif ({
4538 table => 1, tbody => 1, tfoot => 1,
4539 thead => 1, tr => 1,
4540 }->{$token->{tag_name}} and
4541 $self->{insertion_mode} == IN_CELL_IM) {
4542 ## have an element in table scope
4543 my $i;
4544 my $tn;
4545 INSCOPE: {
4546 for (reverse 0..$#{$self->{open_elements}}) {
4547 my $node = $self->{open_elements}->[$_];
4548 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
4549 !!!cp ('t179');
4550 $i = $_;
4551
4552 ## Close the cell
4553 !!!back-token; # </x>
4554 $token = {type => END_TAG_TOKEN, tag_name => $tn,
4555 line => $token->{line},
4556 column => $token->{column}};
4557 next B;
4558 } elsif ($node->[1] & TABLE_CELL_EL) {
4559 !!!cp ('t180');
4560 $tn = $node->[0]->manakai_local_name;
4561 ## NOTE: There is exactly one |td| or |th| element
4562 ## in scope in the stack of open elements by definition.
4563 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4564 ## ISSUE: Can this be reached?
4565 !!!cp ('t181');
4566 last;
4567 }
4568 }
4569
4570 !!!cp ('t182');
4571 !!!parse-error (type => 'unmatched end tag',
4572 value => $token->{tag_name}, token => $token);
4573 ## Ignore the token
4574 !!!next-token;
4575 next B;
4576 } # INSCOPE
4577 } elsif ($token->{tag_name} eq 'table' and
4578 $self->{insertion_mode} == IN_CAPTION_IM) {
4579 !!!parse-error (type => 'not closed:caption', token => $token);
4580
4581 ## As if </caption>
4582 ## have a table element in table scope
4583 my $i;
4584 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4585 my $node = $self->{open_elements}->[$_];
4586 if ($node->[1] & CAPTION_EL) {
4587 !!!cp ('t184');
4588 $i = $_;
4589 last INSCOPE;
4590 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4591 !!!cp ('t185');
4592 last INSCOPE;
4593 }
4594 } # INSCOPE
4595 unless (defined $i) {
4596 !!!cp ('t186');
4597 !!!parse-error (type => 'unmatched end tag:caption', token => $token);
4598 ## Ignore the token
4599 !!!next-token;
4600 next B;
4601 }
4602
4603 ## generate implied end tags
4604 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
4605 !!!cp ('t187');
4606 pop @{$self->{open_elements}};
4607 }
4608
4609 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4610 !!!cp ('t188');
4611 !!!parse-error (type => 'not closed',
4612 value => $self->{open_elements}->[-1]->[0]
4613 ->manakai_local_name,
4614 token => $token);
4615 } else {
4616 !!!cp ('t189');
4617 }
4618
4619 splice @{$self->{open_elements}}, $i;
4620
4621 $clear_up_to_marker->();
4622
4623 $self->{insertion_mode} = IN_TABLE_IM;
4624
4625 ## reprocess
4626 next B;
4627 } elsif ({
4628 body => 1, col => 1, colgroup => 1, html => 1,
4629 }->{$token->{tag_name}}) {
4630 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
4631 !!!cp ('t190');
4632 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4633 ## Ignore the token
4634 !!!next-token;
4635 next B;
4636 } else {
4637 !!!cp ('t191');
4638 #
4639 }
4640 } elsif ({
4641 tbody => 1, tfoot => 1,
4642 thead => 1, tr => 1,
4643 }->{$token->{tag_name}} and
4644 $self->{insertion_mode} == IN_CAPTION_IM) {
4645 !!!cp ('t192');
4646 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4647 ## Ignore the token
4648 !!!next-token;
4649 next B;
4650 } else {
4651 !!!cp ('t193');
4652 #
4653 }
4654 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4655 for my $entry (@{$self->{open_elements}}) {
4656 unless ($entry->[1] & ALL_END_TAG_OPTIONAL_EL) {
4657 !!!cp ('t75');
4658 !!!parse-error (type => 'in body:#eof', token => $token);
4659 last;
4660 }
4661 }
4662
4663 ## Stop parsing.
4664 last B;
4665 } else {
4666 die "$0: $token->{type}: Unknown token type";
4667 }
4668
4669 $insert = $insert_to_current;
4670 #
4671 } elsif ($self->{insertion_mode} & TABLE_IMS) {
4672 if ($token->{type} == CHARACTER_TOKEN) {
4673 if (not $open_tables->[-1]->[1] and # tainted
4674 $token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4675 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4676
4677 unless (length $token->{data}) {
4678 !!!cp ('t194');
4679 !!!next-token;
4680 next B;
4681 } else {
4682 !!!cp ('t195');
4683 }
4684 }
4685
4686 !!!parse-error (type => 'in table:#character', token => $token);
4687
4688 ## As if in body, but insert into foster parent element
4689 ## ISSUE: Spec says that "whenever a node would be inserted
4690 ## into the current node" while characters might not be
4691 ## result in a new Text node.
4692 $reconstruct_active_formatting_elements->($insert_to_foster);
4693
4694 if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
4695 # MUST
4696 my $foster_parent_element;
4697 my $next_sibling;
4698 my $prev_sibling;
4699 OE: for (reverse 0..$#{$self->{open_elements}}) {
4700 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
4701 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4702 if (defined $parent and $parent->node_type == 1) {
4703 !!!cp ('t196');
4704 $foster_parent_element = $parent;
4705 $next_sibling = $self->{open_elements}->[$_]->[0];
4706 $prev_sibling = $next_sibling->previous_sibling;
4707 } else {
4708 !!!cp ('t197');
4709 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
4710 $prev_sibling = $foster_parent_element->last_child;
4711 }
4712 last OE;
4713 }
4714 } # OE
4715 $foster_parent_element = $self->{open_elements}->[0]->[0] and
4716 $prev_sibling = $foster_parent_element->last_child
4717 unless defined $foster_parent_element;
4718 if (defined $prev_sibling and
4719 $prev_sibling->node_type == 3) {
4720 !!!cp ('t198');
4721 $prev_sibling->manakai_append_text ($token->{data});
4722 } else {
4723 !!!cp ('t199');
4724 $foster_parent_element->insert_before
4725 ($self->{document}->create_text_node ($token->{data}),
4726 $next_sibling);
4727 }
4728 $open_tables->[-1]->[1] = 1; # tainted
4729 } else {
4730 !!!cp ('t200');
4731 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4732 }
4733
4734 !!!next-token;
4735 next B;
4736 } elsif ($token->{type} == START_TAG_TOKEN) {
4737 if ({
4738 tr => ($self->{insertion_mode} != IN_ROW_IM),
4739 th => 1, td => 1,
4740 }->{$token->{tag_name}}) {
4741 if ($self->{insertion_mode} == IN_TABLE_IM) {
4742 ## Clear back to table context
4743 while (not ($self->{open_elements}->[-1]->[1]
4744 & TABLE_SCOPING_EL)) {
4745 !!!cp ('t201');
4746 pop @{$self->{open_elements}};
4747 }
4748
4749 !!!insert-element ('tbody',, $token);
4750 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4751 ## reprocess in the "in table body" insertion mode...
4752 }
4753
4754 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
4755 unless ($token->{tag_name} eq 'tr') {
4756 !!!cp ('t202');
4757 !!!parse-error (type => 'missing start tag:tr', token => $token);
4758 }
4759
4760 ## Clear back to table body context
4761 while (not ($self->{open_elements}->[-1]->[1]
4762 & TABLE_ROWS_SCOPING_EL)) {
4763 !!!cp ('t203');
4764 ## ISSUE: Can this case be reached?
4765 pop @{$self->{open_elements}};
4766 }
4767
4768 $self->{insertion_mode} = IN_ROW_IM;
4769 if ($token->{tag_name} eq 'tr') {
4770 !!!cp ('t204');
4771 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4772 !!!nack ('t204');
4773 !!!next-token;
4774 next B;
4775 } else {
4776 !!!cp ('t205');
4777 !!!insert-element ('tr',, $token);
4778 ## reprocess in the "in row" insertion mode
4779 }
4780 } else {
4781 !!!cp ('t206');
4782 }
4783
4784 ## Clear back to table row context
4785 while (not ($self->{open_elements}->[-1]->[1]
4786 & TABLE_ROW_SCOPING_EL)) {
4787 !!!cp ('t207');
4788 pop @{$self->{open_elements}};
4789 }
4790
4791 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4792 $self->{insertion_mode} = IN_CELL_IM;
4793
4794 push @$active_formatting_elements, ['#marker', ''];
4795
4796 !!!nack ('t207.1');
4797 !!!next-token;
4798 next B;
4799 } elsif ({
4800 caption => 1, col => 1, colgroup => 1,
4801 tbody => 1, tfoot => 1, thead => 1,
4802 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
4803 }->{$token->{tag_name}}) {
4804 if ($self->{insertion_mode} == IN_ROW_IM) {
4805 ## As if </tr>
4806 ## have an element in table scope
4807 my $i;
4808 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4809 my $node = $self->{open_elements}->[$_];
4810 if ($node->[1] & TABLE_ROW_EL) {
4811 !!!cp ('t208');
4812 $i = $_;
4813 last INSCOPE;
4814 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4815 !!!cp ('t209');
4816 last INSCOPE;
4817 }
4818 } # INSCOPE
4819 unless (defined $i) { ## No |tr| element in table scope: the implied </tr> cannot be processed.
4820 !!!cp ('t210');
4821 ## TODO: This type is wrong (the token being processed is a start tag).
4822 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4823 ## Ignore the token
4824 !!!nack ('t210.1');
4825 !!!next-token;
4826 next B;
4827 }
4828
4829 ## Clear back to table row context
4830 while (not ($self->{open_elements}->[-1]->[1]
4831 & TABLE_ROW_SCOPING_EL)) {
4832 !!!cp ('t211');
4833 ## ISSUE: Can this case be reached?
4834 pop @{$self->{open_elements}};
4835 }
4836
4837 pop @{$self->{open_elements}}; # tr
4838 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4839 if ($token->{tag_name} eq 'tr') {
4840 !!!cp ('t212');
4841 ## reprocess
4842 !!!ack-later;
4843 next B;
4844 } else {
4845 !!!cp ('t213');
4846 ## reprocess in the "in table body" insertion mode...
4847 }
4848 }
4849
4850 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
4851 ## have an element in table scope
4852 my $i;
4853 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4854 my $node = $self->{open_elements}->[$_];
4855 if ($node->[1] & TABLE_ROW_GROUP_EL) {
4856 !!!cp ('t214');
4857 $i = $_;
4858 last INSCOPE;
4859 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4860 !!!cp ('t215');
4861 last INSCOPE;
4862 }
4863 } # INSCOPE
4864 unless (defined $i) {
4865 !!!cp ('t216');
4866 ## TODO: This error type is wrong.
4867 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4868 ## Ignore the token
4869 !!!nack ('t216.1');
4870 !!!next-token;
4871 next B;
4872 }
4873
4874 ## Clear back to table body context
4875 while (not ($self->{open_elements}->[-1]->[1]
4876 & TABLE_ROWS_SCOPING_EL)) {
4877 !!!cp ('t217');
4878 ## ISSUE: Can this state be reached?
4879 pop @{$self->{open_elements}};
4880 }
4881
4882 ## As if <{current node}>
4883 ## have an element in table scope
4884 ## true by definition
4885
4886 ## Clear back to table body context
4887 ## nop by definition
4888
4889 pop @{$self->{open_elements}};
4890 $self->{insertion_mode} = IN_TABLE_IM;
4891 ## reprocess in "in table" insertion mode...
4892 } else {
4893 !!!cp ('t218');
4894 }
4895
4896 if ($token->{tag_name} eq 'col') {
4897 ## Clear back to table context
4898 while (not ($self->{open_elements}->[-1]->[1]
4899 & TABLE_SCOPING_EL)) {
4900 !!!cp ('t219');
4901 ## ISSUE: Can this state be reached?
4902 pop @{$self->{open_elements}};
4903 }
4904
4905 !!!insert-element ('colgroup',, $token);
4906 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
4907 ## reprocess
4908 !!!ack-later;
4909 next B;
4910 } elsif ({
4911 caption => 1,
4912 colgroup => 1,
4913 tbody => 1, tfoot => 1, thead => 1,
4914 }->{$token->{tag_name}}) {
4915 ## Clear back to table context
4916 while (not ($self->{open_elements}->[-1]->[1]
4917 & TABLE_SCOPING_EL)) {
4918 !!!cp ('t220');
4919 ## ISSUE: Can this state be reached?
4920 pop @{$self->{open_elements}};
4921 }
4922
4923 push @$active_formatting_elements, ['#marker', '']
4924 if $token->{tag_name} eq 'caption';
4925
4926 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4927 $self->{insertion_mode} = {
4928 caption => IN_CAPTION_IM,
4929 colgroup => IN_COLUMN_GROUP_IM,
4930 tbody => IN_TABLE_BODY_IM,
4931 tfoot => IN_TABLE_BODY_IM,
4932 thead => IN_TABLE_BODY_IM,
4933 }->{$token->{tag_name}};
4934 !!!next-token;
4935 !!!nack ('t220.1');
4936 next B;
4937 } else {
4938 die "$0: in table: <>: $token->{tag_name}";
4939 }
4940 } elsif ($token->{tag_name} eq 'table') {
4941 !!!parse-error (type => 'not closed',
4942 value => $self->{open_elements}->[-1]->[0]
4943 ->manakai_local_name,
4944 token => $token);
4945
4946 ## As if </table>
4947 ## have a table element in table scope
4948 my $i;
4949 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4950 my $node = $self->{open_elements}->[$_];
4951 if ($node->[1] & TABLE_EL) {
4952 !!!cp ('t221');
4953 $i = $_;
4954 last INSCOPE;
4955 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4956 !!!cp ('t222');
4957 last INSCOPE;
4958 }
4959 } # INSCOPE
4960 unless (defined $i) {
4961 !!!cp ('t223');
4962 ## TODO: The following is wrong, maybe.
4963 !!!parse-error (type => 'unmatched end tag:table', token => $token);
4964 ## Ignore tokens </table><table>
4965 !!!nack ('t223.1');
4966 !!!next-token;
4967 next B;
4968 }
4969
4970 ## TODO: Followings are removed from the latest spec.
4971 ## generate implied end tags
4972 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
4973 !!!cp ('t224');
4974 pop @{$self->{open_elements}};
4975 }
4976
4977 unless ($self->{open_elements}->[-1]->[1] & TABLE_EL) {
4978 !!!cp ('t225');
4979 ## NOTE: |<table><tr><table>|
4980 !!!parse-error (type => 'not closed',
4981 value => $self->{open_elements}->[-1]->[0]
4982 ->manakai_local_name,
4983 token => $token);
4984 } else {
4985 !!!cp ('t226');
4986 }
4987
4988 splice @{$self->{open_elements}}, $i;
4989 pop @{$open_tables};
4990
4991 $self->_reset_insertion_mode;
4992
4993 ## reprocess
4994 !!!ack-later;
4995 next B;
4996 } elsif ($token->{tag_name} eq 'style') {
4997 if (not $open_tables->[-1]->[1]) { # tainted
4998 !!!cp ('t227.8');
4999 ## NOTE: This is a "as if in head" code clone.
5000 $parse_rcdata->(CDATA_CONTENT_MODEL);
5001 next B;
5002 } else {
5003 !!!cp ('t227.7');
5004 #
5005 }
5006 } elsif ($token->{tag_name} eq 'script') {
5007 if (not $open_tables->[-1]->[1]) { # tainted
5008 !!!cp ('t227.6');
5009 ## NOTE: This is a "as if in head" code clone.
5010 $script_start_tag->();
5011 next B;
5012 } else {
5013 !!!cp ('t227.5');
5014 #
5015 }
5016 } elsif ($token->{tag_name} eq 'input') {
5017 if (not $open_tables->[-1]->[1]) { # tainted
5018 if ($token->{attributes}->{type}) { ## TODO: case
5019 my $type = lc $token->{attributes}->{type}->{value};
5020 if ($type eq 'hidden') {
5021 !!!cp ('t227.3');
5022 !!!parse-error (type => 'in table:'.$token->{tag_name}, token => $token);
5023
5024 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5025
5026 ## TODO: form element pointer
5027
5028 pop @{$self->{open_elements}};
5029
5030 !!!next-token;
5031 !!!ack ('t227.2.1');
5032 next B;
5033 } else {
5034 !!!cp ('t227.2');
5035 #
5036 }
5037 } else {
5038 !!!cp ('t227.1');
5039 #
5040 }
5041 } else {
5042 !!!cp ('t227.4');
5043 #
5044 }
5045 } else {
5046 !!!cp ('t227');
5047 #
5048 }
5049
5050 !!!parse-error (type => 'in table:'.$token->{tag_name}, token => $token);
5051
5052 $insert = $insert_to_foster;
5053 #
5054 } elsif ($token->{type} == END_TAG_TOKEN) {
5055 if ($token->{tag_name} eq 'tr' and
5056 $self->{insertion_mode} == IN_ROW_IM) {
5057 ## have an element in table scope
5058 my $i;
5059 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5060 my $node = $self->{open_elements}->[$_];
5061 if ($node->[1] & TABLE_ROW_EL) {
5062 !!!cp ('t228');
5063 $i = $_;
5064 last INSCOPE;
5065 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5066 !!!cp ('t229');
5067 last INSCOPE;
5068 }
5069 } # INSCOPE
5070 unless (defined $i) {
5071 !!!cp ('t230');
5072 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5073 ## Ignore the token
5074 !!!nack ('t230.1');
5075 !!!next-token;
5076 next B;
5077 } else {
5078 !!!cp ('t232');
5079 }
5080
5081 ## Clear back to table row context
5082 while (not ($self->{open_elements}->[-1]->[1]
5083 & TABLE_ROW_SCOPING_EL)) {
5084 !!!cp ('t231');
5085 ## ISSUE: Can this state be reached?
5086 pop @{$self->{open_elements}};
5087 }
5088
5089 pop @{$self->{open_elements}}; # tr
5090 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5091 !!!next-token;
5092 !!!nack ('t231.1');
5093 next B;
5094 } elsif ($token->{tag_name} eq 'table') {
5095 if ($self->{insertion_mode} == IN_ROW_IM) {
5096 ## As if </tr>
5097 ## have an element in table scope
5098 my $i;
5099 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5100 my $node = $self->{open_elements}->[$_];
5101 if ($node->[1] & TABLE_ROW_EL) {
5102 !!!cp ('t233');
5103 $i = $_;
5104 last INSCOPE;
5105 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5106 !!!cp ('t234');
5107 last INSCOPE;
5108 }
5109 } # INSCOPE
5110 unless (defined $i) { ## No |tr| element in table scope: the implied </tr> cannot be processed.
5111 !!!cp ('t235');
5112 ## TODO: The following is wrong. (NOTE(review): the error type itself may still need revisiting.)
5113 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5114 ## Ignore the token
5115 !!!nack ('t236.1');
5116 !!!next-token;
5117 next B;
5118 }
5119
5120 ## Clear back to table row context
5121 while (not ($self->{open_elements}->[-1]->[1]
5122 & TABLE_ROW_SCOPING_EL)) {
5123 !!!cp ('t236');
5124 ## ISSUE: Can this state be reached?
5125 pop @{$self->{open_elements}};
5126 }
5127
5128 pop @{$self->{open_elements}}; # tr
5129 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5130 ## reprocess in the "in table body" insertion mode...
5131 }
5132
5133 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5134 ## have an element in table scope
5135 my $i;
5136 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5137 my $node = $self->{open_elements}->[$_];
5138 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5139 !!!cp ('t237');
5140 $i = $_;
5141 last INSCOPE;
5142 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5143 !!!cp ('t238');
5144 last INSCOPE;
5145 }
5146 } # INSCOPE
5147 unless (defined $i) {
5148 !!!cp ('t239');
5149 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5150 ## Ignore the token
5151 !!!nack ('t239.1');
5152 !!!next-token;
5153 next B;
5154 }
5155
5156 ## Clear back to table body context
5157 while (not ($self->{open_elements}->[-1]->[1]
5158 & TABLE_ROWS_SCOPING_EL)) {
5159 !!!cp ('t240');
5160 pop @{$self->{open_elements}};
5161 }
5162
5163 ## As if <{current node}>
5164 ## have an element in table scope
5165 ## true by definition
5166
5167 ## Clear back to table body context
5168 ## nop by definition
5169
5170 pop @{$self->{open_elements}};
5171 $self->{insertion_mode} = IN_TABLE_IM;
5172 ## reprocess in the "in table" insertion mode...
5173 }
5174
5175 ## NOTE: </table> in the "in table" insertion mode.
5176 ## When you edit the code fragment below, please ensure that
5177 ## the code for <table> in the "in table" insertion mode
5178 ## is synced with it.
5179
5180 ## have a table element in table scope
5181 my $i;
5182 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5183 my $node = $self->{open_elements}->[$_];
5184 if ($node->[1] & TABLE_EL) {
5185 !!!cp ('t241');
5186 $i = $_;
5187 last INSCOPE;
5188 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5189 !!!cp ('t242');
5190 last INSCOPE;
5191 }
5192 } # INSCOPE
5193 unless (defined $i) {
5194 !!!cp ('t243');
5195 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5196 ## Ignore the token
5197 !!!nack ('t243.1');
5198 !!!next-token;
5199 next B;
5200 }
5201
5202 splice @{$self->{open_elements}}, $i;
5203 pop @{$open_tables};
5204
5205 $self->_reset_insertion_mode;
5206
5207 !!!next-token;
5208 next B;
5209 } elsif ({
5210 tbody => 1, tfoot => 1, thead => 1,
5211 }->{$token->{tag_name}} and
5212 $self->{insertion_mode} & ROW_IMS) {
5213 if ($self->{insertion_mode} == IN_ROW_IM) {
5214 ## have an element in table scope
5215 my $i;
5216 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5217 my $node = $self->{open_elements}->[$_];
5218 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5219 !!!cp ('t247');
5220 $i = $_;
5221 last INSCOPE;
5222 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5223 !!!cp ('t248');
5224 last INSCOPE;
5225 }
5226 } # INSCOPE
5227 unless (defined $i) {
5228 !!!cp ('t249');
5229 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5230 ## Ignore the token
5231 !!!nack ('t249.1');
5232 !!!next-token;
5233 next B;
5234 }
5235
## As if </tr>
## have an element in table scope
## NOTE: |$i| has already been declared by the scope check earlier in
## this block.  It is reset here rather than redeclared with |my|,
## because a second |my $i| in the same scope masks the earlier
## declaration (and is warned about by perl).  After the loop, |$i| is
## the index of the innermost |tr| element in table scope, or undef if
## a table scoping element is reached first.
undef $i;
INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
  my $node = $self->{open_elements}->[$_];
  if ($node->[1] & TABLE_ROW_EL) {
    !!!cp ('t250');
    $i = $_;
    last INSCOPE;
  } elsif ($node->[1] & TABLE_SCOPING_EL) {
    !!!cp ('t251');
    last INSCOPE;
  }
} # INSCOPE
5250 unless (defined $i) {
5251 !!!cp ('t252');
5252 !!!parse-error (type => 'unmatched end tag:tr', token => $token);
5253 ## Ignore the token
5254 !!!nack ('t252.1');
5255 !!!next-token;
5256 next B;
5257 }
5258
5259 ## Clear back to table row context
5260 while (not ($self->{open_elements}->[-1]->[1]
5261 & TABLE_ROW_SCOPING_EL)) {
5262 !!!cp ('t253');
5263 ## ISSUE: Can this case be reached?
5264 pop @{$self->{open_elements}};
5265 }
5266
5267 pop @{$self->{open_elements}}; # tr
5268 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5269 ## reprocess in the "in table body" insertion mode...
5270 }
5271
5272 ## have an element in table scope
5273 my $i;
5274 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5275 my $node = $self->{open_elements}->[$_];
5276 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5277 !!!cp ('t254');
5278 $i = $_;
5279 last INSCOPE;
5280 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5281 !!!cp ('t255');
5282 last INSCOPE;
5283 }
5284 } # INSCOPE
5285 unless (defined $i) {
5286 !!!cp ('t256');
5287 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5288 ## Ignore the token
5289 !!!nack ('t256.1');
5290 !!!next-token;
5291 next B;
5292 }
5293
5294 ## Clear back to table body context
5295 while (not ($self->{open_elements}->[-1]->[1]
5296 & TABLE_ROWS_SCOPING_EL)) {
5297 !!!cp ('t257');
5298 ## ISSUE: Can this case be reached?
5299 pop @{$self->{open_elements}};
5300 }
5301
5302 pop @{$self->{open_elements}};
5303 $self->{insertion_mode} = IN_TABLE_IM;
5304 !!!nack ('t257.1');
5305 !!!next-token;
5306 next B;
5307 } elsif ({
5308 body => 1, caption => 1, col => 1, colgroup => 1,
5309 html => 1, td => 1, th => 1,
5310 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5311 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
5312 }->{$token->{tag_name}}) {
5313 !!!cp ('t258');
5314 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5315 ## Ignore the token
5316 !!!nack ('t258.1');
5317 !!!next-token;
5318 next B;
5319 } else {
5320 !!!cp ('t259');
5321 !!!parse-error (type => 'in table:/'.$token->{tag_name}, token => $token);
5322
5323 $insert = $insert_to_foster;
5324 #
5325 }
5326 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5327 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
5328 @{$self->{open_elements}} == 1) { # redundant, maybe
5329 !!!parse-error (type => 'in body:#eof', token => $token);
5330 !!!cp ('t259.1');
5331 #
5332 } else {
5333 !!!cp ('t259.2');
5334 #
5335 }
5336
5337 ## Stop parsing
5338 last B;
5339 } else {
5340 die "$0: $token->{type}: Unknown token type";
5341 }
5342 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
5343 if ($token->{type} == CHARACTER_TOKEN) {
5344 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5345 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5346 unless (length $token->{data}) {
5347 !!!cp ('t260');
5348 !!!next-token;
5349 next B;
5350 }
5351 }
5352
5353 !!!cp ('t261');
5354 #
5355 } elsif ($token->{type} == START_TAG_TOKEN) {
5356 if ($token->{tag_name} eq 'col') {
5357 !!!cp ('t262');
5358 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5359 pop @{$self->{open_elements}};
5360 !!!ack ('t262.1');
5361 !!!next-token;
5362 next B;
5363 } else {
5364 !!!cp ('t263');
5365 #
5366 }
5367 } elsif ($token->{type} == END_TAG_TOKEN) {
5368 if ($token->{tag_name} eq 'colgroup') {
5369 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
5370 !!!cp ('t264');
5371 !!!parse-error (type => 'unmatched end tag:colgroup', token => $token);
5372 ## Ignore the token
5373 !!!next-token;
5374 next B;
5375 } else {
5376 !!!cp ('t265');
5377 pop @{$self->{open_elements}}; # colgroup
5378 $self->{insertion_mode} = IN_TABLE_IM;
5379 !!!next-token;
5380 next B;
5381 }
5382 } elsif ($token->{tag_name} eq 'col') {
5383 !!!cp ('t266');
5384 !!!parse-error (type => 'unmatched end tag:col', token => $token);
5385 ## Ignore the token
5386 !!!next-token;
5387 next B;
5388 } else {
5389 !!!cp ('t267');
5390 #
5391 }
5392 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5393 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
5394 @{$self->{open_elements}} == 1) { # redundant, maybe
5395 !!!cp ('t270.2');
5396 ## Stop parsing.
5397 last B;
5398 } else {
5399 ## NOTE: As if </colgroup>.
5400 !!!cp ('t270.1');
5401 pop @{$self->{open_elements}}; # colgroup
5402 $self->{insertion_mode} = IN_TABLE_IM;
5403 ## Reprocess.
5404 next B;
5405 }
5406 } else {
5407 die "$0: $token->{type}: Unknown token type";
5408 }
5409
5410 ## As if </colgroup>
5411 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
5412 !!!cp ('t269');
5413 ## TODO: Wrong error type?
5414 !!!parse-error (type => 'unmatched end tag:colgroup', token => $token);
5415 ## Ignore the token
5416 !!!nack ('t269.1');
5417 !!!next-token;
5418 next B;
5419 } else {
5420 !!!cp ('t270');
5421 pop @{$self->{open_elements}}; # colgroup
5422 $self->{insertion_mode} = IN_TABLE_IM;
5423 !!!ack-later;
5424 ## reprocess
5425 next B;
5426 }
5427 } elsif ($self->{insertion_mode} & SELECT_IMS) {
5428 if ($token->{type} == CHARACTER_TOKEN) {
5429 !!!cp ('t271');
5430 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5431 !!!next-token;
5432 next B;
5433 } elsif ($token->{type} == START_TAG_TOKEN) {
5434 if ($token->{tag_name} eq 'option') {
5435 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5436 !!!cp ('t272');
5437 ## As if </option>
5438 pop @{$self->{open_elements}};
5439 } else {
5440 !!!cp ('t273');
5441 }
5442
5443 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5444 !!!nack ('t273.1');
5445 !!!next-token;
5446 next B;
5447 } elsif ($token->{tag_name} eq 'optgroup') {
5448 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5449 !!!cp ('t274');
5450 ## As if </option>
5451 pop @{$self->{open_elements}};
5452 } else {
5453 !!!cp ('t275');
5454 }
5455
5456 if ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
5457 !!!cp ('t276');
5458 ## As if </optgroup>
5459 pop @{$self->{open_elements}};
5460 } else {
5461 !!!cp ('t277');
5462 }
5463
5464 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5465 !!!nack ('t277.1');
5466 !!!next-token;
5467 next B;
5468 } elsif ($token->{tag_name} eq 'select' or
5469 $token->{tag_name} eq 'input' or
5470 ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5471 {
5472 caption => 1, table => 1,
5473 tbody => 1, tfoot => 1, thead => 1,
5474 tr => 1, td => 1, th => 1,
5475 }->{$token->{tag_name}})) {
5476 ## TODO: The type below is not good - <select> is replaced by </select>
5477 !!!parse-error (type => 'not closed:select', token => $token);
5478 ## NOTE: As if the token were </select> (<select> case) or
5479 ## as if there were </select> (otherwise).
5480 ## have an element in table scope
5481 my $i;
5482 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5483 my $node = $self->{open_elements}->[$_];
5484 if ($node->[1] & SELECT_EL) {
5485 !!!cp ('t278');
5486 $i = $_;
5487 last INSCOPE;
5488 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5489 !!!cp ('t279');
5490 last INSCOPE;
5491 }
5492 } # INSCOPE
5493 unless (defined $i) {
5494 !!!cp ('t280');
5495 !!!parse-error (type => 'unmatched end tag:select', token => $token);
5496 ## Ignore the token
5497 !!!nack ('t280.1');
5498 !!!next-token;
5499 next B;
5500 }
5501
5502 !!!cp ('t281');
5503 splice @{$self->{open_elements}}, $i;
5504
5505 $self->_reset_insertion_mode;
5506
5507 if ($token->{tag_name} eq 'select') {
5508 !!!nack ('t281.2');
5509 !!!next-token;
5510 next B;
5511 } else {
5512 !!!cp ('t281.1');
5513 !!!ack-later;
5514 ## Reprocess the token.
5515 next B;
5516 }
5517 } else {
5518 !!!cp ('t282');
5519 !!!parse-error (type => 'in select:'.$token->{tag_name}, token => $token);
5520 ## Ignore the token
5521 !!!nack ('t282.1');
5522 !!!next-token;
5523 next B;
5524 }
5525 } elsif ($token->{type} == END_TAG_TOKEN) {
5526 if ($token->{tag_name} eq 'optgroup') {
5527 if ($self->{open_elements}->[-1]->[1] & OPTION_EL and
5528 $self->{open_elements}->[-2]->[1] & OPTGROUP_EL) {
5529 !!!cp ('t283');
5530 ## As if </option>
5531 splice @{$self->{open_elements}}, -2;
5532 } elsif ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
5533 !!!cp ('t284');
5534 pop @{$self->{open_elements}};
5535 } else {
5536 !!!cp ('t285');
5537 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5538 ## Ignore the token
5539 }
5540 !!!nack ('t285.1');
5541 !!!next-token;
5542 next B;
5543 } elsif ($token->{tag_name} eq 'option') {
5544 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5545 !!!cp ('t286');
5546 pop @{$self->{open_elements}};
5547 } else {
5548 !!!cp ('t287');
5549 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5550 ## Ignore the token
5551 }
5552 !!!nack ('t287.1');
5553 !!!next-token;
5554 next B;
5555 } elsif ($token->{tag_name} eq 'select') {
5556 ## have an element in table scope
5557 my $i;
5558 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5559 my $node = $self->{open_elements}->[$_];
5560 if ($node->[1] & SELECT_EL) {
5561 !!!cp ('t288');
5562 $i = $_;
5563 last INSCOPE;
5564 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5565 !!!cp ('t289');
5566 last INSCOPE;
5567 }
5568 } # INSCOPE
5569 unless (defined $i) {
5570 !!!cp ('t290');
5571 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5572 ## Ignore the token
5573 !!!nack ('t290.1');
5574 !!!next-token;
5575 next B;
5576 }
5577
5578 !!!cp ('t291');
5579 splice @{$self->{open_elements}}, $i;
5580
5581 $self->_reset_insertion_mode;
5582
5583 !!!nack ('t291.1');
5584 !!!next-token;
5585 next B;
5586 } elsif ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5587 {
5588 caption => 1, table => 1, tbody => 1,
5589 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
5590 }->{$token->{tag_name}}) {
5591 ## TODO: The following is wrong?
5592 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5593
5594 ## have an element in table scope
5595 my $i;
5596 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5597 my $node = $self->{open_elements}->[$_];
5598 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5599 !!!cp ('t292');
5600 $i = $_;
5601 last INSCOPE;
5602 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5603 !!!cp ('t293');
5604 last INSCOPE;
5605 }
5606 } # INSCOPE
5607 unless (defined $i) {
5608 !!!cp ('t294');
5609 ## Ignore the token
5610 !!!nack ('t294.1');
5611 !!!next-token;
5612 next B;
5613 }
5614
5615 ## As if </select>
5616 ## have an element in table scope
5617 undef $i;
5618 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5619 my $node = $self->{open_elements}->[$_];
5620 if ($node->[1] & SELECT_EL) {
5621 !!!cp ('t295');
5622 $i = $_;
5623 last INSCOPE;
5624 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5625 ## ISSUE: Can this state be reached?
5626 !!!cp ('t296');
5627 last INSCOPE;
5628 }
5629 } # INSCOPE
5630 unless (defined $i) {
5631 !!!cp ('t297');
5632 ## TODO: The following error type is correct?
5633 !!!parse-error (type => 'unmatched end tag:select', token => $token);
5634 ## Ignore the </select> token
5635 !!!nack ('t297.1');
5636 !!!next-token; ## TODO: ok?
5637 next B;
5638 }
5639
5640 !!!cp ('t298');
5641 splice @{$self->{open_elements}}, $i;
5642
5643 $self->_reset_insertion_mode;
5644
5645 !!!ack-later;
5646 ## reprocess
5647 next B;
5648 } else {
5649 !!!cp ('t299');
5650 !!!parse-error (type => 'in select:/'.$token->{tag_name}, token => $token);
5651 ## Ignore the token
5652 !!!nack ('t299.3');
5653 !!!next-token;
5654 next B;
5655 }
5656 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5657 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
5658 @{$self->{open_elements}} == 1) { # redundant, maybe
5659 !!!cp ('t299.1');
5660 !!!parse-error (type => 'in body:#eof', token => $token);
5661 } else {
5662 !!!cp ('t299.2');
5663 }
5664
5665 ## Stop parsing.
5666 last B;
5667 } else {
5668 die "$0: $token->{type}: Unknown token type";
5669 }
5670 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
5671 if ($token->{type} == CHARACTER_TOKEN) {
## Strip the leading white space characters from the character token
## and append them to the current node, as if in the "in body"
## insertion mode.  If nothing else remains in the token, fetch the
## next token and restart the loop.
if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
  ## Copy the captured white space out of |$1| immediately.  Passing
  ## |$1| itself to a method makes the callee's argument an alias of
  ## the capture variable, so any pattern match made inside the callee
  ## before it copies its arguments could change the value.
  my $data = $1;
  ## As if in body
  $reconstruct_active_formatting_elements->($insert_to_current);

  $self->{open_elements}->[-1]->[0]->manakai_append_text ($data);

  unless (length $token->{data}) {
    !!!cp ('t300');
    !!!next-token;
    next B;
  }
}
5685
5686 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5687 !!!cp ('t301');
5688 !!!parse-error (type => 'after html:#character', token => $token);
5689
5690 ## Reprocess in the "after body" insertion mode.
5691 } else {
5692 !!!cp ('t302');
5693 }
5694
5695 ## "after body" insertion mode
5696 !!!parse-error (type => 'after body:#character', token => $token);
5697
5698 $self->{insertion_mode} = IN_BODY_IM;
5699 ## reprocess
5700 next B;
5701 } elsif ($token->{type} == START_TAG_TOKEN) {
5702 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5703 !!!cp ('t303');
5704 !!!parse-error (type => 'after html:'.$token->{tag_name}, token => $token);
5705
5706 ## Reprocess in the "after body" insertion mode.
5707 } else {
5708 !!!cp ('t304');
5709 }
5710
5711 ## "after body" insertion mode
5712 !!!parse-error (type => 'after body:'.$token->{tag_name}, token => $token);
5713
5714 $self->{insertion_mode} = IN_BODY_IM;
5715 !!!ack-later;
5716 ## reprocess
5717 next B;
5718 } elsif ($token->{type} == END_TAG_TOKEN) {
5719 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5720 !!!cp ('t305');
5721 !!!parse-error (type => 'after html:/'.$token->{tag_name}, token => $token);
5722
5723 $self->{insertion_mode} = AFTER_BODY_IM;
5724 ## Reprocess in the "after body" insertion mode.
5725 } else {
5726 !!!cp ('t306');
5727 }
5728
5729 ## "after body" insertion mode
5730 if ($token->{tag_name} eq 'html') {
5731 if (defined $self->{inner_html_node}) {
5732 !!!cp ('t307');
5733 !!!parse-error (type => 'unmatched end tag:html', token => $token);
5734 ## Ignore the token
5735 !!!next-token;
5736 next B;
5737 } else {
5738 !!!cp ('t308');
5739 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
5740 !!!next-token;
5741 next B;
5742 }
5743 } else {
5744 !!!cp ('t309');
5745 !!!parse-error (type => 'after body:/'.$token->{tag_name}, token => $token);
5746
5747 $self->{insertion_mode} = IN_BODY_IM;
5748 ## reprocess
5749 next B;
5750 }
5751 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5752 !!!cp ('t309.2');
5753 ## Stop parsing
5754 last B;
5755 } else {
5756 die "$0: $token->{type}: Unknown token type";
5757 }
5758 } elsif ($self->{insertion_mode} & FRAME_IMS) {
5759 if ($token->{type} == CHARACTER_TOKEN) {
5760 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5761 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5762
5763 unless (length $token->{data}) {
5764 !!!cp ('t310');
5765 !!!next-token;
5766 next B;
5767 }
5768 }
5769
5770 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
5771 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
5772 !!!cp ('t311');
5773 !!!parse-error (type => 'in frameset:#character', token => $token);
5774 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
5775 !!!cp ('t312');
5776 !!!parse-error (type => 'after frameset:#character', token => $token);
5777 } else { # "after html frameset"
5778 !!!cp ('t313');
5779 !!!parse-error (type => 'after html:#character', token => $token);
5780
5781 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5782 ## Reprocess in the "after frameset" insertion mode.
5783 !!!parse-error (type => 'after frameset:#character', token => $token);
5784 }
5785
5786 ## Ignore the token.
5787 if (length $token->{data}) {
5788 !!!cp ('t314');
5789 ## reprocess the rest of characters
5790 } else {
5791 !!!cp ('t315');
5792 !!!next-token;
5793 }
5794 next B;
5795 }
5796
5797 die qq[$0: Character "$token->{data}"];
5798 } elsif ($token->{type} == START_TAG_TOKEN) {
5799 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
5800 !!!cp ('t316');
5801 !!!parse-error (type => 'after html:'.$token->{tag_name}, token => $token);
5802
5803 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5804 ## Process in the "after frameset" insertion mode.
5805 } else {
5806 !!!cp ('t317');
5807 }
5808
5809 if ($token->{tag_name} eq 'frameset' and
5810 $self->{insertion_mode} == IN_FRAMESET_IM) {
5811 !!!cp ('t318');
5812 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5813 !!!nack ('t318.1');
5814 !!!next-token;
5815 next B;
5816 } elsif ($token->{tag_name} eq 'frame' and
5817 $self->{insertion_mode} == IN_FRAMESET_IM) {
5818 !!!cp ('t319');
5819 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5820 pop @{$self->{open_elements}};
5821 !!!ack ('t319.1');
5822 !!!next-token;
5823 next B;
5824 } elsif ($token->{tag_name} eq 'noframes') {
5825 !!!cp ('t320');
5826 ## NOTE: As if in body.
5827 $parse_rcdata->(CDATA_CONTENT_MODEL);
5828 next B;
5829 } else {
5830 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
5831 !!!cp ('t321');
5832 !!!parse-error (type => 'in frameset:'.$token->{tag_name}, token => $token);
5833 } else {
5834 !!!cp ('t322');
5835 !!!parse-error (type => 'after frameset:'.$token->{tag_name}, token => $token);
5836 }
5837 ## Ignore the token
5838 !!!nack ('t322.1');
5839 !!!next-token;
5840 next B;
5841 }
5842 } elsif ($token->{type} == END_TAG_TOKEN) {
5843 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
5844 !!!cp ('t323');
5845 !!!parse-error (type => 'after html:/'.$token->{tag_name}, token => $token);
5846
5847 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5848 ## Process in the "after frameset" insertion mode.
5849 } else {
5850 !!!cp ('t324');
5851 }
5852
5853 if ($token->{tag_name} eq 'frameset' and
5854 $self->{insertion_mode} == IN_FRAMESET_IM) {
5855 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
5856 @{$self->{open_elements}} == 1) {
5857 !!!cp ('t325');
5858 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5859 ## Ignore the token
5860 !!!next-token;
5861 } else {
5862 !!!cp ('t326');
5863 pop @{$self->{open_elements}};
5864 !!!next-token;
5865 }
5866
5867 if (not defined $self->{inner_html_node} and
5868 not ($self->{open_elements}->[-1]->[1] & FRAMESET_EL)) {
5869 !!!cp ('t327');
5870 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5871 } else {
5872 !!!cp ('t328');
5873 }
5874 next B;
5875 } elsif ($token->{tag_name} eq 'html' and
5876 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
5877 !!!cp ('t329');
5878 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
5879 !!!next-token;
5880 next B;
5881 } else {
5882 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
5883 !!!cp ('t330');
5884 !!!parse-error (type => 'in frameset:/'.$token->{tag_name}, token => $token);
5885 } else {
5886 !!!cp ('t331');
5887 !!!parse-error (type => 'after frameset:/'.$token->{tag_name}, token => $token);
5888 }
5889 ## Ignore the token
5890 !!!next-token;
5891 next B;
5892 }
5893 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5894 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
5895 @{$self->{open_elements}} == 1) { # redundant, maybe
5896 !!!cp ('t331.1');
5897 !!!parse-error (type => 'in body:#eof', token => $token);
5898 } else {
5899 !!!cp ('t331.2');
5900 }
5901
5902 ## Stop parsing
5903 last B;
5904 } else {
5905 die "$0: $token->{type}: Unknown token type";
5906 }
5907
5908 ## ISSUE: An issue in spec here
5909 } else {
5910 die "$0: $self->{insertion_mode}: Unknown insertion mode";
5911 }
5912
5913 ## "in body" insertion mode
5914 if ($token->{type} == START_TAG_TOKEN) {
5915 if ($token->{tag_name} eq 'script') {
5916 !!!cp ('t332');
5917 ## NOTE: This is an "as if in head" code clone
5918 $script_start_tag->();
5919 next B;
5920 } elsif ($token->{tag_name} eq 'style') {
5921 !!!cp ('t333');
5922 ## NOTE: This is an "as if in head" code clone
5923 $parse_rcdata->(CDATA_CONTENT_MODEL);
5924 next B;
5925 } elsif ({
5926 base => 1, link => 1,
5927 }->{$token->{tag_name}}) {
5928 !!!cp ('t334');
5929 ## NOTE: This is an "as if in head" code clone, only "-t" differs
5930 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
5931 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
5932 !!!ack ('t334.1');
5933 !!!next-token;
5934 next B;
5935 } elsif ($token->{tag_name} eq 'meta') {
5936 ## NOTE: This is an "as if in head" code clone, only "-t" differs
5937 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
5938 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
5939
5940 unless ($self->{confident}) {
5941 if ($token->{attributes}->{charset}) { ## TODO: And if supported
5942 !!!cp ('t335');
5943 $self->{change_encoding}
5944 ->($self, $token->{attributes}->{charset}->{value}, $token);
5945
5946 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
5947 ->set_user_data (manakai_has_reference =>
5948 $token->{attributes}->{charset}
5949 ->{has_reference});
5950 } elsif ($token->{attributes}->{content}) {
5951 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
5952 if ($token->{attributes}->{content}->{value}
5953 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
5954 [\x09-\x0D\x20]*=
5955 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
5956 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
5957 !!!cp ('t336');
5958 $self->{change_encoding}
5959 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
5960 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
5961 ->set_user_data (manakai_has_reference =>
5962 $token->{attributes}->{content}
5963 ->{has_reference});
5964 }
5965 }
5966 } else {
5967 if ($token->{attributes}->{charset}) {
5968 !!!cp ('t337');
5969 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
5970 ->set_user_data (manakai_has_reference =>
5971 $token->{attributes}->{charset}
5972 ->{has_reference});
5973 }
5974 if ($token->{attributes}->{content}) {
5975 !!!cp ('t338');
5976 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
5977 ->set_user_data (manakai_has_reference =>
5978 $token->{attributes}->{content}
5979 ->{has_reference});
5980 }
5981 }
5982
5983 !!!ack ('t338.1');
5984 !!!next-token;
5985 next B;
5986 } elsif ($token->{tag_name} eq 'title') {
5987 !!!cp ('t341');
5988 ## NOTE: This is an "as if in head" code clone
5989 $parse_rcdata->(RCDATA_CONTENT_MODEL);
5990 next B;
5991 } elsif ($token->{tag_name} eq 'body') {
5992 !!!parse-error (type => 'in body:body', token => $token);
5993
5994 if (@{$self->{open_elements}} == 1 or
5995 not ($self->{open_elements}->[1]->[1] & BODY_EL)) {
5996 !!!cp ('t342');
5997 ## Ignore the token
5998 } else {
5999 my $body_el = $self->{open_elements}->[1]->[0];
6000 for my $attr_name (keys %{$token->{attributes}}) {
6001 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
6002 !!!cp ('t343');
6003 $body_el->set_attribute_ns
6004 (undef, [undef, $attr_name],
6005 $token->{attributes}->{$attr_name}->{value});
6006 }
6007 }
6008 }
6009 !!!nack ('t343.1');
6010 !!!next-token;
6011 next B;
6012 } elsif ({
6013 address => 1, blockquote => 1, center => 1, dir => 1,
6014 div => 1, dl => 1, fieldset => 1,
6015 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6016 menu => 1, ol => 1, p => 1, ul => 1,
6017 pre => 1, listing => 1,
6018 form => 1,
6019 table => 1,
6020 hr => 1,
6021 }->{$token->{tag_name}}) {
6022 if ($token->{tag_name} eq 'form' and defined $self->{form_element}) {
6023 !!!cp ('t350');
6024 !!!parse-error (type => 'in form:form', token => $token);
6025 ## Ignore the token
6026 !!!nack ('t350.1');
6027 !!!next-token;
6028 next B;
6029 }
6030
6031 ## has a p element in scope
6032 INSCOPE: for (reverse @{$self->{open_elements}}) {
6033 if ($_->[1] & P_EL) {
6034 !!!cp ('t344');
6035 !!!back-token; # <form>
6036 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6037 line => $token->{line}, column => $token->{column}};
6038 next B;
6039 } elsif ($_->[1] & SCOPING_EL) {
6040 !!!cp ('t345');
6041 last INSCOPE;
6042 }
6043 } # INSCOPE
6044
6045 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6046 if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') {
6047 !!!nack ('t346.1');
6048 !!!next-token;
6049 if ($token->{type} == CHARACTER_TOKEN) {
6050 $token->{data} =~ s/^\x0A//;
6051 unless (length $token->{data}) {
6052 !!!cp ('t346');
6053 !!!next-token;
6054 } else {
6055 !!!cp ('t349');
6056 }
6057 } else {
6058 !!!cp ('t348');
6059 }
6060 } elsif ($token->{tag_name} eq 'form') {
6061 !!!cp ('t347.1');
6062 $self->{form_element} = $self->{open_elements}->[-1]->[0];
6063
6064 !!!nack ('t347.2');
6065 !!!next-token;
6066 } elsif ($token->{tag_name} eq 'table') {
6067 !!!cp ('t382');
6068 push @{$open_tables}, [$self->{open_elements}->[-1]->[0]];
6069
6070 $self->{insertion_mode} = IN_TABLE_IM;
6071
6072 !!!nack ('t382.1');
6073 !!!next-token;
6074 } elsif ($token->{tag_name} eq 'hr') {
6075 !!!cp ('t386');
6076 pop @{$self->{open_elements}};
6077
6078 !!!nack ('t386.1');
6079 !!!next-token;
6080 } else {
6081 !!!nack ('t347.1');
6082 !!!next-token;
6083 }
6084 next B;
6085 } elsif ({li => 1, dt => 1, dd => 1}->{$token->{tag_name}}) {
6086 ## has a p element in scope
6087 INSCOPE: for (reverse @{$self->{open_elements}}) {
6088 if ($_->[1] & P_EL) {
6089 !!!cp ('t353');
6090 !!!back-token; # <x>
6091 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6092 line => $token->{line}, column => $token->{column}};
6093 next B;
6094 } elsif ($_->[1] & SCOPING_EL) {
6095 !!!cp ('t354');
6096 last INSCOPE;
6097 }
6098 } # INSCOPE
6099
6100 ## Step 1
6101 my $i = -1;
6102 my $node = $self->{open_elements}->[$i];
6103 my $li_or_dtdd = {li => {li => 1},
6104 dt => {dt => 1, dd => 1},
6105 dd => {dt => 1, dd => 1}}->{$token->{tag_name}};
6106 LI: {
6107 ## Step 2
6108 if ($li_or_dtdd->{$node->[0]->manakai_local_name}) {
6109 if ($i != -1) {
6110 !!!cp ('t355');
6111 !!!parse-error (type => 'not closed',
6112 value => $self->{open_elements}->[-1]->[0]
6113 ->manakai_local_name,
6114 token => $token);
6115 } else {
6116 !!!cp ('t356');
6117 }
6118 splice @{$self->{open_elements}}, $i;
6119 last LI;
6120 } else {
6121 !!!cp ('t357');
6122 }
6123
6124 ## Step 3
6125 if (not ($node->[1] & FORMATTING_EL) and
6126 #not $phrasing_category->{$node->[1]} and
6127 ($node->[1] & SPECIAL_EL or
6128 $node->[1] & SCOPING_EL) and
6129 not ($node->[1] & ADDRESS_EL) and
6130 not ($node->[1] & DIV_EL)) {
6131 !!!cp ('t358');
6132 last LI;
6133 }
6134
6135 !!!cp ('t359');
6136 ## Step 4
6137 $i--;
6138 $node = $self->{open_elements}->[$i];
6139 redo LI;
6140 } # LI
6141
6142 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6143 !!!nack ('t359.1');
6144 !!!next-token;
6145 next B;
6146 } elsif ($token->{tag_name} eq 'plaintext') {
6147 ## has a p element in scope
6148 INSCOPE: for (reverse @{$self->{open_elements}}) {
6149 if ($_->[1] & P_EL) {
6150 !!!cp ('t367');
6151 !!!back-token; # <plaintext>
6152 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6153 line => $token->{line}, column => $token->{column}};
6154 next B;
6155 } elsif ($_->[1] & SCOPING_EL) {
6156 !!!cp ('t368');
6157 last INSCOPE;
6158 }
6159 } # INSCOPE
6160
6161 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6162
6163 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
6164
6165 !!!nack ('t368.1');
6166 !!!next-token;
6167 next B;
6168 } elsif ($token->{tag_name} eq 'a') {
6169 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
6170 my $node = $active_formatting_elements->[$i];
6171 if ($node->[1] & A_EL) {
6172 !!!cp ('t371');
6173 !!!parse-error (type => 'in a:a', token => $token);
6174
6175 !!!back-token; # <a>
6176 $token = {type => END_TAG_TOKEN, tag_name => 'a',
6177 line => $token->{line}, column => $token->{column}};
6178 $formatting_end_tag->($token);
6179
6180 AFE2: for (reverse 0..$#$active_formatting_elements) {
6181 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
6182 !!!cp ('t372');
6183 splice @$active_formatting_elements, $_, 1;
6184 last AFE2;
6185 }
6186 } # AFE2
6187 OE: for (reverse 0..$#{$self->{open_elements}}) {
6188 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
6189 !!!cp ('t373');
6190 splice @{$self->{open_elements}}, $_, 1;
6191 last OE;
6192 }
6193 } # OE
6194 last AFE;
6195 } elsif ($node->[0] eq '#marker') {
6196 !!!cp ('t374');
6197 last AFE;
6198 }
6199 } # AFE
6200
6201 $reconstruct_active_formatting_elements->($insert_to_current);
6202
6203 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6204 push @$active_formatting_elements, $self->{open_elements}->[-1];
6205
6206 !!!nack ('t374.1');
6207 !!!next-token;
6208 next B;
6209 } elsif ($token->{tag_name} eq 'nobr') {
6210 $reconstruct_active_formatting_elements->($insert_to_current);
6211
6212 ## has a |nobr| element in scope
6213 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6214 my $node = $self->{open_elements}->[$_];
6215 if ($node->[1] & NOBR_EL) {
6216 !!!cp ('t376');
6217 !!!parse-error (type => 'in nobr:nobr', token => $token);
6218 !!!back-token; # <nobr>
6219 $token = {type => END_TAG_TOKEN, tag_name => 'nobr',
6220 line => $token->{line}, column => $token->{column}};
6221 next B;
6222 } elsif ($node->[1] & SCOPING_EL) {
6223 !!!cp ('t377');
6224 last INSCOPE;
6225 }
6226 } # INSCOPE
6227
6228 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6229 push @$active_formatting_elements, $self->{open_elements}->[-1];
6230
6231 !!!nack ('t377.1');
6232 !!!next-token;
6233 next B;
6234 } elsif ($token->{tag_name} eq 'button') {
6235 ## has a button element in scope
6236 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6237 my $node = $self->{open_elements}->[$_];
6238 if ($node->[1] & BUTTON_EL) {
6239 !!!cp ('t378');
6240 !!!parse-error (type => 'in button:button', token => $token);
6241 !!!back-token; # <button>
6242 $token = {type => END_TAG_TOKEN, tag_name => 'button',
6243 line => $token->{line}, column => $token->{column}};
6244 next B;
6245 } elsif ($node->[1] & SCOPING_EL) {
6246 !!!cp ('t379');
6247 last INSCOPE;
6248 }
6249 } # INSCOPE
6250
6251 $reconstruct_active_formatting_elements->($insert_to_current);
6252
6253 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6254
6255 ## TODO: associate with $self->{form_element} if defined
6256
6257 push @$active_formatting_elements, ['#marker', ''];
6258
6259 !!!nack ('t379.1');
6260 !!!next-token;
6261 next B;
6262 } elsif ({
6263 xmp => 1,
6264 iframe => 1,
6265 noembed => 1,
6266 noframes => 1,
6267 noscript => 0, ## TODO: 1 if scripting is enabled
6268 }->{$token->{tag_name}}) {
6269 if ($token->{tag_name} eq 'xmp') {
6270 !!!cp ('t381');
6271 $reconstruct_active_formatting_elements->($insert_to_current);
6272 } else {
6273 !!!cp ('t399');
6274 }
6275 ## NOTE: There is an "as if in body" code clone.
6276 $parse_rcdata->(CDATA_CONTENT_MODEL);
6277 next B;
6278 } elsif ($token->{tag_name} eq 'isindex') {
6279 !!!parse-error (type => 'isindex', token => $token);
6280
6281 if (defined $self->{form_element}) {
6282 !!!cp ('t389');
6283 ## Ignore the token
6284 !!!nack ('t389'); ## NOTE: Not acknowledged.
6285 !!!next-token;
6286 next B;
6287 } else {
6288 my $at = $token->{attributes};
6289 my $form_attrs;
6290 $form_attrs->{action} = $at->{action} if $at->{action};
6291 my $prompt_attr = $at->{prompt};
6292 $at->{name} = {name => 'name', value => 'isindex'};
6293 delete $at->{action};
6294 delete $at->{prompt};
6295 my @tokens = (
6296 {type => START_TAG_TOKEN, tag_name => 'form',
6297 attributes => $form_attrs,
6298 line => $token->{line}, column => $token->{column}},
6299 {type => START_TAG_TOKEN, tag_name => 'hr',
6300 line => $token->{line}, column => $token->{column}},
6301 {type => START_TAG_TOKEN, tag_name => 'p',
6302 line => $token->{line}, column => $token->{column}},
6303 {type => START_TAG_TOKEN, tag_name => 'label',
6304 line => $token->{line}, column => $token->{column}},
6305 );
6306 if ($prompt_attr) {
6307 !!!cp ('t390');
6308 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value},
6309 #line => $token->{line}, column => $token->{column},
6310 };
6311 } else {
6312 !!!cp ('t391');
6313 push @tokens, {type => CHARACTER_TOKEN,
6314 data => 'This is a searchable index. Insert your search keywords here: ',
6315 #line => $token->{line}, column => $token->{column},
6316 }; # SHOULD
6317 ## TODO: make this configurable
6318 }
6319 push @tokens,
6320 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at,
6321 line => $token->{line}, column => $token->{column}},
6322 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
6323 {type => END_TAG_TOKEN, tag_name => 'label',
6324 line => $token->{line}, column => $token->{column}},
6325 {type => END_TAG_TOKEN, tag_name => 'p',
6326 line => $token->{line}, column => $token->{column}},
6327 {type => START_TAG_TOKEN, tag_name => 'hr',
6328 line => $token->{line}, column => $token->{column}},
6329 {type => END_TAG_TOKEN, tag_name => 'form',
6330 line => $token->{line}, column => $token->{column}};
6331 !!!nack ('t391.1'); ## NOTE: Not acknowledged.
6332 !!!back-token (@tokens);
6333 !!!next-token;
6334 next B;
6335 }
6336 } elsif ($token->{tag_name} eq 'textarea') {
6337 my $tag_name = $token->{tag_name};
6338 my $el;
6339 !!!create-element ($el, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
6340
6341 ## TODO: $self->{form_element} if defined
6342 $self->{content_model} = RCDATA_CONTENT_MODEL;
6343 delete $self->{escape}; # MUST
6344
6345 $insert->($el);
6346
6347 my $text = '';
6348 !!!nack ('t392.1');
6349 !!!next-token;
6350 if ($token->{type} == CHARACTER_TOKEN) {
6351 $token->{data} =~ s/^\x0A//;
6352 unless (length $token->{data}) {
6353 !!!cp ('t392');
6354 !!!next-token;
6355 } else {
6356 !!!cp ('t393');
6357 }
6358 } else {
6359 !!!cp ('t394');
6360 }
6361 while ($token->{type} == CHARACTER_TOKEN) {
6362 !!!cp ('t395');
6363 $text .= $token->{data};
6364 !!!next-token;
6365 }
6366 if (length $text) {
6367 !!!cp ('t396');
6368 $el->manakai_append_text ($text);
6369 }
6370
6371 $self->{content_model} = PCDATA_CONTENT_MODEL;
6372
6373 if ($token->{type} == END_TAG_TOKEN and
6374 $token->{tag_name} eq $tag_name) {
6375 !!!cp ('t397');
6376 ## Ignore the token
6377 } else {
6378 !!!cp ('t398');
6379 !!!parse-error (type => 'in RCDATA:#'.$token->{type}, token => $token);
6380 }
6381 !!!next-token;
6382 next B;
6383 } elsif ($token->{tag_name} eq 'math' or
6384 $token->{tag_name} eq 'svg') {
6385 $reconstruct_active_formatting_elements->($insert_to_current);
6386
6387 !!!insert-element-f ($token->{tag_name} eq 'math' ? $MML_NS : $SVG_NS, $token);
6388
6389 if ($self->{self_closing}) {
6390 pop @{$self->{open_elements}};
6391 !!!ack ('t398.1');
6392 } else {
6393 !!!cp ('t398.2');
6394 $self->{insertion_mode} |= IN_FOREIGN_CONTENT_IM;
6395 ## NOTE: |<body><math><mi><svg>| -> "in foreign content" insertion
6396 ## mode, "in body" (not "in foreign content") secondary insertion
6397 ## mode, maybe.
6398 }
6399
6400 !!!next-token;
6401 next B;
6402 } elsif ({
6403 caption => 1, col => 1, colgroup => 1, frame => 1,
6404 frameset => 1, head => 1, option => 1, optgroup => 1,
6405 tbody => 1, td => 1, tfoot => 1, th => 1,
6406 thead => 1, tr => 1,
6407 }->{$token->{tag_name}}) {
6408 !!!cp ('t401');
6409 !!!parse-error (type => 'in body:'.$token->{tag_name}, token => $token);
6410 ## Ignore the token
6411 !!!nack ('t401.1'); ## NOTE: |<col/>| or |<frame/>| here is an error.
6412 !!!next-token;
6413 next B;
6414
6415 ## ISSUE: An issue on HTML5 new elements in the spec.
6416 } else {
6417 if ($token->{tag_name} eq 'image') {
6418 !!!cp ('t384');
6419 !!!parse-error (type => 'image', token => $token);
6420 $token->{tag_name} = 'img';
6421 } else {
6422 !!!cp ('t385');
6423 }
6424
6425 ## NOTE: There is an "as if <br>" code clone.
6426 $reconstruct_active_formatting_elements->($insert_to_current);
6427
6428 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6429
6430 if ({
6431 applet => 1, marquee => 1, object => 1,
6432 }->{$token->{tag_name}}) {
6433 !!!cp ('t380');
6434 push @$active_formatting_elements, ['#marker', ''];
6435 !!!nack ('t380.1');
6436 } elsif ({
6437 b => 1, big => 1, em => 1, font => 1, i => 1,
6438 s => 1, small => 1, strile => 1,
6439 strong => 1, tt => 1, u => 1,
6440 }->{$token->{tag_name}}) {
6441 !!!cp ('t375');
6442 push @$active_formatting_elements, $self->{open_elements}->[-1];
6443 !!!nack ('t375.1');
6444 } elsif ($token->{tag_name} eq 'input') {
6445 !!!cp ('t388');
6446 ## TODO: associate with $self->{form_element} if defined
6447 pop @{$self->{open_elements}};
6448 !!!ack ('t388.2');
6449 } elsif ({
6450 area => 1, basefont => 1, bgsound => 1, br => 1,
6451 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
6452 #image => 1,
6453 }->{$token->{tag_name}}) {
6454 !!!cp ('t388.1');
6455 pop @{$self->{open_elements}};
6456 !!!ack ('t388.3');
6457 } elsif ($token->{tag_name} eq 'select') {
6458 ## TODO: associate with $self->{form_element} if defined
6459
6460 if ($self->{insertion_mode} & TABLE_IMS or
6461 $self->{insertion_mode} & BODY_TABLE_IMS or
6462 $self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
6463 !!!cp ('t400.1');
6464 $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM;
6465 } else {
6466 !!!cp ('t400.2');
6467 $self->{insertion_mode} = IN_SELECT_IM;
6468 }
6469 !!!nack ('t400.3');
6470 } else {
6471 !!!nack ('t402');
6472 }
6473
6474 !!!next-token;
6475 next B;
6476 }
6477 } elsif ($token->{type} == END_TAG_TOKEN) {
6478 if ($token->{tag_name} eq 'body') {
6479 ## has a |body| element in scope
6480 my $i;
6481 INSCOPE: {
6482 for (reverse @{$self->{open_elements}}) {
6483 if ($_->[1] & BODY_EL) {
6484 !!!cp ('t405');
6485 $i = $_;
6486 last INSCOPE;
6487 } elsif ($_->[1] & SCOPING_EL) {
6488 !!!cp ('t405.1');
6489 last;
6490 }
6491 }
6492
6493 !!!parse-error (type => 'start tag not allowed',
6494 value => $token->{tag_name}, token => $token);
6495 ## NOTE: Ignore the token.
6496 !!!next-token;
6497 next B;
6498 } # INSCOPE
6499
6500 for (@{$self->{open_elements}}) {
6501 unless ($_->[1] & ALL_END_TAG_OPTIONAL_EL) {
6502 !!!cp ('t403');
6503 !!!parse-error (type => 'not closed',
6504 value => $_->[0]->manakai_local_name,
6505 token => $token);
6506 last;
6507 } else {
6508 !!!cp ('t404');
6509 }
6510 }
6511
6512 $self->{insertion_mode} = AFTER_BODY_IM;
6513 !!!next-token;
6514 next B;
6515 } elsif ($token->{tag_name} eq 'html') {
6516 ## TODO: Update this code. It seems that the code below is not
6517 ## up-to-date, though it has the same effect as specified.
6518 if (@{$self->{open_elements}} > 1 and
6519 $self->{open_elements}->[1]->[1] & BODY_EL) {
6520 ## ISSUE: There is an issue in the spec.
6521 unless ($self->{open_elements}->[-1]->[1] & BODY_EL) {
6522 !!!cp ('t406');
6523 !!!parse-error (type => 'not closed',
6524 value => $self->{open_elements}->[1]->[0]
6525 ->manakai_local_name,
6526 token => $token);
6527 } else {
6528 !!!cp ('t407');
6529 }
6530 $self->{insertion_mode} = AFTER_BODY_IM;
6531 ## reprocess
6532 next B;
6533 } else {
6534 !!!cp ('t408');
6535 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6536 ## Ignore the token
6537 !!!next-token;
6538 next B;
6539 }
6540 } elsif ({
6541 address => 1, blockquote => 1, center => 1, dir => 1,
6542 div => 1, dl => 1, fieldset => 1, listing => 1,
6543 menu => 1, ol => 1, pre => 1, ul => 1,
6544 dd => 1, dt => 1, li => 1,
6545 applet => 1, button => 1, marquee => 1, object => 1,
6546 }->{$token->{tag_name}}) {
6547 ## has an element in scope
6548 my $i;
6549 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6550 my $node = $self->{open_elements}->[$_];
6551 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6552 !!!cp ('t410');
6553 $i = $_;
6554 last INSCOPE;
6555 } elsif ($node->[1] & SCOPING_EL) {
6556 !!!cp ('t411');
6557 last INSCOPE;
6558 }
6559 } # INSCOPE
6560
6561 unless (defined $i) { # has an element in scope
6562 !!!cp ('t413');
6563 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6564 } else {
6565 ## Step 1. generate implied end tags
6566 while ({
6567 dd => ($token->{tag_name} ne 'dd'),
6568 dt => ($token->{tag_name} ne 'dt'),
6569 li => ($token->{tag_name} ne 'li'),
6570 p => 1,
6571 }->{$self->{open_elements}->[-1]->[0]->manakai_local_name}) {
6572 !!!cp ('t409');
6573 pop @{$self->{open_elements}};
6574 }
6575
6576 ## Step 2.
6577 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6578 ne $token->{tag_name}) {
6579 !!!cp ('t412');
6580 !!!parse-error (type => 'not closed',
6581 value => $self->{open_elements}->[-1]->[0]
6582 ->manakai_local_name,
6583 token => $token);
6584 } else {
6585 !!!cp ('t414');
6586 }
6587
6588 ## Step 3.
6589 splice @{$self->{open_elements}}, $i;
6590
6591 ## Step 4.
6592 $clear_up_to_marker->()
6593 if {
6594 applet => 1, button => 1, marquee => 1, object => 1,
6595 }->{$token->{tag_name}};
6596 }
6597 !!!next-token;
6598 next B;
6599 } elsif ($token->{tag_name} eq 'form') {
6600 undef $self->{form_element};
6601
6602 ## has an element in scope
6603 my $i;
6604 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6605 my $node = $self->{open_elements}->[$_];
6606 if ($node->[1] & FORM_EL) {
6607 !!!cp ('t418');
6608 $i = $_;
6609 last INSCOPE;
6610 } elsif ($node->[1] & SCOPING_EL) {
6611 !!!cp ('t419');
6612 last INSCOPE;
6613 }
6614 } # INSCOPE
6615
6616 unless (defined $i) { # has an element in scope
6617 !!!cp ('t421');
6618 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6619 } else {
6620 ## Step 1. generate implied end tags
6621 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
6622 !!!cp ('t417');
6623 pop @{$self->{open_elements}};
6624 }
6625
6626 ## Step 2.
6627 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6628 ne $token->{tag_name}) {
6629 !!!cp ('t417.1');
6630 !!!parse-error (type => 'not closed',
6631 value => $self->{open_elements}->[-1]->[0]
6632 ->manakai_local_name,
6633 token => $token);
6634 } else {
6635 !!!cp ('t420');
6636 }
6637
6638 ## Step 3.
6639 splice @{$self->{open_elements}}, $i;
6640 }
6641
6642 !!!next-token;
6643 next B;
6644 } elsif ({
6645 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6646 }->{$token->{tag_name}}) {
6647 ## has an element in scope
6648 my $i;
6649 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6650 my $node = $self->{open_elements}->[$_];
6651 if ($node->[1] & HEADING_EL) {
6652 !!!cp ('t423');
6653 $i = $_;
6654 last INSCOPE;
6655 } elsif ($node->[1] & SCOPING_EL) {
6656 !!!cp ('t424');
6657 last INSCOPE;
6658 }
6659 } # INSCOPE
6660
6661 unless (defined $i) { # has an element in scope
6662 !!!cp ('t425.1');
6663 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6664 } else {
6665 ## Step 1. generate implied end tags
6666 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
6667 !!!cp ('t422');
6668 pop @{$self->{open_elements}};
6669 }
6670
6671 ## Step 2.
6672 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6673 ne $token->{tag_name}) {
6674 !!!cp ('t425');
6675 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6676 } else {
6677 !!!cp ('t426');
6678 }
6679
6680 ## Step 3.
6681 splice @{$self->{open_elements}}, $i;
6682 }
6683
6684 !!!next-token;
6685 next B;
6686 } elsif ($token->{tag_name} eq 'p') {
6687 ## has an element in scope
6688 my $i;
6689 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6690 my $node = $self->{open_elements}->[$_];
6691 if ($node->[1] & P_EL) {
6692 !!!cp ('t410.1');
6693 $i = $_;
6694 last INSCOPE;
6695 } elsif ($node->[1] & SCOPING_EL) {
6696 !!!cp ('t411.1');
6697 last INSCOPE;
6698 }
6699 } # INSCOPE
6700
6701 if (defined $i) {
6702 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6703 ne $token->{tag_name}) {
6704 !!!cp ('t412.1');
6705 !!!parse-error (type => 'not closed',
6706 value => $self->{open_elements}->[-1]->[0]
6707 ->manakai_local_name,
6708 token => $token);
6709 } else {
6710 !!!cp ('t414.1');
6711 }
6712
6713 splice @{$self->{open_elements}}, $i;
6714 } else {
6715 !!!cp ('t413.1');
6716 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6717
6718 !!!cp ('t415.1');
6719 ## As if <p>, then reprocess the current token
6720 my $el;
6721 !!!create-element ($el, $HTML_NS, 'p',, $token);
6722 $insert->($el);
6723 ## NOTE: Not inserted into |$self->{open_elements}|.
6724 }
6725
6726 !!!next-token;
6727 next B;
6728 } elsif ({
6729 a => 1,
6730 b => 1, big => 1, em => 1, font => 1, i => 1,
6731 nobr => 1, s => 1, small => 1, strile => 1,
6732 strong => 1, tt => 1, u => 1,
6733 }->{$token->{tag_name}}) {
6734 !!!cp ('t427');
6735 $formatting_end_tag->($token);
6736 next B;
6737 } elsif ($token->{tag_name} eq 'br') {
6738 !!!cp ('t428');
6739 !!!parse-error (type => 'unmatched end tag:br', token => $token);
6740
6741 ## As if <br>
6742 $reconstruct_active_formatting_elements->($insert_to_current);
6743
6744 my $el;
6745 !!!create-element ($el, $HTML_NS, 'br',, $token);
6746 $insert->($el);
6747
6748 ## Ignore the token.
6749 !!!next-token;
6750 next B;
6751 } elsif ({
6752 caption => 1, col => 1, colgroup => 1, frame => 1,
6753 frameset => 1, head => 1, option => 1, optgroup => 1,
6754 tbody => 1, td => 1, tfoot => 1, th => 1,
6755 thead => 1, tr => 1,
6756 area => 1, basefont => 1, bgsound => 1,
6757 embed => 1, hr => 1, iframe => 1, image => 1,
6758 img => 1, input => 1, isindex => 1, noembed => 1,
6759 noframes => 1, param => 1, select => 1, spacer => 1,
6760 table => 1, textarea => 1, wbr => 1,
6761 noscript => 0, ## TODO: if scripting is enabled
6762 }->{$token->{tag_name}}) {
6763 !!!cp ('t429');
6764 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6765 ## Ignore the token
6766 !!!next-token;
6767 next B;
6768
6769 ## ISSUE: Issue on HTML5 new elements in spec
6770
6771 } else {
6772 ## Step 1
6773 my $node_i = -1;
6774 my $node = $self->{open_elements}->[$node_i];
6775
6776 ## Step 2
6777 S2: {
6778 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6779 ## Step 1
6780 ## generate implied end tags
6781 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
6782 !!!cp ('t430');
6783 ## ISSUE: Can this case be reached?
6784 pop @{$self->{open_elements}};
6785 }
6786
6787 ## Step 2
6788 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6789 ne $token->{tag_name}) {
6790 !!!cp ('t431');
6791 ## NOTE: <x><y></x>
6792 !!!parse-error (type => 'not closed',
6793 value => $self->{open_elements}->[-1]->[0]
6794 ->manakai_local_name,
6795 token => $token);
6796 } else {
6797 !!!cp ('t432');
6798 }
6799
6800 ## Step 3
6801 splice @{$self->{open_elements}}, $node_i;
6802
6803 !!!next-token;
6804 last S2;
6805 } else {
6806 ## Step 3
6807 if (not ($node->[1] & FORMATTING_EL) and
6808 #not $phrasing_category->{$node->[1]} and
6809 ($node->[1] & SPECIAL_EL or
6810 $node->[1] & SCOPING_EL)) {
6811 !!!cp ('t433');
6812 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6813 ## Ignore the token
6814 !!!next-token;
6815 last S2;
6816 }
6817
6818 !!!cp ('t434');
6819 }
6820
6821 ## Step 4
6822 $node_i--;
6823 $node = $self->{open_elements}->[$node_i];
6824
6825 ## Step 5;
6826 redo S2;
6827 } # S2
6828 next B;
6829 }
6830 }
6831 next B;
6832 } continue { # B
6833 if ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
6834 ## NOTE: The code below is executed in cases where it does not have
6835 ## to be, but it is harmless even in those cases.
6836 ## has an element in scope
6837 INSCOPE: {
6838 for (reverse 0..$#{$self->{open_elements}}) {
6839 my $node = $self->{open_elements}->[$_];
6840 if ($node->[1] & FOREIGN_EL) {
6841 last INSCOPE;
6842 } elsif ($node->[1] & SCOPING_EL) {
6843 last;
6844 }
6845 }
6846
6847 ## NOTE: No foreign element in scope.
6848 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
6849 } # INSCOPE
6850 }
6851 } # B
6852
6853 ## Stop parsing # MUST
6854
6855 ## TODO: script stuffs
6856 } # _tree_construct_main
6857
## Replaces the children of |$node| with the result of parsing a string
## as HTML (the |innerHTML| setter).
##
## Invocation: |$class->set_inner_html ($node, $string, $onerror)|
##
##   $node    - The node whose content is replaced.  Node type 9
##              (document) and node type 1 (element) are handled.
##   $string  - The markup to parse.  It is accessed through a reference
##              to the caller's argument, so it is not copied.
##   $onerror - Optional code reference for parse errors.  For a document
##              node it is handed to |parse_string| unchanged.  For an
##              element node it is called with |line => ..., column =>
##              ...| followed by the error's own name/value pairs; when
##              it is false a default handler |warn|s instead.
##
## Dies if |$node| is of any other node type.
##
## NOTE: The |($$$)| prototype has no effect on method calls.
## NOTE: Lines beginning with |!!!| are template directives, not Perl;
## presumably they are expanded when the |.pm| file is generated, so
## they are kept verbatim, one per line.
sub set_inner_html ($$$) {
  my $class = shift;
  my $node = shift;
  my $s = \$_[0];
  my $onerror = $_[1];

  ## ISSUE: Should {confident} be true?

  my $nt = $node->node_type;
  if ($nt == 9) {
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## Take a snapshot of the child list before removing from it.
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }

    ## Step 3, 4, 5 # MUST
    $class->parse_string ($$s => $node, $onerror);
  } elsif ($nt == 1) {
    ## TODO: If non-html element

    ## NOTE: Most of this code is copied from |parse_string|

    ## Step 1 # MUST
    ## Parse into a fresh, temporary HTML document.
    my $this_doc = $node->owner_document;
    my $doc = $this_doc->implementation->create_document;
    $doc->manakai_is_html (1);
    my $p = $class->new;
    $p->{document} = $doc;

    ## Step 8 # MUST
    my $i = 0; # index of the next unread character of |$$s|
    $p->{line_prev} = $p->{line} = 1;
    $p->{column_prev} = $p->{column} = 0;
    $p->{set_next_char} = sub {
      ## NOTE: This closure deliberately touches the parser only via
      ## |$self|, the object it is invoked with, and never via the
      ## outer |$p|.  Capturing |$p| here would create a reference
      ## cycle (|$p| -> this closure -> |$p|) and the parser together
      ## with its temporary document would never be freed.
      my $self = shift;

      ## Shift the history of previously read characters.
      pop @{$self->{prev_char}};
      unshift @{$self->{prev_char}}, $self->{next_char};

      ## End of input is reported as -1.
      if ($i >= length $$s) {
        $self->{next_char} = -1;
        return;
      }
      $self->{next_char} = ord substr $$s, $i++, 1;

      ($self->{line_prev}, $self->{column_prev})
          = ($self->{line}, $self->{column});
      $self->{column}++;

      if ($self->{next_char} == 0x000A) { # LF
        $self->{line}++;
        $self->{column} = 0;
        !!!cp ('i1');
      } elsif ($self->{next_char} == 0x000D) { # CR
        ## CR and CR LF are both normalized to a single LF.
        $i++ if substr ($$s, $i, 1) eq "\x0A";
        $self->{next_char} = 0x000A; # LF # MUST
        $self->{line}++;
        $self->{column} = 0;
        !!!cp ('i2');
      } elsif ($self->{next_char} > 0x10FFFF) {
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
        !!!cp ('i3');
      } elsif ($self->{next_char} == 0x0000) { # NULL
        !!!cp ('i4');
        !!!parse-error (type => 'NULL');
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
      }
    };
    $p->{prev_char} = [-1, -1, -1];
    $p->{next_char} = -1;

    ## Default error handler: the position of the token, if known, takes
    ## precedence over the current input position.
    my $ponerror = $onerror || sub {
      my (%opt) = @_;
      my $line = $opt{line};
      my $column = $opt{column};
      if (defined $opt{token} and defined $opt{token}->{line}) {
        $line = $opt{token}->{line};
        $column = $opt{token}->{column};
      }
      warn "Parse error ($opt{type}) at line $line column $column\n";
    };
    ## NOTE: This closure captures |$p|; the resulting cycle is broken
    ## explicitly at the end of this branch.
    $p->{parse_error} = sub {
      $ponerror->(line => $p->{line}, column => $p->{column}, @_);
    };

    $p->_initialize_tokenizer;
    $p->_initialize_tree_constructor;

    ## Step 2
    ## The initial content model depends on the context element.
    my $node_ln = $node->manakai_local_name;
    $p->{content_model} = {
      title => RCDATA_CONTENT_MODEL,
      textarea => RCDATA_CONTENT_MODEL,
      style => CDATA_CONTENT_MODEL,
      script => CDATA_CONTENT_MODEL,
      xmp => CDATA_CONTENT_MODEL,
      iframe => CDATA_CONTENT_MODEL,
      noembed => CDATA_CONTENT_MODEL,
      noframes => CDATA_CONTENT_MODEL,
      noscript => CDATA_CONTENT_MODEL,
      plaintext => PLAINTEXT_CONTENT_MODEL,
    }->{$node_ln};
    $p->{content_model} = PCDATA_CONTENT_MODEL
        unless defined $p->{content_model};
    ## ISSUE: What is "the name of the element"? local name?

    $p->{inner_html_node} = [$node, $el_category->{$node_ln}];
    ## TODO: Foreign element OK?

    ## Step 3
    my $root = $doc->create_element_ns ($HTML_NS, [undef, 'html']);

    ## Step 4 # MUST
    $doc->append_child ($root);

    ## Step 5 # MUST
    push @{$p->{open_elements}}, [$root, $el_category->{html}];

    undef $p->{head_element};

    ## Step 6 # MUST
    $p->_reset_insertion_mode;

    ## Step 7 # MUST
    ## The nearest HTML |form| element among the context node and its
    ## ancestors, if any, becomes the form element pointer.
    my $anode = $node;
    AN: while (defined $anode) {
      if ($anode->node_type == 1) {
        my $nsuri = $anode->namespace_uri;
        if (defined $nsuri and $nsuri eq $HTML_NS) {
          if ($anode->manakai_local_name eq 'form') {
            !!!cp ('i5');
            $p->{form_element} = $anode;
            last AN;
          }
        }
      }
      $anode = $anode->parent_node;
    } # AN

    ## Step 9 # MUST
    {
      ## The directive below refers to the parser as |$self|.
      my $self = $p;
      !!!next-token;
    }
    $p->_tree_construction_main;

    ## Step 10 # MUST
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }
    ## ISSUE: mutation events? read-only?

    ## Step 11 # MUST
    ## Move the parsed nodes from the temporary root into |$node|.
    @cn = @{$root->child_nodes};
    for (@cn) {
      $this_doc->adopt_node ($_);
      $node->append_child ($_);
    }
    ## ISSUE: mutation events?

    $p->_terminate_tree_constructor;

    ## Drop the per-parse hooks so that nothing keeps |$p| alive.  The
    ## |parse_error| deletion stays last so that the value of the final
    ## statement of this branch is the same as before.
    delete $p->{set_next_char};
    delete $p->{parse_error}; # delete loop
  } else {
    die "$0: |set_inner_html| is not defined for node of type $nt";
  }
} # set_inner_html
7030
7031 } # tree construction stage
7032
## |Whatpm::HTML::RestartParser| - a class that inherits from |Error|.
## NOTE(review): presumably thrown when the parser has to be restarted;
## confirm against the code that raises it.
package Whatpm::HTML::RestartParser;
our @ISA;
push @ISA, 'Error';

## The module file ends with a true value.
1;
7037 # $Date: 2008/04/12 15:37:33 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24