/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.128 - (show annotations) (download) (as text)
Sat Apr 12 15:31:56 2008 UTC (16 years, 6 months ago) by wakaba
Branch: MAIN
Changes since 1.127: +3 -3 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/ChangeLog	12 Apr 2008 15:31:52 -0000
	* HTML.pm.src: Support for new long MathML entities (HTML5
	revision 1406).

2008-04-13  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.127 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 use Error qw(:try);
5
6 ## ISSUE:
7 ## var doc = implementation.createDocument (null, null, null);
8 ## doc.write ('');
9 ## alert (doc.compatMode);
10
11 ## TODO: Control characters and noncharacters are not allowed (HTML5 revision 1263)
12 ## TODO: 1252 parse error (revision 1264)
13 ## TODO: 8859-11 = 874 (revision 1271)
14
15 my $HTML_NS = q<http://www.w3.org/1999/xhtml>; # XHTML namespace (HTML elements)
16 my $MML_NS = q<http://www.w3.org/1998/Math/MathML>; # MathML namespace
17 my $SVG_NS = q<http://www.w3.org/2000/svg>; # SVG namespace
18 my $XLINK_NS = q<http://www.w3.org/1999/xlink>; # XLink namespace
19 my $XML_NS = q<http://www.w3.org/XML/1998/namespace>; # namespace of the reserved "xml" prefix
20 my $XMLNS_NS = q<http://www.w3.org/2000/xmlns/>; # namespace of the reserved "xmlns" prefix
21
22 sub A_EL () { 0b1 } # Element category flags: each constant is a distinct bit so that categories can be OR-ed into masks
23 sub ADDRESS_EL () { 0b10 }
24 sub BODY_EL () { 0b100 }
25 sub BUTTON_EL () { 0b1000 }
26 sub CAPTION_EL () { 0b10000 }
27 sub DD_EL () { 0b100000 }
28 sub DIV_EL () { 0b1000000 }
29 sub DT_EL () { 0b10000000 }
30 sub FORM_EL () { 0b100000000 }
31 sub FORMATTING_EL () { 0b1000000000 } # a, b, big, em, font, i, nobr, s, small, strike, strong, tt, u
32 sub FRAMESET_EL () { 0b10000000000 }
33 sub HEADING_EL () { 0b100000000000 } # h1 .. h6
34 sub HTML_EL () { 0b1000000000000 }
35 sub LI_EL () { 0b10000000000000 }
36 sub NOBR_EL () { 0b100000000000000 }
37 sub OPTION_EL () { 0b1000000000000000 }
38 sub OPTGROUP_EL () { 0b10000000000000000 }
39 sub P_EL () { 0b100000000000000000 }
40 sub SELECT_EL () { 0b1000000000000000000 }
41 sub TABLE_EL () { 0b10000000000000000000 }
42 sub TABLE_CELL_EL () { 0b100000000000000000000 } # td, th
43 sub TABLE_ROW_EL () { 0b1000000000000000000000 } # tr
44 sub TABLE_ROW_GROUP_EL () { 0b10000000000000000000000 } # tbody, tfoot, thead
45 sub MISC_SCOPING_EL () { 0b100000000000000000000000 } # applet, marquee, object
46 sub MISC_SPECIAL_EL () { 0b1000000000000000000000000 } # remaining entries of $el_category that have no dedicated flag
47 sub FOREIGN_EL () { 0b10000000000000000000000000 } # set on non-HTML elements (see the NOTE in $el_category_f)
48 sub FOREIGN_FLOW_CONTENT_EL () { 0b100000000000000000000000000 } # MathML mi/mo/mn/ms/mtext; SVG foreignobject/desc/title
49 sub MML_AXML_EL () { 0b1000000000000000000000000000 } # MathML annotation-xml
50
51 sub TABLE_ROWS_EL () { # mask: table, tr, or a row group (tbody/tfoot/thead)
52 TABLE_EL |
53 TABLE_ROW_EL |
54 TABLE_ROW_GROUP_EL
55 }
56
57 sub END_TAG_OPTIONAL_EL () { # mask: dd, dt, li, p
58 DD_EL |
59 DT_EL |
60 LI_EL |
61 P_EL
62 }
63
64 sub ALL_END_TAG_OPTIONAL_EL () { # mask: END_TAG_OPTIONAL_EL plus body, html, table cells, rows and row groups
65 END_TAG_OPTIONAL_EL |
66 BODY_EL |
67 HTML_EL |
68 TABLE_CELL_EL |
69 TABLE_ROW_EL |
70 TABLE_ROW_GROUP_EL
71 }
72
73 sub SCOPING_EL () { # mask: button, caption, html, table, td/th and the MISC_SCOPING_EL elements
74 BUTTON_EL |
75 CAPTION_EL |
76 HTML_EL |
77 TABLE_EL |
78 TABLE_CELL_EL |
79 MISC_SCOPING_EL
80 }
81
82 sub TABLE_SCOPING_EL () { # mask: html or table
83 HTML_EL |
84 TABLE_EL
85 }
86
87 sub TABLE_ROWS_SCOPING_EL () { # mask: html or a row group
88 HTML_EL |
89 TABLE_ROW_GROUP_EL
90 }
91
92 sub TABLE_ROW_SCOPING_EL () { # mask: html or tr
93 HTML_EL |
94 TABLE_ROW_EL
95 }
96
97 sub SPECIAL_EL () { # mask: union of the flags listed below (includes dd/dt/li/p through END_TAG_OPTIONAL_EL)
98 ADDRESS_EL |
99 BODY_EL |
100 DIV_EL |
101 END_TAG_OPTIONAL_EL |
102 FORM_EL |
103 FRAMESET_EL |
104 HEADING_EL |
105 OPTION_EL |
106 OPTGROUP_EL |
107 SELECT_EL |
108 TABLE_ROW_EL |
109 TABLE_ROW_GROUP_EL |
110 MISC_SPECIAL_EL
111 }
112
113 my $el_category = { ## HTML element local name => category bits (*_EL flags); names absent from this table have no entry here
114 a => A_EL | FORMATTING_EL,
115 address => ADDRESS_EL,
116 applet => MISC_SCOPING_EL,
117 area => MISC_SPECIAL_EL,
118 b => FORMATTING_EL,
119 base => MISC_SPECIAL_EL,
120 basefont => MISC_SPECIAL_EL,
121 bgsound => MISC_SPECIAL_EL,
122 big => FORMATTING_EL,
123 blockquote => MISC_SPECIAL_EL,
124 body => BODY_EL,
125 br => MISC_SPECIAL_EL,
126 button => BUTTON_EL,
127 caption => CAPTION_EL,
128 center => MISC_SPECIAL_EL,
129 col => MISC_SPECIAL_EL,
130 colgroup => MISC_SPECIAL_EL,
131 dd => DD_EL,
132 dir => MISC_SPECIAL_EL,
133 div => DIV_EL,
134 dl => MISC_SPECIAL_EL,
135 dt => DT_EL,
136 em => FORMATTING_EL,
137 embed => MISC_SPECIAL_EL,
138 fieldset => MISC_SPECIAL_EL,
139 font => FORMATTING_EL,
140 form => FORM_EL,
141 frame => MISC_SPECIAL_EL,
142 frameset => FRAMESET_EL,
143 h1 => HEADING_EL,
144 h2 => HEADING_EL,
145 h3 => HEADING_EL,
146 h4 => HEADING_EL,
147 h5 => HEADING_EL,
148 h6 => HEADING_EL,
149 head => MISC_SPECIAL_EL,
150 hr => MISC_SPECIAL_EL,
151 html => HTML_EL,
152 i => FORMATTING_EL,
153 iframe => MISC_SPECIAL_EL,
154 img => MISC_SPECIAL_EL,
155 input => MISC_SPECIAL_EL,
156 isindex => MISC_SPECIAL_EL,
157 li => LI_EL,
158 link => MISC_SPECIAL_EL,
159 listing => MISC_SPECIAL_EL,
160 marquee => MISC_SCOPING_EL,
161 menu => MISC_SPECIAL_EL,
162 meta => MISC_SPECIAL_EL,
163 nobr => NOBR_EL | FORMATTING_EL, ## like |a|, carries its own flag in addition to the formatting flag
164 noembed => MISC_SPECIAL_EL,
165 noframes => MISC_SPECIAL_EL,
166 noscript => MISC_SPECIAL_EL,
167 object => MISC_SCOPING_EL,
168 ol => MISC_SPECIAL_EL,
169 optgroup => OPTGROUP_EL,
170 option => OPTION_EL,
171 p => P_EL,
172 param => MISC_SPECIAL_EL,
173 plaintext => MISC_SPECIAL_EL,
174 pre => MISC_SPECIAL_EL,
175 s => FORMATTING_EL,
176 script => MISC_SPECIAL_EL,
177 select => SELECT_EL,
178 small => FORMATTING_EL,
179 spacer => MISC_SPECIAL_EL,
180 strike => FORMATTING_EL,
181 strong => FORMATTING_EL,
182 style => MISC_SPECIAL_EL,
183 table => TABLE_EL,
184 tbody => TABLE_ROW_GROUP_EL,
185 td => TABLE_CELL_EL,
186 textarea => MISC_SPECIAL_EL,
187 tfoot => TABLE_ROW_GROUP_EL,
188 th => TABLE_CELL_EL,
189 thead => TABLE_ROW_GROUP_EL,
190 title => MISC_SPECIAL_EL,
191 tr => TABLE_ROW_EL,
192 tt => FORMATTING_EL,
193 u => FORMATTING_EL,
194 ul => MISC_SPECIAL_EL,
195 wbr => MISC_SPECIAL_EL,
196 };
197
198 my $el_category_f = { ## Category bits for foreign (non-HTML) elements: namespace URI => { local name => bits }
199 $MML_NS => {
200 'annotation-xml' => MML_AXML_EL,
201 mi => FOREIGN_FLOW_CONTENT_EL,
202 mo => FOREIGN_FLOW_CONTENT_EL,
203 mn => FOREIGN_FLOW_CONTENT_EL,
204 ms => FOREIGN_FLOW_CONTENT_EL,
205 mtext => FOREIGN_FLOW_CONTENT_EL,
206 },
207 $SVG_NS => {
208 foreignobject => FOREIGN_FLOW_CONTENT_EL, ## TODO: case (the key is all lower-case here; SVG spells the element |foreignObject|)
209 desc => FOREIGN_FLOW_CONTENT_EL,
210 title => FOREIGN_FLOW_CONTENT_EL,
211 },
212 ## NOTE: In addition, FOREIGN_EL is set to non-HTML elements.
213 };
214
215 my $c1_entity_char = { ## Replacement code points for 0x80..0x9F; the values coincide with the Windows-1252 assignments. Presumably applied to numeric character references -- the use site is outside this view
216 0x80 => 0x20AC,
217 0x81 => 0xFFFD, ## no Windows-1252 assignment: U+FFFD REPLACEMENT CHARACTER
218 0x82 => 0x201A,
219 0x83 => 0x0192,
220 0x84 => 0x201E,
221 0x85 => 0x2026,
222 0x86 => 0x2020,
223 0x87 => 0x2021,
224 0x88 => 0x02C6,
225 0x89 => 0x2030,
226 0x8A => 0x0160,
227 0x8B => 0x2039,
228 0x8C => 0x0152,
229 0x8D => 0xFFFD, ## unassigned
230 0x8E => 0x017D,
231 0x8F => 0xFFFD, ## unassigned
232 0x90 => 0xFFFD, ## unassigned
233 0x91 => 0x2018,
234 0x92 => 0x2019,
235 0x93 => 0x201C,
236 0x94 => 0x201D,
237 0x95 => 0x2022,
238 0x96 => 0x2013,
239 0x97 => 0x2014,
240 0x98 => 0x02DC,
241 0x99 => 0x2122,
242 0x9A => 0x0161,
243 0x9B => 0x203A,
244 0x9C => 0x0153,
245 0x9D => 0xFFFD, ## unassigned
246 0x9E => 0x017E,
247 0x9F => 0x0178,
248 }; # $c1_entity_char
249
250 sub parse_byte_string ($$$$;$) { ## ($self_or_class, $charset, $bytes_or_ref, $document [, $onerror]): decode the bytes, then parse; returns what |parse_char_string| returns
251 my $self = ref $_[0] ? shift : shift->new; ## usable as an instance method or as a class method
252 my $charset = shift; ## encoding label, or undef to have the encoding sniffed
253 my $bytes_s = ref $_[0] ? $_[0] : \($_[0]); ## accept the byte string itself or a reference to it
254 my $s;
255 require Encode; ## loaded up front: every branch below, including the restart handler, calls Encode::decode
256 if (defined $charset) {
257 ## TODO: decode(utf8) don't delete BOM
258 $s = \ (Encode::decode ($charset, $$bytes_s));
259 $self->{input_encoding} = lc $charset; ## TODO: normalize name
260 $self->{confident} = 1; ## caller-supplied encoding is trusted
261 } else {
262 ## TODO: Implement HTML5 detection algorithm
263 require Whatpm::Charset::UniversalCharDet;
264 $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string
265 (substr ($$bytes_s, 0, 1024)); ## only the first 1024 bytes are sniffed
266 $charset ||= 'windows-1252'; ## fallback when detection yields nothing
267 $s = \ (Encode::decode ($charset, $$bytes_s));
268 $self->{input_encoding} = lc $charset; ## lower-cased like the other paths so Step 2 below can match it; TODO: normalize name
269 $self->{confident} = 0; ## tentative: a later charset declaration may override it
270 }
271
272 $self->{change_encoding} = sub { ## callback ($self, $charset, $token); the trigger is outside this view
273 my $self = shift;
274 my $charset = lc shift;
275 my $token = shift;
276 ## TODO: if $charset is supported
277 ## TODO: normalize charset name
278
279 ## "Change the encoding" algorithm:
280
281 ## Step 1
282 if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
283 $charset = 'utf-8';
284 }
285
286 ## Step 2
287 if (defined $self->{input_encoding} and
288 $self->{input_encoding} eq $charset) {
289 $self->{confident} = 1; ## declared encoding agrees with the one in use
290 return;
291 }
292
293 !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
294 ':'.$charset, level => 'w', token => $token);
295
296 ## Step 3
297 # if (can) {
298 ## change the encoding on the fly.
299 #$self->{confident} = 1;
300 #return;
301 # }
302
303 ## Step 4
304 throw Whatpm::HTML::RestartParser (charset => $charset); ## caught by the |try| block below
305 }; # $self->{change_encoding}
306
307 my @args = @_; shift @args; # $s
308 my $return;
309 try {
310 $return = $self->parse_char_string ($s, @args);
311 } catch Whatpm::HTML::RestartParser with { ## re-decode with the declared encoding and parse again from the start
312 my $charset = shift->{charset};
313 $s = \ (Encode::decode ($charset, $$bytes_s));
314 $self->{input_encoding} = $charset; ## TODO: normalize
315 $self->{confident} = 1;
316 $return = $self->parse_char_string ($s, @args);
317 };
318 return $return;
319 } # parse_byte_string
320
321 ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
322 ## and the HTML layer MUST ignore it. However, we do strip the BOM in
323 ## the encoding layer and the HTML layer does not ignore any U+FEFF,
324 ## because the core part of our HTML parser expects a string of characters,
325 ## not a string of bytes or code units or anything which might contain a BOM.
326 ## Therefore, any parser interface that accepts a string of bytes,
327 ## such as |parse_byte_string| in this module, must ensure that it does
328 ## strip the BOM and never strips any ZWNBSP.
329
330 *parse_char_string = \&parse_string; ## |parse_char_string| is an alias of |parse_string|
331
332 sub parse_string ($$$;$) { ## ($self_or_class, $string_or_ref, $document [, $onerror]): parse a character string into $document and return it
333 my $self = ref $_[0] ? shift : shift->new; ## usable as an instance method or as a class method
334 my $s = ref $_[0] ? $_[0] : \($_[0]); ## always work through a reference to the input string
335 $self->{document} = $_[1];
336 @{$self->{document}->child_nodes} = (); ## empty the document's child node list before parsing
337
338 ## NOTE: |set_inner_html| copies most of this method's code
339
340 $self->{confident} = 1 unless exists $self->{confident}; ## keep a value already set by |parse_byte_string|
341 $self->{document}->input_encoding ($self->{input_encoding})
342 if defined $self->{input_encoding};
343
344 my $i = 0; ## index of the next unread character in $$s
345 $self->{line_prev} = $self->{line} = 1;
346 $self->{column_prev} = $self->{column} = 0;
347 $self->{set_next_char} = sub { ## input callback: advances one character and stores its code point in {next_char}
348 my $self = shift;
349
350 pop @{$self->{prev_char}}; ## {prev_char} holds the three most recent characters, newest first
351 unshift @{$self->{prev_char}}, $self->{next_char};
352
353 $self->{next_char} = -1 and return if $i >= length $$s; ## end of input is reported as -1
354 $self->{next_char} = ord substr $$s, $i++, 1;
355
356 ($self->{line_prev}, $self->{column_prev})
357 = ($self->{line}, $self->{column}); ## remember the position of the previous character
358 $self->{column}++;
359
360 if ($self->{next_char} == 0x000A) { # LF
361 $self->{line}++;
362 $self->{column} = 0;
363 } elsif ($self->{next_char} == 0x000D) { # CR
364 $i++ if substr ($$s, $i, 1) eq "\x0A"; ## CRLF: skip the LF so the pair yields a single newline
365 $self->{next_char} = 0x000A; # LF # MUST
366 $self->{line}++;
367 $self->{column} = 0;
368 } elsif ($self->{next_char} > 0x10FFFF) { ## above the Unicode code point range
369 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
370 } elsif ($self->{next_char} == 0x0000) { # NULL
371 !!!parse-error (type => 'NULL');
372 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
373 }
374 };
375 $self->{prev_char} = [-1, -1, -1];
376 $self->{next_char} = -1;
377
378 my $onerror = $_[2] || sub { ## default error handler: warn with the error type and position
379 my (%opt) = @_;
380 my $line = $opt{token} ? $opt{token}->{line} : $opt{line}; ## a token's own position wins when a token is supplied
381 my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
382 warn "Parse error ($opt{type}) at line $line column $column\n";
383 };
384 $self->{parse_error} = sub { ## current position goes first, so line/column supplied by the caller come later in the argument list
385 $onerror->(line => $self->{line}, column => $self->{column}, @_);
386 };
387
388 $self->_initialize_tokenizer;
389 $self->_initialize_tree_constructor;
390 $self->_construct_tree;
391 $self->_terminate_tree_constructor;
392
393 delete $self->{parse_error}; # remove loop
394
395 return $self->{document};
396 } # parse_string
397
398 sub new ($) { ## Constructor: returns a new parser object whose callbacks are placeholders
399 my $class = shift;
400 my $self = bless {}, $class;
401 $self->{set_next_char} = sub { ## placeholder input callback: always reports end of input
402 $self->{next_char} = -1; ## NOTE(review): this closure captures $self, so the object refers to itself until |parse_string| replaces the callback -- confirm whether that cycle matters
403 };
404 $self->{parse_error} = sub { ## placeholder: parse errors are ignored
405 #
406 };
407 $self->{change_encoding} = sub { ## placeholder: does nothing; |parse_byte_string| installs a working version
408 # if ($_[0] is a supported encoding) {
409 # run "change the encoding" algorithm;
410 # throw Whatpm::HTML::RestartParser (charset => $new_encoding);
411 # }
412 };
413 $self->{application_cache_selection} = sub { ## placeholder: does nothing
414 #
415 };
416 return $self;
417 } # new
418
419 sub CM_ENTITY () { 0b001 } # & markup in data
420 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
421 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
422
423 sub PLAINTEXT_CONTENT_MODEL () { 0 } # no CM_* bit set
424 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited "<" markup only, no "&"
425 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # "&" plus limited "<" markup
426 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # "&" plus any "<" markup
427
428 sub DATA_STATE () { 0 } # Tokenizer states: the values stored in $self->{state} and tested by |_get_next_token|
429 sub ENTITY_DATA_STATE () { 1 }
430 sub TAG_OPEN_STATE () { 2 }
431 sub CLOSE_TAG_OPEN_STATE () { 3 }
432 sub TAG_NAME_STATE () { 4 }
433 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
434 sub ATTRIBUTE_NAME_STATE () { 6 }
435 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
436 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
437 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
438 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
439 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
440 sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
441 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
442 sub COMMENT_START_STATE () { 14 }
443 sub COMMENT_START_DASH_STATE () { 15 }
444 sub COMMENT_STATE () { 16 }
445 sub COMMENT_END_STATE () { 17 }
446 sub COMMENT_END_DASH_STATE () { 18 }
447 sub BOGUS_COMMENT_STATE () { 19 }
448 sub DOCTYPE_STATE () { 20 }
449 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
450 sub DOCTYPE_NAME_STATE () { 22 }
451 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
452 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
453 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
454 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
455 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
456 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
457 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
458 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
459 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
460 sub BOGUS_DOCTYPE_STATE () { 32 }
461 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 } # 33..35 are numbered out of sequence with the states they relate to
462 sub SELF_CLOSING_START_TAG_STATE () { 34 }
463 sub CDATA_BLOCK_STATE () { 35 }
464
465 sub DOCTYPE_TOKEN () { 1 } # Token types: the values stored in a token's {type}
466 sub COMMENT_TOKEN () { 2 }
467 sub START_TAG_TOKEN () { 3 }
468 sub END_TAG_TOKEN () { 4 }
469 sub END_OF_FILE_TOKEN () { 5 }
470 sub CHARACTER_TOKEN () { 6 }
471
472 sub AFTER_HTML_IMS () { 0b100 } # *_IMS constants are group bits; the *_IM constants below combine group bits with a two-bit discriminator
473 sub HEAD_IMS () { 0b1000 }
474 sub BODY_IMS () { 0b10000 }
475 sub BODY_TABLE_IMS () { 0b100000 }
476 sub TABLE_IMS () { 0b1000000 }
477 sub ROW_IMS () { 0b10000000 }
478 sub BODY_AFTER_IMS () { 0b100000000 }
479 sub FRAME_IMS () { 0b1000000000 }
480 sub SELECT_IMS () { 0b10000000000 }
481 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
482 ## NOTE: "in foreign content" insertion mode is special; it is combined
483 ## with the secondary insertion mode. In this parser, they are stored
484 ## together in the bit-or'ed form.
485
486 ## NOTE: "initial" and "before html" insertion modes have no constants.
487
488 ## NOTE: "after after body" insertion mode.
489 sub AFTER_HTML_BODY_IM () { AFTER_HTML_IMS | BODY_AFTER_IMS }
490
491 ## NOTE: "after after frameset" insertion mode.
492 sub AFTER_HTML_FRAMESET_IM () { AFTER_HTML_IMS | FRAME_IMS }
493
494 sub IN_HEAD_IM () { HEAD_IMS | 0b00 }
495 sub IN_HEAD_NOSCRIPT_IM () { HEAD_IMS | 0b01 }
496 sub AFTER_HEAD_IM () { HEAD_IMS | 0b10 }
497 sub BEFORE_HEAD_IM () { HEAD_IMS | 0b11 }
498 sub IN_BODY_IM () { BODY_IMS }
499 sub IN_CELL_IM () { BODY_IMS | BODY_TABLE_IMS | 0b01 }
500 sub IN_CAPTION_IM () { BODY_IMS | BODY_TABLE_IMS | 0b10 }
501 sub IN_ROW_IM () { TABLE_IMS | ROW_IMS | 0b01 }
502 sub IN_TABLE_BODY_IM () { TABLE_IMS | ROW_IMS | 0b10 }
503 sub IN_TABLE_IM () { TABLE_IMS }
504 sub AFTER_BODY_IM () { BODY_AFTER_IMS }
505 sub IN_FRAMESET_IM () { FRAME_IMS | 0b01 }
506 sub AFTER_FRAMESET_IM () { FRAME_IMS | 0b10 }
507 sub IN_SELECT_IM () { SELECT_IMS | 0b01 }
508 sub IN_SELECT_IN_TABLE_IM () { SELECT_IMS | 0b10 }
509 sub IN_COLUMN_GROUP_IM () { 0b10 } # the only mode here that carries no group bit
510
511 ## Implementations MUST act as if they used the state machine described in the spec.
512
513 sub _initialize_tokenizer ($) { ## Reset the tokenizer fields of $self; called by |parse_string| before tree construction
514 my $self = shift;
515 $self->{state} = DATA_STATE; # MUST
516 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
517 undef $self->{current_token}; # start tag, end tag, comment, or DOCTYPE
518 undef $self->{current_attribute};
519 undef $self->{last_emitted_start_tag_name};
520 undef $self->{last_attribute_value_state};
521 delete $self->{self_closing};
522 $self->{char} = [];
523 # $self->{next_char}
524 !!!next-input-character; ## presumably a macro expanded by the .src preprocessor that loads the first character into {next_char} -- confirm
525 $self->{token} = []; ## queue of pending tokens; |_get_next_token| returns from it first when it is non-empty
526 # $self->{escape}
527 } # _initialize_tokenizer
528
529 ## A token has:
530 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
531 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
532 ## ->{name} (DOCTYPE_TOKEN)
533 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
534 ## ->{public_identifier} (DOCTYPE_TOKEN)
535 ## ->{system_identifier} (DOCTYPE_TOKEN)
536 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
537 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
538 ## ->{name}
539 ## ->{value}
540 ## ->{has_reference} == 1 or 0
541 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
542 ## NOTE: The "self-closing flag" is held in |$self->{self_closing}|.
543 ## A token's own |->{self_closing}| is used to save the value of |$self->{self_closing}|
544 ## while the token is pushed back to the stack.
545
546 ## ISSUE: "When a DOCTYPE token is created, its
547 ## <i>self-closing flag</i> must be unset (its other state is that it
548 ## be set), and its attributes list must be empty.": Wrong subject?
549
550 ## Emitted token MUST immediately be handled by the tree construction state.
551
552 ## Before each step, UA MAY check to see if either one of the scripts in
553 ## "list of scripts that will execute as soon as possible" or the first
554 ## script in the "list of scripts that will execute asynchronously",
555 ## has completed loading. If one has, then it MUST be executed
556 ## and removed from the list.
557
558 ## NOTE: HTML5 "Writing HTML documents" section, applied to
559 ## documents and not to user agents and conformance checkers,
560 ## contains some requirements that are not detected by the
561 ## parsing algorithm:
562 ## - Some requirements on character encoding declarations. ## TODO
563 ## - "Elements MUST NOT contain content that their content model disallows."
564 ## ... Some are parse error, some are not (will be reported by c.c.).
565 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
566 ## - Text (in elements, attributes, and comments) SHOULD NOT contain
567 ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
568
569 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
570 ## be detected by the HTML5 parsing algorithm:
571 ## - Text,
572
573 sub _get_next_token ($) {
574 my $self = shift;
575
576 if ($self->{self_closing}) {
577 !!!parse-error (type => 'nestc', token => $self->{current_token});
578 ## NOTE: The |self_closing| flag is only set by start tag token.
579 ## In addition, when a start tag token is emitted, it is always set to
580 ## |current_token|.
581 delete $self->{self_closing};
582 }
583
584 if (@{$self->{token}}) {
585 $self->{self_closing} = $self->{token}->[0]->{self_closing};
586 return shift @{$self->{token}};
587 }
588
589 A: {
590 if ($self->{state} == DATA_STATE) {
591 if ($self->{next_char} == 0x0026) { # &
592 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
593 not $self->{escape}) {
594 !!!cp (1);
595 $self->{state} = ENTITY_DATA_STATE;
596 !!!next-input-character;
597 redo A;
598 } else {
599 !!!cp (2);
600 #
601 }
602 } elsif ($self->{next_char} == 0x002D) { # -
603 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
604 unless ($self->{escape}) {
605 if ($self->{prev_char}->[0] == 0x002D and # -
606 $self->{prev_char}->[1] == 0x0021 and # !
607 $self->{prev_char}->[2] == 0x003C) { # <
608 !!!cp (3);
609 $self->{escape} = 1;
610 } else {
611 !!!cp (4);
612 }
613 } else {
614 !!!cp (5);
615 }
616 }
617
618 #
619 } elsif ($self->{next_char} == 0x003C) { # <
620 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
621 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
622 not $self->{escape})) {
623 !!!cp (6);
624 $self->{state} = TAG_OPEN_STATE;
625 !!!next-input-character;
626 redo A;
627 } else {
628 !!!cp (7);
629 #
630 }
631 } elsif ($self->{next_char} == 0x003E) { # >
632 if ($self->{escape} and
633 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
634 if ($self->{prev_char}->[0] == 0x002D and # -
635 $self->{prev_char}->[1] == 0x002D) { # -
636 !!!cp (8);
637 delete $self->{escape};
638 } else {
639 !!!cp (9);
640 }
641 } else {
642 !!!cp (10);
643 }
644
645 #
646 } elsif ($self->{next_char} == -1) {
647 !!!cp (11);
648 !!!emit ({type => END_OF_FILE_TOKEN,
649 line => $self->{line}, column => $self->{column}});
650 last A; ## TODO: ok?
651 } else {
652 !!!cp (12);
653 }
654 # Anything else
655 my $token = {type => CHARACTER_TOKEN,
656 data => chr $self->{next_char},
657 line => $self->{line}, column => $self->{column},
658 };
659 ## Stay in the data state
660 !!!next-input-character;
661
662 !!!emit ($token);
663
664 redo A;
665 } elsif ($self->{state} == ENTITY_DATA_STATE) {
666 ## (cannot happen in CDATA state)
667
668 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
669
670 my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
671
672 $self->{state} = DATA_STATE;
673 # next-input-character is already done
674
675 unless (defined $token) {
676 !!!cp (13);
677 !!!emit ({type => CHARACTER_TOKEN, data => '&',
678 line => $l, column => $c,
679 });
680 } else {
681 !!!cp (14);
682 !!!emit ($token);
683 }
684
685 redo A;
686 } elsif ($self->{state} == TAG_OPEN_STATE) {
687 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
688 if ($self->{next_char} == 0x002F) { # /
689 !!!cp (15);
690 !!!next-input-character;
691 $self->{state} = CLOSE_TAG_OPEN_STATE;
692 redo A;
693 } else {
694 !!!cp (16);
695 ## reconsume
696 $self->{state} = DATA_STATE;
697
698 !!!emit ({type => CHARACTER_TOKEN, data => '<',
699 line => $self->{line_prev},
700 column => $self->{column_prev},
701 });
702
703 redo A;
704 }
705 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
706 if ($self->{next_char} == 0x0021) { # !
707 !!!cp (17);
708 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
709 !!!next-input-character;
710 redo A;
711 } elsif ($self->{next_char} == 0x002F) { # /
712 !!!cp (18);
713 $self->{state} = CLOSE_TAG_OPEN_STATE;
714 !!!next-input-character;
715 redo A;
716 } elsif (0x0041 <= $self->{next_char} and
717 $self->{next_char} <= 0x005A) { # A..Z
718 !!!cp (19);
719 $self->{current_token}
720 = {type => START_TAG_TOKEN,
721 tag_name => chr ($self->{next_char} + 0x0020),
722 line => $self->{line_prev},
723 column => $self->{column_prev}};
724 $self->{state} = TAG_NAME_STATE;
725 !!!next-input-character;
726 redo A;
727 } elsif (0x0061 <= $self->{next_char} and
728 $self->{next_char} <= 0x007A) { # a..z
729 !!!cp (20);
730 $self->{current_token} = {type => START_TAG_TOKEN,
731 tag_name => chr ($self->{next_char}),
732 line => $self->{line_prev},
733 column => $self->{column_prev}};
734 $self->{state} = TAG_NAME_STATE;
735 !!!next-input-character;
736 redo A;
737 } elsif ($self->{next_char} == 0x003E) { # >
738 !!!cp (21);
739 !!!parse-error (type => 'empty start tag',
740 line => $self->{line_prev},
741 column => $self->{column_prev});
742 $self->{state} = DATA_STATE;
743 !!!next-input-character;
744
745 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
746 line => $self->{line_prev},
747 column => $self->{column_prev},
748 });
749
750 redo A;
751 } elsif ($self->{next_char} == 0x003F) { # ?
752 !!!cp (22);
753 !!!parse-error (type => 'pio',
754 line => $self->{line_prev},
755 column => $self->{column_prev});
756 $self->{state} = BOGUS_COMMENT_STATE;
757 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
758 line => $self->{line_prev},
759 column => $self->{column_prev},
760 };
761 ## $self->{next_char} is intentionally left as is
762 redo A;
763 } else {
764 !!!cp (23);
765 !!!parse-error (type => 'bare stago');
766 $self->{state} = DATA_STATE;
767 ## reconsume
768
769 !!!emit ({type => CHARACTER_TOKEN, data => '<',
770 line => $self->{line_prev},
771 column => $self->{column_prev},
772 });
773
774 redo A;
775 }
776 } else {
777 die "$0: $self->{content_model} in tag open";
778 }
779 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
780 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
781 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
782 if (defined $self->{last_emitted_start_tag_name}) {
783
784 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
785 my @next_char;
786 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
787 push @next_char, $self->{next_char};
788 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
789 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
790 if ($self->{next_char} == $c or $self->{next_char} == $C) {
791 !!!cp (24);
792 !!!next-input-character;
793 next TAGNAME;
794 } else {
795 !!!cp (25);
796 $self->{next_char} = shift @next_char; # reconsume
797 !!!back-next-input-character (@next_char);
798 $self->{state} = DATA_STATE;
799
800 !!!emit ({type => CHARACTER_TOKEN, data => '</',
801 line => $l, column => $c,
802 });
803
804 redo A;
805 }
806 }
807 push @next_char, $self->{next_char};
808
809 unless ($self->{next_char} == 0x0009 or # HT
810 $self->{next_char} == 0x000A or # LF
811 $self->{next_char} == 0x000B or # VT
812 $self->{next_char} == 0x000C or # FF
813 $self->{next_char} == 0x0020 or # SP
814 $self->{next_char} == 0x003E or # >
815 $self->{next_char} == 0x002F or # /
816 $self->{next_char} == -1) {
817 !!!cp (26);
818 $self->{next_char} = shift @next_char; # reconsume
819 !!!back-next-input-character (@next_char);
820 $self->{state} = DATA_STATE;
821 !!!emit ({type => CHARACTER_TOKEN, data => '</',
822 line => $l, column => $c,
823 });
824 redo A;
825 } else {
826 !!!cp (27);
827 $self->{next_char} = shift @next_char;
828 !!!back-next-input-character (@next_char);
829 # and consume...
830 }
831 } else {
832 ## No start tag token has ever been emitted
833 !!!cp (28);
834 # next-input-character is already done
835 $self->{state} = DATA_STATE;
836 !!!emit ({type => CHARACTER_TOKEN, data => '</',
837 line => $l, column => $c,
838 });
839 redo A;
840 }
841 }
842
843 if (0x0041 <= $self->{next_char} and
844 $self->{next_char} <= 0x005A) { # A..Z
845 !!!cp (29);
846 $self->{current_token}
847 = {type => END_TAG_TOKEN,
848 tag_name => chr ($self->{next_char} + 0x0020),
849 line => $l, column => $c};
850 $self->{state} = TAG_NAME_STATE;
851 !!!next-input-character;
852 redo A;
853 } elsif (0x0061 <= $self->{next_char} and
854 $self->{next_char} <= 0x007A) { # a..z
855 !!!cp (30);
856 $self->{current_token} = {type => END_TAG_TOKEN,
857 tag_name => chr ($self->{next_char}),
858 line => $l, column => $c};
859 $self->{state} = TAG_NAME_STATE;
860 !!!next-input-character;
861 redo A;
862 } elsif ($self->{next_char} == 0x003E) { # >
863 !!!cp (31);
864 !!!parse-error (type => 'empty end tag',
865 line => $self->{line_prev}, ## "<" in "</>"
866 column => $self->{column_prev} - 1);
867 $self->{state} = DATA_STATE;
868 !!!next-input-character;
869 redo A;
870 } elsif ($self->{next_char} == -1) {
871 !!!cp (32);
872 !!!parse-error (type => 'bare etago');
873 $self->{state} = DATA_STATE;
874 # reconsume
875
876 !!!emit ({type => CHARACTER_TOKEN, data => '</',
877 line => $l, column => $c,
878 });
879
880 redo A;
881 } else {
882 !!!cp (33);
883 !!!parse-error (type => 'bogus end tag');
884 $self->{state} = BOGUS_COMMENT_STATE;
885 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
886 line => $self->{line_prev}, # "<" of "</"
887 column => $self->{column_prev} - 1,
888 };
889 ## $self->{next_char} is intentionally left as is
890 redo A;
891 }
892 } elsif ($self->{state} == TAG_NAME_STATE) {
893 if ($self->{next_char} == 0x0009 or # HT
894 $self->{next_char} == 0x000A or # LF
895 $self->{next_char} == 0x000B or # VT
896 $self->{next_char} == 0x000C or # FF
897 $self->{next_char} == 0x0020) { # SP
898 !!!cp (34);
899 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
900 !!!next-input-character;
901 redo A;
902 } elsif ($self->{next_char} == 0x003E) { # >
903 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
904 !!!cp (35);
905 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
906 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
907 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
908 #if ($self->{current_token}->{attributes}) {
909 # ## NOTE: This should never be reached.
910 # !!! cp (36);
911 # !!! parse-error (type => 'end tag attribute');
912 #} else {
913 !!!cp (37);
914 #}
915 } else {
916 die "$0: $self->{current_token}->{type}: Unknown token type";
917 }
918 $self->{state} = DATA_STATE;
919 !!!next-input-character;
920
921 !!!emit ($self->{current_token}); # start tag or end tag
922
923 redo A;
924 } elsif (0x0041 <= $self->{next_char} and
925 $self->{next_char} <= 0x005A) { # A..Z
926 !!!cp (38);
927 $self->{current_token}->{tag_name} .= chr ($self->{next_char} + 0x0020);
928 # start tag or end tag
929 ## Stay in this state
930 !!!next-input-character;
931 redo A;
932 } elsif ($self->{next_char} == -1) {
933 !!!parse-error (type => 'unclosed tag');
934 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
935 !!!cp (39);
936 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
937 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
938 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
939 #if ($self->{current_token}->{attributes}) {
940 # ## NOTE: This state should never be reached.
941 # !!! cp (40);
942 # !!! parse-error (type => 'end tag attribute');
943 #} else {
944 !!!cp (41);
945 #}
946 } else {
947 die "$0: $self->{current_token}->{type}: Unknown token type";
948 }
949 $self->{state} = DATA_STATE;
950 # reconsume
951
952 !!!emit ($self->{current_token}); # start tag or end tag
953
954 redo A;
955 } elsif ($self->{next_char} == 0x002F) { # /
956 !!!cp (42);
957 $self->{state} = SELF_CLOSING_START_TAG_STATE;
958 !!!next-input-character;
959 redo A;
960 } else {
961 !!!cp (44);
962 $self->{current_token}->{tag_name} .= chr $self->{next_char};
963 # start tag or end tag
964 ## Stay in the state
965 !!!next-input-character;
966 redo A;
967 }
968 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
969 if ($self->{next_char} == 0x0009 or # HT
970 $self->{next_char} == 0x000A or # LF
971 $self->{next_char} == 0x000B or # VT
972 $self->{next_char} == 0x000C or # FF
973 $self->{next_char} == 0x0020) { # SP
974 !!!cp (45);
975 ## Stay in the state
976 !!!next-input-character;
977 redo A;
978 } elsif ($self->{next_char} == 0x003E) { # >
979 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
980 !!!cp (46);
981 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
982 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
983 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
984 if ($self->{current_token}->{attributes}) {
985 !!!cp (47);
986 !!!parse-error (type => 'end tag attribute');
987 } else {
988 !!!cp (48);
989 }
990 } else {
991 die "$0: $self->{current_token}->{type}: Unknown token type";
992 }
993 $self->{state} = DATA_STATE;
994 !!!next-input-character;
995
996 !!!emit ($self->{current_token}); # start tag or end tag
997
998 redo A;
999 } elsif (0x0041 <= $self->{next_char} and
1000 $self->{next_char} <= 0x005A) { # A..Z
1001 !!!cp (49);
1002 $self->{current_attribute}
1003 = {name => chr ($self->{next_char} + 0x0020),
1004 value => '',
1005 line => $self->{line}, column => $self->{column}};
1006 $self->{state} = ATTRIBUTE_NAME_STATE;
1007 !!!next-input-character;
1008 redo A;
1009 } elsif ($self->{next_char} == 0x002F) { # /
1010 !!!cp (50);
1011 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1012 !!!next-input-character;
1013 redo A;
1014 } elsif ($self->{next_char} == -1) {
1015 !!!parse-error (type => 'unclosed tag');
1016 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1017 !!!cp (52);
1018 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1019 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1020 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1021 if ($self->{current_token}->{attributes}) {
1022 !!!cp (53);
1023 !!!parse-error (type => 'end tag attribute');
1024 } else {
1025 !!!cp (54);
1026 }
1027 } else {
1028 die "$0: $self->{current_token}->{type}: Unknown token type";
1029 }
1030 $self->{state} = DATA_STATE;
1031 # reconsume
1032
1033 !!!emit ($self->{current_token}); # start tag or end tag
1034
1035 redo A;
1036 } else {
1037 if ({
1038 0x0022 => 1, # "
1039 0x0027 => 1, # '
1040 0x003D => 1, # =
1041 }->{$self->{next_char}}) {
1042 !!!cp (55);
1043 !!!parse-error (type => 'bad attribute name');
1044 } else {
1045 !!!cp (56);
1046 }
1047 $self->{current_attribute}
1048 = {name => chr ($self->{next_char}),
1049 value => '',
1050 line => $self->{line}, column => $self->{column}};
1051 $self->{state} = ATTRIBUTE_NAME_STATE;
1052 !!!next-input-character;
1053 redo A;
1054 }
1055 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1056 my $before_leave = sub {
1057 if (exists $self->{current_token}->{attributes} # start tag or end tag
1058 ->{$self->{current_attribute}->{name}}) { # MUST
1059 !!!cp (57);
1060 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name}, line => $self->{current_attribute}->{line}, column => $self->{current_attribute}->{column});
1061 ## Discard $self->{current_attribute} # MUST
1062 } else {
1063 !!!cp (58);
1064 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
1065 = $self->{current_attribute};
1066 }
1067 }; # $before_leave
1068
1069 if ($self->{next_char} == 0x0009 or # HT
1070 $self->{next_char} == 0x000A or # LF
1071 $self->{next_char} == 0x000B or # VT
1072 $self->{next_char} == 0x000C or # FF
1073 $self->{next_char} == 0x0020) { # SP
1074 !!!cp (59);
1075 $before_leave->();
1076 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1077 !!!next-input-character;
1078 redo A;
1079 } elsif ($self->{next_char} == 0x003D) { # =
1080 !!!cp (60);
1081 $before_leave->();
1082 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1083 !!!next-input-character;
1084 redo A;
1085 } elsif ($self->{next_char} == 0x003E) { # >
1086 $before_leave->();
1087 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1088 !!!cp (61);
1089 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1090 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1091 !!!cp (62);
1092 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1093 if ($self->{current_token}->{attributes}) {
1094 !!!parse-error (type => 'end tag attribute');
1095 }
1096 } else {
1097 die "$0: $self->{current_token}->{type}: Unknown token type";
1098 }
1099 $self->{state} = DATA_STATE;
1100 !!!next-input-character;
1101
1102 !!!emit ($self->{current_token}); # start tag or end tag
1103
1104 redo A;
1105 } elsif (0x0041 <= $self->{next_char} and
1106 $self->{next_char} <= 0x005A) { # A..Z
1107 !!!cp (63);
1108 $self->{current_attribute}->{name} .= chr ($self->{next_char} + 0x0020);
1109 ## Stay in the state
1110 !!!next-input-character;
1111 redo A;
1112 } elsif ($self->{next_char} == 0x002F) { # /
1113 !!!cp (64);
1114 $before_leave->();
1115 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1116 !!!next-input-character;
1117 redo A;
1118 } elsif ($self->{next_char} == -1) {
1119 !!!parse-error (type => 'unclosed tag');
1120 $before_leave->();
1121 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1122 !!!cp (66);
1123 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1124 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1125 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1126 if ($self->{current_token}->{attributes}) {
1127 !!!cp (67);
1128 !!!parse-error (type => 'end tag attribute');
1129 } else {
1130 ## NOTE: This state should never be reached.
1131 !!!cp (68);
1132 }
1133 } else {
1134 die "$0: $self->{current_token}->{type}: Unknown token type";
1135 }
1136 $self->{state} = DATA_STATE;
1137 # reconsume
1138
1139 !!!emit ($self->{current_token}); # start tag or end tag
1140
1141 redo A;
1142 } else {
1143 if ($self->{next_char} == 0x0022 or # "
1144 $self->{next_char} == 0x0027) { # '
1145 !!!cp (69);
1146 !!!parse-error (type => 'bad attribute name');
1147 } else {
1148 !!!cp (70);
1149 }
1150 $self->{current_attribute}->{name} .= chr ($self->{next_char});
1151 ## Stay in the state
1152 !!!next-input-character;
1153 redo A;
1154 }
1155 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1156 if ($self->{next_char} == 0x0009 or # HT
1157 $self->{next_char} == 0x000A or # LF
1158 $self->{next_char} == 0x000B or # VT
1159 $self->{next_char} == 0x000C or # FF
1160 $self->{next_char} == 0x0020) { # SP
1161 !!!cp (71);
1162 ## Stay in the state
1163 !!!next-input-character;
1164 redo A;
1165 } elsif ($self->{next_char} == 0x003D) { # =
1166 !!!cp (72);
1167 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1168 !!!next-input-character;
1169 redo A;
1170 } elsif ($self->{next_char} == 0x003E) { # >
1171 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1172 !!!cp (73);
1173 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1174 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1175 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1176 if ($self->{current_token}->{attributes}) {
1177 !!!cp (74);
1178 !!!parse-error (type => 'end tag attribute');
1179 } else {
1180 ## NOTE: This state should never be reached.
1181 !!!cp (75);
1182 }
1183 } else {
1184 die "$0: $self->{current_token}->{type}: Unknown token type";
1185 }
1186 $self->{state} = DATA_STATE;
1187 !!!next-input-character;
1188
1189 !!!emit ($self->{current_token}); # start tag or end tag
1190
1191 redo A;
1192 } elsif (0x0041 <= $self->{next_char} and
1193 $self->{next_char} <= 0x005A) { # A..Z
1194 !!!cp (76);
1195 $self->{current_attribute}
1196 = {name => chr ($self->{next_char} + 0x0020),
1197 value => '',
1198 line => $self->{line}, column => $self->{column}};
1199 $self->{state} = ATTRIBUTE_NAME_STATE;
1200 !!!next-input-character;
1201 redo A;
1202 } elsif ($self->{next_char} == 0x002F) { # /
1203 !!!cp (77);
1204 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1205 !!!next-input-character;
1206 redo A;
1207 } elsif ($self->{next_char} == -1) {
1208 !!!parse-error (type => 'unclosed tag');
1209 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1210 !!!cp (79);
1211 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1212 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1213 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1214 if ($self->{current_token}->{attributes}) {
1215 !!!cp (80);
1216 !!!parse-error (type => 'end tag attribute');
1217 } else {
1218 ## NOTE: This state should never be reached.
1219 !!!cp (81);
1220 }
1221 } else {
1222 die "$0: $self->{current_token}->{type}: Unknown token type";
1223 }
1224 $self->{state} = DATA_STATE;
1225 # reconsume
1226
1227 !!!emit ($self->{current_token}); # start tag or end tag
1228
1229 redo A;
1230 } else {
1231 !!!cp (82);
1232 $self->{current_attribute}
1233 = {name => chr ($self->{next_char}),
1234 value => '',
1235 line => $self->{line}, column => $self->{column}};
1236 $self->{state} = ATTRIBUTE_NAME_STATE;
1237 !!!next-input-character;
1238 redo A;
1239 }
1240 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1241 if ($self->{next_char} == 0x0009 or # HT
1242 $self->{next_char} == 0x000A or # LF
1243 $self->{next_char} == 0x000B or # VT
1244 $self->{next_char} == 0x000C or # FF
1245 $self->{next_char} == 0x0020) { # SP
1246 !!!cp (83);
1247 ## Stay in the state
1248 !!!next-input-character;
1249 redo A;
1250 } elsif ($self->{next_char} == 0x0022) { # "
1251 !!!cp (84);
1252 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1253 !!!next-input-character;
1254 redo A;
1255 } elsif ($self->{next_char} == 0x0026) { # &
1256 !!!cp (85);
1257 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1258 ## reconsume
1259 redo A;
1260 } elsif ($self->{next_char} == 0x0027) { # '
1261 !!!cp (86);
1262 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1263 !!!next-input-character;
1264 redo A;
1265 } elsif ($self->{next_char} == 0x003E) { # >
1266 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1267 !!!cp (87);
1268 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1269 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1270 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1271 if ($self->{current_token}->{attributes}) {
1272 !!!cp (88);
1273 !!!parse-error (type => 'end tag attribute');
1274 } else {
1275 ## NOTE: This state should never be reached.
1276 !!!cp (89);
1277 }
1278 } else {
1279 die "$0: $self->{current_token}->{type}: Unknown token type";
1280 }
1281 $self->{state} = DATA_STATE;
1282 !!!next-input-character;
1283
1284 !!!emit ($self->{current_token}); # start tag or end tag
1285
1286 redo A;
1287 } elsif ($self->{next_char} == -1) {
1288 !!!parse-error (type => 'unclosed tag');
1289 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1290 !!!cp (90);
1291 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1292 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1293 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1294 if ($self->{current_token}->{attributes}) {
1295 !!!cp (91);
1296 !!!parse-error (type => 'end tag attribute');
1297 } else {
1298 ## NOTE: This state should never be reached.
1299 !!!cp (92);
1300 }
1301 } else {
1302 die "$0: $self->{current_token}->{type}: Unknown token type";
1303 }
1304 $self->{state} = DATA_STATE;
1305 ## reconsume
1306
1307 !!!emit ($self->{current_token}); # start tag or end tag
1308
1309 redo A;
1310 } else {
1311 if ($self->{next_char} == 0x003D) { # =
1312 !!!cp (93);
1313 !!!parse-error (type => 'bad attribute value');
1314 } else {
1315 !!!cp (94);
1316 }
1317 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1318 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1319 !!!next-input-character;
1320 redo A;
1321 }
1322 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1323 if ($self->{next_char} == 0x0022) { # "
1324 !!!cp (95);
1325 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1326 !!!next-input-character;
1327 redo A;
1328 } elsif ($self->{next_char} == 0x0026) { # &
1329 !!!cp (96);
1330 $self->{last_attribute_value_state} = $self->{state};
1331 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1332 !!!next-input-character;
1333 redo A;
1334 } elsif ($self->{next_char} == -1) {
1335 !!!parse-error (type => 'unclosed attribute value');
1336 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1337 !!!cp (97);
1338 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1339 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1340 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1341 if ($self->{current_token}->{attributes}) {
1342 !!!cp (98);
1343 !!!parse-error (type => 'end tag attribute');
1344 } else {
1345 ## NOTE: This state should never be reached.
1346 !!!cp (99);
1347 }
1348 } else {
1349 die "$0: $self->{current_token}->{type}: Unknown token type";
1350 }
1351 $self->{state} = DATA_STATE;
1352 ## reconsume
1353
1354 !!!emit ($self->{current_token}); # start tag or end tag
1355
1356 redo A;
1357 } else {
1358 !!!cp (100);
1359 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1360 ## Stay in the state
1361 !!!next-input-character;
1362 redo A;
1363 }
1364 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1365 if ($self->{next_char} == 0x0027) { # '
1366 !!!cp (101);
1367 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1368 !!!next-input-character;
1369 redo A;
1370 } elsif ($self->{next_char} == 0x0026) { # &
1371 !!!cp (102);
1372 $self->{last_attribute_value_state} = $self->{state};
1373 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1374 !!!next-input-character;
1375 redo A;
1376 } elsif ($self->{next_char} == -1) {
1377 !!!parse-error (type => 'unclosed attribute value');
1378 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1379 !!!cp (103);
1380 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1381 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1382 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1383 if ($self->{current_token}->{attributes}) {
1384 !!!cp (104);
1385 !!!parse-error (type => 'end tag attribute');
1386 } else {
1387 ## NOTE: This state should never be reached.
1388 !!!cp (105);
1389 }
1390 } else {
1391 die "$0: $self->{current_token}->{type}: Unknown token type";
1392 }
1393 $self->{state} = DATA_STATE;
1394 ## reconsume
1395
1396 !!!emit ($self->{current_token}); # start tag or end tag
1397
1398 redo A;
1399 } else {
1400 !!!cp (106);
1401 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1402 ## Stay in the state
1403 !!!next-input-character;
1404 redo A;
1405 }
1406 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1407 if ($self->{next_char} == 0x0009 or # HT
1408 $self->{next_char} == 0x000A or # LF
1409 $self->{next_char} == 0x000B or # VT
1410 $self->{next_char} == 0x000C or # FF
1411 $self->{next_char} == 0x0020) { # SP
1412 !!!cp (107);
1413 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1414 !!!next-input-character;
1415 redo A;
1416 } elsif ($self->{next_char} == 0x0026) { # &
1417 !!!cp (108);
1418 $self->{last_attribute_value_state} = $self->{state};
1419 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1420 !!!next-input-character;
1421 redo A;
1422 } elsif ($self->{next_char} == 0x003E) { # >
1423 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1424 !!!cp (109);
1425 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1426 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1427 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1428 if ($self->{current_token}->{attributes}) {
1429 !!!cp (110);
1430 !!!parse-error (type => 'end tag attribute');
1431 } else {
1432 ## NOTE: This state should never be reached.
1433 !!!cp (111);
1434 }
1435 } else {
1436 die "$0: $self->{current_token}->{type}: Unknown token type";
1437 }
1438 $self->{state} = DATA_STATE;
1439 !!!next-input-character;
1440
1441 !!!emit ($self->{current_token}); # start tag or end tag
1442
1443 redo A;
1444 } elsif ($self->{next_char} == -1) {
1445 !!!parse-error (type => 'unclosed tag');
1446 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1447 !!!cp (112);
1448 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1449 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1450 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1451 if ($self->{current_token}->{attributes}) {
1452 !!!cp (113);
1453 !!!parse-error (type => 'end tag attribute');
1454 } else {
1455 ## NOTE: This state should never be reached.
1456 !!!cp (114);
1457 }
1458 } else {
1459 die "$0: $self->{current_token}->{type}: Unknown token type";
1460 }
1461 $self->{state} = DATA_STATE;
1462 ## reconsume
1463
1464 !!!emit ($self->{current_token}); # start tag or end tag
1465
1466 redo A;
1467 } else {
1468 if ({
1469 0x0022 => 1, # "
1470 0x0027 => 1, # '
1471 0x003D => 1, # =
1472 }->{$self->{next_char}}) {
1473 !!!cp (115);
1474 !!!parse-error (type => 'bad attribute value');
1475 } else {
1476 !!!cp (116);
1477 }
1478 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1479 ## Stay in the state
1480 !!!next-input-character;
1481 redo A;
1482 }
1483 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1484 my $token = $self->_tokenize_attempt_to_consume_an_entity
1485 (1,
1486 $self->{last_attribute_value_state}
1487 == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
1488 $self->{last_attribute_value_state}
1489 == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
1490 -1);
1491
1492 unless (defined $token) {
1493 !!!cp (117);
1494 $self->{current_attribute}->{value} .= '&';
1495 } else {
1496 !!!cp (118);
1497 $self->{current_attribute}->{value} .= $token->{data};
1498 $self->{current_attribute}->{has_reference} = $token->{has_reference};
1499 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1500 }
1501
1502 $self->{state} = $self->{last_attribute_value_state};
1503 # next-input-character is already done
1504 redo A;
1505 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1506 if ($self->{next_char} == 0x0009 or # HT
1507 $self->{next_char} == 0x000A or # LF
1508 $self->{next_char} == 0x000B or # VT
1509 $self->{next_char} == 0x000C or # FF
1510 $self->{next_char} == 0x0020) { # SP
1511 !!!cp (118);
1512 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1513 !!!next-input-character;
1514 redo A;
1515 } elsif ($self->{next_char} == 0x003E) { # >
1516 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1517 !!!cp (119);
1518 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1519 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1520 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1521 if ($self->{current_token}->{attributes}) {
1522 !!!cp (120);
1523 !!!parse-error (type => 'end tag attribute');
1524 } else {
1525 ## NOTE: This state should never be reached.
1526 !!!cp (121);
1527 }
1528 } else {
1529 die "$0: $self->{current_token}->{type}: Unknown token type";
1530 }
1531 $self->{state} = DATA_STATE;
1532 !!!next-input-character;
1533
1534 !!!emit ($self->{current_token}); # start tag or end tag
1535
1536 redo A;
1537 } elsif ($self->{next_char} == 0x002F) { # /
1538 !!!cp (122);
1539 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1540 !!!next-input-character;
1541 redo A;
1542 } else {
1543 !!!cp ('124.1');
1544 !!!parse-error (type => 'no space between attributes');
1545 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1546 ## reconsume
1547 redo A;
1548 }
1549 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1550 if ($self->{next_char} == 0x003E) { # >
1551 if ($self->{current_token}->{type} == END_TAG_TOKEN) {
1552 !!!cp ('124.2');
1553 !!!parse-error (type => 'nestc', token => $self->{current_token});
1554 ## TODO: Different type than slash in start tag
1555 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1556 if ($self->{current_token}->{attributes}) {
1557 !!!cp ('124.4');
1558 !!!parse-error (type => 'end tag attribute');
1559 } else {
1560 !!!cp ('124.5');
1561 }
1562 ## TODO: Test |<title></title/>|
1563 } else {
1564 !!!cp ('124.3');
1565 $self->{self_closing} = 1;
1566 }
1567
1568 $self->{state} = DATA_STATE;
1569 !!!next-input-character;
1570
1571 !!!emit ($self->{current_token}); # start tag or end tag
1572
1573 redo A;
1574 } else {
1575 !!!cp ('124.4');
1576 !!!parse-error (type => 'nestc');
1577 ## TODO: This error type is wrong.
1578 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1579 ## Reconsume.
1580 redo A;
1581 }
1582 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1583 ## (only happens in the PCDATA state)
1584
1585 ## NOTE: Set by the previous state
1586 #my $token = {type => COMMENT_TOKEN, data => ''};
1587
1588 BC: {
1589 if ($self->{next_char} == 0x003E) { # >
1590 !!!cp (124);
1591 $self->{state} = DATA_STATE;
1592 !!!next-input-character;
1593
1594 !!!emit ($self->{current_token}); # comment
1595
1596 redo A;
1597 } elsif ($self->{next_char} == -1) {
1598 !!!cp (125);
1599 $self->{state} = DATA_STATE;
1600 ## reconsume
1601
1602 !!!emit ($self->{current_token}); # comment
1603
1604 redo A;
1605 } else {
1606 !!!cp (126);
1607 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
1608 !!!next-input-character;
1609 redo BC;
1610 }
1611 } # BC
1612
1613 die "$0: _get_next_token: unexpected case [BC]";
1614 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1615 ## (only happens in the PCDATA state)
1616
1617 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1);
1618
1619 my @next_char;
1620 push @next_char, $self->{next_char};
1621
1622 if ($self->{next_char} == 0x002D) { # -
1623 !!!next-input-character;
1624 push @next_char, $self->{next_char};
1625 if ($self->{next_char} == 0x002D) { # -
1626 !!!cp (127);
1627 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1628 line => $l, column => $c,
1629 };
1630 $self->{state} = COMMENT_START_STATE;
1631 !!!next-input-character;
1632 redo A;
1633 } else {
1634 !!!cp (128);
1635 }
1636 } elsif ($self->{next_char} == 0x0044 or # D
1637 $self->{next_char} == 0x0064) { # d
1638 !!!next-input-character;
1639 push @next_char, $self->{next_char};
1640 if ($self->{next_char} == 0x004F or # O
1641 $self->{next_char} == 0x006F) { # o
1642 !!!next-input-character;
1643 push @next_char, $self->{next_char};
1644 if ($self->{next_char} == 0x0043 or # C
1645 $self->{next_char} == 0x0063) { # c
1646 !!!next-input-character;
1647 push @next_char, $self->{next_char};
1648 if ($self->{next_char} == 0x0054 or # T
1649 $self->{next_char} == 0x0074) { # t
1650 !!!next-input-character;
1651 push @next_char, $self->{next_char};
1652 if ($self->{next_char} == 0x0059 or # Y
1653 $self->{next_char} == 0x0079) { # y
1654 !!!next-input-character;
1655 push @next_char, $self->{next_char};
1656 if ($self->{next_char} == 0x0050 or # P
1657 $self->{next_char} == 0x0070) { # p
1658 !!!next-input-character;
1659 push @next_char, $self->{next_char};
1660 if ($self->{next_char} == 0x0045 or # E
1661 $self->{next_char} == 0x0065) { # e
1662 !!!cp (129);
1663 ## TODO: Simplify this deeply nested character-by-character match.
1664 $self->{state} = DOCTYPE_STATE;
1665 $self->{current_token} = {type => DOCTYPE_TOKEN,
1666 quirks => 1,
1667 line => $l, column => $c,
1668 };
1669 !!!next-input-character;
1670 redo A;
1671 } else {
1672 !!!cp (130);
1673 }
1674 } else {
1675 !!!cp (131);
1676 }
1677 } else {
1678 !!!cp (132);
1679 }
1680 } else {
1681 !!!cp (133);
1682 }
1683 } else {
1684 !!!cp (134);
1685 }
1686 } else {
1687 !!!cp (135);
1688 }
1689 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1690 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
1691 $self->{next_char} == 0x005B) { # [
1692 !!!next-input-character;
1693 push @next_char, $self->{next_char};
1694 if ($self->{next_char} == 0x0043) { # C
1695 !!!next-input-character;
1696 push @next_char, $self->{next_char};
1697 if ($self->{next_char} == 0x0044) { # D
1698 !!!next-input-character;
1699 push @next_char, $self->{next_char};
1700 if ($self->{next_char} == 0x0041) { # A
1701 !!!next-input-character;
1702 push @next_char, $self->{next_char};
1703 if ($self->{next_char} == 0x0054) { # T
1704 !!!next-input-character;
1705 push @next_char, $self->{next_char};
1706 if ($self->{next_char} == 0x0041) { # A
1707 !!!next-input-character;
1708 push @next_char, $self->{next_char};
1709 if ($self->{next_char} == 0x005B) { # [
1710 !!!cp (135.1);
1711 $self->{state} = CDATA_BLOCK_STATE;
1712 !!!next-input-character;
1713 redo A;
1714 } else {
1715 !!!cp (135.2);
1716 }
1717 } else {
1718 !!!cp (135.3);
1719 }
1720 } else {
1721 !!!cp (135.4);
1722 }
1723 } else {
1724 !!!cp (135.5);
1725 }
1726 } else {
1727 !!!cp (135.6);
1728 }
1729 } else {
1730 !!!cp (135.7);
1731 }
1732 } else {
1733 !!!cp (136);
1734 }
1735
1736 !!!parse-error (type => 'bogus comment');
1737 $self->{next_char} = shift @next_char;
1738 !!!back-next-input-character (@next_char);
1739 $self->{state} = BOGUS_COMMENT_STATE;
1740 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1741 line => $l, column => $c,
1742 };
1743 redo A;
1744
1745 ## ISSUE: typos in spec: chacacters, is is a parse error
1746 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1747 } elsif ($self->{state} == COMMENT_START_STATE) {
1748 if ($self->{next_char} == 0x002D) { # -
1749 !!!cp (137);
1750 $self->{state} = COMMENT_START_DASH_STATE;
1751 !!!next-input-character;
1752 redo A;
1753 } elsif ($self->{next_char} == 0x003E) { # >
1754 !!!cp (138);
1755 !!!parse-error (type => 'bogus comment');
1756 $self->{state} = DATA_STATE;
1757 !!!next-input-character;
1758
1759 !!!emit ($self->{current_token}); # comment
1760
1761 redo A;
1762 } elsif ($self->{next_char} == -1) {
1763 !!!cp (139);
1764 !!!parse-error (type => 'unclosed comment');
1765 $self->{state} = DATA_STATE;
1766 ## reconsume
1767
1768 !!!emit ($self->{current_token}); # comment
1769
1770 redo A;
1771 } else {
1772 !!!cp (140);
1773 $self->{current_token}->{data} # comment
1774 .= chr ($self->{next_char});
1775 $self->{state} = COMMENT_STATE;
1776 !!!next-input-character;
1777 redo A;
1778 }
1779 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1780 if ($self->{next_char} == 0x002D) { # -
1781 !!!cp (141);
1782 $self->{state} = COMMENT_END_STATE;
1783 !!!next-input-character;
1784 redo A;
1785 } elsif ($self->{next_char} == 0x003E) { # >
1786 !!!cp (142);
1787 !!!parse-error (type => 'bogus comment');
1788 $self->{state} = DATA_STATE;
1789 !!!next-input-character;
1790
1791 !!!emit ($self->{current_token}); # comment
1792
1793 redo A;
1794 } elsif ($self->{next_char} == -1) {
1795 !!!cp (143);
1796 !!!parse-error (type => 'unclosed comment');
1797 $self->{state} = DATA_STATE;
1798 ## reconsume
1799
1800 !!!emit ($self->{current_token}); # comment
1801
1802 redo A;
1803 } else {
1804 !!!cp (144);
1805 $self->{current_token}->{data} # comment
1806 .= '-' . chr ($self->{next_char});
1807 $self->{state} = COMMENT_STATE;
1808 !!!next-input-character;
1809 redo A;
1810 }
1811 } elsif ($self->{state} == COMMENT_STATE) {
1812 if ($self->{next_char} == 0x002D) { # -
1813 !!!cp (145);
1814 $self->{state} = COMMENT_END_DASH_STATE;
1815 !!!next-input-character;
1816 redo A;
1817 } elsif ($self->{next_char} == -1) {
1818 !!!cp (146);
1819 !!!parse-error (type => 'unclosed comment');
1820 $self->{state} = DATA_STATE;
1821 ## reconsume
1822
1823 !!!emit ($self->{current_token}); # comment
1824
1825 redo A;
1826 } else {
1827 !!!cp (147);
1828 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
1829 ## Stay in the state
1830 !!!next-input-character;
1831 redo A;
1832 }
1833 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1834 if ($self->{next_char} == 0x002D) { # -
1835 !!!cp (148);
1836 $self->{state} = COMMENT_END_STATE;
1837 !!!next-input-character;
1838 redo A;
1839 } elsif ($self->{next_char} == -1) {
1840 !!!cp (149);
1841 !!!parse-error (type => 'unclosed comment');
1842 $self->{state} = DATA_STATE;
1843 ## reconsume
1844
1845 !!!emit ($self->{current_token}); # comment
1846
1847 redo A;
1848 } else {
1849 !!!cp (150);
1850 $self->{current_token}->{data} .= '-' . chr ($self->{next_char}); # comment
1851 $self->{state} = COMMENT_STATE;
1852 !!!next-input-character;
1853 redo A;
1854 }
1855 } elsif ($self->{state} == COMMENT_END_STATE) {
1856 if ($self->{next_char} == 0x003E) { # >
1857 !!!cp (151);
1858 $self->{state} = DATA_STATE;
1859 !!!next-input-character;
1860
1861 !!!emit ($self->{current_token}); # comment
1862
1863 redo A;
1864 } elsif ($self->{next_char} == 0x002D) { # -
1865 !!!cp (152);
1866 !!!parse-error (type => 'dash in comment',
1867 line => $self->{line_prev},
1868 column => $self->{column_prev});
1869 $self->{current_token}->{data} .= '-'; # comment
1870 ## Stay in the state
1871 !!!next-input-character;
1872 redo A;
1873 } elsif ($self->{next_char} == -1) {
1874 !!!cp (153);
1875 !!!parse-error (type => 'unclosed comment');
1876 $self->{state} = DATA_STATE;
1877 ## reconsume
1878
1879 !!!emit ($self->{current_token}); # comment
1880
1881 redo A;
1882 } else {
1883 !!!cp (154);
1884 !!!parse-error (type => 'dash in comment',
1885 line => $self->{line_prev},
1886 column => $self->{column_prev});
1887 $self->{current_token}->{data} .= '--' . chr ($self->{next_char}); # comment
1888 $self->{state} = COMMENT_STATE;
1889 !!!next-input-character;
1890 redo A;
1891 }
1892 } elsif ($self->{state} == DOCTYPE_STATE) {
1893 if ($self->{next_char} == 0x0009 or # HT
1894 $self->{next_char} == 0x000A or # LF
1895 $self->{next_char} == 0x000B or # VT
1896 $self->{next_char} == 0x000C or # FF
1897 $self->{next_char} == 0x0020) { # SP
1898 !!!cp (155);
1899 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1900 !!!next-input-character;
1901 redo A;
1902 } else {
1903 !!!cp (156);
1904 !!!parse-error (type => 'no space before DOCTYPE name');
1905 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1906 ## reconsume
1907 redo A;
1908 }
1909 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1910 if ($self->{next_char} == 0x0009 or # HT
1911 $self->{next_char} == 0x000A or # LF
1912 $self->{next_char} == 0x000B or # VT
1913 $self->{next_char} == 0x000C or # FF
1914 $self->{next_char} == 0x0020) { # SP
1915 !!!cp (157);
1916 ## Stay in the state
1917 !!!next-input-character;
1918 redo A;
1919 } elsif ($self->{next_char} == 0x003E) { # >
1920 !!!cp (158);
1921 !!!parse-error (type => 'no DOCTYPE name');
1922 $self->{state} = DATA_STATE;
1923 !!!next-input-character;
1924
1925 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
1926
1927 redo A;
1928 } elsif ($self->{next_char} == -1) {
1929 !!!cp (159);
1930 !!!parse-error (type => 'no DOCTYPE name');
1931 $self->{state} = DATA_STATE;
1932 ## reconsume
1933
1934 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
1935
1936 redo A;
1937 } else {
1938 !!!cp (160);
1939 $self->{current_token}->{name} = chr $self->{next_char};
1940 delete $self->{current_token}->{quirks};
1941 ## ISSUE: "Set the token's name name to the" in the spec
1942 $self->{state} = DOCTYPE_NAME_STATE;
1943 !!!next-input-character;
1944 redo A;
1945 }
1946 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1947 ## ISSUE: Redundant "First," in the spec.
1948 if ($self->{next_char} == 0x0009 or # HT
1949 $self->{next_char} == 0x000A or # LF
1950 $self->{next_char} == 0x000B or # VT
1951 $self->{next_char} == 0x000C or # FF
1952 $self->{next_char} == 0x0020) { # SP
1953 !!!cp (161);
1954 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1955 !!!next-input-character;
1956 redo A;
1957 } elsif ($self->{next_char} == 0x003E) { # >
1958 !!!cp (162);
1959 $self->{state} = DATA_STATE;
1960 !!!next-input-character;
1961
1962 !!!emit ($self->{current_token}); # DOCTYPE
1963
1964 redo A;
1965 } elsif ($self->{next_char} == -1) {
1966 !!!cp (163);
1967 !!!parse-error (type => 'unclosed DOCTYPE');
1968 $self->{state} = DATA_STATE;
1969 ## reconsume
1970
1971 $self->{current_token}->{quirks} = 1;
1972 !!!emit ($self->{current_token}); # DOCTYPE
1973
1974 redo A;
1975 } else {
1976 !!!cp (164);
1977 $self->{current_token}->{name}
1978 .= chr ($self->{next_char}); # DOCTYPE
1979 ## Stay in the state
1980 !!!next-input-character;
1981 redo A;
1982 }
1983 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1984 if ($self->{next_char} == 0x0009 or # HT
1985 $self->{next_char} == 0x000A or # LF
1986 $self->{next_char} == 0x000B or # VT
1987 $self->{next_char} == 0x000C or # FF
1988 $self->{next_char} == 0x0020) { # SP
1989 !!!cp (165);
1990 ## Stay in the state
1991 !!!next-input-character;
1992 redo A;
1993 } elsif ($self->{next_char} == 0x003E) { # >
1994 !!!cp (166);
1995 $self->{state} = DATA_STATE;
1996 !!!next-input-character;
1997
1998 !!!emit ($self->{current_token}); # DOCTYPE
1999
2000 redo A;
2001 } elsif ($self->{next_char} == -1) {
2002 !!!cp (167);
2003 !!!parse-error (type => 'unclosed DOCTYPE');
2004 $self->{state} = DATA_STATE;
2005 ## reconsume
2006
2007 $self->{current_token}->{quirks} = 1;
2008 !!!emit ($self->{current_token}); # DOCTYPE
2009
2010 redo A;
2011 } elsif ($self->{next_char} == 0x0050 or # P
2012 $self->{next_char} == 0x0070) { # p
2013 !!!next-input-character;
2014 if ($self->{next_char} == 0x0055 or # U
2015 $self->{next_char} == 0x0075) { # u
2016 !!!next-input-character;
2017 if ($self->{next_char} == 0x0042 or # B
2018 $self->{next_char} == 0x0062) { # b
2019 !!!next-input-character;
2020 if ($self->{next_char} == 0x004C or # L
2021 $self->{next_char} == 0x006C) { # l
2022 !!!next-input-character;
2023 if ($self->{next_char} == 0x0049 or # I
2024 $self->{next_char} == 0x0069) { # i
2025 !!!next-input-character;
2026 if ($self->{next_char} == 0x0043 or # C
2027 $self->{next_char} == 0x0063) { # c
2028 !!!cp (168);
2029 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2030 !!!next-input-character;
2031 redo A;
2032 } else {
2033 !!!cp (169);
2034 }
2035 } else {
2036 !!!cp (170);
2037 }
2038 } else {
2039 !!!cp (171);
2040 }
2041 } else {
2042 !!!cp (172);
2043 }
2044 } else {
2045 !!!cp (173);
2046 }
2047
2048 #
2049 } elsif ($self->{next_char} == 0x0053 or # S
2050 $self->{next_char} == 0x0073) { # s
2051 !!!next-input-character;
2052 if ($self->{next_char} == 0x0059 or # Y
2053 $self->{next_char} == 0x0079) { # y
2054 !!!next-input-character;
2055 if ($self->{next_char} == 0x0053 or # S
2056 $self->{next_char} == 0x0073) { # s
2057 !!!next-input-character;
2058 if ($self->{next_char} == 0x0054 or # T
2059 $self->{next_char} == 0x0074) { # t
2060 !!!next-input-character;
2061 if ($self->{next_char} == 0x0045 or # E
2062 $self->{next_char} == 0x0065) { # e
2063 !!!next-input-character;
2064 if ($self->{next_char} == 0x004D or # M
2065 $self->{next_char} == 0x006D) { # m
2066 !!!cp (174);
2067 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2068 !!!next-input-character;
2069 redo A;
2070 } else {
2071 !!!cp (175);
2072 }
2073 } else {
2074 !!!cp (176);
2075 }
2076 } else {
2077 !!!cp (177);
2078 }
2079 } else {
2080 !!!cp (178);
2081 }
2082 } else {
2083 !!!cp (179);
2084 }
2085
2086 #
2087 } else {
2088 !!!cp (180);
2089 !!!next-input-character;
2090 #
2091 }
2092
2093 !!!parse-error (type => 'string after DOCTYPE name');
2094 $self->{current_token}->{quirks} = 1;
2095
2096 $self->{state} = BOGUS_DOCTYPE_STATE;
2097 # next-input-character is already done
2098 redo A;
2099 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2100 if ({
2101 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2102 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2103 }->{$self->{next_char}}) {
2104 !!!cp (181);
2105 ## Stay in the state
2106 !!!next-input-character;
2107 redo A;
2108 } elsif ($self->{next_char} eq 0x0022) { # "
2109 !!!cp (182);
2110 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2111 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2112 !!!next-input-character;
2113 redo A;
2114 } elsif ($self->{next_char} eq 0x0027) { # '
2115 !!!cp (183);
2116 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2117 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2118 !!!next-input-character;
2119 redo A;
2120 } elsif ($self->{next_char} eq 0x003E) { # >
2121 !!!cp (184);
2122 !!!parse-error (type => 'no PUBLIC literal');
2123
2124 $self->{state} = DATA_STATE;
2125 !!!next-input-character;
2126
2127 $self->{current_token}->{quirks} = 1;
2128 !!!emit ($self->{current_token}); # DOCTYPE
2129
2130 redo A;
2131 } elsif ($self->{next_char} == -1) {
2132 !!!cp (185);
2133 !!!parse-error (type => 'unclosed DOCTYPE');
2134
2135 $self->{state} = DATA_STATE;
2136 ## reconsume
2137
2138 $self->{current_token}->{quirks} = 1;
2139 !!!emit ($self->{current_token}); # DOCTYPE
2140
2141 redo A;
2142 } else {
2143 !!!cp (186);
2144 !!!parse-error (type => 'string after PUBLIC');
2145 $self->{current_token}->{quirks} = 1;
2146
2147 $self->{state} = BOGUS_DOCTYPE_STATE;
2148 !!!next-input-character;
2149 redo A;
2150 }
2151 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2152 if ($self->{next_char} == 0x0022) { # "
2153 !!!cp (187);
2154 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2155 !!!next-input-character;
2156 redo A;
2157 } elsif ($self->{next_char} == 0x003E) { # >
2158 !!!cp (188);
2159 !!!parse-error (type => 'unclosed PUBLIC literal');
2160
2161 $self->{state} = DATA_STATE;
2162 !!!next-input-character;
2163
2164 $self->{current_token}->{quirks} = 1;
2165 !!!emit ($self->{current_token}); # DOCTYPE
2166
2167 redo A;
2168 } elsif ($self->{next_char} == -1) {
2169 !!!cp (189);
2170 !!!parse-error (type => 'unclosed PUBLIC literal');
2171
2172 $self->{state} = DATA_STATE;
2173 ## reconsume
2174
2175 $self->{current_token}->{quirks} = 1;
2176 !!!emit ($self->{current_token}); # DOCTYPE
2177
2178 redo A;
2179 } else {
2180 !!!cp (190);
2181 $self->{current_token}->{public_identifier} # DOCTYPE
2182 .= chr $self->{next_char};
2183 ## Stay in the state
2184 !!!next-input-character;
2185 redo A;
2186 }
2187 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2188 if ($self->{next_char} == 0x0027) { # '
2189 !!!cp (191);
2190 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2191 !!!next-input-character;
2192 redo A;
2193 } elsif ($self->{next_char} == 0x003E) { # >
2194 !!!cp (192);
2195 !!!parse-error (type => 'unclosed PUBLIC literal');
2196
2197 $self->{state} = DATA_STATE;
2198 !!!next-input-character;
2199
2200 $self->{current_token}->{quirks} = 1;
2201 !!!emit ($self->{current_token}); # DOCTYPE
2202
2203 redo A;
2204 } elsif ($self->{next_char} == -1) {
2205 !!!cp (193);
2206 !!!parse-error (type => 'unclosed PUBLIC literal');
2207
2208 $self->{state} = DATA_STATE;
2209 ## reconsume
2210
2211 $self->{current_token}->{quirks} = 1;
2212 !!!emit ($self->{current_token}); # DOCTYPE
2213
2214 redo A;
2215 } else {
2216 !!!cp (194);
2217 $self->{current_token}->{public_identifier} # DOCTYPE
2218 .= chr $self->{next_char};
2219 ## Stay in the state
2220 !!!next-input-character;
2221 redo A;
2222 }
2223 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2224 if ({
2225 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2226 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2227 }->{$self->{next_char}}) {
2228 !!!cp (195);
2229 ## Stay in the state
2230 !!!next-input-character;
2231 redo A;
2232 } elsif ($self->{next_char} == 0x0022) { # "
2233 !!!cp (196);
2234 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2235 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2236 !!!next-input-character;
2237 redo A;
2238 } elsif ($self->{next_char} == 0x0027) { # '
2239 !!!cp (197);
2240 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2241 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2242 !!!next-input-character;
2243 redo A;
2244 } elsif ($self->{next_char} == 0x003E) { # >
2245 !!!cp (198);
2246 $self->{state} = DATA_STATE;
2247 !!!next-input-character;
2248
2249 !!!emit ($self->{current_token}); # DOCTYPE
2250
2251 redo A;
2252 } elsif ($self->{next_char} == -1) {
2253 !!!cp (199);
2254 !!!parse-error (type => 'unclosed DOCTYPE');
2255
2256 $self->{state} = DATA_STATE;
2257 ## reconsume
2258
2259 $self->{current_token}->{quirks} = 1;
2260 !!!emit ($self->{current_token}); # DOCTYPE
2261
2262 redo A;
2263 } else {
2264 !!!cp (200);
2265 !!!parse-error (type => 'string after PUBLIC literal');
2266 $self->{current_token}->{quirks} = 1;
2267
2268 $self->{state} = BOGUS_DOCTYPE_STATE;
2269 !!!next-input-character;
2270 redo A;
2271 }
2272 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2273 if ({
2274 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2275 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2276 }->{$self->{next_char}}) {
2277 !!!cp (201);
2278 ## Stay in the state
2279 !!!next-input-character;
2280 redo A;
2281 } elsif ($self->{next_char} == 0x0022) { # "
2282 !!!cp (202);
2283 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2284 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2285 !!!next-input-character;
2286 redo A;
2287 } elsif ($self->{next_char} == 0x0027) { # '
2288 !!!cp (203);
2289 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2290 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2291 !!!next-input-character;
2292 redo A;
2293 } elsif ($self->{next_char} == 0x003E) { # >
2294 !!!cp (204);
2295 !!!parse-error (type => 'no SYSTEM literal');
2296 $self->{state} = DATA_STATE;
2297 !!!next-input-character;
2298
2299 $self->{current_token}->{quirks} = 1;
2300 !!!emit ($self->{current_token}); # DOCTYPE
2301
2302 redo A;
2303 } elsif ($self->{next_char} == -1) {
2304 !!!cp (205);
2305 !!!parse-error (type => 'unclosed DOCTYPE');
2306
2307 $self->{state} = DATA_STATE;
2308 ## reconsume
2309
2310 $self->{current_token}->{quirks} = 1;
2311 !!!emit ($self->{current_token}); # DOCTYPE
2312
2313 redo A;
2314 } else {
2315 !!!cp (206);
2316 !!!parse-error (type => 'string after SYSTEM');
2317 $self->{current_token}->{quirks} = 1;
2318
2319 $self->{state} = BOGUS_DOCTYPE_STATE;
2320 !!!next-input-character;
2321 redo A;
2322 }
2323 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2324 if ($self->{next_char} == 0x0022) { # "
2325 !!!cp (207);
2326 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2327 !!!next-input-character;
2328 redo A;
2329 } elsif ($self->{next_char} == 0x003E) { # >
2330 !!!cp (208);
2331 !!!parse-error (type => 'unclosed PUBLIC literal');
2332
2333 $self->{state} = DATA_STATE;
2334 !!!next-input-character;
2335
2336 $self->{current_token}->{quirks} = 1;
2337 !!!emit ($self->{current_token}); # DOCTYPE
2338
2339 redo A;
2340 } elsif ($self->{next_char} == -1) {
2341 !!!cp (209);
2342 !!!parse-error (type => 'unclosed SYSTEM literal');
2343
2344 $self->{state} = DATA_STATE;
2345 ## reconsume
2346
2347 $self->{current_token}->{quirks} = 1;
2348 !!!emit ($self->{current_token}); # DOCTYPE
2349
2350 redo A;
2351 } else {
2352 !!!cp (210);
2353 $self->{current_token}->{system_identifier} # DOCTYPE
2354 .= chr $self->{next_char};
2355 ## Stay in the state
2356 !!!next-input-character;
2357 redo A;
2358 }
2359 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2360 if ($self->{next_char} == 0x0027) { # '
2361 !!!cp (211);
2362 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2363 !!!next-input-character;
2364 redo A;
2365 } elsif ($self->{next_char} == 0x003E) { # >
2366 !!!cp (212);
2367 !!!parse-error (type => 'unclosed PUBLIC literal');
2368
2369 $self->{state} = DATA_STATE;
2370 !!!next-input-character;
2371
2372 $self->{current_token}->{quirks} = 1;
2373 !!!emit ($self->{current_token}); # DOCTYPE
2374
2375 redo A;
2376 } elsif ($self->{next_char} == -1) {
2377 !!!cp (213);
2378 !!!parse-error (type => 'unclosed SYSTEM literal');
2379
2380 $self->{state} = DATA_STATE;
2381 ## reconsume
2382
2383 $self->{current_token}->{quirks} = 1;
2384 !!!emit ($self->{current_token}); # DOCTYPE
2385
2386 redo A;
2387 } else {
2388 !!!cp (214);
2389 $self->{current_token}->{system_identifier} # DOCTYPE
2390 .= chr $self->{next_char};
2391 ## Stay in the state
2392 !!!next-input-character;
2393 redo A;
2394 }
2395 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2396 if ({
2397 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2398 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2399 }->{$self->{next_char}}) {
2400 !!!cp (215);
2401 ## Stay in the state
2402 !!!next-input-character;
2403 redo A;
2404 } elsif ($self->{next_char} == 0x003E) { # >
2405 !!!cp (216);
2406 $self->{state} = DATA_STATE;
2407 !!!next-input-character;
2408
2409 !!!emit ($self->{current_token}); # DOCTYPE
2410
2411 redo A;
2412 } elsif ($self->{next_char} == -1) {
2413 !!!cp (217);
2414 !!!parse-error (type => 'unclosed DOCTYPE');
2415
2416 $self->{state} = DATA_STATE;
2417 ## reconsume
2418
2419 $self->{current_token}->{quirks} = 1;
2420 !!!emit ($self->{current_token}); # DOCTYPE
2421
2422 redo A;
2423 } else {
2424 !!!cp (218);
2425 !!!parse-error (type => 'string after SYSTEM literal');
2426 #$self->{current_token}->{quirks} = 1;
2427
2428 $self->{state} = BOGUS_DOCTYPE_STATE;
2429 !!!next-input-character;
2430 redo A;
2431 }
2432 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2433 if ($self->{next_char} == 0x003E) { # >
2434 !!!cp (219);
2435 $self->{state} = DATA_STATE;
2436 !!!next-input-character;
2437
2438 !!!emit ($self->{current_token}); # DOCTYPE
2439
2440 redo A;
2441 } elsif ($self->{next_char} == -1) {
2442 !!!cp (220);
2443 !!!parse-error (type => 'unclosed DOCTYPE');
2444 $self->{state} = DATA_STATE;
2445 ## reconsume
2446
2447 !!!emit ($self->{current_token}); # DOCTYPE
2448
2449 redo A;
2450 } else {
2451 !!!cp (221);
2452 ## Stay in the state
2453 !!!next-input-character;
2454 redo A;
2455 }
2456 } elsif ($self->{state} == CDATA_BLOCK_STATE) {
2457 my $s = '';
2458
2459 my ($l, $c) = ($self->{line}, $self->{column});
2460
2461 CS: while ($self->{next_char} != -1) {
2462 if ($self->{next_char} == 0x005D) { # ]
2463 !!!next-input-character;
2464 if ($self->{next_char} == 0x005D) { # ]
2465 !!!next-input-character;
2466 MDC: {
2467 if ($self->{next_char} == 0x003E) { # >
2468 !!!cp (221.1);
2469 !!!next-input-character;
2470 last CS;
2471 } elsif ($self->{next_char} == 0x005D) { # ]
2472 !!!cp (221.2);
2473 $s .= ']';
2474 !!!next-input-character;
2475 redo MDC;
2476 } else {
2477 !!!cp (221.3);
2478 $s .= ']]';
2479 #
2480 }
2481 } # MDC
2482 } else {
2483 !!!cp (221.4);
2484 $s .= ']';
2485 #
2486 }
2487 } else {
2488 !!!cp (221.5);
2489 #
2490 }
2491 $s .= chr $self->{next_char};
2492 !!!next-input-character;
2493 } # CS
2494
2495 $self->{state} = DATA_STATE;
2496 ## next-input-character done or EOF, which is reconsumed.
2497
2498 if (length $s) {
2499 !!!cp (221.6);
2500 !!!emit ({type => CHARACTER_TOKEN, data => $s,
2501 line => $l, column => $c});
2502 } else {
2503 !!!cp (221.7);
2504 }
2505
2506 redo A;
2507
2508 ## ISSUE: "text tokens" in spec.
2509 ## TODO: Streaming support
2510 } else {
2511 die "$0: $self->{state}: Unknown state";
2512 }
2513 } # A
2514
2515 die "$0: _get_next_token: unexpected case";
2516 } # _get_next_token
2517
## Try to consume a character reference ("entity") whose first character
## after the "&" is the current |$self->{next_char}|.
##
## Arguments:
##   $self       - the parser object; |next_char|, |line_prev| and
##                 |column_prev| are read from it.
##   $in_attr    - true when called for an attribute value (presumably; the
##                 callers are not visible here).  It only matters for a
##                 named reference that matched without ";" and is followed
##                 by further name characters: in that case the raw text
##                 is returned instead of the decoded value.
##   $additional - one extra code point that, like white space, "<", "&"
##                 and EOF (-1), means "this is not a character reference".
##
## Returns a CHARACTER_TOKEN hash reference (with |has_reference| set when
## a reference was actually decoded), or |undef| when the input is not
## treated as a character reference.
2518 sub _tokenize_attempt_to_consume_an_entity ($$$) {
2519 my ($self, $in_attr, $additional) = @_;
2520
## Position used for every parse error and token produced below.
2521 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
2522
2523 if ({
2524 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
2525 0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
2526 $additional => 1,
2527 }->{$self->{next_char}}) {
2528 !!!cp (1001);
2529 ## Don't consume
2530 ## No error
2531 return undef;
2532 } elsif ($self->{next_char} == 0x0023) { # #
## Numeric character reference: "&#x..." (hexadecimal) or "&#..." (decimal).
2533 !!!next-input-character;
2534 if ($self->{next_char} == 0x0078 or # x
2535 $self->{next_char} == 0x0058) { # X
## |$code| stays undefined until the first hexadecimal digit is seen, which
## is how the "no digit at all" case is detected below.
2536 my $code;
2537 X: {
## On the first pass this is the "x"/"X" itself; it is only used in the
## "no hexadecimal digit" branch, which can only be taken on that pass.
2538 my $x_char = $self->{next_char};
2539 !!!next-input-character;
2540 if (0x0030 <= $self->{next_char} and
2541 $self->{next_char} <= 0x0039) { # 0..9
2542 !!!cp (1002);
2543 $code ||= 0;
2544 $code *= 0x10;
2545 $code += $self->{next_char} - 0x0030;
2546 redo X;
2547 } elsif (0x0061 <= $self->{next_char} and
2548 $self->{next_char} <= 0x0066) { # a..f
2549 !!!cp (1003);
2550 $code ||= 0;
2551 $code *= 0x10;
## "a" (0x61) - 0x60 + 9 == 10, ..., "f" == 15.
2552 $code += $self->{next_char} - 0x0060 + 9;
2553 redo X;
2554 } elsif (0x0041 <= $self->{next_char} and
2555 $self->{next_char} <= 0x0046) { # A..F
2556 !!!cp (1004);
2557 $code ||= 0;
2558 $code *= 0x10;
## "A" (0x41) - 0x40 + 9 == 10, ..., "F" == 15.
2559 $code += $self->{next_char} - 0x0040 + 9;
2560 redo X;
2561 } elsif (not defined $code) { # no hexadecimal digit
2562 !!!cp (1005);
2563 !!!parse-error (type => 'bare hcro', line => $l, column => $c);
## NOTE(review): the macro presumably pushes the "x" and the current
## character back onto the input (it is defined outside this view); the
## "#" then becomes the current character again.
2564 !!!back-next-input-character ($x_char, $self->{next_char});
2565 $self->{next_char} = 0x0023; # #
2566 return undef;
2567 } elsif ($self->{next_char} == 0x003B) { # ;
2568 !!!cp (1006);
2569 !!!next-input-character;
2570 } else {
## A missing ";" is reported, but the reference is still decoded below.
2571 !!!cp (1007);
2572 !!!parse-error (type => 'no refc', line => $l, column => $c);
2573 }
2574
## Replace code points that must not be produced: U+0000 and surrogates
## and anything above U+10FFFF become U+FFFD, CR becomes LF, and
## 0x80..0x9F are looked up in |$c1_entity_char| (defined outside this
## view).
2575 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2576 !!!cp (1008);
2577 !!!parse-error (type => (sprintf 'invalid character reference:U+%04X', $code), line => $l, column => $c);
2578 $code = 0xFFFD;
2579 } elsif ($code > 0x10FFFF) {
2580 !!!cp (1009);
2581 !!!parse-error (type => (sprintf 'invalid character reference:U-%08X', $code), line => $l, column => $c);
2582 $code = 0xFFFD;
2583 } elsif ($code == 0x000D) {
2584 !!!cp (1010);
2585 !!!parse-error (type => 'CR character reference', line => $l, column => $c);
2586 $code = 0x000A;
2587 } elsif (0x80 <= $code and $code <= 0x9F) {
2588 !!!cp (1011);
2589 !!!parse-error (type => (sprintf 'C1 character reference:U+%04X', $code), line => $l, column => $c);
2590 $code = $c1_entity_char->{$code};
2591 }
2592
2593 return {type => CHARACTER_TOKEN, data => chr $code,
2594 has_reference => 1,
2595 line => $l, column => $c,
2596 };
2597 } # X
2598 } elsif (0x0030 <= $self->{next_char} and
2599 $self->{next_char} <= 0x0039) { # 0..9
## Decimal form: accumulate every consecutive digit.
2600 my $code = $self->{next_char} - 0x0030;
2601 !!!next-input-character;
2602
2603 while (0x0030 <= $self->{next_char} and
2604 $self->{next_char} <= 0x0039) { # 0..9
2605 !!!cp (1012);
2606 $code *= 10;
2607 $code += $self->{next_char} - 0x0030;
2608
2609 !!!next-input-character;
2610 }
2611
2612 if ($self->{next_char} == 0x003B) { # ;
2613 !!!cp (1013);
2614 !!!next-input-character;
2615 } else {
2616 !!!cp (1014);
2617 !!!parse-error (type => 'no refc', line => $l, column => $c);
2618 }
2619
## Same code point replacements as in the hexadecimal branch above.
2620 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2621 !!!cp (1015);
2622 !!!parse-error (type => (sprintf 'invalid character reference:U+%04X', $code), line => $l, column => $c);
2623 $code = 0xFFFD;
2624 } elsif ($code > 0x10FFFF) {
2625 !!!cp (1016);
2626 !!!parse-error (type => (sprintf 'invalid character reference:U-%08X', $code), line => $l, column => $c);
2627 $code = 0xFFFD;
2628 } elsif ($code == 0x000D) {
2629 !!!cp (1017);
2630 !!!parse-error (type => 'CR character reference', line => $l, column => $c);
2631 $code = 0x000A;
2632 } elsif (0x80 <= $code and $code <= 0x9F) {
2633 !!!cp (1018);
2634 !!!parse-error (type => (sprintf 'C1 character reference:U+%04X', $code), line => $l, column => $c);
2635 $code = $c1_entity_char->{$code};
2636 }
2637
2638 return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1,
2639 line => $l, column => $c,
2640 };
2641 } else {
## "&#" followed by neither "x"/"X" nor a digit: report it, give the
## character back and make "#" the current character again.
2642 !!!cp (1019);
2643 !!!parse-error (type => 'bare nero', line => $l, column => $c);
2644 !!!back-next-input-character ($self->{next_char});
2645 $self->{next_char} = 0x0023; # #
2646 return undef;
2647 }
2648 } elsif ((0x0041 <= $self->{next_char} and
2649 $self->{next_char} <= 0x005A) or
2650 (0x0061 <= $self->{next_char} and
2651 $self->{next_char} <= 0x007A)) {
## Named character reference: starts with an ASCII letter.
2652 my $entity_name = chr $self->{next_char};
2653 !!!next-input-character;
2654
## |$value| is the text to emit: the raw characters read so far, or the
## replacement text of the latest matched name followed by any characters
## read after it.
## |$match|: 0 = nothing matched yet, 1 = matched a name ending in ";",
## -1 = matched a name without ";", below -1 = characters were read after
## a ";"-less match (see |$match *= 2| below).
2655 my $value = $entity_name;
2656 my $match = 0;
## NOTE(review): presumably this module fills the package variable
## |$EntityChar| (name => replacement text) - confirm.
2657 require Whatpm::_NamedEntityList;
2658 our $EntityChar;
2659
2660 while (length $entity_name < 30 and
2661 ## NOTE: Some number greater than the maximum length of entity name
2662 ((0x0041 <= $self->{next_char} and # A
2663 $self->{next_char} <= 0x005A) or # Z
2664 (0x0061 <= $self->{next_char} and # a
2665 $self->{next_char} <= 0x007A) or # z
2666 (0x0030 <= $self->{next_char} and # 0
2667 $self->{next_char} <= 0x0039) or # 9
2668 $self->{next_char} == 0x003B)) { # ;
2669 $entity_name .= chr $self->{next_char};
2670 if (defined $EntityChar->{$entity_name}) {
2671 if ($self->{next_char} == 0x003B) { # ;
2672 !!!cp (1020);
2673 $value = $EntityChar->{$entity_name};
2674 $match = 1;
2675 !!!next-input-character;
2676 last;
2677 } else {
## Matched without ";": remember it but keep reading, a longer name may
## still match.
2678 !!!cp (1021);
2679 $value = $EntityChar->{$entity_name};
2680 $match = -1;
2681 !!!next-input-character;
2682 }
2683 } else {
2684 !!!cp (1022);
2685 $value .= chr $self->{next_char};
## Leaves 0 at 0, but turns -1 into -2, -4, ... so that "something
## followed a ';'-less match" can be told apart from a plain -1.
2686 $match *= 2;
2687 !!!next-input-character;
2688 }
2689 }
2690
2691 if ($match > 0) {
2692 !!!cp (1023);
2693 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
2694 line => $l, column => $c,
2695 };
2696 } elsif ($match < 0) {
2697 !!!parse-error (type => 'no refc', line => $l, column => $c);
2698 if ($in_attr and $match < -1) {
## With |$in_attr| set, a ";"-less match followed by more name
## characters is returned as literal text (no |has_reference|).
2699 !!!cp (1024);
2700 return {type => CHARACTER_TOKEN, data => '&'.$entity_name,
2701 line => $l, column => $c,
2702 };
2703 } else {
2704 !!!cp (1025);
2705 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
2706 line => $l, column => $c,
2707 };
2708 }
2709 } else {
## No known name matched: return everything that was read as literal
## text, with the "&" put back in front.
2710 !!!cp (1026);
2711 !!!parse-error (type => 'bare ero', line => $l, column => $c);
2712 ## NOTE: "No characters are consumed" in the spec.
2713 return {type => CHARACTER_TOKEN, data => '&'.$value,
2714 line => $l, column => $c,
2715 };
2716 }
2717 } else {
2718 !!!cp (1027);
2719 ## no characters are consumed
2720 !!!parse-error (type => 'bare ero', line => $l, column => $c);
2721 return undef;
2722 }
2723 } # _tokenize_attempt_to_consume_an_entity
2724
## Prepare |$self->{document}| for the tree construction stage: strict
## error checking is switched off and the document is flagged as HTML.
## Returns whatever the final |manakai_is_html| call returns.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;

  ## NOTE: The caller MUST have set |$self->{document}| before calling
  ## this method.
  my $document = $self->{document};

  $document->strict_error_checking (0);

  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on

  $document->manakai_is_html (1); # MUST
} # _initialize_tree_constructor
2733
## Undo |_initialize_tree_constructor|: switch strict error checking of
## |$self->{document}| back on.  Returns whatever that call returns.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  ## TODO: Turn mutation events on

  $document->strict_error_checking (1);
} # _terminate_tree_constructor
2739
2740 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
2741
2742 { # tree construction stage
2743 my $token;
2744
## Entry point of the tree construction stage.  Fetches the first token,
## resets the per-parse state kept on |$self| and then runs the
## insertion-mode handlers in order: "initial", "before html", and the
## main handler starting from "before head".
## Returns whatever |_tree_construction_main| returns.
sub _construct_tree ($) {
  my $self = shift;

  ## When an interactive UA render the $self->{document} available
  ## to the user, or when it begin accepting user input, are
  ## not defined.

  ## Append a character: collect it and all subsequent consecutive
  ## characters and insert one Text node whose data is concatenation
  ## of all those characters. # MUST

  !!!next-token;

  ## Reset state left over from any previous run.
  $self->{$_} = undef
      for qw(form_element head_element inner_html_node);
  $self->{open_elements} = [];

  ## NOTE: The "initial" insertion mode.
  $self->_tree_construction_initial; # MUST

  ## NOTE: The "before html" insertion mode.
  $self->_tree_construction_root_element;

  ## NOTE: The "before head" insertion mode and so on.
  $self->{insertion_mode} = BEFORE_HEAD_IM;
  $self->_tree_construction_main;
} # _construct_tree
2773
## Handle the "initial" insertion mode of the tree construction stage.
##
## Works on the file-scoped current |$token|:
##   - DOCTYPE token: reports "not HTML5" where applicable, appends a
##     document type node to |$self->{document}|, selects the compat mode
##     ("quirks" / "limited quirks" / unchanged) from the quirks flag, the
##     name, the public identifier and the system identifier, then fetches
##     the next token and returns.
##   - start tag, end tag, EOF, or character data that is not entirely
##     white space: reports "no DOCTYPE", selects "quirks" mode and returns
##     with the token left for reprocessing.
##   - white-space-only character data: dropped.
##   - comment: appended to the document.
## Dies on an unknown token type.
sub _tree_construction_initial ($) {
  my $self = shift;

  ## NOTE: "initial" insertion mode

  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!cp ('t1');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif ($doctype_name ne 'HTML') {
        !!!cp ('t2');
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5', token => $token);
      } else {
        !!!cp ('t3');
      }

      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      ## NOTE: Default value for both |public_id| and |system_id| attributes
      ## are empty strings, so that we don't set any value in missing cases.
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if ($token->{quirks} or $doctype_name ne 'HTML') {
        !!!cp ('t4');
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## The public identifier is compared in upper case.  The replacement
        ## range has to be |A-Z|: a longer range such as |A-z| maps the same
        ## 26 letters but leaves meaningless extra characters in the
        ## replacement list.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        ## NOTE(review): "EXPERIMETNAL" below is spelled as in the original
        ## list; confirm against the specification before "correcting" it.
        if ({
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
          "-//IETF//DTD HTML 2.0//EN" => 1,
          "-//IETF//DTD HTML 2.1E//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN//" => 1,
          "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
          "-//IETF//DTD HTML 3.2//EN" => 1,
          "-//IETF//DTD HTML 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT//EN" => 1,
          "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
          "-//IETF//DTD HTML//EN" => 1,
          "-//IETF//DTD HTML//EN//2.0" => 1,
          "-//IETF//DTD HTML//EN//3.0" => 1,
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//EN" => 1,
          "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//EN" => 1,
          "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//EN" => 1,
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
          "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
          "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
          "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
          "-//W3C//DTD HTML 3.2//EN" => 1,
          "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
          "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
          "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
          "-//W3C//DTD W3 HTML//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
          "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
          "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
          "HTML" => 1,
        }->{$pubid}) {
          !!!cp ('t5');
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
          ## HTML 4.01 Frameset/Transitional: "quirks" when a system
          ## identifier is present, "limited quirks" otherwise.
          if (defined $token->{system_identifier}) {
            !!!cp ('t6');
            $self->{document}->manakai_compat_mode ('quirks');
          } else {
            !!!cp ('t7');
            $self->{document}->manakai_compat_mode ('limited quirks');
          }
        } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
          !!!cp ('t8');
          $self->{document}->manakai_compat_mode ('limited quirks');
        } else {
          !!!cp ('t9');
        }
      } else {
        !!!cp ('t10');
      }
      ## The system identifier is checked last and compared in lower case;
      ## a match overrides whatever mode was selected above.
      if (defined $token->{system_identifier}) {
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          ## TODO: Check the spec: PUBLIC "(limited quirks)" "(quirks)"
          $self->{document}->manakai_compat_mode ('quirks');
          !!!cp ('t11');
        } else {
          !!!cp ('t12');
        }
      } else {
        !!!cp ('t13');
      }

      ## Go to the "before html" insertion mode.
      !!!next-token;
      return;
    } elsif ({
      ## Plain commas (not |=>|) so that the constants are evaluated
      ## rather than quoted as strings.
      START_TAG_TOKEN, 1,
      END_TAG_TOKEN, 1,
      END_OF_FILE_TOKEN, 1,
    }->{$token->{type}}) {
      !!!cp ('t14');
      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      !!!ack-later;
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          !!!cp ('t15');
          ## Stay in the insertion mode.
          !!!next-token;
          redo INITIAL;
        } else {
          !!!cp ('t16');
        }
      } else {
        !!!cp ('t17');
      }

      ## Non-white-space text remains (leading white space was stripped
      ## above): treated like a missing DOCTYPE.
      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t18');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the insertion mode.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL

  die "$0: _tree_construction_initial: This should be never reached";
} # _tree_construction_initial
2978
## Tree construction stage for the "before html" insertion mode.
##
## Works on the current token held in |$token| (declared outside this
## sub).  DOCTYPE tokens are reported as parse errors and dropped,
## comments are appended to the document, and leading white space in
## character tokens is discarded.  The sub returns once an |html| root
## element has been appended to the document and pushed onto
## |$self->{open_elements}|: either created from an explicit |<html>|
## start tag (which is consumed), or implied by any other token (which
## is left in |$token| for reprocessing).
##
## |$self->{application_cache_selection}| is invoked exactly once per
## root element: with the value of the |manifest| attribute when the
## |<html>| start tag carries one, and with |undef| otherwise.
sub _tree_construction_root_element ($) {
  my $self = shift;

  ## NOTE: "before html" insertion mode.

  B: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      !!!cp ('t19');
      !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
      ## Ignore the token
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t20');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token.

        unless (length $token->{data}) {
          !!!cp ('t21');
          ## Stay in the insertion mode.
          !!!next-token;
          redo B;
        } else {
          !!!cp ('t22');
        }
      } else {
        !!!cp ('t23');
      }

      ## NOTE: The application cache selection callback is invoked
      ## below, after the implied |html| element has been created.
      ## It must not be invoked here as well, or it would run twice
      ## for a document that begins with character data.

      #
    } elsif ($token->{type} == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html') {
        ## An explicit |<html>| start tag: the root element takes the
        ## attributes of the token.
        my $root_element;
        !!!create-element ($root_element, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}},
            [$root_element, $el_category->{html}];

        if ($token->{attributes}->{manifest}) {
          !!!cp ('t24');
          $self->{application_cache_selection}
              ->($token->{attributes}->{manifest}->{value});
          ## ISSUE: Spec is unclear on relative references.
          ## According to Hixie (#whatwg 2008-03-19), it should be
          ## resolved against the base URI of the document in HTML
          ## or xml:base of the element in XHTML.
        } else {
          !!!cp ('t25');
          $self->{application_cache_selection}->(undef);
        }

        !!!nack ('t25c');

        !!!next-token;
        return; ## Go to the "before head" insertion mode.
      } else {
        !!!cp ('t25.1');
        #
      }
    } elsif ({
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t26');
      #
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## Any other token implies the |html| element: create it without
    ## attributes and leave the token for reprocessing.
    my $root_element;
    !!!create-element ($root_element, $HTML_NS, 'html',, $token);
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, $el_category->{html}];

    $self->{application_cache_selection}->(undef);

    ## NOTE: Reprocess the token.
    !!!ack-later;
    return; ## Go to the "before head" insertion mode.

    ## ISSUE: There is an issue in the spec
  } # B

  die "$0: _tree_construction_root_element: This should never be reached";
} # _tree_construction_root_element
3073
## Resets |$self->{insertion_mode}| according to the stack of open
## elements.
##
## The stack (|$self->{open_elements}|) is walked from its last entry
## toward its first.  The first entry whose category or local name
## selects an insertion mode decides the result; if the first entry
## of the stack is reached without a match, the mode becomes
## |IN_BODY_IM|.  When |$self->{inner_html_node}| is defined and is
## not a table cell, it is examined in place of the first entry of
## the stack.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        if ($self->{inner_html_node}->[1] & TABLE_CELL_EL) {
          !!!cp ('t27');
          #
        } else {
          !!!cp ('t28');
          $node = $self->{inner_html_node};
        }
      }
    }

    ## Step 4..14
    my $new_mode;
    if ($node->[1] & FOREIGN_EL) {
      ## NOTE: Strictly speaking, the line below only applies to MathML and
      ## SVG elements.  Currently the HTML syntax supports only MathML and
      ## SVG elements as foreigners.
      $new_mode = $self->{insertion_mode} | IN_FOREIGN_CONTENT_IM;
      ## ISSUE: What is set as the secondary insertion mode?
    } else {
      $new_mode = {
        select => IN_SELECT_IM,
        ## NOTE: |option| and |optgroup| do not set
        ## insertion mode to "in select" by themselves.
        td => IN_CELL_IM,
        th => IN_CELL_IM,
        tr => IN_ROW_IM,
        tbody => IN_TABLE_BODY_IM,
        thead => IN_TABLE_BODY_IM,
        tfoot => IN_TABLE_BODY_IM,
        caption => IN_CAPTION_IM,
        colgroup => IN_COLUMN_GROUP_IM,
        table => IN_TABLE_IM,
        head => IN_BODY_IM, # not in head!
        body => IN_BODY_IM,
        frameset => IN_FRAMESET_IM,
      }->{$node->[0]->manakai_local_name};
    }
    ## NOTE: Assign and return as two separate statements, so that
    ## the return does not depend on the mode value being true.
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 15
    if ($node->[1] & HTML_EL) {
      unless (defined $self->{head_element}) {
        !!!cp ('t29');
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        ## ISSUE: Can this state be reached?
        !!!cp ('t30');
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    } else {
      !!!cp ('t31');
    }

    ## Step 16
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 17
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 18
    redo S3;
  } # S3

  die "$0: _reset_insertion_mode: This line should never be reached";
} # _reset_insertion_mode
3156
3157 sub _tree_construction_main ($) {
3158 my $self = shift;
3159
3160 my $active_formatting_elements = [];
3161
## Reconstructs the active formatting elements.
##
## Takes one argument: a code reference that is called with each
## newly cloned element node and is expected to insert it into the
## tree.  Starting from the earliest trailing entry of
## |$active_formatting_elements| that is neither a marker nor in the
## stack of open elements, each entry up to the end of the list is
## cloned, inserted through the callback, pushed onto
## |$self->{open_elements}| and replaced in the list by its clone.
my $reconstruct_active_formatting_elements = sub { # MUST
  my $inserter = shift;

  ## Step 1
  return if not @$active_formatting_elements;

  ## Step 3
  my $pos = -1;
  my $current = $active_formatting_elements->[$pos];

  ## Step 2
  return if $current->[0] eq '#marker';
  for my $open (@{$self->{open_elements}}) {
    if ($current->[0] eq $open->[0]) {
      !!!cp ('t32');
      return;
    }
  }

  S4: {
    ## Step 4
    last S4 if $active_formatting_elements->[0]->[0] eq $current->[0];

    ## Step 5
    $pos--;
    $current = $active_formatting_elements->[$pos];

    ## Step 6
    if ($current->[0] eq '#marker') {
      !!!cp ('t33_1');
      #
    } else {
      my $is_open = 0;
      OE: for my $open (@{$self->{open_elements}}) {
        if ($current->[0] eq $open->[0]) {
          !!!cp ('t33');
          $is_open = 1;
          last OE;
        }
      }
      unless ($is_open) {
        ## NOTE: <!DOCTYPE HTML><p><b><i><u></p> <p>X
        !!!cp ('t35');
        redo S4;
      }
      !!!cp ('t34');
      #
    }

    ## Step 7
    $pos++;
    $current = $active_formatting_elements->[$pos];
  } # S4

  S7: {
    ## Step 8
    my $copy = [$current->[0]->clone_node (0), $current->[1]];

    ## Step 9
    $inserter->($copy->[0]);
    push @{$self->{open_elements}}, $copy;

    ## Step 10
    $active_formatting_elements->[$pos] = $self->{open_elements}->[-1];

    ## Step 11
    if ($copy->[0] eq $active_formatting_elements->[-1]->[0]) {
      ## The clone is now the last entry of the list: done.
      !!!cp ('t37');
    } else {
      !!!cp ('t36');
      ## Step 7'
      $pos++;
      $current = $active_formatting_elements->[$pos];

      redo S7;
    }
  } # S7
}; # $reconstruct_active_formatting_elements
3241
## Clears the list of active formatting elements up to the last
## marker: entries are removed from the end of the list up to and
## including the most recently added |#marker| entry.  If the list
## contains no marker at all, every entry is removed.
my $clear_up_to_marker = sub {
  for (reverse 0..$#$active_formatting_elements) {
    if ($active_formatting_elements->[$_]->[0] eq '#marker') {
      !!!cp ('t38');
      ## Drop the marker and everything added after it.
      splice @$active_formatting_elements, $_;
      return;
    }
  }

  ## No marker in the list.  The algorithm removes entries one by one
  ## and stops only at a marker, so the list ends up empty.
  !!!cp ('t39');
  @$active_formatting_elements = ();
  return;
}; # $clear_up_to_marker
3253
3254 my $insert;
3255
## Generic CDATA/RCDATA element parsing.
##
## Takes the content model flag to tokenize the element content with
## (|CDATA_CONTENT_MODEL| or |RCDATA_CONTENT_MODEL|; anything else
## dies when the element is not properly closed).  An element is
## created from the current start tag token and inserted through
## |$insert|; the following character tokens are collected into a
## single text node child.  A matching end tag is consumed silently,
## any other token is reported as a parse error and consumed as well.
my $parse_rcdata = sub ($) {
  my $content_model_flag = shift;

  ## Step 1
  my $opening_tag_name = $token->{tag_name};
  my $element;
  !!!create-element ($element, $HTML_NS, $opening_tag_name, $token->{attributes}, $token);

  ## Step 2
  $insert->($element);

  ## Step 3
  $self->{content_model} = $content_model_flag; # CDATA or RCDATA
  delete $self->{escape}; # MUST

  ## Step 4
  my $content = '';
  !!!nack ('t40.1');
  !!!next-token;
  until ($token->{type} != CHARACTER_TOKEN) { # or until stop tokenizing
    !!!cp ('t40');
    $content .= $token->{data};
    !!!next-token;
  }

  ## Step 5
  if ($content ne '') {
    !!!cp ('t41');
    $element->append_child ($self->{document}->create_text_node ($content));
  }

  ## Step 6
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## Step 7
  my $properly_closed = ($token->{type} == END_TAG_TOKEN and
                         $token->{tag_name} eq $opening_tag_name);
  if ($properly_closed) {
    !!!cp ('t42');
    ## Ignore the token
  } elsif ($content_model_flag == CDATA_CONTENT_MODEL) {
    ## NOTE: An end-of-file token.
    !!!cp ('t43');
    !!!parse-error (type => 'in CDATA:#'.$token->{type}, token => $token);
  } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
    ## NOTE: An end-of-file token.
    !!!cp ('t44');
    !!!parse-error (type => 'in RCDATA:#'.$token->{type}, token => $token);
  } else {
    die "$0: $content_model_flag in parse_rcdata";
  }
  !!!next-token;
}; # $parse_rcdata
3310
## Handles a |script| start tag.
##
## A |script| element is created from the current token, the
## following character tokens are tokenized in the CDATA content
## model and appended to it as text, and the terminating token is
## consumed (with a parse error unless it is a |</script>| end tag).
## The element is inserted through |$insert| only when
## |$self->{inner_html_node}| is not defined.
my $script_start_tag = sub () {
  my $script_node;
  !!!create-element ($script_node, $HTML_NS, 'script', $token->{attributes}, $token);
  ## TODO: mark as "parser-inserted"

  $self->{content_model} = CDATA_CONTENT_MODEL;
  delete $self->{escape}; # MUST

  my $source = '';
  !!!nack ('t45.1');
  !!!next-token;
  ## Stop at the first non-character token (or when the tokenizer
  ## stops tokenising).
  until ($token->{type} != CHARACTER_TOKEN) {
    !!!cp ('t45');
    $source .= $token->{data};
    !!!next-token;
  }
  if ($source ne '') {
    !!!cp ('t46');
    $script_node->manakai_append_text ($source);
  }

  $self->{content_model} = PCDATA_CONTENT_MODEL;

  unless ($token->{type} == END_TAG_TOKEN and
          $token->{tag_name} eq 'script') {
    !!!cp ('t48');
    !!!parse-error (type => 'in CDATA:#'.$token->{type}, token => $token);
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  } else {
    !!!cp ('t47');
    ## Ignore the token
  }

  if (not defined $self->{inner_html_node}) {
    !!!cp ('t50');
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    $insert->($script_node);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  } else {
    !!!cp ('t49');
    ## TODO: mark as "already executed"
  }

  !!!next-token;
}; # $script_start_tag
3362
3363 ## NOTE: $open_tables->[-1]->[0] is the "current table" element node.
3364 ## NOTE: $open_tables->[-1]->[1] is the "tainted" flag.
3365 my $open_tables = [[$self->{open_elements}->[0]->[0]]];
3366
## Handles an end tag of a formatting element by the adoption agency
## algorithm (AAA).
##
## Takes the end tag token.  The formatting element is the last entry
## of |$active_formatting_elements| after the last marker whose local
## name equals the tag name of the token.  When there is no such
## entry, or the entry is not in the stack of open elements, or is in
## the stack but not in scope, a parse error is reported, the next
## token is fetched and the sub returns.  Otherwise the tree, the
## stack of open elements and the list of active formatting elements
## are rearranged around the "furthest block", and the whole
## algorithm is repeated until no furthest block is found.
my $formatting_end_tag = sub {
  my $end_tag_token = shift;
  my $tag_name = $end_tag_token->{tag_name};

  ## NOTE: The adoption agency algorithm (AAA).

  FET: {
    ## Step 1
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq '#marker') {
        !!!cp ('t52');
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0]->manakai_local_name
                   eq $tag_name) {
        !!!cp ('t51');
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!cp ('t53');
      !!!parse-error (type => 'unmatched end tag:'.$tag_name, token => $end_tag_token);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          !!!cp ('t54');
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!cp ('t55');
          !!!parse-error (type => 'unmatched end tag:'.$tag_name,
                          token => $end_tag_token);
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ($node->[1] & SCOPING_EL) {
        !!!cp ('t56');
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!cp ('t57');
      !!!parse-error (type => 'unmatched end tag:'.$tag_name,
                      token => $end_tag_token);
      ## Remove the formatting element itself from the list.  It is
      ## not necessarily the last entry (e.g. |<p><b><i></p></b>|),
      ## so it has to be removed by its index.
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!cp ('t58');
      !!!parse-error (type => 'not closed',
                      value => $self->{open_elements}->[-1]->[0]
                          ->manakai_local_name,
                      token => $end_tag_token);
    }

    ## Step 2
    ## The stack is scanned from its end toward the formatting
    ## element, so the last candidate seen is the one closest to the
    ## formatting element.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not ($node->[1] & FORMATTING_EL) and
          #not $phrasing_category->{$node->[1]} and
          ($node->[1] & SPECIAL_EL or
           $node->[1] & SCOPING_EL)) { ## Scoping is redundant, maybe
        !!!cp ('t59');
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        !!!cp ('t60');
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      !!!cp ('t61');
      ## Pop the formatting element and everything after it from the
      ## stack, and remove it from the list.
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      !!!cp ('t62');
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
        ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            !!!cp ('t63');
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        ## Not in the list of active formatting elements: drop the
        ## node from the stack and continue with the previous one.
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        !!!cp ('t64');
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      if ($node->[0]->has_child_nodes ()) {
        !!!cp ('t65');
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    if ($common_ancestor_node->[1] & TABLE_ROWS_EL) {
      ## Foster parenting: insert before the innermost open |table|
      ## when its parent is an element, otherwise append to the
      ## element before the |table| in the stack.
      my $foster_parent_element;
      my $next_sibling;
      OE: for (reverse 0..$#{$self->{open_elements}}) {
        if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
          my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
          if (defined $parent and $parent->node_type == 1) {
            !!!cp ('t65.1');
            $foster_parent_element = $parent;
            $next_sibling = $self->{open_elements}->[$_]->[0];
          } else {
            !!!cp ('t65.2');
            $foster_parent_element
                = $self->{open_elements}->[$_ - 1]->[0];
          }
          last OE;
        }
      } # OE
      $foster_parent_element = $self->{open_elements}->[0]->[0]
          unless defined $foster_parent_element;
      $foster_parent_element->insert_before ($last_node->[0], $next_sibling);
      $open_tables->[-1]->[1] = 1; # tainted
    } else {
      !!!cp ('t65.3');
      $common_ancestor_node->[0]->append_child ($last_node->[0]);
    }

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## The child list is copied first, as the children are moved
    ## while iterating.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t66');
        splice @$active_formatting_elements, $_, 1;
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        !!!cp ('t67');
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t68');
        splice @{$self->{open_elements}}, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        !!!cp ('t69');
        $i = $_;
      }
    } # OE
    ## NOTE(review): This |splice| replaces one entry following the
    ## furthest block (LENGTH 1), while Step 12 inserts (LENGTH 0).
    ## Confirm against the adoption agency algorithm whether an
    ## insertion was intended here as well.
    splice @{$self->{open_elements}}, $i + 1, 1, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
3599
## Default insertion callback: appends the given node to the current
## node, i.e. the last entry of the stack of open elements.
$insert = my $insert_to_current = sub {
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($_[0]);
}; # $insert_to_current
3603
## Insertion callback with foster parenting.
##
## Takes the node to insert.  Unless the current node is flagged
## |TABLE_ROWS_EL|, the node is simply appended to the current node.
## Otherwise it is inserted before the innermost open |table| element
## when that element has an element parent, or else appended to the
## entry before the |table| in the stack of open elements (to the
## first entry of the stack when no |table| is open), and the current
## table is marked as tainted.
my $insert_to_foster = sub {
  my $new_node = shift;
  my $stack = $self->{open_elements};
  if (not ($stack->[-1]->[1] & TABLE_ROWS_EL)) {
    !!!cp ('t72');
    $stack->[-1]->[0]->append_child ($new_node);
  } else {
    # MUST
    my ($foster_parent, $reference_node);
    OE: for my $depth (reverse 0..$#$stack) {
      next OE unless $stack->[$depth]->[1] & TABLE_EL;

      my $table_parent = $stack->[$depth]->[0]->parent_node;
      if (defined $table_parent and $table_parent->node_type == 1) {
        !!!cp ('t70');
        $foster_parent = $table_parent;
        $reference_node = $stack->[$depth]->[0];
      } else {
        !!!cp ('t71');
        $foster_parent = $stack->[$depth - 1]->[0];
      }
      last OE;
    } # OE
    $foster_parent = $stack->[0]->[0] unless defined $foster_parent;
    $foster_parent->insert_before ($new_node, $reference_node);
    $open_tables->[-1]->[1] = 1; # tainted
  }
}; # $insert_to_foster
3635
3636 B: while (1) {
3637 if ($token->{type} == DOCTYPE_TOKEN) {
3638 !!!cp ('t73');
3639 !!!parse-error (type => 'DOCTYPE in the middle', token => $token);
3640 ## Ignore the token
3641 ## Stay in the phase
3642 !!!next-token;
3643 next B;
3644 } elsif ($token->{type} == START_TAG_TOKEN and
3645 $token->{tag_name} eq 'html') {
3646 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
3647 !!!cp ('t79');
3648 !!!parse-error (type => 'after html:html', token => $token);
3649 $self->{insertion_mode} = AFTER_BODY_IM;
3650 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
3651 !!!cp ('t80');
3652 !!!parse-error (type => 'after html:html', token => $token);
3653 $self->{insertion_mode} = AFTER_FRAMESET_IM;
3654 } else {
3655 !!!cp ('t81');
3656 }
3657
3658 !!!cp ('t82');
3659 !!!parse-error (type => 'not first start tag', token => $token);
3660 my $top_el = $self->{open_elements}->[0]->[0];
3661 for my $attr_name (keys %{$token->{attributes}}) {
3662 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
3663 !!!cp ('t84');
3664 $top_el->set_attribute_ns
3665 (undef, [undef, $attr_name],
3666 $token->{attributes}->{$attr_name}->{value});
3667 }
3668 }
3669 !!!nack ('t84.1');
3670 !!!next-token;
3671 next B;
3672 } elsif ($token->{type} == COMMENT_TOKEN) {
3673 my $comment = $self->{document}->create_comment ($token->{data});
3674 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
3675 !!!cp ('t85');
3676 $self->{document}->append_child ($comment);
3677 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
3678 !!!cp ('t86');
3679 $self->{open_elements}->[0]->[0]->append_child ($comment);
3680 } else {
3681 !!!cp ('t87');
3682 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3683 }
3684 !!!next-token;
3685 next B;
3686 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
3687 if ($token->{type} == CHARACTER_TOKEN) {
3688 !!!cp ('t87.1');
3689 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3690 !!!next-token;
3691 next B;
3692 } elsif ($token->{type} == START_TAG_TOKEN) {
3693 if ($self->{open_elements}->[-1]->[1] & FOREIGN_FLOW_CONTENT_EL or
3694 not ($self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
3695 ($token->{tag_name} eq 'svg' and
3696 $self->{open_elements}->[-1]->[1] & MML_AXML_EL)) {
3697 ## NOTE: "using the rules for secondary insertion mode"then"continue"
3698 !!!cp ('t87.2');
3699 #
3700 } elsif ({
3701 ## TODO:
3702 }->{$token->{tag_name}}) {
3703 !!!cp ('t87.2');
3704 !!!parse-error (type => 'not closed',
3705 value => $self->{open_elements}->[-1]->[0]
3706 ->manakai_local_name,
3707 token => $token);
3708
3709 pop @{$self->{open_elements}}
3710 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
3711
3712 $self->{insertion_mode} &= ~ $self->{insertion_mode};
3713 ## Reprocess.
3714 next B;
3715 } else {
3716 ## TODO: case fixup
3717
3718 !!!insert-element-f ($self->{open_elements}->[-1]->[0]->namespace_uri, $token);
3719
3720 if ($self->{self_closing}) {
3721 pop @{$self->{open_elements}};
3722 !!!ack ('t87.3');
3723 } else {
3724 !!!cp ('t87.4');
3725 }
3726
3727 !!!next-token;
3728 next B;
3729 }
3730 } elsif ($token->{type} == END_TAG_TOKEN) {
3731 ## NOTE: "using the rules for secondary insertion mode" then "continue"
3732 !!!cp ('t87.5');
3733 #
3734 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
3735 ## NOTE: "using the rules for secondary insertion mode" then "continue"
3736 !!!cp ('t87.6');
3737 #
3738 ## TODO: ...
3739 } else {
3740 die "$0: $token->{type}: Unknown token type";
3741 }
3742 }
3743
3744 if ($self->{insertion_mode} & HEAD_IMS) {
3745 if ($token->{type} == CHARACTER_TOKEN) {
3746 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3747 unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3748 !!!cp ('t88.2');
3749 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3750 } else {
3751 !!!cp ('t88.1');
3752 ## Ignore the token.
3753 !!!next-token;
3754 next B;
3755 }
3756 unless (length $token->{data}) {
3757 !!!cp ('t88');
3758 !!!next-token;
3759 next B;
3760 }
3761 }
3762
3763 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3764 !!!cp ('t89');
3765 ## As if <head>
3766 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
3767 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3768 push @{$self->{open_elements}},
3769 [$self->{head_element}, $el_category->{head}];
3770
3771 ## Reprocess in the "in head" insertion mode...
3772 pop @{$self->{open_elements}};
3773
3774 ## Reprocess in the "after head" insertion mode...
3775 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3776 !!!cp ('t90');
3777 ## As if </noscript>
3778 pop @{$self->{open_elements}};
3779 !!!parse-error (type => 'in noscript:#character', token => $token);
3780
3781 ## Reprocess in the "in head" insertion mode...
3782 ## As if </head>
3783 pop @{$self->{open_elements}};
3784
3785 ## Reprocess in the "after head" insertion mode...
3786 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3787 !!!cp ('t91');
3788 pop @{$self->{open_elements}};
3789
3790 ## Reprocess in the "after head" insertion mode...
3791 } else {
3792 !!!cp ('t92');
3793 }
3794
3795 ## "after head" insertion mode
3796 ## As if <body>
3797 !!!insert-element ('body',, $token);
3798 $self->{insertion_mode} = IN_BODY_IM;
3799 ## reprocess
3800 next B;
3801 } elsif ($token->{type} == START_TAG_TOKEN) {
3802 if ($token->{tag_name} eq 'head') {
3803 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3804 !!!cp ('t93');
3805 !!!create-element ($self->{head_element}, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
3806 $self->{open_elements}->[-1]->[0]->append_child
3807 ($self->{head_element});
3808 push @{$self->{open_elements}},
3809 [$self->{head_element}, $el_category->{head}];
3810 $self->{insertion_mode} = IN_HEAD_IM;
3811 !!!nack ('t93.1');
3812 !!!next-token;
3813 next B;
3814 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3815 !!!cp ('t94');
3816 #
3817 } else {
3818 !!!cp ('t95');
3819 !!!parse-error (type => 'in head:head', token => $token); # or in head noscript
3820 ## Ignore the token
3821 !!!nack ('t95.1');
3822 !!!next-token;
3823 next B;
3824 }
3825 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3826 !!!cp ('t96');
3827 ## As if <head>
3828 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
3829 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3830 push @{$self->{open_elements}},
3831 [$self->{head_element}, $el_category->{head}];
3832
3833 $self->{insertion_mode} = IN_HEAD_IM;
3834 ## Reprocess in the "in head" insertion mode...
3835 } else {
3836 !!!cp ('t97');
3837 }
3838
3839 if ($token->{tag_name} eq 'base') {
3840 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3841 !!!cp ('t98');
3842 ## As if </noscript>
3843 pop @{$self->{open_elements}};
3844 !!!parse-error (type => 'in noscript:base', token => $token);
3845
3846 $self->{insertion_mode} = IN_HEAD_IM;
3847 ## Reprocess in the "in head" insertion mode...
3848 } else {
3849 !!!cp ('t99');
3850 }
3851
3852 ## NOTE: There is a "as if in head" code clone.
3853 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3854 !!!cp ('t100');
3855 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
3856 push @{$self->{open_elements}},
3857 [$self->{head_element}, $el_category->{head}];
3858 } else {
3859 !!!cp ('t101');
3860 }
3861 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
3862 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3863 pop @{$self->{open_elements}} # <head>
3864 if $self->{insertion_mode} == AFTER_HEAD_IM;
3865 !!!nack ('t101.1');
3866 !!!next-token;
3867 next B;
3868 } elsif ($token->{tag_name} eq 'link') {
3869 ## NOTE: There is a "as if in head" code clone.
3870 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3871 !!!cp ('t102');
3872 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
3873 push @{$self->{open_elements}},
3874 [$self->{head_element}, $el_category->{head}];
3875 } else {
3876 !!!cp ('t103');
3877 }
3878 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
3879 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3880 pop @{$self->{open_elements}} # <head>
3881 if $self->{insertion_mode} == AFTER_HEAD_IM;
3882 !!!ack ('t103.1');
3883 !!!next-token;
3884 next B;
3885 } elsif ($token->{tag_name} eq 'meta') {
3886 ## NOTE: There is a "as if in head" code clone.
3887 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3888 !!!cp ('t104');
3889 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
3890 push @{$self->{open_elements}},
3891 [$self->{head_element}, $el_category->{head}];
3892 } else {
3893 !!!cp ('t105');
3894 }
3895 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
3896 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
3897
3898 unless ($self->{confident}) {
3899 if ($token->{attributes}->{charset}) { ## TODO: And if supported
3900 !!!cp ('t106');
3901 $self->{change_encoding}
3902 ->($self, $token->{attributes}->{charset}->{value},
3903 $token);
3904
3905 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
3906 ->set_user_data (manakai_has_reference =>
3907 $token->{attributes}->{charset}
3908 ->{has_reference});
3909 } elsif ($token->{attributes}->{content}) {
3910 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
3911 if ($token->{attributes}->{content}->{value}
3912 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
3913 [\x09-\x0D\x20]*=
3914 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
3915 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
3916 !!!cp ('t107');
3917 $self->{change_encoding}
3918 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
3919 $token);
3920 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
3921 ->set_user_data (manakai_has_reference =>
3922 $token->{attributes}->{content}
3923 ->{has_reference});
3924 } else {
3925 !!!cp ('t108');
3926 }
3927 }
3928 } else {
3929 if ($token->{attributes}->{charset}) {
3930 !!!cp ('t109');
3931 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
3932 ->set_user_data (manakai_has_reference =>
3933 $token->{attributes}->{charset}
3934 ->{has_reference});
3935 }
3936 if ($token->{attributes}->{content}) {
3937 !!!cp ('t110');
3938 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
3939 ->set_user_data (manakai_has_reference =>
3940 $token->{attributes}->{content}
3941 ->{has_reference});
3942 }
3943 }
3944
3945 pop @{$self->{open_elements}} # <head>
3946 if $self->{insertion_mode} == AFTER_HEAD_IM;
3947 !!!ack ('t110.1');
3948 !!!next-token;
3949 next B;
3950 } elsif ($token->{tag_name} eq 'title') {
3951 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3952 !!!cp ('t111');
3953 ## As if </noscript>
3954 pop @{$self->{open_elements}};
3955 !!!parse-error (type => 'in noscript:title', token => $token);
3956
3957 $self->{insertion_mode} = IN_HEAD_IM;
3958 ## Reprocess in the "in head" insertion mode...
3959 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3960 !!!cp ('t112');
3961 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
3962 push @{$self->{open_elements}},
3963 [$self->{head_element}, $el_category->{head}];
3964 } else {
3965 !!!cp ('t113');
3966 }
3967
3968 ## NOTE: There is a "as if in head" code clone.
3969 my $parent = defined $self->{head_element} ? $self->{head_element}
3970 : $self->{open_elements}->[-1]->[0];
3971 $parse_rcdata->(RCDATA_CONTENT_MODEL);
3972 pop @{$self->{open_elements}} # <head>
3973 if $self->{insertion_mode} == AFTER_HEAD_IM;
3974 next B;
3975 } elsif ($token->{tag_name} eq 'style') {
3976 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
3977 ## insertion mode IN_HEAD_IM)
3978 ## NOTE: There is a "as if in head" code clone.
3979 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3980 !!!cp ('t114');
3981 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
3982 push @{$self->{open_elements}},
3983 [$self->{head_element}, $el_category->{head}];
3984 } else {
3985 !!!cp ('t115');
3986 }
3987 $parse_rcdata->(CDATA_CONTENT_MODEL);
3988 pop @{$self->{open_elements}} # <head>
3989 if $self->{insertion_mode} == AFTER_HEAD_IM;
3990 next B;
3991 } elsif ($token->{tag_name} eq 'noscript') {
3992 if ($self->{insertion_mode} == IN_HEAD_IM) {
3993 !!!cp ('t116');
3994 ## NOTE: and scripting is disalbed
3995 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
3996 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
3997 !!!nack ('t116.1');
3998 !!!next-token;
3999 next B;
4000 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4001 !!!cp ('t117');
4002 !!!parse-error (type => 'in noscript:noscript', token => $token);
4003 ## Ignore the token
4004 !!!nack ('t117.1');
4005 !!!next-token;
4006 next B;
4007 } else {
4008 !!!cp ('t118');
4009 #
4010 }
4011 } elsif ($token->{tag_name} eq 'script') {
4012 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4013 !!!cp ('t119');
4014 ## As if </noscript>
4015 pop @{$self->{open_elements}};
4016 !!!parse-error (type => 'in noscript:script', token => $token);
4017
4018 $self->{insertion_mode} = IN_HEAD_IM;
4019 ## Reprocess in the "in head" insertion mode...
4020 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4021 !!!cp ('t120');
4022 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4023 push @{$self->{open_elements}},
4024 [$self->{head_element}, $el_category->{head}];
4025 } else {
4026 !!!cp ('t121');
4027 }
4028
4029 ## NOTE: There is a "as if in head" code clone.
4030 $script_start_tag->();
4031 pop @{$self->{open_elements}} # <head>
4032 if $self->{insertion_mode} == AFTER_HEAD_IM;
4033 next B;
4034 } elsif ($token->{tag_name} eq 'body' or
4035 $token->{tag_name} eq 'frameset') {
4036 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4037 !!!cp ('t122');
4038 ## As if </noscript>
4039 pop @{$self->{open_elements}};
4040 !!!parse-error (type => 'in noscript:'.$token->{tag_name}, token => $token);
4041
4042 ## Reprocess in the "in head" insertion mode...
4043 ## As if </head>
4044 pop @{$self->{open_elements}};
4045
4046 ## Reprocess in the "after head" insertion mode...
4047 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4048 !!!cp ('t124');
4049 pop @{$self->{open_elements}};
4050
4051 ## Reprocess in the "after head" insertion mode...
4052 } else {
4053 !!!cp ('t125');
4054 }
4055
4056 ## "after head" insertion mode
4057 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4058 if ($token->{tag_name} eq 'body') {
4059 !!!cp ('t126');
4060 $self->{insertion_mode} = IN_BODY_IM;
4061 } elsif ($token->{tag_name} eq 'frameset') {
4062 !!!cp ('t127');
4063 $self->{insertion_mode} = IN_FRAMESET_IM;
4064 } else {
4065 die "$0: tag name: $self->{tag_name}";
4066 }
4067 !!!nack ('t127.1');
4068 !!!next-token;
4069 next B;
4070 } else {
4071 !!!cp ('t128');
4072 #
4073 }
4074
4075 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4076 !!!cp ('t129');
4077 ## As if </noscript>
4078 pop @{$self->{open_elements}};
4079 !!!parse-error (type => 'in noscript:/'.$token->{tag_name}, token => $token);
4080
4081 ## Reprocess in the "in head" insertion mode...
4082 ## As if </head>
4083 pop @{$self->{open_elements}};
4084
4085 ## Reprocess in the "after head" insertion mode...
4086 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4087 !!!cp ('t130');
4088 ## As if </head>
4089 pop @{$self->{open_elements}};
4090
4091 ## Reprocess in the "after head" insertion mode...
4092 } else {
4093 !!!cp ('t131');
4094 }
4095
4096 ## "after head" insertion mode
4097 ## As if <body>
4098 !!!insert-element ('body',, $token);
4099 $self->{insertion_mode} = IN_BODY_IM;
4100 ## reprocess
4101 !!!ack-later;
4102 next B;
4103 } elsif ($token->{type} == END_TAG_TOKEN) {
4104 if ($token->{tag_name} eq 'head') {
4105 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4106 !!!cp ('t132');
4107 ## As if <head>
4108 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4109 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4110 push @{$self->{open_elements}},
4111 [$self->{head_element}, $el_category->{head}];
4112
4113 ## Reprocess in the "in head" insertion mode...
4114 pop @{$self->{open_elements}};
4115 $self->{insertion_mode} = AFTER_HEAD_IM;
4116 !!!next-token;
4117 next B;
4118 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4119 !!!cp ('t133');
4120 ## As if </noscript>
4121 pop @{$self->{open_elements}};
4122 !!!parse-error (type => 'in noscript:/head', token => $token);
4123
4124 ## Reprocess in the "in head" insertion mode...
4125 pop @{$self->{open_elements}};
4126 $self->{insertion_mode} = AFTER_HEAD_IM;
4127 !!!next-token;
4128 next B;
4129 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4130 !!!cp ('t134');
4131 pop @{$self->{open_elements}};
4132 $self->{insertion_mode} = AFTER_HEAD_IM;
4133 !!!next-token;
4134 next B;
4135 } else {
4136 !!!cp ('t135');
4137 #
4138 }
4139 } elsif ($token->{tag_name} eq 'noscript') {
4140 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4141 !!!cp ('t136');
4142 pop @{$self->{open_elements}};
4143 $self->{insertion_mode} = IN_HEAD_IM;
4144 !!!next-token;
4145 next B;
4146 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4147 !!!cp ('t137');
4148 !!!parse-error (type => 'unmatched end tag:noscript', token => $token);
4149 ## Ignore the token ## ISSUE: An issue in the spec.
4150 !!!next-token;
4151 next B;
4152 } else {
4153 !!!cp ('t138');
4154 #
4155 }
4156 } elsif ({
4157 body => 1, html => 1,
4158 }->{$token->{tag_name}}) {
4159 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4160 !!!cp ('t139');
4161 ## As if <head>
4162 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4163 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4164 push @{$self->{open_elements}},
4165 [$self->{head_element}, $el_category->{head}];
4166
4167 $self->{insertion_mode} = IN_HEAD_IM;
4168 ## Reprocess in the "in head" insertion mode...
4169 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4170 !!!cp ('t140');
4171 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4172 ## Ignore the token
4173 !!!next-token;
4174 next B;
4175 } else {
4176 !!!cp ('t141');
4177 }
4178
4179 #
4180 } elsif ({
4181 p => 1, br => 1,
4182 }->{$token->{tag_name}}) {
4183 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4184 !!!cp ('t142');
4185 ## As if <head>
4186 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4187 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4188 push @{$self->{open_elements}},
4189 [$self->{head_element}, $el_category->{head}];
4190
4191 $self->{insertion_mode} = IN_HEAD_IM;
4192 ## Reprocess in the "in head" insertion mode...
4193 } else {
4194 !!!cp ('t143');
4195 }
4196
4197 #
4198 } else {
4199 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4200 !!!cp ('t144');
4201 #
4202 } else {
4203 !!!cp ('t145');
4204 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4205 ## Ignore the token
4206 !!!next-token;
4207 next B;
4208 }
4209 }
4210
4211 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4212 !!!cp ('t146');
4213 ## As if </noscript>
4214 pop @{$self->{open_elements}};
4215 !!!parse-error (type => 'in noscript:/'.$token->{tag_name}, token => $token);
4216
4217 ## Reprocess in the "in head" insertion mode...
4218 ## As if </head>
4219 pop @{$self->{open_elements}};
4220
4221 ## Reprocess in the "after head" insertion mode...
4222 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4223 !!!cp ('t147');
4224 ## As if </head>
4225 pop @{$self->{open_elements}};
4226
4227 ## Reprocess in the "after head" insertion mode...
4228 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4229 ## ISSUE: This case cannot be reached?
4230 !!!cp ('t148');
4231 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4232 ## Ignore the token ## ISSUE: An issue in the spec.
4233 !!!next-token;
4234 next B;
4235 } else {
4236 !!!cp ('t149');
4237 }
4238
4239 ## "after head" insertion mode
4240 ## As if <body>
4241 !!!insert-element ('body',, $token);
4242 $self->{insertion_mode} = IN_BODY_IM;
4243 ## reprocess
4244 next B;
4245 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4246 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4247 !!!cp ('t149.1');
4248
4249 ## NOTE: As if <head>
4250 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4251 $self->{open_elements}->[-1]->[0]->append_child
4252 ($self->{head_element});
4253 #push @{$self->{open_elements}},
4254 # [$self->{head_element}, $el_category->{head}];
4255 #$self->{insertion_mode} = IN_HEAD_IM;
4256 ## NOTE: Reprocess.
4257
4258 ## NOTE: As if </head>
4259 #pop @{$self->{open_elements}};
4260 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4261 ## NOTE: Reprocess.
4262
4263 #
4264 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4265 !!!cp ('t149.2');
4266
4267 ## NOTE: As if </head>
4268 pop @{$self->{open_elements}};
4269 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4270 ## NOTE: Reprocess.
4271
4272 #
4273 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4274 !!!cp ('t149.3');
4275
4276 !!!parse-error (type => 'in noscript:#eof', token => $token);
4277
4278 ## As if </noscript>
4279 pop @{$self->{open_elements}};
4280 #$self->{insertion_mode} = IN_HEAD_IM;
4281 ## NOTE: Reprocess.
4282
4283 ## NOTE: As if </head>
4284 pop @{$self->{open_elements}};
4285 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4286 ## NOTE: Reprocess.
4287
4288 #
4289 } else {
4290 !!!cp ('t149.4');
4291 #
4292 }
4293
4294 ## NOTE: As if <body>
4295 !!!insert-element ('body',, $token);
4296 $self->{insertion_mode} = IN_BODY_IM;
4297 ## NOTE: Reprocess.
4298 next B;
4299 } else {
4300 die "$0: $token->{type}: Unknown token type";
4301 }
4302
4303 ## ISSUE: An issue in the spec.
4304 } elsif ($self->{insertion_mode} & BODY_IMS) {
4305 if ($token->{type} == CHARACTER_TOKEN) {
4306 !!!cp ('t150');
4307 ## NOTE: There is a code clone of "character in body".
4308 $reconstruct_active_formatting_elements->($insert_to_current);
4309
4310 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4311
4312 !!!next-token;
4313 next B;
4314 } elsif ($token->{type} == START_TAG_TOKEN) {
4315 if ({
4316 caption => 1, col => 1, colgroup => 1, tbody => 1,
4317 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
4318 }->{$token->{tag_name}}) {
4319 if ($self->{insertion_mode} == IN_CELL_IM) {
4320 ## have an element in table scope
4321 for (reverse 0..$#{$self->{open_elements}}) {
4322 my $node = $self->{open_elements}->[$_];
4323 if ($node->[1] & TABLE_CELL_EL) {
4324 !!!cp ('t151');
4325
4326 ## Close the cell
4327 !!!back-token; # <x>
4328 $token = {type => END_TAG_TOKEN,
4329 tag_name => $node->[0]->manakai_local_name,
4330 line => $token->{line},
4331 column => $token->{column}};
4332 next B;
4333 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4334 !!!cp ('t152');
4335 ## ISSUE: This case can never be reached, maybe.
4336 last;
4337 }
4338 }
4339
4340 !!!cp ('t153');
4341 !!!parse-error (type => 'start tag not allowed',
4342 value => $token->{tag_name}, token => $token);
4343 ## Ignore the token
4344 !!!nack ('t153.1');
4345 !!!next-token;
4346 next B;
4347 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4348 !!!parse-error (type => 'not closed:caption', token => $token);
4349
4350 ## NOTE: As if </caption>.
4351 ## have a table element in table scope
4352 my $i;
4353 INSCOPE: {
4354 for (reverse 0..$#{$self->{open_elements}}) {
4355 my $node = $self->{open_elements}->[$_];
4356 if ($node->[1] & CAPTION_EL) {
4357 !!!cp ('t155');
4358 $i = $_;
4359 last INSCOPE;
4360 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4361 !!!cp ('t156');
4362 last;
4363 }
4364 }
4365
4366 !!!cp ('t157');
4367 !!!parse-error (type => 'start tag not allowed',
4368 value => $token->{tag_name}, token => $token);
4369 ## Ignore the token
4370 !!!nack ('t157.1');
4371 !!!next-token;
4372 next B;
4373 } # INSCOPE
4374
4375 ## generate implied end tags
4376 while ($self->{open_elements}->[-1]->[1]
4377 & END_TAG_OPTIONAL_EL) {
4378 !!!cp ('t158');
4379 pop @{$self->{open_elements}};
4380 }
4381
4382 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4383 !!!cp ('t159');
4384 !!!parse-error (type => 'not closed',
4385 value => $self->{open_elements}->[-1]->[0]
4386 ->manakai_local_name,
4387 token => $token);
4388 } else {
4389 !!!cp ('t160');
4390 }
4391
4392 splice @{$self->{open_elements}}, $i;
4393
4394 $clear_up_to_marker->();
4395
4396 $self->{insertion_mode} = IN_TABLE_IM;
4397
4398 ## reprocess
4399 !!!ack-later;
4400 next B;
4401 } else {
4402 !!!cp ('t161');
4403 #
4404 }
4405 } else {
4406 !!!cp ('t162');
4407 #
4408 }
4409 } elsif ($token->{type} == END_TAG_TOKEN) {
4410 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
4411 if ($self->{insertion_mode} == IN_CELL_IM) {
4412 ## have an element in table scope
4413 my $i;
4414 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4415 my $node = $self->{open_elements}->[$_];
4416 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
4417 !!!cp ('t163');
4418 $i = $_;
4419 last INSCOPE;
4420 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4421 !!!cp ('t164');
4422 last INSCOPE;
4423 }
4424 } # INSCOPE
4425 unless (defined $i) {
4426 !!!cp ('t165');
4427 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4428 ## Ignore the token
4429 !!!next-token;
4430 next B;
4431 }
4432
4433 ## generate implied end tags
4434 while ($self->{open_elements}->[-1]->[1]
4435 & END_TAG_OPTIONAL_EL) {
4436 !!!cp ('t166');
4437 pop @{$self->{open_elements}};
4438 }
4439
4440 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
4441 ne $token->{tag_name}) {
4442 !!!cp ('t167');
4443 !!!parse-error (type => 'not closed',
4444 value => $self->{open_elements}->[-1]->[0]
4445 ->manakai_local_name,
4446 token => $token);
4447 } else {
4448 !!!cp ('t168');
4449 }
4450
4451 splice @{$self->{open_elements}}, $i;
4452
4453 $clear_up_to_marker->();
4454
4455 $self->{insertion_mode} = IN_ROW_IM;
4456
4457 !!!next-token;
4458 next B;
4459 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4460 !!!cp ('t169');
4461 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4462 ## Ignore the token
4463 !!!next-token;
4464 next B;
4465 } else {
4466 !!!cp ('t170');
4467 #
4468 }
4469 } elsif ($token->{tag_name} eq 'caption') {
4470 if ($self->{insertion_mode} == IN_CAPTION_IM) {
4471 ## have a table element in table scope
4472 my $i;
4473 INSCOPE: {
4474 for (reverse 0..$#{$self->{open_elements}}) {
4475 my $node = $self->{open_elements}->[$_];
4476 if ($node->[1] & CAPTION_EL) {
4477 !!!cp ('t171');
4478 $i = $_;
4479 last INSCOPE;
4480 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4481 !!!cp ('t172');
4482 last;
4483 }
4484 }
4485
4486 !!!cp ('t173');
4487 !!!parse-error (type => 'unmatched end tag',
4488 value => $token->{tag_name}, token => $token);
4489 ## Ignore the token
4490 !!!next-token;
4491 next B;
4492 } # INSCOPE
4493
4494 ## generate implied end tags
4495 while ($self->{open_elements}->[-1]->[1]
4496 & END_TAG_OPTIONAL_EL) {
4497 !!!cp ('t174');
4498 pop @{$self->{open_elements}};
4499 }
4500
4501 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4502 !!!cp ('t175');
4503 !!!parse-error (type => 'not closed',
4504 value => $self->{open_elements}->[-1]->[0]
4505 ->manakai_local_name,
4506 token => $token);
4507 } else {
4508 !!!cp ('t176');
4509 }
4510
4511 splice @{$self->{open_elements}}, $i;
4512
4513 $clear_up_to_marker->();
4514
4515 $self->{insertion_mode} = IN_TABLE_IM;
4516
4517 !!!next-token;
4518 next B;
4519 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
4520 !!!cp ('t177');
4521 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4522 ## Ignore the token
4523 !!!next-token;
4524 next B;
4525 } else {
4526 !!!cp ('t178');
4527 #
4528 }
4529 } elsif ({
4530 table => 1, tbody => 1, tfoot => 1,
4531 thead => 1, tr => 1,
4532 }->{$token->{tag_name}} and
4533 $self->{insertion_mode} == IN_CELL_IM) {
4534 ## have an element in table scope
4535 my $i;
4536 my $tn;
4537 INSCOPE: {
4538 for (reverse 0..$#{$self->{open_elements}}) {
4539 my $node = $self->{open_elements}->[$_];
4540 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
4541 !!!cp ('t179');
4542 $i = $_;
4543
4544 ## Close the cell
4545 !!!back-token; # </x>
4546 $token = {type => END_TAG_TOKEN, tag_name => $tn,
4547 line => $token->{line},
4548 column => $token->{column}};
4549 next B;
4550 } elsif ($node->[1] & TABLE_CELL_EL) {
4551 !!!cp ('t180');
4552 $tn = $node->[0]->manakai_local_name;
4553 ## NOTE: There is exactly one |td| or |th| element
4554 ## in scope in the stack of open elements by definition.
4555 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4556 ## ISSUE: Can this be reached?
4557 !!!cp ('t181');
4558 last;
4559 }
4560 }
4561
4562 !!!cp ('t182');
4563 !!!parse-error (type => 'unmatched end tag',
4564 value => $token->{tag_name}, token => $token);
4565 ## Ignore the token
4566 !!!next-token;
4567 next B;
4568 } # INSCOPE
4569 } elsif ($token->{tag_name} eq 'table' and
4570 $self->{insertion_mode} == IN_CAPTION_IM) {
4571 !!!parse-error (type => 'not closed:caption', token => $token);
4572
4573 ## As if </caption>
4574 ## have a table element in table scope
4575 my $i;
4576 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4577 my $node = $self->{open_elements}->[$_];
4578 if ($node->[1] & CAPTION_EL) {
4579 !!!cp ('t184');
4580 $i = $_;
4581 last INSCOPE;
4582 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4583 !!!cp ('t185');
4584 last INSCOPE;
4585 }
4586 } # INSCOPE
4587 unless (defined $i) {
4588 !!!cp ('t186');
4589 !!!parse-error (type => 'unmatched end tag:caption', token => $token);
4590 ## Ignore the token
4591 !!!next-token;
4592 next B;
4593 }
4594
4595 ## generate implied end tags
4596 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
4597 !!!cp ('t187');
4598 pop @{$self->{open_elements}};
4599 }
4600
4601 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4602 !!!cp ('t188');
4603 !!!parse-error (type => 'not closed',
4604 value => $self->{open_elements}->[-1]->[0]
4605 ->manakai_local_name,
4606 token => $token);
4607 } else {
4608 !!!cp ('t189');
4609 }
4610
4611 splice @{$self->{open_elements}}, $i;
4612
4613 $clear_up_to_marker->();
4614
4615 $self->{insertion_mode} = IN_TABLE_IM;
4616
4617 ## reprocess
4618 next B;
4619 } elsif ({
4620 body => 1, col => 1, colgroup => 1, html => 1,
4621 }->{$token->{tag_name}}) {
4622 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
4623 !!!cp ('t190');
4624 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4625 ## Ignore the token
4626 !!!next-token;
4627 next B;
4628 } else {
4629 !!!cp ('t191');
4630 #
4631 }
4632 } elsif ({
4633 tbody => 1, tfoot => 1,
4634 thead => 1, tr => 1,
4635 }->{$token->{tag_name}} and
4636 $self->{insertion_mode} == IN_CAPTION_IM) {
4637 !!!cp ('t192');
4638 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4639 ## Ignore the token
4640 !!!next-token;
4641 next B;
4642 } else {
4643 !!!cp ('t193');
4644 #
4645 }
4646 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4647 for my $entry (@{$self->{open_elements}}) {
4648 unless ($entry->[1] & ALL_END_TAG_OPTIONAL_EL) {
4649 !!!cp ('t75');
4650 !!!parse-error (type => 'in body:#eof', token => $token);
4651 last;
4652 }
4653 }
4654
4655 ## Stop parsing.
4656 last B;
4657 } else {
4658 die "$0: $token->{type}: Unknown token type";
4659 }
4660
4661 $insert = $insert_to_current;
4662 #
4663 } elsif ($self->{insertion_mode} & TABLE_IMS) {
4664 if ($token->{type} == CHARACTER_TOKEN) {
4665 if (not $open_tables->[-1]->[1] and # tainted
4666 $token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4667 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4668
4669 unless (length $token->{data}) {
4670 !!!cp ('t194');
4671 !!!next-token;
4672 next B;
4673 } else {
4674 !!!cp ('t195');
4675 }
4676 }
4677
4678 !!!parse-error (type => 'in table:#character', token => $token);
4679
4680 ## As if in body, but insert into foster parent element
4681 ## ISSUE: Spec says that "whenever a node would be inserted
4682 ## into the current node" while characters might not
4683 ## result in a new Text node.
4684 $reconstruct_active_formatting_elements->($insert_to_foster);
4685
4686 if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
4687 # MUST
4688 my $foster_parent_element;
4689 my $next_sibling;
4690 my $prev_sibling;
4691 OE: for (reverse 0..$#{$self->{open_elements}}) {
4692 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
4693 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4694 if (defined $parent and $parent->node_type == 1) {
4695 !!!cp ('t196');
4696 $foster_parent_element = $parent;
4697 $next_sibling = $self->{open_elements}->[$_]->[0];
4698 $prev_sibling = $next_sibling->previous_sibling;
4699 } else {
4700 !!!cp ('t197');
4701 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
4702 $prev_sibling = $foster_parent_element->last_child;
4703 }
4704 last OE;
4705 }
4706 } # OE
4707 $foster_parent_element = $self->{open_elements}->[0]->[0] and
4708 $prev_sibling = $foster_parent_element->last_child
4709 unless defined $foster_parent_element;
4710 if (defined $prev_sibling and
4711 $prev_sibling->node_type == 3) {
4712 !!!cp ('t198');
4713 $prev_sibling->manakai_append_text ($token->{data});
4714 } else {
4715 !!!cp ('t199');
4716 $foster_parent_element->insert_before
4717 ($self->{document}->create_text_node ($token->{data}),
4718 $next_sibling);
4719 }
4720 $open_tables->[-1]->[1] = 1; # tainted
4721 } else {
4722 !!!cp ('t200');
4723 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4724 }
4725
4726 !!!next-token;
4727 next B;
4728 } elsif ($token->{type} == START_TAG_TOKEN) {
4729 if ({
4730 tr => ($self->{insertion_mode} != IN_ROW_IM),
4731 th => 1, td => 1,
4732 }->{$token->{tag_name}}) {
4733 if ($self->{insertion_mode} == IN_TABLE_IM) {
4734 ## Clear back to table context
4735 while (not ($self->{open_elements}->[-1]->[1]
4736 & TABLE_SCOPING_EL)) {
4737 !!!cp ('t201');
4738 pop @{$self->{open_elements}};
4739 }
4740
4741 !!!insert-element ('tbody',, $token);
4742 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4743 ## reprocess in the "in table body" insertion mode...
4744 }
4745
4746 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
4747 unless ($token->{tag_name} eq 'tr') {
4748 !!!cp ('t202');
4749 !!!parse-error (type => 'missing start tag:tr', token => $token);
4750 }
4751
4752 ## Clear back to table body context
4753 while (not ($self->{open_elements}->[-1]->[1]
4754 & TABLE_ROWS_SCOPING_EL)) {
4755 !!!cp ('t203');
4756 ## ISSUE: Can this case be reached?
4757 pop @{$self->{open_elements}};
4758 }
4759
4760 $self->{insertion_mode} = IN_ROW_IM;
4761 if ($token->{tag_name} eq 'tr') {
4762 !!!cp ('t204');
4763 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4764 !!!nack ('t204');
4765 !!!next-token;
4766 next B;
4767 } else {
4768 !!!cp ('t205');
4769 !!!insert-element ('tr',, $token);
4770 ## reprocess in the "in row" insertion mode
4771 }
4772 } else {
4773 !!!cp ('t206');
4774 }
4775
4776 ## Clear back to table row context
4777 while (not ($self->{open_elements}->[-1]->[1]
4778 & TABLE_ROW_SCOPING_EL)) {
4779 !!!cp ('t207');
4780 pop @{$self->{open_elements}};
4781 }
4782
4783 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4784 $self->{insertion_mode} = IN_CELL_IM;
4785
4786 push @$active_formatting_elements, ['#marker', ''];
4787
4788 !!!nack ('t207.1');
4789 !!!next-token;
4790 next B;
4791 } elsif ({
4792 caption => 1, col => 1, colgroup => 1,
4793 tbody => 1, tfoot => 1, thead => 1,
4794 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
4795 }->{$token->{tag_name}}) {
4796 if ($self->{insertion_mode} == IN_ROW_IM) {
4797 ## As if </tr>
4798 ## have an element in table scope
4799 my $i;
4800 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4801 my $node = $self->{open_elements}->[$_];
4802 if ($node->[1] & TABLE_ROW_EL) {
4803 !!!cp ('t208');
4804 $i = $_;
4805 last INSCOPE;
4806 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4807 !!!cp ('t209');
4808 last INSCOPE;
4809 }
4810 } # INSCOPE
4811 unless (defined $i) {
4812 !!!cp ('t210');
4813 ## TODO: This type is wrong.
4814 !!!parse-error (type => 'unmacthed end tag:'.$token->{tag_name}, token => $token);
4815 ## Ignore the token
4816 !!!nack ('t210.1');
4817 !!!next-token;
4818 next B;
4819 }
4820
4821 ## Clear back to table row context
4822 while (not ($self->{open_elements}->[-1]->[1]
4823 & TABLE_ROW_SCOPING_EL)) {
4824 !!!cp ('t211');
4825 ## ISSUE: Can this case be reached?
4826 pop @{$self->{open_elements}};
4827 }
4828
4829 pop @{$self->{open_elements}}; # tr
4830 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4831 if ($token->{tag_name} eq 'tr') {
4832 !!!cp ('t212');
4833 ## reprocess
4834 !!!ack-later;
4835 next B;
4836 } else {
4837 !!!cp ('t213');
4838 ## reprocess in the "in table body" insertion mode...
4839 }
4840 }
4841
4842 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
4843 ## have an element in table scope
4844 my $i;
4845 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4846 my $node = $self->{open_elements}->[$_];
4847 if ($node->[1] & TABLE_ROW_GROUP_EL) {
4848 !!!cp ('t214');
4849 $i = $_;
4850 last INSCOPE;
4851 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4852 !!!cp ('t215');
4853 last INSCOPE;
4854 }
4855 } # INSCOPE
4856 unless (defined $i) {
4857 !!!cp ('t216');
4858 ## TODO: This error type is wrong.
4859 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4860 ## Ignore the token
4861 !!!nack ('t216.1');
4862 !!!next-token;
4863 next B;
4864 }
4865
4866 ## Clear back to table body context
4867 while (not ($self->{open_elements}->[-1]->[1]
4868 & TABLE_ROWS_SCOPING_EL)) {
4869 !!!cp ('t217');
4870 ## ISSUE: Can this state be reached?
4871 pop @{$self->{open_elements}};
4872 }
4873
4874 ## As if <{current node}>
4875 ## have an element in table scope
4876 ## true by definition
4877
4878 ## Clear back to table body context
4879 ## nop by definition
4880
4881 pop @{$self->{open_elements}};
4882 $self->{insertion_mode} = IN_TABLE_IM;
4883 ## reprocess in "in table" insertion mode...
4884 } else {
4885 !!!cp ('t218');
4886 }
4887
4888 if ($token->{tag_name} eq 'col') {
4889 ## Clear back to table context
4890 while (not ($self->{open_elements}->[-1]->[1]
4891 & TABLE_SCOPING_EL)) {
4892 !!!cp ('t219');
4893 ## ISSUE: Can this state be reached?
4894 pop @{$self->{open_elements}};
4895 }
4896
4897 !!!insert-element ('colgroup',, $token);
4898 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
4899 ## reprocess
4900 !!!ack-later;
4901 next B;
4902 } elsif ({
4903 caption => 1,
4904 colgroup => 1,
4905 tbody => 1, tfoot => 1, thead => 1,
4906 }->{$token->{tag_name}}) {
4907 ## Clear back to table context
4908 while (not ($self->{open_elements}->[-1]->[1]
4909 & TABLE_SCOPING_EL)) {
4910 !!!cp ('t220');
4911 ## ISSUE: Can this state be reached?
4912 pop @{$self->{open_elements}};
4913 }
4914
4915 push @$active_formatting_elements, ['#marker', '']
4916 if $token->{tag_name} eq 'caption';
4917
4918 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4919 $self->{insertion_mode} = {
4920 caption => IN_CAPTION_IM,
4921 colgroup => IN_COLUMN_GROUP_IM,
4922 tbody => IN_TABLE_BODY_IM,
4923 tfoot => IN_TABLE_BODY_IM,
4924 thead => IN_TABLE_BODY_IM,
4925 }->{$token->{tag_name}};
4926 !!!next-token;
4927 !!!nack ('t220.1');
4928 next B;
4929 } else {
4930 die "$0: in table: <>: $token->{tag_name}";
4931 }
4932 } elsif ($token->{tag_name} eq 'table') {
4933 !!!parse-error (type => 'not closed',
4934 value => $self->{open_elements}->[-1]->[0]
4935 ->manakai_local_name,
4936 token => $token);
4937
4938 ## As if </table>
4939 ## have a table element in table scope
4940 my $i;
4941 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4942 my $node = $self->{open_elements}->[$_];
4943 if ($node->[1] & TABLE_EL) {
4944 !!!cp ('t221');
4945 $i = $_;
4946 last INSCOPE;
4947 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4948 !!!cp ('t222');
4949 last INSCOPE;
4950 }
4951 } # INSCOPE
4952 unless (defined $i) {
4953 !!!cp ('t223');
4954 ## TODO: The following is wrong, maybe.
4955 !!!parse-error (type => 'unmatched end tag:table', token => $token);
4956 ## Ignore tokens </table><table>
4957 !!!nack ('t223.1');
4958 !!!next-token;
4959 next B;
4960 }
4961
4962 ## TODO: The following steps have been removed from the latest spec.
4963 ## generate implied end tags
4964 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
4965 !!!cp ('t224');
4966 pop @{$self->{open_elements}};
4967 }
4968
4969 unless ($self->{open_elements}->[-1]->[1] & TABLE_EL) {
4970 !!!cp ('t225');
4971 ## NOTE: |<table><tr><table>|
4972 !!!parse-error (type => 'not closed',
4973 value => $self->{open_elements}->[-1]->[0]
4974 ->manakai_local_name,
4975 token => $token);
4976 } else {
4977 !!!cp ('t226');
4978 }
4979
4980 splice @{$self->{open_elements}}, $i;
4981 pop @{$open_tables};
4982
4983 $self->_reset_insertion_mode;
4984
4985 ## reprocess
4986 !!!ack-later;
4987 next B;
4988 } elsif ($token->{tag_name} eq 'style') {
4989 if (not $open_tables->[-1]->[1]) { # tainted
4990 !!!cp ('t227.8');
4991 ## NOTE: This is a "as if in head" code clone.
4992 $parse_rcdata->(CDATA_CONTENT_MODEL);
4993 next B;
4994 } else {
4995 !!!cp ('t227.7');
4996 #
4997 }
4998 } elsif ($token->{tag_name} eq 'script') {
4999 if (not $open_tables->[-1]->[1]) { # tainted
5000 !!!cp ('t227.6');
5001 ## NOTE: This is a "as if in head" code clone.
5002 $script_start_tag->();
5003 next B;
5004 } else {
5005 !!!cp ('t227.5');
5006 #
5007 }
5008 } elsif ($token->{tag_name} eq 'input') {
5009 if (not $open_tables->[-1]->[1]) { # tainted
5010 if ($token->{attributes}->{type}) { ## TODO: case
5011 my $type = lc $token->{attributes}->{type}->{value};
5012 if ($type eq 'hidden') {
5013 !!!cp ('t227.3');
5014 !!!parse-error (type => 'in table:'.$token->{tag_name}, token => $token);
5015
5016 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5017
5018 ## TODO: form element pointer
5019
5020 pop @{$self->{open_elements}};
5021
5022 !!!next-token;
5023 !!!ack ('t227.2.1');
5024 next B;
5025 } else {
5026 !!!cp ('t227.2');
5027 #
5028 }
5029 } else {
5030 !!!cp ('t227.1');
5031 #
5032 }
5033 } else {
5034 !!!cp ('t227.4');
5035 #
5036 }
5037 } else {
5038 !!!cp ('t227');
5039 #
5040 }
5041
5042 !!!parse-error (type => 'in table:'.$token->{tag_name}, token => $token);
5043
5044 $insert = $insert_to_foster;
5045 #
5046 } elsif ($token->{type} == END_TAG_TOKEN) {
5047 if ($token->{tag_name} eq 'tr' and
5048 $self->{insertion_mode} == IN_ROW_IM) {
5049 ## have an element in table scope
5050 my $i;
5051 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5052 my $node = $self->{open_elements}->[$_];
5053 if ($node->[1] & TABLE_ROW_EL) {
5054 !!!cp ('t228');
5055 $i = $_;
5056 last INSCOPE;
5057 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5058 !!!cp ('t229');
5059 last INSCOPE;
5060 }
5061 } # INSCOPE
5062 unless (defined $i) {
5063 !!!cp ('t230');
5064 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5065 ## Ignore the token
5066 !!!nack ('t230.1');
5067 !!!next-token;
5068 next B;
5069 } else {
5070 !!!cp ('t232');
5071 }
5072
5073 ## Clear back to table row context
5074 while (not ($self->{open_elements}->[-1]->[1]
5075 & TABLE_ROW_SCOPING_EL)) {
5076 !!!cp ('t231');
5077 ## ISSUE: Can this state be reached?
5078 pop @{$self->{open_elements}};
5079 }
5080
5081 pop @{$self->{open_elements}}; # tr
5082 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5083 !!!next-token;
5084 !!!nack ('t231.1');
5085 next B;
5086 } elsif ($token->{tag_name} eq 'table') {
5087 if ($self->{insertion_mode} == IN_ROW_IM) {
5088 ## As if </tr>
5089 ## have an element in table scope
5090 my $i;
5091 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5092 my $node = $self->{open_elements}->[$_];
5093 if ($node->[1] & TABLE_ROW_EL) {
5094 !!!cp ('t233');
5095 $i = $_;
5096 last INSCOPE;
5097 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5098 !!!cp ('t234');
5099 last INSCOPE;
5100 }
5101 } # INSCOPE
5102 unless (defined $i) {
5103 !!!cp ('t235');
5104 ## TODO: The following is wrong.
5105 !!!parse-error (type => 'unmatched end tag:'.$token->{type}, token => $token);
5106 ## Ignore the token
5107 !!!nack ('t236.1');
5108 !!!next-token;
5109 next B;
5110 }
5111
5112 ## Clear back to table row context
5113 while (not ($self->{open_elements}->[-1]->[1]
5114 & TABLE_ROW_SCOPING_EL)) {
5115 !!!cp ('t236');
5116 ## ISSUE: Can this state be reached?
5117 pop @{$self->{open_elements}};
5118 }
5119
5120 pop @{$self->{open_elements}}; # tr
5121 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5122 ## reprocess in the "in table body" insertion mode...
5123 }
5124
5125 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5126 ## have an element in table scope
5127 my $i;
5128 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5129 my $node = $self->{open_elements}->[$_];
5130 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5131 !!!cp ('t237');
5132 $i = $_;
5133 last INSCOPE;
5134 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5135 !!!cp ('t238');
5136 last INSCOPE;
5137 }
5138 } # INSCOPE
5139 unless (defined $i) {
5140 !!!cp ('t239');
5141 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5142 ## Ignore the token
5143 !!!nack ('t239.1');
5144 !!!next-token;
5145 next B;
5146 }
5147
5148 ## Clear back to table body context
5149 while (not ($self->{open_elements}->[-1]->[1]
5150 & TABLE_ROWS_SCOPING_EL)) {
5151 !!!cp ('t240');
5152 pop @{$self->{open_elements}};
5153 }
5154
5155 ## As if <{current node}>
5156 ## have an element in table scope
5157 ## true by definition
5158
5159 ## Clear back to table body context
5160 ## nop by definition
5161
5162 pop @{$self->{open_elements}};
5163 $self->{insertion_mode} = IN_TABLE_IM;
5164 ## reprocess in the "in table" insertion mode...
5165 }
5166
5167 ## NOTE: </table> in the "in table" insertion mode.
5168 ## When you edit the code fragment below, please ensure that
5169 ## the code for <table> in the "in table" insertion mode
5170 ## is synced with it.
5171
5172 ## have a table element in table scope
5173 my $i;
5174 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5175 my $node = $self->{open_elements}->[$_];
5176 if ($node->[1] & TABLE_EL) {
5177 !!!cp ('t241');
5178 $i = $_;
5179 last INSCOPE;
5180 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5181 !!!cp ('t242');
5182 last INSCOPE;
5183 }
5184 } # INSCOPE
5185 unless (defined $i) {
5186 !!!cp ('t243');
5187 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5188 ## Ignore the token
5189 !!!nack ('t243.1');
5190 !!!next-token;
5191 next B;
5192 }
5193
5194 splice @{$self->{open_elements}}, $i;
5195 pop @{$open_tables};
5196
5197 $self->_reset_insertion_mode;
5198
5199 !!!next-token;
5200 next B;
5201 } elsif ({
5202 tbody => 1, tfoot => 1, thead => 1,
5203 }->{$token->{tag_name}} and
5204 $self->{insertion_mode} & ROW_IMS) {
5205 if ($self->{insertion_mode} == IN_ROW_IM) {
5206 ## have an element in table scope
5207 my $i;
5208 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5209 my $node = $self->{open_elements}->[$_];
5210 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5211 !!!cp ('t247');
5212 $i = $_;
5213 last INSCOPE;
5214 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5215 !!!cp ('t248');
5216 last INSCOPE;
5217 }
5218 } # INSCOPE
5219 unless (defined $i) {
5220 !!!cp ('t249');
5221 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5222 ## Ignore the token
5223 !!!nack ('t249.1');
5224 !!!next-token;
5225 next B;
5226 }
5227
5228 ## As if </tr>
5229 ## have a |tr| element in table scope (reuse $i from the check above)
5230 undef $i; # reset rather than redeclare: a second |my $i| here would mask the earlier one in the same scope
5231 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5232 my $node = $self->{open_elements}->[$_];
5233 if ($node->[1] & TABLE_ROW_EL) {
5234 !!!cp ('t250');
5235 $i = $_;
5236 last INSCOPE;
5237 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5238 !!!cp ('t251');
5239 last INSCOPE;
5240 }
5241 } # INSCOPE
5242 unless (defined $i) {
5243 !!!cp ('t252');
5244 !!!parse-error (type => 'unmatched end tag:tr', token => $token);
5245 ## Ignore the token
5246 !!!nack ('t252.1');
5247 !!!next-token;
5248 next B;
5249 }
5250
5251 ## Clear back to table row context
5252 while (not ($self->{open_elements}->[-1]->[1]
5253 & TABLE_ROW_SCOPING_EL)) {
5254 !!!cp ('t253');
5255 ## ISSUE: Can this case be reached?
5256 pop @{$self->{open_elements}};
5257 }
5258
5259 pop @{$self->{open_elements}}; # tr
5260 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5261 ## reprocess in the "in table body" insertion mode...
5262 }
5263
5264 ## have an element in table scope
5265 my $i;
5266 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5267 my $node = $self->{open_elements}->[$_];
5268 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5269 !!!cp ('t254');
5270 $i = $_;
5271 last INSCOPE;
5272 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5273 !!!cp ('t255');
5274 last INSCOPE;
5275 }
5276 } # INSCOPE
5277 unless (defined $i) {
5278 !!!cp ('t256');
5279 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5280 ## Ignore the token
5281 !!!nack ('t256.1');
5282 !!!next-token;
5283 next B;
5284 }
5285
5286 ## Clear back to table body context
5287 while (not ($self->{open_elements}->[-1]->[1]
5288 & TABLE_ROWS_SCOPING_EL)) {
5289 !!!cp ('t257');
5290 ## ISSUE: Can this case be reached?
5291 pop @{$self->{open_elements}};
5292 }
5293
5294 pop @{$self->{open_elements}};
5295 $self->{insertion_mode} = IN_TABLE_IM;
5296 !!!nack ('t257.1');
5297 !!!next-token;
5298 next B;
5299 } elsif ({
5300 body => 1, caption => 1, col => 1, colgroup => 1,
5301 html => 1, td => 1, th => 1,
5302 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5303 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
5304 }->{$token->{tag_name}}) {
5305 !!!cp ('t258');
5306 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5307 ## Ignore the token
5308 !!!nack ('t258.1');
5309 !!!next-token;
5310 next B;
5311 } else {
5312 !!!cp ('t259');
5313 !!!parse-error (type => 'in table:/'.$token->{tag_name}, token => $token);
5314
5315 $insert = $insert_to_foster;
5316 #
5317 }
5318 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5319 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
5320 @{$self->{open_elements}} == 1) { # redundant, maybe
5321 !!!parse-error (type => 'in body:#eof', token => $token);
5322 !!!cp ('t259.1');
5323 #
5324 } else {
5325 !!!cp ('t259.2');
5326 #
5327 }
5328
5329 ## Stop parsing
5330 last B;
5331 } else {
5332 die "$0: $token->{type}: Unknown token type";
5333 }
5334 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
5335 if ($token->{type} == CHARACTER_TOKEN) {
5336 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5337 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5338 unless (length $token->{data}) {
5339 !!!cp ('t260');
5340 !!!next-token;
5341 next B;
5342 }
5343 }
5344
5345 !!!cp ('t261');
5346 #
5347 } elsif ($token->{type} == START_TAG_TOKEN) {
5348 if ($token->{tag_name} eq 'col') {
5349 !!!cp ('t262');
5350 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5351 pop @{$self->{open_elements}};
5352 !!!ack ('t262.1');
5353 !!!next-token;
5354 next B;
5355 } else {
5356 !!!cp ('t263');
5357 #
5358 }
5359 } elsif ($token->{type} == END_TAG_TOKEN) {
5360 if ($token->{tag_name} eq 'colgroup') {
5361 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
5362 !!!cp ('t264');
5363 !!!parse-error (type => 'unmatched end tag:colgroup', token => $token);
5364 ## Ignore the token
5365 !!!next-token;
5366 next B;
5367 } else {
5368 !!!cp ('t265');
5369 pop @{$self->{open_elements}}; # colgroup
5370 $self->{insertion_mode} = IN_TABLE_IM;
5371 !!!next-token;
5372 next B;
5373 }
5374 } elsif ($token->{tag_name} eq 'col') {
5375 !!!cp ('t266');
5376 !!!parse-error (type => 'unmatched end tag:col', token => $token);
5377 ## Ignore the token
5378 !!!next-token;
5379 next B;
5380 } else {
5381 !!!cp ('t267');
5382 #
5383 }
5384 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5385 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
5386 @{$self->{open_elements}} == 1) { # redundant, maybe
5387 !!!cp ('t270.2');
5388 ## Stop parsing.
5389 last B;
5390 } else {
5391 ## NOTE: As if </colgroup>.
5392 !!!cp ('t270.1');
5393 pop @{$self->{open_elements}}; # colgroup
5394 $self->{insertion_mode} = IN_TABLE_IM;
5395 ## Reprocess.
5396 next B;
5397 }
5398 } else {
5399 die "$0: $token->{type}: Unknown token type";
5400 }
5401
5402 ## As if </colgroup>
5403 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
5404 !!!cp ('t269');
5405 ## TODO: Wrong error type?
5406 !!!parse-error (type => 'unmatched end tag:colgroup', token => $token);
5407 ## Ignore the token
5408 !!!nack ('t269.1');
5409 !!!next-token;
5410 next B;
5411 } else {
5412 !!!cp ('t270');
5413 pop @{$self->{open_elements}}; # colgroup
5414 $self->{insertion_mode} = IN_TABLE_IM;
5415 !!!ack-later;
5416 ## reprocess
5417 next B;
5418 }
5419 } elsif ($self->{insertion_mode} & SELECT_IMS) {
5420 if ($token->{type} == CHARACTER_TOKEN) {
5421 !!!cp ('t271');
5422 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5423 !!!next-token;
5424 next B;
5425 } elsif ($token->{type} == START_TAG_TOKEN) {
5426 if ($token->{tag_name} eq 'option') {
5427 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5428 !!!cp ('t272');
5429 ## As if </option>
5430 pop @{$self->{open_elements}};
5431 } else {
5432 !!!cp ('t273');
5433 }
5434
5435 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5436 !!!nack ('t273.1');
5437 !!!next-token;
5438 next B;
5439 } elsif ($token->{tag_name} eq 'optgroup') {
5440 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5441 !!!cp ('t274');
5442 ## As if </option>
5443 pop @{$self->{open_elements}};
5444 } else {
5445 !!!cp ('t275');
5446 }
5447
5448 if ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
5449 !!!cp ('t276');
5450 ## As if </optgroup>
5451 pop @{$self->{open_elements}};
5452 } else {
5453 !!!cp ('t277');
5454 }
5455
5456 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5457 !!!nack ('t277.1');
5458 !!!next-token;
5459 next B;
5460 } elsif ($token->{tag_name} eq 'select' or
5461 $token->{tag_name} eq 'input' or
5462 ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5463 {
5464 caption => 1, table => 1,
5465 tbody => 1, tfoot => 1, thead => 1,
5466 tr => 1, td => 1, th => 1,
5467 }->{$token->{tag_name}})) {
5468 ## TODO: The type below is not good - <select> is replaced by </select>
5469 !!!parse-error (type => 'not closed:select', token => $token);
5470 ## NOTE: As if the token were </select> (<select> case) or
5471 ## as if there were </select> (otherwise).
5472 ## have an element in table scope
5473 my $i;
5474 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5475 my $node = $self->{open_elements}->[$_];
5476 if ($node->[1] & SELECT_EL) {
5477 !!!cp ('t278');
5478 $i = $_;
5479 last INSCOPE;
5480 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5481 !!!cp ('t279');
5482 last INSCOPE;
5483 }
5484 } # INSCOPE
5485 unless (defined $i) {
5486 !!!cp ('t280');
5487 !!!parse-error (type => 'unmatched end tag:select', token => $token);
5488 ## Ignore the token
5489 !!!nack ('t280.1');
5490 !!!next-token;
5491 next B;
5492 }
5493
5494 !!!cp ('t281');
5495 splice @{$self->{open_elements}}, $i;
5496
5497 $self->_reset_insertion_mode;
5498
5499 if ($token->{tag_name} eq 'select') {
5500 !!!nack ('t281.2');
5501 !!!next-token;
5502 next B;
5503 } else {
5504 !!!cp ('t281.1');
5505 !!!ack-later;
5506 ## Reprocess the token.
5507 next B;
5508 }
5509 } else {
5510 !!!cp ('t282');
5511 !!!parse-error (type => 'in select:'.$token->{tag_name}, token => $token);
5512 ## Ignore the token
5513 !!!nack ('t282.1');
5514 !!!next-token;
5515 next B;
5516 }
5517 } elsif ($token->{type} == END_TAG_TOKEN) {
5518 if ($token->{tag_name} eq 'optgroup') {
5519 if ($self->{open_elements}->[-1]->[1] & OPTION_EL and
5520 $self->{open_elements}->[-2]->[1] & OPTGROUP_EL) {
5521 !!!cp ('t283');
5522 ## As if </option>
5523 splice @{$self->{open_elements}}, -2;
5524 } elsif ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
5525 !!!cp ('t284');
5526 pop @{$self->{open_elements}};
5527 } else {
5528 !!!cp ('t285');
5529 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5530 ## Ignore the token
5531 }
5532 !!!nack ('t285.1');
5533 !!!next-token;
5534 next B;
5535 } elsif ($token->{tag_name} eq 'option') {
5536 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5537 !!!cp ('t286');
5538 pop @{$self->{open_elements}};
5539 } else {
5540 !!!cp ('t287');
5541 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5542 ## Ignore the token
5543 }
5544 !!!nack ('t287.1');
5545 !!!next-token;
5546 next B;
5547 } elsif ($token->{tag_name} eq 'select') {
5548 ## have an element in table scope
5549 my $i;
5550 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5551 my $node = $self->{open_elements}->[$_];
5552 if ($node->[1] & SELECT_EL) {
5553 !!!cp ('t288');
5554 $i = $_;
5555 last INSCOPE;
5556 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5557 !!!cp ('t289');
5558 last INSCOPE;
5559 }
5560 } # INSCOPE
5561 unless (defined $i) {
5562 !!!cp ('t290');
5563 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5564 ## Ignore the token
5565 !!!nack ('t290.1');
5566 !!!next-token;
5567 next B;
5568 }
5569
5570 !!!cp ('t291');
5571 splice @{$self->{open_elements}}, $i;
5572
5573 $self->_reset_insertion_mode;
5574
5575 !!!nack ('t291.1');
5576 !!!next-token;
5577 next B;
5578 } elsif ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5579 {
5580 caption => 1, table => 1, tbody => 1,
5581 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
5582 }->{$token->{tag_name}}) {
5583 ## TODO: The following is wrong?
5584 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5585
5586 ## have an element in table scope
5587 my $i;
5588 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5589 my $node = $self->{open_elements}->[$_];
5590 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5591 !!!cp ('t292');
5592 $i = $_;
5593 last INSCOPE;
5594 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5595 !!!cp ('t293');
5596 last INSCOPE;
5597 }
5598 } # INSCOPE
5599 unless (defined $i) {
5600 !!!cp ('t294');
5601 ## Ignore the token
5602 !!!nack ('t294.1');
5603 !!!next-token;
5604 next B;
5605 }
5606
5607 ## As if </select>
5608 ## have an element in table scope
5609 undef $i;
5610 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5611 my $node = $self->{open_elements}->[$_];
5612 if ($node->[1] & SELECT_EL) {
5613 !!!cp ('t295');
5614 $i = $_;
5615 last INSCOPE;
5616 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5617 ## ISSUE: Can this state be reached?
5618 !!!cp ('t296');
5619 last INSCOPE;
5620 }
5621 } # INSCOPE
5622 unless (defined $i) {
5623 !!!cp ('t297');
5624 ## TODO: The following error type is correct?
5625 !!!parse-error (type => 'unmatched end tag:select', token => $token);
5626 ## Ignore the </select> token
5627 !!!nack ('t297.1');
5628 !!!next-token; ## TODO: ok?
5629 next B;
5630 }
5631
5632 !!!cp ('t298');
5633 splice @{$self->{open_elements}}, $i;
5634
5635 $self->_reset_insertion_mode;
5636
5637 !!!ack-later;
5638 ## reprocess
5639 next B;
5640 } else {
5641 !!!cp ('t299');
5642 !!!parse-error (type => 'in select:/'.$token->{tag_name}, token => $token);
5643 ## Ignore the token
5644 !!!nack ('t299.3');
5645 !!!next-token;
5646 next B;
5647 }
5648 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5649 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
5650 @{$self->{open_elements}} == 1) { # redundant, maybe
5651 !!!cp ('t299.1');
5652 !!!parse-error (type => 'in body:#eof', token => $token);
5653 } else {
5654 !!!cp ('t299.2');
5655 }
5656
5657 ## Stop parsing.
5658 last B;
5659 } else {
5660 die "$0: $token->{type}: Unknown token type";
5661 }
5662 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
5663 if ($token->{type} == CHARACTER_TOKEN) {
5664 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5665 my $data = $1; # leading white space stripped from the token; saved so $1 is not relied on after the call below
5666 ## As if in body
5667 $reconstruct_active_formatting_elements->($insert_to_current);
5668
5669 $self->{open_elements}->[-1]->[0]->manakai_append_text ($data);
5670
5671 unless (length $token->{data}) {
5672 !!!cp ('t300');
5673 !!!next-token;
5674 next B;
5675 }
5676 }
5677
5678 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5679 !!!cp ('t301');
5680 !!!parse-error (type => 'after html:#character', token => $token);
5681
5682 ## Reprocess in the "after body" insertion mode.
5683 } else {
5684 !!!cp ('t302');
5685 }
5686
5687 ## "after body" insertion mode
5688 !!!parse-error (type => 'after body:#character', token => $token);
5689
5690 $self->{insertion_mode} = IN_BODY_IM;
5691 ## reprocess
5692 next B;
5693 } elsif ($token->{type} == START_TAG_TOKEN) {
5694 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5695 !!!cp ('t303');
5696 !!!parse-error (type => 'after html:'.$token->{tag_name}, token => $token);
5697
5698 ## Reprocess in the "after body" insertion mode.
5699 } else {
5700 !!!cp ('t304');
5701 }
5702
5703 ## "after body" insertion mode
5704 !!!parse-error (type => 'after body:'.$token->{tag_name}, token => $token);
5705
5706 $self->{insertion_mode} = IN_BODY_IM;
5707 !!!ack-later;
5708 ## reprocess
5709 next B;
5710 } elsif ($token->{type} == END_TAG_TOKEN) {
5711 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5712 !!!cp ('t305');
5713 !!!parse-error (type => 'after html:/'.$token->{tag_name}, token => $token);
5714
5715 $self->{insertion_mode} = AFTER_BODY_IM;
5716 ## Reprocess in the "after body" insertion mode.
5717 } else {
5718 !!!cp ('t306');
5719 }
5720
5721 ## "after body" insertion mode
5722 if ($token->{tag_name} eq 'html') {
5723 if (defined $self->{inner_html_node}) {
5724 !!!cp ('t307');
5725 !!!parse-error (type => 'unmatched end tag:html', token => $token);
5726 ## Ignore the token
5727 !!!next-token;
5728 next B;
5729 } else {
5730 !!!cp ('t308');
5731 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
5732 !!!next-token;
5733 next B;
5734 }
5735 } else {
5736 !!!cp ('t309');
5737 !!!parse-error (type => 'after body:/'.$token->{tag_name}, token => $token);
5738
5739 $self->{insertion_mode} = IN_BODY_IM;
5740 ## reprocess
5741 next B;
5742 }
5743 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5744 !!!cp ('t309.2');
5745 ## Stop parsing
5746 last B;
5747 } else {
5748 die "$0: $token->{type}: Unknown token type";
5749 }
5750 } elsif ($self->{insertion_mode} & FRAME_IMS) {
5751 if ($token->{type} == CHARACTER_TOKEN) {
5752 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5753 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5754
5755 unless (length $token->{data}) {
5756 !!!cp ('t310');
5757 !!!next-token;
5758 next B;
5759 }
5760 }
5761
5762 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
5763 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
5764 !!!cp ('t311');
5765 !!!parse-error (type => 'in frameset:#character', token => $token);
5766 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
5767 !!!cp ('t312');
5768 !!!parse-error (type => 'after frameset:#character', token => $token);
5769 } else { # "after html frameset"
5770 !!!cp ('t313');
5771 !!!parse-error (type => 'after html:#character', token => $token);
5772
5773 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5774 ## Reprocess in the "after frameset" insertion mode.
5775 !!!parse-error (type => 'after frameset:#character', token => $token);
5776 }
5777
5778 ## Ignore the token.
5779 if (length $token->{data}) {
5780 !!!cp ('t314');
5781 ## reprocess the rest of characters
5782 } else {
5783 !!!cp ('t315');
5784 !!!next-token;
5785 }
5786 next B;
5787 }
5788
5789 die qq[$0: Character "$token->{data}"];
5790 } elsif ($token->{type} == START_TAG_TOKEN) {
5791 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
5792 !!!cp ('t316');
5793 !!!parse-error (type => 'after html:'.$token->{tag_name}, token => $token);
5794
5795 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5796 ## Process in the "after frameset" insertion mode.
5797 } else {
5798 !!!cp ('t317');
5799 }
5800
5801 if ($token->{tag_name} eq 'frameset' and
5802 $self->{insertion_mode} == IN_FRAMESET_IM) {
5803 !!!cp ('t318');
5804 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5805 !!!nack ('t318.1');
5806 !!!next-token;
5807 next B;
5808 } elsif ($token->{tag_name} eq 'frame' and
5809 $self->{insertion_mode} == IN_FRAMESET_IM) {
5810 !!!cp ('t319');
5811 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5812 pop @{$self->{open_elements}};
5813 !!!ack ('t319.1');
5814 !!!next-token;
5815 next B;
5816 } elsif ($token->{tag_name} eq 'noframes') {
5817 !!!cp ('t320');
5818 ## NOTE: As if in body.
5819 $parse_rcdata->(CDATA_CONTENT_MODEL);
5820 next B;
5821 } else {
5822 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
5823 !!!cp ('t321');
5824 !!!parse-error (type => 'in frameset:'.$token->{tag_name}, token => $token);
5825 } else {
5826 !!!cp ('t322');
5827 !!!parse-error (type => 'after frameset:'.$token->{tag_name}, token => $token);
5828 }
5829 ## Ignore the token
5830 !!!nack ('t322.1');
5831 !!!next-token;
5832 next B;
5833 }
5834 } elsif ($token->{type} == END_TAG_TOKEN) {
5835 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
5836 !!!cp ('t323');
5837 !!!parse-error (type => 'after html:/'.$token->{tag_name}, token => $token);
5838
5839 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5840 ## Process in the "after frameset" insertion mode.
5841 } else {
5842 !!!cp ('t324');
5843 }
5844
5845 if ($token->{tag_name} eq 'frameset' and
5846 $self->{insertion_mode} == IN_FRAMESET_IM) {
5847 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
5848 @{$self->{open_elements}} == 1) {
5849 !!!cp ('t325');
5850 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5851 ## Ignore the token
5852 !!!next-token;
5853 } else {
5854 !!!cp ('t326');
5855 pop @{$self->{open_elements}};
5856 !!!next-token;
5857 }
5858
5859 if (not defined $self->{inner_html_node} and
5860 not ($self->{open_elements}->[-1]->[1] & FRAMESET_EL)) {
5861 !!!cp ('t327');
5862 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5863 } else {
5864 !!!cp ('t328');
5865 }
5866 next B;
5867 } elsif ($token->{tag_name} eq 'html' and
5868 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
5869 !!!cp ('t329');
5870 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
5871 !!!next-token;
5872 next B;
5873 } else {
5874 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
5875 !!!cp ('t330');
5876 !!!parse-error (type => 'in frameset:/'.$token->{tag_name}, token => $token);
5877 } else {
5878 !!!cp ('t331');
5879 !!!parse-error (type => 'after frameset:/'.$token->{tag_name}, token => $token);
5880 }
5881 ## Ignore the token
5882 !!!next-token;
5883 next B;
5884 }
5885 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5886 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
5887 @{$self->{open_elements}} == 1) { # redundant, maybe
5888 !!!cp ('t331.1');
5889 !!!parse-error (type => 'in body:#eof', token => $token);
5890 } else {
5891 !!!cp ('t331.2');
5892 }
5893
5894 ## Stop parsing
5895 last B;
5896 } else {
5897 die "$0: $token->{type}: Unknown token type";
5898 }
5899
5900 ## ISSUE: An issue in spec here
5901 } else {
5902 die "$0: $self->{insertion_mode}: Unknown insertion mode";
5903 }
5904
5905 ## "in body" insertion mode
5906 if ($token->{type} == START_TAG_TOKEN) {
5907 if ($token->{tag_name} eq 'script') {
5908 !!!cp ('t332');
5909 ## NOTE: This is an "as if in head" code clone
5910 $script_start_tag->();
5911 next B;
5912 } elsif ($token->{tag_name} eq 'style') {
5913 !!!cp ('t333');
5914 ## NOTE: This is an "as if in head" code clone
5915 $parse_rcdata->(CDATA_CONTENT_MODEL);
5916 next B;
5917 } elsif ({
5918 base => 1, link => 1,
5919 }->{$token->{tag_name}}) {
5920 !!!cp ('t334');
5921 ## NOTE: This is an "as if in head" code clone, only "-t" differs
5922 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
5923 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
5924 !!!ack ('t334.1');
5925 !!!next-token;
5926 next B;
5927 } elsif ($token->{tag_name} eq 'meta') {
5928 ## NOTE: This is an "as if in head" code clone, only "-t" differs
5929 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
5930 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
5931
5932 unless ($self->{confident}) {
5933 if ($token->{attributes}->{charset}) { ## TODO: And if supported
5934 !!!cp ('t335');
5935 $self->{change_encoding}
5936 ->($self, $token->{attributes}->{charset}->{value}, $token);
5937
5938 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
5939 ->set_user_data (manakai_has_reference =>
5940 $token->{attributes}->{charset}
5941 ->{has_reference});
5942 } elsif ($token->{attributes}->{content}) {
5943 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
5944 if ($token->{attributes}->{content}->{value}
5945 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
5946 [\x09-\x0D\x20]*=
5947 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
5948 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
5949 !!!cp ('t336');
5950 $self->{change_encoding}
5951 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
5952 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
5953 ->set_user_data (manakai_has_reference =>
5954 $token->{attributes}->{content}
5955 ->{has_reference});
5956 }
5957 }
5958 } else {
5959 if ($token->{attributes}->{charset}) {
5960 !!!cp ('t337');
5961 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
5962 ->set_user_data (manakai_has_reference =>
5963 $token->{attributes}->{charset}
5964 ->{has_reference});
5965 }
5966 if ($token->{attributes}->{content}) {
5967 !!!cp ('t338');
5968 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
5969 ->set_user_data (manakai_has_reference =>
5970 $token->{attributes}->{content}
5971 ->{has_reference});
5972 }
5973 }
5974
5975 !!!ack ('t338.1');
5976 !!!next-token;
5977 next B;
5978 } elsif ($token->{tag_name} eq 'title') {
5979 !!!cp ('t341');
5980 ## NOTE: This is an "as if in head" code clone
5981 $parse_rcdata->(RCDATA_CONTENT_MODEL);
5982 next B;
5983 } elsif ($token->{tag_name} eq 'body') {
5984 !!!parse-error (type => 'in body:body', token => $token);
5985
5986 if (@{$self->{open_elements}} == 1 or
5987 not ($self->{open_elements}->[1]->[1] & BODY_EL)) {
5988 !!!cp ('t342');
5989 ## Ignore the token
5990 } else {
5991 my $body_el = $self->{open_elements}->[1]->[0];
5992 for my $attr_name (keys %{$token->{attributes}}) {
5993 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
5994 !!!cp ('t343');
5995 $body_el->set_attribute_ns
5996 (undef, [undef, $attr_name],
5997 $token->{attributes}->{$attr_name}->{value});
5998 }
5999 }
6000 }
6001 !!!nack ('t343.1');
6002 !!!next-token;
6003 next B;
6004 } elsif ({
6005 address => 1, blockquote => 1, center => 1, dir => 1,
6006 div => 1, dl => 1, fieldset => 1,
6007 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6008 menu => 1, ol => 1, p => 1, ul => 1,
6009 pre => 1, listing => 1,
6010 form => 1,
6011 table => 1,
6012 hr => 1,
6013 }->{$token->{tag_name}}) {
6014 if ($token->{tag_name} eq 'form' and defined $self->{form_element}) {
6015 !!!cp ('t350');
6016 !!!parse-error (type => 'in form:form', token => $token);
6017 ## Ignore the token
6018 !!!nack ('t350.1');
6019 !!!next-token;
6020 next B;
6021 }
6022
6023 ## has a p element in scope
6024 INSCOPE: for (reverse @{$self->{open_elements}}) {
6025 if ($_->[1] & P_EL) {
6026 !!!cp ('t344');
6027 !!!back-token; # <form>
6028 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6029 line => $token->{line}, column => $token->{column}};
6030 next B;
6031 } elsif ($_->[1] & SCOPING_EL) {
6032 !!!cp ('t345');
6033 last INSCOPE;
6034 }
6035 } # INSCOPE
6036
6037 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6038 if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') {
6039 !!!nack ('t346.1');
6040 !!!next-token;
6041 if ($token->{type} == CHARACTER_TOKEN) {
6042 $token->{data} =~ s/^\x0A//;
6043 unless (length $token->{data}) {
6044 !!!cp ('t346');
6045 !!!next-token;
6046 } else {
6047 !!!cp ('t349');
6048 }
6049 } else {
6050 !!!cp ('t348');
6051 }
6052 } elsif ($token->{tag_name} eq 'form') {
6053 !!!cp ('t347.1');
6054 $self->{form_element} = $self->{open_elements}->[-1]->[0];
6055
6056 !!!nack ('t347.2');
6057 !!!next-token;
6058 } elsif ($token->{tag_name} eq 'table') {
6059 !!!cp ('t382');
6060 push @{$open_tables}, [$self->{open_elements}->[-1]->[0]];
6061
6062 $self->{insertion_mode} = IN_TABLE_IM;
6063
6064 !!!nack ('t382.1');
6065 !!!next-token;
6066 } elsif ($token->{tag_name} eq 'hr') {
6067 !!!cp ('t386');
6068 pop @{$self->{open_elements}};
6069
6070 !!!nack ('t386.1');
6071 !!!next-token;
6072 } else {
6073 !!!nack ('t347.1');
6074 !!!next-token;
6075 }
6076 next B;
6077 } elsif ({li => 1, dt => 1, dd => 1}->{$token->{tag_name}}) {
6078 ## has a p element in scope
6079 INSCOPE: for (reverse @{$self->{open_elements}}) {
6080 if ($_->[1] & P_EL) {
6081 !!!cp ('t353');
6082 !!!back-token; # <x>
6083 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6084 line => $token->{line}, column => $token->{column}};
6085 next B;
6086 } elsif ($_->[1] & SCOPING_EL) {
6087 !!!cp ('t354');
6088 last INSCOPE;
6089 }
6090 } # INSCOPE
6091
6092 ## Step 1
6093 my $i = -1;
6094 my $node = $self->{open_elements}->[$i];
6095 my $li_or_dtdd = {li => {li => 1},
6096 dt => {dt => 1, dd => 1},
6097 dd => {dt => 1, dd => 1}}->{$token->{tag_name}};
6098 LI: {
6099 ## Step 2
6100 if ($li_or_dtdd->{$node->[0]->manakai_local_name}) {
6101 if ($i != -1) {
6102 !!!cp ('t355');
6103 !!!parse-error (type => 'not closed',
6104 value => $self->{open_elements}->[-1]->[0]
6105 ->manakai_local_name,
6106 token => $token);
6107 } else {
6108 !!!cp ('t356');
6109 }
6110 splice @{$self->{open_elements}}, $i;
6111 last LI;
6112 } else {
6113 !!!cp ('t357');
6114 }
6115
6116 ## Step 3
6117 if (not ($node->[1] & FORMATTING_EL) and
6118 #not $phrasing_category->{$node->[1]} and
6119 ($node->[1] & SPECIAL_EL or
6120 $node->[1] & SCOPING_EL) and
6121 not ($node->[1] & ADDRESS_EL) and
6122 not ($node->[1] & DIV_EL)) {
6123 !!!cp ('t358');
6124 last LI;
6125 }
6126
6127 !!!cp ('t359');
6128 ## Step 4
6129 $i--;
6130 $node = $self->{open_elements}->[$i];
6131 redo LI;
6132 } # LI
6133
6134 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6135 !!!nack ('t359.1');
6136 !!!next-token;
6137 next B;
6138 } elsif ($token->{tag_name} eq 'plaintext') {
6139 ## has a p element in scope
6140 INSCOPE: for (reverse @{$self->{open_elements}}) {
6141 if ($_->[1] & P_EL) {
6142 !!!cp ('t367');
6143 !!!back-token; # <plaintext>
6144 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6145 line => $token->{line}, column => $token->{column}};
6146 next B;
6147 } elsif ($_->[1] & SCOPING_EL) {
6148 !!!cp ('t368');
6149 last INSCOPE;
6150 }
6151 } # INSCOPE
6152
6153 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6154
6155 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
6156
6157 !!!nack ('t368.1');
6158 !!!next-token;
6159 next B;
6160 } elsif ($token->{tag_name} eq 'a') {
6161 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
6162 my $node = $active_formatting_elements->[$i];
6163 if ($node->[1] & A_EL) {
6164 !!!cp ('t371');
6165 !!!parse-error (type => 'in a:a', token => $token);
6166
6167 !!!back-token; # <a>
6168 $token = {type => END_TAG_TOKEN, tag_name => 'a',
6169 line => $token->{line}, column => $token->{column}};
6170 $formatting_end_tag->($token);
6171
6172 AFE2: for (reverse 0..$#$active_formatting_elements) {
6173 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
6174 !!!cp ('t372');
6175 splice @$active_formatting_elements, $_, 1;
6176 last AFE2;
6177 }
6178 } # AFE2
6179 OE: for (reverse 0..$#{$self->{open_elements}}) {
6180 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
6181 !!!cp ('t373');
6182 splice @{$self->{open_elements}}, $_, 1;
6183 last OE;
6184 }
6185 } # OE
6186 last AFE;
6187 } elsif ($node->[0] eq '#marker') {
6188 !!!cp ('t374');
6189 last AFE;
6190 }
6191 } # AFE
6192
6193 $reconstruct_active_formatting_elements->($insert_to_current);
6194
6195 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6196 push @$active_formatting_elements, $self->{open_elements}->[-1];
6197
6198 !!!nack ('t374.1');
6199 !!!next-token;
6200 next B;
6201 } elsif ($token->{tag_name} eq 'nobr') {
6202 $reconstruct_active_formatting_elements->($insert_to_current);
6203
6204 ## has a |nobr| element in scope
6205 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6206 my $node = $self->{open_elements}->[$_];
6207 if ($node->[1] & NOBR_EL) {
6208 !!!cp ('t376');
6209 !!!parse-error (type => 'in nobr:nobr', token => $token);
6210 !!!back-token; # <nobr>
6211 $token = {type => END_TAG_TOKEN, tag_name => 'nobr',
6212 line => $token->{line}, column => $token->{column}};
6213 next B;
6214 } elsif ($node->[1] & SCOPING_EL) {
6215 !!!cp ('t377');
6216 last INSCOPE;
6217 }
6218 } # INSCOPE
6219
6220 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6221 push @$active_formatting_elements, $self->{open_elements}->[-1];
6222
6223 !!!nack ('t377.1');
6224 !!!next-token;
6225 next B;
6226 } elsif ($token->{tag_name} eq 'button') {
6227 ## has a button element in scope
6228 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6229 my $node = $self->{open_elements}->[$_];
6230 if ($node->[1] & BUTTON_EL) {
6231 !!!cp ('t378');
6232 !!!parse-error (type => 'in button:button', token => $token);
6233 !!!back-token; # <button>
6234 $token = {type => END_TAG_TOKEN, tag_name => 'button',
6235 line => $token->{line}, column => $token->{column}};
6236 next B;
6237 } elsif ($node->[1] & SCOPING_EL) {
6238 !!!cp ('t379');
6239 last INSCOPE;
6240 }
6241 } # INSCOPE
6242
6243 $reconstruct_active_formatting_elements->($insert_to_current);
6244
6245 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6246
6247 ## TODO: associate with $self->{form_element} if defined
6248
6249 push @$active_formatting_elements, ['#marker', ''];
6250
6251 !!!nack ('t379.1');
6252 !!!next-token;
6253 next B;
6254 } elsif ({
6255 xmp => 1,
6256 iframe => 1,
6257 noembed => 1,
6258 noframes => 1,
6259 noscript => 0, ## TODO: 1 if scripting is enabled
6260 }->{$token->{tag_name}}) {
6261 if ($token->{tag_name} eq 'xmp') {
6262 !!!cp ('t381');
6263 $reconstruct_active_formatting_elements->($insert_to_current);
6264 } else {
6265 !!!cp ('t399');
6266 }
6267 ## NOTE: There is an "as if in body" code clone.
6268 $parse_rcdata->(CDATA_CONTENT_MODEL);
6269 next B;
6270 } elsif ($token->{tag_name} eq 'isindex') {
6271 !!!parse-error (type => 'isindex', token => $token);
6272
6273 if (defined $self->{form_element}) {
6274 !!!cp ('t389');
6275 ## Ignore the token
6276 !!!nack ('t389'); ## NOTE: Not acknowledged.
6277 !!!next-token;
6278 next B;
6279 } else {
6280 my $at = $token->{attributes};
6281 my $form_attrs;
6282 $form_attrs->{action} = $at->{action} if $at->{action};
6283 my $prompt_attr = $at->{prompt};
6284 $at->{name} = {name => 'name', value => 'isindex'};
6285 delete $at->{action};
6286 delete $at->{prompt};
6287 my @tokens = (
6288 {type => START_TAG_TOKEN, tag_name => 'form',
6289 attributes => $form_attrs,
6290 line => $token->{line}, column => $token->{column}},
6291 {type => START_TAG_TOKEN, tag_name => 'hr',
6292 line => $token->{line}, column => $token->{column}},
6293 {type => START_TAG_TOKEN, tag_name => 'p',
6294 line => $token->{line}, column => $token->{column}},
6295 {type => START_TAG_TOKEN, tag_name => 'label',
6296 line => $token->{line}, column => $token->{column}},
6297 );
6298 if ($prompt_attr) {
6299 !!!cp ('t390');
6300 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value},
6301 #line => $token->{line}, column => $token->{column},
6302 };
6303 } else {
6304 !!!cp ('t391');
6305 push @tokens, {type => CHARACTER_TOKEN,
6306 data => 'This is a searchable index. Insert your search keywords here: ',
6307 #line => $token->{line}, column => $token->{column},
6308 }; # SHOULD
6309 ## TODO: make this configurable
6310 }
6311 push @tokens,
6312 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at,
6313 line => $token->{line}, column => $token->{column}},
6314 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
6315 {type => END_TAG_TOKEN, tag_name => 'label',
6316 line => $token->{line}, column => $token->{column}},
6317 {type => END_TAG_TOKEN, tag_name => 'p',
6318 line => $token->{line}, column => $token->{column}},
6319 {type => START_TAG_TOKEN, tag_name => 'hr',
6320 line => $token->{line}, column => $token->{column}},
6321 {type => END_TAG_TOKEN, tag_name => 'form',
6322 line => $token->{line}, column => $token->{column}};
6323 !!!nack ('t391.1'); ## NOTE: Not acknowledged.
6324 !!!back-token (@tokens);
6325 !!!next-token;
6326 next B;
6327 }
6328 } elsif ($token->{tag_name} eq 'textarea') {
6329 my $tag_name = $token->{tag_name};
6330 my $el;
6331 !!!create-element ($el, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
6332
6333 ## TODO: $self->{form_element} if defined
6334 $self->{content_model} = RCDATA_CONTENT_MODEL;
6335 delete $self->{escape}; # MUST
6336
6337 $insert->($el);
6338
6339 my $text = '';
6340 !!!nack ('t392.1');
6341 !!!next-token;
6342 if ($token->{type} == CHARACTER_TOKEN) {
6343 $token->{data} =~ s/^\x0A//;
6344 unless (length $token->{data}) {
6345 !!!cp ('t392');
6346 !!!next-token;
6347 } else {
6348 !!!cp ('t393');
6349 }
6350 } else {
6351 !!!cp ('t394');
6352 }
6353 while ($token->{type} == CHARACTER_TOKEN) {
6354 !!!cp ('t395');
6355 $text .= $token->{data};
6356 !!!next-token;
6357 }
6358 if (length $text) {
6359 !!!cp ('t396');
6360 $el->manakai_append_text ($text);
6361 }
6362
6363 $self->{content_model} = PCDATA_CONTENT_MODEL;
6364
6365 if ($token->{type} == END_TAG_TOKEN and
6366 $token->{tag_name} eq $tag_name) {
6367 !!!cp ('t397');
6368 ## Ignore the token
6369 } else {
6370 !!!cp ('t398');
6371 !!!parse-error (type => 'in RCDATA:#'.$token->{type}, token => $token);
6372 }
6373 !!!next-token;
6374 next B;
6375 } elsif ($token->{tag_name} eq 'math' or
6376 $token->{tag_name} eq 'svg') {
6377 $reconstruct_active_formatting_elements->($insert_to_current);
6378
6379 !!!insert-element-f ($token->{tag_name} eq 'math' ? $MML_NS : $SVG_NS, $token);
6380
6381 if ($self->{self_closing}) {
6382 pop @{$self->{open_elements}};
6383 !!!ack ('t398.1');
6384 } else {
6385 !!!cp ('t398.2');
6386 $self->{insertion_mode} |= IN_FOREIGN_CONTENT_IM;
6387 ## NOTE: |<body><math><mi><svg>| -> "in foreign content" insertion
6388 ## mode, "in body" (not "in foreign content") secondary insertion
6389 ## mode, maybe.
6390 }
6391
6392 !!!next-token;
6393 next B;
6394 } elsif ({
6395 caption => 1, col => 1, colgroup => 1, frame => 1,
6396 frameset => 1, head => 1, option => 1, optgroup => 1,
6397 tbody => 1, td => 1, tfoot => 1, th => 1,
6398 thead => 1, tr => 1,
6399 }->{$token->{tag_name}}) {
6400 !!!cp ('t401');
6401 !!!parse-error (type => 'in body:'.$token->{tag_name}, token => $token);
6402 ## Ignore the token
6403 !!!nack ('t401.1'); ## NOTE: |<col/>| or |<frame/>| here is an error.
6404 !!!next-token;
6405 next B;
6406
6407 ## ISSUE: An issue on HTML5 new elements in the spec.
6408 } else {
6409 if ($token->{tag_name} eq 'image') {
6410 !!!cp ('t384');
6411 !!!parse-error (type => 'image', token => $token);
6412 $token->{tag_name} = 'img';
6413 } else {
6414 !!!cp ('t385');
6415 }
6416
6417 ## NOTE: There is an "as if <br>" code clone.
6418 $reconstruct_active_formatting_elements->($insert_to_current);
6419
6420 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6421
6422 if ({
6423 applet => 1, marquee => 1, object => 1,
6424 }->{$token->{tag_name}}) {
6425 !!!cp ('t380');
6426 push @$active_formatting_elements, ['#marker', ''];
6427 !!!nack ('t380.1');
6428 } elsif ({
6429 b => 1, big => 1, em => 1, font => 1, i => 1,
6430 s => 1, small => 1, strile => 1,
6431 strong => 1, tt => 1, u => 1,
6432 }->{$token->{tag_name}}) {
6433 !!!cp ('t375');
6434 push @$active_formatting_elements, $self->{open_elements}->[-1];
6435 !!!nack ('t375.1');
6436 } elsif ($token->{tag_name} eq 'input') {
6437 !!!cp ('t388');
6438 ## TODO: associate with $self->{form_element} if defined
6439 pop @{$self->{open_elements}};
6440 !!!ack ('t388.2');
6441 } elsif ({
6442 area => 1, basefont => 1, bgsound => 1, br => 1,
6443 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
6444 #image => 1,
6445 }->{$token->{tag_name}}) {
6446 !!!cp ('t388.1');
6447 pop @{$self->{open_elements}};
6448 !!!ack ('t388.3');
6449 } elsif ($token->{tag_name} eq 'select') {
6450 ## TODO: associate with $self->{form_element} if defined
6451
6452 if ($self->{insertion_mode} & TABLE_IMS or
6453 $self->{insertion_mode} & BODY_TABLE_IMS or
6454 $self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
6455 !!!cp ('t400.1');
6456 $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM;
6457 } else {
6458 !!!cp ('t400.2');
6459 $self->{insertion_mode} = IN_SELECT_IM;
6460 }
6461 !!!nack ('t400.3');
6462 } else {
6463 !!!nack ('t402');
6464 }
6465
6466 !!!next-token;
6467 next B;
6468 }
6469 } elsif ($token->{type} == END_TAG_TOKEN) {
6470 if ($token->{tag_name} eq 'body') {
6471 ## has a |body| element in scope
6472 my $i;
6473 INSCOPE: {
6474 for (reverse @{$self->{open_elements}}) {
6475 if ($_->[1] & BODY_EL) {
6476 !!!cp ('t405');
6477 $i = $_;
6478 last INSCOPE;
6479 } elsif ($_->[1] & SCOPING_EL) {
6480 !!!cp ('t405.1');
6481 last;
6482 }
6483 }
6484
6485 !!!parse-error (type => 'start tag not allowed',
6486 value => $token->{tag_name}, token => $token);
6487 ## NOTE: Ignore the token.
6488 !!!next-token;
6489 next B;
6490 } # INSCOPE
6491
6492 for (@{$self->{open_elements}}) {
6493 unless ($_->[1] & ALL_END_TAG_OPTIONAL_EL) {
6494 !!!cp ('t403');
6495 !!!parse-error (type => 'not closed',
6496 value => $_->[0]->manakai_local_name,
6497 token => $token);
6498 last;
6499 } else {
6500 !!!cp ('t404');
6501 }
6502 }
6503
6504 $self->{insertion_mode} = AFTER_BODY_IM;
6505 !!!next-token;
6506 next B;
6507 } elsif ($token->{tag_name} eq 'html') {
6508 ## TODO: Update this code. It seems that the code below is not
6509 ## up-to-date, though it has same effect as speced.
6510 if (@{$self->{open_elements}} > 1 and
6511 $self->{open_elements}->[1]->[1] & BODY_EL) {
6512 ## ISSUE: There is an issue in the spec.
6513 unless ($self->{open_elements}->[-1]->[1] & BODY_EL) {
6514 !!!cp ('t406');
6515 !!!parse-error (type => 'not closed',
6516 value => $self->{open_elements}->[1]->[0]
6517 ->manakai_local_name,
6518 token => $token);
6519 } else {
6520 !!!cp ('t407');
6521 }
6522 $self->{insertion_mode} = AFTER_BODY_IM;
6523 ## reprocess
6524 next B;
6525 } else {
6526 !!!cp ('t408');
6527 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6528 ## Ignore the token
6529 !!!next-token;
6530 next B;
6531 }
6532 } elsif ({
6533 address => 1, blockquote => 1, center => 1, dir => 1,
6534 div => 1, dl => 1, fieldset => 1, listing => 1,
6535 menu => 1, ol => 1, pre => 1, ul => 1,
6536 dd => 1, dt => 1, li => 1,
6537 applet => 1, button => 1, marquee => 1, object => 1,
6538 }->{$token->{tag_name}}) {
6539 ## has an element in scope
6540 my $i;
6541 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6542 my $node = $self->{open_elements}->[$_];
6543 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6544 !!!cp ('t410');
6545 $i = $_;
6546 last INSCOPE;
6547 } elsif ($node->[1] & SCOPING_EL) {
6548 !!!cp ('t411');
6549 last INSCOPE;
6550 }
6551 } # INSCOPE
6552
6553 unless (defined $i) { # has an element in scope
6554 !!!cp ('t413');
6555 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6556 } else {
6557 ## Step 1. generate implied end tags
6558 while ({
6559 dd => ($token->{tag_name} ne 'dd'),
6560 dt => ($token->{tag_name} ne 'dt'),
6561 li => ($token->{tag_name} ne 'li'),
6562 p => 1,
6563 }->{$self->{open_elements}->[-1]->[0]->manakai_local_name}) {
6564 !!!cp ('t409');
6565 pop @{$self->{open_elements}};
6566 }
6567
6568 ## Step 2.
6569 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6570 ne $token->{tag_name}) {
6571 !!!cp ('t412');
6572 !!!parse-error (type => 'not closed',
6573 value => $self->{open_elements}->[-1]->[0]
6574 ->manakai_local_name,
6575 token => $token);
6576 } else {
6577 !!!cp ('t414');
6578 }
6579
6580 ## Step 3.
6581 splice @{$self->{open_elements}}, $i;
6582
6583 ## Step 4.
6584 $clear_up_to_marker->()
6585 if {
6586 applet => 1, button => 1, marquee => 1, object => 1,
6587 }->{$token->{tag_name}};
6588 }
6589 !!!next-token;
6590 next B;
6591 } elsif ($token->{tag_name} eq 'form') {
6592 undef $self->{form_element};
6593
6594 ## has an element in scope
6595 my $i;
6596 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6597 my $node = $self->{open_elements}->[$_];
6598 if ($node->[1] & FORM_EL) {
6599 !!!cp ('t418');
6600 $i = $_;
6601 last INSCOPE;
6602 } elsif ($node->[1] & SCOPING_EL) {
6603 !!!cp ('t419');
6604 last INSCOPE;
6605 }
6606 } # INSCOPE
6607
6608 unless (defined $i) { # has an element in scope
6609 !!!cp ('t421');
6610 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6611 } else {
6612 ## Step 1. generate implied end tags
6613 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
6614 !!!cp ('t417');
6615 pop @{$self->{open_elements}};
6616 }
6617
6618 ## Step 2.
6619 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6620 ne $token->{tag_name}) {
6621 !!!cp ('t417.1');
6622 !!!parse-error (type => 'not closed',
6623 value => $self->{open_elements}->[-1]->[0]
6624 ->manakai_local_name,
6625 token => $token);
6626 } else {
6627 !!!cp ('t420');
6628 }
6629
6630 ## Step 3.
6631 splice @{$self->{open_elements}}, $i;
6632 }
6633
6634 !!!next-token;
6635 next B;
6636 } elsif ({
6637 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6638 }->{$token->{tag_name}}) {
6639 ## has an element in scope
6640 my $i;
6641 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6642 my $node = $self->{open_elements}->[$_];
6643 if ($node->[1] & HEADING_EL) {
6644 !!!cp ('t423');
6645 $i = $_;
6646 last INSCOPE;
6647 } elsif ($node->[1] & SCOPING_EL) {
6648 !!!cp ('t424');
6649 last INSCOPE;
6650 }
6651 } # INSCOPE
6652
6653 unless (defined $i) { # has an element in scope
6654 !!!cp ('t425.1');
6655 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6656 } else {
6657 ## Step 1. generate implied end tags
6658 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
6659 !!!cp ('t422');
6660 pop @{$self->{open_elements}};
6661 }
6662
6663 ## Step 2.
6664 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6665 ne $token->{tag_name}) {
6666 !!!cp ('t425');
6667 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6668 } else {
6669 !!!cp ('t426');
6670 }
6671
6672 ## Step 3.
6673 splice @{$self->{open_elements}}, $i;
6674 }
6675
6676 !!!next-token;
6677 next B;
6678 } elsif ($token->{tag_name} eq 'p') {
6679 ## has an element in scope
6680 my $i;
6681 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6682 my $node = $self->{open_elements}->[$_];
6683 if ($node->[1] & P_EL) {
6684 !!!cp ('t410.1');
6685 $i = $_;
6686 last INSCOPE;
6687 } elsif ($node->[1] & SCOPING_EL) {
6688 !!!cp ('t411.1');
6689 last INSCOPE;
6690 }
6691 } # INSCOPE
6692
6693 if (defined $i) {
6694 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6695 ne $token->{tag_name}) {
6696 !!!cp ('t412.1');
6697 !!!parse-error (type => 'not closed',
6698 value => $self->{open_elements}->[-1]->[0]
6699 ->manakai_local_name,
6700 token => $token);
6701 } else {
6702 !!!cp ('t414.1');
6703 }
6704
6705 splice @{$self->{open_elements}}, $i;
6706 } else {
6707 !!!cp ('t413.1');
6708 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6709
6710 !!!cp ('t415.1');
6711 ## As if <p>, then reprocess the current token
6712 my $el;
6713 !!!create-element ($el, $HTML_NS, 'p',, $token);
6714 $insert->($el);
6715 ## NOTE: Not inserted into |$self->{open_elements}|.
6716 }
6717
6718 !!!next-token;
6719 next B;
6720 } elsif ({
6721 a => 1,
6722 b => 1, big => 1, em => 1, font => 1, i => 1,
6723 nobr => 1, s => 1, small => 1, strile => 1,
6724 strong => 1, tt => 1, u => 1,
6725 }->{$token->{tag_name}}) {
6726 !!!cp ('t427');
6727 $formatting_end_tag->($token);
6728 next B;
6729 } elsif ($token->{tag_name} eq 'br') {
6730 !!!cp ('t428');
6731 !!!parse-error (type => 'unmatched end tag:br', token => $token);
6732
6733 ## As if <br>
6734 $reconstruct_active_formatting_elements->($insert_to_current);
6735
6736 my $el;
6737 !!!create-element ($el, $HTML_NS, 'br',, $token);
6738 $insert->($el);
6739
6740 ## Ignore the token.
6741 !!!next-token;
6742 next B;
6743 } elsif ({
6744 caption => 1, col => 1, colgroup => 1, frame => 1,
6745 frameset => 1, head => 1, option => 1, optgroup => 1,
6746 tbody => 1, td => 1, tfoot => 1, th => 1,
6747 thead => 1, tr => 1,
6748 area => 1, basefont => 1, bgsound => 1,
6749 embed => 1, hr => 1, iframe => 1, image => 1,
6750 img => 1, input => 1, isindex => 1, noembed => 1,
6751 noframes => 1, param => 1, select => 1, spacer => 1,
6752 table => 1, textarea => 1, wbr => 1,
6753 noscript => 0, ## TODO: if scripting is enabled
6754 }->{$token->{tag_name}}) {
6755 !!!cp ('t429');
6756 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6757 ## Ignore the token
6758 !!!next-token;
6759 next B;
6760
6761 ## ISSUE: Issue on HTML5 new elements in spec
6762
6763 } else {
6764 ## Step 1
6765 my $node_i = -1;
6766 my $node = $self->{open_elements}->[$node_i];
6767
6768 ## Step 2
6769 S2: {
6770 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6771 ## Step 1
6772 ## generate implied end tags
6773 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
6774 !!!cp ('t430');
6775 ## ISSUE: Can this case be reached?
6776 pop @{$self->{open_elements}};
6777 }
6778
6779 ## Step 2
6780 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6781 ne $token->{tag_name}) {
6782 !!!cp ('t431');
6783 ## NOTE: <x><y></x>
6784 !!!parse-error (type => 'not closed',
6785 value => $self->{open_elements}->[-1]->[0]
6786 ->manakai_local_name,
6787 token => $token);
6788 } else {
6789 !!!cp ('t432');
6790 }
6791
6792 ## Step 3
6793 splice @{$self->{open_elements}}, $node_i;
6794
6795 !!!next-token;
6796 last S2;
6797 } else {
6798 ## Step 3
6799 if (not ($node->[1] & FORMATTING_EL) and
6800 #not $phrasing_category->{$node->[1]} and
6801 ($node->[1] & SPECIAL_EL or
6802 $node->[1] & SCOPING_EL)) {
6803 !!!cp ('t433');
6804 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6805 ## Ignore the token
6806 !!!next-token;
6807 last S2;
6808 }
6809
6810 !!!cp ('t434');
6811 }
6812
6813 ## Step 4
6814 $node_i--;
6815 $node = $self->{open_elements}->[$node_i];
6816
6817 ## Step 5;
6818 redo S2;
6819 } # S2
6820 next B;
6821 }
6822 }
6823 next B;
6824 } continue { # B
6825 if ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
6826 ## NOTE: The code below is executed in cases where it does not have
6827 ## to be, but it is harmless even in those cases.
6828 ## has an element in scope
6829 INSCOPE: {
6830 for (reverse 0..$#{$self->{open_elements}}) {
6831 my $node = $self->{open_elements}->[$_];
6832 if ($node->[1] & FOREIGN_EL) {
6833 last INSCOPE;
6834 } elsif ($node->[1] & SCOPING_EL) {
6835 last;
6836 }
6837 }
6838
6839 ## NOTE: No foreign element in scope.
6840 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
6841 } # INSCOPE
6842 }
6843 } # B
6844
6845 ## Stop parsing # MUST
6846
6847 ## TODO: script stuffs
6848 } # _tree_construct_main
6849
## Implements the |innerHTML| setter for a document or an element node.
##
## Invoked as |Whatpm::HTML->set_inner_html ($node, $string, $onerror)|:
##
##   $node    - The node whose children are replaced.  Its |node_type|
##              must be 9 (document) or 1 (element).
##   $string  - The markup to parse.  It is accessed through a reference
##              to the caller's argument, so it is not copied.
##   $onerror - Optional code reference that receives each parse error as
##              a list of name/value pairs.  If omitted, errors are
##              reported with |warn|.
##
## Dies if |$node| has any other node type.
sub set_inner_html ($$$) {
  my $class = shift;
  my $node = shift;
  my $s = \$_[0];
  my $onerror = $_[1];

  ## ISSUE: Should {confident} be true?

  my $nt = $node->node_type;
  if ($nt == 9) {
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## NOTE: Iterate over a snapshot of the child list, since the
    ## children are removed inside the loop.
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }

    ## Step 3, 4, 5 # MUST
    $class->parse_string ($$s => $node, $onerror);
  } elsif ($nt == 1) {
    ## TODO: If non-html element

    ## NOTE: Most of this code is copied from |parse_string|

    ## Step 1 # MUST
    ## The fragment is parsed into a fresh, temporary HTML document
    ## created from the implementation of the context node's document.
    my $this_doc = $node->owner_document;
    my $doc = $this_doc->implementation->create_document;
    $doc->manakai_is_html (1);
    my $p = $class->new;
    $p->{document} = $doc;

    ## Step 8 # MUST
    my $i = 0; # offset of the next character to read from |$$s|
    $p->{line_prev} = $p->{line} = 1;
    $p->{column_prev} = $p->{column} = 0;
    $p->{set_next_char} = sub {
      my $self = shift;

      ## NOTE: All parser state is accessed through |$self| (the parser
      ## this callback is invoked on) rather than through the enclosing
      ## |$p|.  The closure is stored in |$p|, so capturing |$p| here
      ## would create a reference cycle that is never broken.

      pop @{$self->{prev_char}};
      unshift @{$self->{prev_char}}, $self->{next_char};

      ## End of input is signalled by -1.
      if ($i >= length $$s) {
        $self->{next_char} = -1;
        return;
      }
      $self->{next_char} = ord substr $$s, $i++, 1;

      ($self->{line_prev}, $self->{column_prev})
          = ($self->{line}, $self->{column});
      $self->{column}++;

      if ($self->{next_char} == 0x000A) { # LF
        $self->{line}++;
        $self->{column} = 0;
        !!!cp ('i1');
      } elsif ($self->{next_char} == 0x000D) { # CR
        ## CR and CR LF are both reported as a single LF.
        $i++ if substr ($$s, $i, 1) eq "\x0A";
        $self->{next_char} = 0x000A; # LF # MUST
        $self->{line}++;
        $self->{column} = 0;
        !!!cp ('i2');
      } elsif ($self->{next_char} > 0x10FFFF) {
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
        !!!cp ('i3');
      } elsif ($self->{next_char} == 0x0000) { # NULL
        !!!cp ('i4');
        !!!parse-error (type => 'NULL');
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
      }
    };
    $p->{prev_char} = [-1, -1, -1];
    $p->{next_char} = -1;

    ## Default error handler: prefer the position recorded in the token,
    ## if any, over the current input position.
    my $ponerror = $onerror || sub {
      my (%opt) = @_;
      my $line = $opt{line};
      my $column = $opt{column};
      if (defined $opt{token} and defined $opt{token}->{line}) {
        $line = $opt{token}->{line};
        $column = $opt{token}->{column};
      }
      warn "Parse error ($opt{type}) at line $line column $column\n";
    };
    ## NOTE: This closure captures |$p| and is stored in |$p|; the
    ## resulting cycle is broken at the end of this branch.
    $p->{parse_error} = sub {
      $ponerror->(line => $p->{line}, column => $p->{column}, @_);
    };

    $p->_initialize_tokenizer;
    $p->_initialize_tree_constructor;

    ## Step 2
    ## The initial content model depends on the context element.
    my $node_ln = $node->manakai_local_name;
    $p->{content_model} = {
      title => RCDATA_CONTENT_MODEL,
      textarea => RCDATA_CONTENT_MODEL,
      style => CDATA_CONTENT_MODEL,
      script => CDATA_CONTENT_MODEL,
      xmp => CDATA_CONTENT_MODEL,
      iframe => CDATA_CONTENT_MODEL,
      noembed => CDATA_CONTENT_MODEL,
      noframes => CDATA_CONTENT_MODEL,
      noscript => CDATA_CONTENT_MODEL,
      plaintext => PLAINTEXT_CONTENT_MODEL,
    }->{$node_ln};
    $p->{content_model} = PCDATA_CONTENT_MODEL
        unless defined $p->{content_model};
    ## ISSUE: What is "the name of the element"? local name?

    $p->{inner_html_node} = [$node, $el_category->{$node_ln}];
    ## TODO: Foreign element OK?

    ## Step 3
    my $root = $doc->create_element_ns ($HTML_NS, [undef, 'html']);

    ## Step 4 # MUST
    $doc->append_child ($root);

    ## Step 5 # MUST
    push @{$p->{open_elements}}, [$root, $el_category->{html}];

    undef $p->{head_element};

    ## Step 6 # MUST
    $p->_reset_insertion_mode;

    ## Step 7 # MUST
    ## Walk from the context node up through its ancestors and remember
    ## the nearest HTML |form| element, if any.
    my $anode = $node;
    AN: while (defined $anode) {
      if ($anode->node_type == 1) {
        my $nsuri = $anode->namespace_uri;
        if (defined $nsuri and $nsuri eq $HTML_NS) {
          if ($anode->manakai_local_name eq 'form') {
            !!!cp ('i5');
            $p->{form_element} = $anode;
            last AN;
          }
        }
      }
      $anode = $anode->parent_node;
    } # AN

    ## Step 9 # MUST
    {
      ## NOTE: The |!!!next-token| macro operates on |$self|.
      my $self = $p;
      !!!next-token;
    }
    $p->_tree_construction_main;

    ## Step 10 # MUST
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }
    ## ISSUE: mutation events? read-only?

    ## Step 11 # MUST
    ## Move the parsed children from the temporary root to the context
    ## node, adopting each into the context node's document first.
    @cn = @{$root->child_nodes};
    for (@cn) {
      $this_doc->adopt_node ($_);
      $node->append_child ($_);
    }
    ## ISSUE: mutation events?

    $p->_terminate_tree_constructor;

    delete $p->{parse_error}; # delete loop
  } else {
    die "$0: |set_inner_html| is not defined for node of type $nt";
  }
} # set_inner_html
7022
7023 } # tree construction stage
7024
package Whatpm::HTML::RestartParser;
## Exception class derived from |Error|.
## NOTE(review): judging by its name, this is presumably thrown to ask
## the caller to restart parsing -- confirm against the code that throws it.
our @ISA;
push @ISA, 'Error';

1;
7029 # $Date: 2008/04/12 15:25:52 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24