/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.182 - (show annotations) (download) (as text)
Mon Sep 15 07:19:03 2008 UTC (17 years, 4 months ago) by wakaba
Branch: MAIN
Changes since 1.181: +34 -57 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/ChangeLog	15 Sep 2008 07:17:34 -0000
	* HTML.pm.src: Remove checking for control character, surrogate
	pair, or noncharacter code points and non-Unicode code
	points (they should be handled by Whatpm::Charset::UnicodeChecker).
	(parse_char_stream): Support for the |$get_wrapper| argument and
	character stream error handlers.

2008-09-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/Charset/ChangeLog	15 Sep 2008 07:18:45 -0000
	* DecodeHandle.pm (onerror): Return |undef| if no explicit value
	is set.

	* UnicodeChecker.pm: Support for HTML5 parse errors.
	(onerror): Return |undef| if no explicit value is set.

2008-09-15  Wakaba  <wakaba@suika.fam.cx>

## Package header of the Whatpm::HTML module.
1 package Whatpm::HTML;
2 use strict;
## |$VERSION| is derived from the CVS |Revision| keyword: every run of
## digits is extracted, the first is printed as-is and each following one
## is printed as a zero-padded two-digit field (e.g. "1.181").
3 our $VERSION=do{my @r=(q$Revision: 1.181 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
## |try| / |catch ... with| used by |parse_byte_stream| come from Error.pm.
4 use Error qw(:try);
5
6 ## NOTE: This module doesn't check all HTML5 parse errors; character
7 ## encoding related parse errors are expected to be handled by relevant
8 ## modules.
9 ## Parse errors for control characters that are not allowed in HTML5
10 ## documents, for surrogate code points, and for noncharacter code
11 ## points, as well as U+FFFD substitutions for characters whose code point
12 ## is higher than U+10FFFF may be detected by combining the parser with
13 ## the checker implemented by Whatpm::Charset::UnicodeChecker (for its
14 ## usage example, see |t/HTML-tree.t| in the Whatpm package or the
15 ## WebHACC::Language::HTML module in the WebHACC package).
16
17 ## ISSUE:
18 ## var doc = implementation.createDocument (null, null, null);
19 ## doc.write ('');
20 ## alert (doc.compatMode);
21
22 require IO::Handle;
23
## Namespace URIs (HTML, MathML, SVG, XLink, XML and XMLNS), declared in
## one list assignment.
my ($HTML_NS, $MML_NS, $SVG_NS, $XLINK_NS, $XML_NS, $XMLNS_NS) = (
  'http://www.w3.org/1999/xhtml',
  'http://www.w3.org/1998/Math/MathML',
  'http://www.w3.org/2000/svg',
  'http://www.w3.org/1999/xlink',
  'http://www.w3.org/XML/1998/namespace',
  'http://www.w3.org/2000/xmlns/',
);
30
## Element category bit flags.  Every constant occupies exactly one bit
## (bit 0 .. bit 29) so that categories can be combined with |||.
sub A_EL                    () { 1 <<  0 }
sub ADDRESS_EL              () { 1 <<  1 }
sub BODY_EL                 () { 1 <<  2 }
sub BUTTON_EL               () { 1 <<  3 }
sub CAPTION_EL              () { 1 <<  4 }
sub DD_EL                   () { 1 <<  5 }
sub DIV_EL                  () { 1 <<  6 }
sub DT_EL                   () { 1 <<  7 }
sub FORM_EL                 () { 1 <<  8 }
sub FORMATTING_EL           () { 1 <<  9 }
sub FRAMESET_EL             () { 1 << 10 }
sub HEADING_EL              () { 1 << 11 }
sub HTML_EL                 () { 1 << 12 }
sub LI_EL                   () { 1 << 13 }
sub NOBR_EL                 () { 1 << 14 }
sub OPTION_EL               () { 1 << 15 }
sub OPTGROUP_EL             () { 1 << 16 }
sub P_EL                    () { 1 << 17 }
sub SELECT_EL               () { 1 << 18 }
sub TABLE_EL                () { 1 << 19 }
sub TABLE_CELL_EL           () { 1 << 20 }
sub TABLE_ROW_EL            () { 1 << 21 }
sub TABLE_ROW_GROUP_EL      () { 1 << 22 }
sub MISC_SCOPING_EL         () { 1 << 23 }
sub MISC_SPECIAL_EL         () { 1 << 24 }
sub FOREIGN_EL              () { 1 << 25 }
sub FOREIGN_FLOW_CONTENT_EL () { 1 << 26 }
sub MML_AXML_EL             () { 1 << 27 }
sub RUBY_EL                 () { 1 << 28 }
sub RUBY_COMPONENT_EL       () { 1 << 29 }
61
## Union of the table, table row group and table row categories.
sub TABLE_ROWS_EL () { TABLE_ROW_GROUP_EL | TABLE_ROW_EL | TABLE_EL }
67
## Categories whose end tags are implied by the "generate implied end
## tags" algorithm.
## NOTE: A modified version of this set is also used in one place of the
## "generate implied end tags" implementation (search for the function
## mae).
sub END_TAG_OPTIONAL_EL () {
  RUBY_COMPONENT_EL | P_EL | LI_EL | DT_EL | DD_EL
}
79
## Categories that may be left open; used by the </body> and EOF
## algorithms.
sub ALL_END_TAG_OPTIONAL_EL () {
  ## dd, dt, li and p ...
  (DD_EL | DT_EL | LI_EL | P_EL)
  ## ... plus the document and table structure elements.
  | (BODY_EL | HTML_EL | TABLE_CELL_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL)
}
93
## Union of the categories treated as "scoping" elements.
sub SCOPING_EL () {
  MISC_SCOPING_EL | TABLE_CELL_EL | TABLE_EL | HTML_EL | CAPTION_EL
      | BUTTON_EL
}
102
## Scope boundary set made of the html and table categories.
sub TABLE_SCOPING_EL () { TABLE_EL | HTML_EL }
107
## Scope boundary set made of the html and table row group categories.
sub TABLE_ROWS_SCOPING_EL () { TABLE_ROW_GROUP_EL | HTML_EL }
112
## Scope boundary set made of the html and table row categories.
sub TABLE_ROW_SCOPING_EL () { TABLE_ROW_EL | HTML_EL }
117
## Union of the categories treated as "special" elements.
sub SPECIAL_EL () {
  ## Block containers.
  (ADDRESS_EL | BODY_EL | DIV_EL)
  ## Elements whose end tag may be implied.
  | (DD_EL | DT_EL | LI_EL | P_EL)
  ## Forms, frames, headings, select-related and table row elements.
  | (FORM_EL | FRAMESET_EL | HEADING_EL)
  | (OPTION_EL | OPTGROUP_EL | SELECT_EL)
  | (TABLE_ROW_EL | TABLE_ROW_GROUP_EL)
  ## Everything else flagged individually as special.
  | MISC_SPECIAL_EL
}
138
## Maps an HTML element name to its element category bit flags.  The
## entries are grouped by category; names sharing one category are
## generated from a word list.
my $el_category = {
  ## Elements carrying two flags.
  a    => A_EL | FORMATTING_EL,
  nobr => NOBR_EL | FORMATTING_EL,

  ## Formatting elements.
  (map { ($_ => FORMATTING_EL) }
       qw(b big em font i s small strike strong tt u)),

  ## Elements with a dedicated category.
  address  => ADDRESS_EL,
  body     => BODY_EL,
  button   => BUTTON_EL,
  caption  => CAPTION_EL,
  dd       => DD_EL,
  div      => DIV_EL,
  dt       => DT_EL,
  form     => FORM_EL,
  frameset => FRAMESET_EL,
  html     => HTML_EL,
  li       => LI_EL,
  optgroup => OPTGROUP_EL,
  option   => OPTION_EL,
  p        => P_EL,
  ruby     => RUBY_EL,
  select   => SELECT_EL,
  table    => TABLE_EL,
  tr       => TABLE_ROW_EL,
  (map { ($_ => HEADING_EL) } qw(h1 h2 h3 h4 h5 h6)),
  (map { ($_ => RUBY_COMPONENT_EL) } qw(rp rt)),
  (map { ($_ => TABLE_CELL_EL) } qw(td th)),
  (map { ($_ => TABLE_ROW_GROUP_EL) } qw(tbody tfoot thead)),

  ## Other scoping elements.
  (map { ($_ => MISC_SCOPING_EL) } qw(applet marquee object)),

  ## Other special elements.
  (map { ($_ => MISC_SPECIAL_EL) } qw(
    area base basefont bgsound blockquote br center col colgroup dir dl
    embed fieldset frame head hr iframe img input isindex link listing
    menu meta noembed noframes noscript ol param plaintext pre script
    spacer style textarea title ul wbr
  )),
};
226
## Category flags of foreign (MathML and SVG) elements, keyed by namespace
## URI and then by element name.
## NOTE: In addition, FOREIGN_EL is set to non-HTML elements.
my $el_category_f = {
  $MML_NS => {
    'annotation-xml' => MML_AXML_EL,
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(mi mo mn ms mtext)),
  },
  $SVG_NS => {
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(foreignObject desc title)),
  },
};
243
## Maps an all-lowercase SVG attribute name to its mixed-case spelling.
## Every key is simply the lowercased form of its value, so the table is
## generated from the list of mixed-case names.
my $svg_attr_name = {
  map { (lc ($_) => $_) } qw(
    attributeName attributeType baseFrequency baseProfile calcMode
    clipPathUnits contentScriptType contentStyleType diffuseConstant
    edgeMode externalResourcesRequired filterRes filterUnits glyphRef
    gradientTransform gradientUnits kernelMatrix kernelUnitLength
    keyPoints keySplines keyTimes lengthAdjust limitingConeAngle
    markerHeight markerUnits markerWidth maskContentUnits maskUnits
    numOctaves pathLength patternContentUnits patternTransform
    patternUnits pointsAtX pointsAtY pointsAtZ preserveAlpha
    preserveAspectRatio primitiveUnits refX refY repeatCount repeatDur
    requiredExtensions requiredFeatures specularConstant specularExponent
    spreadMethod startOffset stdDeviation stitchTiles surfaceScale
    systemLanguage tableValues targetX targetY textLength viewBox
    viewTarget xChannelSelector yChannelSelector zoomAndPan
  )
};
308
## Maps the qualified name of a namespaced attribute to
## |[namespace URI, [prefix, local name]]|.
my $foreign_attr_xname = {
  ## xlink:* attributes.
  (map { ("xlink:$_" => [$XLINK_NS, ['xlink', $_]]) }
       qw(actuate arcrole href role show title type)),
  ## xml:* attributes.
  (map { ("xml:$_" => [$XML_NS, ['xml', $_]]) } qw(base lang space)),
  ## Namespace declarations; plain |xmlns| has no prefix.
  'xmlns'       => [$XMLNS_NS, [undef, 'xmlns']],
  'xmlns:xlink' => [$XMLNS_NS, ['xmlns', 'xlink']],
};
323
324 ## ISSUE: xmlns:xlink="non-xlink-ns" is not an error.
325
## Replacement code points for the code points 0x80 .. 0x9F.  Positions
## that map to 0xFFFD are 0x81, 0x8D, 0x8F, 0x90 and 0x9D.
my $c1_entity_char = do {
  my @replacement = (
    ## 0x80 .. 0x87
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
    ## 0x88 .. 0x8F
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
    ## 0x90 .. 0x97
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
    ## 0x98 .. 0x9F
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178,
  );
  my %char_for;
  @char_for{0x80 .. 0x9F} = @replacement;
  \%char_for;
}; # $c1_entity_char
360
## parse_byte_string ($charset_name, $bytes, $doc, $onerror, $get_wrapper)
##
## Parses a string of bytes.  The string (or the scalar referenced by
## |$bytes| if a reference is given) is opened as an in-memory input
## handle and everything else is delegated to |parse_byte_stream|.
##
##   $charset_name - passed through to |parse_byte_stream|.
##   $bytes        - the byte string, or a reference to it.
##   remaining     - passed through to |parse_byte_stream| unchanged.
##
## Returns what |parse_byte_stream| returns.  Dies if the in-memory handle
## cannot be opened, instead of silently handing an unopened handle to the
## encoding sniffer.
sub parse_byte_string ($$$$;$) {
  my $self = shift;
  my $charset_name = shift;

  ## Take a reference to the caller's scalar rather than copying it.
  my $bytes_ref = ref $_[0] ? $_[0] : \($_[0]);
  open my $input, '<', $bytes_ref
      or die "parse_byte_string: can't open the byte string as a stream: $!";

  return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]);
} # parse_byte_string
367
## parse_byte_stream ($charset_name, $byte_stream, $doc, $onerror, $get_wrapper)
##
## Selects a character encoding for |$byte_stream| (caller-supplied name,
## then BOM, then UniversalCharDet, then the windows-1252 default),
## obtains a decoding handle for it and parses the decoded characters
## with |parse_char_stream|.  May be invoked on a class name, in which
## case a new parser object is created first.
##
##   $charset_name - encoding name supplied by the caller, or |undef|.
##   $byte_stream  - input object; |getc| is invoked on it while sniffing.
##   $doc          - passed through to |parse_char_stream|.
##   $onerror      - optional error callback; the default |warn|s.
##   $get_wrapper  - optional code reference that receives the decoding
##                   handle and returns the handle that is actually parsed.
##
## Returns what |parse_char_stream| returns.  If parsing throws
## Whatpm::HTML::RestartParser (see |{change_encoding}| below), the parse
## is run a second time using the newly selected decoding handle.
368 sub parse_byte_stream ($$$$;$$) {
369 # my ($self, $charset_name, $byte_stream, $doc, $onerror, $get_wrapper) = @_;
370 my $self = ref $_[0] ? shift : shift->new;
371 my $charset_name = shift;
## From here on: $_[0] = byte stream, $_[1] = $doc, $_[2] = $onerror,
## $_[3] = $get_wrapper.
372 my $byte_stream = $_[0];
373
374 my $onerror = $_[2] || sub {
375 my (%opt) = @_;
376 warn "Parse error ($opt{type})\n";
377 };
378 $self->{parse_error} = $onerror; # updated later by parse_char_stream
379
## The default wrapper returns the decoding handle unchanged.
380 my $get_wrapper = $_[3] || sub ($) {
381 return $_[0]; # $_[0] = byte stream handle, returned = arg to char handle
382 };
383
384 ## HTML5 encoding sniffing algorithm
385 require Message::Charset::Info;
## These four variables are shared with the |{change_encoding}| closure
## and the |catch| block below, both of which read or reassign them.
386 my $charset;
387 my $buffer;
388 my ($char_stream, $e_status);
389
390 SNIFFING: {
391 ## NOTE: By setting |allow_fallback| option true when the
392 ## |get_decode_handle| method is invoked, we ignore what the HTML5
393 ## spec requires, i.e. unsupported encoding should be ignored.
394 ## TODO: We should not do this unless the parser is invoked
395 ## in the conformance checking mode, in which this behavior
396 ## would be useful.
397
398 ## Step 1
## Use the caller-supplied encoding name if a decoding handle can be
## obtained for it; the result is then treated as confident.
399 if (defined $charset_name) {
400 $charset = Message::Charset::Info->get_by_html_name ($charset_name);
401 ## TODO: Is this ok? Transfer protocol's parameter should be
402 ## interpreted in its semantics?
403
404 ## ISSUE: Unsupported encoding is not ignored according to the spec.
405 ($char_stream, $e_status) = $charset->get_decode_handle
406 ($byte_stream, allow_error_reporting => 1,
407 allow_fallback => 1);
408 if ($char_stream) {
409 $self->{confident} = 1;
410 last SNIFFING;
411 } else {
412 ## TODO: unsupported error
413 }
414 }
415
416 ## Step 2
## Read at most 1024 bytes ahead, one |getc| call per byte, stopping
## early at end of input.
417 my $byte_buffer = '';
418 for (1..1024) {
419 my $char = $byte_stream->getc;
420 last unless defined $char;
421 $byte_buffer .= $char;
422 } ## TODO: timeout
423
424 ## Step 3
## A byte order mark at the very start decides the encoding (UTF-16BE,
## UTF-16LE or UTF-8) with confidence.  The bytes already read are given
## to the decoder through the |byte_buffer| option.
425 if ($byte_buffer =~ /^\xFE\xFF/) {
426 $charset = Message::Charset::Info->get_by_html_name ('utf-16be');
427 ($char_stream, $e_status) = $charset->get_decode_handle
428 ($byte_stream, allow_error_reporting => 1,
429 allow_fallback => 1, byte_buffer => \$byte_buffer);
430 $self->{confident} = 1;
431 last SNIFFING;
432 } elsif ($byte_buffer =~ /^\xFF\xFE/) {
433 $charset = Message::Charset::Info->get_by_html_name ('utf-16le');
434 ($char_stream, $e_status) = $charset->get_decode_handle
435 ($byte_stream, allow_error_reporting => 1,
436 allow_fallback => 1, byte_buffer => \$byte_buffer);
437 $self->{confident} = 1;
438 last SNIFFING;
439 } elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) {
440 $charset = Message::Charset::Info->get_by_html_name ('utf-8');
441 ($char_stream, $e_status) = $charset->get_decode_handle
442 ($byte_stream, allow_error_reporting => 1,
443 allow_fallback => 1, byte_buffer => \$byte_buffer);
444 $self->{confident} = 1;
445 last SNIFFING;
446 }
447
448 ## Step 4
449 ## TODO: <meta charset>
450
451 ## Step 5
452 ## TODO: from history
453
454 ## Step 6
## Guess the encoding from the bytes read so far.  The result is only
## tentative (|{confident}| = 0) and is reported as an info-level
## 'sniffing:chardet' error.
455 require Whatpm::Charset::UniversalCharDet;
456 $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string
457 ($byte_buffer);
458 if (defined $charset_name) {
459 $charset = Message::Charset::Info->get_by_html_name ($charset_name);
460
461 ## ISSUE: Unsupported encoding is not ignored according to the spec.
462 require Whatpm::Charset::DecodeHandle;
463 $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
464 ($byte_stream);
465 ($char_stream, $e_status) = $charset->get_decode_handle
466 ($buffer, allow_error_reporting => 1,
467 allow_fallback => 1, byte_buffer => \$byte_buffer);
468 if ($char_stream) {
## Keep a copy of the sniffed bytes in the ByteBuffer object;
## |{change_encoding}| later passes a reference to this copy.
469 $buffer->{buffer} = $byte_buffer;
470 !!!parse-error (type => 'sniffing:chardet',
471 text => $charset_name,
472 level => $self->{level}->{info},
473 layer => 'encode',
474 line => 1, column => 1);
475 $self->{confident} = 0;
476 last SNIFFING;
477 }
478 }
479
480 ## Step 7: default
481 ## TODO: Make this configurable.
482 $charset = Message::Charset::Info->get_by_html_name ('windows-1252');
483 ## NOTE: We choose |windows-1252| here, since |utf-8| should be
484 ## detectable in the step 6.
485 require Whatpm::Charset::DecodeHandle;
486 $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
487 ($byte_stream);
488 ($char_stream, $e_status)
489 = $charset->get_decode_handle ($buffer,
490 allow_error_reporting => 1,
491 allow_fallback => 1,
492 byte_buffer => \$byte_buffer);
493 $buffer->{buffer} = $byte_buffer;
494 !!!parse-error (type => 'sniffing:default',
495 text => 'windows-1252',
496 level => $self->{level}->{info},
497 line => 1, column => 1,
498 layer => 'encode');
499 $self->{confident} = 0;
500 } # SNIFFING
501
## Record the selected encoding and report, at the "uncertain" level,
## when the decoder status has the FALLBACK_ENCODING_IMPL flag or lacks
## the ERROR_REPORTING_ENCODING_IMPL flag.
## NOTE(review): the same three-way branch is repeated in the |catch|
## block below; the two copies must be kept in sync.
502 if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
503 $self->{input_encoding} = $charset->get_iana_name; ## TODO: Should we set actual charset decoder's encoding name?
504 !!!parse-error (type => 'chardecode:fallback',
505 #text => $self->{input_encoding},
506 level => $self->{level}->{uncertain},
507 line => 1, column => 1,
508 layer => 'encode');
509 } elsif (not ($e_status &
510 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL ())) {
511 $self->{input_encoding} = $charset->get_iana_name;
512 !!!parse-error (type => 'chardecode:no error',
513 text => $self->{input_encoding},
514 level => $self->{level}->{uncertain},
515 line => 1, column => 1,
516 layer => 'encode');
517 } else {
518 $self->{input_encoding} = $charset->get_iana_name;
519 }
520
## Callback invoked as ($parser, $new_charset_name, $token).  It
## reassigns the outer |$charset_name|, |$charset|, |$char_stream| and
## |$e_status|, so that a restarted parse uses the new decoder.
## NOTE(review): |$buffer| is only assigned in steps 6 and 7; when
## sniffing ended in step 1 or 3, |$buffer->{buffer}| below is
## autovivified - presumably harmless because |{confident}| is 1 on
## those paths; confirm against the callers of |{change_encoding}|.
521 $self->{change_encoding} = sub {
522 my $self = shift;
523 $charset_name = shift;
524 my $token = shift;
525
526 $charset = Message::Charset::Info->get_by_html_name ($charset_name);
527 ($char_stream, $e_status) = $charset->get_decode_handle
528 ($byte_stream, allow_error_reporting => 1, allow_fallback => 1,
529 byte_buffer => \ $buffer->{buffer});
530
531 if ($char_stream) { # if supported
532 ## "Change the encoding" algorithm:
533
534 ## Step 1
## An encoding in the UTF-16 category is replaced by UTF-8.
535 if ($charset->{category} &
536 Message::Charset::Info::CHARSET_CATEGORY_UTF16 ()) {
537 $charset = Message::Charset::Info->get_by_html_name ('utf-8');
538 ($char_stream, $e_status) = $charset->get_decode_handle
539 ($byte_stream,
540 byte_buffer => \ $buffer->{buffer});
541 }
542 $charset_name = $charset->get_iana_name;
543
544 ## Step 2
## Same encoding as the one in use: nothing to redo, just become
## confident.
545 if (defined $self->{input_encoding} and
546 $self->{input_encoding} eq $charset_name) {
547 !!!parse-error (type => 'charset label:matching',
548 text => $charset_name,
549 level => $self->{level}->{info});
550 $self->{confident} = 1;
551 return;
552 }
553
554 !!!parse-error (type => 'charset label detected',
555 text => $self->{input_encoding},
556 value => $charset_name,
557 level => $self->{level}->{warn},
558 token => $token);
559
560 ## Step 3
561 # if (can) {
562 ## change the encoding on the fly.
563 #$self->{confident} = 1;
564 #return;
565 # }
566
567 ## Step 4
## Abort the current parse; the exception is caught at the end of
## this method, which then parses again.
568 throw Whatpm::HTML::RestartParser ();
569 }
570 }; # $self->{change_encoding}
571
## Error handler installed on the (wrapped) decoding handle.  It is
## called with (handle, $type, %opt) and reports the error one column
## after the current position.  If an |octets| scalar reference is
## supplied, the referenced value is replaced by U+FFFD.
572 my $char_onerror = sub {
573 my (undef, $type, %opt) = @_;
574 !!!parse-error (layer => 'encode',
575 line => $self->{line}, column => $self->{column} + 1,
576 %opt, type => $type);
577 if ($opt{octets}) {
578 ${$opt{octets}} = "\x{FFFD}"; # replacement character
579 }
580 };
581
582 my $wrapped_char_stream = $get_wrapper->($char_stream);
583 $wrapped_char_stream->onerror ($char_onerror);
584
## The stream is already wrapped here, so no |$get_wrapper| is handed to
## |parse_char_stream|.  The arguments are copied into |@args| up front -
## presumably because the |try| / |catch| bodies do not see this
## method's |@_|; confirm against Error.pm.
585 my @args = ($_[1], $_[2]); # $doc, $onerror - $get_wrapper = undef;
586 my $return;
587 try {
588 $return = $self->parse_char_stream ($wrapped_char_stream, @args);
589 } catch Whatpm::HTML::RestartParser with {
590 ## NOTE: Invoked after {change_encoding}.
591
## |$charset|, |$char_stream| and |$e_status| now describe the
## encoding chosen by |{change_encoding}|.
592 if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
593 $self->{input_encoding} = $charset->get_iana_name; ## TODO: Should we set actual charset decoder's encoding name?
594 !!!parse-error (type => 'chardecode:fallback',
595 level => $self->{level}->{uncertain},
596 #text => $self->{input_encoding},
597 line => 1, column => 1,
598 layer => 'encode');
599 } elsif (not ($e_status &
600 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL ())) {
601 $self->{input_encoding} = $charset->get_iana_name;
602 !!!parse-error (type => 'chardecode:no error',
603 text => $self->{input_encoding},
604 level => $self->{level}->{uncertain},
605 line => 1, column => 1,
606 layer => 'encode');
607 } else {
608 $self->{input_encoding} = $charset->get_iana_name;
609 }
610 $self->{confident} = 1;
611
## Wrap the new decoding handle and parse again from the start.
612 $wrapped_char_stream = $get_wrapper->($char_stream);
613 $wrapped_char_stream->onerror ($char_onerror);
614
615 $return = $self->parse_char_stream ($wrapped_char_stream, @args);
616 };
617 return $return;
618 } # parse_byte_stream
619
620 ## NOTE: The HTML5 spec says that the encoding layer MUST NOT strip the
621 ## BOM and the HTML layer MUST ignore it. However, we do strip the BOM in
622 ## the encoding layer and the HTML layer does not ignore any U+FEFF,
623 ## because the core part of our HTML parser expects a string of characters,
624 ## not a string of bytes or code units or anything which might contain a BOM.
625 ## Therefore, any parser interface that accepts a string of bytes,
626 ## such as |parse_byte_string| in this module, must ensure that it does
627 ## strip the BOM and never strips any ZWNBSP.
628
## parse_char_string ($s, $doc, $onerror, $get_wrapper)
##
## Parses a string of characters.  |$s| is either the string itself or a
## reference to it; it is wrapped in a
## Whatpm::Charset::DecodeHandle::CharString object, which is then handed
## to |parse_char_stream| together with the remaining arguments.
## Returns what |parse_char_stream| returns.
sub parse_char_string ($$$;$$) {
  my $self = shift;
  require Whatpm::Charset::DecodeHandle;

  ## Refer to the caller's scalar instead of copying the string.
  my $string_ref = ref $_[0] ? $_[0] : \$_[0];
  my @rest = @_[1..$#_];

  my $char_stream
      = Whatpm::Charset::DecodeHandle::CharString->new ($string_ref);
  return $self->parse_char_stream ($char_stream, @rest);
} # parse_char_string

## NOTE: |parse_string| is kept as an alias for backward compatibility.
*parse_string = \&parse_char_string;
638
## parse_char_stream ($input, $doc, $onerror, $get_wrapper)
##
## Parses the characters read from |$input| into the document |$doc|,
## whose existing child nodes are removed first.  May be invoked on a
## class name, in which case a new parser object is created.
##
##   $input       - character stream object; |read|,
##                  |manakai_read_until| and |onerror| are invoked on it.
##   $doc         - document object that receives the result.
##   $onerror     - optional error callback; the default |warn|s.
##   $get_wrapper - optional code reference that receives |$input| and
##                  returns the stream object that is actually read.
##
## Returns |$doc|.
639 sub parse_char_stream ($$$;$$) {
640 my $self = ref $_[0] ? shift : shift->new;
641 my $input = $_[0];
642 $self->{document} = $_[1];
643 @{$self->{document}->child_nodes} = ();
644
645 ## NOTE: |set_inner_html| copies most of this method's code
646
## |{confident}| may already have been set by |parse_byte_stream|.
647 $self->{confident} = 1 unless exists $self->{confident};
648 $self->{document}->input_encoding ($self->{input_encoding})
649 if defined $self->{input_encoding};
650 ## TODO: |{input_encoding}| is needless?
651
652 $self->{line_prev} = $self->{line} = 1;
653 $self->{column_prev} = -1;
654 $self->{column} = 0;
## Input callback, invoked with the parser as its first argument.  It
## stores the code point of the next input character in |{next_char}|
## (-1 at end of input) and keeps |{line}| / |{column}| and their
## |_prev| counterparts up to date.  CR and CR LF are turned into a
## single LF; U+0000 is reported and replaced by U+FFFD.
655 $self->{set_next_char} = sub {
656 my $self = shift;
657
658 my $char = '';
659 if (defined $self->{next_next_char}) {
## A character read ahead while handling a CR (see below).
660 $char = $self->{next_next_char};
661 delete $self->{next_next_char};
662 $self->{next_char} = ord $char;
663 } else {
## NOTE(review): the buffer is emptied on every call although only
## one buffered character is consumed below; whether
## |manakai_read_until| can leave further characters in the buffer
## here - which would then be discarded - cannot be told from this
## method; confirm.
664 $self->{char_buffer} = '';
665 $self->{char_buffer_pos} = 0;
666
## Try to fill the buffer with characters other than NUL, LF and CR,
## which need none of the special handling further down.
667 my $count = $input->manakai_read_until
668 ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
669 if ($count) {
670 $self->{line_prev} = $self->{line};
671 $self->{column_prev} = $self->{column};
672 $self->{column}++;
673 $self->{next_char}
674 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
675 return;
676 }
677
## Nothing buffered: read a single character, or signal end of input.
678 if ($input->read ($char, 1)) {
679 $self->{next_char} = ord $char;
680 } else {
681 $self->{next_char} = -1;
682 return;
683 }
684 }
685
686 ($self->{line_prev}, $self->{column_prev})
687 = ($self->{line}, $self->{column});
688 $self->{column}++;
689
690 if ($self->{next_char} == 0x000A) { # LF
691 !!!cp ('j1');
692 $self->{line}++;
693 $self->{column} = 0;
694 } elsif ($self->{next_char} == 0x000D) { # CR
695 !!!cp ('j2');
696 ## TODO: support for abort/streaming
## Look one character ahead: an LF directly after the CR is dropped,
## any other character is kept for the next call.
697 my $next = '';
698 if ($input->read ($next, 1) and $next ne "\x0A") {
699 $self->{next_next_char} = $next;
700 }
701 $self->{next_char} = 0x000A; # LF # MUST
702 $self->{line}++;
703 $self->{column} = 0;
704 } elsif ($self->{next_char} == 0x0000) { # NULL
705 !!!cp ('j4');
706 !!!parse-error (type => 'NULL');
707 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
708 }
709 };
710
## Bulk-read callback, called as ($scalar, $specials_range, $offset).
## It appends to |$scalar| (at |$offset|) a run of characters that are
## neither listed in |$specials_range| (the inside of a character
## class) nor NUL, LF or CR, and returns the number of characters taken
## (0 if none).  After a successful read |{next_char}| is -1.
711 $self->{read_until} = sub {
712 #my ($scalar, $specials_range, $offset) = @_;
## A pending read-ahead character must be consumed through
## |{set_next_char}| first.
713 return 0 if defined $self->{next_next_char};
714
715 my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
716 my $offset = $_[2] || 0;
717
718 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
## Serve the request from the characters already buffered.
719 pos ($self->{char_buffer}) = $self->{char_buffer_pos};
720 if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
721 substr ($_[0], $offset)
722 = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
723 my $count = $+[0] - $-[0];
724 if ($count) {
725 $self->{column} += $count;
726 $self->{char_buffer_pos} += $count;
727 $self->{line_prev} = $self->{line};
728 $self->{column_prev} = $self->{column} - 1;
729 $self->{prev_char} = [-1, -1, -1];
730 $self->{next_char} = -1;
731 }
732 return $count;
733 } else {
734 return 0;
735 }
736 } else {
## Buffer exhausted: read directly from the input stream.
737 my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
738 if ($count) {
739 $self->{column} += $count;
740 $self->{line_prev} = $self->{line};
741 $self->{column_prev} = $self->{column} - 1;
742 $self->{prev_char} = [-1, -1, -1];
743 $self->{next_char} = -1;
744 }
745 return $count;
746 }
747 }; # $self->{read_until}
748
## The default error handler prints the position taken from the token,
## if one is given, or from the |line| / |column| options otherwise.
749 my $onerror = $_[2] || sub {
750 my (%opt) = @_;
751 my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
752 my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
753 warn "Parse error ($opt{type}) at line $line column $column\n";
754 };
## Supplies the current position as the default |line| / |column|;
## options given by the reporter come later in the list.
755 $self->{parse_error} = sub {
756 $onerror->(line => $self->{line}, column => $self->{column}, @_);
757 };
758
## Error handler for the character stream, called with
## (stream, $type, %opt).
759 my $char_onerror = sub {
760 my (undef, $type, %opt) = @_;
761 !!!parse-error (layer => 'encode',
762 line => $self->{line}, column => $self->{column} + 1,
763 %opt, type => $type);
764 }; # $char_onerror
765
## With |$get_wrapper| the wrapped stream always gets our handler;
## without it a handler already set on |$input| is left in place.
766 if ($_[3]) {
767 $input = $_[3]->($input);
768 $input->onerror ($char_onerror);
769 } else {
770 $input->onerror ($char_onerror) unless defined $input->onerror;
771 }
772
773 $self->_initialize_tokenizer;
774 $self->_initialize_tree_constructor;
775 $self->_construct_tree;
776 $self->_terminate_tree_constructor;
777
## NOTE(review): |{read_until}| also closes over |$self| but is not
## deleted here, and this line is skipped when parsing dies - looks
## like a reference cycle can remain; confirm.
778 delete $self->{parse_error}; # remove loop
779
780 return $self->{document};
781 } # parse_char_stream
782
## new ()
##
## Creates a parser object.  The object starts with the default error
## level table (|{level}|) and with placeholder callbacks, which are
## replaced by |parse_byte_stream| / |parse_char_stream| when a document
## is actually parsed.
##
## Returns the new object, blessed into the invocant class.
sub new ($) {
  my $class = shift;
  my $self = bless {
    level => {must => 'm',
              should => 's',
              warn => 'w',
              info => 'i',
              uncertain => 'u'},
  }, $class;

  ## The default |{set_next_char}| needs the parser object.  Closing over
  ## |$self| directly would create a reference cycle (object -> closure ->
  ## object), so that no parser would ever be freed; the closure therefore
  ## holds a weak reference only.
  require Scalar::Util;
  my $weak_self = $self;
  Scalar::Util::weaken ($weak_self);

  ## Default input callback: there is no input, so report end of input.
  $self->{set_next_char} = sub {
    $weak_self->{next_char} = -1;
  };
  ## Default error handler: parse errors are ignored.
  $self->{parse_error} = sub {
    #
  };
  ## Default: requests to change the encoding are ignored.
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  ## Default: application cache selection does nothing.
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new
809
## Content model feature flags.
sub CM_ENTITY         () { 1 << 0 } # & markup in data
sub CM_LIMITED_MARKUP () { 1 << 1 } # < markup in data (limited)
sub CM_FULL_MARKUP    () { 1 << 2 } # < markup in data (any)

## Content models, expressed as combinations of the feature flags.
sub PCDATA_CONTENT_MODEL    () { CM_FULL_MARKUP | CM_ENTITY }
sub RCDATA_CONTENT_MODEL    () { CM_LIMITED_MARKUP | CM_ENTITY }
sub CDATA_CONTENT_MODEL     () { CM_LIMITED_MARKUP }
sub PLAINTEXT_CONTENT_MODEL () { 0 }
818
## Tokenizer states.  The numbers 1 and 12 are not in use.
use constant {
  DATA_STATE => 0,
  # ENTITY_DATA_STATE => 1,
  TAG_OPEN_STATE => 2,
  CLOSE_TAG_OPEN_STATE => 3,
  TAG_NAME_STATE => 4,
  BEFORE_ATTRIBUTE_NAME_STATE => 5,
  ATTRIBUTE_NAME_STATE => 6,
  AFTER_ATTRIBUTE_NAME_STATE => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE => 11,
  # ENTITY_IN_ATTRIBUTE_VALUE_STATE => 12,
  MARKUP_DECLARATION_OPEN_STATE => 13,
  COMMENT_START_STATE => 14,
  COMMENT_START_DASH_STATE => 15,
  COMMENT_STATE => 16,
  COMMENT_END_STATE => 17,
  COMMENT_END_DASH_STATE => 18,
  BOGUS_COMMENT_STATE => 19,
  DOCTYPE_STATE => 20,
  BEFORE_DOCTYPE_NAME_STATE => 21,
  DOCTYPE_NAME_STATE => 22,
  AFTER_DOCTYPE_NAME_STATE => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE => 31,
  BOGUS_DOCTYPE_STATE => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE => 33,
  SELF_CLOSING_START_TAG_STATE => 34,
  CDATA_SECTION_STATE => 35,

  ## Parts of the "markup declaration open state" in the spec.
  MD_HYPHEN_STATE => 36,
  MD_DOCTYPE_STATE => 37,
  MD_CDATA_STATE => 38,
  ## Part of the "close tag open state" in the spec.
  CDATA_PCDATA_CLOSE_TAG_STATE => 39,
  ## Parts of the "CDATA section state" in the spec.
  CDATA_SECTION_MSE1_STATE => 40,
  CDATA_SECTION_MSE2_STATE => 41,
  ## Parts of the "after DOCTYPE name state" in the spec.
  PUBLIC_STATE => 42,
  SYSTEM_STATE => 43,

  ## NOTE: "Entity data state", "entity in attribute value state", and
  ## "consume a character reference" algorithm are jointly implemented
  ## using the following six states:
  ENTITY_STATE => 44,
  ENTITY_HASH_STATE => 45,
  NCR_NUM_STATE => 46,
  HEXREF_X_STATE => 47,
  HEXREF_HEX_STATE => 48,
  ENTITY_NAME_STATE => 49,
};
872
## Token types (values of the |type| member of a token).
use constant {
  DOCTYPE_TOKEN     => 1,
  COMMENT_TOKEN     => 2,
  START_TAG_TOKEN   => 3,
  END_TAG_TOKEN     => 4,
  END_OF_FILE_TOKEN => 5,
  CHARACTER_TOKEN   => 6,
};
879
## Insertion mode group flags (|*_IMS|), one bit each.  The two lowest
## bits are left free to number the modes within a group.
sub AFTER_HTML_IMS        () { 1 <<  2 }
sub HEAD_IMS              () { 1 <<  3 }
sub BODY_IMS              () { 1 <<  4 }
sub BODY_TABLE_IMS        () { 1 <<  5 }
sub TABLE_IMS             () { 1 <<  6 }
sub ROW_IMS               () { 1 <<  7 }
sub BODY_AFTER_IMS        () { 1 <<  8 }
sub FRAME_IMS             () { 1 <<  9 }
sub SELECT_IMS            () { 1 << 10 }
sub IN_FOREIGN_CONTENT_IM () { 1 << 11 }
  ## NOTE: "in foreign content" insertion mode is special; it is combined
  ## with the secondary insertion mode.  In this parser, they are stored
  ## together in the bit-or'ed form.

## NOTE: "initial" and "before html" insertion modes have no constants.

## NOTE: "after after body" insertion mode.
sub AFTER_HTML_BODY_IM     () { BODY_AFTER_IMS | AFTER_HTML_IMS }

## NOTE: "after after frameset" insertion mode.
sub AFTER_HTML_FRAMESET_IM () { FRAME_IMS | AFTER_HTML_IMS }

## Insertion modes: group flag(s) plus a number within the group.
sub IN_HEAD_IM            () { HEAD_IMS | 0 }
sub IN_HEAD_NOSCRIPT_IM   () { HEAD_IMS | 1 }
sub AFTER_HEAD_IM         () { HEAD_IMS | 2 }
sub BEFORE_HEAD_IM        () { HEAD_IMS | 3 }
sub IN_BODY_IM            () { BODY_IMS }
sub IN_CELL_IM            () { BODY_IMS | BODY_TABLE_IMS | 1 }
sub IN_CAPTION_IM         () { BODY_IMS | BODY_TABLE_IMS | 2 }
sub IN_ROW_IM             () { TABLE_IMS | ROW_IMS | 1 }
sub IN_TABLE_BODY_IM      () { TABLE_IMS | ROW_IMS | 2 }
sub IN_TABLE_IM           () { TABLE_IMS }
sub AFTER_BODY_IM         () { BODY_AFTER_IMS }
sub IN_FRAMESET_IM        () { FRAME_IMS | 1 }
sub AFTER_FRAMESET_IM     () { FRAME_IMS | 2 }
sub IN_SELECT_IM          () { SELECT_IMS | 1 }
sub IN_SELECT_IN_TABLE_IM () { SELECT_IMS | 2 }
sub IN_COLUMN_GROUP_IM    () { 2 }
918
919 ## Implementations MUST act as if state machine in the spec
920
## _initialize_tokenizer ()
##
## Resets the tokenizer part of the parser object and reads the first
## input character.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## The tokenizer MUST start in the data state, with the PCDATA content
  ## model.
  @$self{qw(state content_model)} = (DATA_STATE, PCDATA_CONTENT_MODEL);
  ## |{state_keyword}|, |{entity__value}|, |{entity__match}| and
  ## |{prev_state}| are initialized when used.

  ## No token, attribute or start tag yet.  These keys are kept with an
  ## undefined value, while |{self_closing}| is removed altogether.
  $self->{$_} = undef
      for qw(current_token current_attribute last_emitted_start_tag_name);
  delete $self->{self_closing};

  ## Empty character buffer; no previous or current character.
  @$self{qw(char_buffer char_buffer_pos)} = ('', 0);
  $self->{prev_char} = [-1, -1, -1];
  $self->{next_char} = -1;

  ## Read the first input character.
  !!!next-input-character;

  ## Queue of tokens that |_get_next_token| returns before it tokenizes
  ## any further input.
  $self->{token} = [];
  ## |{escape}| is not initialized here.
} # _initialize_tokenizer
941
942 ## A token has:
943 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
944 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
945 ## ->{name} (DOCTYPE_TOKEN)
946 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
947 ## ->{public_identifier} (DOCTYPE_TOKEN)
948 ## ->{system_identifier} (DOCTYPE_TOKEN)
949 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
950 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
951 ## ->{name}
952 ## ->{value}
953 ## ->{has_reference} == 1 or 0
954 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
955 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
956 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
957 ## while the token is pushed back to the stack.
958
959 ## An emitted token MUST immediately be handled by the tree construction stage.
960
961 ## Before each step, UA MAY check to see if either one of the scripts in
962 ## "list of scripts that will execute as soon as possible" or the first
963 ## script in the "list of scripts that will execute asynchronously",
964 ## has completed loading. If one has, then it MUST be executed
965 ## and removed from the list.
966
967 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
968 ## (This requirement was dropped from HTML5 spec, unfortunately.)
969
970 sub _get_next_token ($) {
971 my $self = shift;
972
973 if ($self->{self_closing}) {
974 !!!parse-error (type => 'nestc', token => $self->{current_token});
975 ## NOTE: The |self_closing| flag is only set by start tag token.
976 ## In addition, when a start tag token is emitted, it is always set to
977 ## |current_token|.
978 delete $self->{self_closing};
979 }
980
981 if (@{$self->{token}}) {
982 $self->{self_closing} = $self->{token}->[0]->{self_closing};
983 return shift @{$self->{token}};
984 }
985
986 A: {
987 if ($self->{state} == DATA_STATE) {
988 if ($self->{next_char} == 0x0026) { # &
989 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
990 not $self->{escape}) {
991 !!!cp (1);
992 ## NOTE: In the spec, the tokenizer is switched to the
993 ## "entity data state". In this implementation, the tokenizer
994 ## is switched to the |ENTITY_STATE|, which is an implementation
995 ## of the "consume a character reference" algorithm.
996 $self->{entity_additional} = -1;
997 $self->{prev_state} = DATA_STATE;
998 $self->{state} = ENTITY_STATE;
999 !!!next-input-character;
1000 redo A;
1001 } else {
1002 !!!cp (2);
1003 #
1004 }
1005 } elsif ($self->{next_char} == 0x002D) { # -
1006 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1007 unless ($self->{escape}) {
1008 if ($self->{prev_char}->[0] == 0x002D and # -
1009 $self->{prev_char}->[1] == 0x0021 and # !
1010 $self->{prev_char}->[2] == 0x003C) { # <
1011 !!!cp (3);
1012 $self->{escape} = 1;
1013 } else {
1014 !!!cp (4);
1015 }
1016 } else {
1017 !!!cp (5);
1018 }
1019 }
1020
1021 #
1022 } elsif ($self->{next_char} == 0x003C) { # <
1023 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
1024 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
1025 not $self->{escape})) {
1026 !!!cp (6);
1027 $self->{state} = TAG_OPEN_STATE;
1028 !!!next-input-character;
1029 redo A;
1030 } else {
1031 !!!cp (7);
1032 #
1033 }
1034 } elsif ($self->{next_char} == 0x003E) { # >
1035 if ($self->{escape} and
1036 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
1037 if ($self->{prev_char}->[0] == 0x002D and # -
1038 $self->{prev_char}->[1] == 0x002D) { # -
1039 !!!cp (8);
1040 delete $self->{escape};
1041 } else {
1042 !!!cp (9);
1043 }
1044 } else {
1045 !!!cp (10);
1046 }
1047
1048 #
1049 } elsif ($self->{next_char} == -1) {
1050 !!!cp (11);
1051 !!!emit ({type => END_OF_FILE_TOKEN,
1052 line => $self->{line}, column => $self->{column}});
1053 last A; ## TODO: ok?
1054 } else {
1055 !!!cp (12);
1056 }
1057 # Anything else
1058 my $token = {type => CHARACTER_TOKEN,
1059 data => chr $self->{next_char},
1060 line => $self->{line}, column => $self->{column},
1061 };
1062 $self->{read_until}->($token->{data}, q[-!<>&], length $token->{data});
1063
1064 ## Stay in the data state
1065 !!!next-input-character;
1066
1067 !!!emit ($token);
1068
1069 redo A;
1070 } elsif ($self->{state} == TAG_OPEN_STATE) {
1071 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1072 if ($self->{next_char} == 0x002F) { # /
1073 !!!cp (15);
1074 !!!next-input-character;
1075 $self->{state} = CLOSE_TAG_OPEN_STATE;
1076 redo A;
1077 } else {
1078 !!!cp (16);
1079 ## reconsume
1080 $self->{state} = DATA_STATE;
1081
1082 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1083 line => $self->{line_prev},
1084 column => $self->{column_prev},
1085 });
1086
1087 redo A;
1088 }
1089 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
1090 if ($self->{next_char} == 0x0021) { # !
1091 !!!cp (17);
1092 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
1093 !!!next-input-character;
1094 redo A;
1095 } elsif ($self->{next_char} == 0x002F) { # /
1096 !!!cp (18);
1097 $self->{state} = CLOSE_TAG_OPEN_STATE;
1098 !!!next-input-character;
1099 redo A;
1100 } elsif (0x0041 <= $self->{next_char} and
1101 $self->{next_char} <= 0x005A) { # A..Z
1102 !!!cp (19);
1103 $self->{current_token}
1104 = {type => START_TAG_TOKEN,
1105 tag_name => chr ($self->{next_char} + 0x0020),
1106 line => $self->{line_prev},
1107 column => $self->{column_prev}};
1108 $self->{state} = TAG_NAME_STATE;
1109 !!!next-input-character;
1110 redo A;
1111 } elsif (0x0061 <= $self->{next_char} and
1112 $self->{next_char} <= 0x007A) { # a..z
1113 !!!cp (20);
1114 $self->{current_token} = {type => START_TAG_TOKEN,
1115 tag_name => chr ($self->{next_char}),
1116 line => $self->{line_prev},
1117 column => $self->{column_prev}};
1118 $self->{state} = TAG_NAME_STATE;
1119 !!!next-input-character;
1120 redo A;
1121 } elsif ($self->{next_char} == 0x003E) { # >
1122 !!!cp (21);
1123 !!!parse-error (type => 'empty start tag',
1124 line => $self->{line_prev},
1125 column => $self->{column_prev});
1126 $self->{state} = DATA_STATE;
1127 !!!next-input-character;
1128
1129 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
1130 line => $self->{line_prev},
1131 column => $self->{column_prev},
1132 });
1133
1134 redo A;
1135 } elsif ($self->{next_char} == 0x003F) { # ?
1136 !!!cp (22);
1137 !!!parse-error (type => 'pio',
1138 line => $self->{line_prev},
1139 column => $self->{column_prev});
1140 $self->{state} = BOGUS_COMMENT_STATE;
1141 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1142 line => $self->{line_prev},
1143 column => $self->{column_prev},
1144 };
1145 ## $self->{next_char} is intentionally left as is
1146 redo A;
1147 } else {
1148 !!!cp (23);
1149 !!!parse-error (type => 'bare stago',
1150 line => $self->{line_prev},
1151 column => $self->{column_prev});
1152 $self->{state} = DATA_STATE;
1153 ## reconsume
1154
1155 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1156 line => $self->{line_prev},
1157 column => $self->{column_prev},
1158 });
1159
1160 redo A;
1161 }
1162 } else {
1163 die "$0: $self->{content_model} in tag open";
1164 }
1165 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
1166 ## NOTE: The "close tag open state" in the spec is implemented as
1167 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_PCDATA_CLOSE_TAG_STATE|.
1168
1169 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
1170 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1171 if (defined $self->{last_emitted_start_tag_name}) {
1172 $self->{state} = CDATA_PCDATA_CLOSE_TAG_STATE;
1173 $self->{state_keyword} = '';
1174 ## Reconsume.
1175 redo A;
1176 } else {
1177 ## No start tag token has ever been emitted
1178 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
1179 !!!cp (28);
1180 $self->{state} = DATA_STATE;
1181 ## Reconsume.
1182 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1183 line => $l, column => $c,
1184 });
1185 redo A;
1186 }
1187 }
1188
1189 if (0x0041 <= $self->{next_char} and
1190 $self->{next_char} <= 0x005A) { # A..Z
1191 !!!cp (29);
1192 $self->{current_token}
1193 = {type => END_TAG_TOKEN,
1194 tag_name => chr ($self->{next_char} + 0x0020),
1195 line => $l, column => $c};
1196 $self->{state} = TAG_NAME_STATE;
1197 !!!next-input-character;
1198 redo A;
1199 } elsif (0x0061 <= $self->{next_char} and
1200 $self->{next_char} <= 0x007A) { # a..z
1201 !!!cp (30);
1202 $self->{current_token} = {type => END_TAG_TOKEN,
1203 tag_name => chr ($self->{next_char}),
1204 line => $l, column => $c};
1205 $self->{state} = TAG_NAME_STATE;
1206 !!!next-input-character;
1207 redo A;
1208 } elsif ($self->{next_char} == 0x003E) { # >
1209 !!!cp (31);
1210 !!!parse-error (type => 'empty end tag',
1211 line => $self->{line_prev}, ## "<" in "</>"
1212 column => $self->{column_prev} - 1);
1213 $self->{state} = DATA_STATE;
1214 !!!next-input-character;
1215 redo A;
1216 } elsif ($self->{next_char} == -1) {
1217 !!!cp (32);
1218 !!!parse-error (type => 'bare etago');
1219 $self->{state} = DATA_STATE;
1220 # reconsume
1221
1222 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1223 line => $l, column => $c,
1224 });
1225
1226 redo A;
1227 } else {
1228 !!!cp (33);
1229 !!!parse-error (type => 'bogus end tag');
1230 $self->{state} = BOGUS_COMMENT_STATE;
1231 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1232 line => $self->{line_prev}, # "<" of "</"
1233 column => $self->{column_prev} - 1,
1234 };
1235 ## NOTE: $self->{next_char} is intentionally left as is.
1236 ## Although the "anything else" case of the spec does not explicitly
1237 ## state that the next input character is to be reconsumed,
1238 ## it will be included in the |data| of the comment token
1239 ## generated from the bogus end tag, as defined in the
1240 ## "bogus comment state" entry.
1241 redo A;
1242 }
1243 } elsif ($self->{state} == CDATA_PCDATA_CLOSE_TAG_STATE) {
1244 my $ch = substr $self->{last_emitted_start_tag_name}, length $self->{state_keyword}, 1;
1245 if (length $ch) {
1246 my $CH = $ch;
1247 $ch =~ tr/a-z/A-Z/;
1248 my $nch = chr $self->{next_char};
1249 if ($nch eq $ch or $nch eq $CH) {
1250 !!!cp (24);
1251 ## Stay in the state.
1252 $self->{state_keyword} .= $nch;
1253 !!!next-input-character;
1254 redo A;
1255 } else {
1256 !!!cp (25);
1257 $self->{state} = DATA_STATE;
1258 ## Reconsume.
1259 !!!emit ({type => CHARACTER_TOKEN,
1260 data => '</' . $self->{state_keyword},
1261 line => $self->{line_prev},
1262 column => $self->{column_prev} - 1 - length $self->{state_keyword},
1263 });
1264 redo A;
1265 }
1266 } else { # after "<{tag-name}"
1267 unless ({
1268 0x0009 => 1, # HT
1269 0x000A => 1, # LF
1270 0x000B => 1, # VT
1271 0x000C => 1, # FF
1272 0x0020 => 1, # SP
1273 0x003E => 1, # >
1274 0x002F => 1, # /
1275 -1 => 1, # EOF
1276 }->{$self->{next_char}}) {
1277 !!!cp (26);
1278 ## Reconsume.
1279 $self->{state} = DATA_STATE;
1280 !!!emit ({type => CHARACTER_TOKEN,
1281 data => '</' . $self->{state_keyword},
1282 line => $self->{line_prev},
1283 column => $self->{column_prev} - 1 - length $self->{state_keyword},
1284 });
1285 redo A;
1286 } else {
1287 !!!cp (27);
1288 $self->{current_token}
1289 = {type => END_TAG_TOKEN,
1290 tag_name => $self->{last_emitted_start_tag_name},
1291 line => $self->{line_prev},
1292 column => $self->{column_prev} - 1 - length $self->{state_keyword}};
1293 $self->{state} = TAG_NAME_STATE;
1294 ## Reconsume.
1295 redo A;
1296 }
1297 }
1298 } elsif ($self->{state} == TAG_NAME_STATE) {
1299 if ($self->{next_char} == 0x0009 or # HT
1300 $self->{next_char} == 0x000A or # LF
1301 $self->{next_char} == 0x000B or # VT
1302 $self->{next_char} == 0x000C or # FF
1303 $self->{next_char} == 0x0020) { # SP
1304 !!!cp (34);
1305 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1306 !!!next-input-character;
1307 redo A;
1308 } elsif ($self->{next_char} == 0x003E) { # >
1309 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1310 !!!cp (35);
1311 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1312 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1313 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1314 #if ($self->{current_token}->{attributes}) {
1315 # ## NOTE: This should never be reached.
1316 # !!! cp (36);
1317 # !!! parse-error (type => 'end tag attribute');
1318 #} else {
1319 !!!cp (37);
1320 #}
1321 } else {
1322 die "$0: $self->{current_token}->{type}: Unknown token type";
1323 }
1324 $self->{state} = DATA_STATE;
1325 !!!next-input-character;
1326
1327 !!!emit ($self->{current_token}); # start tag or end tag
1328
1329 redo A;
1330 } elsif (0x0041 <= $self->{next_char} and
1331 $self->{next_char} <= 0x005A) { # A..Z
1332 !!!cp (38);
1333 $self->{current_token}->{tag_name} .= chr ($self->{next_char} + 0x0020);
1334 # start tag or end tag
1335 ## Stay in this state
1336 !!!next-input-character;
1337 redo A;
1338 } elsif ($self->{next_char} == -1) {
1339 !!!parse-error (type => 'unclosed tag');
1340 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1341 !!!cp (39);
1342 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1343 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1344 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1345 #if ($self->{current_token}->{attributes}) {
1346 # ## NOTE: This state should never be reached.
1347 # !!! cp (40);
1348 # !!! parse-error (type => 'end tag attribute');
1349 #} else {
1350 !!!cp (41);
1351 #}
1352 } else {
1353 die "$0: $self->{current_token}->{type}: Unknown token type";
1354 }
1355 $self->{state} = DATA_STATE;
1356 # reconsume
1357
1358 !!!emit ($self->{current_token}); # start tag or end tag
1359
1360 redo A;
1361 } elsif ($self->{next_char} == 0x002F) { # /
1362 !!!cp (42);
1363 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1364 !!!next-input-character;
1365 redo A;
1366 } else {
1367 !!!cp (44);
1368 $self->{current_token}->{tag_name} .= chr $self->{next_char};
1369 # start tag or end tag
1370 ## Stay in the state
1371 !!!next-input-character;
1372 redo A;
1373 }
1374 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1375 if ($self->{next_char} == 0x0009 or # HT
1376 $self->{next_char} == 0x000A or # LF
1377 $self->{next_char} == 0x000B or # VT
1378 $self->{next_char} == 0x000C or # FF
1379 $self->{next_char} == 0x0020) { # SP
1380 !!!cp (45);
1381 ## Stay in the state
1382 !!!next-input-character;
1383 redo A;
1384 } elsif ($self->{next_char} == 0x003E) { # >
1385 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1386 !!!cp (46);
1387 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1388 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1389 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1390 if ($self->{current_token}->{attributes}) {
1391 !!!cp (47);
1392 !!!parse-error (type => 'end tag attribute');
1393 } else {
1394 !!!cp (48);
1395 }
1396 } else {
1397 die "$0: $self->{current_token}->{type}: Unknown token type";
1398 }
1399 $self->{state} = DATA_STATE;
1400 !!!next-input-character;
1401
1402 !!!emit ($self->{current_token}); # start tag or end tag
1403
1404 redo A;
1405 } elsif (0x0041 <= $self->{next_char} and
1406 $self->{next_char} <= 0x005A) { # A..Z
1407 !!!cp (49);
1408 $self->{current_attribute}
1409 = {name => chr ($self->{next_char} + 0x0020),
1410 value => '',
1411 line => $self->{line}, column => $self->{column}};
1412 $self->{state} = ATTRIBUTE_NAME_STATE;
1413 !!!next-input-character;
1414 redo A;
1415 } elsif ($self->{next_char} == 0x002F) { # /
1416 !!!cp (50);
1417 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1418 !!!next-input-character;
1419 redo A;
1420 } elsif ($self->{next_char} == -1) {
1421 !!!parse-error (type => 'unclosed tag');
1422 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1423 !!!cp (52);
1424 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1425 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1426 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1427 if ($self->{current_token}->{attributes}) {
1428 !!!cp (53);
1429 !!!parse-error (type => 'end tag attribute');
1430 } else {
1431 !!!cp (54);
1432 }
1433 } else {
1434 die "$0: $self->{current_token}->{type}: Unknown token type";
1435 }
1436 $self->{state} = DATA_STATE;
1437 # reconsume
1438
1439 !!!emit ($self->{current_token}); # start tag or end tag
1440
1441 redo A;
1442 } else {
1443 if ({
1444 0x0022 => 1, # "
1445 0x0027 => 1, # '
1446 0x003D => 1, # =
1447 }->{$self->{next_char}}) {
1448 !!!cp (55);
1449 !!!parse-error (type => 'bad attribute name');
1450 } else {
1451 !!!cp (56);
1452 }
1453 $self->{current_attribute}
1454 = {name => chr ($self->{next_char}),
1455 value => '',
1456 line => $self->{line}, column => $self->{column}};
1457 $self->{state} = ATTRIBUTE_NAME_STATE;
1458 !!!next-input-character;
1459 redo A;
1460 }
1461 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1462 my $before_leave = sub {
1463 if (exists $self->{current_token}->{attributes} # start tag or end tag
1464 ->{$self->{current_attribute}->{name}}) { # MUST
1465 !!!cp (57);
1466 !!!parse-error (type => 'duplicate attribute', text => $self->{current_attribute}->{name}, line => $self->{current_attribute}->{line}, column => $self->{current_attribute}->{column});
1467 ## Discard $self->{current_attribute} # MUST
1468 } else {
1469 !!!cp (58);
1470 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
1471 = $self->{current_attribute};
1472 }
1473 }; # $before_leave
1474
1475 if ($self->{next_char} == 0x0009 or # HT
1476 $self->{next_char} == 0x000A or # LF
1477 $self->{next_char} == 0x000B or # VT
1478 $self->{next_char} == 0x000C or # FF
1479 $self->{next_char} == 0x0020) { # SP
1480 !!!cp (59);
1481 $before_leave->();
1482 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1483 !!!next-input-character;
1484 redo A;
1485 } elsif ($self->{next_char} == 0x003D) { # =
1486 !!!cp (60);
1487 $before_leave->();
1488 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1489 !!!next-input-character;
1490 redo A;
1491 } elsif ($self->{next_char} == 0x003E) { # >
1492 $before_leave->();
1493 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1494 !!!cp (61);
1495 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1496 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1497 !!!cp (62);
1498 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1499 if ($self->{current_token}->{attributes}) {
1500 !!!parse-error (type => 'end tag attribute');
1501 }
1502 } else {
1503 die "$0: $self->{current_token}->{type}: Unknown token type";
1504 }
1505 $self->{state} = DATA_STATE;
1506 !!!next-input-character;
1507
1508 !!!emit ($self->{current_token}); # start tag or end tag
1509
1510 redo A;
1511 } elsif (0x0041 <= $self->{next_char} and
1512 $self->{next_char} <= 0x005A) { # A..Z
1513 !!!cp (63);
1514 $self->{current_attribute}->{name} .= chr ($self->{next_char} + 0x0020);
1515 ## Stay in the state
1516 !!!next-input-character;
1517 redo A;
1518 } elsif ($self->{next_char} == 0x002F) { # /
1519 !!!cp (64);
1520 $before_leave->();
1521 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1522 !!!next-input-character;
1523 redo A;
1524 } elsif ($self->{next_char} == -1) {
1525 !!!parse-error (type => 'unclosed tag');
1526 $before_leave->();
1527 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1528 !!!cp (66);
1529 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1530 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1531 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1532 if ($self->{current_token}->{attributes}) {
1533 !!!cp (67);
1534 !!!parse-error (type => 'end tag attribute');
1535 } else {
1536 ## NOTE: This state should never be reached.
1537 !!!cp (68);
1538 }
1539 } else {
1540 die "$0: $self->{current_token}->{type}: Unknown token type";
1541 }
1542 $self->{state} = DATA_STATE;
1543 # reconsume
1544
1545 !!!emit ($self->{current_token}); # start tag or end tag
1546
1547 redo A;
1548 } else {
1549 if ($self->{next_char} == 0x0022 or # "
1550 $self->{next_char} == 0x0027) { # '
1551 !!!cp (69);
1552 !!!parse-error (type => 'bad attribute name');
1553 } else {
1554 !!!cp (70);
1555 }
1556 $self->{current_attribute}->{name} .= chr ($self->{next_char});
1557 ## Stay in the state
1558 !!!next-input-character;
1559 redo A;
1560 }
1561 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1562 if ($self->{next_char} == 0x0009 or # HT
1563 $self->{next_char} == 0x000A or # LF
1564 $self->{next_char} == 0x000B or # VT
1565 $self->{next_char} == 0x000C or # FF
1566 $self->{next_char} == 0x0020) { # SP
1567 !!!cp (71);
1568 ## Stay in the state
1569 !!!next-input-character;
1570 redo A;
1571 } elsif ($self->{next_char} == 0x003D) { # =
1572 !!!cp (72);
1573 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1574 !!!next-input-character;
1575 redo A;
1576 } elsif ($self->{next_char} == 0x003E) { # >
1577 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1578 !!!cp (73);
1579 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1580 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1581 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1582 if ($self->{current_token}->{attributes}) {
1583 !!!cp (74);
1584 !!!parse-error (type => 'end tag attribute');
1585 } else {
1586 ## NOTE: This state should never be reached.
1587 !!!cp (75);
1588 }
1589 } else {
1590 die "$0: $self->{current_token}->{type}: Unknown token type";
1591 }
1592 $self->{state} = DATA_STATE;
1593 !!!next-input-character;
1594
1595 !!!emit ($self->{current_token}); # start tag or end tag
1596
1597 redo A;
1598 } elsif (0x0041 <= $self->{next_char} and
1599 $self->{next_char} <= 0x005A) { # A..Z
1600 !!!cp (76);
1601 $self->{current_attribute}
1602 = {name => chr ($self->{next_char} + 0x0020),
1603 value => '',
1604 line => $self->{line}, column => $self->{column}};
1605 $self->{state} = ATTRIBUTE_NAME_STATE;
1606 !!!next-input-character;
1607 redo A;
1608 } elsif ($self->{next_char} == 0x002F) { # /
1609 !!!cp (77);
1610 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1611 !!!next-input-character;
1612 redo A;
1613 } elsif ($self->{next_char} == -1) {
1614 !!!parse-error (type => 'unclosed tag');
1615 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1616 !!!cp (79);
1617 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1618 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1619 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1620 if ($self->{current_token}->{attributes}) {
1621 !!!cp (80);
1622 !!!parse-error (type => 'end tag attribute');
1623 } else {
1624 ## NOTE: This state should never be reached.
1625 !!!cp (81);
1626 }
1627 } else {
1628 die "$0: $self->{current_token}->{type}: Unknown token type";
1629 }
1630 $self->{state} = DATA_STATE;
1631 # reconsume
1632
1633 !!!emit ($self->{current_token}); # start tag or end tag
1634
1635 redo A;
1636 } else {
1637 if ($self->{next_char} == 0x0022 or # "
1638 $self->{next_char} == 0x0027) { # '
1639 !!!cp (78);
1640 !!!parse-error (type => 'bad attribute name');
1641 } else {
1642 !!!cp (82);
1643 }
1644 $self->{current_attribute}
1645 = {name => chr ($self->{next_char}),
1646 value => '',
1647 line => $self->{line}, column => $self->{column}};
1648 $self->{state} = ATTRIBUTE_NAME_STATE;
1649 !!!next-input-character;
1650 redo A;
1651 }
1652 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1653 if ($self->{next_char} == 0x0009 or # HT
1654 $self->{next_char} == 0x000A or # LF
1655 $self->{next_char} == 0x000B or # VT
1656 $self->{next_char} == 0x000C or # FF
1657 $self->{next_char} == 0x0020) { # SP
1658 !!!cp (83);
1659 ## Stay in the state
1660 !!!next-input-character;
1661 redo A;
1662 } elsif ($self->{next_char} == 0x0022) { # "
1663 !!!cp (84);
1664 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1665 !!!next-input-character;
1666 redo A;
1667 } elsif ($self->{next_char} == 0x0026) { # &
1668 !!!cp (85);
1669 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1670 ## reconsume
1671 redo A;
1672 } elsif ($self->{next_char} == 0x0027) { # '
1673 !!!cp (86);
1674 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1675 !!!next-input-character;
1676 redo A;
1677 } elsif ($self->{next_char} == 0x003E) { # >
1678 !!!parse-error (type => 'empty unquoted attribute value');
1679 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1680 !!!cp (87);
1681 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1682 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1683 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1684 if ($self->{current_token}->{attributes}) {
1685 !!!cp (88);
1686 !!!parse-error (type => 'end tag attribute');
1687 } else {
1688 ## NOTE: This state should never be reached.
1689 !!!cp (89);
1690 }
1691 } else {
1692 die "$0: $self->{current_token}->{type}: Unknown token type";
1693 }
1694 $self->{state} = DATA_STATE;
1695 !!!next-input-character;
1696
1697 !!!emit ($self->{current_token}); # start tag or end tag
1698
1699 redo A;
1700 } elsif ($self->{next_char} == -1) {
1701 !!!parse-error (type => 'unclosed tag');
1702 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1703 !!!cp (90);
1704 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1705 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1706 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1707 if ($self->{current_token}->{attributes}) {
1708 !!!cp (91);
1709 !!!parse-error (type => 'end tag attribute');
1710 } else {
1711 ## NOTE: This state should never be reached.
1712 !!!cp (92);
1713 }
1714 } else {
1715 die "$0: $self->{current_token}->{type}: Unknown token type";
1716 }
1717 $self->{state} = DATA_STATE;
1718 ## reconsume
1719
1720 !!!emit ($self->{current_token}); # start tag or end tag
1721
1722 redo A;
1723 } else {
1724 if ($self->{next_char} == 0x003D) { # =
1725 !!!cp (93);
1726 !!!parse-error (type => 'bad attribute value');
1727 } else {
1728 !!!cp (94);
1729 }
1730 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1731 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1732 !!!next-input-character;
1733 redo A;
1734 }
1735 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1736 if ($self->{next_char} == 0x0022) { # "
1737 !!!cp (95);
1738 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1739 !!!next-input-character;
1740 redo A;
1741 } elsif ($self->{next_char} == 0x0026) { # &
1742 !!!cp (96);
1743 ## NOTE: In the spec, the tokenizer is switched to the
1744 ## "entity in attribute value state". In this implementation, the
1745 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1746 ## implementation of the "consume a character reference" algorithm.
1747 $self->{prev_state} = $self->{state};
1748 $self->{entity_additional} = 0x0022; # "
1749 $self->{state} = ENTITY_STATE;
1750 !!!next-input-character;
1751 redo A;
1752 } elsif ($self->{next_char} == -1) {
1753 !!!parse-error (type => 'unclosed attribute value');
1754 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1755 !!!cp (97);
1756 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1757 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1758 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1759 if ($self->{current_token}->{attributes}) {
1760 !!!cp (98);
1761 !!!parse-error (type => 'end tag attribute');
1762 } else {
1763 ## NOTE: This state should never be reached.
1764 !!!cp (99);
1765 }
1766 } else {
1767 die "$0: $self->{current_token}->{type}: Unknown token type";
1768 }
1769 $self->{state} = DATA_STATE;
1770 ## reconsume
1771
1772 !!!emit ($self->{current_token}); # start tag or end tag
1773
1774 redo A;
1775 } else {
1776 !!!cp (100);
1777 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1778 $self->{read_until}->($self->{current_attribute}->{value},
1779 q["&],
1780 length $self->{current_attribute}->{value});
1781
1782 ## Stay in the state
1783 !!!next-input-character;
1784 redo A;
1785 }
1786 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1787 if ($self->{next_char} == 0x0027) { # '
1788 !!!cp (101);
1789 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1790 !!!next-input-character;
1791 redo A;
1792 } elsif ($self->{next_char} == 0x0026) { # &
1793 !!!cp (102);
1794 ## NOTE: In the spec, the tokenizer is switched to the
1795 ## "entity in attribute value state". In this implementation, the
1796 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1797 ## implementation of the "consume a character reference" algorithm.
1798 $self->{entity_additional} = 0x0027; # '
1799 $self->{prev_state} = $self->{state};
1800 $self->{state} = ENTITY_STATE;
1801 !!!next-input-character;
1802 redo A;
1803 } elsif ($self->{next_char} == -1) {
1804 !!!parse-error (type => 'unclosed attribute value');
1805 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1806 !!!cp (103);
1807 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1808 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1809 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1810 if ($self->{current_token}->{attributes}) {
1811 !!!cp (104);
1812 !!!parse-error (type => 'end tag attribute');
1813 } else {
1814 ## NOTE: This state should never be reached.
1815 !!!cp (105);
1816 }
1817 } else {
1818 die "$0: $self->{current_token}->{type}: Unknown token type";
1819 }
1820 $self->{state} = DATA_STATE;
1821 ## reconsume
1822
1823 !!!emit ($self->{current_token}); # start tag or end tag
1824
1825 redo A;
1826 } else {
1827 !!!cp (106);
1828 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1829 $self->{read_until}->($self->{current_attribute}->{value},
1830 q['&],
1831 length $self->{current_attribute}->{value});
1832
1833 ## Stay in the state
1834 !!!next-input-character;
1835 redo A;
1836 }
1837 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1838 if ($self->{next_char} == 0x0009 or # HT
1839 $self->{next_char} == 0x000A or # LF
1840 $self->{next_char} == 0x000B or # VT
1841 $self->{next_char} == 0x000C or # FF
1842 $self->{next_char} == 0x0020) { # SP
1843 !!!cp (107);
1844 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1845 !!!next-input-character;
1846 redo A;
1847 } elsif ($self->{next_char} == 0x0026) { # &
1848 !!!cp (108);
1849 ## NOTE: In the spec, the tokenizer is switched to the
1850 ## "entity in attribute value state". In this implementation, the
1851 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1852 ## implementation of the "consume a character reference" algorithm.
1853 $self->{entity_additional} = -1;
1854 $self->{prev_state} = $self->{state};
1855 $self->{state} = ENTITY_STATE;
1856 !!!next-input-character;
1857 redo A;
1858 } elsif ($self->{next_char} == 0x003E) { # >
1859 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1860 !!!cp (109);
1861 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1862 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1863 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1864 if ($self->{current_token}->{attributes}) {
1865 !!!cp (110);
1866 !!!parse-error (type => 'end tag attribute');
1867 } else {
1868 ## NOTE: This state should never be reached.
1869 !!!cp (111);
1870 }
1871 } else {
1872 die "$0: $self->{current_token}->{type}: Unknown token type";
1873 }
1874 $self->{state} = DATA_STATE;
1875 !!!next-input-character;
1876
1877 !!!emit ($self->{current_token}); # start tag or end tag
1878
1879 redo A;
1880 } elsif ($self->{next_char} == -1) {
1881 !!!parse-error (type => 'unclosed tag');
1882 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1883 !!!cp (112);
1884 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1885 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1886 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1887 if ($self->{current_token}->{attributes}) {
1888 !!!cp (113);
1889 !!!parse-error (type => 'end tag attribute');
1890 } else {
1891 ## NOTE: This state should never be reached.
1892 !!!cp (114);
1893 }
1894 } else {
1895 die "$0: $self->{current_token}->{type}: Unknown token type";
1896 }
1897 $self->{state} = DATA_STATE;
1898 ## reconsume
1899
1900 !!!emit ($self->{current_token}); # start tag or end tag
1901
1902 redo A;
1903 } else {
1904 if ({
1905 0x0022 => 1, # "
1906 0x0027 => 1, # '
1907 0x003D => 1, # =
1908 }->{$self->{next_char}}) {
1909 !!!cp (115);
1910 !!!parse-error (type => 'bad attribute value');
1911 } else {
1912 !!!cp (116);
1913 }
1914 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1915 $self->{read_until}->($self->{current_attribute}->{value},
1916 q["'=& >],
1917 length $self->{current_attribute}->{value});
1918
1919 ## Stay in the state
1920 !!!next-input-character;
1921 redo A;
1922 }
1923 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1924 if ($self->{next_char} == 0x0009 or # HT
1925 $self->{next_char} == 0x000A or # LF
1926 $self->{next_char} == 0x000B or # VT
1927 $self->{next_char} == 0x000C or # FF
1928 $self->{next_char} == 0x0020) { # SP
1929 !!!cp (118);
1930 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1931 !!!next-input-character;
1932 redo A;
1933 } elsif ($self->{next_char} == 0x003E) { # >
1934 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1935 !!!cp (119);
1936 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1937 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1938 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1939 if ($self->{current_token}->{attributes}) {
1940 !!!cp (120);
1941 !!!parse-error (type => 'end tag attribute');
1942 } else {
1943 ## NOTE: This state should never be reached.
1944 !!!cp (121);
1945 }
1946 } else {
1947 die "$0: $self->{current_token}->{type}: Unknown token type";
1948 }
1949 $self->{state} = DATA_STATE;
1950 !!!next-input-character;
1951
1952 !!!emit ($self->{current_token}); # start tag or end tag
1953
1954 redo A;
1955 } elsif ($self->{next_char} == 0x002F) { # /
1956 !!!cp (122);
1957 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1958 !!!next-input-character;
1959 redo A;
1960 } elsif ($self->{next_char} == -1) {
1961 !!!parse-error (type => 'unclosed tag');
1962 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1963 !!!cp (122.3);
1964 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1965 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1966 if ($self->{current_token}->{attributes}) {
1967 !!!cp (122.1);
1968 !!!parse-error (type => 'end tag attribute');
1969 } else {
1970 ## NOTE: This state should never be reached.
1971 !!!cp (122.2);
1972 }
1973 } else {
1974 die "$0: $self->{current_token}->{type}: Unknown token type";
1975 }
1976 $self->{state} = DATA_STATE;
1977 ## Reconsume.
1978 !!!emit ($self->{current_token}); # start tag or end tag
1979 redo A;
1980 } else {
1981 !!!cp ('124.1');
1982 !!!parse-error (type => 'no space between attributes');
1983 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1984 ## reconsume
1985 redo A;
1986 }
1987 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1988 if ($self->{next_char} == 0x003E) { # >
1989 if ($self->{current_token}->{type} == END_TAG_TOKEN) {
1990 !!!cp ('124.2');
1991 !!!parse-error (type => 'nestc', token => $self->{current_token});
1992 ## TODO: Different type than slash in start tag
1993 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1994 if ($self->{current_token}->{attributes}) {
1995 !!!cp ('124.4');
1996 !!!parse-error (type => 'end tag attribute');
1997 } else {
1998 !!!cp ('124.5');
1999 }
2000 ## TODO: Test |<title></title/>|
2001 } else {
2002 !!!cp ('124.3');
2003 $self->{self_closing} = 1;
2004 }
2005
2006 $self->{state} = DATA_STATE;
2007 !!!next-input-character;
2008
2009 !!!emit ($self->{current_token}); # start tag or end tag
2010
2011 redo A;
2012 } elsif ($self->{next_char} == -1) {
2013 !!!parse-error (type => 'unclosed tag');
2014 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
2015 !!!cp (124.7);
2016 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
2017 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
2018 if ($self->{current_token}->{attributes}) {
2019 !!!cp (124.5);
2020 !!!parse-error (type => 'end tag attribute');
2021 } else {
2022 ## NOTE: This state should never be reached.
2023 !!!cp (124.6);
2024 }
2025 } else {
2026 die "$0: $self->{current_token}->{type}: Unknown token type";
2027 }
2028 $self->{state} = DATA_STATE;
2029 ## Reconsume.
2030 !!!emit ($self->{current_token}); # start tag or end tag
2031 redo A;
2032 } else {
2033 !!!cp ('124.4');
2034 !!!parse-error (type => 'nestc');
2035 ## TODO: This error type is wrong.
2036 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2037 ## Reconsume.
2038 redo A;
2039 }
2040 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2041 ## (only happens in PCDATA state)
2042
2043 ## NOTE: Unlike the spec's "bogus comment state", this implementation
2044 ## consumes characters on a one-by-one basis.
2045
2046 if ($self->{next_char} == 0x003E) { # >
2047 !!!cp (124);
2048 $self->{state} = DATA_STATE;
2049 !!!next-input-character;
2050
2051 !!!emit ($self->{current_token}); # comment
2052 redo A;
2053 } elsif ($self->{next_char} == -1) {
2054 !!!cp (125);
2055 $self->{state} = DATA_STATE;
2056 ## reconsume
2057
2058 !!!emit ($self->{current_token}); # comment
2059 redo A;
2060 } else {
2061 !!!cp (126);
2062 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
2063 $self->{read_until}->($self->{current_token}->{data},
2064 q[>],
2065 length $self->{current_token}->{data});
2066
2067 ## Stay in the state.
2068 !!!next-input-character;
2069 redo A;
2070 }
2071 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2072 ## (only happens in PCDATA state)
2073
2074 if ($self->{next_char} == 0x002D) { # -
2075 !!!cp (133);
2076 $self->{state} = MD_HYPHEN_STATE;
2077 !!!next-input-character;
2078 redo A;
2079 } elsif ($self->{next_char} == 0x0044 or # D
2080 $self->{next_char} == 0x0064) { # d
2081 ## ASCII case-insensitive.
2082 !!!cp (130);
2083 $self->{state} = MD_DOCTYPE_STATE;
2084 $self->{state_keyword} = chr $self->{next_char};
2085 !!!next-input-character;
2086 redo A;
2087 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2088 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
2089 $self->{next_char} == 0x005B) { # [
2090 !!!cp (135.4);
2091 $self->{state} = MD_CDATA_STATE;
2092 $self->{state_keyword} = '[';
2093 !!!next-input-character;
2094 redo A;
2095 } else {
2096 !!!cp (136);
2097 }
2098
2099 !!!parse-error (type => 'bogus comment',
2100 line => $self->{line_prev},
2101 column => $self->{column_prev} - 1);
2102 ## Reconsume.
2103 $self->{state} = BOGUS_COMMENT_STATE;
2104 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
2105 line => $self->{line_prev},
2106 column => $self->{column_prev} - 1,
2107 };
2108 redo A;
2109 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2110 if ($self->{next_char} == 0x002D) { # -
2111 !!!cp (127);
2112 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
2113 line => $self->{line_prev},
2114 column => $self->{column_prev} - 2,
2115 };
2116 $self->{state} = COMMENT_START_STATE;
2117 !!!next-input-character;
2118 redo A;
2119 } else {
2120 !!!cp (128);
2121 !!!parse-error (type => 'bogus comment',
2122 line => $self->{line_prev},
2123 column => $self->{column_prev} - 2);
2124 $self->{state} = BOGUS_COMMENT_STATE;
2125 ## Reconsume.
2126 $self->{current_token} = {type => COMMENT_TOKEN,
2127 data => '-',
2128 line => $self->{line_prev},
2129 column => $self->{column_prev} - 2,
2130 };
2131 redo A;
2132 }
2133 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2134 ## ASCII case-insensitive.
2135 if ($self->{next_char} == [
2136 undef,
2137 0x004F, # O
2138 0x0043, # C
2139 0x0054, # T
2140 0x0059, # Y
2141 0x0050, # P
2142 ]->[length $self->{state_keyword}] or
2143 $self->{next_char} == [
2144 undef,
2145 0x006F, # o
2146 0x0063, # c
2147 0x0074, # t
2148 0x0079, # y
2149 0x0070, # p
2150 ]->[length $self->{state_keyword}]) {
2151 !!!cp (131);
2152 ## Stay in the state.
2153 $self->{state_keyword} .= chr $self->{next_char};
2154 !!!next-input-character;
2155 redo A;
2156 } elsif ((length $self->{state_keyword}) == 6 and
2157 ($self->{next_char} == 0x0045 or # E
2158 $self->{next_char} == 0x0065)) { # e
2159 !!!cp (129);
2160 $self->{state} = DOCTYPE_STATE;
2161 $self->{current_token} = {type => DOCTYPE_TOKEN,
2162 quirks => 1,
2163 line => $self->{line_prev},
2164 column => $self->{column_prev} - 7,
2165 };
2166 !!!next-input-character;
2167 redo A;
2168 } else {
2169 !!!cp (132);
2170 !!!parse-error (type => 'bogus comment',
2171 line => $self->{line_prev},
2172 column => $self->{column_prev} - 1 - length $self->{state_keyword});
2173 $self->{state} = BOGUS_COMMENT_STATE;
2174 ## Reconsume.
2175 $self->{current_token} = {type => COMMENT_TOKEN,
2176 data => $self->{state_keyword},
2177 line => $self->{line_prev},
2178 column => $self->{column_prev} - 1 - length $self->{state_keyword},
2179 };
2180 redo A;
2181 }
2182 } elsif ($self->{state} == MD_CDATA_STATE) {
2183 if ($self->{next_char} == {
2184 '[' => 0x0043, # C
2185 '[C' => 0x0044, # D
2186 '[CD' => 0x0041, # A
2187 '[CDA' => 0x0054, # T
2188 '[CDAT' => 0x0041, # A
2189 }->{$self->{state_keyword}}) {
2190 !!!cp (135.1);
2191 ## Stay in the state.
2192 $self->{state_keyword} .= chr $self->{next_char};
2193 !!!next-input-character;
2194 redo A;
2195 } elsif ($self->{state_keyword} eq '[CDATA' and
2196 $self->{next_char} == 0x005B) { # [
2197 !!!cp (135.2);
2198 $self->{current_token} = {type => CHARACTER_TOKEN,
2199 data => '',
2200 line => $self->{line_prev},
2201 column => $self->{column_prev} - 7};
2202 $self->{state} = CDATA_SECTION_STATE;
2203 !!!next-input-character;
2204 redo A;
2205 } else {
2206 !!!cp (135.3);
2207 !!!parse-error (type => 'bogus comment',
2208 line => $self->{line_prev},
2209 column => $self->{column_prev} - 1 - length $self->{state_keyword});
2210 $self->{state} = BOGUS_COMMENT_STATE;
2211 ## Reconsume.
2212 $self->{current_token} = {type => COMMENT_TOKEN,
2213 data => $self->{state_keyword},
2214 line => $self->{line_prev},
2215 column => $self->{column_prev} - 1 - length $self->{state_keyword},
2216 };
2217 redo A;
2218 }
2219 } elsif ($self->{state} == COMMENT_START_STATE) {
2220 if ($self->{next_char} == 0x002D) { # -
2221 !!!cp (137);
2222 $self->{state} = COMMENT_START_DASH_STATE;
2223 !!!next-input-character;
2224 redo A;
2225 } elsif ($self->{next_char} == 0x003E) { # >
2226 !!!cp (138);
2227 !!!parse-error (type => 'bogus comment');
2228 $self->{state} = DATA_STATE;
2229 !!!next-input-character;
2230
2231 !!!emit ($self->{current_token}); # comment
2232
2233 redo A;
2234 } elsif ($self->{next_char} == -1) {
2235 !!!cp (139);
2236 !!!parse-error (type => 'unclosed comment');
2237 $self->{state} = DATA_STATE;
2238 ## reconsume
2239
2240 !!!emit ($self->{current_token}); # comment
2241
2242 redo A;
2243 } else {
2244 !!!cp (140);
2245 $self->{current_token}->{data} # comment
2246 .= chr ($self->{next_char});
2247 $self->{state} = COMMENT_STATE;
2248 !!!next-input-character;
2249 redo A;
2250 }
2251 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2252 if ($self->{next_char} == 0x002D) { # -
2253 !!!cp (141);
2254 $self->{state} = COMMENT_END_STATE;
2255 !!!next-input-character;
2256 redo A;
2257 } elsif ($self->{next_char} == 0x003E) { # >
2258 !!!cp (142);
2259 !!!parse-error (type => 'bogus comment');
2260 $self->{state} = DATA_STATE;
2261 !!!next-input-character;
2262
2263 !!!emit ($self->{current_token}); # comment
2264
2265 redo A;
2266 } elsif ($self->{next_char} == -1) {
2267 !!!cp (143);
2268 !!!parse-error (type => 'unclosed comment');
2269 $self->{state} = DATA_STATE;
2270 ## reconsume
2271
2272 !!!emit ($self->{current_token}); # comment
2273
2274 redo A;
2275 } else {
2276 !!!cp (144);
2277 $self->{current_token}->{data} # comment
2278 .= '-' . chr ($self->{next_char});
2279 $self->{state} = COMMENT_STATE;
2280 !!!next-input-character;
2281 redo A;
2282 }
2283 } elsif ($self->{state} == COMMENT_STATE) {
2284 if ($self->{next_char} == 0x002D) { # -
2285 !!!cp (145);
2286 $self->{state} = COMMENT_END_DASH_STATE;
2287 !!!next-input-character;
2288 redo A;
2289 } elsif ($self->{next_char} == -1) {
2290 !!!cp (146);
2291 !!!parse-error (type => 'unclosed comment');
2292 $self->{state} = DATA_STATE;
2293 ## reconsume
2294
2295 !!!emit ($self->{current_token}); # comment
2296
2297 redo A;
2298 } else {
2299 !!!cp (147);
2300 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
2301 $self->{read_until}->($self->{current_token}->{data},
2302 q[-],
2303 length $self->{current_token}->{data});
2304
2305 ## Stay in the state
2306 !!!next-input-character;
2307 redo A;
2308 }
2309 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2310 if ($self->{next_char} == 0x002D) { # -
2311 !!!cp (148);
2312 $self->{state} = COMMENT_END_STATE;
2313 !!!next-input-character;
2314 redo A;
2315 } elsif ($self->{next_char} == -1) {
2316 !!!cp (149);
2317 !!!parse-error (type => 'unclosed comment');
2318 $self->{state} = DATA_STATE;
2319 ## reconsume
2320
2321 !!!emit ($self->{current_token}); # comment
2322
2323 redo A;
2324 } else {
2325 !!!cp (150);
2326 $self->{current_token}->{data} .= '-' . chr ($self->{next_char}); # comment
2327 $self->{state} = COMMENT_STATE;
2328 !!!next-input-character;
2329 redo A;
2330 }
2331 } elsif ($self->{state} == COMMENT_END_STATE) {
2332 if ($self->{next_char} == 0x003E) { # >
2333 !!!cp (151);
2334 $self->{state} = DATA_STATE;
2335 !!!next-input-character;
2336
2337 !!!emit ($self->{current_token}); # comment
2338
2339 redo A;
2340 } elsif ($self->{next_char} == 0x002D) { # -
2341 !!!cp (152);
2342 !!!parse-error (type => 'dash in comment',
2343 line => $self->{line_prev},
2344 column => $self->{column_prev});
2345 $self->{current_token}->{data} .= '-'; # comment
2346 ## Stay in the state
2347 !!!next-input-character;
2348 redo A;
2349 } elsif ($self->{next_char} == -1) {
2350 !!!cp (153);
2351 !!!parse-error (type => 'unclosed comment');
2352 $self->{state} = DATA_STATE;
2353 ## reconsume
2354
2355 !!!emit ($self->{current_token}); # comment
2356
2357 redo A;
2358 } else {
2359 !!!cp (154);
2360 !!!parse-error (type => 'dash in comment',
2361 line => $self->{line_prev},
2362 column => $self->{column_prev});
2363 $self->{current_token}->{data} .= '--' . chr ($self->{next_char}); # comment
2364 $self->{state} = COMMENT_STATE;
2365 !!!next-input-character;
2366 redo A;
2367 }
2368 } elsif ($self->{state} == DOCTYPE_STATE) {
2369 if ($self->{next_char} == 0x0009 or # HT
2370 $self->{next_char} == 0x000A or # LF
2371 $self->{next_char} == 0x000B or # VT
2372 $self->{next_char} == 0x000C or # FF
2373 $self->{next_char} == 0x0020) { # SP
2374 !!!cp (155);
2375 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2376 !!!next-input-character;
2377 redo A;
2378 } else {
2379 !!!cp (156);
2380 !!!parse-error (type => 'no space before DOCTYPE name');
2381 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2382 ## reconsume
2383 redo A;
2384 }
2385 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2386 if ($self->{next_char} == 0x0009 or # HT
2387 $self->{next_char} == 0x000A or # LF
2388 $self->{next_char} == 0x000B or # VT
2389 $self->{next_char} == 0x000C or # FF
2390 $self->{next_char} == 0x0020) { # SP
2391 !!!cp (157);
2392 ## Stay in the state
2393 !!!next-input-character;
2394 redo A;
2395 } elsif ($self->{next_char} == 0x003E) { # >
2396 !!!cp (158);
2397 !!!parse-error (type => 'no DOCTYPE name');
2398 $self->{state} = DATA_STATE;
2399 !!!next-input-character;
2400
2401 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2402
2403 redo A;
2404 } elsif ($self->{next_char} == -1) {
2405 !!!cp (159);
2406 !!!parse-error (type => 'no DOCTYPE name');
2407 $self->{state} = DATA_STATE;
2408 ## reconsume
2409
2410 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2411
2412 redo A;
2413 } else {
2414 !!!cp (160);
2415 $self->{current_token}->{name} = chr $self->{next_char};
2416 delete $self->{current_token}->{quirks};
2417 ## ISSUE: "Set the token's name name to the" in the spec
2418 $self->{state} = DOCTYPE_NAME_STATE;
2419 !!!next-input-character;
2420 redo A;
2421 }
2422 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2423 ## ISSUE: Redundant "First," in the spec.
2424 if ($self->{next_char} == 0x0009 or # HT
2425 $self->{next_char} == 0x000A or # LF
2426 $self->{next_char} == 0x000B or # VT
2427 $self->{next_char} == 0x000C or # FF
2428 $self->{next_char} == 0x0020) { # SP
2429 !!!cp (161);
2430 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2431 !!!next-input-character;
2432 redo A;
2433 } elsif ($self->{next_char} == 0x003E) { # >
2434 !!!cp (162);
2435 $self->{state} = DATA_STATE;
2436 !!!next-input-character;
2437
2438 !!!emit ($self->{current_token}); # DOCTYPE
2439
2440 redo A;
2441 } elsif ($self->{next_char} == -1) {
2442 !!!cp (163);
2443 !!!parse-error (type => 'unclosed DOCTYPE');
2444 $self->{state} = DATA_STATE;
2445 ## reconsume
2446
2447 $self->{current_token}->{quirks} = 1;
2448 !!!emit ($self->{current_token}); # DOCTYPE
2449
2450 redo A;
2451 } else {
2452 !!!cp (164);
2453 $self->{current_token}->{name}
2454 .= chr ($self->{next_char}); # DOCTYPE
2455 ## Stay in the state
2456 !!!next-input-character;
2457 redo A;
2458 }
2459 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2460 if ($self->{next_char} == 0x0009 or # HT
2461 $self->{next_char} == 0x000A or # LF
2462 $self->{next_char} == 0x000B or # VT
2463 $self->{next_char} == 0x000C or # FF
2464 $self->{next_char} == 0x0020) { # SP
2465 !!!cp (165);
2466 ## Stay in the state
2467 !!!next-input-character;
2468 redo A;
2469 } elsif ($self->{next_char} == 0x003E) { # >
2470 !!!cp (166);
2471 $self->{state} = DATA_STATE;
2472 !!!next-input-character;
2473
2474 !!!emit ($self->{current_token}); # DOCTYPE
2475
2476 redo A;
2477 } elsif ($self->{next_char} == -1) {
2478 !!!cp (167);
2479 !!!parse-error (type => 'unclosed DOCTYPE');
2480 $self->{state} = DATA_STATE;
2481 ## reconsume
2482
2483 $self->{current_token}->{quirks} = 1;
2484 !!!emit ($self->{current_token}); # DOCTYPE
2485
2486 redo A;
2487 } elsif ($self->{next_char} == 0x0050 or # P
2488 $self->{next_char} == 0x0070) { # p
2489 $self->{state} = PUBLIC_STATE;
2490 $self->{state_keyword} = chr $self->{next_char};
2491 !!!next-input-character;
2492 redo A;
2493 } elsif ($self->{next_char} == 0x0053 or # S
2494 $self->{next_char} == 0x0073) { # s
2495 $self->{state} = SYSTEM_STATE;
2496 $self->{state_keyword} = chr $self->{next_char};
2497 !!!next-input-character;
2498 redo A;
2499 } else {
2500 !!!cp (180);
2501 !!!parse-error (type => 'string after DOCTYPE name');
2502 $self->{current_token}->{quirks} = 1;
2503
2504 $self->{state} = BOGUS_DOCTYPE_STATE;
2505 !!!next-input-character;
2506 redo A;
2507 }
2508 } elsif ($self->{state} == PUBLIC_STATE) {
2509 ## ASCII case-insensitive
2510 if ($self->{next_char} == [
2511 undef,
2512 0x0055, # U
2513 0x0042, # B
2514 0x004C, # L
2515 0x0049, # I
2516 ]->[length $self->{state_keyword}] or
2517 $self->{next_char} == [
2518 undef,
2519 0x0075, # u
2520 0x0062, # b
2521 0x006C, # l
2522 0x0069, # i
2523 ]->[length $self->{state_keyword}]) {
2524 !!!cp (175);
2525 ## Stay in the state.
2526 $self->{state_keyword} .= chr $self->{next_char};
2527 !!!next-input-character;
2528 redo A;
2529 } elsif ((length $self->{state_keyword}) == 5 and
2530 ($self->{next_char} == 0x0043 or # C
2531 $self->{next_char} == 0x0063)) { # c
2532 !!!cp (168);
2533 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2534 !!!next-input-character;
2535 redo A;
2536 } else {
2537 !!!cp (169);
2538 !!!parse-error (type => 'string after DOCTYPE name',
2539 line => $self->{line_prev},
2540 column => $self->{column_prev} + 1 - length $self->{state_keyword});
2541 $self->{current_token}->{quirks} = 1;
2542
2543 $self->{state} = BOGUS_DOCTYPE_STATE;
2544 ## Reconsume.
2545 redo A;
2546 }
2547 } elsif ($self->{state} == SYSTEM_STATE) {
2548 ## ASCII case-insensitive
2549 if ($self->{next_char} == [
2550 undef,
2551 0x0059, # Y
2552 0x0053, # S
2553 0x0054, # T
2554 0x0045, # E
2555 ]->[length $self->{state_keyword}] or
2556 $self->{next_char} == [
2557 undef,
2558 0x0079, # y
2559 0x0073, # s
2560 0x0074, # t
2561 0x0065, # e
2562 ]->[length $self->{state_keyword}]) {
2563 !!!cp (170);
2564 ## Stay in the state.
2565 $self->{state_keyword} .= chr $self->{next_char};
2566 !!!next-input-character;
2567 redo A;
2568 } elsif ((length $self->{state_keyword}) == 5 and
2569 ($self->{next_char} == 0x004D or # M
2570 $self->{next_char} == 0x006D)) { # m
2571 !!!cp (171);
2572 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2573 !!!next-input-character;
2574 redo A;
2575 } else {
2576 !!!cp (172);
2577 !!!parse-error (type => 'string after DOCTYPE name',
2578 line => $self->{line_prev},
2579 column => $self->{column_prev} + 1 - length $self->{state_keyword});
2580 $self->{current_token}->{quirks} = 1;
2581
2582 $self->{state} = BOGUS_DOCTYPE_STATE;
2583 ## Reconsume.
2584 redo A;
2585 }
2586 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) { ## Before the opening quote of the public identifier.
2587 if ({
2588 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2589 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2590 }->{$self->{next_char}}) {
2591 !!!cp (181);
2592 ## Stay in the state (white space before the identifier is skipped).
2593 !!!next-input-character;
2594 redo A;
2595 } elsif ($self->{next_char} == 0x0022) { # " (|next_char| is a code point number; compare numerically)
2596 !!!cp (182);
2597 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2598 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2599 !!!next-input-character;
2600 redo A;
2601 } elsif ($self->{next_char} == 0x0027) { # '
2602 !!!cp (183);
2603 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2604 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2605 !!!next-input-character;
2606 redo A;
2607 } elsif ($self->{next_char} == 0x003E) { # >
2608 !!!cp (184);
2609 !!!parse-error (type => 'no PUBLIC literal');
2610
2611 $self->{state} = DATA_STATE;
2612 !!!next-input-character;
2613
2614 $self->{current_token}->{quirks} = 1; # DOCTYPE closed without a public identifier
2615 !!!emit ($self->{current_token}); # DOCTYPE
2616
2617 redo A;
2618 } elsif ($self->{next_char} == -1) { # -1: end of input
2619 !!!cp (185);
2620 !!!parse-error (type => 'unclosed DOCTYPE');
2621
2622 $self->{state} = DATA_STATE;
2623 ## reconsume (the current character is not advanced; it is handled again in |DATA_STATE|)
2624
2625 $self->{current_token}->{quirks} = 1;
2626 !!!emit ($self->{current_token}); # DOCTYPE
2627
2628 redo A;
2629 } else {
2630 !!!cp (186);
2631 !!!parse-error (type => 'string after PUBLIC');
2632 $self->{current_token}->{quirks} = 1;
2633
2634 $self->{state} = BOGUS_DOCTYPE_STATE; # the rest of the DOCTYPE is handled by |BOGUS_DOCTYPE_STATE|
2635 !!!next-input-character;
2636 redo A;
2637 }
2638 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2639 if ($self->{next_char} == 0x0022) { # "
2640 !!!cp (187);
2641 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2642 !!!next-input-character;
2643 redo A;
2644 } elsif ($self->{next_char} == 0x003E) { # >
2645 !!!cp (188);
2646 !!!parse-error (type => 'unclosed PUBLIC literal');
2647
2648 $self->{state} = DATA_STATE;
2649 !!!next-input-character;
2650
2651 $self->{current_token}->{quirks} = 1;
2652 !!!emit ($self->{current_token}); # DOCTYPE
2653
2654 redo A;
2655 } elsif ($self->{next_char} == -1) {
2656 !!!cp (189);
2657 !!!parse-error (type => 'unclosed PUBLIC literal');
2658
2659 $self->{state} = DATA_STATE;
2660 ## reconsume
2661
2662 $self->{current_token}->{quirks} = 1;
2663 !!!emit ($self->{current_token}); # DOCTYPE
2664
2665 redo A;
2666 } else {
2667 !!!cp (190);
2668 $self->{current_token}->{public_identifier} # DOCTYPE
2669 .= chr $self->{next_char};
2670 $self->{read_until}->($self->{current_token}->{public_identifier},
2671 q[">],
2672 length $self->{current_token}->{public_identifier});
2673
2674 ## Stay in the state
2675 !!!next-input-character;
2676 redo A;
2677 }
2678 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2679 if ($self->{next_char} == 0x0027) { # '
2680 !!!cp (191);
2681 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2682 !!!next-input-character;
2683 redo A;
2684 } elsif ($self->{next_char} == 0x003E) { # >
2685 !!!cp (192);
2686 !!!parse-error (type => 'unclosed PUBLIC literal');
2687
2688 $self->{state} = DATA_STATE;
2689 !!!next-input-character;
2690
2691 $self->{current_token}->{quirks} = 1;
2692 !!!emit ($self->{current_token}); # DOCTYPE
2693
2694 redo A;
2695 } elsif ($self->{next_char} == -1) {
2696 !!!cp (193);
2697 !!!parse-error (type => 'unclosed PUBLIC literal');
2698
2699 $self->{state} = DATA_STATE;
2700 ## reconsume
2701
2702 $self->{current_token}->{quirks} = 1;
2703 !!!emit ($self->{current_token}); # DOCTYPE
2704
2705 redo A;
2706 } else {
2707 !!!cp (194);
2708 $self->{current_token}->{public_identifier} # DOCTYPE
2709 .= chr $self->{next_char};
2710 $self->{read_until}->($self->{current_token}->{public_identifier},
2711 q['>],
2712 length $self->{current_token}->{public_identifier});
2713
2714 ## Stay in the state
2715 !!!next-input-character;
2716 redo A;
2717 }
2718 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2719 if ({
2720 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2721 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2722 }->{$self->{next_char}}) {
2723 !!!cp (195);
2724 ## Stay in the state
2725 !!!next-input-character;
2726 redo A;
2727 } elsif ($self->{next_char} == 0x0022) { # "
2728 !!!cp (196);
2729 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2730 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2731 !!!next-input-character;
2732 redo A;
2733 } elsif ($self->{next_char} == 0x0027) { # '
2734 !!!cp (197);
2735 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2736 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2737 !!!next-input-character;
2738 redo A;
2739 } elsif ($self->{next_char} == 0x003E) { # >
2740 !!!cp (198);
2741 $self->{state} = DATA_STATE;
2742 !!!next-input-character;
2743
2744 !!!emit ($self->{current_token}); # DOCTYPE
2745
2746 redo A;
2747 } elsif ($self->{next_char} == -1) {
2748 !!!cp (199);
2749 !!!parse-error (type => 'unclosed DOCTYPE');
2750
2751 $self->{state} = DATA_STATE;
2752 ## reconsume
2753
2754 $self->{current_token}->{quirks} = 1;
2755 !!!emit ($self->{current_token}); # DOCTYPE
2756
2757 redo A;
2758 } else {
2759 !!!cp (200);
2760 !!!parse-error (type => 'string after PUBLIC literal');
2761 $self->{current_token}->{quirks} = 1;
2762
2763 $self->{state} = BOGUS_DOCTYPE_STATE;
2764 !!!next-input-character;
2765 redo A;
2766 }
2767 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2768 if ({
2769 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2770 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2771 }->{$self->{next_char}}) {
2772 !!!cp (201);
2773 ## Stay in the state
2774 !!!next-input-character;
2775 redo A;
2776 } elsif ($self->{next_char} == 0x0022) { # "
2777 !!!cp (202);
2778 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2779 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2780 !!!next-input-character;
2781 redo A;
2782 } elsif ($self->{next_char} == 0x0027) { # '
2783 !!!cp (203);
2784 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2785 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2786 !!!next-input-character;
2787 redo A;
2788 } elsif ($self->{next_char} == 0x003E) { # >
2789 !!!cp (204);
2790 !!!parse-error (type => 'no SYSTEM literal');
2791 $self->{state} = DATA_STATE;
2792 !!!next-input-character;
2793
2794 $self->{current_token}->{quirks} = 1;
2795 !!!emit ($self->{current_token}); # DOCTYPE
2796
2797 redo A;
2798 } elsif ($self->{next_char} == -1) {
2799 !!!cp (205);
2800 !!!parse-error (type => 'unclosed DOCTYPE');
2801
2802 $self->{state} = DATA_STATE;
2803 ## reconsume
2804
2805 $self->{current_token}->{quirks} = 1;
2806 !!!emit ($self->{current_token}); # DOCTYPE
2807
2808 redo A;
2809 } else {
2810 !!!cp (206);
2811 !!!parse-error (type => 'string after SYSTEM');
2812 $self->{current_token}->{quirks} = 1;
2813
2814 $self->{state} = BOGUS_DOCTYPE_STATE;
2815 !!!next-input-character;
2816 redo A;
2817 }
2818 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2819 if ($self->{next_char} == 0x0022) { # "
2820 !!!cp (207);
2821 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2822 !!!next-input-character;
2823 redo A;
2824 } elsif ($self->{next_char} == 0x003E) { # >
2825 !!!cp (208);
2826 !!!parse-error (type => 'unclosed SYSTEM literal');
2827
2828 $self->{state} = DATA_STATE;
2829 !!!next-input-character;
2830
2831 $self->{current_token}->{quirks} = 1;
2832 !!!emit ($self->{current_token}); # DOCTYPE
2833
2834 redo A;
2835 } elsif ($self->{next_char} == -1) {
2836 !!!cp (209);
2837 !!!parse-error (type => 'unclosed SYSTEM literal');
2838
2839 $self->{state} = DATA_STATE;
2840 ## reconsume
2841
2842 $self->{current_token}->{quirks} = 1;
2843 !!!emit ($self->{current_token}); # DOCTYPE
2844
2845 redo A;
2846 } else {
2847 !!!cp (210);
2848 $self->{current_token}->{system_identifier} # DOCTYPE
2849 .= chr $self->{next_char};
2850 $self->{read_until}->($self->{current_token}->{system_identifier},
2851 q[">],
2852 length $self->{current_token}->{system_identifier});
2853
2854 ## Stay in the state
2855 !!!next-input-character;
2856 redo A;
2857 }
2858 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2859 if ($self->{next_char} == 0x0027) { # '
2860 !!!cp (211);
2861 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2862 !!!next-input-character;
2863 redo A;
2864 } elsif ($self->{next_char} == 0x003E) { # >
2865 !!!cp (212);
2866 !!!parse-error (type => 'unclosed SYSTEM literal');
2867
2868 $self->{state} = DATA_STATE;
2869 !!!next-input-character;
2870
2871 $self->{current_token}->{quirks} = 1;
2872 !!!emit ($self->{current_token}); # DOCTYPE
2873
2874 redo A;
2875 } elsif ($self->{next_char} == -1) {
2876 !!!cp (213);
2877 !!!parse-error (type => 'unclosed SYSTEM literal');
2878
2879 $self->{state} = DATA_STATE;
2880 ## reconsume
2881
2882 $self->{current_token}->{quirks} = 1;
2883 !!!emit ($self->{current_token}); # DOCTYPE
2884
2885 redo A;
2886 } else {
2887 !!!cp (214);
2888 $self->{current_token}->{system_identifier} # DOCTYPE
2889 .= chr $self->{next_char};
2890 $self->{read_until}->($self->{current_token}->{system_identifier},
2891 q['>],
2892 length $self->{current_token}->{system_identifier});
2893
2894 ## Stay in the state
2895 !!!next-input-character;
2896 redo A;
2897 }
2898 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2899 if ({
2900 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2901 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2902 }->{$self->{next_char}}) {
2903 !!!cp (215);
2904 ## Stay in the state
2905 !!!next-input-character;
2906 redo A;
2907 } elsif ($self->{next_char} == 0x003E) { # >
2908 !!!cp (216);
2909 $self->{state} = DATA_STATE;
2910 !!!next-input-character;
2911
2912 !!!emit ($self->{current_token}); # DOCTYPE
2913
2914 redo A;
2915 } elsif ($self->{next_char} == -1) {
2916 !!!cp (217);
2917 !!!parse-error (type => 'unclosed DOCTYPE');
2918 $self->{state} = DATA_STATE;
2919 ## reconsume
2920
2921 $self->{current_token}->{quirks} = 1;
2922 !!!emit ($self->{current_token}); # DOCTYPE
2923
2924 redo A;
2925 } else {
2926 !!!cp (218);
2927 !!!parse-error (type => 'string after SYSTEM literal');
2928 #$self->{current_token}->{quirks} = 1;
2929
2930 $self->{state} = BOGUS_DOCTYPE_STATE;
2931 !!!next-input-character;
2932 redo A;
2933 }
2934 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2935 if ($self->{next_char} == 0x003E) { # >
2936 !!!cp (219);
2937 $self->{state} = DATA_STATE;
2938 !!!next-input-character;
2939
2940 !!!emit ($self->{current_token}); # DOCTYPE
2941
2942 redo A;
2943 } elsif ($self->{next_char} == -1) {
2944 !!!cp (220);
2945 !!!parse-error (type => 'unclosed DOCTYPE');
2946 $self->{state} = DATA_STATE;
2947 ## reconsume
2948
2949 !!!emit ($self->{current_token}); # DOCTYPE
2950
2951 redo A;
2952 } else {
2953 !!!cp (221);
2954 my $s = '';
2955 $self->{read_until}->($s, q[>], 0);
2956
2957 ## Stay in the state
2958 !!!next-input-character;
2959 redo A;
2960 }
2961 } elsif ($self->{state} == CDATA_SECTION_STATE) {
2962 ## NOTE: "CDATA section state" in the state is jointly implemented
2963 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2964 ## and |CDATA_SECTION_MSE2_STATE|.
2965
2966 if ($self->{next_char} == 0x005D) { # ]
2967 !!!cp (221.1);
2968 $self->{state} = CDATA_SECTION_MSE1_STATE;
2969 !!!next-input-character;
2970 redo A;
2971 } elsif ($self->{next_char} == -1) {
2972 $self->{state} = DATA_STATE;
2973 !!!next-input-character;
2974 if (length $self->{current_token}->{data}) { # character
2975 !!!cp (221.2);
2976 !!!emit ($self->{current_token}); # character
2977 } else {
2978 !!!cp (221.3);
2979 ## No token to emit. $self->{current_token} is discarded.
2980 }
2981 redo A;
2982 } else {
2983 !!!cp (221.4);
2984 $self->{current_token}->{data} .= chr $self->{next_char};
2985 $self->{read_until}->($self->{current_token}->{data},
2986 q<]>,
2987 length $self->{current_token}->{data});
2988
2989 ## Stay in the state.
2990 !!!next-input-character;
2991 redo A;
2992 }
2993
2994 ## ISSUE: "text tokens" in spec.
2995 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2996 if ($self->{next_char} == 0x005D) { # ]
2997 !!!cp (221.5);
2998 $self->{state} = CDATA_SECTION_MSE2_STATE;
2999 !!!next-input-character;
3000 redo A;
3001 } else {
3002 !!!cp (221.6);
3003 $self->{current_token}->{data} .= ']';
3004 $self->{state} = CDATA_SECTION_STATE;
3005 ## Reconsume.
3006 redo A;
3007 }
3008 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3009 if ($self->{next_char} == 0x003E) { # >
3010 $self->{state} = DATA_STATE;
3011 !!!next-input-character;
3012 if (length $self->{current_token}->{data}) { # character
3013 !!!cp (221.7);
3014 !!!emit ($self->{current_token}); # character
3015 } else {
3016 !!!cp (221.8);
3017 ## No token to emit. $self->{current_token} is discarded.
3018 }
3019 redo A;
3020 } elsif ($self->{next_char} == 0x005D) { # ]
3021 !!!cp (221.9); # character
3022 $self->{current_token}->{data} .= ']'; ## Add first "]" of "]]]".
3023 ## Stay in the state.
3024 !!!next-input-character;
3025 redo A;
3026 } else {
3027 !!!cp (221.11);
3028 $self->{current_token}->{data} .= ']]'; # character
3029 $self->{state} = CDATA_SECTION_STATE;
3030 ## Reconsume.
3031 redo A;
3032 }
3033 } elsif ($self->{state} == ENTITY_STATE) {
3034 if ({
3035 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
3036 0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, &
3037 $self->{entity_additional} => 1,
3038 }->{$self->{next_char}}) {
3039 !!!cp (1001);
3040 ## Don't consume
3041 ## No error
3042 ## Return nothing.
3043 #
3044 } elsif ($self->{next_char} == 0x0023) { # #
3045 !!!cp (999);
3046 $self->{state} = ENTITY_HASH_STATE;
3047 $self->{state_keyword} = '#';
3048 !!!next-input-character;
3049 redo A;
3050 } elsif ((0x0041 <= $self->{next_char} and
3051 $self->{next_char} <= 0x005A) or # A..Z
3052 (0x0061 <= $self->{next_char} and
3053 $self->{next_char} <= 0x007A)) { # a..z
3054 !!!cp (998);
3055 require Whatpm::_NamedEntityList;
3056 $self->{state} = ENTITY_NAME_STATE;
3057 $self->{state_keyword} = chr $self->{next_char};
3058 $self->{entity__value} = $self->{state_keyword};
3059 $self->{entity__match} = 0;
3060 !!!next-input-character;
3061 redo A;
3062 } else {
3063 !!!cp (1027);
3064 !!!parse-error (type => 'bare ero');
3065 ## Return nothing.
3066 #
3067 }
3068
3069 ## NOTE: No character is consumed by the "consume a character
3070 ## reference" algorithm. In other word, there is an "&" character
3071 ## that does not introduce a character reference, which would be
3072 ## appended to the parent element or the attribute value in later
3073 ## process of the tokenizer.
3074
3075 if ($self->{prev_state} == DATA_STATE) {
3076 !!!cp (997);
3077 $self->{state} = $self->{prev_state};
3078 ## Reconsume.
3079 !!!emit ({type => CHARACTER_TOKEN, data => '&',
3080 line => $self->{line_prev},
3081 column => $self->{column_prev},
3082 });
3083 redo A;
3084 } else {
3085 !!!cp (996);
3086 $self->{current_attribute}->{value} .= '&';
3087 $self->{state} = $self->{prev_state};
3088 ## Reconsume.
3089 redo A;
3090 }
3091 } elsif ($self->{state} == ENTITY_HASH_STATE) {
3092 if ($self->{next_char} == 0x0078 or # x
3093 $self->{next_char} == 0x0058) { # X
3094 !!!cp (995);
3095 $self->{state} = HEXREF_X_STATE;
3096 $self->{state_keyword} .= chr $self->{next_char};
3097 !!!next-input-character;
3098 redo A;
3099 } elsif (0x0030 <= $self->{next_char} and
3100 $self->{next_char} <= 0x0039) { # 0..9
3101 !!!cp (994);
3102 $self->{state} = NCR_NUM_STATE;
3103 $self->{state_keyword} = $self->{next_char} - 0x0030;
3104 !!!next-input-character;
3105 redo A;
3106 } else {
3107 !!!parse-error (type => 'bare nero',
3108 line => $self->{line_prev},
3109 column => $self->{column_prev} - 1);
3110
3111 ## NOTE: According to the spec algorithm, nothing is returned,
3112 ## and then "&#" is appended to the parent element or the attribute
3113 ## value in the later processing.
3114
3115 if ($self->{prev_state} == DATA_STATE) {
3116 !!!cp (1019);
3117 $self->{state} = $self->{prev_state};
3118 ## Reconsume.
3119 !!!emit ({type => CHARACTER_TOKEN,
3120 data => '&#',
3121 line => $self->{line_prev},
3122 column => $self->{column_prev} - 1,
3123 });
3124 redo A;
3125 } else {
3126 !!!cp (993);
3127 $self->{current_attribute}->{value} .= '&#';
3128 $self->{state} = $self->{prev_state};
3129 ## Reconsume.
3130 redo A;
3131 }
3132 }
3133 } elsif ($self->{state} == NCR_NUM_STATE) {
3134 if (0x0030 <= $self->{next_char} and
3135 $self->{next_char} <= 0x0039) { # 0..9
3136 !!!cp (1012);
3137 $self->{state_keyword} *= 10;
3138 $self->{state_keyword} += $self->{next_char} - 0x0030;
3139
3140 ## Stay in the state.
3141 !!!next-input-character;
3142 redo A;
3143 } elsif ($self->{next_char} == 0x003B) { # ;
3144 !!!cp (1013);
3145 !!!next-input-character;
3146 #
3147 } else {
3148 !!!cp (1014);
3149 !!!parse-error (type => 'no refc');
3150 ## Reconsume.
3151 #
3152 }
3153
3154 my $code = $self->{state_keyword};
3155 my $l = $self->{line_prev};
3156 my $c = $self->{column_prev};
3157 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
3158 !!!cp (1015);
3159 !!!parse-error (type => 'invalid character reference',
3160 text => (sprintf 'U+%04X', $code),
3161 line => $l, column => $c);
3162 $code = 0xFFFD;
3163 } elsif ($code > 0x10FFFF) {
3164 !!!cp (1016);
3165 !!!parse-error (type => 'invalid character reference',
3166 text => (sprintf 'U-%08X', $code),
3167 line => $l, column => $c);
3168 $code = 0xFFFD;
3169 } elsif ($code == 0x000D) {
3170 !!!cp (1017);
3171 !!!parse-error (type => 'CR character reference',
3172 line => $l, column => $c);
3173 $code = 0x000A;
3174 } elsif (0x80 <= $code and $code <= 0x9F) {
3175 !!!cp (1018);
3176 !!!parse-error (type => 'C1 character reference',
3177 text => (sprintf 'U+%04X', $code),
3178 line => $l, column => $c);
3179 $code = $c1_entity_char->{$code};
3180 }
3181
3182 if ($self->{prev_state} == DATA_STATE) {
3183 !!!cp (992);
3184 $self->{state} = $self->{prev_state};
3185 ## Reconsume.
3186 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3187 line => $l, column => $c,
3188 });
3189 redo A;
3190 } else {
3191 !!!cp (991);
3192 $self->{current_attribute}->{value} .= chr $code;
3193 $self->{current_attribute}->{has_reference} = 1;
3194 $self->{state} = $self->{prev_state};
3195 ## Reconsume.
3196 redo A;
3197 }
3198 } elsif ($self->{state} == HEXREF_X_STATE) {
3199 if ((0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) or
3200 (0x0041 <= $self->{next_char} and $self->{next_char} <= 0x0046) or
3201 (0x0061 <= $self->{next_char} and $self->{next_char} <= 0x0066)) {
3202 # 0..9, A..F, a..f
3203 !!!cp (990);
3204 $self->{state} = HEXREF_HEX_STATE;
3205 $self->{state_keyword} = 0;
3206 ## Reconsume.
3207 redo A;
3208 } else {
3209 !!!parse-error (type => 'bare hcro',
3210 line => $self->{line_prev},
3211 column => $self->{column_prev} - 2);
3212
3213 ## NOTE: According to the spec algorithm, nothing is returned,
3214 ## and then "&#" followed by "X" or "x" is appended to the parent
3215 ## element or the attribute value in the later processing.
3216
3217 if ($self->{prev_state} == DATA_STATE) {
3218 !!!cp (1005);
3219 $self->{state} = $self->{prev_state};
3220 ## Reconsume.
3221 !!!emit ({type => CHARACTER_TOKEN,
3222 data => '&' . $self->{state_keyword},
3223 line => $self->{line_prev},
3224 column => $self->{column_prev} - length $self->{state_keyword},
3225 });
3226 redo A;
3227 } else {
3228 !!!cp (989);
3229 $self->{current_attribute}->{value} .= '&' . $self->{state_keyword};
3230 $self->{state} = $self->{prev_state};
3231 ## Reconsume.
3232 redo A;
3233 }
3234 }
3235 } elsif ($self->{state} == HEXREF_HEX_STATE) {
3236 if (0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) {
3237 # 0..9
3238 !!!cp (1002);
3239 $self->{state_keyword} *= 0x10;
3240 $self->{state_keyword} += $self->{next_char} - 0x0030;
3241 ## Stay in the state.
3242 !!!next-input-character;
3243 redo A;
3244 } elsif (0x0061 <= $self->{next_char} and
3245 $self->{next_char} <= 0x0066) { # a..f
3246 !!!cp (1003);
3247 $self->{state_keyword} *= 0x10;
3248 $self->{state_keyword} += $self->{next_char} - 0x0060 + 9;
3249 ## Stay in the state.
3250 !!!next-input-character;
3251 redo A;
3252 } elsif (0x0041 <= $self->{next_char} and
3253 $self->{next_char} <= 0x0046) { # A..F
3254 !!!cp (1004);
3255 $self->{state_keyword} *= 0x10;
3256 $self->{state_keyword} += $self->{next_char} - 0x0040 + 9;
3257 ## Stay in the state.
3258 !!!next-input-character;
3259 redo A;
3260 } elsif ($self->{next_char} == 0x003B) { # ;
3261 !!!cp (1006);
3262 !!!next-input-character;
3263 #
3264 } else {
3265 !!!cp (1007);
3266 !!!parse-error (type => 'no refc',
3267 line => $self->{line},
3268 column => $self->{column});
3269 ## Reconsume.
3270 #
3271 }
3272
3273 my $code = $self->{state_keyword};
3274 my $l = $self->{line_prev};
3275 my $c = $self->{column_prev};
3276 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
3277 !!!cp (1008);
3278 !!!parse-error (type => 'invalid character reference',
3279 text => (sprintf 'U+%04X', $code),
3280 line => $l, column => $c);
3281 $code = 0xFFFD;
3282 } elsif ($code > 0x10FFFF) {
3283 !!!cp (1009);
3284 !!!parse-error (type => 'invalid character reference',
3285 text => (sprintf 'U-%08X', $code),
3286 line => $l, column => $c);
3287 $code = 0xFFFD;
3288 } elsif ($code == 0x000D) {
3289 !!!cp (1010);
3290 !!!parse-error (type => 'CR character reference', line => $l, column => $c);
3291 $code = 0x000A;
3292 } elsif (0x80 <= $code and $code <= 0x9F) {
3293 !!!cp (1011);
3294 !!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c);
3295 $code = $c1_entity_char->{$code};
3296 }
3297
3298 if ($self->{prev_state} == DATA_STATE) {
3299 !!!cp (988);
3300 $self->{state} = $self->{prev_state};
3301 ## Reconsume.
3302 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3303 line => $l, column => $c,
3304 });
3305 redo A;
3306 } else {
3307 !!!cp (987);
3308 $self->{current_attribute}->{value} .= chr $code;
3309 $self->{current_attribute}->{has_reference} = 1;
3310 $self->{state} = $self->{prev_state};
3311 ## Reconsume.
3312 redo A;
3313 }
3314 } elsif ($self->{state} == ENTITY_NAME_STATE) {
3315 if (length $self->{state_keyword} < 30 and
3316 ## NOTE: Some number greater than the maximum length of entity name
3317 ((0x0041 <= $self->{next_char} and # a
3318 $self->{next_char} <= 0x005A) or # x
3319 (0x0061 <= $self->{next_char} and # a
3320 $self->{next_char} <= 0x007A) or # z
3321 (0x0030 <= $self->{next_char} and # 0
3322 $self->{next_char} <= 0x0039) or # 9
3323 $self->{next_char} == 0x003B)) { # ;
3324 our $EntityChar;
3325 $self->{state_keyword} .= chr $self->{next_char};
3326 if (defined $EntityChar->{$self->{state_keyword}}) {
3327 if ($self->{next_char} == 0x003B) { # ;
3328 !!!cp (1020);
3329 $self->{entity__value} = $EntityChar->{$self->{state_keyword}};
3330 $self->{entity__match} = 1;
3331 !!!next-input-character;
3332 #
3333 } else {
3334 !!!cp (1021);
3335 $self->{entity__value} = $EntityChar->{$self->{state_keyword}};
3336 $self->{entity__match} = -1;
3337 ## Stay in the state.
3338 !!!next-input-character;
3339 redo A;
3340 }
3341 } else {
3342 !!!cp (1022);
3343 $self->{entity__value} .= chr $self->{next_char};
3344 $self->{entity__match} *= 2;
3345 ## Stay in the state.
3346 !!!next-input-character;
3347 redo A;
3348 }
3349 }
3350
3351 my $data;
3352 my $has_ref;
3353 if ($self->{entity__match} > 0) {
3354 !!!cp (1023);
3355 $data = $self->{entity__value};
3356 $has_ref = 1;
3357 #
3358 } elsif ($self->{entity__match} < 0) {
3359 !!!parse-error (type => 'no refc');
3360 if ($self->{prev_state} != DATA_STATE and # in attribute
3361 $self->{entity__match} < -1) {
3362 !!!cp (1024);
3363 $data = '&' . $self->{state_keyword};
3364 #
3365 } else {
3366 !!!cp (1025);
3367 $data = $self->{entity__value};
3368 $has_ref = 1;
3369 #
3370 }
3371 } else {
3372 !!!cp (1026);
3373 !!!parse-error (type => 'bare ero',
3374 line => $self->{line_prev},
3375 column => $self->{column_prev} - length $self->{state_keyword});
3376 $data = '&' . $self->{state_keyword};
3377 #
3378 }
3379
3380 ## NOTE: In these cases, when a character reference is found,
3381 ## it is consumed and a character token is returned, or, otherwise,
3382 ## nothing is consumed and returned, according to the spec algorithm.
3383 ## In this implementation, anything that has been examined by the
3384 ## tokenizer is appended to the parent element or the attribute value
3385 ## as string, either literal string when no character reference or
3386 ## entity-replaced string otherwise, in this stage, since any characters
3387 ## that would not be consumed are appended in the data state or in an
3388 ## appropriate attribute value state anyway.
3389
3390 if ($self->{prev_state} == DATA_STATE) {
3391 !!!cp (986);
3392 $self->{state} = $self->{prev_state};
3393 ## Reconsume.
3394 !!!emit ({type => CHARACTER_TOKEN,
3395 data => $data,
3396 line => $self->{line_prev},
3397 column => $self->{column_prev} + 1 - length $self->{state_keyword},
3398 });
3399 redo A;
3400 } else {
3401 !!!cp (985);
3402 $self->{current_attribute}->{value} .= $data;
3403 $self->{current_attribute}->{has_reference} = 1 if $has_ref;
3404 $self->{state} = $self->{prev_state};
3405 ## Reconsume.
3406 redo A;
3407 }
3408 } else {
3409 die "$0: $self->{state}: Unknown state";
3410 }
3411 } # A
3412
3413 die "$0: _get_next_token: unexpected case";
3414 } # _get_next_token
3415
## Prepare |$self->{document}| for the tree construction stage.
##
## Turns strict error checking off, flags the document as an HTML
## document, and sets the |manakai_source_line| and
## |manakai_source_column| user data items to 1.
##
## NOTE: $self->{document} MUST be specified before this method is called.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  $doc->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $doc->manakai_is_html (1); # MUST
  $doc->set_user_data (manakai_source_line => 1);
  $doc->set_user_data (manakai_source_column => 1);
} # _initialize_tree_constructor
3426
## Finish the tree construction stage by turning strict error checking
## of |$self->{document}| back on.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};
  $doc->strict_error_checking (1);
  ## TODO: Turn mutation events on
} # _terminate_tree_constructor
3432
3433 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
3434
3435 { # tree construction stage
3436 my $token;
3437
## Drive the whole tree construction stage.
##
## Fetches the first token, resets the per-parse tree construction state
## kept in |$self|, and then runs the "initial" insertion mode, the
## "before html" insertion mode, and finally the main loop starting in
## the "before head" insertion mode.
sub _construct_tree ($) {
  my $self = shift;

  ## When an interactive UA render the $self->{document} available
  ## to the user, or when it begin accepting user input, are
  ## not defined.

  ## Append a character: collect it and all subsequent consecutive
  ## characters and insert one Text node whose data is concatenation
  ## of all those characters. # MUST

  !!!next-token;

  ## Reset the state shared by the insertion mode handlers.
  $self->{form_element} = undef;
  $self->{head_element} = undef;
  $self->{open_elements} = [];
  $self->{inner_html_node} = undef;

  ## NOTE: The "initial" insertion mode.
  $self->_tree_construction_initial; # MUST

  ## NOTE: The "before html" insertion mode.
  $self->_tree_construction_root_element;
  $self->{insertion_mode} = BEFORE_HEAD_IM;

  ## NOTE: The "before head" insertion mode and so on.
  $self->_tree_construction_main;
} # _construct_tree
3466
## Process tokens in the "initial" insertion mode.
##
## Leading white space characters are dropped and comments are appended
## to |$self->{document}|.  A DOCTYPE token is checked ("not HTML5" /
## "XSLT-compat" parse errors), turned into a DocumentType node, and
## used to select the compatibility mode of the document ("quirks",
## "limited quirks", or unchanged).  Any other token results in a
## "no DOCTYPE" parse error and the "quirks" mode.
##
## The method returns when the "before html" insertion mode should take
## over; the |$token| variable shared by the tree construction stage
## then holds the token to be processed next (the token following the
## DOCTYPE, or the unprocessed token itself when it is to be
## reprocessed).
sub _tree_construction_initial ($) {
  my $self = shift;

  ## NOTE: "initial" insertion mode

  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/; # ASCII case-insensitive
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{system_identifier}) {
        !!!cp ('t1');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif ($doctype_name ne 'HTML') {
        !!!cp ('t2');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif (defined $token->{public_identifier}) {
        if ($token->{public_identifier} eq 'XSLT-compat') {
          !!!cp ('t1.2');
          !!!parse-error (type => 'XSLT-compat', token => $token,
                          level => $self->{level}->{should});
        } else {
          !!!parse-error (type => 'not HTML5', token => $token);
        }
      } else {
        !!!cp ('t3');
        #
      }

      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      ## NOTE: Default value for both |public_id| and |system_id| attributes
      ## are empty strings, so that we don't set any value in missing cases.
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if ($token->{quirks} or $doctype_name ne 'HTML') {
        !!!cp ('t4');
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## The public identifier is compared ASCII case-insensitively, so
        ## it is uppercased here and every literal below is in uppercase.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        my $prefix = [
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//",
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//IETF//DTD HTML 2.0 LEVEL 1//",
          "-//IETF//DTD HTML 2.0 LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT//",
          "-//IETF//DTD HTML 2.0//",
          "-//IETF//DTD HTML 2.1E//",
          "-//IETF//DTD HTML 3.0//",
          "-//IETF//DTD HTML 3.2 FINAL//",
          "-//IETF//DTD HTML 3.2//",
          "-//IETF//DTD HTML 3//",
          "-//IETF//DTD HTML LEVEL 0//",
          "-//IETF//DTD HTML LEVEL 1//",
          "-//IETF//DTD HTML LEVEL 2//",
          "-//IETF//DTD HTML LEVEL 3//",
          "-//IETF//DTD HTML STRICT LEVEL 0//",
          "-//IETF//DTD HTML STRICT LEVEL 1//",
          "-//IETF//DTD HTML STRICT LEVEL 2//",
          "-//IETF//DTD HTML STRICT LEVEL 3//",
          "-//IETF//DTD HTML STRICT//",
          "-//IETF//DTD HTML//",
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//",
          "-//NETSCAPE COMM. CORP.//DTD HTML//",
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//",
          "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//",
          "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//",
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//",
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//",
          "-//W3C//DTD HTML 3 1995-03-24//",
          "-//W3C//DTD HTML 3.2 DRAFT//",
          "-//W3C//DTD HTML 3.2 FINAL//",
          "-//W3C//DTD HTML 3.2//",
          "-//W3C//DTD HTML 3.2S DRAFT//",
          "-//W3C//DTD HTML 4.0 FRAMESET//",
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//",
          "-//W3C//DTD HTML EXPERIMENTAL 19960712//",
          "-//W3C//DTD HTML EXPERIMENTAL 970421//",
          "-//W3C//DTD W3 HTML//",
          "-//W3O//DTD W3 HTML 3.0//",
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//",
          "-//WEBTECHS//DTD MOZILLA HTML//",
        ]; # $prefix
        ## Does the public identifier start with one of the prefixes?
        ## NOTE: The string under test must be |$pubid|; testing the
        ## |$prefix| array reference itself can never match.
        my $match;
        for (@$prefix) {
          if (substr ($pubid, 0, length $_) eq $_) {
            $match = 1;
            last;
          }
        }
        if ($match or
            $pubid eq "-//W3O//DTD W3 HTML STRICT 3.0//EN//" or
            $pubid eq "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" or
            $pubid eq "HTML") {
          !!!cp ('t5');
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid =~ m[^-//W3C//DTD HTML 4.01 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD HTML 4.01 TRANSITIONAL//]) {
          if (defined $token->{system_identifier}) {
            !!!cp ('t6');
            $self->{document}->manakai_compat_mode ('quirks');
          } else {
            !!!cp ('t7');
            $self->{document}->manakai_compat_mode ('limited quirks');
          }
        } elsif ($pubid =~ m[^-//W3C//DTD XHTML 1.0 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD XHTML 1.0 TRANSITIONAL//]) {
          !!!cp ('t8');
          $self->{document}->manakai_compat_mode ('limited quirks');
        } else {
          !!!cp ('t9');
        }
      } else {
        !!!cp ('t10');
      }
      if (defined $token->{system_identifier}) {
        ## The system identifier is compared ASCII case-insensitively, in
        ## lowercase.
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          ## NOTE: Ensure that |PUBLIC "(limited quirks)" "(quirks)"| is
          ## marked as quirks.
          $self->{document}->manakai_compat_mode ('quirks');
          !!!cp ('t11');
        } else {
          !!!cp ('t12');
        }
      } else {
        !!!cp ('t13');
      }

      ## Go to the "before html" insertion mode.
      !!!next-token;
      return;
    } elsif ({
              START_TAG_TOKEN, 1,
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t14');
      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      !!!ack-later;
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          !!!cp ('t15');
          ## Stay in the insertion mode.
          !!!next-token;
          redo INITIAL;
        } else {
          !!!cp ('t16');
        }
      } else {
        !!!cp ('t17');
      }

      ## Non-white-space character data is left in the token.
      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t18');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the insertion mode.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL

  die "$0: _tree_construction_initial: This should be never reached";
} # _tree_construction_initial
3673
## Process tokens in the "before html" insertion mode.
##
## DOCTYPE tokens are reported and ignored, comments are appended to
## |$self->{document}|, and leading white space characters are dropped.
## An |html| start tag creates the root element from the token;
## any other token makes the method create an implied |html| root
## element and leave the token to be reprocessed.  In both cases the
## root element is pushed onto |$self->{open_elements}| and the
## |$self->{application_cache_selection}| callback is invoked before
## the method returns (to the "before head" insertion mode).
sub _tree_construction_root_element ($) {
  my ($self) = @_;

  ## NOTE: "before html" insertion mode.

  B: {
    my $type = $token->{type};

    if ($type == DOCTYPE_TOKEN) {
      !!!cp ('t19');
      !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
      ## Ignore the token
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    }

    if ($type == COMMENT_TOKEN) {
      !!!cp ('t20');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    }

    if ($type == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the leading white space.

        if (length $token->{data}) {
          !!!cp ('t22');
        } else {
          !!!cp ('t21');
          ## Nothing left in the token; stay in the insertion mode.
          !!!next-token;
          redo B;
        }
      } else {
        !!!cp ('t23');
      }

      $self->{application_cache_selection}->(undef);

      ## Fall through to the implied root element creation.
    } elsif ($type == START_TAG_TOKEN) {
      unless ($token->{tag_name} eq 'html') {
        !!!cp ('t25.1');
        ## Fall through to the implied root element creation.
      } else {
        my $root_element;
        !!!create-element ($root_element, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}},
            [$root_element, $el_category->{html}];

        my $manifest = $token->{attributes}->{manifest};
        if ($manifest) {
          !!!cp ('t24');
          $self->{application_cache_selection}->($manifest->{value});
          ## ISSUE: Spec is unclear on relative references.
          ## According to Hixie (#whatwg 2008-03-19), it should be
          ## resolved against the base URI of the document in HTML
          ## or xml:base of the element in XHTML.
        } else {
          !!!cp ('t25');
          $self->{application_cache_selection}->(undef);
        }

        !!!nack ('t25c');

        !!!next-token;
        return; ## Go to the "before head" insertion mode.
      }
    } elsif ({
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$type}) {
      !!!cp ('t26');
      ## Fall through to the implied root element creation.
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## No |html| start tag: create the root element implicitly.
    my $root_element;
    !!!create-element ($root_element, $HTML_NS, 'html',, $token);
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, $el_category->{html}];

    $self->{application_cache_selection}->(undef);

    ## NOTE: Reprocess the token.
    !!!ack-later;
    return; ## Go to the "before head" insertion mode.

    ## ISSUE: There is an issue in the spec
  } # B

  die "$0: _tree_construction_root_element: This should never be reached";
} # _tree_construction_root_element
3768
## Reset the insertion mode appropriately.
##
## Walks |$self->{open_elements}| from the last entry toward the first
## one and sets |$self->{insertion_mode}| according to the first entry
## that determines a mode.  When the first entry of the stack is
## reached, |$self->{inner_html_node}| is examined instead of it; the
## method dies if that node is not defined at that point.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        !!!cp ('t28');
        $node = $self->{inner_html_node};
      } else {
        die "_reset_insertion_mode: t27";
      }
    }

    ## Step 4..14
    my $new_mode;
    if ($node->[1] & FOREIGN_EL) {
      !!!cp ('t28.1');
      ## NOTE: Strictly speaking, the line below only applies to MathML and
      ## SVG elements.  Currently the HTML syntax supports only MathML and
      ## SVG elements as foreigners.
      $new_mode = IN_BODY_IM | IN_FOREIGN_CONTENT_IM;
    } elsif ($node->[1] & TABLE_CELL_EL) {
      if ($last) {
        !!!cp ('t28.2');
        #
      } else {
        !!!cp ('t28.3');
        $new_mode = IN_CELL_IM;
      }
    } else {
      !!!cp ('t28.4');
      $new_mode = {
        select => IN_SELECT_IM,
        ## NOTE: |option| and |optgroup| do not set
        ## insertion mode to "in select" by themselves.
        tr => IN_ROW_IM,
        tbody => IN_TABLE_BODY_IM,
        thead => IN_TABLE_BODY_IM,
        tfoot => IN_TABLE_BODY_IM,
        caption => IN_CAPTION_IM,
        colgroup => IN_COLUMN_GROUP_IM,
        table => IN_TABLE_IM,
        head => IN_BODY_IM, # not in head!
        body => IN_BODY_IM,
        frameset => IN_FRAMESET_IM,
      }->{$node->[0]->manakai_local_name};
    }
    ## NOTE: Assign and return as separate statements; the
    ## |$x = $value and return| form would not return if the assigned
    ## value were false.
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 15
    if ($node->[1] & HTML_EL) {
      unless (defined $self->{head_element}) {
        !!!cp ('t29');
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        ## ISSUE: Can this state be reached?
        !!!cp ('t30');
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    } else {
      !!!cp ('t31');
    }

    ## Step 16
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 17
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 18
    redo S3;
  } # S3

  die "$0: _reset_insertion_mode: This line should never be reached";
} # _reset_insertion_mode
3855
3856 sub _tree_construction_main ($) {
3857 my $self = shift;
3858
3859 my $active_formatting_elements = [];
3860
## Reconstruct the active formatting elements.
##
## Takes a code reference, which is invoked with each newly cloned
## element.  Does nothing if the list of active formatting elements is
## empty, or if its last entry is a marker or is found in
## |$self->{open_elements}|.  Otherwise, rewinds to the entry following
## the latest marker or the latest entry found in
## |$self->{open_elements}| (or to the first entry of the list), and
## then, for each entry from there to the end of the list, clones the
## element, hands the clone to the code reference, pushes it onto
## |$self->{open_elements}|, and replaces the list entry by the clone.
my $reconstruct_active_formatting_elements = sub { # MUST
  my ($insert) = @_;

  ## Step 1
  return if @$active_formatting_elements == 0;

  ## Step 3
  my $i = -1;
  my $entry = $active_formatting_elements->[$i];

  ## Step 2
  return if $entry->[0] eq '#marker';
  for my $open (@{$self->{open_elements}}) {
    next unless $entry->[0] eq $open->[0];
    !!!cp ('t32');
    return;
  }

  S4: {
    ## Step 4
    if ($active_formatting_elements->[0]->[0] eq $entry->[0]) {
      last S4;
    }

    ## Step 5
    $i--;
    $entry = $active_formatting_elements->[$i];

    ## Step 6
    unless ($entry->[0] eq '#marker') {
      my $in_open_elements;
      for my $open (@{$self->{open_elements}}) {
        if ($entry->[0] eq $open->[0]) {
          !!!cp ('t33');
          $in_open_elements = 1;
          last;
        }
      }
      unless ($in_open_elements) {
        ## NOTE: <!DOCTYPE HTML><p><b><i><u></p> <p>X
        !!!cp ('t35');
        redo S4;
      }
      !!!cp ('t34');
    } else {
      !!!cp ('t33_1');
    }

    ## Step 7
    $i++;
    $entry = $active_formatting_elements->[$i];
  } # S4

  S7: {
    ## Step 8
    my $clone = [$entry->[0]->clone_node (0), $entry->[1]];

    ## Step 9
    $insert->($clone->[0]);
    push @{$self->{open_elements}}, $clone;

    ## Step 10
    $active_formatting_elements->[$i] = $self->{open_elements}->[-1];

    ## Step 11
    if ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
      !!!cp ('t37');
    } else {
      !!!cp ('t36');
      ## Step 7'
      $i++;
      $entry = $active_formatting_elements->[$i];

      redo S7;
    }
  } # S7
}; # $reconstruct_active_formatting_elements
3940
## Clear the list of active formatting elements up to the last marker:
## the latest |#marker| entry and every entry after it are removed from
## the list.  The list is left untouched if it contains no marker.
my $clear_up_to_marker = sub {
  my $index = $#$active_formatting_elements;
  while ($index >= 0) {
    if ($active_formatting_elements->[$index]->[0] eq '#marker') {
      !!!cp ('t38');
      splice @$active_formatting_elements, $index;
      return;
    }
    $index--;
  }

  !!!cp ('t39');
}; # $clear_up_to_marker
3952
3953 my $insert;
3954
## Generic CDATA/RCDATA element parsing.
##
## Takes the content model flag (|CDATA_CONTENT_MODEL| or
## |RCDATA_CONTENT_MODEL|; any other value makes the code die when the
## element is not closed by its own end tag).  Creates an element for
## the current start tag token, inserts it by |$insert|, collects the
## data of the following character tokens into one Text node child, and
## switches the content model back to PCDATA.  A parse error is reported
## unless the token following the character data is the end tag of the
## element.
my $parse_rcdata = sub ($) {
  my $content_model_flag = shift;

  ## Step 1
  my $start_tag_name = $token->{tag_name};
  my $el;
  !!!create-element ($el, $HTML_NS, $start_tag_name, $token->{attributes}, $token);

  ## Step 2
  $insert->($el);

  ## Step 3
  $self->{content_model} = $content_model_flag; # CDATA or RCDATA
  delete $self->{escape}; # MUST

  ## Step 4
  my $text = '';
  !!!nack ('t40.1');
  !!!next-token;
  until ($token->{type} != CHARACTER_TOKEN) { # or until stop tokenizing
    !!!cp ('t40');
    $text .= $token->{data};
    !!!next-token;
  }

  ## Step 5
  if (length $text) {
    !!!cp ('t41');
    my $text_node = $self->{document}->create_text_node ($text);
    $el->append_child ($text_node);
  }

  ## Step 6
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## Step 7
  if ($token->{type} == END_TAG_TOKEN and
      $token->{tag_name} eq $start_tag_name) {
    !!!cp ('t42');
    ## Ignore the token
  } elsif ($content_model_flag == CDATA_CONTENT_MODEL) {
    ## NOTE: An end-of-file token.
    !!!cp ('t43');
    !!!parse-error (type => 'in CDATA:#eof', token => $token);
  } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
    ## NOTE: An end-of-file token.
    !!!cp ('t44');
    !!!parse-error (type => 'in RCDATA:#eof', token => $token);
  } else {
    die "$0: $content_model_flag in parse_rcdata";
  }
  !!!next-token;
}; # $parse_rcdata
4009
my $script_start_tag = sub () {
  ## Handles a |script| start tag token: creates the |script| element,
  ## reads its content in the CDATA content model, expects the
  ## |</script>| end tag, and inserts the element unless the parser is
  ## running with an |inner_html_node|.
  my $script_el;
  !!!create-element ($script_el, $HTML_NS, 'script', $token->{attributes}, $token);
  ## TODO: mark as "parser-inserted"

  $self->{content_model} = CDATA_CONTENT_MODEL;
  delete $self->{escape}; # MUST

  ## Gather the character tokens that make up the script text.
  my $script_text = '';
  !!!nack ('t45.1');
  !!!next-token;
  until ($token->{type} != CHARACTER_TOKEN) {
    !!!cp ('t45');
    $script_text .= $token->{data};
    !!!next-token;
  } # stop if non-character token or tokenizer stops tokenising
  if (length $script_text) {
    !!!cp ('t46');
    $script_el->manakai_append_text ($script_text);
  }

  $self->{content_model} = PCDATA_CONTENT_MODEL;

  my $is_script_end_tag = ($token->{type} == END_TAG_TOKEN and
                           $token->{tag_name} eq 'script');
  if (not $is_script_end_tag) {
    !!!cp ('t48');
    !!!parse-error (type => 'in CDATA:#eof', token => $token);
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  } else {
    !!!cp ('t47');
    ## Ignore the token
  }

  if (not defined $self->{inner_html_node}) {
    !!!cp ('t50');
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    $insert->($script_el);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  } else {
    !!!cp ('t49');
    ## TODO: mark as "already executed"
  }

  !!!next-token;
}; # $script_start_tag
4061
4062 ## NOTE: $open_tables->[-1]->[0] is the "current table" element node.
4063 ## NOTE: $open_tables->[-1]->[1] is the "tainted" flag.
## The stack starts with a single entry whose node is taken from the
## first entry of the stack of open elements.  That entry has no
## "tainted" slot until foster parenting sets it to 1.
4064 my $open_tables = [[$self->{open_elements}->[0]->[0]]];
4065
my $formatting_end_tag = sub {
  ## Processes an end tag token for a formatting element using the
  ## adoption agency algorithm (AAA).
  ##
  ## Argument: the end tag token (a hash reference with |tag_name|).
  ##
  ## Effects: may rearrange DOM nodes, the stack of open elements
  ## (|$self->{open_elements}|) and the list of active formatting
  ## elements, and may set the "tainted" flag of the current table.
  ## Every path that leaves the sub fetches the next token first.
  my $end_tag_token = shift;
  my $tag_name = $end_tag_token->{tag_name};

  ## NOTE: The adoption agency algorithm (AAA).

  FET: {
    ## Step 1
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq '#marker') {
        !!!cp ('t52');
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0]->manakai_local_name
                   eq $tag_name) {
        !!!cp ('t51');
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!cp ('t53');
      !!!parse-error (type => 'unmatched end tag', text => $tag_name, token => $end_tag_token);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          !!!cp ('t54');
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!cp ('t55');
          ## Report the tag name of the token being processed, as the
          ## 't53' branch does.
          !!!parse-error (type => 'unmatched end tag',
                          text => $tag_name,
                          token => $end_tag_token);
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ($node->[1] & SCOPING_EL) {
        !!!cp ('t56');
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!cp ('t57');
      !!!parse-error (type => 'unmatched end tag',
                      text => $tag_name,
                      token => $end_tag_token);
      ## Remove the formatting element itself from the list.  It is not
      ## necessarily the last entry, so a plain |pop| could remove an
      ## unrelated entry and leave the formatting element in place.
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!cp ('t58');
      !!!parse-error (type => 'not closed',
                      text => $self->{open_elements}->[-1]->[0]
                          ->manakai_local_name,
                      token => $end_tag_token);
    }

    ## Step 2
    ## The loop runs from the current node towards the formatting
    ## element, so the last candidate assigned is the one nearest to
    ## the formatting element in the stack.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not ($node->[1] & FORMATTING_EL) and
          #not $phrasing_category->{$node->[1]} and
          ($node->[1] & SPECIAL_EL or
           $node->[1] & SCOPING_EL)) { ## Scoping is redundant, maybe
        !!!cp ('t59');
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        !!!cp ('t60');
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      !!!cp ('t61');
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      !!!cp ('t62');
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## NOTE(review): when the formatting element is the first entry of
    ## the list, index -1 selects the last entry -- confirm that this
    ## is intended.
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
            ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            !!!cp ('t63');
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        ## Not an active formatting element: drop it from the stack of
        ## open elements and look at the previous entry.
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        !!!cp ('t64');
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      if ($node->[0]->has_child_nodes ()) {
        !!!cp ('t65');
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    if ($common_ancestor_node->[1] & TABLE_ROWS_EL) {
      ## Foster parenting: place the node relative to the nearest open
      ## |table| element instead of inside the table structure.
      my $foster_parent_element;
      my $next_sibling;
      OE: for (reverse 0..$#{$self->{open_elements}}) {
        if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
          my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
          if (defined $parent and $parent->node_type == 1) {
            !!!cp ('t65.1');
            $foster_parent_element = $parent;
            $next_sibling = $self->{open_elements}->[$_]->[0];
          } else {
            !!!cp ('t65.2');
            $foster_parent_element
                = $self->{open_elements}->[$_ - 1]->[0];
          }
          last OE;
        }
      } # OE
      $foster_parent_element = $self->{open_elements}->[0]->[0]
          unless defined $foster_parent_element;
      $foster_parent_element->insert_before ($last_node->[0], $next_sibling);
      $open_tables->[-1]->[1] = 1; # tainted
    } else {
      !!!cp ('t65.3');
      $common_ancestor_node->[0]->append_child ($last_node->[0]);
    }

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## NOTE(review): if the bookmark entry is not met before the loop
    ## ends, |$i| stays undefined and |$i + 1| evaluates to 1 --
    ## confirm that this is intended.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t66');
        splice @$active_formatting_elements, $_, 1;
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        !!!cp ('t67');
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t68');
        splice @{$self->{open_elements}}, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        !!!cp ('t69');
        $i = $_;
      }
    } # OE
    ## Insert the clone immediately after the furthest block.  The
    ## length argument must be 0: a length of 1 would overwrite any
    ## open element that follows the furthest block (e.g. the |p| in
    ## |<b><div><p>x</b>|).
    splice @{$self->{open_elements}}, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
4300
$insert = my $insert_to_current = sub {
  ## Appends the given node to the current node, i.e. the node of the
  ## last entry in the stack of open elements.  This sub also becomes
  ## the value of |$insert|.
  my $child = shift;
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($child);
}; # $insert_to_current
4304
my $insert_to_foster = sub {
  ## Inserts the given node with foster parenting: if the current node
  ## is a table-rows-category element (|TABLE_ROWS_EL|), the node is
  ## placed relative to the nearest open |table| element and the
  ## current table is marked as tainted.  Otherwise the node is simply
  ## appended to the current node.
  my $child = shift;
  my $open_elements = $self->{open_elements};

  if (not ($open_elements->[-1]->[1] & TABLE_ROWS_EL)) {
    !!!cp ('t72');
    return $open_elements->[-1]->[0]->append_child ($child);
  }

  # MUST
  my ($foster_parent_element, $next_sibling);
  my $index = $#$open_elements;
  OE: while ($index >= 0) {
    my $entry = $open_elements->[$index];
    if ($entry->[1] & TABLE_EL) {
      my $parent = $entry->[0]->parent_node;
      if (defined $parent and $parent->node_type == 1) {
        !!!cp ('t70');
        ## The table has an element parent: insert just before the
        ## table.
        $foster_parent_element = $parent;
        $next_sibling = $entry->[0];
      } else {
        !!!cp ('t71');
        ## Otherwise use the entry preceding the table in the stack.
        $foster_parent_element = $open_elements->[$index - 1]->[0];
      }
      last OE;
    }
    $index--;
  } # OE

  ## No open |table| element: fall back to the first entry of the
  ## stack of open elements.
  if (not defined $foster_parent_element) {
    $foster_parent_element = $open_elements->[0]->[0];
  }
  $foster_parent_element->insert_before ($child, $next_sibling);
  $open_tables->[-1]->[1] = 1; # tainted
}; # $insert_to_foster
4336
4337 B: while (1) {
4338 if ($token->{type} == DOCTYPE_TOKEN) {
4339 !!!cp ('t73');
4340 !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
4341 ## Ignore the token
4342 ## Stay in the phase
4343 !!!next-token;
4344 next B;
4345 } elsif ($token->{type} == START_TAG_TOKEN and
4346 $token->{tag_name} eq 'html') {
4347 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4348 !!!cp ('t79');
4349 !!!parse-error (type => 'after html', text => 'html', token => $token);
4350 $self->{insertion_mode} = AFTER_BODY_IM;
4351 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4352 !!!cp ('t80');
4353 !!!parse-error (type => 'after html', text => 'html', token => $token);
4354 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4355 } else {
4356 !!!cp ('t81');
4357 }
4358
4359 !!!cp ('t82');
4360 !!!parse-error (type => 'not first start tag', token => $token);
4361 my $top_el = $self->{open_elements}->[0]->[0];
4362 for my $attr_name (keys %{$token->{attributes}}) {
4363 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
4364 !!!cp ('t84');
4365 $top_el->set_attribute_ns
4366 (undef, [undef, $attr_name],
4367 $token->{attributes}->{$attr_name}->{value});
4368 }
4369 }
4370 !!!nack ('t84.1');
4371 !!!next-token;
4372 next B;
4373 } elsif ($token->{type} == COMMENT_TOKEN) {
4374 my $comment = $self->{document}->create_comment ($token->{data});
4375 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
4376 !!!cp ('t85');
4377 $self->{document}->append_child ($comment);
4378 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
4379 !!!cp ('t86');
4380 $self->{open_elements}->[0]->[0]->append_child ($comment);
4381 } else {
4382 !!!cp ('t87');
4383 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4384 }
4385 !!!next-token;
4386 next B;
4387 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
4388 if ($token->{type} == CHARACTER_TOKEN) {
4389 !!!cp ('t87.1');
4390 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4391 !!!next-token;
4392 next B;
4393 } elsif ($token->{type} == START_TAG_TOKEN) {
4394 if ((not {mglyph => 1, malignmark => 1}->{$token->{tag_name}} and
4395 $self->{open_elements}->[-1]->[1] & FOREIGN_FLOW_CONTENT_EL) or
4396 not ($self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
4397 ($token->{tag_name} eq 'svg' and
4398 $self->{open_elements}->[-1]->[1] & MML_AXML_EL)) {
4399 ## NOTE: "using the rules for secondary insertion mode"then"continue"
4400 !!!cp ('t87.2');
4401 #
4402 } elsif ({
4403 b => 1, big => 1, blockquote => 1, body => 1, br => 1,
4404 center => 1, code => 1, dd => 1, div => 1, dl => 1, dt => 1,
4405 em => 1, embed => 1, font => 1, h1 => 1, h2 => 1, h3 => 1,
4406 h4 => 1, h5 => 1, h6 => 1, head => 1, hr => 1, i => 1,
4407 img => 1, li => 1, listing => 1, menu => 1, meta => 1,
4408 nobr => 1, ol => 1, p => 1, pre => 1, ruby => 1, s => 1,
4409 small => 1, span => 1, strong => 1, strike => 1, sub => 1,
4410 sup => 1, table => 1, tt => 1, u => 1, ul => 1, var => 1,
4411 }->{$token->{tag_name}}) {
4412 !!!cp ('t87.2');
4413 !!!parse-error (type => 'not closed',
4414 text => $self->{open_elements}->[-1]->[0]
4415 ->manakai_local_name,
4416 token => $token);
4417
4418 pop @{$self->{open_elements}}
4419 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4420
4421 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4422 ## Reprocess.
4423 next B;
4424 } else {
4425 my $nsuri = $self->{open_elements}->[-1]->[0]->namespace_uri;
4426 my $tag_name = $token->{tag_name};
4427 if ($nsuri eq $SVG_NS) {
4428 $tag_name = {
4429 altglyph => 'altGlyph',
4430 altglyphdef => 'altGlyphDef',
4431 altglyphitem => 'altGlyphItem',
4432 animatecolor => 'animateColor',
4433 animatemotion => 'animateMotion',
4434 animatetransform => 'animateTransform',
4435 clippath => 'clipPath',
4436 feblend => 'feBlend',
4437 fecolormatrix => 'feColorMatrix',
4438 fecomponenttransfer => 'feComponentTransfer',
4439 fecomposite => 'feComposite',
4440 feconvolvematrix => 'feConvolveMatrix',
4441 fediffuselighting => 'feDiffuseLighting',
4442 fedisplacementmap => 'feDisplacementMap',
4443 fedistantlight => 'feDistantLight',
4444 feflood => 'feFlood',
4445 fefunca => 'feFuncA',
4446 fefuncb => 'feFuncB',
4447 fefuncg => 'feFuncG',
4448 fefuncr => 'feFuncR',
4449 fegaussianblur => 'feGaussianBlur',
4450 feimage => 'feImage',
4451 femerge => 'feMerge',
4452 femergenode => 'feMergeNode',
4453 femorphology => 'feMorphology',
4454 feoffset => 'feOffset',
4455 fepointlight => 'fePointLight',
4456 fespecularlighting => 'feSpecularLighting',
4457 fespotlight => 'feSpotLight',
4458 fetile => 'feTile',
4459 feturbulence => 'feTurbulence',
4460 foreignobject => 'foreignObject',
4461 glyphref => 'glyphRef',
4462 lineargradient => 'linearGradient',
4463 radialgradient => 'radialGradient',
4464 #solidcolor => 'solidColor', ## NOTE: Commented in spec (SVG1.2)
4465 textpath => 'textPath',
4466 }->{$tag_name} || $tag_name;
4467 }
4468
4469 ## "adjust SVG attributes" (SVG only) - done in insert-element-f
4470
4471 ## "adjust foreign attributes" - done in insert-element-f
4472
4473 !!!insert-element-f ($nsuri, $tag_name, $token->{attributes}, $token);
4474
4475 if ($self->{self_closing}) {
4476 pop @{$self->{open_elements}};
4477 !!!ack ('t87.3');
4478 } else {
4479 !!!cp ('t87.4');
4480 }
4481
4482 !!!next-token;
4483 next B;
4484 }
4485 } elsif ($token->{type} == END_TAG_TOKEN) {
4486 ## NOTE: "using the rules for secondary insertion mode" then "continue"
4487 !!!cp ('t87.5');
4488 #
4489 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4490 !!!cp ('t87.6');
4491 !!!parse-error (type => 'not closed',
4492 text => $self->{open_elements}->[-1]->[0]
4493 ->manakai_local_name,
4494 token => $token);
4495
4496 pop @{$self->{open_elements}}
4497 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4498
4499 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4500 ## Reprocess.
4501 next B;
4502 } else {
4503 die "$0: $token->{type}: Unknown token type";
4504 }
4505 }
4506
4507 if ($self->{insertion_mode} & HEAD_IMS) {
4508 if ($token->{type} == CHARACTER_TOKEN) {
4509 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4510 unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4511 !!!cp ('t88.2');
4512 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4513 #
4514 } else {
4515 !!!cp ('t88.1');
4516 ## Ignore the token.
4517 #
4518 }
4519 unless (length $token->{data}) {
4520 !!!cp ('t88');
4521 !!!next-token;
4522 next B;
4523 }
4524 ## TODO: set $token->{column} appropriately
4525 }
4526
4527 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4528 !!!cp ('t89');
4529 ## As if <head>
4530 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4531 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4532 push @{$self->{open_elements}},
4533 [$self->{head_element}, $el_category->{head}];
4534
4535 ## Reprocess in the "in head" insertion mode...
4536 pop @{$self->{open_elements}};
4537
4538 ## Reprocess in the "after head" insertion mode...
4539 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4540 !!!cp ('t90');
4541 ## As if </noscript>
4542 pop @{$self->{open_elements}};
4543 !!!parse-error (type => 'in noscript:#text', token => $token);
4544
4545 ## Reprocess in the "in head" insertion mode...
4546 ## As if </head>
4547 pop @{$self->{open_elements}};
4548
4549 ## Reprocess in the "after head" insertion mode...
4550 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4551 !!!cp ('t91');
4552 pop @{$self->{open_elements}};
4553
4554 ## Reprocess in the "after head" insertion mode...
4555 } else {
4556 !!!cp ('t92');
4557 }
4558
4559 ## "after head" insertion mode
4560 ## As if <body>
4561 !!!insert-element ('body',, $token);
4562 $self->{insertion_mode} = IN_BODY_IM;
4563 ## reprocess
4564 next B;
4565 } elsif ($token->{type} == START_TAG_TOKEN) {
4566 if ($token->{tag_name} eq 'head') {
4567 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4568 !!!cp ('t93');
4569 !!!create-element ($self->{head_element}, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
4570 $self->{open_elements}->[-1]->[0]->append_child
4571 ($self->{head_element});
4572 push @{$self->{open_elements}},
4573 [$self->{head_element}, $el_category->{head}];
4574 $self->{insertion_mode} = IN_HEAD_IM;
4575 !!!nack ('t93.1');
4576 !!!next-token;
4577 next B;
4578 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4579 !!!cp ('t93.2');
4580 !!!parse-error (type => 'after head', text => 'head',
4581 token => $token);
4582 ## Ignore the token
4583 !!!nack ('t93.3');
4584 !!!next-token;
4585 next B;
4586 } else {
4587 !!!cp ('t95');
4588 !!!parse-error (type => 'in head:head',
4589 token => $token); # or in head noscript
4590 ## Ignore the token
4591 !!!nack ('t95.1');
4592 !!!next-token;
4593 next B;
4594 }
4595 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4596 !!!cp ('t96');
4597 ## As if <head>
4598 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4599 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4600 push @{$self->{open_elements}},
4601 [$self->{head_element}, $el_category->{head}];
4602
4603 $self->{insertion_mode} = IN_HEAD_IM;
4604 ## Reprocess in the "in head" insertion mode...
4605 } else {
4606 !!!cp ('t97');
4607 }
4608
4609 if ($token->{tag_name} eq 'base') {
4610 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4611 !!!cp ('t98');
4612 ## As if </noscript>
4613 pop @{$self->{open_elements}};
4614 !!!parse-error (type => 'in noscript', text => 'base',
4615 token => $token);
4616
4617 $self->{insertion_mode} = IN_HEAD_IM;
4618 ## Reprocess in the "in head" insertion mode...
4619 } else {
4620 !!!cp ('t99');
4621 }
4622
4623 ## NOTE: There is a "as if in head" code clone.
4624 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4625 !!!cp ('t100');
4626 !!!parse-error (type => 'after head',
4627 text => $token->{tag_name}, token => $token);
4628 push @{$self->{open_elements}},
4629 [$self->{head_element}, $el_category->{head}];
4630 } else {
4631 !!!cp ('t101');
4632 }
4633 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4634 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4635 pop @{$self->{open_elements}} # <head>
4636 if $self->{insertion_mode} == AFTER_HEAD_IM;
4637 !!!nack ('t101.1');
4638 !!!next-token;
4639 next B;
4640 } elsif ($token->{tag_name} eq 'link') {
4641 ## NOTE: There is a "as if in head" code clone.
4642 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4643 !!!cp ('t102');
4644 !!!parse-error (type => 'after head',
4645 text => $token->{tag_name}, token => $token);
4646 push @{$self->{open_elements}},
4647 [$self->{head_element}, $el_category->{head}];
4648 } else {
4649 !!!cp ('t103');
4650 }
4651 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4652 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4653 pop @{$self->{open_elements}} # <head>
4654 if $self->{insertion_mode} == AFTER_HEAD_IM;
4655 !!!ack ('t103.1');
4656 !!!next-token;
4657 next B;
4658 } elsif ($token->{tag_name} eq 'meta') {
4659 ## NOTE: There is a "as if in head" code clone.
4660 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4661 !!!cp ('t104');
4662 !!!parse-error (type => 'after head',
4663 text => $token->{tag_name}, token => $token);
4664 push @{$self->{open_elements}},
4665 [$self->{head_element}, $el_category->{head}];
4666 } else {
4667 !!!cp ('t105');
4668 }
4669 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4670 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4671
4672 unless ($self->{confident}) {
4673 if ($token->{attributes}->{charset}) {
4674 !!!cp ('t106');
4675 ## NOTE: Whether the encoding is supported or not is handled
4676 ## in the {change_encoding} callback.
4677 $self->{change_encoding}
4678 ->($self, $token->{attributes}->{charset}->{value},
4679 $token);
4680
4681 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4682 ->set_user_data (manakai_has_reference =>
4683 $token->{attributes}->{charset}
4684 ->{has_reference});
4685 } elsif ($token->{attributes}->{content}) {
4686 if ($token->{attributes}->{content}->{value}
4687 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4688 [\x09-\x0D\x20]*=
4689 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4690 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
4691 !!!cp ('t107');
4692 ## NOTE: Whether the encoding is supported or not is handled
4693 ## in the {change_encoding} callback.
4694 $self->{change_encoding}
4695 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
4696 $token);
4697 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4698 ->set_user_data (manakai_has_reference =>
4699 $token->{attributes}->{content}
4700 ->{has_reference});
4701 } else {
4702 !!!cp ('t108');
4703 }
4704 }
4705 } else {
4706 if ($token->{attributes}->{charset}) {
4707 !!!cp ('t109');
4708 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4709 ->set_user_data (manakai_has_reference =>
4710 $token->{attributes}->{charset}
4711 ->{has_reference});
4712 }
4713 if ($token->{attributes}->{content}) {
4714 !!!cp ('t110');
4715 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4716 ->set_user_data (manakai_has_reference =>
4717 $token->{attributes}->{content}
4718 ->{has_reference});
4719 }
4720 }
4721
4722 pop @{$self->{open_elements}} # <head>
4723 if $self->{insertion_mode} == AFTER_HEAD_IM;
4724 !!!ack ('t110.1');
4725 !!!next-token;
4726 next B;
4727 } elsif ($token->{tag_name} eq 'title') {
4728 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4729 !!!cp ('t111');
4730 ## As if </noscript>
4731 pop @{$self->{open_elements}};
4732 !!!parse-error (type => 'in noscript', text => 'title',
4733 token => $token);
4734
4735 $self->{insertion_mode} = IN_HEAD_IM;
4736 ## Reprocess in the "in head" insertion mode...
4737 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4738 !!!cp ('t112');
4739 !!!parse-error (type => 'after head',
4740 text => $token->{tag_name}, token => $token);
4741 push @{$self->{open_elements}},
4742 [$self->{head_element}, $el_category->{head}];
4743 } else {
4744 !!!cp ('t113');
4745 }
4746
4747 ## NOTE: There is a "as if in head" code clone.
4748 my $parent = defined $self->{head_element} ? $self->{head_element}
4749 : $self->{open_elements}->[-1]->[0];
4750 $parse_rcdata->(RCDATA_CONTENT_MODEL);
4751 pop @{$self->{open_elements}} # <head>
4752 if $self->{insertion_mode} == AFTER_HEAD_IM;
4753 next B;
4754 } elsif ($token->{tag_name} eq 'style' or
4755 $token->{tag_name} eq 'noframes') {
4756 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
4757 ## insertion mode IN_HEAD_IM)
4758 ## NOTE: There is a "as if in head" code clone.
4759 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4760 !!!cp ('t114');
4761 !!!parse-error (type => 'after head',
4762 text => $token->{tag_name}, token => $token);
4763 push @{$self->{open_elements}},
4764 [$self->{head_element}, $el_category->{head}];
4765 } else {
4766 !!!cp ('t115');
4767 }
4768 $parse_rcdata->(CDATA_CONTENT_MODEL);
4769 pop @{$self->{open_elements}} # <head>
4770 if $self->{insertion_mode} == AFTER_HEAD_IM;
4771 next B;
4772 } elsif ($token->{tag_name} eq 'noscript') {
4773 if ($self->{insertion_mode} == IN_HEAD_IM) {
4774 !!!cp ('t116');
4775 ## NOTE: and scripting is disabled
4776 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4777 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
4778 !!!nack ('t116.1');
4779 !!!next-token;
4780 next B;
4781 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4782 !!!cp ('t117');
4783 !!!parse-error (type => 'in noscript', text => 'noscript',
4784 token => $token);
4785 ## Ignore the token
4786 !!!nack ('t117.1');
4787 !!!next-token;
4788 next B;
4789 } else {
4790 !!!cp ('t118');
4791 #
4792 }
4793 } elsif ($token->{tag_name} eq 'script') {
4794 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4795 !!!cp ('t119');
4796 ## As if </noscript>
4797 pop @{$self->{open_elements}};
4798 !!!parse-error (type => 'in noscript', text => 'script',
4799 token => $token);
4800
4801 $self->{insertion_mode} = IN_HEAD_IM;
4802 ## Reprocess in the "in head" insertion mode...
4803 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4804 !!!cp ('t120');
4805 !!!parse-error (type => 'after head',
4806 text => $token->{tag_name}, token => $token);
4807 push @{$self->{open_elements}},
4808 [$self->{head_element}, $el_category->{head}];
4809 } else {
4810 !!!cp ('t121');
4811 }
4812
4813 ## NOTE: There is a "as if in head" code clone.
4814 $script_start_tag->();
4815 pop @{$self->{open_elements}} # <head>
4816 if $self->{insertion_mode} == AFTER_HEAD_IM;
4817 next B;
4818 } elsif ($token->{tag_name} eq 'body' or
4819 $token->{tag_name} eq 'frameset') {
4820 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4821 !!!cp ('t122');
4822 ## As if </noscript>
4823 pop @{$self->{open_elements}};
4824 !!!parse-error (type => 'in noscript',
4825 text => $token->{tag_name}, token => $token);
4826
4827 ## Reprocess in the "in head" insertion mode...
4828 ## As if </head>
4829 pop @{$self->{open_elements}};
4830
4831 ## Reprocess in the "after head" insertion mode...
4832 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4833 !!!cp ('t124');
4834 pop @{$self->{open_elements}};
4835
4836 ## Reprocess in the "after head" insertion mode...
4837 } else {
4838 !!!cp ('t125');
4839 }
4840
4841 ## "after head" insertion mode
4842 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4843 if ($token->{tag_name} eq 'body') {
4844 !!!cp ('t126');
4845 $self->{insertion_mode} = IN_BODY_IM;
4846 } elsif ($token->{tag_name} eq 'frameset') {
4847 !!!cp ('t127');
4848 $self->{insertion_mode} = IN_FRAMESET_IM;
4849 } else {
4850 die "$0: tag name: $self->{tag_name}";
4851 }
4852 !!!nack ('t127.1');
4853 !!!next-token;
4854 next B;
4855 } else {
4856 !!!cp ('t128');
4857 #
4858 }
4859
4860 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4861 !!!cp ('t129');
4862 ## As if </noscript>
4863 pop @{$self->{open_elements}};
4864 !!!parse-error (type => 'in noscript:/',
4865 text => $token->{tag_name}, token => $token);
4866
4867 ## Reprocess in the "in head" insertion mode...
4868 ## As if </head>
4869 pop @{$self->{open_elements}};
4870
4871 ## Reprocess in the "after head" insertion mode...
4872 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4873 !!!cp ('t130');
4874 ## As if </head>
4875 pop @{$self->{open_elements}};
4876
4877 ## Reprocess in the "after head" insertion mode...
4878 } else {
4879 !!!cp ('t131');
4880 }
4881
4882 ## "after head" insertion mode
4883 ## As if <body>
4884 !!!insert-element ('body',, $token);
4885 $self->{insertion_mode} = IN_BODY_IM;
4886 ## reprocess
4887 !!!ack-later;
4888 next B;
4889 } elsif ($token->{type} == END_TAG_TOKEN) {
4890 if ($token->{tag_name} eq 'head') {
4891 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4892 !!!cp ('t132');
4893 ## As if <head>
4894 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4895 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4896 push @{$self->{open_elements}},
4897 [$self->{head_element}, $el_category->{head}];
4898
4899 ## Reprocess in the "in head" insertion mode...
4900 pop @{$self->{open_elements}};
4901 $self->{insertion_mode} = AFTER_HEAD_IM;
4902 !!!next-token;
4903 next B;
4904 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4905 !!!cp ('t133');
4906 ## As if </noscript>
4907 pop @{$self->{open_elements}};
4908 !!!parse-error (type => 'in noscript:/',
4909 text => 'head', token => $token);
4910
4911 ## Reprocess in the "in head" insertion mode...
4912 pop @{$self->{open_elements}};
4913 $self->{insertion_mode} = AFTER_HEAD_IM;
4914 !!!next-token;
4915 next B;
4916 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4917 !!!cp ('t134');
4918 pop @{$self->{open_elements}};
4919 $self->{insertion_mode} = AFTER_HEAD_IM;
4920 !!!next-token;
4921 next B;
4922 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4923 !!!cp ('t134.1');
4924 !!!parse-error (type => 'unmatched end tag', text => 'head',
4925 token => $token);
4926 ## Ignore the token
4927 !!!next-token;
4928 next B;
4929 } else {
4930 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4931 }
4932 } elsif ($token->{tag_name} eq 'noscript') {
4933 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4934 !!!cp ('t136');
4935 pop @{$self->{open_elements}};
4936 $self->{insertion_mode} = IN_HEAD_IM;
4937 !!!next-token;
4938 next B;
4939 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM or
4940 $self->{insertion_mode} == AFTER_HEAD_IM) {
4941 !!!cp ('t137');
4942 !!!parse-error (type => 'unmatched end tag',
4943 text => 'noscript', token => $token);
4944 ## Ignore the token ## ISSUE: An issue in the spec.
4945 !!!next-token;
4946 next B;
4947 } else {
4948 !!!cp ('t138');
4949 #
4950 }
4951 } elsif ({
4952 body => 1, html => 1,
4953 }->{$token->{tag_name}}) {
4954 if ($self->{insertion_mode} == BEFORE_HEAD_IM or
4955 $self->{insertion_mode} == IN_HEAD_IM or
4956 $self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4957 !!!cp ('t140');
4958 !!!parse-error (type => 'unmatched end tag',
4959 text => $token->{tag_name}, token => $token);
4960 ## Ignore the token
4961 !!!next-token;
4962 next B;
4963 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4964 !!!cp ('t140.1');
4965 !!!parse-error (type => 'unmatched end tag',
4966 text => $token->{tag_name}, token => $token);
4967 ## Ignore the token
4968 !!!next-token;
4969 next B;
4970 } else {
4971 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4972 }
4973 } elsif ($token->{tag_name} eq 'p') {
4974 !!!cp ('t142');
4975 !!!parse-error (type => 'unmatched end tag',
4976 text => $token->{tag_name}, token => $token);
4977 ## Ignore the token
4978 !!!next-token;
4979 next B;
4980 } elsif ($token->{tag_name} eq 'br') {
4981 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4982 !!!cp ('t142.2');
4983 ## (before head) as if <head>, (in head) as if </head>
4984 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4985 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4986 $self->{insertion_mode} = AFTER_HEAD_IM;
4987
4988 ## Reprocess in the "after head" insertion mode...
4989 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4990 !!!cp ('t143.2');
4991 ## As if </head>
4992 pop @{$self->{open_elements}};
4993 $self->{insertion_mode} = AFTER_HEAD_IM;
4994
4995 ## Reprocess in the "after head" insertion mode...
4996 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4997 !!!cp ('t143.3');
4998 ## ISSUE: Two parse errors for <head><noscript></br>
4999 !!!parse-error (type => 'unmatched end tag',
5000 text => 'br', token => $token);
5001 ## As if </noscript>
5002 pop @{$self->{open_elements}};
5003 $self->{insertion_mode} = IN_HEAD_IM;
5004
5005 ## Reprocess in the "in head" insertion mode...
5006 ## As if </head>
5007 pop @{$self->{open_elements}};
5008 $self->{insertion_mode} = AFTER_HEAD_IM;
5009
5010 ## Reprocess in the "after head" insertion mode...
5011 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
5012 !!!cp ('t143.4');
5013 #
5014 } else {
5015 die "$0: $self->{insertion_mode}: Unknown insertion mode";
5016 }
5017
5018 ## ISSUE: does not agree with IE7 - it doesn't ignore </br>.
5019 !!!parse-error (type => 'unmatched end tag',
5020 text => 'br', token => $token);
5021 ## Ignore the token
5022 !!!next-token;
5023 next B;
5024 } else {
5025 !!!cp ('t145');
5026 !!!parse-error (type => 'unmatched end tag',
5027 text => $token->{tag_name}, token => $token);
5028 ## Ignore the token
5029 !!!next-token;
5030 next B;
5031 }
5032
5033 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
5034 !!!cp ('t146');
5035 ## As if </noscript>
5036 pop @{$self->{open_elements}};
5037 !!!parse-error (type => 'in noscript:/',
5038 text => $token->{tag_name}, token => $token);
5039
5040 ## Reprocess in the "in head" insertion mode...
5041 ## As if </head>
5042 pop @{$self->{open_elements}};
5043
5044 ## Reprocess in the "after head" insertion mode...
5045 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
5046 !!!cp ('t147');
5047 ## As if </head>
5048 pop @{$self->{open_elements}};
5049
5050 ## Reprocess in the "after head" insertion mode...
5051 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
5052 ## ISSUE: This case cannot be reached?
5053 !!!cp ('t148');
5054 !!!parse-error (type => 'unmatched end tag',
5055 text => $token->{tag_name}, token => $token);
5056 ## Ignore the token ## ISSUE: An issue in the spec.
5057 !!!next-token;
5058 next B;
5059 } else {
5060 !!!cp ('t149');
5061 }
5062
5063 ## "after head" insertion mode
5064 ## As if <body>
5065 !!!insert-element ('body',, $token);
5066 $self->{insertion_mode} = IN_BODY_IM;
5067 ## reprocess
5068 next B;
5069 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5070 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
5071 !!!cp ('t149.1');
5072
5073 ## NOTE: As if <head>
5074 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
5075 $self->{open_elements}->[-1]->[0]->append_child
5076 ($self->{head_element});
5077 #push @{$self->{open_elements}},
5078 # [$self->{head_element}, $el_category->{head}];
5079 #$self->{insertion_mode} = IN_HEAD_IM;
5080 ## NOTE: Reprocess.
5081
5082 ## NOTE: As if </head>
5083 #pop @{$self->{open_elements}};
5084 #$self->{insertion_mode} = AFTER_HEAD_IM;
5085 ## NOTE: Reprocess.
5086
5087 #
5088 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
5089 !!!cp ('t149.2');
5090
5091 ## NOTE: As if </head>
5092 pop @{$self->{open_elements}};
5093 #$self->{insertion_mode} = AFTER_HEAD_IM;
5094 ## NOTE: Reprocess.
5095
5096 #
5097 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
5098 !!!cp ('t149.3');
5099
5100 !!!parse-error (type => 'in noscript:#eof', token => $token);
5101
5102 ## As if </noscript>
5103 pop @{$self->{open_elements}};
5104 #$self->{insertion_mode} = IN_HEAD_IM;
5105 ## NOTE: Reprocess.
5106
5107 ## NOTE: As if </head>
5108 pop @{$self->{open_elements}};
5109 #$self->{insertion_mode} = AFTER_HEAD_IM;
5110 ## NOTE: Reprocess.
5111
5112 #
5113 } else {
5114 !!!cp ('t149.4');
5115 #
5116 }
5117
5118 ## NOTE: As if <body>
5119 !!!insert-element ('body',, $token);
5120 $self->{insertion_mode} = IN_BODY_IM;
5121 ## NOTE: Reprocess.
5122 next B;
5123 } else {
5124 die "$0: $token->{type}: Unknown token type";
5125 }
5126
5127 ## ISSUE: An issue in the spec.
5128 } elsif ($self->{insertion_mode} & BODY_IMS) {
5129 if ($token->{type} == CHARACTER_TOKEN) {
5130 !!!cp ('t150');
5131 ## NOTE: There is a code clone of "character in body".
5132 $reconstruct_active_formatting_elements->($insert_to_current);
5133
5134 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5135
5136 !!!next-token;
5137 next B;
5138 } elsif ($token->{type} == START_TAG_TOKEN) {
5139 if ({
5140 caption => 1, col => 1, colgroup => 1, tbody => 1,
5141 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
5142 }->{$token->{tag_name}}) {
5143 if ($self->{insertion_mode} == IN_CELL_IM) {
5144 ## have an element in table scope
5145 for (reverse 0..$#{$self->{open_elements}}) {
5146 my $node = $self->{open_elements}->[$_];
5147 if ($node->[1] & TABLE_CELL_EL) {
5148 !!!cp ('t151');
5149
5150 ## Close the cell
5151 !!!back-token; # <x>
5152 $token = {type => END_TAG_TOKEN,
5153 tag_name => $node->[0]->manakai_local_name,
5154 line => $token->{line},
5155 column => $token->{column}};
5156 next B;
5157 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5158 !!!cp ('t152');
5159 ## ISSUE: This case can never be reached, maybe.
5160 last;
5161 }
5162 }
5163
5164 !!!cp ('t153');
5165 !!!parse-error (type => 'start tag not allowed',
5166 text => $token->{tag_name}, token => $token);
5167 ## Ignore the token
5168 !!!nack ('t153.1');
5169 !!!next-token;
5170 next B;
5171 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
5172 !!!parse-error (type => 'not closed', text => 'caption',
5173 token => $token);
5174
5175 ## NOTE: As if </caption>.
5176 ## have a table element in table scope
5177 my $i;
5178 INSCOPE: {
5179 for (reverse 0..$#{$self->{open_elements}}) {
5180 my $node = $self->{open_elements}->[$_];
5181 if ($node->[1] & CAPTION_EL) {
5182 !!!cp ('t155');
5183 $i = $_;
5184 last INSCOPE;
5185 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5186 !!!cp ('t156');
5187 last;
5188 }
5189 }
5190
5191 !!!cp ('t157');
5192 !!!parse-error (type => 'start tag not allowed',
5193 text => $token->{tag_name}, token => $token);
5194 ## Ignore the token
5195 !!!nack ('t157.1');
5196 !!!next-token;
5197 next B;
5198 } # INSCOPE
5199
5200 ## generate implied end tags
5201 while ($self->{open_elements}->[-1]->[1]
5202 & END_TAG_OPTIONAL_EL) {
5203 !!!cp ('t158');
5204 pop @{$self->{open_elements}};
5205 }
5206
5207 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5208 !!!cp ('t159');
5209 !!!parse-error (type => 'not closed',
5210 text => $self->{open_elements}->[-1]->[0]
5211 ->manakai_local_name,
5212 token => $token);
5213 } else {
5214 !!!cp ('t160');
5215 }
5216
5217 splice @{$self->{open_elements}}, $i;
5218
5219 $clear_up_to_marker->();
5220
5221 $self->{insertion_mode} = IN_TABLE_IM;
5222
5223 ## reprocess
5224 !!!ack-later;
5225 next B;
5226 } else {
5227 !!!cp ('t161');
5228 #
5229 }
5230 } else {
5231 !!!cp ('t162');
5232 #
5233 }
5234 } elsif ($token->{type} == END_TAG_TOKEN) {
5235 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
5236 if ($self->{insertion_mode} == IN_CELL_IM) {
5237 ## have an element in table scope
5238 my $i;
5239 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5240 my $node = $self->{open_elements}->[$_];
5241 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5242 !!!cp ('t163');
5243 $i = $_;
5244 last INSCOPE;
5245 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5246 !!!cp ('t164');
5247 last INSCOPE;
5248 }
5249 } # INSCOPE
5250 unless (defined $i) {
5251 !!!cp ('t165');
5252 !!!parse-error (type => 'unmatched end tag',
5253 text => $token->{tag_name},
5254 token => $token);
5255 ## Ignore the token
5256 !!!next-token;
5257 next B;
5258 }
5259
5260 ## generate implied end tags
5261 while ($self->{open_elements}->[-1]->[1]
5262 & END_TAG_OPTIONAL_EL) {
5263 !!!cp ('t166');
5264 pop @{$self->{open_elements}};
5265 }
5266
5267 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
5268 ne $token->{tag_name}) {
5269 !!!cp ('t167');
5270 !!!parse-error (type => 'not closed',
5271 text => $self->{open_elements}->[-1]->[0]
5272 ->manakai_local_name,
5273 token => $token);
5274 } else {
5275 !!!cp ('t168');
5276 }
5277
5278 splice @{$self->{open_elements}}, $i;
5279
5280 $clear_up_to_marker->();
5281
5282 $self->{insertion_mode} = IN_ROW_IM;
5283
5284 !!!next-token;
5285 next B;
5286 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
5287 !!!cp ('t169');
5288 !!!parse-error (type => 'unmatched end tag',
5289 text => $token->{tag_name}, token => $token);
5290 ## Ignore the token
5291 !!!next-token;
5292 next B;
5293 } else {
5294 !!!cp ('t170');
5295 #
5296 }
5297 } elsif ($token->{tag_name} eq 'caption') {
5298 if ($self->{insertion_mode} == IN_CAPTION_IM) {
5299 ## have a table element in table scope
5300 my $i;
5301 INSCOPE: {
5302 for (reverse 0..$#{$self->{open_elements}}) {
5303 my $node = $self->{open_elements}->[$_];
5304 if ($node->[1] & CAPTION_EL) {
5305 !!!cp ('t171');
5306 $i = $_;
5307 last INSCOPE;
5308 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5309 !!!cp ('t172');
5310 last;
5311 }
5312 }
5313
5314 !!!cp ('t173');
5315 !!!parse-error (type => 'unmatched end tag',
5316 text => $token->{tag_name}, token => $token);
5317 ## Ignore the token
5318 !!!next-token;
5319 next B;
5320 } # INSCOPE
5321
5322 ## generate implied end tags
5323 while ($self->{open_elements}->[-1]->[1]
5324 & END_TAG_OPTIONAL_EL) {
5325 !!!cp ('t174');
5326 pop @{$self->{open_elements}};
5327 }
5328
5329 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5330 !!!cp ('t175');
5331 !!!parse-error (type => 'not closed',
5332 text => $self->{open_elements}->[-1]->[0]
5333 ->manakai_local_name,
5334 token => $token);
5335 } else {
5336 !!!cp ('t176');
5337 }
5338
5339 splice @{$self->{open_elements}}, $i;
5340
5341 $clear_up_to_marker->();
5342
5343 $self->{insertion_mode} = IN_TABLE_IM;
5344
5345 !!!next-token;
5346 next B;
5347 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
5348 !!!cp ('t177');
5349 !!!parse-error (type => 'unmatched end tag',
5350 text => $token->{tag_name}, token => $token);
5351 ## Ignore the token
5352 !!!next-token;
5353 next B;
5354 } else {
5355 !!!cp ('t178');
5356 #
5357 }
5358 } elsif ({
5359 table => 1, tbody => 1, tfoot => 1,
5360 thead => 1, tr => 1,
5361 }->{$token->{tag_name}} and
5362 $self->{insertion_mode} == IN_CELL_IM) {
5363 ## have an element in table scope
5364 my $i;
5365 my $tn;
5366 INSCOPE: {
5367 for (reverse 0..$#{$self->{open_elements}}) {
5368 my $node = $self->{open_elements}->[$_];
5369 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5370 !!!cp ('t179');
5371 $i = $_;
5372
5373 ## Close the cell
5374 !!!back-token; # </x>
5375 $token = {type => END_TAG_TOKEN, tag_name => $tn,
5376 line => $token->{line},
5377 column => $token->{column}};
5378 next B;
5379 } elsif ($node->[1] & TABLE_CELL_EL) {
5380 !!!cp ('t180');
5381 $tn = $node->[0]->manakai_local_name;
5382 ## NOTE: There is exactly one |td| or |th| element
5383 ## in scope in the stack of open elements by definition.
5384 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5385 ## ISSUE: Can this be reached?
5386 !!!cp ('t181');
5387 last;
5388 }
5389 }
5390
5391 !!!cp ('t182');
5392 !!!parse-error (type => 'unmatched end tag',
5393 text => $token->{tag_name}, token => $token);
5394 ## Ignore the token
5395 !!!next-token;
5396 next B;
5397 } # INSCOPE
5398 } elsif ($token->{tag_name} eq 'table' and
5399 $self->{insertion_mode} == IN_CAPTION_IM) {
5400 !!!parse-error (type => 'not closed', text => 'caption',
5401 token => $token);
5402
5403 ## As if </caption>
5404 ## have a table element in table scope
5405 my $i;
5406 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5407 my $node = $self->{open_elements}->[$_];
5408 if ($node->[1] & CAPTION_EL) {
5409 !!!cp ('t184');
5410 $i = $_;
5411 last INSCOPE;
5412 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5413 !!!cp ('t185');
5414 last INSCOPE;
5415 }
5416 } # INSCOPE
5417 unless (defined $i) {
5418 !!!cp ('t186');
5419 !!!parse-error (type => 'unmatched end tag',
5420 text => 'caption', token => $token);
5421 ## Ignore the token
5422 !!!next-token;
5423 next B;
5424 }
5425
5426 ## generate implied end tags
5427 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5428 !!!cp ('t187');
5429 pop @{$self->{open_elements}};
5430 }
5431
5432 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5433 !!!cp ('t188');
5434 !!!parse-error (type => 'not closed',
5435 text => $self->{open_elements}->[-1]->[0]
5436 ->manakai_local_name,
5437 token => $token);
5438 } else {
5439 !!!cp ('t189');
5440 }
5441
5442 splice @{$self->{open_elements}}, $i;
5443
5444 $clear_up_to_marker->();
5445
5446 $self->{insertion_mode} = IN_TABLE_IM;
5447
5448 ## reprocess
5449 next B;
5450 } elsif ({
5451 body => 1, col => 1, colgroup => 1, html => 1,
5452 }->{$token->{tag_name}}) {
5453 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
5454 !!!cp ('t190');
5455 !!!parse-error (type => 'unmatched end tag',
5456 text => $token->{tag_name}, token => $token);
5457 ## Ignore the token
5458 !!!next-token;
5459 next B;
5460 } else {
5461 !!!cp ('t191');
5462 #
5463 }
5464 } elsif ({
5465 tbody => 1, tfoot => 1,
5466 thead => 1, tr => 1,
5467 }->{$token->{tag_name}} and
5468 $self->{insertion_mode} == IN_CAPTION_IM) {
5469 !!!cp ('t192');
5470 !!!parse-error (type => 'unmatched end tag',
5471 text => $token->{tag_name}, token => $token);
5472 ## Ignore the token
5473 !!!next-token;
5474 next B;
5475 } else {
5476 !!!cp ('t193');
5477 #
5478 }
5479 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5480 for my $entry (@{$self->{open_elements}}) {
5481 unless ($entry->[1] & ALL_END_TAG_OPTIONAL_EL) {
5482 !!!cp ('t75');
5483 !!!parse-error (type => 'in body:#eof', token => $token);
5484 last;
5485 }
5486 }
5487
5488 ## Stop parsing.
5489 last B;
5490 } else {
5491 die "$0: $token->{type}: Unknown token type";
5492 }
5493
5494 $insert = $insert_to_current;
5495 #
5496 } elsif ($self->{insertion_mode} & TABLE_IMS) {
5497 if ($token->{type} == CHARACTER_TOKEN) {
5498 if (not $open_tables->[-1]->[1] and # tainted
5499 $token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5500 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5501
5502 unless (length $token->{data}) {
5503 !!!cp ('t194');
5504 !!!next-token;
5505 next B;
5506 } else {
5507 !!!cp ('t195');
5508 }
5509 }
5510
5511 !!!parse-error (type => 'in table:#text', token => $token);
5512
5513 ## As if in body, but insert into foster parent element
5514 ## ISSUE: Spec says that "whenever a node would be inserted
5515 ## into the current node" while characters might not
5516 ## result in a new Text node.
5517 $reconstruct_active_formatting_elements->($insert_to_foster);
5518
5519 if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
5520 # MUST
5521 my $foster_parent_element;
5522 my $next_sibling;
5523 my $prev_sibling;
5524 OE: for (reverse 0..$#{$self->{open_elements}}) {
5525 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
5526 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
5527 if (defined $parent and $parent->node_type == 1) {
5528 !!!cp ('t196');
5529 $foster_parent_element = $parent;
5530 $next_sibling = $self->{open_elements}->[$_]->[0];
5531 $prev_sibling = $next_sibling->previous_sibling;
5532 } else {
5533 !!!cp ('t197');
5534 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
5535 $prev_sibling = $foster_parent_element->last_child;
5536 }
5537 last OE;
5538 }
5539 } # OE
5540 $foster_parent_element = $self->{open_elements}->[0]->[0] and
5541 $prev_sibling = $foster_parent_element->last_child
5542 unless defined $foster_parent_element;
5543 if (defined $prev_sibling and
5544 $prev_sibling->node_type == 3) {
5545 !!!cp ('t198');
5546 $prev_sibling->manakai_append_text ($token->{data});
5547 } else {
5548 !!!cp ('t199');
5549 $foster_parent_element->insert_before
5550 ($self->{document}->create_text_node ($token->{data}),
5551 $next_sibling);
5552 }
5553 $open_tables->[-1]->[1] = 1; # tainted
5554 } else {
5555 !!!cp ('t200');
5556 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5557 }
5558
5559 !!!next-token;
5560 next B;
5561 } elsif ($token->{type} == START_TAG_TOKEN) {
5562 if ({
5563 tr => ($self->{insertion_mode} != IN_ROW_IM),
5564 th => 1, td => 1,
5565 }->{$token->{tag_name}}) {
5566 if ($self->{insertion_mode} == IN_TABLE_IM) {
5567 ## Clear back to table context
5568 while (not ($self->{open_elements}->[-1]->[1]
5569 & TABLE_SCOPING_EL)) {
5570 !!!cp ('t201');
5571 pop @{$self->{open_elements}};
5572 }
5573
5574 !!!insert-element ('tbody',, $token);
5575 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5576 ## reprocess in the "in table body" insertion mode...
5577 }
5578
5579 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5580 unless ($token->{tag_name} eq 'tr') {
5581 !!!cp ('t202');
5582 !!!parse-error (type => 'missing start tag:tr', token => $token);
5583 }
5584
5585 ## Clear back to table body context
5586 while (not ($self->{open_elements}->[-1]->[1]
5587 & TABLE_ROWS_SCOPING_EL)) {
5588 !!!cp ('t203');
5589 ## ISSUE: Can this case be reached?
5590 pop @{$self->{open_elements}};
5591 }
5592
5593 $self->{insertion_mode} = IN_ROW_IM;
5594 if ($token->{tag_name} eq 'tr') {
5595 !!!cp ('t204');
5596 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5597 !!!nack ('t204');
5598 !!!next-token;
5599 next B;
5600 } else {
5601 !!!cp ('t205');
5602 !!!insert-element ('tr',, $token);
5603 ## reprocess in the "in row" insertion mode
5604 }
5605 } else {
5606 !!!cp ('t206');
5607 }
5608
5609 ## Clear back to table row context
5610 while (not ($self->{open_elements}->[-1]->[1]
5611 & TABLE_ROW_SCOPING_EL)) {
5612 !!!cp ('t207');
5613 pop @{$self->{open_elements}};
5614 }
5615
5616 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5617 $self->{insertion_mode} = IN_CELL_IM;
5618
5619 push @$active_formatting_elements, ['#marker', ''];
5620
5621 !!!nack ('t207.1');
5622 !!!next-token;
5623 next B;
5624 } elsif ({
5625 caption => 1, col => 1, colgroup => 1,
5626 tbody => 1, tfoot => 1, thead => 1,
5627 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5628 }->{$token->{tag_name}}) {
5629 if ($self->{insertion_mode} == IN_ROW_IM) {
5630 ## As if </tr>
5631 ## have an element in table scope
5632 my $i;
5633 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5634 my $node = $self->{open_elements}->[$_];
5635 if ($node->[1] & TABLE_ROW_EL) {
5636 !!!cp ('t208');
5637 $i = $_;
5638 last INSCOPE;
5639 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5640 !!!cp ('t209');
5641 last INSCOPE;
5642 }
5643 } # INSCOPE
5644 unless (defined $i) {
5645 !!!cp ('t210');
5646 ## TODO: This type is wrong (and the type string below is misspelled: 'unmacthed').
5647 !!!parse-error (type => 'unmacthed end tag',
5648 text => $token->{tag_name}, token => $token);
5649 ## Ignore the token
5650 !!!nack ('t210.1');
5651 !!!next-token;
5652 next B;
5653 }
5654
5655 ## Clear back to table row context
5656 while (not ($self->{open_elements}->[-1]->[1]
5657 & TABLE_ROW_SCOPING_EL)) {
5658 !!!cp ('t211');
5659 ## ISSUE: Can this case be reached?
5660 pop @{$self->{open_elements}};
5661 }
5662
5663 pop @{$self->{open_elements}}; # tr
5664 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5665 if ($token->{tag_name} eq 'tr') {
5666 !!!cp ('t212');
5667 ## reprocess
5668 !!!ack-later;
5669 next B;
5670 } else {
5671 !!!cp ('t213');
5672 ## reprocess in the "in table body" insertion mode...
5673 }
5674 }
5675
5676 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5677 ## have an element in table scope
5678 my $i;
5679 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5680 my $node = $self->{open_elements}->[$_];
5681 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5682 !!!cp ('t214');
5683 $i = $_;
5684 last INSCOPE;
5685 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5686 !!!cp ('t215');
5687 last INSCOPE;
5688 }
5689 } # INSCOPE
5690 unless (defined $i) {
5691 !!!cp ('t216');
5692 ## TODO: This error type is wrong.
5693 !!!parse-error (type => 'unmatched end tag',
5694 text => $token->{tag_name}, token => $token);
5695 ## Ignore the token
5696 !!!nack ('t216.1');
5697 !!!next-token;
5698 next B;
5699 }
5700
5701 ## Clear back to table body context
5702 while (not ($self->{open_elements}->[-1]->[1]
5703 & TABLE_ROWS_SCOPING_EL)) {
5704 !!!cp ('t217');
5705 ## ISSUE: Can this state be reached?
5706 pop @{$self->{open_elements}};
5707 }
5708
5709 ## As if <{current node}>
5710 ## have an element in table scope
5711 ## true by definition
5712
5713 ## Clear back to table body context
5714 ## nop by definition
5715
5716 pop @{$self->{open_elements}};
5717 $self->{insertion_mode} = IN_TABLE_IM;
5718 ## reprocess in "in table" insertion mode...
5719 } else {
5720 !!!cp ('t218');
5721 }
5722
5723 if ($token->{tag_name} eq 'col') {
5724 ## Clear back to table context
5725 while (not ($self->{open_elements}->[-1]->[1]
5726 & TABLE_SCOPING_EL)) {
5727 !!!cp ('t219');
5728 ## ISSUE: Can this state be reached?
5729 pop @{$self->{open_elements}};
5730 }
5731
5732 !!!insert-element ('colgroup',, $token);
5733 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
5734 ## reprocess
5735 !!!ack-later;
5736 next B;
5737 } elsif ({
5738 caption => 1,
5739 colgroup => 1,
5740 tbody => 1, tfoot => 1, thead => 1,
5741 }->{$token->{tag_name}}) {
5742 ## Clear back to table context
5743 while (not ($self->{open_elements}->[-1]->[1]
5744 & TABLE_SCOPING_EL)) {
5745 !!!cp ('t220');
5746 ## ISSUE: Can this state be reached?
5747 pop @{$self->{open_elements}};
5748 }
5749
5750 push @$active_formatting_elements, ['#marker', '']
5751 if $token->{tag_name} eq 'caption';
5752
5753 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5754 $self->{insertion_mode} = {
5755 caption => IN_CAPTION_IM,
5756 colgroup => IN_COLUMN_GROUP_IM,
5757 tbody => IN_TABLE_BODY_IM,
5758 tfoot => IN_TABLE_BODY_IM,
5759 thead => IN_TABLE_BODY_IM,
5760 }->{$token->{tag_name}};
5761 !!!next-token;
5762 !!!nack ('t220.1');
5763 next B;
5764 } else {
5765 die "$0: in table: <>: $token->{tag_name}";
5766 }
5767 } elsif ($token->{tag_name} eq 'table') {
5768 !!!parse-error (type => 'not closed',
5769 text => $self->{open_elements}->[-1]->[0]
5770 ->manakai_local_name,
5771 token => $token);
5772
5773 ## As if </table>
5774 ## have a table element in table scope
5775 my $i;
5776 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5777 my $node = $self->{open_elements}->[$_];
5778 if ($node->[1] & TABLE_EL) {
5779 !!!cp ('t221');
5780 $i = $_;
5781 last INSCOPE;
5782 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5783 !!!cp ('t222');
5784 last INSCOPE;
5785 }
5786 } # INSCOPE
5787 unless (defined $i) {
5788 !!!cp ('t223');
5789 ## TODO: The following is wrong, maybe.
5790 !!!parse-error (type => 'unmatched end tag', text => 'table',
5791 token => $token);
5792 ## Ignore tokens </table><table>
5793 !!!nack ('t223.1');
5794 !!!next-token;
5795 next B;
5796 }
5797
5798 ## TODO: The following steps have been removed from the latest spec.
5799 ## generate implied end tags
5800 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5801 !!!cp ('t224');
5802 pop @{$self->{open_elements}};
5803 }
5804
5805 unless ($self->{open_elements}->[-1]->[1] & TABLE_EL) {
5806 !!!cp ('t225');
5807 ## NOTE: |<table><tr><table>|
5808 !!!parse-error (type => 'not closed',
5809 text => $self->{open_elements}->[-1]->[0]
5810 ->manakai_local_name,
5811 token => $token);
5812 } else {
5813 !!!cp ('t226');
5814 }
5815
5816 splice @{$self->{open_elements}}, $i;
5817 pop @{$open_tables};
5818
5819 $self->_reset_insertion_mode;
5820
5821 ## reprocess
5822 !!!ack-later;
5823 next B;
5824 } elsif ($token->{tag_name} eq 'style') {
5825 if (not $open_tables->[-1]->[1]) { # tainted
5826 !!!cp ('t227.8');
5827 ## NOTE: This is a "as if in head" code clone.
5828 $parse_rcdata->(CDATA_CONTENT_MODEL);
5829 next B;
5830 } else {
5831 !!!cp ('t227.7');
5832 #
5833 }
5834 } elsif ($token->{tag_name} eq 'script') {
5835 if (not $open_tables->[-1]->[1]) { # tainted
5836 !!!cp ('t227.6');
5837 ## NOTE: This is a "as if in head" code clone.
5838 $script_start_tag->();
5839 next B;
5840 } else {
5841 !!!cp ('t227.5');
5842 #
5843 }
5844 } elsif ($token->{tag_name} eq 'input') {
5845 if (not $open_tables->[-1]->[1]) { # tainted
5846 if ($token->{attributes}->{type}) { ## TODO: case
5847 my $type = lc $token->{attributes}->{type}->{value};
5848 if ($type eq 'hidden') {
5849 !!!cp ('t227.3');
5850 !!!parse-error (type => 'in table',
5851 text => $token->{tag_name}, token => $token);
5852
5853 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5854
5855 ## TODO: form element pointer
5856
5857 pop @{$self->{open_elements}};
5858
5859 !!!next-token;
5860 !!!ack ('t227.2.1');
5861 next B;
5862 } else {
5863 !!!cp ('t227.2');
5864 #
5865 }
5866 } else {
5867 !!!cp ('t227.1');
5868 #
5869 }
5870 } else {
5871 !!!cp ('t227.4');
5872 #
5873 }
5874 } else {
5875 !!!cp ('t227');
5876 #
5877 }
5878
5879 !!!parse-error (type => 'in table', text => $token->{tag_name},
5880 token => $token);
5881
5882 $insert = $insert_to_foster;
5883 #
5884 } elsif ($token->{type} == END_TAG_TOKEN) {
5885 if ($token->{tag_name} eq 'tr' and
5886 $self->{insertion_mode} == IN_ROW_IM) {
5887 ## have an element in table scope
5888 my $i;
5889 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5890 my $node = $self->{open_elements}->[$_];
5891 if ($node->[1] & TABLE_ROW_EL) {
5892 !!!cp ('t228');
5893 $i = $_;
5894 last INSCOPE;
5895 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5896 !!!cp ('t229');
5897 last INSCOPE;
5898 }
5899 } # INSCOPE
5900 unless (defined $i) {
5901 !!!cp ('t230');
5902 !!!parse-error (type => 'unmatched end tag',
5903 text => $token->{tag_name}, token => $token);
5904 ## Ignore the token
5905 !!!nack ('t230.1');
5906 !!!next-token;
5907 next B;
5908 } else {
5909 !!!cp ('t232');
5910 }
5911
5912 ## Clear back to table row context
5913 while (not ($self->{open_elements}->[-1]->[1]
5914 & TABLE_ROW_SCOPING_EL)) {
5915 !!!cp ('t231');
5916 ## ISSUE: Can this state be reached?
5917 pop @{$self->{open_elements}};
5918 }
5919
5920 pop @{$self->{open_elements}}; # tr
5921 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5922 !!!next-token;
5923 !!!nack ('t231.1');
5924 next B;
5925 } elsif ($token->{tag_name} eq 'table') {
5926 if ($self->{insertion_mode} == IN_ROW_IM) {
5927 ## As if </tr>
5928 ## have an element in table scope
5929 my $i;
5930 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5931 my $node = $self->{open_elements}->[$_];
5932 if ($node->[1] & TABLE_ROW_EL) {
5933 !!!cp ('t233');
5934 $i = $_;
5935 last INSCOPE;
5936 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5937 !!!cp ('t234');
5938 last INSCOPE;
5939 }
5940 } # INSCOPE
5941 unless (defined $i) {
5942 !!!cp ('t235');
5943 ## TODO: The following is wrong (NOTE: |text| is given $token->{type}, not the tag name).
5944 !!!parse-error (type => 'unmatched end tag',
5945 text => $token->{type}, token => $token);
5946 ## Ignore the token
5947 !!!nack ('t236.1');
5948 !!!next-token;
5949 next B;
5950 }
5951
5952 ## Clear back to table row context
5953 while (not ($self->{open_elements}->[-1]->[1]
5954 & TABLE_ROW_SCOPING_EL)) {
5955 !!!cp ('t236');
5956 ## ISSUE: Can this state be reached?
5957 pop @{$self->{open_elements}};
5958 }
5959
5960 pop @{$self->{open_elements}}; # tr
5961 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5962 ## reprocess in the "in table body" insertion mode...
5963 }
5964
5965 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5966 ## have an element in table scope
5967 my $i;
5968 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5969 my $node = $self->{open_elements}->[$_];
5970 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5971 !!!cp ('t237');
5972 $i = $_;
5973 last INSCOPE;
5974 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5975 !!!cp ('t238');
5976 last INSCOPE;
5977 }
5978 } # INSCOPE
5979 unless (defined $i) {
5980 !!!cp ('t239');
5981 !!!parse-error (type => 'unmatched end tag',
5982 text => $token->{tag_name}, token => $token);
5983 ## Ignore the token
5984 !!!nack ('t239.1');
5985 !!!next-token;
5986 next B;
5987 }
5988
5989 ## Clear back to table body context
5990 while (not ($self->{open_elements}->[-1]->[1]
5991 & TABLE_ROWS_SCOPING_EL)) {
5992 !!!cp ('t240');
5993 pop @{$self->{open_elements}};
5994 }
5995
5996 ## As if <{current node}>
5997 ## have an element in table scope
5998 ## true by definition
5999
6000 ## Clear back to table body context
6001 ## nop by definition
6002
6003 pop @{$self->{open_elements}};
6004 $self->{insertion_mode} = IN_TABLE_IM;
6005 ## reprocess in the "in table" insertion mode...
6006 }
6007
6008 ## NOTE: </table> in the "in table" insertion mode.
6009 ## When you edit the code fragment below, please ensure that
6010 ## the code for <table> in the "in table" insertion mode
6011 ## is synced with it.
6012
6013 ## have a table element in table scope
6014 my $i;
6015 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6016 my $node = $self->{open_elements}->[$_];
6017 if ($node->[1] & TABLE_EL) {
6018 !!!cp ('t241');
6019 $i = $_;
6020 last INSCOPE;
6021 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6022 !!!cp ('t242');
6023 last INSCOPE;
6024 }
6025 } # INSCOPE
6026 unless (defined $i) {
6027 !!!cp ('t243');
6028 !!!parse-error (type => 'unmatched end tag',
6029 text => $token->{tag_name}, token => $token);
6030 ## Ignore the token
6031 !!!nack ('t243.1');
6032 !!!next-token;
6033 next B;
6034 }
6035
6036 splice @{$self->{open_elements}}, $i;
6037 pop @{$open_tables};
6038
6039 $self->_reset_insertion_mode;
6040
6041 !!!next-token;
6042 next B;
6043 } elsif ({
6044 tbody => 1, tfoot => 1, thead => 1,
6045 }->{$token->{tag_name}} and
6046 $self->{insertion_mode} & ROW_IMS) {
6047 if ($self->{insertion_mode} == IN_ROW_IM) {
6048 ## have an element in table scope
6049 my $i;
6050 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6051 my $node = $self->{open_elements}->[$_];
6052 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6053 !!!cp ('t247');
6054 $i = $_;
6055 last INSCOPE;
6056 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6057 !!!cp ('t248');
6058 last INSCOPE;
6059 }
6060 } # INSCOPE
6061 unless (defined $i) {
6062 !!!cp ('t249');
6063 !!!parse-error (type => 'unmatched end tag',
6064 text => $token->{tag_name}, token => $token);
6065 ## Ignore the token
6066 !!!nack ('t249.1');
6067 !!!next-token;
6068 next B;
6069 }
6070
6071 ## As if </tr>
6072 ## have an element in table scope
6073 my $i;
6074 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6075 my $node = $self->{open_elements}->[$_];
6076 if ($node->[1] & TABLE_ROW_EL) {
6077 !!!cp ('t250');
6078 $i = $_;
6079 last INSCOPE;
6080 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6081 !!!cp ('t251');
6082 last INSCOPE;
6083 }
6084 } # INSCOPE
6085 unless (defined $i) {
6086 !!!cp ('t252');
6087 !!!parse-error (type => 'unmatched end tag',
6088 text => 'tr', token => $token);
6089 ## Ignore the token
6090 !!!nack ('t252.1');
6091 !!!next-token;
6092 next B;
6093 }
6094
6095 ## Clear back to table row context
6096 while (not ($self->{open_elements}->[-1]->[1]
6097 & TABLE_ROW_SCOPING_EL)) {
6098 !!!cp ('t253');
6099 ## ISSUE: Can this case be reached?
6100 pop @{$self->{open_elements}};
6101 }
6102
6103 pop @{$self->{open_elements}}; # tr
6104 $self->{insertion_mode} = IN_TABLE_BODY_IM;
6105 ## reprocess in the "in table body" insertion mode...
6106 }
6107
6108 ## have an element in table scope
6109 my $i;
6110 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6111 my $node = $self->{open_elements}->[$_];
6112 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6113 !!!cp ('t254');
6114 $i = $_;
6115 last INSCOPE;
6116 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6117 !!!cp ('t255');
6118 last INSCOPE;
6119 }
6120 } # INSCOPE
6121 unless (defined $i) {
6122 !!!cp ('t256');
6123 !!!parse-error (type => 'unmatched end tag',
6124 text => $token->{tag_name}, token => $token);
6125 ## Ignore the token
6126 !!!nack ('t256.1');
6127 !!!next-token;
6128 next B;
6129 }
6130
6131 ## Clear back to table body context
6132 while (not ($self->{open_elements}->[-1]->[1]
6133 & TABLE_ROWS_SCOPING_EL)) {
6134 !!!cp ('t257');
6135 ## ISSUE: Can this case be reached?
6136 pop @{$self->{open_elements}};
6137 }
6138
6139 pop @{$self->{open_elements}};
6140 $self->{insertion_mode} = IN_TABLE_IM;
6141 !!!nack ('t257.1');
6142 !!!next-token;
6143 next B;
6144 } elsif ({
6145 body => 1, caption => 1, col => 1, colgroup => 1,
6146 html => 1, td => 1, th => 1,
6147 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
6148 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
6149 }->{$token->{tag_name}}) {
6150 !!!cp ('t258');
6151 !!!parse-error (type => 'unmatched end tag',
6152 text => $token->{tag_name}, token => $token);
6153 ## Ignore the token
6154 !!!nack ('t258.1');
6155 !!!next-token;
6156 next B;
6157 } else {
6158 !!!cp ('t259');
6159 !!!parse-error (type => 'in table:/',
6160 text => $token->{tag_name}, token => $token);
6161
6162 $insert = $insert_to_foster;
6163 #
6164 }
6165 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6166 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6167 @{$self->{open_elements}} == 1) { # redundant, maybe
6168 !!!parse-error (type => 'in body:#eof', token => $token);
6169 !!!cp ('t259.1');
6170 #
6171 } else {
6172 !!!cp ('t259.2');
6173 #
6174 }
6175
6176 ## Stop parsing
6177 last B;
6178 } else {
6179 die "$0: $token->{type}: Unknown token type";
6180 }
6181 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
6182 if ($token->{type} == CHARACTER_TOKEN) {
6183 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6184 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6185 unless (length $token->{data}) {
6186 !!!cp ('t260');
6187 !!!next-token;
6188 next B;
6189 }
6190 }
6191
6192 !!!cp ('t261');
6193 #
6194 } elsif ($token->{type} == START_TAG_TOKEN) {
6195 if ($token->{tag_name} eq 'col') {
6196 !!!cp ('t262');
6197 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6198 pop @{$self->{open_elements}};
6199 !!!ack ('t262.1');
6200 !!!next-token;
6201 next B;
6202 } else {
6203 !!!cp ('t263');
6204 #
6205 }
6206 } elsif ($token->{type} == END_TAG_TOKEN) {
6207 if ($token->{tag_name} eq 'colgroup') {
6208 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
6209 !!!cp ('t264');
6210 !!!parse-error (type => 'unmatched end tag',
6211 text => 'colgroup', token => $token);
6212 ## Ignore the token
6213 !!!next-token;
6214 next B;
6215 } else {
6216 !!!cp ('t265');
6217 pop @{$self->{open_elements}}; # colgroup
6218 $self->{insertion_mode} = IN_TABLE_IM;
6219 !!!next-token;
6220 next B;
6221 }
6222 } elsif ($token->{tag_name} eq 'col') {
6223 !!!cp ('t266');
6224 !!!parse-error (type => 'unmatched end tag',
6225 text => 'col', token => $token);
6226 ## Ignore the token
6227 !!!next-token;
6228 next B;
6229 } else {
6230 !!!cp ('t267');
6231 #
6232 }
6233 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6234 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
6235 @{$self->{open_elements}} == 1) { # redundant, maybe
6236 !!!cp ('t270.2');
6237 ## Stop parsing.
6238 last B;
6239 } else {
6240 ## NOTE: As if </colgroup>.
6241 !!!cp ('t270.1');
6242 pop @{$self->{open_elements}}; # colgroup
6243 $self->{insertion_mode} = IN_TABLE_IM;
6244 ## Reprocess.
6245 next B;
6246 }
6247 } else {
6248 die "$0: $token->{type}: Unknown token type";
6249 }
6250
6251 ## As if </colgroup>
6252 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
6253 !!!cp ('t269');
6254 ## TODO: Wrong error type?
6255 !!!parse-error (type => 'unmatched end tag',
6256 text => 'colgroup', token => $token);
6257 ## Ignore the token
6258 !!!nack ('t269.1');
6259 !!!next-token;
6260 next B;
6261 } else {
6262 !!!cp ('t270');
6263 pop @{$self->{open_elements}}; # colgroup
6264 $self->{insertion_mode} = IN_TABLE_IM;
6265 !!!ack-later;
6266 ## reprocess
6267 next B;
6268 }
6269 } elsif ($self->{insertion_mode} & SELECT_IMS) {
6270 if ($token->{type} == CHARACTER_TOKEN) {
6271 !!!cp ('t271');
6272 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
6273 !!!next-token;
6274 next B;
6275 } elsif ($token->{type} == START_TAG_TOKEN) {
6276 if ($token->{tag_name} eq 'option') {
6277 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6278 !!!cp ('t272');
6279 ## As if </option>
6280 pop @{$self->{open_elements}};
6281 } else {
6282 !!!cp ('t273');
6283 }
6284
6285 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6286 !!!nack ('t273.1');
6287 !!!next-token;
6288 next B;
6289 } elsif ($token->{tag_name} eq 'optgroup') {
6290 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6291 !!!cp ('t274');
6292 ## As if </option>
6293 pop @{$self->{open_elements}};
6294 } else {
6295 !!!cp ('t275');
6296 }
6297
6298 if ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
6299 !!!cp ('t276');
6300 ## As if </optgroup>
6301 pop @{$self->{open_elements}};
6302 } else {
6303 !!!cp ('t277');
6304 }
6305
6306 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6307 !!!nack ('t277.1');
6308 !!!next-token;
6309 next B;
6310 } elsif ({
6311 select => 1, input => 1, textarea => 1,
6312 }->{$token->{tag_name}} or
6313 ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
6314 {
6315 caption => 1, table => 1,
6316 tbody => 1, tfoot => 1, thead => 1,
6317 tr => 1, td => 1, th => 1,
6318 }->{$token->{tag_name}})) {
6319 ## TODO: The type below is not good - <select> is replaced by </select>
6320 !!!parse-error (type => 'not closed', text => 'select',
6321 token => $token);
6322 ## NOTE: As if the token were </select> (<select> case) or
6323 ## as if there were </select> (otherwise).
6324 ## have an element in table scope
6325 my $i;
6326 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6327 my $node = $self->{open_elements}->[$_];
6328 if ($node->[1] & SELECT_EL) {
6329 !!!cp ('t278');
6330 $i = $_;
6331 last INSCOPE;
6332 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6333 !!!cp ('t279');
6334 last INSCOPE;
6335 }
6336 } # INSCOPE
6337 unless (defined $i) {
6338 !!!cp ('t280');
6339 !!!parse-error (type => 'unmatched end tag',
6340 text => 'select', token => $token);
6341 ## Ignore the token
6342 !!!nack ('t280.1');
6343 !!!next-token;
6344 next B;
6345 }
6346
6347 !!!cp ('t281');
6348 splice @{$self->{open_elements}}, $i;
6349
6350 $self->_reset_insertion_mode;
6351
6352 if ($token->{tag_name} eq 'select') {
6353 !!!nack ('t281.2');
6354 !!!next-token;
6355 next B;
6356 } else {
6357 !!!cp ('t281.1');
6358 !!!ack-later;
6359 ## Reprocess the token.
6360 next B;
6361 }
6362 } else {
6363 !!!cp ('t282');
6364 !!!parse-error (type => 'in select',
6365 text => $token->{tag_name}, token => $token);
6366 ## Ignore the token
6367 !!!nack ('t282.1');
6368 !!!next-token;
6369 next B;
6370 }
6371 } elsif ($token->{type} == END_TAG_TOKEN) {
6372 if ($token->{tag_name} eq 'optgroup') {
6373 if ($self->{open_elements}->[-1]->[1] & OPTION_EL and
6374 $self->{open_elements}->[-2]->[1] & OPTGROUP_EL) {
6375 !!!cp ('t283');
6376 ## As if </option>
6377 splice @{$self->{open_elements}}, -2;
6378 } elsif ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
6379 !!!cp ('t284');
6380 pop @{$self->{open_elements}};
6381 } else {
6382 !!!cp ('t285');
6383 !!!parse-error (type => 'unmatched end tag',
6384 text => $token->{tag_name}, token => $token);
6385 ## Ignore the token
6386 }
6387 !!!nack ('t285.1');
6388 !!!next-token;
6389 next B;
6390 } elsif ($token->{tag_name} eq 'option') {
6391 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6392 !!!cp ('t286');
6393 pop @{$self->{open_elements}};
6394 } else {
6395 !!!cp ('t287');
6396 !!!parse-error (type => 'unmatched end tag',
6397 text => $token->{tag_name}, token => $token);
6398 ## Ignore the token
6399 }
6400 !!!nack ('t287.1');
6401 !!!next-token;
6402 next B;
6403 } elsif ($token->{tag_name} eq 'select') {
6404 ## have an element in table scope
6405 my $i;
6406 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6407 my $node = $self->{open_elements}->[$_];
6408 if ($node->[1] & SELECT_EL) {
6409 !!!cp ('t288');
6410 $i = $_;
6411 last INSCOPE;
6412 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6413 !!!cp ('t289');
6414 last INSCOPE;
6415 }
6416 } # INSCOPE
6417 unless (defined $i) {
6418 !!!cp ('t290');
6419 !!!parse-error (type => 'unmatched end tag',
6420 text => $token->{tag_name}, token => $token);
6421 ## Ignore the token
6422 !!!nack ('t290.1');
6423 !!!next-token;
6424 next B;
6425 }
6426
6427 !!!cp ('t291');
6428 splice @{$self->{open_elements}}, $i;
6429
6430 $self->_reset_insertion_mode;
6431
6432 !!!nack ('t291.1');
6433 !!!next-token;
6434 next B;
6435 } elsif ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
6436 {
6437 caption => 1, table => 1, tbody => 1,
6438 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
6439 }->{$token->{tag_name}}) {
6440 ## TODO: The following is wrong?
6441 !!!parse-error (type => 'unmatched end tag',
6442 text => $token->{tag_name}, token => $token);
6443
6444 ## have an element in table scope
6445 my $i;
6446 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6447 my $node = $self->{open_elements}->[$_];
6448 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6449 !!!cp ('t292');
6450 $i = $_;
6451 last INSCOPE;
6452 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6453 !!!cp ('t293');
6454 last INSCOPE;
6455 }
6456 } # INSCOPE
6457 unless (defined $i) {
6458 !!!cp ('t294');
6459 ## Ignore the token
6460 !!!nack ('t294.1');
6461 !!!next-token;
6462 next B;
6463 }
6464
6465 ## As if </select>
6466 ## have an element in table scope
6467 undef $i;
6468 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6469 my $node = $self->{open_elements}->[$_];
6470 if ($node->[1] & SELECT_EL) {
6471 !!!cp ('t295');
6472 $i = $_;
6473 last INSCOPE;
6474 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6475 ## ISSUE: Can this state be reached?
6476 !!!cp ('t296');
6477 last INSCOPE;
6478 }
6479 } # INSCOPE
6480 unless (defined $i) {
6481 !!!cp ('t297');
6482 ## TODO: Is the following error type correct?
6483 !!!parse-error (type => 'unmatched end tag',
6484 text => 'select', token => $token);
6485 ## Ignore the </select> token
6486 !!!nack ('t297.1');
6487 !!!next-token; ## TODO: ok?
6488 next B;
6489 }
6490
6491 !!!cp ('t298');
6492 splice @{$self->{open_elements}}, $i;
6493
6494 $self->_reset_insertion_mode;
6495
6496 !!!ack-later;
6497 ## reprocess
6498 next B;
6499 } else {
6500 !!!cp ('t299');
6501 !!!parse-error (type => 'in select:/',
6502 text => $token->{tag_name}, token => $token);
6503 ## Ignore the token
6504 !!!nack ('t299.3');
6505 !!!next-token;
6506 next B;
6507 }
6508 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6509 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6510 @{$self->{open_elements}} == 1) { # redundant, maybe
6511 !!!cp ('t299.1');
6512 !!!parse-error (type => 'in body:#eof', token => $token);
6513 } else {
6514 !!!cp ('t299.2');
6515 }
6516
6517 ## Stop parsing.
6518 last B;
6519 } else {
6520 die "$0: $token->{type}: Unknown token type";
6521 }
6522 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
6523 if ($token->{type} == CHARACTER_TOKEN) {
6524 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6525 my $data = $1;
6526 ## As if in body
6527 $reconstruct_active_formatting_elements->($insert_to_current);
6528
6529 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6530
6531 unless (length $token->{data}) {
6532 !!!cp ('t300');
6533 !!!next-token;
6534 next B;
6535 }
6536 }
6537
6538 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6539 !!!cp ('t301');
6540 !!!parse-error (type => 'after html:#text', token => $token);
6541
6542 ## Reprocess in the "after body" insertion mode.
6543 } else {
6544 !!!cp ('t302');
6545 }
6546
6547 ## "after body" insertion mode
6548 !!!parse-error (type => 'after body:#text', token => $token);
6549
6550 $self->{insertion_mode} = IN_BODY_IM;
6551 ## reprocess
6552 next B;
6553 } elsif ($token->{type} == START_TAG_TOKEN) {
6554 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6555 !!!cp ('t303');
6556 !!!parse-error (type => 'after html',
6557 text => $token->{tag_name}, token => $token);
6558
6559 ## Reprocess in the "after body" insertion mode.
6560 } else {
6561 !!!cp ('t304');
6562 }
6563
6564 ## "after body" insertion mode
6565 !!!parse-error (type => 'after body',
6566 text => $token->{tag_name}, token => $token);
6567
6568 $self->{insertion_mode} = IN_BODY_IM;
6569 !!!ack-later;
6570 ## reprocess
6571 next B;
6572 } elsif ($token->{type} == END_TAG_TOKEN) {
6573 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6574 !!!cp ('t305');
6575 !!!parse-error (type => 'after html:/',
6576 text => $token->{tag_name}, token => $token);
6577
6578 $self->{insertion_mode} = AFTER_BODY_IM;
6579 ## Reprocess in the "after body" insertion mode.
6580 } else {
6581 !!!cp ('t306');
6582 }
6583
6584 ## "after body" insertion mode
6585 if ($token->{tag_name} eq 'html') {
6586 if (defined $self->{inner_html_node}) {
6587 !!!cp ('t307');
6588 !!!parse-error (type => 'unmatched end tag',
6589 text => 'html', token => $token);
6590 ## Ignore the token
6591 !!!next-token;
6592 next B;
6593 } else {
6594 !!!cp ('t308');
6595 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
6596 !!!next-token;
6597 next B;
6598 }
6599 } else {
6600 !!!cp ('t309');
6601 !!!parse-error (type => 'after body:/',
6602 text => $token->{tag_name}, token => $token);
6603
6604 $self->{insertion_mode} = IN_BODY_IM;
6605 ## reprocess
6606 next B;
6607 }
6608 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6609 !!!cp ('t309.2');
6610 ## Stop parsing
6611 last B;
6612 } else {
6613 die "$0: $token->{type}: Unknown token type";
6614 }
6615 } elsif ($self->{insertion_mode} & FRAME_IMS) {
6616 if ($token->{type} == CHARACTER_TOKEN) {
6617 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6618 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6619
6620 unless (length $token->{data}) {
6621 !!!cp ('t310');
6622 !!!next-token;
6623 next B;
6624 }
6625 }
6626
6627 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
6628 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6629 !!!cp ('t311');
6630 !!!parse-error (type => 'in frameset:#text', token => $token);
6631 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6632 !!!cp ('t312');
6633 !!!parse-error (type => 'after frameset:#text', token => $token);
6634 } else { # "after after frameset"
6635 !!!cp ('t313');
6636 !!!parse-error (type => 'after html:#text', token => $token);
6637 }
6638
6639 ## Ignore the token.
6640 if (length $token->{data}) {
6641 !!!cp ('t314');
6642 ## reprocess the rest of the characters
6643 } else {
6644 !!!cp ('t315');
6645 !!!next-token;
6646 }
6647 next B;
6648 }
6649
6650 die qq[$0: Character "$token->{data}"];
6651 } elsif ($token->{type} == START_TAG_TOKEN) {
6652 if ($token->{tag_name} eq 'frameset' and
6653 $self->{insertion_mode} == IN_FRAMESET_IM) {
6654 !!!cp ('t318');
6655 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6656 !!!nack ('t318.1');
6657 !!!next-token;
6658 next B;
6659 } elsif ($token->{tag_name} eq 'frame' and
6660 $self->{insertion_mode} == IN_FRAMESET_IM) {
6661 !!!cp ('t319');
6662 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6663 pop @{$self->{open_elements}};
6664 !!!ack ('t319.1');
6665 !!!next-token;
6666 next B;
6667 } elsif ($token->{tag_name} eq 'noframes') {
6668 !!!cp ('t320');
6669 ## NOTE: As if in head.
6670 $parse_rcdata->(CDATA_CONTENT_MODEL);
6671 next B;
6672
6673 ## NOTE: |<!DOCTYPE HTML><frameset></frameset></html><noframes></noframes>|
6674 ## has no parse error.
6675 } else {
6676 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6677 !!!cp ('t321');
6678 !!!parse-error (type => 'in frameset',
6679 text => $token->{tag_name}, token => $token);
6680 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6681 !!!cp ('t322');
6682 !!!parse-error (type => 'after frameset',
6683 text => $token->{tag_name}, token => $token);
6684 } else { # "after after frameset"
6685 !!!cp ('t322.2');
6686 !!!parse-error (type => 'after after frameset',
6687 text => $token->{tag_name}, token => $token);
6688 }
6689 ## Ignore the token
6690 !!!nack ('t322.1');
6691 !!!next-token;
6692 next B;
6693 }
6694 } elsif ($token->{type} == END_TAG_TOKEN) {
6695 if ($token->{tag_name} eq 'frameset' and
6696 $self->{insertion_mode} == IN_FRAMESET_IM) {
6697 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
6698 @{$self->{open_elements}} == 1) {
6699 !!!cp ('t325');
6700 !!!parse-error (type => 'unmatched end tag',
6701 text => $token->{tag_name}, token => $token);
6702 ## Ignore the token
6703 !!!next-token;
6704 } else {
6705 !!!cp ('t326');
6706 pop @{$self->{open_elements}};
6707 !!!next-token;
6708 }
6709
6710 if (not defined $self->{inner_html_node} and
6711 not ($self->{open_elements}->[-1]->[1] & FRAMESET_EL)) {
6712 !!!cp ('t327');
6713 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6714 } else {
6715 !!!cp ('t328');
6716 }
6717 next B;
6718 } elsif ($token->{tag_name} eq 'html' and
6719 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
6720 !!!cp ('t329');
6721 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
6722 !!!next-token;
6723 next B;
6724 } else {
6725 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6726 !!!cp ('t330');
6727 !!!parse-error (type => 'in frameset:/',
6728 text => $token->{tag_name}, token => $token);
6729 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6730 !!!cp ('t330.1');
6731 !!!parse-error (type => 'after frameset:/',
6732 text => $token->{tag_name}, token => $token);
6733 } else { # "after after frameset"
6734 !!!cp ('t331');
6735 !!!parse-error (type => 'after after frameset:/',
6736 text => $token->{tag_name}, token => $token);
6737 }
6738 ## Ignore the token
6739 !!!next-token;
6740 next B;
6741 }
6742 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6743 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6744 @{$self->{open_elements}} == 1) { # redundant, maybe
6745 !!!cp ('t331.1');
6746 !!!parse-error (type => 'in body:#eof', token => $token);
6747 } else {
6748 !!!cp ('t331.2');
6749 }
6750
6751 ## Stop parsing
6752 last B;
6753 } else {
6754 die "$0: $token->{type}: Unknown token type";
6755 }
6756
6757 ## ISSUE: An issue in spec here
6758 } else {
6759 die "$0: $self->{insertion_mode}: Unknown insertion mode";
6760 }
6761
6762 ## "in body" insertion mode
6763 if ($token->{type} == START_TAG_TOKEN) {
6764 if ($token->{tag_name} eq 'script') {
6765 !!!cp ('t332');
6766 ## NOTE: This is an "as if in head" code clone
6767 $script_start_tag->();
6768 next B;
6769 } elsif ($token->{tag_name} eq 'style') {
6770 !!!cp ('t333');
6771 ## NOTE: This is an "as if in head" code clone
6772 $parse_rcdata->(CDATA_CONTENT_MODEL);
6773 next B;
6774 } elsif ({
6775 base => 1, link => 1,
6776 }->{$token->{tag_name}}) {
6777 !!!cp ('t334');
6778 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6779 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6780 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6781 !!!ack ('t334.1');
6782 !!!next-token;
6783 next B;
6784 } elsif ($token->{tag_name} eq 'meta') {
6785 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6786 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6787 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6788
6789 unless ($self->{confident}) {
6790 if ($token->{attributes}->{charset}) {
6791 !!!cp ('t335');
6792 ## NOTE: Whether the encoding is supported or not is handled
6793 ## in the {change_encoding} callback.
6794 $self->{change_encoding}
6795 ->($self, $token->{attributes}->{charset}->{value}, $token);
6796
6797 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6798 ->set_user_data (manakai_has_reference =>
6799 $token->{attributes}->{charset}
6800 ->{has_reference});
6801 } elsif ($token->{attributes}->{content}) {
6802 if ($token->{attributes}->{content}->{value}
6803 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
6804 [\x09-\x0D\x20]*=
6805 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
6806 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
6807 !!!cp ('t336');
6808 ## NOTE: Whether the encoding is supported or not is handled
6809 ## in the {change_encoding} callback.
6810 $self->{change_encoding}
6811 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
6812 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6813 ->set_user_data (manakai_has_reference =>
6814 $token->{attributes}->{content}
6815 ->{has_reference});
6816 }
6817 }
6818 } else {
6819 if ($token->{attributes}->{charset}) {
6820 !!!cp ('t337');
6821 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6822 ->set_user_data (manakai_has_reference =>
6823 $token->{attributes}->{charset}
6824 ->{has_reference});
6825 }
6826 if ($token->{attributes}->{content}) {
6827 !!!cp ('t338');
6828 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6829 ->set_user_data (manakai_has_reference =>
6830 $token->{attributes}->{content}
6831 ->{has_reference});
6832 }
6833 }
6834
6835 !!!ack ('t338.1');
6836 !!!next-token;
6837 next B;
6838 } elsif ($token->{tag_name} eq 'title') {
6839 !!!cp ('t341');
6840 ## NOTE: This is an "as if in head" code clone
6841 $parse_rcdata->(RCDATA_CONTENT_MODEL);
6842 next B;
6843 } elsif ($token->{tag_name} eq 'body') {
6844 !!!parse-error (type => 'in body', text => 'body', token => $token);
6845
6846 if (@{$self->{open_elements}} == 1 or
6847 not ($self->{open_elements}->[1]->[1] & BODY_EL)) {
6848 !!!cp ('t342');
6849 ## Ignore the token
6850 } else {
6851 my $body_el = $self->{open_elements}->[1]->[0];
6852 for my $attr_name (keys %{$token->{attributes}}) {
6853 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
6854 !!!cp ('t343');
6855 $body_el->set_attribute_ns
6856 (undef, [undef, $attr_name],
6857 $token->{attributes}->{$attr_name}->{value});
6858 }
6859 }
6860 }
6861 !!!nack ('t343.1');
6862 !!!next-token;
6863 next B;
6864 } elsif ({
6865 address => 1, blockquote => 1, center => 1, dir => 1,
6866 div => 1, dl => 1, fieldset => 1,
6867 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6868 menu => 1, ol => 1, p => 1, ul => 1,
6869 pre => 1, listing => 1,
6870 form => 1,
6871 table => 1,
6872 hr => 1,
6873 }->{$token->{tag_name}}) {
6874 if ($token->{tag_name} eq 'form' and defined $self->{form_element}) {
6875 !!!cp ('t350');
6876 !!!parse-error (type => 'in form:form', token => $token);
6877 ## Ignore the token
6878 !!!nack ('t350.1');
6879 !!!next-token;
6880 next B;
6881 }
6882
6883 ## has a p element in scope
6884 INSCOPE: for (reverse @{$self->{open_elements}}) {
6885 if ($_->[1] & P_EL) {
6886 !!!cp ('t344');
6887 !!!back-token; # <form>
6888 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6889 line => $token->{line}, column => $token->{column}};
6890 next B;
6891 } elsif ($_->[1] & SCOPING_EL) {
6892 !!!cp ('t345');
6893 last INSCOPE;
6894 }
6895 } # INSCOPE
6896
6897 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6898 if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') {
6899 !!!nack ('t346.1');
6900 !!!next-token;
6901 if ($token->{type} == CHARACTER_TOKEN) {
6902 $token->{data} =~ s/^\x0A//;
6903 unless (length $token->{data}) {
6904 !!!cp ('t346');
6905 !!!next-token;
6906 } else {
6907 !!!cp ('t349');
6908 }
6909 } else {
6910 !!!cp ('t348');
6911 }
6912 } elsif ($token->{tag_name} eq 'form') {
6913 !!!cp ('t347.1');
6914 $self->{form_element} = $self->{open_elements}->[-1]->[0];
6915
6916 !!!nack ('t347.2');
6917 !!!next-token;
6918 } elsif ($token->{tag_name} eq 'table') {
6919 !!!cp ('t382');
6920 push @{$open_tables}, [$self->{open_elements}->[-1]->[0]];
6921
6922 $self->{insertion_mode} = IN_TABLE_IM;
6923
6924 !!!nack ('t382.1');
6925 !!!next-token;
6926 } elsif ($token->{tag_name} eq 'hr') {
6927 !!!cp ('t386');
6928 pop @{$self->{open_elements}};
6929
6930 !!!nack ('t386.1');
6931 !!!next-token;
6932 } else {
6933 !!!nack ('t347.1');
6934 !!!next-token;
6935 }
6936 next B;
6937 } elsif ({li => 1, dt => 1, dd => 1}->{$token->{tag_name}}) {
6938 ## has a p element in scope
6939 INSCOPE: for (reverse @{$self->{open_elements}}) {
6940 if ($_->[1] & P_EL) {
6941 !!!cp ('t353');
6942 !!!back-token; # <x>
6943 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6944 line => $token->{line}, column => $token->{column}};
6945 next B;
6946 } elsif ($_->[1] & SCOPING_EL) {
6947 !!!cp ('t354');
6948 last INSCOPE;
6949 }
6950 } # INSCOPE
6951
6952 ## Step 1
6953 my $i = -1;
6954 my $node = $self->{open_elements}->[$i];
6955 my $li_or_dtdd = {li => {li => 1},
6956 dt => {dt => 1, dd => 1},
6957 dd => {dt => 1, dd => 1}}->{$token->{tag_name}};
6958 LI: {
6959 ## Step 2
6960 if ($li_or_dtdd->{$node->[0]->manakai_local_name}) {
6961 if ($i != -1) {
6962 !!!cp ('t355');
6963 !!!parse-error (type => 'not closed',
6964 text => $self->{open_elements}->[-1]->[0]
6965 ->manakai_local_name,
6966 token => $token);
6967 } else {
6968 !!!cp ('t356');
6969 }
6970 splice @{$self->{open_elements}}, $i;
6971 last LI;
6972 } else {
6973 !!!cp ('t357');
6974 }
6975
6976 ## Step 3
6977 if (not ($node->[1] & FORMATTING_EL) and
6978 #not $phrasing_category->{$node->[1]} and
6979 ($node->[1] & SPECIAL_EL or
6980 $node->[1] & SCOPING_EL) and
6981 not ($node->[1] & ADDRESS_EL) and
6982 not ($node->[1] & DIV_EL)) {
6983 !!!cp ('t358');
6984 last LI;
6985 }
6986
6987 !!!cp ('t359');
6988 ## Step 4
6989 $i--;
6990 $node = $self->{open_elements}->[$i];
6991 redo LI;
6992 } # LI
6993
6994 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6995 !!!nack ('t359.1');
6996 !!!next-token;
6997 next B;
6998 } elsif ($token->{tag_name} eq 'plaintext') {
6999 ## has a p element in scope
7000 INSCOPE: for (reverse @{$self->{open_elements}}) {
7001 if ($_->[1] & P_EL) {
7002 !!!cp ('t367');
7003 !!!back-token; # <plaintext>
7004 $token = {type => END_TAG_TOKEN, tag_name => 'p',
7005 line => $token->{line}, column => $token->{column}};
7006 next B;
7007 } elsif ($_->[1] & SCOPING_EL) {
7008 !!!cp ('t368');
7009 last INSCOPE;
7010 }
7011 } # INSCOPE
7012
7013 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7014
7015 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
7016
7017 !!!nack ('t368.1');
7018 !!!next-token;
7019 next B;
7020 } elsif ($token->{tag_name} eq 'a') {
7021 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
7022 my $node = $active_formatting_elements->[$i];
7023 if ($node->[1] & A_EL) {
7024 !!!cp ('t371');
7025 !!!parse-error (type => 'in a:a', token => $token);
7026
7027 !!!back-token; # <a>
7028 $token = {type => END_TAG_TOKEN, tag_name => 'a',
7029 line => $token->{line}, column => $token->{column}};
7030 $formatting_end_tag->($token);
7031
7032 AFE2: for (reverse 0..$#$active_formatting_elements) {
7033 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
7034 !!!cp ('t372');
7035 splice @$active_formatting_elements, $_, 1;
7036 last AFE2;
7037 }
7038 } # AFE2
7039 OE: for (reverse 0..$#{$self->{open_elements}}) {
7040 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
7041 !!!cp ('t373');
7042 splice @{$self->{open_elements}}, $_, 1;
7043 last OE;
7044 }
7045 } # OE
7046 last AFE;
7047 } elsif ($node->[0] eq '#marker') {
7048 !!!cp ('t374');
7049 last AFE;
7050 }
7051 } # AFE
7052
7053 $reconstruct_active_formatting_elements->($insert_to_current);
7054
7055 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7056 push @$active_formatting_elements, $self->{open_elements}->[-1];
7057
7058 !!!nack ('t374.1');
7059 !!!next-token;
7060 next B;
7061 } elsif ($token->{tag_name} eq 'nobr') {
7062 $reconstruct_active_formatting_elements->($insert_to_current);
7063
7064 ## has a |nobr| element in scope
7065 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7066 my $node = $self->{open_elements}->[$_];
7067 if ($node->[1] & NOBR_EL) {
7068 !!!cp ('t376');
7069 !!!parse-error (type => 'in nobr:nobr', token => $token);
7070 !!!back-token; # <nobr>
7071 $token = {type => END_TAG_TOKEN, tag_name => 'nobr',
7072 line => $token->{line}, column => $token->{column}};
7073 next B;
7074 } elsif ($node->[1] & SCOPING_EL) {
7075 !!!cp ('t377');
7076 last INSCOPE;
7077 }
7078 } # INSCOPE
7079
7080 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7081 push @$active_formatting_elements, $self->{open_elements}->[-1];
7082
7083 !!!nack ('t377.1');
7084 !!!next-token;
7085 next B;
7086 } elsif ($token->{tag_name} eq 'button') {
7087 ## has a button element in scope
7088 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7089 my $node = $self->{open_elements}->[$_];
7090 if ($node->[1] & BUTTON_EL) {
7091 !!!cp ('t378');
7092 !!!parse-error (type => 'in button:button', token => $token);
7093 !!!back-token; # <button>
7094 $token = {type => END_TAG_TOKEN, tag_name => 'button',
7095 line => $token->{line}, column => $token->{column}};
7096 next B;
7097 } elsif ($node->[1] & SCOPING_EL) {
7098 !!!cp ('t379');
7099 last INSCOPE;
7100 }
7101 } # INSCOPE
7102
7103 $reconstruct_active_formatting_elements->($insert_to_current);
7104
7105 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7106
7107 ## TODO: associate with $self->{form_element} if defined
7108
7109 push @$active_formatting_elements, ['#marker', ''];
7110
7111 !!!nack ('t379.1');
7112 !!!next-token;
7113 next B;
7114 } elsif ({
7115 xmp => 1,
7116 iframe => 1,
7117 noembed => 1,
7118 noframes => 1, ## NOTE: This is an "as if in head" code clone.
7119 noscript => 0, ## TODO: 1 if scripting is enabled
7120 }->{$token->{tag_name}}) {
7121 if ($token->{tag_name} eq 'xmp') {
7122 !!!cp ('t381');
7123 $reconstruct_active_formatting_elements->($insert_to_current);
7124 } else {
7125 !!!cp ('t399');
7126 }
7127 ## NOTE: There is an "as if in body" code clone.
7128 $parse_rcdata->(CDATA_CONTENT_MODEL);
7129 next B;
7130 } elsif ($token->{tag_name} eq 'isindex') {
7131 !!!parse-error (type => 'isindex', token => $token);
7132
7133 if (defined $self->{form_element}) {
7134 !!!cp ('t389');
7135 ## Ignore the token
7136 !!!nack ('t389'); ## NOTE: Not acknowledged.
7137 !!!next-token;
7138 next B;
7139 } else {
7140 !!!ack ('t391.1');
7141
7142 my $at = $token->{attributes};
7143 my $form_attrs;
7144 $form_attrs->{action} = $at->{action} if $at->{action};
7145 my $prompt_attr = $at->{prompt};
7146 $at->{name} = {name => 'name', value => 'isindex'};
7147 delete $at->{action};
7148 delete $at->{prompt};
7149 my @tokens = (
7150 {type => START_TAG_TOKEN, tag_name => 'form',
7151 attributes => $form_attrs,
7152 line => $token->{line}, column => $token->{column}},
7153 {type => START_TAG_TOKEN, tag_name => 'hr',
7154 line => $token->{line}, column => $token->{column}},
7155 {type => START_TAG_TOKEN, tag_name => 'p',
7156 line => $token->{line}, column => $token->{column}},
7157 {type => START_TAG_TOKEN, tag_name => 'label',
7158 line => $token->{line}, column => $token->{column}},
7159 );
7160 if ($prompt_attr) {
7161 !!!cp ('t390');
7162 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value},
7163 #line => $token->{line}, column => $token->{column},
7164 };
7165 } else {
7166 !!!cp ('t391');
7167 push @tokens, {type => CHARACTER_TOKEN,
7168 data => 'This is a searchable index. Insert your search keywords here: ',
7169 #line => $token->{line}, column => $token->{column},
7170 }; # SHOULD
7171 ## TODO: make this configurable
7172 }
7173 push @tokens,
7174 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at,
7175 line => $token->{line}, column => $token->{column}},
7176 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
7177 {type => END_TAG_TOKEN, tag_name => 'label',
7178 line => $token->{line}, column => $token->{column}},
7179 {type => END_TAG_TOKEN, tag_name => 'p',
7180 line => $token->{line}, column => $token->{column}},
7181 {type => START_TAG_TOKEN, tag_name => 'hr',
7182 line => $token->{line}, column => $token->{column}},
7183 {type => END_TAG_TOKEN, tag_name => 'form',
7184 line => $token->{line}, column => $token->{column}};
7185 !!!back-token (@tokens);
7186 !!!next-token;
7187 next B;
7188 }
7189 } elsif ($token->{tag_name} eq 'textarea') {
7190 my $tag_name = $token->{tag_name};
7191 my $el;
7192 !!!create-element ($el, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
7193
7194 ## TODO: $self->{form_element} if defined
7195 $self->{content_model} = RCDATA_CONTENT_MODEL;
7196 delete $self->{escape}; # MUST
7197
7198 $insert->($el);
7199
7200 my $text = '';
7201 !!!nack ('t392.1');
7202 !!!next-token;
7203 if ($token->{type} == CHARACTER_TOKEN) {
7204 $token->{data} =~ s/^\x0A//;
7205 unless (length $token->{data}) {
7206 !!!cp ('t392');
7207 !!!next-token;
7208 } else {
7209 !!!cp ('t393');
7210 }
7211 } else {
7212 !!!cp ('t394');
7213 }
7214 while ($token->{type} == CHARACTER_TOKEN) {
7215 !!!cp ('t395');
7216 $text .= $token->{data};
7217 !!!next-token;
7218 }
7219 if (length $text) {
7220 !!!cp ('t396');
7221 $el->manakai_append_text ($text);
7222 }
7223
7224 $self->{content_model} = PCDATA_CONTENT_MODEL;
7225
7226 if ($token->{type} == END_TAG_TOKEN and
7227 $token->{tag_name} eq $tag_name) {
7228 !!!cp ('t397');
7229 ## Ignore the token
7230 } else {
7231 !!!cp ('t398');
7232 !!!parse-error (type => 'in RCDATA:#eof', token => $token);
7233 }
7234 !!!next-token;
7235 next B;
7236 } elsif ($token->{tag_name} eq 'rt' or
7237 $token->{tag_name} eq 'rp') {
7238 ## has a |ruby| element in scope
7239 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7240 my $node = $self->{open_elements}->[$_];
7241 if ($node->[1] & RUBY_EL) {
7242 !!!cp ('t398.1');
7243 ## generate implied end tags
7244 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7245 !!!cp ('t398.2');
7246 pop @{$self->{open_elements}};
7247 }
7248 unless ($self->{open_elements}->[-1]->[1] & RUBY_EL) {
7249 !!!cp ('t398.3');
7250 !!!parse-error (type => 'not closed',
7251 text => $self->{open_elements}->[-1]->[0]
7252 ->manakai_local_name,
7253 token => $token);
7254 pop @{$self->{open_elements}}
7255 while not $self->{open_elements}->[-1]->[1] & RUBY_EL;
7256 }
7257 last INSCOPE;
7258 } elsif ($node->[1] & SCOPING_EL) {
7259 !!!cp ('t398.4');
7260 last INSCOPE;
7261 }
7262 } # INSCOPE
7263
7264 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7265
7266 !!!nack ('t398.5');
7267 !!!next-token;
7268 redo B;
7269 } elsif ($token->{tag_name} eq 'math' or
7270 $token->{tag_name} eq 'svg') {
7271 $reconstruct_active_formatting_elements->($insert_to_current);
7272
7273 ## "Adjust MathML attributes" ('math' only) - done in insert-element-f
7274
7275 ## "adjust SVG attributes" ('svg' only) - done in insert-element-f
7276
7277 ## "adjust foreign attributes" - done in insert-element-f
7278
7279 !!!insert-element-f ($token->{tag_name} eq 'math' ? $MML_NS : $SVG_NS, $token->{tag_name}, $token->{attributes}, $token);
7280
7281 if ($self->{self_closing}) {
7282 pop @{$self->{open_elements}};
7283 !!!ack ('t398.1');
7284 } else {
7285 !!!cp ('t398.2');
7286 $self->{insertion_mode} |= IN_FOREIGN_CONTENT_IM;
7287 ## NOTE: |<body><math><mi><svg>| -> "in foreign content" insertion
7288 ## mode, "in body" (not "in foreign content") secondary insertion
7289 ## mode, maybe.
7290 }
7291
7292 !!!next-token;
7293 next B;
7294 } elsif ({
7295 caption => 1, col => 1, colgroup => 1, frame => 1,
7296 frameset => 1, head => 1, option => 1, optgroup => 1,
7297 tbody => 1, td => 1, tfoot => 1, th => 1,
7298 thead => 1, tr => 1,
7299 }->{$token->{tag_name}}) {
7300 !!!cp ('t401');
7301 !!!parse-error (type => 'in body',
7302 text => $token->{tag_name}, token => $token);
7303 ## Ignore the token
7304 !!!nack ('t401.1'); ## NOTE: |<col/>| or |<frame/>| here is an error.
7305 !!!next-token;
7306 next B;
7307
7308 ## ISSUE: An issue on HTML5 new elements in the spec.
7309 } else {
7310 if ($token->{tag_name} eq 'image') {
7311 !!!cp ('t384');
7312 !!!parse-error (type => 'image', token => $token);
7313 $token->{tag_name} = 'img';
7314 } else {
7315 !!!cp ('t385');
7316 }
7317
7318 ## NOTE: There is an "as if <br>" code clone.
7319 $reconstruct_active_formatting_elements->($insert_to_current);
7320
7321 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7322
7323 if ({
7324 applet => 1, marquee => 1, object => 1,
7325 }->{$token->{tag_name}}) {
7326 !!!cp ('t380');
7327 push @$active_formatting_elements, ['#marker', ''];
7328 !!!nack ('t380.1');
7329 } elsif ({
7330 b => 1, big => 1, em => 1, font => 1, i => 1,
7331 s => 1, small => 1, strile => 1,
7332 strong => 1, tt => 1, u => 1,
7333 }->{$token->{tag_name}}) {
7334 !!!cp ('t375');
7335 push @$active_formatting_elements, $self->{open_elements}->[-1];
7336 !!!nack ('t375.1');
7337 } elsif ($token->{tag_name} eq 'input') {
7338 !!!cp ('t388');
7339 ## TODO: associate with $self->{form_element} if defined
7340 pop @{$self->{open_elements}};
7341 !!!ack ('t388.2');
7342 } elsif ({
7343 area => 1, basefont => 1, bgsound => 1, br => 1,
7344 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
7345 #image => 1,
7346 }->{$token->{tag_name}}) {
7347 !!!cp ('t388.1');
7348 pop @{$self->{open_elements}};
7349 !!!ack ('t388.3');
7350 } elsif ($token->{tag_name} eq 'select') {
7351 ## TODO: associate with $self->{form_element} if defined
7352
7353 if ($self->{insertion_mode} & TABLE_IMS or
7354 $self->{insertion_mode} & BODY_TABLE_IMS or
7355 $self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
7356 !!!cp ('t400.1');
7357 $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM;
7358 } else {
7359 !!!cp ('t400.2');
7360 $self->{insertion_mode} = IN_SELECT_IM;
7361 }
7362 !!!nack ('t400.3');
7363 } else {
7364 !!!nack ('t402');
7365 }
7366
7367 !!!next-token;
7368 next B;
7369 }
7370 } elsif ($token->{type} == END_TAG_TOKEN) {
7371 if ($token->{tag_name} eq 'body') {
7372 ## has a |body| element in scope
7373 my $i;
7374 INSCOPE: {
7375 for (reverse @{$self->{open_elements}}) {
7376 if ($_->[1] & BODY_EL) {
7377 !!!cp ('t405');
7378 $i = $_;
7379 last INSCOPE;
7380 } elsif ($_->[1] & SCOPING_EL) {
7381 !!!cp ('t405.1');
7382 last;
7383 }
7384 }
7385
7386 !!!parse-error (type => 'start tag not allowed',
7387 text => $token->{tag_name}, token => $token);
7388 ## NOTE: Ignore the token.
7389 !!!next-token;
7390 next B;
7391 } # INSCOPE
7392
7393 for (@{$self->{open_elements}}) {
7394 unless ($_->[1] & ALL_END_TAG_OPTIONAL_EL) {
7395 !!!cp ('t403');
7396 !!!parse-error (type => 'not closed',
7397 text => $_->[0]->manakai_local_name,
7398 token => $token);
7399 last;
7400 } else {
7401 !!!cp ('t404');
7402 }
7403 }
7404
7405 $self->{insertion_mode} = AFTER_BODY_IM;
7406 !!!next-token;
7407 next B;
7408 } elsif ($token->{tag_name} eq 'html') {
7409 ## TODO: Update this code. It seems that the code below is not
7410 ## up-to-date, though it has same effect as speced.
7411 if (@{$self->{open_elements}} > 1 and
7412 $self->{open_elements}->[1]->[1] & BODY_EL) {
7413 ## ISSUE: There is an issue in the spec.
7414 unless ($self->{open_elements}->[-1]->[1] & BODY_EL) {
7415 !!!cp ('t406');
7416 !!!parse-error (type => 'not closed',
7417 text => $self->{open_elements}->[1]->[0]
7418 ->manakai_local_name,
7419 token => $token);
7420 } else {
7421 !!!cp ('t407');
7422 }
7423 $self->{insertion_mode} = AFTER_BODY_IM;
7424 ## reprocess
7425 next B;
7426 } else {
7427 !!!cp ('t408');
7428 !!!parse-error (type => 'unmatched end tag',
7429 text => $token->{tag_name}, token => $token);
7430 ## Ignore the token
7431 !!!next-token;
7432 next B;
7433 }
7434 } elsif ({
7435 address => 1, blockquote => 1, center => 1, dir => 1,
7436 div => 1, dl => 1, fieldset => 1, listing => 1,
7437 menu => 1, ol => 1, pre => 1, ul => 1,
7438 dd => 1, dt => 1, li => 1,
7439 applet => 1, button => 1, marquee => 1, object => 1,
7440 }->{$token->{tag_name}}) {
7441 ## has an element in scope
7442 my $i;
7443 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7444 my $node = $self->{open_elements}->[$_];
7445 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7446 !!!cp ('t410');
7447 $i = $_;
7448 last INSCOPE;
7449 } elsif ($node->[1] & SCOPING_EL) {
7450 !!!cp ('t411');
7451 last INSCOPE;
7452 }
7453 } # INSCOPE
7454
7455 unless (defined $i) { # has an element in scope
7456 !!!cp ('t413');
7457 !!!parse-error (type => 'unmatched end tag',
7458 text => $token->{tag_name}, token => $token);
7459 ## NOTE: Ignore the token.
7460 } else {
7461 ## Step 1. generate implied end tags
7462 while ({
7463 ## END_TAG_OPTIONAL_EL
7464 dd => ($token->{tag_name} ne 'dd'),
7465 dt => ($token->{tag_name} ne 'dt'),
7466 li => ($token->{tag_name} ne 'li'),
7467 p => 1,
7468 rt => 1,
7469 rp => 1,
7470 }->{$self->{open_elements}->[-1]->[0]->manakai_local_name}) {
7471 !!!cp ('t409');
7472 pop @{$self->{open_elements}};
7473 }
7474
7475 ## Step 2.
7476 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7477 ne $token->{tag_name}) {
7478 !!!cp ('t412');
7479 !!!parse-error (type => 'not closed',
7480 text => $self->{open_elements}->[-1]->[0]
7481 ->manakai_local_name,
7482 token => $token);
7483 } else {
7484 !!!cp ('t414');
7485 }
7486
7487 ## Step 3.
7488 splice @{$self->{open_elements}}, $i;
7489
7490 ## Step 4.
7491 $clear_up_to_marker->()
7492 if {
7493 applet => 1, button => 1, marquee => 1, object => 1,
7494 }->{$token->{tag_name}};
7495 }
7496 !!!next-token;
7497 next B;
7498 } elsif ($token->{tag_name} eq 'form') {
7499 undef $self->{form_element};
7500
7501 ## has an element in scope
7502 my $i;
7503 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7504 my $node = $self->{open_elements}->[$_];
7505 if ($node->[1] & FORM_EL) {
7506 !!!cp ('t418');
7507 $i = $_;
7508 last INSCOPE;
7509 } elsif ($node->[1] & SCOPING_EL) {
7510 !!!cp ('t419');
7511 last INSCOPE;
7512 }
7513 } # INSCOPE
7514
7515 unless (defined $i) { # has an element in scope
7516 !!!cp ('t421');
7517 !!!parse-error (type => 'unmatched end tag',
7518 text => $token->{tag_name}, token => $token);
7519 ## NOTE: Ignore the token.
7520 } else {
7521 ## Step 1. generate implied end tags
7522 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7523 !!!cp ('t417');
7524 pop @{$self->{open_elements}};
7525 }
7526
7527 ## Step 2.
7528 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7529 ne $token->{tag_name}) {
7530 !!!cp ('t417.1');
7531 !!!parse-error (type => 'not closed',
7532 text => $self->{open_elements}->[-1]->[0]
7533 ->manakai_local_name,
7534 token => $token);
7535 } else {
7536 !!!cp ('t420');
7537 }
7538
7539 ## Step 3.
7540 splice @{$self->{open_elements}}, $i;
7541 }
7542
7543 !!!next-token;
7544 next B;
7545 } elsif ({
7546 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
7547 }->{$token->{tag_name}}) {
7548 ## has an element in scope
7549 my $i;
7550 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7551 my $node = $self->{open_elements}->[$_];
7552 if ($node->[1] & HEADING_EL) {
7553 !!!cp ('t423');
7554 $i = $_;
7555 last INSCOPE;
7556 } elsif ($node->[1] & SCOPING_EL) {
7557 !!!cp ('t424');
7558 last INSCOPE;
7559 }
7560 } # INSCOPE
7561
7562 unless (defined $i) { # has an element in scope
7563 !!!cp ('t425.1');
7564 !!!parse-error (type => 'unmatched end tag',
7565 text => $token->{tag_name}, token => $token);
7566 ## NOTE: Ignore the token.
7567 } else {
7568 ## Step 1. generate implied end tags
7569 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7570 !!!cp ('t422');
7571 pop @{$self->{open_elements}};
7572 }
7573
7574 ## Step 2.
7575 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7576 ne $token->{tag_name}) {
7577 !!!cp ('t425');
7578 !!!parse-error (type => 'unmatched end tag',
7579 text => $token->{tag_name}, token => $token);
7580 } else {
7581 !!!cp ('t426');
7582 }
7583
7584 ## Step 3.
7585 splice @{$self->{open_elements}}, $i;
7586 }
7587
7588 !!!next-token;
7589 next B;
7590 } elsif ($token->{tag_name} eq 'p') {
7591 ## has an element in scope
7592 my $i;
7593 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7594 my $node = $self->{open_elements}->[$_];
7595 if ($node->[1] & P_EL) {
7596 !!!cp ('t410.1');
7597 $i = $_;
7598 last INSCOPE;
7599 } elsif ($node->[1] & SCOPING_EL) {
7600 !!!cp ('t411.1');
7601 last INSCOPE;
7602 }
7603 } # INSCOPE
7604
7605 if (defined $i) {
7606 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7607 ne $token->{tag_name}) {
7608 !!!cp ('t412.1');
7609 !!!parse-error (type => 'not closed',
7610 text => $self->{open_elements}->[-1]->[0]
7611 ->manakai_local_name,
7612 token => $token);
7613 } else {
7614 !!!cp ('t414.1');
7615 }
7616
7617 splice @{$self->{open_elements}}, $i;
7618 } else {
7619 !!!cp ('t413.1');
7620 !!!parse-error (type => 'unmatched end tag',
7621 text => $token->{tag_name}, token => $token);
7622
7623 !!!cp ('t415.1');
7624 ## As if <p>, then reprocess the current token
7625 my $el;
7626 !!!create-element ($el, $HTML_NS, 'p',, $token);
7627 $insert->($el);
7628 ## NOTE: Not inserted into |$self->{open_elements}|.
7629 }
7630
7631 !!!next-token;
7632 next B;
7633 } elsif ({
7634 a => 1,
7635 b => 1, big => 1, em => 1, font => 1, i => 1,
7636 nobr => 1, s => 1, small => 1, strile => 1,
7637 strong => 1, tt => 1, u => 1,
7638 }->{$token->{tag_name}}) {
7639 !!!cp ('t427');
7640 $formatting_end_tag->($token);
7641 next B;
7642 } elsif ($token->{tag_name} eq 'br') {
7643 !!!cp ('t428');
7644 !!!parse-error (type => 'unmatched end tag',
7645 text => 'br', token => $token);
7646
7647 ## As if <br>
7648 $reconstruct_active_formatting_elements->($insert_to_current);
7649
7650 my $el;
7651 !!!create-element ($el, $HTML_NS, 'br',, $token);
7652 $insert->($el);
7653
7654 ## Ignore the token.
7655 !!!next-token;
7656 next B;
7657 } elsif ({
7658 caption => 1, col => 1, colgroup => 1, frame => 1,
7659 frameset => 1, head => 1, option => 1, optgroup => 1,
7660 tbody => 1, td => 1, tfoot => 1, th => 1,
7661 thead => 1, tr => 1,
7662 area => 1, basefont => 1, bgsound => 1,
7663 embed => 1, hr => 1, iframe => 1, image => 1,
7664 img => 1, input => 1, isindex => 1, noembed => 1,
7665 noframes => 1, param => 1, select => 1, spacer => 1,
7666 table => 1, textarea => 1, wbr => 1,
7667 noscript => 0, ## TODO: if scripting is enabled
7668 }->{$token->{tag_name}}) {
7669 !!!cp ('t429');
7670 !!!parse-error (type => 'unmatched end tag',
7671 text => $token->{tag_name}, token => $token);
7672 ## Ignore the token
7673 !!!next-token;
7674 next B;
7675
7676 ## ISSUE: Issue on HTML5 new elements in spec
7677
7678 } else {
7679 ## Step 1
7680 my $node_i = -1;
7681 my $node = $self->{open_elements}->[$node_i];
7682
7683 ## Step 2
7684 S2: {
7685 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7686 ## Step 1
7687 ## generate implied end tags
7688 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7689 !!!cp ('t430');
7690 ## NOTE: |<ruby><rt></ruby>|.
7691 ## ISSUE: <ruby><rt></rt> will also take this code path,
7692 ## which seems wrong.
7693 pop @{$self->{open_elements}};
7694 $node_i++;
7695 }
7696
7697 ## Step 2
7698 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7699 ne $token->{tag_name}) {
7700 !!!cp ('t431');
7701 ## NOTE: <x><y></x>
7702 !!!parse-error (type => 'not closed',
7703 text => $self->{open_elements}->[-1]->[0]
7704 ->manakai_local_name,
7705 token => $token);
7706 } else {
7707 !!!cp ('t432');
7708 }
7709
7710 ## Step 3
7711 splice @{$self->{open_elements}}, $node_i if $node_i < 0;
7712
7713 !!!next-token;
7714 last S2;
7715 } else {
7716 ## Step 3
7717 if (not ($node->[1] & FORMATTING_EL) and
7718 #not $phrasing_category->{$node->[1]} and
7719 ($node->[1] & SPECIAL_EL or
7720 $node->[1] & SCOPING_EL)) {
7721 !!!cp ('t433');
7722 !!!parse-error (type => 'unmatched end tag',
7723 text => $token->{tag_name}, token => $token);
7724 ## Ignore the token
7725 !!!next-token;
7726 last S2;
7727 }
7728
7729 !!!cp ('t434');
7730 }
7731
7732 ## Step 4
7733 $node_i--;
7734 $node = $self->{open_elements}->[$node_i];
7735
7736 ## Step 5;
7737 redo S2;
7738 } # S2
7739 next B;
7740 }
7741 }
7742 next B;
7743 } continue { # B
7744 if ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
7745 ## NOTE: The code below is executed in cases where it does not have
7746 ## to be, but it is harmless even in those cases.
7747 ## has an element in scope
7748 INSCOPE: {
7749 for (reverse 0..$#{$self->{open_elements}}) {
7750 my $node = $self->{open_elements}->[$_];
7751 if ($node->[1] & FOREIGN_EL) {
7752 last INSCOPE;
7753 } elsif ($node->[1] & SCOPING_EL) {
7754 last;
7755 }
7756 }
7757
7758 ## NOTE: No foreign element in scope.
7759 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
7760 } # INSCOPE
7761 }
7762 } # B
7763
7764 ## Stop parsing # MUST
7765
7766 ## TODO: script stuffs
7767 } # _tree_construct_main
7768
## Implements the HTML5 |innerHTML| setter.
##
##   Whatpm::HTML->set_inner_html ($node, $string, $onerror, $get_wrapper);
##
##   $node        - The node whose children are replaced.  Must be a
##                  document (node type 9) or an element (node type 1).
##   $string      - The character string to parse.  It is only accessed
##                  through the |@_| alias and is never copied here.
##   $onerror     - Optional error handler, invoked with named arguments
##                  (|type|, |line|, |column|, ...).  In the element
##                  case errors are |warn|ed when it is omitted; in the
##                  document case it is handed to |parse_char_string|
##                  as is.
##   $get_wrapper - Optional code reference that receives the input
##                  handle and returns the handle to read from.
##                  Defaults to the identity function.
##
## Dies if |$node| is neither a document nor an element.
sub set_inner_html ($$$$;$) {
  my $class = shift;
  my $node = shift;
  ## After the |shift|s: $_[0] is the string, $_[1] the error handler
  ## and $_[2] the input wrapper.
  #my $s = \$_[0];
  my $onerror = $_[1];
  my $get_wrapper = $_[2] || sub ($) { return $_[0] };

  ## ISSUE: Should {confident} be true?

  my $nt = $node->node_type;
  if ($nt == 9) { # document
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## NOTE: The child list is copied before the loop, so the removals
    ## do not affect the list being iterated.
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }

    ## Step 3, 4, 5 # MUST
    $class->parse_char_string ($_[0] => $node, $onerror, $get_wrapper);
  } elsif ($nt == 1) { # element
    ## TODO: If non-html element

    ## NOTE: Most of this code is copied from |parse_string|

    ## NOTE: |$get_wrapper| is applied to the input handle in Step 8.

    ## Step 1 # MUST
    ## The fragment is parsed into a temporary document; its nodes are
    ## moved into |$node| in Steps 10 and 11.
    my $this_doc = $node->owner_document;
    my $doc = $this_doc->implementation->create_document;
    $doc->manakai_is_html (1);
    my $p = $class->new;
    $p->{document} = $doc;

    ## Step 8 # MUST
    $p->{line_prev} = $p->{line} = 1;
    $p->{column_prev} = $p->{column} = 0;
    require Whatpm::Charset::DecodeHandle;
    my $input = Whatpm::Charset::DecodeHandle::CharString->new (\($_[0]));
    $input = $get_wrapper->($input);

    ## Fetches the next input character into |$self->{next_char}|
    ## (-1 at the end of the input) and maintains the line and column
    ## counters.
    $p->{set_next_char} = sub {
      my $self = shift;

      my $char = '';
      if (defined $self->{next_next_char}) {
        ## A character has been read ahead while handling a CR.
        $char = $self->{next_next_char};
        delete $self->{next_next_char};
        $self->{next_char} = ord $char;
      } else {
        $self->{char_buffer} = '';
        $self->{char_buffer_pos} = 0;

        ## Try to refill the buffer with characters other than NULL,
        ## LF, and CR, which need no special handling.
        my $count = $input->manakai_read_until
           ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/,
            $self->{char_buffer_pos});
        if ($count) {
          $self->{line_prev} = $self->{line};
          $self->{column_prev} = $self->{column};
          $self->{column}++;
          $self->{next_char}
              = ord substr ($self->{char_buffer},
                            $self->{char_buffer_pos}++, 1);
          return;
        }

        if ($input->read ($char, 1)) {
          $self->{next_char} = ord $char;
        } else {
          $self->{next_char} = -1;
          return;
        }
      }

      ($p->{line_prev}, $p->{column_prev}) = ($p->{line}, $p->{column});
      $p->{column}++;

      if ($self->{next_char} == 0x000A) { # LF
        $p->{line}++;
        $p->{column} = 0;
        !!!cp ('i1');
      } elsif ($self->{next_char} == 0x000D) { # CR
        ## TODO: support for abort/streaming
        ## A CR LF pair is reported as a single LF; any other character
        ## following the CR is kept for the next call.
        my $next = '';
        if ($input->read ($next, 1) and $next ne "\x0A") {
          $self->{next_next_char} = $next;
        }
        $self->{next_char} = 0x000A; # LF # MUST
        $p->{line}++;
        $p->{column} = 0;
        !!!cp ('i2');
      } elsif ($self->{next_char} == 0x0000) { # NULL
        !!!cp ('i4');
        !!!parse-error (type => 'NULL');
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
      }
    };

    ## Appends to $_[0], at offset $_[2], a run of characters that are
    ## neither in the character class body $_[1] nor NULL, LF, or CR.
    ## Returns the number of characters appended.
    $p->{read_until} = sub {
      #my ($scalar, $specials_range, $offset) = @_;
      ## Nothing can be read in bulk while a read-ahead is pending.
      return 0 if defined $p->{next_next_char};

      my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
      my $offset = $_[2] || 0;

      if ($p->{char_buffer_pos} < length $p->{char_buffer}) {
        ## Consume the characters left in the buffer first.
        pos ($p->{char_buffer}) = $p->{char_buffer_pos};
        if ($p->{char_buffer} =~ /\G(?>$pattern)+/) {
          substr ($_[0], $offset)
              = substr ($p->{char_buffer}, $-[0], $+[0] - $-[0]);
          my $count = $+[0] - $-[0];
          if ($count) {
            $p->{column} += $count;
            $p->{char_buffer_pos} += $count;
            $p->{line_prev} = $p->{line};
            $p->{column_prev} = $p->{column} - 1;
            $p->{prev_char} = [-1, -1, -1];
            $p->{next_char} = -1;
          }
          return $count;
        } else {
          return 0;
        }
      } else {
        my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
        if ($count) {
          $p->{column} += $count;
          $p->{column_prev} += $count;
          $p->{prev_char} = [-1, -1, -1];
          $p->{next_char} = -1;
        }
        return $count;
      }
    }; # $p->{read_until}

    ## Default error handler: prefer the position of the token, if
    ## any, over the current position of the parser.
    my $ponerror = $onerror || sub {
      my (%opt) = @_;
      my $line = $opt{line};
      my $column = $opt{column};
      if (defined $opt{token} and defined $opt{token}->{line}) {
        $line = $opt{token}->{line};
        $column = $opt{token}->{column};
      }
      warn "Parse error ($opt{type}) at line $line column $column\n";
    };
    $p->{parse_error} = sub {
      $ponerror->(line => $p->{line}, column => $p->{column}, @_);
    };

    ## Errors reported by the input handle are forwarded to the same
    ## handler with |layer| set to |encode|.
    my $char_onerror = sub {
      my (undef, $type, %opt) = @_;
      $ponerror->(layer => 'encode',
                  line => $p->{line}, column => $p->{column} + 1,
                  %opt, type => $type);
    }; # $char_onerror
    $input->onerror ($char_onerror);

    $p->_initialize_tokenizer;
    $p->_initialize_tree_constructor;

    ## Step 2
    ## The initial content model depends on the context element.
    my $node_ln = $node->manakai_local_name;
    $p->{content_model} = {
      title => RCDATA_CONTENT_MODEL,
      textarea => RCDATA_CONTENT_MODEL,
      style => CDATA_CONTENT_MODEL,
      script => CDATA_CONTENT_MODEL,
      xmp => CDATA_CONTENT_MODEL,
      iframe => CDATA_CONTENT_MODEL,
      noembed => CDATA_CONTENT_MODEL,
      noframes => CDATA_CONTENT_MODEL,
      noscript => CDATA_CONTENT_MODEL,
      plaintext => PLAINTEXT_CONTENT_MODEL,
    }->{$node_ln};
    $p->{content_model} = PCDATA_CONTENT_MODEL
        unless defined $p->{content_model};
        ## ISSUE: What is "the name of the element"? local name?

    $p->{inner_html_node} = [$node, $el_category->{$node_ln}];
      ## TODO: Foreign element OK?

    ## Step 3
    my $root = $doc->create_element_ns
      ('http://www.w3.org/1999/xhtml', [undef, 'html']);

    ## Step 4 # MUST
    $doc->append_child ($root);

    ## Step 5 # MUST
    push @{$p->{open_elements}}, [$root, $el_category->{html}];

    undef $p->{head_element};

    ## Step 6 # MUST
    $p->_reset_insertion_mode;

    ## Step 7 # MUST
    ## The form element pointer is set to the nearest HTML |form|
    ## element among the context node and its ancestors, if any.
    my $anode = $node;
    AN: while (defined $anode) {
      if ($anode->node_type == 1) {
        my $nsuri = $anode->namespace_uri;
        if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
          if ($anode->manakai_local_name eq 'form') {
            !!!cp ('i5');
            $p->{form_element} = $anode;
            last AN;
          }
        }
      }
      $anode = $anode->parent_node;
    } # AN

    ## Step 9 # MUST
    {
      ## The macro below refers to the parser as |$self|.
      my $self = $p;
      !!!next-token;
    }
    $p->_tree_construction_main;

    ## Step 10 # MUST
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }
    ## ISSUE: mutation events? read-only?

    ## Step 11 # MUST
    @cn = @{$root->child_nodes};
    for (@cn) {
      $this_doc->adopt_node ($_);
      $node->append_child ($_);
    }
    ## ISSUE: mutation events?

    $p->_terminate_tree_constructor;

    ## Delete loops: each of these closures is stored in |$p| and also
    ## captures |$p|, so all of them have to be removed for the parser
    ## object to be freed.
    delete @{$p}{qw(set_next_char read_until parse_error)};
  } else {
    die "$0: |set_inner_html| is not defined for node of type $nt";
  }
} # set_inner_html
8015
8016 } # tree construction stage
8017
## Exception class that inherits from |Error|.  Nothing in this part of
## the file throws or catches it; presumably it signals that parsing has
## to be restarted -- TODO confirm against the code that uses it.
package Whatpm::HTML::RestartParser;
our @ISA;
push @ISA, 'Error';

1;
8022 # $Date: 2008/09/15 02:54:12 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24