/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.168 - (show annotations) (download) (as text)
Sat Sep 13 10:49:21 2008 UTC (16 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.167: +326 -235 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	13 Sep 2008 10:47:42 -0000
	* content-model-2.dat: A test case for NCR in charset=""
	is added.

2008-09-13  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	13 Sep 2008 10:48:59 -0000
	* HTML.pm.src: Finally we get rid of all the inner loops.  Remove
	entity related tokenizer states in favor of new states
	implementing the consume character reference algorithm.

2008-09-13  Wakaba  <wakaba@suika.fam.cx>

	* HTML.pm.src: "Consume a character reference" algorithm is
	* HTML.pm.src: Make |PUBLIC| and |SYSTEM| keyword tokenizing

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.167 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 use Error qw(:try);
5
6 ## ISSUE:
7 ## var doc = implementation.createDocument (null, null, null);
8 ## doc.write ('');
9 ## alert (doc.compatMode);
10
11 require IO::Handle;
12
## XML namespace URIs of the vocabularies this module deals with.
my $HTML_NS  = 'http://www.w3.org/1999/xhtml';
my $MML_NS   = 'http://www.w3.org/1998/Math/MathML';
my $SVG_NS   = 'http://www.w3.org/2000/svg';
my $XLINK_NS = 'http://www.w3.org/1999/xlink';
my $XML_NS   = 'http://www.w3.org/XML/1998/namespace';
my $XMLNS_NS = 'http://www.w3.org/2000/xmlns/';
19
## Element category flags.  Every category owns exactly one bit, so a
## set of categories is the bitwise OR of its members and membership is
## tested with a bitwise AND.
sub A_EL                    () { 1 <<  0 }
sub ADDRESS_EL              () { 1 <<  1 }
sub BODY_EL                 () { 1 <<  2 }
sub BUTTON_EL               () { 1 <<  3 }
sub CAPTION_EL              () { 1 <<  4 }
sub DD_EL                   () { 1 <<  5 }
sub DIV_EL                  () { 1 <<  6 }
sub DT_EL                   () { 1 <<  7 }
sub FORM_EL                 () { 1 <<  8 }
sub FORMATTING_EL           () { 1 <<  9 }
sub FRAMESET_EL             () { 1 << 10 }
sub HEADING_EL              () { 1 << 11 }
sub HTML_EL                 () { 1 << 12 }
sub LI_EL                   () { 1 << 13 }
sub NOBR_EL                 () { 1 << 14 }
sub OPTION_EL               () { 1 << 15 }
sub OPTGROUP_EL             () { 1 << 16 }
sub P_EL                    () { 1 << 17 }
sub SELECT_EL               () { 1 << 18 }
sub TABLE_EL                () { 1 << 19 }
sub TABLE_CELL_EL           () { 1 << 20 }
sub TABLE_ROW_EL            () { 1 << 21 }
sub TABLE_ROW_GROUP_EL      () { 1 << 22 }
sub MISC_SCOPING_EL         () { 1 << 23 }
sub MISC_SPECIAL_EL         () { 1 << 24 }
sub FOREIGN_EL              () { 1 << 25 }
sub FOREIGN_FLOW_CONTENT_EL () { 1 << 26 }
sub MML_AXML_EL             () { 1 << 27 }
sub RUBY_EL                 () { 1 << 28 }
sub RUBY_COMPONENT_EL       () { 1 << 29 }
50
## Mask matching |table|, table row and table row group elements.
sub TABLE_ROWS_EL () { TABLE_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL }
56
57 ## NOTE: Used in "generate implied end tags" algorithm.
58 ## NOTE: There is a code where a modified version of END_TAG_OPTIONAL_EL
59 ## is used in "generate implied end tags" implementation (search for the
60 ## function mae).
## Mask of the element categories |dd|, |dt|, |li|, |p| and ruby
## components.
sub END_TAG_OPTIONAL_EL () {
    DD_EL | DT_EL | LI_EL | P_EL
  | RUBY_COMPONENT_EL
}
68
69 ## NOTE: Used in </body> and EOF algorithms.
## Mask of |dd|, |dt|, |li|, |p|, |body|, |html| and the table cell,
## row and row group categories.
## NOTE(review): unlike |END_TAG_OPTIONAL_EL|, this mask does not
## include |RUBY_COMPONENT_EL| -- confirm that this is intended.
sub ALL_END_TAG_OPTIONAL_EL () {
    DD_EL | DT_EL | LI_EL | P_EL
  | BODY_EL | HTML_EL
  | TABLE_CELL_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL
}
82
## Mask of the scoping categories: |button|, |caption|, |html|,
## |table|, table cells and the miscellaneous scoping elements.
sub SCOPING_EL () {
    BUTTON_EL | CAPTION_EL | HTML_EL
  | TABLE_EL | TABLE_CELL_EL
  | MISC_SCOPING_EL
}
91
## Mask of |html| and |table|.
sub TABLE_SCOPING_EL () { HTML_EL | TABLE_EL }
96
## Mask of |html| and the table row group category.
sub TABLE_ROWS_SCOPING_EL () { HTML_EL | TABLE_ROW_GROUP_EL }
101
## Mask of |html| and the table row category.
sub TABLE_ROW_SCOPING_EL () { HTML_EL | TABLE_ROW_EL }
106
## Mask of every category that is treated as "special".
sub SPECIAL_EL () {
    ADDRESS_EL | BODY_EL | DIV_EL
  | DD_EL | DT_EL | LI_EL | P_EL
  | FORM_EL | FRAMESET_EL | HEADING_EL
  | OPTION_EL | OPTGROUP_EL | SELECT_EL
  | TABLE_ROW_EL | TABLE_ROW_GROUP_EL
  | MISC_SPECIAL_EL
}
127
## Maps the local name of an HTML element to its category flags.
## Entries are grouped by category instead of being listed
## alphabetically.
my $el_category = {
  ## Elements that carry two flags.
  a    => A_EL | FORMATTING_EL,
  nobr => NOBR_EL | FORMATTING_EL,

  ## Elements that have a category of their own.
  address  => ADDRESS_EL,
  body     => BODY_EL,
  button   => BUTTON_EL,
  caption  => CAPTION_EL,
  dd       => DD_EL,
  div      => DIV_EL,
  dt       => DT_EL,
  form     => FORM_EL,
  frameset => FRAMESET_EL,
  html     => HTML_EL,
  li       => LI_EL,
  optgroup => OPTGROUP_EL,
  option   => OPTION_EL,
  p        => P_EL,
  ruby     => RUBY_EL,
  select   => SELECT_EL,
  table    => TABLE_EL,
  'tr'     => TABLE_ROW_EL,

  ## Categories shared by several elements.
  (map { ($_ => FORMATTING_EL) }
       qw(b big em font i s small strike strong tt u)),
  (map { ($_ => HEADING_EL) } qw(h1 h2 h3 h4 h5 h6)),
  (map { ($_ => MISC_SCOPING_EL) } qw(applet marquee object)),
  (map { ($_ => RUBY_COMPONENT_EL) } qw(rp rt)),
  (map { ($_ => TABLE_CELL_EL) } qw(td th)),
  (map { ($_ => TABLE_ROW_GROUP_EL) } qw(tbody tfoot thead)),
  (map { ($_ => MISC_SPECIAL_EL) } qw(
    area base basefont bgsound blockquote br center col colgroup
    dir dl embed fieldset frame head hr iframe img input isindex
    link listing menu meta noembed noframes noscript ol param
    plaintext pre script spacer style textarea title ul wbr
  )),
};
215
## Category flags for non-HTML elements, keyed by namespace URI and
## then by local name.
my $el_category_f = {
  $MML_NS => {
    'annotation-xml' => MML_AXML_EL,
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(mi mo mn ms mtext)),
  },
  $SVG_NS => {
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) }
         qw(foreignObject desc title)),
  },
  ## NOTE: In addition, FOREIGN_EL is set to non-HTML elements.
};
232
## Maps an all-lowercase SVG attribute name to its mixed-case spelling.
## Every key is simply the lowercased form of its value, so the table
## is derived from the list of mixed-case names.
my $svg_attr_name = {
  map { (lc $_ => $_) } qw(
    attributeName attributeType baseFrequency baseProfile calcMode
    clipPathUnits contentScriptType contentStyleType diffuseConstant
    edgeMode externalResourcesRequired filterRes filterUnits glyphRef
    gradientTransform gradientUnits kernelMatrix kernelUnitLength
    keyPoints keySplines keyTimes lengthAdjust limitingConeAngle
    markerHeight markerUnits markerWidth maskContentUnits maskUnits
    numOctaves pathLength patternContentUnits patternTransform
    patternUnits pointsAtX pointsAtY pointsAtZ preserveAlpha
    preserveAspectRatio primitiveUnits refX refY repeatCount repeatDur
    requiredExtensions requiredFeatures specularConstant
    specularExponent spreadMethod startOffset stdDeviation stitchTiles
    surfaceScale systemLanguage tableValues targetX targetY textLength
    viewBox viewTarget xChannelSelector yChannelSelector zoomAndPan
  )
};
297
## Maps a qualified attribute name to
## |[namespace URI, [prefix, local name]]|.
my $foreign_attr_xname = {
  (map { ("xlink:$_" => [$XLINK_NS, ['xlink', $_]]) }
       qw(actuate arcrole href role show title type)),
  (map { ("xml:$_" => [$XML_NS, ['xml', $_]]) } qw(base lang space)),
  'xmlns'       => [$XMLNS_NS, [undef, 'xmlns']],
  'xmlns:xlink' => [$XMLNS_NS, ['xmlns', 'xlink']],
};
312
313 ## ISSUE: xmlns:xlink="non-xlink-ns" is not an error.
314
## Maps each code point in the range 0x80..0x9F to a replacement code
## point.  The values are listed in key order, eight per row; 0xFFFD
## (REPLACEMENT CHARACTER) is used where no other mapping is given.
my $c1_entity_char = do {
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 80-87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 88-8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 90-97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 98-9F
  );
  my %char_of;
  @char_of{0x80 .. 0x9F} = @replacement;
  \%char_of;
}; # $c1_entity_char
349
## Parse a string of bytes as an HTML document.
##
## Arguments (after the invocant):
##   charset name   passed through to |parse_byte_stream|
##   byte string    either the string itself or a reference to it
##   remaining      passed through to |parse_byte_stream| unchanged
##
## The string is opened as an in-memory file handle.  Dies if that
## fails; previously a failure went unnoticed and parsing continued
## with an unopened handle.
##
## Returns whatever |parse_byte_stream| returns.
sub parse_byte_string ($$$$;$) {
  my $self = shift;
  my $charset_name = shift;
  ## Refer to the caller's string instead of copying it.
  my $octets_ref = ref $_[0] ? $_[0] : \($_[0]);
  open my $input, '<', $octets_ref
      or die "Can't open in-memory byte stream: $!";
  return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]);
} # parse_byte_string
356
## Decode a byte stream and parse the result as an HTML document.
##
## The invocant may be a parser object or a class name; in the latter
## case a parser is created with |new|.  The remaining arguments are
## the ones named in the commented-out signature on the next line:
## charset name (may be undef), byte stream handle, document, optional
## error callback and optional wrapper callback.
##
## An encoding is selected by the sniffing steps below.  The decoded
## character stream is passed through the wrapper callback and handed to
## |parse_char_stream| together with the document, the error callback
## and the wrapper callback.  If |Whatpm::HTML::RestartParser| is thrown
## while parsing (see the |change_encoding| callback), the document is
## parsed once more with the decoder selected by that callback.
##
## Returns the value returned by |parse_char_stream|.
357 sub parse_byte_stream ($$$$;$$) {
358 # my ($self, $charset_name, $byte_stream, $doc, $onerror, $get_wrapper) = @_;
359 my $self = ref $_[0] ? shift : shift->new;
360 my $charset_name = shift;
361 my $byte_stream = $_[0];
362
## Default error callback: report the error type with |warn|.
363 my $onerror = $_[2] || sub {
364 my (%opt) = @_;
365 warn "Parse error ($opt{type})\n";
366 };
367 $self->{parse_error} = $onerror; # replaced later by parse_char_stream
368
## Default wrapper callback: return the handle unchanged.
369 my $get_wrapper = $_[3] || sub ($) {
370 return $_[0]; # $_[0] = byte stream handle, returned = arg to char handle
371 };
372
373 ## HTML5 encoding sniffing algorithm
374 require Message::Charset::Info;
375 my $charset;
376 my $buffer;
377 my ($char_stream, $e_status);
378
379 SNIFFING: {
380 ## NOTE: By setting |allow_fallback| option true when the
381 ## |get_decode_handle| method is invoked, we ignore what the HTML5
382 ## spec requires, i.e. unsupported encoding should be ignored.
383 ## TODO: We should not do this unless the parser is invoked
384 ## in the conformance checking mode, in which this behavior
385 ## would be useful.
386
387 ## Step 1
## A charset name supplied by the caller is tried first; if a decode
## handle is obtained the result is marked as confident.
388 if (defined $charset_name) {
389 $charset = Message::Charset::Info->get_by_html_name ($charset_name);
390 ## TODO: Is this ok? Transfer protocol's parameter should be
391 ## interpreted in its semantics?
392
393 ## ISSUE: Unsupported encoding is not ignored according to the spec.
394 ($char_stream, $e_status) = $charset->get_decode_handle
395 ($byte_stream, allow_error_reporting => 1,
396 allow_fallback => 1);
397 if ($char_stream) {
398 $self->{confident} = 1;
399 last SNIFFING;
400 } else {
401 ## TODO: unsupported error
402 }
403 }
404
405 ## Step 2
## Read at most 1024 bytes from the byte stream into |$byte_buffer|.
406 my $byte_buffer = '';
407 for (1..1024) {
408 my $char = $byte_stream->getc;
409 last unless defined $char;
410 $byte_buffer .= $char;
411 } ## TODO: timeout
412
413 ## Step 3
## A byte order mark at the start of the buffer selects UTF-16BE,
## UTF-16LE or UTF-8, in each case with confidence.
414 if ($byte_buffer =~ /^\xFE\xFF/) {
415 $charset = Message::Charset::Info->get_by_html_name ('utf-16be');
416 ($char_stream, $e_status) = $charset->get_decode_handle
417 ($byte_stream, allow_error_reporting => 1,
418 allow_fallback => 1, byte_buffer => \$byte_buffer);
419 $self->{confident} = 1;
420 last SNIFFING;
421 } elsif ($byte_buffer =~ /^\xFF\xFE/) {
422 $charset = Message::Charset::Info->get_by_html_name ('utf-16le');
423 ($char_stream, $e_status) = $charset->get_decode_handle
424 ($byte_stream, allow_error_reporting => 1,
425 allow_fallback => 1, byte_buffer => \$byte_buffer);
426 $self->{confident} = 1;
427 last SNIFFING;
428 } elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) {
429 $charset = Message::Charset::Info->get_by_html_name ('utf-8');
430 ($char_stream, $e_status) = $charset->get_decode_handle
431 ($byte_stream, allow_error_reporting => 1,
432 allow_fallback => 1, byte_buffer => \$byte_buffer);
433 $self->{confident} = 1;
434 last SNIFFING;
435 }
436
437 ## Step 4
438 ## TODO: <meta charset>
439
440 ## Step 5
441 ## TODO: from history
442
443 ## Step 6
## Ask the character set detector for a guess based on the buffered
## bytes.  A guess is used tentatively (|confident| is 0).
444 require Whatpm::Charset::UniversalCharDet;
445 $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string
446 ($byte_buffer);
447 if (defined $charset_name) {
448 $charset = Message::Charset::Info->get_by_html_name ($charset_name);
449
450 ## ISSUE: Unsupported encoding is not ignored according to the spec.
451 require Whatpm::Charset::DecodeHandle;
452 $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
453 ($byte_stream);
454 ($char_stream, $e_status) = $charset->get_decode_handle
455 ($buffer, allow_error_reporting => 1,
456 allow_fallback => 1, byte_buffer => \$byte_buffer);
457 if ($char_stream) {
458 $buffer->{buffer} = $byte_buffer;
459 !!!parse-error (type => 'sniffing:chardet',
460 text => $charset_name,
461 level => $self->{level}->{info},
462 layer => 'encode',
463 line => 1, column => 1);
464 $self->{confident} = 0;
465 last SNIFFING;
466 }
467 }
468
469 ## Step 7: default
470 ## TODO: Make this configurable.
471 $charset = Message::Charset::Info->get_by_html_name ('windows-1252');
472 ## NOTE: We choose |windows-1252| here, since |utf-8| should be
473 ## detectable in the step 6.
474 require Whatpm::Charset::DecodeHandle;
475 $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
476 ($byte_stream);
477 ($char_stream, $e_status)
478 = $charset->get_decode_handle ($buffer,
479 allow_error_reporting => 1,
480 allow_fallback => 1,
481 byte_buffer => \$byte_buffer);
482 $buffer->{buffer} = $byte_buffer;
483 !!!parse-error (type => 'sniffing:default',
484 text => 'windows-1252',
485 level => $self->{level}->{info},
486 line => 1, column => 1,
487 layer => 'encode');
488 $self->{confident} = 0;
489 } # SNIFFING
490
## Record the name of the selected charset and report how the decoder
## was obtained, as indicated by the flags in |$e_status|.
491 if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
492 $self->{input_encoding} = $charset->get_iana_name; ## TODO: Should we set actual charset decoder's encoding name?
493 !!!parse-error (type => 'chardecode:fallback',
494 #text => $self->{input_encoding},
495 level => $self->{level}->{uncertain},
496 line => 1, column => 1,
497 layer => 'encode');
498 } elsif (not ($e_status &
499 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
500 $self->{input_encoding} = $charset->get_iana_name;
501 !!!parse-error (type => 'chardecode:no error',
502 text => $self->{input_encoding},
503 level => $self->{level}->{uncertain},
504 line => 1, column => 1,
505 layer => 'encode');
506 } else {
507 $self->{input_encoding} = $charset->get_iana_name;
508 }
509
## Callback that takes the parser, a charset name and a token.  It
## assigns to the enclosing |$charset_name|, |$charset|, |$char_stream|
## and |$e_status|, which the |catch| block below reads afterwards.
## NOTE(review): |$buffer| is assigned only in steps 6 and 7.  If
## sniffing finished in step 1 or 3 it is still undef when this callback
## dereferences it -- confirm that this is acceptable.
510 $self->{change_encoding} = sub {
511 my $self = shift;
512 $charset_name = shift;
513 my $token = shift;
514
515 $charset = Message::Charset::Info->get_by_html_name ($charset_name);
516 ($char_stream, $e_status) = $charset->get_decode_handle
517 ($byte_stream, allow_error_reporting => 1, allow_fallback => 1,
518 byte_buffer => \ $buffer->{buffer});
519
520 if ($char_stream) { # if supported
521 ## "Change the encoding" algorithm:
522
523 ## Step 1
## A charset in the UTF-16 category is replaced by UTF-8.
524 if ($charset->{category} &
525 Message::Charset::Info::CHARSET_CATEGORY_UTF16 ()) {
526 $charset = Message::Charset::Info->get_by_html_name ('utf-8');
527 ($char_stream, $e_status) = $charset->get_decode_handle
528 ($byte_stream,
529 byte_buffer => \ $buffer->{buffer});
530 }
531 $charset_name = $charset->get_iana_name;
532
533 ## Step 2
## Nothing to redo if the requested encoding is the one already in use.
534 if (defined $self->{input_encoding} and
535 $self->{input_encoding} eq $charset_name) {
536 !!!parse-error (type => 'charset label:matching',
537 text => $charset_name,
538 level => $self->{level}->{info});
539 $self->{confident} = 1;
540 return;
541 }
542
543 !!!parse-error (type => 'charset label detected',
544 text => $self->{input_encoding},
545 value => $charset_name,
546 level => $self->{level}->{warn},
547 token => $token);
548
549 ## Step 3
550 # if (can) {
551 ## change the encoding on the fly.
552 #$self->{confident} = 1;
553 #return;
554 # }
555
556 ## Step 4
557 throw Whatpm::HTML::RestartParser ();
558 }
559 }; # $self->{change_encoding}
560
## Error callback installed on the character stream: reports the error
## at the current line and the column after the current one and, when
## an |octets| reference is supplied, overwrites the referenced value.
561 my $char_onerror = sub {
562 my (undef, $type, %opt) = @_;
563 !!!parse-error (layer => 'encode',
564 %opt, type => $type,
565 line => $self->{line}, column => $self->{column} + 1);
566 if ($opt{octets}) {
567 ${$opt{octets}} = "\x{FFFD}"; # replacement character
568 }
569 };
570
571 my $wrapped_char_stream = $get_wrapper->($char_stream);
572 $wrapped_char_stream->onerror ($char_onerror);
573
## |@args| is what remains after the byte stream: document, error
## callback and wrapper callback.
574 my @args = @_; shift @args; # $s
575 my $return;
576 try {
577 $return = $self->parse_char_stream ($wrapped_char_stream, @args);
578 } catch Whatpm::HTML::RestartParser with {
579 ## NOTE: Invoked after {change_encoding}.
580
581 if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
582 $self->{input_encoding} = $charset->get_iana_name; ## TODO: Should we set actual charset decoder's encoding name?
583 !!!parse-error (type => 'chardecode:fallback',
584 level => $self->{level}->{uncertain},
585 #text => $self->{input_encoding},
586 line => 1, column => 1,
587 layer => 'encode');
588 } elsif (not ($e_status &
589 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
590 $self->{input_encoding} = $charset->get_iana_name;
591 !!!parse-error (type => 'chardecode:no error',
592 text => $self->{input_encoding},
593 level => $self->{level}->{uncertain},
594 line => 1, column => 1,
595 layer => 'encode');
596 } else {
597 $self->{input_encoding} = $charset->get_iana_name;
598 }
599 $self->{confident} = 1;
600
## Parse again from the character stream selected by |change_encoding|.
601 $wrapped_char_stream = $get_wrapper->($char_stream);
602 $wrapped_char_stream->onerror ($char_onerror);
603
604 $return = $self->parse_char_stream ($wrapped_char_stream, @args);
605 };
606 return $return;
607 } # parse_byte_stream
608
609 ## NOTE: The HTML5 spec says that the encoding layer MUST NOT strip the
610 ## BOM and the HTML layer MUST ignore it. However, we do strip the BOM in
611 ## the encoding layer and the HTML layer does not ignore any U+FEFF,
612 ## because the core part of our HTML parser expects a string of characters,
613 ## not a string of bytes or code units or anything which might contain a BOM.
614 ## Therefore, any parser interface that accepts a string of bytes,
615 ## such as |parse_byte_string| in this module, must ensure that it does
616 ## strip the BOM and never strips any ZWNBSP.
617
## Parse a string of characters as an HTML document.
##
## Arguments (after the invocant):
##   $s            the character string, or a reference to it
##   $doc          document, passed through to |parse_char_stream|
##   $onerror      optional error callback, passed through
##   $get_wrapper  optional callback that receives the input handle and
##                 returns the handle that is actually parsed
##
## The string is first encoded as UTF-8 octets and those octets are
## read back through a |:utf8| in-memory handle.  The handle therefore
## yields the same characters whether or not the string has Perl's
## internal UTF-8 flag set.  Opening an in-memory handle directly on a
## character string depends on the string's internal representation
## and fails on newer perls for code points above 0xFF.
##
## Dies if the in-memory handle cannot be opened.  Returns whatever
## |parse_char_stream| returns.
sub parse_char_string ($$$;$$) {
  #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
  my $self = shift;
  require utf8;
  my $s = ref $_[0] ? $_[0] : \($_[0]);

  ## Work on a copy so that the caller's string is left untouched.
  my $octets = defined $$s ? $$s : '';
  utf8::encode ($octets);
  open my $input, '<:utf8', \$octets
      or die "Can't open in-memory character stream: $!";

  if ($_[3]) {
    $input = $_[3]->($input);
  }
  return $self->parse_char_stream ($input, @_[1..$#_]);
} # parse_char_string
*parse_string = \&parse_char_string; ## NOTE: Alias for backward compatibility.
630
## Parse a stream of characters into a document.
##
## The invocant may be a parser object or a class name; in the latter
## case a parser is created with |new|.
##
## Arguments (after the invocant):
##   $_[0]  character stream; characters are read with its |getc| method
##   $_[1]  document; its child nodes are removed before parsing
##   $_[2]  optional error callback, called with key/value pairs such as
##          |type|, |line|, |column| and |token|.  The default reports
##          the error with |warn|.
##
## Returns the document.
##
## NOTE: Lines starting with |!!!| are not Perl; presumably they are
## expanded when the module is generated from this |.src| file.
sub parse_char_stream ($$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $input = $_[0];
  $self->{document} = $_[1];
  @{$self->{document}->child_nodes} = ();

  ## NOTE: |set_inner_html| copies most of this method's code

  $self->{confident} = 1 unless exists $self->{confident};
  $self->{document}->input_encoding ($self->{input_encoding})
      if defined $self->{input_encoding};

  ## The last two code points (U+xxFFFE and U+xxFFFF) of each of the 17
  ## planes.  Built once per parse instead of once per input character.
  my %is_plane_end_nonchar = map {
    ((($_ << 16) | 0xFFFE) => 1, (($_ << 16) | 0xFFFF) => 1)
  } 0x00 .. 0x10;

  $self->{line_prev} = $self->{line} = 1;
  $self->{column_prev} = $self->{column} = 0;

  ## Advances the input by one character: the previous |next_char| is
  ## pushed onto the three-element |prev_char| history and |next_char|
  ## becomes the code point of the next character, or -1 at the end of
  ## the input.
  $self->{set_next_char} = sub {
    my $self = shift;

    pop @{$self->{prev_char}};
    unshift @{$self->{prev_char}}, $self->{next_char};

    ## A character read ahead while handling CR takes precedence.
    my $char;
    if (defined $self->{next_next_char}) {
      $char = delete $self->{next_next_char};
    } else {
      $char = $input->getc;
    }
    unless (defined $char) { # end of input
      $self->{next_char} = -1;
      return;
    }
    $self->{next_char} = ord $char;

    ($self->{line_prev}, $self->{column_prev})
        = ($self->{line}, $self->{column});
    $self->{column}++;

    if ($self->{next_char} == 0x000A) { # LF
      !!!cp ('j1');
      $self->{line}++;
      $self->{column} = 0;
    } elsif ($self->{next_char} == 0x000D) { # CR
      !!!cp ('j2');
      ## CR LF and a lone CR are both reported as a single LF.  A
      ## character other than LF that follows CR is kept for the next
      ## call.
      my $next = $input->getc;
      if (defined $next and $next ne "\x0A") {
        $self->{next_next_char} = $next;
      }
      $self->{next_char} = 0x000A; # LF # MUST
      $self->{line}++;
      $self->{column} = 0;
    } elsif ($self->{next_char} > 0x10FFFF) {
      !!!cp ('j3');
      $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($self->{next_char} == 0x0000) { # NULL
      !!!cp ('j4');
      !!!parse-error (type => 'NULL');
      $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($self->{next_char} <= 0x0008 or
             (0x000E <= $self->{next_char} and $self->{next_char} <= 0x001F) or
             (0x007F <= $self->{next_char} and $self->{next_char} <= 0x009F) or
             (0xD800 <= $self->{next_char} and $self->{next_char} <= 0xDFFF) or
             ## The noncharacter block runs through U+FDEF; the upper
             ## bound used to be U+FDDF.
             (0xFDD0 <= $self->{next_char} and $self->{next_char} <= 0xFDEF) or
             $is_plane_end_nonchar{$self->{next_char}}) {
      !!!cp ('j5');
      if ($self->{next_char} < 0x10000) {
        !!!parse-error (type => 'control char',
                        text => (sprintf 'U+%04X', $self->{next_char}));
      } else {
        !!!parse-error (type => 'control char',
                        text => (sprintf 'U-%08X', $self->{next_char}));
      }
    }
  };
  $self->{prev_char} = [-1, -1, -1];
  $self->{next_char} = -1;

  my $onerror = $_[2] || sub {
    my (%opt) = @_;
    my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
    my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
    warn "Parse error ($opt{type}) at line $line column $column\n";
  };
  ## The current position is passed first, so explicit |line| and
  ## |column| arguments override it.
  $self->{parse_error} = sub {
    $onerror->(line => $self->{line}, column => $self->{column}, @_);
  };

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  delete $self->{parse_error}; # remove loop

  return $self->{document};
} # parse_char_stream
734
## Create a parser object.
##
## The object starts with the error level codes in |level| and with
## default callbacks.  The default |set_next_char| sets |next_char| to
## -1; the other default callbacks do nothing.
##
## The default |set_next_char| takes the parser as its first argument,
## like the callback installed by |parse_char_stream|, instead of
## closing over the new object.  Capturing the object in its own
## callback created a reference cycle that kept the parser alive after
## the last outside reference was gone.
sub new ($) {
  my $class = shift;
  my $self = bless {
    level => {must => 'm',
              should => 's',
              warn => 'w',
              info => 'i',
              uncertain => 'u'},
  }, $class;
  $self->{set_next_char} = sub {
    my $self = shift;
    $self->{next_char} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new
761
## Content model feature bits.
sub CM_ENTITY         () { 1 << 0 } # & markup in data
sub CM_LIMITED_MARKUP () { 1 << 1 } # < markup in data (limited)
sub CM_FULL_MARKUP    () { 1 << 2 } # < markup in data (any)

## Content models, expressed as sets of the feature bits above.
sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL     () { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL    () { CM_LIMITED_MARKUP | CM_ENTITY }
sub PCDATA_CONTENT_MODEL    () { CM_FULL_MARKUP | CM_ENTITY }
770
## Tokenizer state numbers, grouped by the construct they belong to.
## The numbers 1 and 12 are unused; they belonged to the states that are
## commented out below.

## Data and tags
sub DATA_STATE                   () {  0 }
#sub ENTITY_DATA_STATE           () {  1 }
sub TAG_OPEN_STATE               () {  2 }
sub CLOSE_TAG_OPEN_STATE         () {  3 }
sub TAG_NAME_STATE               () {  4 }
sub SELF_CLOSING_START_TAG_STATE () { 34 }
sub CDATA_PCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec

## Attributes
sub BEFORE_ATTRIBUTE_NAME_STATE         () {  5 }
sub ATTRIBUTE_NAME_STATE                () {  6 }
sub AFTER_ATTRIBUTE_NAME_STATE          () {  7 }
sub BEFORE_ATTRIBUTE_VALUE_STATE        () {  8 }
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () {  9 }
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
sub ATTRIBUTE_VALUE_UNQUOTED_STATE      () { 11 }
#sub ENTITY_IN_ATTRIBUTE_VALUE_STATE    () { 12 }
sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE  () { 33 }

## Markup declarations and comments
sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
sub MD_HYPHEN_STATE  () { 36 } # "markup declaration open state" in the spec
sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
sub MD_CDATA_STATE   () { 38 } # "markup declaration open state" in the spec
sub COMMENT_START_STATE      () { 14 }
sub COMMENT_START_DASH_STATE () { 15 }
sub COMMENT_STATE            () { 16 }
sub COMMENT_END_STATE        () { 17 }
sub COMMENT_END_DASH_STATE   () { 18 }
sub BOGUS_COMMENT_STATE      () { 19 }

## DOCTYPE
sub DOCTYPE_STATE             () { 20 }
sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
sub DOCTYPE_NAME_STATE        () { 22 }
sub AFTER_DOCTYPE_NAME_STATE  () { 23 }
sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        () { 24 }
sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         () { 27 }
sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        () { 28 }
sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         () { 31 }
sub BOGUS_DOCTYPE_STATE                           () { 32 }

## CDATA sections
sub CDATA_SECTION_STATE      () { 35 }
sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec

## NOTE: "Entity data state", "entity in attribute value state", and
## "consume a character reference" algorithm are jointly implemented
## using the following six states:
sub ENTITY_STATE      () { 44 }
sub ENTITY_HASH_STATE () { 45 }
sub NCR_NUM_STATE     () { 46 }
sub HEXREF_X_STATE    () { 47 }
sub HEXREF_HEX_STATE  () { 48 }
sub ENTITY_NAME_STATE () { 49 }
824
## Values of the |type| member of a token.
sub DOCTYPE_TOKEN     () { 1 }
sub COMMENT_TOKEN     () { 2 }
sub START_TAG_TOKEN   () { 3 }
sub END_TAG_TOKEN     () { 4 }
sub END_OF_FILE_TOKEN () { 5 }
sub CHARACTER_TOKEN   () { 6 }
831
## Insertion mode group bits.  An insertion mode is a combination of
## group bits and a small number in the two lowest bits.
sub AFTER_HTML_IMS () { 1 <<  2 }
sub HEAD_IMS       () { 1 <<  3 }
sub BODY_IMS       () { 1 <<  4 }
sub BODY_TABLE_IMS () { 1 <<  5 }
sub TABLE_IMS      () { 1 <<  6 }
sub ROW_IMS        () { 1 <<  7 }
sub BODY_AFTER_IMS () { 1 <<  8 }
sub FRAME_IMS      () { 1 <<  9 }
sub SELECT_IMS     () { 1 << 10 }

## NOTE: "in foreign content" insertion mode is special; it is combined
## with the secondary insertion mode. In this parser, they are stored
## together in the bit-or'ed form.
sub IN_FOREIGN_CONTENT_IM () { 1 << 11 }

## NOTE: "initial" and "before html" insertion modes have no constants.

## NOTE: "after after body" insertion mode.
sub AFTER_HTML_BODY_IM () { BODY_AFTER_IMS | AFTER_HTML_IMS }

## NOTE: "after after frameset" insertion mode.
sub AFTER_HTML_FRAMESET_IM () { FRAME_IMS | AFTER_HTML_IMS }

sub IN_HEAD_IM            () { HEAD_IMS | 0 }
sub IN_HEAD_NOSCRIPT_IM   () { HEAD_IMS | 1 }
sub AFTER_HEAD_IM         () { HEAD_IMS | 2 }
sub BEFORE_HEAD_IM        () { HEAD_IMS | 3 }
sub IN_BODY_IM            () { BODY_IMS }
sub IN_CELL_IM            () { BODY_IMS | BODY_TABLE_IMS | 1 }
sub IN_CAPTION_IM         () { BODY_IMS | BODY_TABLE_IMS | 2 }
sub IN_ROW_IM             () { TABLE_IMS | ROW_IMS | 1 }
sub IN_TABLE_BODY_IM      () { TABLE_IMS | ROW_IMS | 2 }
sub IN_TABLE_IM           () { TABLE_IMS }
sub AFTER_BODY_IM         () { BODY_AFTER_IMS }
sub IN_FRAMESET_IM        () { FRAME_IMS | 1 }
sub AFTER_FRAMESET_IM     () { FRAME_IMS | 2 }
sub IN_SELECT_IM          () { SELECT_IMS | 1 }
sub IN_SELECT_IN_TABLE_IM () { SELECT_IMS | 2 }
sub IN_COLUMN_GROUP_IM    () { 2 }
870
871 ## Implementations MUST act as if state machine in the spec
872
## Reset the tokenizer: the state becomes the data state, the content
## model becomes PCDATA, the per-token fields are cleared and the
## |char| and |token| lists are emptied.
sub _initialize_tokenizer ($) {
  my ($self) = @_;

  $self->{state} = DATA_STATE; # MUST
  ## |$self->{state_keyword}| is initialized when used
  $self->{content_model} = PCDATA_CONTENT_MODEL; # be

  $self->{$_} = undef
      for qw(current_token current_attribute
             last_emitted_start_tag_name last_attribute_value_state);
  delete $self->{self_closing};

  $self->{char} = [];
  ## |$self->{next_char}| is not assigned directly here; presumably the
  ## following macro sets it.
  !!!next-input-character;
  $self->{token} = [];
  ## |$self->{escape}| is not touched here
} # _initialize_tokenizer
889
890 ## A token has:
891 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
892 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
893 ## ->{name} (DOCTYPE_TOKEN)
894 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
895 ## ->{public_identifier} (DOCTYPE_TOKEN)
896 ## ->{system_identifier} (DOCTYPE_TOKEN)
897 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
898 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
899 ## ->{name}
900 ## ->{value}
901 ## ->{has_reference} == 1 or 0
902 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
903 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
904 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
905 ## while the token is pushed back to the stack.
906
907 ## Emitted token MUST immediately be handled by the tree construction state.
908
909 ## Before each step, UA MAY check to see if either one of the scripts in
910 ## "list of scripts that will execute as soon as possible" or the first
911 ## script in the "list of scripts that will execute asynchronously",
912 ## has completed loading. If one has, then it MUST be executed
913 ## and removed from the list.
914
915 ## NOTE: HTML5 "Writing HTML documents" section, applied to
916 ## documents and not to user agents and conformance checkers,
917 ## contains some requirements that are not detected by the
918 ## parsing algorithm:
919 ## - Some requirements on character encoding declarations. ## TODO
920 ## - "Elements MUST NOT contain content that their content model disallows."
921 ## ... Some are parse error, some are not (will be reported by c.c.).
922 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
923 ## - Text (in elements, attributes, and comments) SHOULD NOT contain
924 ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
925
926 ## TODO: HTML5 imposes on authors two SHOULD-level requirements that cannot
927 ## be detected by the HTML5 parsing algorithm:
928 ## - Text,
929
930 sub _get_next_token ($) {
931 my $self = shift;
932
933 if ($self->{self_closing}) {
934 !!!parse-error (type => 'nestc', token => $self->{current_token});
935 ## NOTE: The |self_closing| flag is only set by start tag token.
936 ## In addition, when a start tag token is emitted, it is always set to
937 ## |current_token|.
938 delete $self->{self_closing};
939 }
940
941 if (@{$self->{token}}) {
942 $self->{self_closing} = $self->{token}->[0]->{self_closing};
943 return shift @{$self->{token}};
944 }
945
946 A: {
947 if ($self->{state} == DATA_STATE) {
948 if ($self->{next_char} == 0x0026) { # &
949 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
950 not $self->{escape}) {
951 !!!cp (1);
952 ## NOTE: In the spec, the tokenizer is switched to the
953 ## "entity data state". In this implementation, the tokenizer
954 ## is switched to the |ENTITY_STATE|, which is an implementation
955 ## of the "consume a character reference" algorithm.
956 $self->{entity_in_attr} = 0;
957 $self->{entity_additional} = -1;
958 $self->{state} = ENTITY_STATE;
959 !!!next-input-character;
960 redo A;
961 } else {
962 !!!cp (2);
963 #
964 }
965 } elsif ($self->{next_char} == 0x002D) { # -
966 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
967 unless ($self->{escape}) {
968 if ($self->{prev_char}->[0] == 0x002D and # -
969 $self->{prev_char}->[1] == 0x0021 and # !
970 $self->{prev_char}->[2] == 0x003C) { # <
971 !!!cp (3);
972 $self->{escape} = 1;
973 } else {
974 !!!cp (4);
975 }
976 } else {
977 !!!cp (5);
978 }
979 }
980
981 #
982 } elsif ($self->{next_char} == 0x003C) { # <
983 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
984 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
985 not $self->{escape})) {
986 !!!cp (6);
987 $self->{state} = TAG_OPEN_STATE;
988 !!!next-input-character;
989 redo A;
990 } else {
991 !!!cp (7);
992 #
993 }
994 } elsif ($self->{next_char} == 0x003E) { # >
995 if ($self->{escape} and
996 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
997 if ($self->{prev_char}->[0] == 0x002D and # -
998 $self->{prev_char}->[1] == 0x002D) { # -
999 !!!cp (8);
1000 delete $self->{escape};
1001 } else {
1002 !!!cp (9);
1003 }
1004 } else {
1005 !!!cp (10);
1006 }
1007
1008 #
1009 } elsif ($self->{next_char} == -1) {
1010 !!!cp (11);
1011 !!!emit ({type => END_OF_FILE_TOKEN,
1012 line => $self->{line}, column => $self->{column}});
1013 last A; ## TODO: ok?
1014 } else {
1015 !!!cp (12);
1016 }
1017 # Anything else
1018 my $token = {type => CHARACTER_TOKEN,
1019 data => chr $self->{next_char},
1020 line => $self->{line}, column => $self->{column},
1021 };
1022 ## Stay in the data state
1023 !!!next-input-character;
1024
1025 !!!emit ($token);
1026
1027 redo A;
1028 } elsif ($self->{state} == TAG_OPEN_STATE) {
1029 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1030 if ($self->{next_char} == 0x002F) { # /
1031 !!!cp (15);
1032 !!!next-input-character;
1033 $self->{state} = CLOSE_TAG_OPEN_STATE;
1034 redo A;
1035 } else {
1036 !!!cp (16);
1037 ## reconsume
1038 $self->{state} = DATA_STATE;
1039
1040 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1041 line => $self->{line_prev},
1042 column => $self->{column_prev},
1043 });
1044
1045 redo A;
1046 }
1047 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
1048 if ($self->{next_char} == 0x0021) { # !
1049 !!!cp (17);
1050 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
1051 !!!next-input-character;
1052 redo A;
1053 } elsif ($self->{next_char} == 0x002F) { # /
1054 !!!cp (18);
1055 $self->{state} = CLOSE_TAG_OPEN_STATE;
1056 !!!next-input-character;
1057 redo A;
1058 } elsif (0x0041 <= $self->{next_char} and
1059 $self->{next_char} <= 0x005A) { # A..Z
1060 !!!cp (19);
1061 $self->{current_token}
1062 = {type => START_TAG_TOKEN,
1063 tag_name => chr ($self->{next_char} + 0x0020),
1064 line => $self->{line_prev},
1065 column => $self->{column_prev}};
1066 $self->{state} = TAG_NAME_STATE;
1067 !!!next-input-character;
1068 redo A;
1069 } elsif (0x0061 <= $self->{next_char} and
1070 $self->{next_char} <= 0x007A) { # a..z
1071 !!!cp (20);
1072 $self->{current_token} = {type => START_TAG_TOKEN,
1073 tag_name => chr ($self->{next_char}),
1074 line => $self->{line_prev},
1075 column => $self->{column_prev}};
1076 $self->{state} = TAG_NAME_STATE;
1077 !!!next-input-character;
1078 redo A;
1079 } elsif ($self->{next_char} == 0x003E) { # >
1080 !!!cp (21);
1081 !!!parse-error (type => 'empty start tag',
1082 line => $self->{line_prev},
1083 column => $self->{column_prev});
1084 $self->{state} = DATA_STATE;
1085 !!!next-input-character;
1086
1087 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
1088 line => $self->{line_prev},
1089 column => $self->{column_prev},
1090 });
1091
1092 redo A;
1093 } elsif ($self->{next_char} == 0x003F) { # ?
1094 !!!cp (22);
1095 !!!parse-error (type => 'pio',
1096 line => $self->{line_prev},
1097 column => $self->{column_prev});
1098 $self->{state} = BOGUS_COMMENT_STATE;
1099 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1100 line => $self->{line_prev},
1101 column => $self->{column_prev},
1102 };
1103 ## $self->{next_char} is intentionally left as is
1104 redo A;
1105 } else {
1106 !!!cp (23);
1107 !!!parse-error (type => 'bare stago',
1108 line => $self->{line_prev},
1109 column => $self->{column_prev});
1110 $self->{state} = DATA_STATE;
1111 ## reconsume
1112
1113 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1114 line => $self->{line_prev},
1115 column => $self->{column_prev},
1116 });
1117
1118 redo A;
1119 }
1120 } else {
1121 die "$0: $self->{content_model} in tag open";
1122 }
1123 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
1124 ## NOTE: The "close tag open state" in the spec is implemented as
1125 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_PCDATA_CLOSE_TAG_STATE|.
1126
1127 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
1128 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1129 if (defined $self->{last_emitted_start_tag_name}) {
1130 $self->{state} = CDATA_PCDATA_CLOSE_TAG_STATE;
1131 $self->{state_keyword} = '';
1132 ## Reconsume.
1133 redo A;
1134 } else {
1135 ## No start tag token has ever been emitted
1136 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
1137 !!!cp (28);
1138 $self->{state} = DATA_STATE;
1139 ## Reconsume.
1140 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1141 line => $l, column => $c,
1142 });
1143 redo A;
1144 }
1145 }
1146
1147 if (0x0041 <= $self->{next_char} and
1148 $self->{next_char} <= 0x005A) { # A..Z
1149 !!!cp (29);
1150 $self->{current_token}
1151 = {type => END_TAG_TOKEN,
1152 tag_name => chr ($self->{next_char} + 0x0020),
1153 line => $l, column => $c};
1154 $self->{state} = TAG_NAME_STATE;
1155 !!!next-input-character;
1156 redo A;
1157 } elsif (0x0061 <= $self->{next_char} and
1158 $self->{next_char} <= 0x007A) { # a..z
1159 !!!cp (30);
1160 $self->{current_token} = {type => END_TAG_TOKEN,
1161 tag_name => chr ($self->{next_char}),
1162 line => $l, column => $c};
1163 $self->{state} = TAG_NAME_STATE;
1164 !!!next-input-character;
1165 redo A;
1166 } elsif ($self->{next_char} == 0x003E) { # >
1167 !!!cp (31);
1168 !!!parse-error (type => 'empty end tag',
1169 line => $self->{line_prev}, ## "<" in "</>"
1170 column => $self->{column_prev} - 1);
1171 $self->{state} = DATA_STATE;
1172 !!!next-input-character;
1173 redo A;
1174 } elsif ($self->{next_char} == -1) {
1175 !!!cp (32);
1176 !!!parse-error (type => 'bare etago');
1177 $self->{state} = DATA_STATE;
1178 # reconsume
1179
1180 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1181 line => $l, column => $c,
1182 });
1183
1184 redo A;
1185 } else {
1186 !!!cp (33);
1187 !!!parse-error (type => 'bogus end tag');
1188 $self->{state} = BOGUS_COMMENT_STATE;
1189 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1190 line => $self->{line_prev}, # "<" of "</"
1191 column => $self->{column_prev} - 1,
1192 };
1193 ## NOTE: $self->{next_char} is intentionally left as is.
1194 ## Although the "anything else" case of the spec does not explicitly
1195 ## state that the next input character is to be reconsumed,
1196 ## it will be included in the |data| of the comment token
1197 ## generated from the bogus end tag, as defined in the
1198 ## "bogus comment state" entry.
1199 redo A;
1200 }
1201 } elsif ($self->{state} == CDATA_PCDATA_CLOSE_TAG_STATE) {
1202 my $ch = substr $self->{last_emitted_start_tag_name}, length $self->{state_keyword}, 1;
1203 if (length $ch) {
1204 my $CH = $ch;
1205 $ch =~ tr/a-z/A-Z/;
1206 my $nch = chr $self->{next_char};
1207 if ($nch eq $ch or $nch eq $CH) {
1208 !!!cp (24);
1209 ## Stay in the state.
1210 $self->{state_keyword} .= $nch;
1211 !!!next-input-character;
1212 redo A;
1213 } else {
1214 !!!cp (25);
1215 $self->{state} = DATA_STATE;
1216 ## Reconsume.
1217 !!!emit ({type => CHARACTER_TOKEN,
1218 data => '</' . $self->{state_keyword},
1219 line => $self->{line_prev},
1220 column => $self->{column_prev} - 1 - length $self->{state_keyword},
1221 });
1222 redo A;
1223 }
1224 } else { # after "</{tag-name}"
1225 unless ({
1226 0x0009 => 1, # HT
1227 0x000A => 1, # LF
1228 0x000B => 1, # VT
1229 0x000C => 1, # FF
1230 0x0020 => 1, # SP
1231 0x003E => 1, # >
1232 0x002F => 1, # /
1233 -1 => 1, # EOF
1234 }->{$self->{next_char}}) {
1235 !!!cp (26);
1236 ## Reconsume.
1237 $self->{state} = DATA_STATE;
1238 !!!emit ({type => CHARACTER_TOKEN,
1239 data => '</' . $self->{state_keyword},
1240 line => $self->{line_prev},
1241 column => $self->{column_prev} - 1 - length $self->{state_keyword},
1242 });
1243 redo A;
1244 } else {
1245 !!!cp (27);
1246 $self->{current_token}
1247 = {type => END_TAG_TOKEN,
1248 tag_name => $self->{last_emitted_start_tag_name},
1249 line => $self->{line_prev},
1250 column => $self->{column_prev} - 1 - length $self->{state_keyword}};
1251 $self->{state} = TAG_NAME_STATE;
1252 ## Reconsume.
1253 redo A;
1254 }
1255 }
1256 } elsif ($self->{state} == TAG_NAME_STATE) {
1257 if ($self->{next_char} == 0x0009 or # HT
1258 $self->{next_char} == 0x000A or # LF
1259 $self->{next_char} == 0x000B or # VT
1260 $self->{next_char} == 0x000C or # FF
1261 $self->{next_char} == 0x0020) { # SP
1262 !!!cp (34);
1263 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1264 !!!next-input-character;
1265 redo A;
1266 } elsif ($self->{next_char} == 0x003E) { # >
1267 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1268 !!!cp (35);
1269 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1270 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1271 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1272 #if ($self->{current_token}->{attributes}) {
1273 # ## NOTE: This should never be reached.
1274 # !!! cp (36);
1275 # !!! parse-error (type => 'end tag attribute');
1276 #} else {
1277 !!!cp (37);
1278 #}
1279 } else {
1280 die "$0: $self->{current_token}->{type}: Unknown token type";
1281 }
1282 $self->{state} = DATA_STATE;
1283 !!!next-input-character;
1284
1285 !!!emit ($self->{current_token}); # start tag or end tag
1286
1287 redo A;
1288 } elsif (0x0041 <= $self->{next_char} and
1289 $self->{next_char} <= 0x005A) { # A..Z
1290 !!!cp (38);
1291 $self->{current_token}->{tag_name} .= chr ($self->{next_char} + 0x0020);
1292 # start tag or end tag
1293 ## Stay in this state
1294 !!!next-input-character;
1295 redo A;
1296 } elsif ($self->{next_char} == -1) {
1297 !!!parse-error (type => 'unclosed tag');
1298 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1299 !!!cp (39);
1300 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1301 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1302 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1303 #if ($self->{current_token}->{attributes}) {
1304 # ## NOTE: This state should never be reached.
1305 # !!! cp (40);
1306 # !!! parse-error (type => 'end tag attribute');
1307 #} else {
1308 !!!cp (41);
1309 #}
1310 } else {
1311 die "$0: $self->{current_token}->{type}: Unknown token type";
1312 }
1313 $self->{state} = DATA_STATE;
1314 # reconsume
1315
1316 !!!emit ($self->{current_token}); # start tag or end tag
1317
1318 redo A;
1319 } elsif ($self->{next_char} == 0x002F) { # /
1320 !!!cp (42);
1321 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1322 !!!next-input-character;
1323 redo A;
1324 } else {
1325 !!!cp (44);
1326 $self->{current_token}->{tag_name} .= chr $self->{next_char};
1327 # start tag or end tag
1328 ## Stay in the state
1329 !!!next-input-character;
1330 redo A;
1331 }
1332 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1333 if ($self->{next_char} == 0x0009 or # HT
1334 $self->{next_char} == 0x000A or # LF
1335 $self->{next_char} == 0x000B or # VT
1336 $self->{next_char} == 0x000C or # FF
1337 $self->{next_char} == 0x0020) { # SP
1338 !!!cp (45);
1339 ## Stay in the state
1340 !!!next-input-character;
1341 redo A;
1342 } elsif ($self->{next_char} == 0x003E) { # >
1343 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1344 !!!cp (46);
1345 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1346 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1347 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1348 if ($self->{current_token}->{attributes}) {
1349 !!!cp (47);
1350 !!!parse-error (type => 'end tag attribute');
1351 } else {
1352 !!!cp (48);
1353 }
1354 } else {
1355 die "$0: $self->{current_token}->{type}: Unknown token type";
1356 }
1357 $self->{state} = DATA_STATE;
1358 !!!next-input-character;
1359
1360 !!!emit ($self->{current_token}); # start tag or end tag
1361
1362 redo A;
1363 } elsif (0x0041 <= $self->{next_char} and
1364 $self->{next_char} <= 0x005A) { # A..Z
1365 !!!cp (49);
1366 $self->{current_attribute}
1367 = {name => chr ($self->{next_char} + 0x0020),
1368 value => '',
1369 line => $self->{line}, column => $self->{column}};
1370 $self->{state} = ATTRIBUTE_NAME_STATE;
1371 !!!next-input-character;
1372 redo A;
1373 } elsif ($self->{next_char} == 0x002F) { # /
1374 !!!cp (50);
1375 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1376 !!!next-input-character;
1377 redo A;
1378 } elsif ($self->{next_char} == -1) {
1379 !!!parse-error (type => 'unclosed tag');
1380 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1381 !!!cp (52);
1382 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1383 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1384 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1385 if ($self->{current_token}->{attributes}) {
1386 !!!cp (53);
1387 !!!parse-error (type => 'end tag attribute');
1388 } else {
1389 !!!cp (54);
1390 }
1391 } else {
1392 die "$0: $self->{current_token}->{type}: Unknown token type";
1393 }
1394 $self->{state} = DATA_STATE;
1395 # reconsume
1396
1397 !!!emit ($self->{current_token}); # start tag or end tag
1398
1399 redo A;
1400 } else {
1401 if ({
1402 0x0022 => 1, # "
1403 0x0027 => 1, # '
1404 0x003D => 1, # =
1405 }->{$self->{next_char}}) {
1406 !!!cp (55);
1407 !!!parse-error (type => 'bad attribute name');
1408 } else {
1409 !!!cp (56);
1410 }
1411 $self->{current_attribute}
1412 = {name => chr ($self->{next_char}),
1413 value => '',
1414 line => $self->{line}, column => $self->{column}};
1415 $self->{state} = ATTRIBUTE_NAME_STATE;
1416 !!!next-input-character;
1417 redo A;
1418 }
1419 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1420 my $before_leave = sub {
1421 if (exists $self->{current_token}->{attributes} # start tag or end tag
1422 ->{$self->{current_attribute}->{name}}) { # MUST
1423 !!!cp (57);
1424 !!!parse-error (type => 'duplicate attribute', text => $self->{current_attribute}->{name}, line => $self->{current_attribute}->{line}, column => $self->{current_attribute}->{column});
1425 ## Discard $self->{current_attribute} # MUST
1426 } else {
1427 !!!cp (58);
1428 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
1429 = $self->{current_attribute};
1430 }
1431 }; # $before_leave
1432
1433 if ($self->{next_char} == 0x0009 or # HT
1434 $self->{next_char} == 0x000A or # LF
1435 $self->{next_char} == 0x000B or # VT
1436 $self->{next_char} == 0x000C or # FF
1437 $self->{next_char} == 0x0020) { # SP
1438 !!!cp (59);
1439 $before_leave->();
1440 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1441 !!!next-input-character;
1442 redo A;
1443 } elsif ($self->{next_char} == 0x003D) { # =
1444 !!!cp (60);
1445 $before_leave->();
1446 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1447 !!!next-input-character;
1448 redo A;
1449 } elsif ($self->{next_char} == 0x003E) { # >
1450 $before_leave->();
1451 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1452 !!!cp (61);
1453 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1454 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1455 !!!cp (62);
1456 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1457 if ($self->{current_token}->{attributes}) {
1458 !!!parse-error (type => 'end tag attribute');
1459 }
1460 } else {
1461 die "$0: $self->{current_token}->{type}: Unknown token type";
1462 }
1463 $self->{state} = DATA_STATE;
1464 !!!next-input-character;
1465
1466 !!!emit ($self->{current_token}); # start tag or end tag
1467
1468 redo A;
1469 } elsif (0x0041 <= $self->{next_char} and
1470 $self->{next_char} <= 0x005A) { # A..Z
1471 !!!cp (63);
1472 $self->{current_attribute}->{name} .= chr ($self->{next_char} + 0x0020);
1473 ## Stay in the state
1474 !!!next-input-character;
1475 redo A;
1476 } elsif ($self->{next_char} == 0x002F) { # /
1477 !!!cp (64);
1478 $before_leave->();
1479 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1480 !!!next-input-character;
1481 redo A;
1482 } elsif ($self->{next_char} == -1) {
1483 !!!parse-error (type => 'unclosed tag');
1484 $before_leave->();
1485 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1486 !!!cp (66);
1487 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1488 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1489 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1490 if ($self->{current_token}->{attributes}) {
1491 !!!cp (67);
1492 !!!parse-error (type => 'end tag attribute');
1493 } else {
1494 ## NOTE: This state should never be reached.
1495 !!!cp (68);
1496 }
1497 } else {
1498 die "$0: $self->{current_token}->{type}: Unknown token type";
1499 }
1500 $self->{state} = DATA_STATE;
1501 # reconsume
1502
1503 !!!emit ($self->{current_token}); # start tag or end tag
1504
1505 redo A;
1506 } else {
1507 if ($self->{next_char} == 0x0022 or # "
1508 $self->{next_char} == 0x0027) { # '
1509 !!!cp (69);
1510 !!!parse-error (type => 'bad attribute name');
1511 } else {
1512 !!!cp (70);
1513 }
1514 $self->{current_attribute}->{name} .= chr ($self->{next_char});
1515 ## Stay in the state
1516 !!!next-input-character;
1517 redo A;
1518 }
1519 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1520 if ($self->{next_char} == 0x0009 or # HT
1521 $self->{next_char} == 0x000A or # LF
1522 $self->{next_char} == 0x000B or # VT
1523 $self->{next_char} == 0x000C or # FF
1524 $self->{next_char} == 0x0020) { # SP
1525 !!!cp (71);
1526 ## Stay in the state
1527 !!!next-input-character;
1528 redo A;
1529 } elsif ($self->{next_char} == 0x003D) { # =
1530 !!!cp (72);
1531 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1532 !!!next-input-character;
1533 redo A;
1534 } elsif ($self->{next_char} == 0x003E) { # >
1535 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1536 !!!cp (73);
1537 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1538 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1539 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1540 if ($self->{current_token}->{attributes}) {
1541 !!!cp (74);
1542 !!!parse-error (type => 'end tag attribute');
1543 } else {
1544 ## NOTE: This state should never be reached.
1545 !!!cp (75);
1546 }
1547 } else {
1548 die "$0: $self->{current_token}->{type}: Unknown token type";
1549 }
1550 $self->{state} = DATA_STATE;
1551 !!!next-input-character;
1552
1553 !!!emit ($self->{current_token}); # start tag or end tag
1554
1555 redo A;
1556 } elsif (0x0041 <= $self->{next_char} and
1557 $self->{next_char} <= 0x005A) { # A..Z
1558 !!!cp (76);
1559 $self->{current_attribute}
1560 = {name => chr ($self->{next_char} + 0x0020),
1561 value => '',
1562 line => $self->{line}, column => $self->{column}};
1563 $self->{state} = ATTRIBUTE_NAME_STATE;
1564 !!!next-input-character;
1565 redo A;
1566 } elsif ($self->{next_char} == 0x002F) { # /
1567 !!!cp (77);
1568 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1569 !!!next-input-character;
1570 redo A;
1571 } elsif ($self->{next_char} == -1) {
1572 !!!parse-error (type => 'unclosed tag');
1573 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1574 !!!cp (79);
1575 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1576 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1577 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1578 if ($self->{current_token}->{attributes}) {
1579 !!!cp (80);
1580 !!!parse-error (type => 'end tag attribute');
1581 } else {
1582 ## NOTE: This state should never be reached.
1583 !!!cp (81);
1584 }
1585 } else {
1586 die "$0: $self->{current_token}->{type}: Unknown token type";
1587 }
1588 $self->{state} = DATA_STATE;
1589 # reconsume
1590
1591 !!!emit ($self->{current_token}); # start tag or end tag
1592
1593 redo A;
1594 } else {
1595 if ($self->{next_char} == 0x0022 or # "
1596 $self->{next_char} == 0x0027) { # '
1597 !!!cp (78);
1598 !!!parse-error (type => 'bad attribute name');
1599 } else {
1600 !!!cp (82);
1601 }
1602 $self->{current_attribute}
1603 = {name => chr ($self->{next_char}),
1604 value => '',
1605 line => $self->{line}, column => $self->{column}};
1606 $self->{state} = ATTRIBUTE_NAME_STATE;
1607 !!!next-input-character;
1608 redo A;
1609 }
1610 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1611 if ($self->{next_char} == 0x0009 or # HT
1612 $self->{next_char} == 0x000A or # LF
1613 $self->{next_char} == 0x000B or # VT
1614 $self->{next_char} == 0x000C or # FF
1615 $self->{next_char} == 0x0020) { # SP
1616 !!!cp (83);
1617 ## Stay in the state
1618 !!!next-input-character;
1619 redo A;
1620 } elsif ($self->{next_char} == 0x0022) { # "
1621 !!!cp (84);
1622 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1623 !!!next-input-character;
1624 redo A;
1625 } elsif ($self->{next_char} == 0x0026) { # &
1626 !!!cp (85);
1627 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1628 ## reconsume
1629 redo A;
1630 } elsif ($self->{next_char} == 0x0027) { # '
1631 !!!cp (86);
1632 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1633 !!!next-input-character;
1634 redo A;
1635 } elsif ($self->{next_char} == 0x003E) { # >
1636 !!!parse-error (type => 'empty unquoted attribute value');
1637 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1638 !!!cp (87);
1639 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1640 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1641 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1642 if ($self->{current_token}->{attributes}) {
1643 !!!cp (88);
1644 !!!parse-error (type => 'end tag attribute');
1645 } else {
1646 ## NOTE: This state should never be reached.
1647 !!!cp (89);
1648 }
1649 } else {
1650 die "$0: $self->{current_token}->{type}: Unknown token type";
1651 }
1652 $self->{state} = DATA_STATE;
1653 !!!next-input-character;
1654
1655 !!!emit ($self->{current_token}); # start tag or end tag
1656
1657 redo A;
1658 } elsif ($self->{next_char} == -1) {
1659 !!!parse-error (type => 'unclosed tag');
1660 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1661 !!!cp (90);
1662 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1663 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1664 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1665 if ($self->{current_token}->{attributes}) {
1666 !!!cp (91);
1667 !!!parse-error (type => 'end tag attribute');
1668 } else {
1669 ## NOTE: This state should never be reached.
1670 !!!cp (92);
1671 }
1672 } else {
1673 die "$0: $self->{current_token}->{type}: Unknown token type";
1674 }
1675 $self->{state} = DATA_STATE;
1676 ## reconsume
1677
1678 !!!emit ($self->{current_token}); # start tag or end tag
1679
1680 redo A;
1681 } else {
1682 if ($self->{next_char} == 0x003D) { # =
1683 !!!cp (93);
1684 !!!parse-error (type => 'bad attribute value');
1685 } else {
1686 !!!cp (94);
1687 }
1688 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1689 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1690 !!!next-input-character;
1691 redo A;
1692 }
1693 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1694 if ($self->{next_char} == 0x0022) { # "
1695 !!!cp (95);
1696 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1697 !!!next-input-character;
1698 redo A;
1699 } elsif ($self->{next_char} == 0x0026) { # &
1700 !!!cp (96);
1701 $self->{last_attribute_value_state} = $self->{state};
1702 ## NOTE: In the spec, the tokenizer is switched to the
1703 ## "entity in attribute value state". In this implementation, the
1704 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1705 ## implementation of the "consume a character reference" algorithm.
1706 $self->{entity_in_attr} = 1;
1707 $self->{entity_additional} = 0x0022; # "
1708 $self->{state} = ENTITY_STATE;
1709 !!!next-input-character;
1710 redo A;
1711 } elsif ($self->{next_char} == -1) {
1712 !!!parse-error (type => 'unclosed attribute value');
1713 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1714 !!!cp (97);
1715 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1716 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1717 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1718 if ($self->{current_token}->{attributes}) {
1719 !!!cp (98);
1720 !!!parse-error (type => 'end tag attribute');
1721 } else {
1722 ## NOTE: This state should never be reached.
1723 !!!cp (99);
1724 }
1725 } else {
1726 die "$0: $self->{current_token}->{type}: Unknown token type";
1727 }
1728 $self->{state} = DATA_STATE;
1729 ## reconsume
1730
1731 !!!emit ($self->{current_token}); # start tag or end tag
1732
1733 redo A;
1734 } else {
1735 !!!cp (100);
1736 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1737 ## Stay in the state
1738 !!!next-input-character;
1739 redo A;
1740 }
1741 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1742 if ($self->{next_char} == 0x0027) { # '
1743 !!!cp (101);
1744 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1745 !!!next-input-character;
1746 redo A;
1747 } elsif ($self->{next_char} == 0x0026) { # &
1748 !!!cp (102);
1749 $self->{last_attribute_value_state} = $self->{state};
1750 ## NOTE: In the spec, the tokenizer is switched to the
1751 ## "entity in attribute value state". In this implementation, the
1752 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1753 ## implementation of the "consume a character reference" algorithm.
1754 $self->{entity_in_attr} = 1;
1755 $self->{entity_additional} = 0x0027; # '
1756 $self->{state} = ENTITY_STATE;
1757 !!!next-input-character;
1758 redo A;
1759 } elsif ($self->{next_char} == -1) {
1760 !!!parse-error (type => 'unclosed attribute value');
1761 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1762 !!!cp (103);
1763 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1764 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1765 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1766 if ($self->{current_token}->{attributes}) {
1767 !!!cp (104);
1768 !!!parse-error (type => 'end tag attribute');
1769 } else {
1770 ## NOTE: This state should never be reached.
1771 !!!cp (105);
1772 }
1773 } else {
1774 die "$0: $self->{current_token}->{type}: Unknown token type";
1775 }
1776 $self->{state} = DATA_STATE;
1777 ## reconsume
1778
1779 !!!emit ($self->{current_token}); # start tag or end tag
1780
1781 redo A;
1782 } else {
1783 !!!cp (106);
1784 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1785 ## Stay in the state
1786 !!!next-input-character;
1787 redo A;
1788 }
1789 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1790 if ($self->{next_char} == 0x0009 or # HT
1791 $self->{next_char} == 0x000A or # LF
1792 $self->{next_char} == 0x000B or # VT
1793 $self->{next_char} == 0x000C or # FF
1794 $self->{next_char} == 0x0020) { # SP
1795 !!!cp (107);
1796 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1797 !!!next-input-character;
1798 redo A;
1799 } elsif ($self->{next_char} == 0x0026) { # &
1800 !!!cp (108);
1801 $self->{last_attribute_value_state} = $self->{state};
1802 ## NOTE: In the spec, the tokenizer is switched to the
1803 ## "entity in attribute value state". In this implementation, the
1804 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1805 ## implementation of the "consume a character reference" algorithm.
1806 $self->{entity_in_attr} = 1;
1807 $self->{entity_additional} = -1;
1808 $self->{state} = ENTITY_STATE;
1809 !!!next-input-character;
1810 redo A;
1811 } elsif ($self->{next_char} == 0x003E) { # >
1812 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1813 !!!cp (109);
1814 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1815 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1816 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1817 if ($self->{current_token}->{attributes}) {
1818 !!!cp (110);
1819 !!!parse-error (type => 'end tag attribute');
1820 } else {
1821 ## NOTE: This state should never be reached.
1822 !!!cp (111);
1823 }
1824 } else {
1825 die "$0: $self->{current_token}->{type}: Unknown token type";
1826 }
1827 $self->{state} = DATA_STATE;
1828 !!!next-input-character;
1829
1830 !!!emit ($self->{current_token}); # start tag or end tag
1831
1832 redo A;
1833 } elsif ($self->{next_char} == -1) {
1834 !!!parse-error (type => 'unclosed tag');
1835 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1836 !!!cp (112);
1837 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1838 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1839 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1840 if ($self->{current_token}->{attributes}) {
1841 !!!cp (113);
1842 !!!parse-error (type => 'end tag attribute');
1843 } else {
1844 ## NOTE: This state should never be reached.
1845 !!!cp (114);
1846 }
1847 } else {
1848 die "$0: $self->{current_token}->{type}: Unknown token type";
1849 }
1850 $self->{state} = DATA_STATE;
1851 ## reconsume
1852
1853 !!!emit ($self->{current_token}); # start tag or end tag
1854
1855 redo A;
1856 } else {
1857 if ({
1858 0x0022 => 1, # "
1859 0x0027 => 1, # '
1860 0x003D => 1, # =
1861 }->{$self->{next_char}}) {
1862 !!!cp (115);
1863 !!!parse-error (type => 'bad attribute value');
1864 } else {
1865 !!!cp (116);
1866 }
1867 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1868 ## Stay in the state
1869 !!!next-input-character;
1870 redo A;
1871 }
1872 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
## Decides what follows a quoted attribute value (per the state name):
##   - HT/LF/VT/FF/SP : consume, go to BEFORE_ATTRIBUTE_NAME_STATE;
##   - ">"            : consume, emit the current tag token, go to
##                      DATA_STATE;
##   - "/"            : consume, go to SELF_CLOSING_START_TAG_STATE;
##   - EOF (-1)       : report 'unclosed tag', emit the current tag
##                      token and reconsume the EOF in DATA_STATE;
##   - anything else  : report 'no space between attributes' and
##                      reconsume in BEFORE_ATTRIBUTE_NAME_STATE.
1873 if ($self->{next_char} == 0x0009 or # HT
1874 $self->{next_char} == 0x000A or # LF
1875 $self->{next_char} == 0x000B or # VT
1876 $self->{next_char} == 0x000C or # FF
1877 $self->{next_char} == 0x0020) { # SP
1878 !!!cp (118);
1879 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1880 !!!next-input-character;
1881 redo A;
1882 } elsif ($self->{next_char} == 0x003E) { # >
## A start tag records its name before being emitted; an end tag
## resets the content model to PCDATA and reports any attributes
## it carries as a parse error.
1883 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1884 !!!cp (119);
1885 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1886 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1887 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1888 if ($self->{current_token}->{attributes}) {
1889 !!!cp (120);
1890 !!!parse-error (type => 'end tag attribute');
1891 } else {
1892 ## NOTE: This state should never be reached.
1893 !!!cp (121);
1894 }
1895 } else {
1896 die "$0: $self->{current_token}->{type}: Unknown token type";
1897 }
1898 $self->{state} = DATA_STATE;
1899 !!!next-input-character;
1900
1901 !!!emit ($self->{current_token}); # start tag or end tag
1902
1903 redo A;
1904 } elsif ($self->{next_char} == 0x002F) { # /
1905 !!!cp (122);
1906 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1907 !!!next-input-character;
1908 redo A;
1909 } elsif ($self->{next_char} == -1) {
1910 !!!parse-error (type => 'unclosed tag');
1911 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1912 !!!cp (122.3);
1913 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1914 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
## NOTE(review): unlike the ">" branch above, this EOF path emits
## the end tag without setting content_model to
## PCDATA_CONTENT_MODEL -- confirm whether that is intentional.
1915 if ($self->{current_token}->{attributes}) {
1916 !!!cp (122.1);
1917 !!!parse-error (type => 'end tag attribute');
1918 } else {
1919 ## NOTE: This state should never be reached.
1920 !!!cp (122.2);
1921 }
1922 } else {
1923 die "$0: $self->{current_token}->{type}: Unknown token type";
1924 }
1925 $self->{state} = DATA_STATE;
1926 ## Reconsume.
1927 !!!emit ($self->{current_token}); # start tag or end tag
1928 redo A;
1929 } else {
1930 !!!cp ('124.1');
1931 !!!parse-error (type => 'no space between attributes');
1932 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1933 ## reconsume
1934 redo A;
1935 }
1936 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
## Entered after a "/" inside a tag.  A following ">" finishes the
## tag (setting the self_closing flag for a start tag, or reporting
## 'nestc' for an end tag), EOF emits the unfinished tag, and any
## other character is reported and reconsumed as the start of an
## attribute name.
## NOTE(review): checkpoint ids '124.4' and 124.5 are each used
## twice within this state, so the two sites cannot be told apart
## by id -- confirm against the checkpoint list.
1937 if ($self->{next_char} == 0x003E) { # >
1938 if ($self->{current_token}->{type} == END_TAG_TOKEN) {
1939 !!!cp ('124.2');
1940 !!!parse-error (type => 'nestc', token => $self->{current_token});
1941 ## TODO: Different type than slash in start tag
1942 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1943 if ($self->{current_token}->{attributes}) {
1944 !!!cp ('124.4');
1945 !!!parse-error (type => 'end tag attribute');
1946 } else {
1947 !!!cp ('124.5');
1948 }
1949 ## TODO: Test |<title></title/>|
1950 } else {
## Any token type other than END_TAG_TOKEN is treated as a start
## tag here; no "Unknown token type" check is made in this branch.
## NOTE(review): last_emitted_start_tag_name is not updated on
## this path, although the EOF branch below does update it --
## confirm whether that is intentional.
1951 !!!cp ('124.3');
1952 $self->{self_closing} = 1;
1953 }
1954
1955 $self->{state} = DATA_STATE;
1956 !!!next-input-character;
1957
1958 !!!emit ($self->{current_token}); # start tag or end tag
1959
1960 redo A;
1961 } elsif ($self->{next_char} == -1) {
1962 !!!parse-error (type => 'unclosed tag');
1963 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1964 !!!cp (124.7);
1965 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1966 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
## NOTE(review): content_model is not reset to
## PCDATA_CONTENT_MODEL here, unlike the ">" end-tag path above.
1967 if ($self->{current_token}->{attributes}) {
1968 !!!cp (124.5);
1969 !!!parse-error (type => 'end tag attribute');
1970 } else {
1971 ## NOTE: This state should never be reached.
1972 !!!cp (124.6);
1973 }
1974 } else {
1975 die "$0: $self->{current_token}->{type}: Unknown token type";
1976 }
1977 $self->{state} = DATA_STATE;
1978 ## Reconsume.
1979 !!!emit ($self->{current_token}); # start tag or end tag
1980 redo A;
1981 } else {
1982 !!!cp ('124.4');
1983 !!!parse-error (type => 'nestc');
1984 ## TODO: This error type is wrong.
1985 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1986 ## Reconsume.
1987 redo A;
1988 }
1989 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1990 ## (only happen if PCDATA state)
1991
1992 ## NOTE: Unlike spec's "bogus comment state", this implementation
1993 ## consumes characters one-by-one basis.
1994
1995 if ($self->{next_char} == 0x003E) { # >
1996 !!!cp (124);
1997 $self->{state} = DATA_STATE;
1998 !!!next-input-character;
1999
2000 !!!emit ($self->{current_token}); # comment
2001 redo A;
2002 } elsif ($self->{next_char} == -1) {
2003 !!!cp (125);
2004 $self->{state} = DATA_STATE;
2005 ## reconsume
2006
2007 !!!emit ($self->{current_token}); # comment
2008 redo A;
2009 } else {
2010 !!!cp (126);
2011 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
2012 ## Stay in the state.
2013 !!!next-input-character;
2014 redo A;
2015 }
2016 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2017 ## (only happen if PCDATA state)
2018
2019 if ($self->{next_char} == 0x002D) { # -
2020 !!!cp (133);
2021 $self->{state} = MD_HYPHEN_STATE;
2022 !!!next-input-character;
2023 redo A;
2024 } elsif ($self->{next_char} == 0x0044 or # D
2025 $self->{next_char} == 0x0064) { # d
2026 ## ASCII case-insensitive.
2027 !!!cp (130);
2028 $self->{state} = MD_DOCTYPE_STATE;
2029 $self->{state_keyword} = chr $self->{next_char};
2030 !!!next-input-character;
2031 redo A;
2032 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2033 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
2034 $self->{next_char} == 0x005B) { # [
2035 !!!cp (135.4);
2036 $self->{state} = MD_CDATA_STATE;
2037 $self->{state_keyword} = '[';
2038 !!!next-input-character;
2039 redo A;
2040 } else {
2041 !!!cp (136);
2042 }
2043
2044 !!!parse-error (type => 'bogus comment',
2045 line => $self->{line_prev},
2046 column => $self->{column_prev} - 1);
2047 ## Reconsume.
2048 $self->{state} = BOGUS_COMMENT_STATE;
2049 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
2050 line => $self->{line_prev},
2051 column => $self->{column_prev} - 1,
2052 };
2053 redo A;
2054 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2055 if ($self->{next_char} == 0x002D) { # -
2056 !!!cp (127);
2057 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
2058 line => $self->{line_prev},
2059 column => $self->{column_prev} - 2,
2060 };
2061 $self->{state} = COMMENT_START_STATE;
2062 !!!next-input-character;
2063 redo A;
2064 } else {
2065 !!!cp (128);
2066 !!!parse-error (type => 'bogus comment',
2067 line => $self->{line_prev},
2068 column => $self->{column_prev} - 2);
2069 $self->{state} = BOGUS_COMMENT_STATE;
2070 ## Reconsume.
2071 $self->{current_token} = {type => COMMENT_TOKEN,
2072 data => '-',
2073 line => $self->{line_prev},
2074 column => $self->{column_prev} - 2,
2075 };
2076 redo A;
2077 }
2078 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
## Matches the rest of the |DOCTYPE| keyword one character per
## iteration.  |state_keyword| holds what has been matched so far
## (it starts as the "D" or "d" stored by
## MARKUP_DECLARATION_OPEN_STATE), so its length selects the next
## expected letter from the upper- and lower-case tables below.
## Once six characters are held the table lookups yield undef and
## the final "E"/"e" is tested by the second branch instead.
## NOTE(review): comparing |next_char| to undef with |==| treats
## undef as 0, so a |next_char| of 0 would be accepted as a keyword
## character at that point -- confirm the input layer never
## delivers 0.
## On any mismatch the characters collected so far become the data
## of a bogus comment token and the current character is
## reconsumed in BOGUS_COMMENT_STATE.
2079 ## ASCII case-insensitive.
2080 if ($self->{next_char} == [
2081 undef,
2082 0x004F, # O
2083 0x0043, # C
2084 0x0054, # T
2085 0x0059, # Y
2086 0x0050, # P
2087 ]->[length $self->{state_keyword}] or
2088 $self->{next_char} == [
2089 undef,
2090 0x006F, # o
2091 0x0063, # c
2092 0x0074, # t
2093 0x0079, # y
2094 0x0070, # p
2095 ]->[length $self->{state_keyword}]) {
2096 !!!cp (131);
2097 ## Stay in the state.
2098 $self->{state_keyword} .= chr $self->{next_char};
2099 !!!next-input-character;
2100 redo A;
2101 } elsif ((length $self->{state_keyword}) == 6 and
2102 ($self->{next_char} == 0x0045 or # E
2103 $self->{next_char} == 0x0065)) { # e
## Keyword complete: start a DOCTYPE token.  The quirks flag is
## set up front; BEFORE_DOCTYPE_NAME_STATE deletes it once a name
## character is seen.
2104 !!!cp (129);
2105 $self->{state} = DOCTYPE_STATE;
2106 $self->{current_token} = {type => DOCTYPE_TOKEN,
2107 quirks => 1,
2108 line => $self->{line_prev},
2109 column => $self->{column_prev} - 7,
2110 };
2111 !!!next-input-character;
2112 redo A;
2113 } else {
2114 !!!cp (132);
2115 !!!parse-error (type => 'bogus comment',
2116 line => $self->{line_prev},
2117 column => $self->{column_prev} - 1 - length $self->{state_keyword});
2118 $self->{state} = BOGUS_COMMENT_STATE;
2119 ## Reconsume.
2120 $self->{current_token} = {type => COMMENT_TOKEN,
2121 data => $self->{state_keyword},
2122 line => $self->{line_prev},
2123 column => $self->{column_prev} - 1 - length $self->{state_keyword},
2124 };
2125 redo A;
2126 }
2127 } elsif ($self->{state} == MD_CDATA_STATE) {
2128 if ($self->{next_char} == {
2129 '[' => 0x0043, # C
2130 '[C' => 0x0044, # D
2131 '[CD' => 0x0041, # A
2132 '[CDA' => 0x0054, # T
2133 '[CDAT' => 0x0041, # A
2134 }->{$self->{state_keyword}}) {
2135 !!!cp (135.1);
2136 ## Stay in the state.
2137 $self->{state_keyword} .= chr $self->{next_char};
2138 !!!next-input-character;
2139 redo A;
2140 } elsif ($self->{state_keyword} eq '[CDATA' and
2141 $self->{next_char} == 0x005B) { # [
2142 !!!cp (135.2);
2143 $self->{current_token} = {type => CHARACTER_TOKEN,
2144 data => '',
2145 line => $self->{line_prev},
2146 column => $self->{column_prev} - 7};
2147 $self->{state} = CDATA_SECTION_STATE;
2148 !!!next-input-character;
2149 redo A;
2150 } else {
2151 !!!cp (135.3);
2152 !!!parse-error (type => 'bogus comment',
2153 line => $self->{line_prev},
2154 column => $self->{column_prev} - 1 - length $self->{state_keyword});
2155 $self->{state} = BOGUS_COMMENT_STATE;
2156 ## Reconsume.
2157 $self->{current_token} = {type => COMMENT_TOKEN,
2158 data => $self->{state_keyword},
2159 line => $self->{line_prev},
2160 column => $self->{column_prev} - 1 - length $self->{state_keyword},
2161 };
2162 redo A;
2163 }
2164 } elsif ($self->{state} == COMMENT_START_STATE) {
2165 if ($self->{next_char} == 0x002D) { # -
2166 !!!cp (137);
2167 $self->{state} = COMMENT_START_DASH_STATE;
2168 !!!next-input-character;
2169 redo A;
2170 } elsif ($self->{next_char} == 0x003E) { # >
2171 !!!cp (138);
2172 !!!parse-error (type => 'bogus comment');
2173 $self->{state} = DATA_STATE;
2174 !!!next-input-character;
2175
2176 !!!emit ($self->{current_token}); # comment
2177
2178 redo A;
2179 } elsif ($self->{next_char} == -1) {
2180 !!!cp (139);
2181 !!!parse-error (type => 'unclosed comment');
2182 $self->{state} = DATA_STATE;
2183 ## reconsume
2184
2185 !!!emit ($self->{current_token}); # comment
2186
2187 redo A;
2188 } else {
2189 !!!cp (140);
2190 $self->{current_token}->{data} # comment
2191 .= chr ($self->{next_char});
2192 $self->{state} = COMMENT_STATE;
2193 !!!next-input-character;
2194 redo A;
2195 }
2196 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2197 if ($self->{next_char} == 0x002D) { # -
2198 !!!cp (141);
2199 $self->{state} = COMMENT_END_STATE;
2200 !!!next-input-character;
2201 redo A;
2202 } elsif ($self->{next_char} == 0x003E) { # >
2203 !!!cp (142);
2204 !!!parse-error (type => 'bogus comment');
2205 $self->{state} = DATA_STATE;
2206 !!!next-input-character;
2207
2208 !!!emit ($self->{current_token}); # comment
2209
2210 redo A;
2211 } elsif ($self->{next_char} == -1) {
2212 !!!cp (143);
2213 !!!parse-error (type => 'unclosed comment');
2214 $self->{state} = DATA_STATE;
2215 ## reconsume
2216
2217 !!!emit ($self->{current_token}); # comment
2218
2219 redo A;
2220 } else {
2221 !!!cp (144);
2222 $self->{current_token}->{data} # comment
2223 .= '-' . chr ($self->{next_char});
2224 $self->{state} = COMMENT_STATE;
2225 !!!next-input-character;
2226 redo A;
2227 }
2228 } elsif ($self->{state} == COMMENT_STATE) {
2229 if ($self->{next_char} == 0x002D) { # -
2230 !!!cp (145);
2231 $self->{state} = COMMENT_END_DASH_STATE;
2232 !!!next-input-character;
2233 redo A;
2234 } elsif ($self->{next_char} == -1) {
2235 !!!cp (146);
2236 !!!parse-error (type => 'unclosed comment');
2237 $self->{state} = DATA_STATE;
2238 ## reconsume
2239
2240 !!!emit ($self->{current_token}); # comment
2241
2242 redo A;
2243 } else {
2244 !!!cp (147);
2245 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
2246 ## Stay in the state
2247 !!!next-input-character;
2248 redo A;
2249 }
2250 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2251 if ($self->{next_char} == 0x002D) { # -
2252 !!!cp (148);
2253 $self->{state} = COMMENT_END_STATE;
2254 !!!next-input-character;
2255 redo A;
2256 } elsif ($self->{next_char} == -1) {
2257 !!!cp (149);
2258 !!!parse-error (type => 'unclosed comment');
2259 $self->{state} = DATA_STATE;
2260 ## reconsume
2261
2262 !!!emit ($self->{current_token}); # comment
2263
2264 redo A;
2265 } else {
2266 !!!cp (150);
2267 $self->{current_token}->{data} .= '-' . chr ($self->{next_char}); # comment
2268 $self->{state} = COMMENT_STATE;
2269 !!!next-input-character;
2270 redo A;
2271 }
2272 } elsif ($self->{state} == COMMENT_END_STATE) {
2273 if ($self->{next_char} == 0x003E) { # >
2274 !!!cp (151);
2275 $self->{state} = DATA_STATE;
2276 !!!next-input-character;
2277
2278 !!!emit ($self->{current_token}); # comment
2279
2280 redo A;
2281 } elsif ($self->{next_char} == 0x002D) { # -
2282 !!!cp (152);
2283 !!!parse-error (type => 'dash in comment',
2284 line => $self->{line_prev},
2285 column => $self->{column_prev});
2286 $self->{current_token}->{data} .= '-'; # comment
2287 ## Stay in the state
2288 !!!next-input-character;
2289 redo A;
2290 } elsif ($self->{next_char} == -1) {
2291 !!!cp (153);
2292 !!!parse-error (type => 'unclosed comment');
2293 $self->{state} = DATA_STATE;
2294 ## reconsume
2295
2296 !!!emit ($self->{current_token}); # comment
2297
2298 redo A;
2299 } else {
2300 !!!cp (154);
2301 !!!parse-error (type => 'dash in comment',
2302 line => $self->{line_prev},
2303 column => $self->{column_prev});
2304 $self->{current_token}->{data} .= '--' . chr ($self->{next_char}); # comment
2305 $self->{state} = COMMENT_STATE;
2306 !!!next-input-character;
2307 redo A;
2308 }
2309 } elsif ($self->{state} == DOCTYPE_STATE) {
2310 if ($self->{next_char} == 0x0009 or # HT
2311 $self->{next_char} == 0x000A or # LF
2312 $self->{next_char} == 0x000B or # VT
2313 $self->{next_char} == 0x000C or # FF
2314 $self->{next_char} == 0x0020) { # SP
2315 !!!cp (155);
2316 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2317 !!!next-input-character;
2318 redo A;
2319 } else {
2320 !!!cp (156);
2321 !!!parse-error (type => 'no space before DOCTYPE name');
2322 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2323 ## reconsume
2324 redo A;
2325 }
2326 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2327 if ($self->{next_char} == 0x0009 or # HT
2328 $self->{next_char} == 0x000A or # LF
2329 $self->{next_char} == 0x000B or # VT
2330 $self->{next_char} == 0x000C or # FF
2331 $self->{next_char} == 0x0020) { # SP
2332 !!!cp (157);
2333 ## Stay in the state
2334 !!!next-input-character;
2335 redo A;
2336 } elsif ($self->{next_char} == 0x003E) { # >
2337 !!!cp (158);
2338 !!!parse-error (type => 'no DOCTYPE name');
2339 $self->{state} = DATA_STATE;
2340 !!!next-input-character;
2341
2342 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2343
2344 redo A;
2345 } elsif ($self->{next_char} == -1) {
2346 !!!cp (159);
2347 !!!parse-error (type => 'no DOCTYPE name');
2348 $self->{state} = DATA_STATE;
2349 ## reconsume
2350
2351 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2352
2353 redo A;
2354 } else {
2355 !!!cp (160);
2356 $self->{current_token}->{name} = chr $self->{next_char};
2357 delete $self->{current_token}->{quirks};
2358 ## ISSUE: "Set the token's name name to the" in the spec
2359 $self->{state} = DOCTYPE_NAME_STATE;
2360 !!!next-input-character;
2361 redo A;
2362 }
2363 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2364 ## ISSUE: Redundant "First," in the spec.
2365 if ($self->{next_char} == 0x0009 or # HT
2366 $self->{next_char} == 0x000A or # LF
2367 $self->{next_char} == 0x000B or # VT
2368 $self->{next_char} == 0x000C or # FF
2369 $self->{next_char} == 0x0020) { # SP
2370 !!!cp (161);
2371 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2372 !!!next-input-character;
2373 redo A;
2374 } elsif ($self->{next_char} == 0x003E) { # >
2375 !!!cp (162);
2376 $self->{state} = DATA_STATE;
2377 !!!next-input-character;
2378
2379 !!!emit ($self->{current_token}); # DOCTYPE
2380
2381 redo A;
2382 } elsif ($self->{next_char} == -1) {
2383 !!!cp (163);
2384 !!!parse-error (type => 'unclosed DOCTYPE');
2385 $self->{state} = DATA_STATE;
2386 ## reconsume
2387
2388 $self->{current_token}->{quirks} = 1;
2389 !!!emit ($self->{current_token}); # DOCTYPE
2390
2391 redo A;
2392 } else {
2393 !!!cp (164);
2394 $self->{current_token}->{name}
2395 .= chr ($self->{next_char}); # DOCTYPE
2396 ## Stay in the state
2397 !!!next-input-character;
2398 redo A;
2399 }
2400 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2401 if ($self->{next_char} == 0x0009 or # HT
2402 $self->{next_char} == 0x000A or # LF
2403 $self->{next_char} == 0x000B or # VT
2404 $self->{next_char} == 0x000C or # FF
2405 $self->{next_char} == 0x0020) { # SP
2406 !!!cp (165);
2407 ## Stay in the state
2408 !!!next-input-character;
2409 redo A;
2410 } elsif ($self->{next_char} == 0x003E) { # >
2411 !!!cp (166);
2412 $self->{state} = DATA_STATE;
2413 !!!next-input-character;
2414
2415 !!!emit ($self->{current_token}); # DOCTYPE
2416
2417 redo A;
2418 } elsif ($self->{next_char} == -1) {
2419 !!!cp (167);
2420 !!!parse-error (type => 'unclosed DOCTYPE');
2421 $self->{state} = DATA_STATE;
2422 ## reconsume
2423
2424 $self->{current_token}->{quirks} = 1;
2425 !!!emit ($self->{current_token}); # DOCTYPE
2426
2427 redo A;
2428 } elsif ($self->{next_char} == 0x0050 or # P
2429 $self->{next_char} == 0x0070) { # p
2430 $self->{state} = PUBLIC_STATE;
2431 $self->{state_keyword} = chr $self->{next_char};
2432 !!!next-input-character;
2433 redo A;
2434 } elsif ($self->{next_char} == 0x0053 or # S
2435 $self->{next_char} == 0x0073) { # s
2436 $self->{state} = SYSTEM_STATE;
2437 $self->{state_keyword} = chr $self->{next_char};
2438 !!!next-input-character;
2439 redo A;
2440 } else {
2441 !!!cp (180);
2442 !!!parse-error (type => 'string after DOCTYPE name');
2443 $self->{current_token}->{quirks} = 1;
2444
2445 $self->{state} = BOGUS_DOCTYPE_STATE;
2446 !!!next-input-character;
2447 redo A;
2448 }
2449 } elsif ($self->{state} == PUBLIC_STATE) {
2450 ## ASCII case-insensitive
2451 if ($self->{next_char} == [
2452 undef,
2453 0x0055, # U
2454 0x0042, # B
2455 0x004C, # L
2456 0x0049, # I
2457 ]->[length $self->{state_keyword}] or
2458 $self->{next_char} == [
2459 undef,
2460 0x0075, # u
2461 0x0062, # b
2462 0x006C, # l
2463 0x0069, # i
2464 ]->[length $self->{state_keyword}]) {
2465 !!!cp (175);
2466 ## Stay in the state.
2467 $self->{state_keyword} .= chr $self->{next_char};
2468 !!!next-input-character;
2469 redo A;
2470 } elsif ((length $self->{state_keyword}) == 5 and
2471 ($self->{next_char} == 0x0043 or # C
2472 $self->{next_char} == 0x0063)) { # c
2473 !!!cp (168);
2474 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2475 !!!next-input-character;
2476 redo A;
2477 } else {
2478 !!!cp (169);
2479 !!!parse-error (type => 'string after DOCTYPE name',
2480 line => $self->{line_prev},
2481 column => $self->{column_prev} + 1 - length $self->{state_keyword});
2482 $self->{current_token}->{quirks} = 1;
2483
2484 $self->{state} = BOGUS_DOCTYPE_STATE;
2485 ## Reconsume.
2486 redo A;
2487 }
2488 } elsif ($self->{state} == SYSTEM_STATE) {
2489 ## ASCII case-insensitive
2490 if ($self->{next_char} == [
2491 undef,
2492 0x0059, # Y
2493 0x0053, # S
2494 0x0054, # T
2495 0x0045, # E
2496 ]->[length $self->{state_keyword}] or
2497 $self->{next_char} == [
2498 undef,
2499 0x0079, # y
2500 0x0073, # s
2501 0x0074, # t
2502 0x0065, # e
2503 ]->[length $self->{state_keyword}]) {
2504 !!!cp (170);
2505 ## Stay in the state.
2506 $self->{state_keyword} .= chr $self->{next_char};
2507 !!!next-input-character;
2508 redo A;
2509 } elsif ((length $self->{state_keyword}) == 5 and
2510 ($self->{next_char} == 0x004D or # M
2511 $self->{next_char} == 0x006D)) { # m
2512 !!!cp (171);
2513 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2514 !!!next-input-character;
2515 redo A;
2516 } else {
2517 !!!cp (172);
2518 !!!parse-error (type => 'string after DOCTYPE name',
2519 line => $self->{line_prev},
2520 column => $self->{column_prev} + 1 - length $self->{state_keyword});
2521 $self->{current_token}->{quirks} = 1;
2522
2523 $self->{state} = BOGUS_DOCTYPE_STATE;
2524 ## Reconsume.
2525 redo A;
2526 }
2527 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
## Handles the input that follows the PUBLIC keyword of a DOCTYPE
## (PUBLIC_STATE switches here after the final "C"/"c"):
##   - HT/LF/VT/FF/SP : skipped;
##   - '"' or "'"     : start an empty public identifier and switch
##                      to the matching quoted-identifier state;
##   - ">"            : 'no PUBLIC literal'; emit the DOCTYPE token
##                      with the quirks flag set;
##   - EOF (-1)       : 'unclosed DOCTYPE'; emit with quirks set and
##                      reconsume the EOF in DATA_STATE;
##   - anything else  : 'string after PUBLIC'; set quirks and skip
##                      the rest in BOGUS_DOCTYPE_STATE.
## |next_char| holds a code point number, so every comparison uses
## the numeric |==| operator, as in the neighbouring states.
2528 if ({
2529 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2530 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2531 }->{$self->{next_char}}) {
2532 !!!cp (181);
2533 ## Stay in the state
2534 !!!next-input-character;
2535 redo A;
2536 } elsif ($self->{next_char} == 0x0022) { # "
2537 !!!cp (182);
2538 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2539 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2540 !!!next-input-character;
2541 redo A;
2542 } elsif ($self->{next_char} == 0x0027) { # '
2543 !!!cp (183);
2544 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2545 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2546 !!!next-input-character;
2547 redo A;
2548 } elsif ($self->{next_char} == 0x003E) { # >
2549 !!!cp (184);
2550 !!!parse-error (type => 'no PUBLIC literal');
2551
2552 $self->{state} = DATA_STATE;
2553 !!!next-input-character;
2554
2555 $self->{current_token}->{quirks} = 1;
2556 !!!emit ($self->{current_token}); # DOCTYPE
2557
2558 redo A;
2559 } elsif ($self->{next_char} == -1) {
2560 !!!cp (185);
2561 !!!parse-error (type => 'unclosed DOCTYPE');
2562
2563 $self->{state} = DATA_STATE;
2564 ## reconsume
2565
2566 $self->{current_token}->{quirks} = 1;
2567 !!!emit ($self->{current_token}); # DOCTYPE
2568
2569 redo A;
2570 } else {
2571 !!!cp (186);
2572 !!!parse-error (type => 'string after PUBLIC');
2573 $self->{current_token}->{quirks} = 1;
2574
2575 $self->{state} = BOGUS_DOCTYPE_STATE;
2576 !!!next-input-character;
2577 redo A;
2578 }
2579 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2580 if ($self->{next_char} == 0x0022) { # "
2581 !!!cp (187);
2582 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2583 !!!next-input-character;
2584 redo A;
2585 } elsif ($self->{next_char} == 0x003E) { # >
2586 !!!cp (188);
2587 !!!parse-error (type => 'unclosed PUBLIC literal');
2588
2589 $self->{state} = DATA_STATE;
2590 !!!next-input-character;
2591
2592 $self->{current_token}->{quirks} = 1;
2593 !!!emit ($self->{current_token}); # DOCTYPE
2594
2595 redo A;
2596 } elsif ($self->{next_char} == -1) {
2597 !!!cp (189);
2598 !!!parse-error (type => 'unclosed PUBLIC literal');
2599
2600 $self->{state} = DATA_STATE;
2601 ## reconsume
2602
2603 $self->{current_token}->{quirks} = 1;
2604 !!!emit ($self->{current_token}); # DOCTYPE
2605
2606 redo A;
2607 } else {
2608 !!!cp (190);
2609 $self->{current_token}->{public_identifier} # DOCTYPE
2610 .= chr $self->{next_char};
2611 ## Stay in the state
2612 !!!next-input-character;
2613 redo A;
2614 }
2615 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2616 if ($self->{next_char} == 0x0027) { # '
2617 !!!cp (191);
2618 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2619 !!!next-input-character;
2620 redo A;
2621 } elsif ($self->{next_char} == 0x003E) { # >
2622 !!!cp (192);
2623 !!!parse-error (type => 'unclosed PUBLIC literal');
2624
2625 $self->{state} = DATA_STATE;
2626 !!!next-input-character;
2627
2628 $self->{current_token}->{quirks} = 1;
2629 !!!emit ($self->{current_token}); # DOCTYPE
2630
2631 redo A;
2632 } elsif ($self->{next_char} == -1) {
2633 !!!cp (193);
2634 !!!parse-error (type => 'unclosed PUBLIC literal');
2635
2636 $self->{state} = DATA_STATE;
2637 ## reconsume
2638
2639 $self->{current_token}->{quirks} = 1;
2640 !!!emit ($self->{current_token}); # DOCTYPE
2641
2642 redo A;
2643 } else {
2644 !!!cp (194);
2645 $self->{current_token}->{public_identifier} # DOCTYPE
2646 .= chr $self->{next_char};
2647 ## Stay in the state
2648 !!!next-input-character;
2649 redo A;
2650 }
2651 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2652 if ({
2653 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2654 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2655 }->{$self->{next_char}}) {
2656 !!!cp (195);
2657 ## Stay in the state
2658 !!!next-input-character;
2659 redo A;
2660 } elsif ($self->{next_char} == 0x0022) { # "
2661 !!!cp (196);
2662 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2663 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2664 !!!next-input-character;
2665 redo A;
2666 } elsif ($self->{next_char} == 0x0027) { # '
2667 !!!cp (197);
2668 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2669 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2670 !!!next-input-character;
2671 redo A;
2672 } elsif ($self->{next_char} == 0x003E) { # >
2673 !!!cp (198);
2674 $self->{state} = DATA_STATE;
2675 !!!next-input-character;
2676
2677 !!!emit ($self->{current_token}); # DOCTYPE
2678
2679 redo A;
2680 } elsif ($self->{next_char} == -1) {
2681 !!!cp (199);
2682 !!!parse-error (type => 'unclosed DOCTYPE');
2683
2684 $self->{state} = DATA_STATE;
2685 ## reconsume
2686
2687 $self->{current_token}->{quirks} = 1;
2688 !!!emit ($self->{current_token}); # DOCTYPE
2689
2690 redo A;
2691 } else {
2692 !!!cp (200);
2693 !!!parse-error (type => 'string after PUBLIC literal');
2694 $self->{current_token}->{quirks} = 1;
2695
2696 $self->{state} = BOGUS_DOCTYPE_STATE;
2697 !!!next-input-character;
2698 redo A;
2699 }
2700 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2701 if ({
2702 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2703 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2704 }->{$self->{next_char}}) {
2705 !!!cp (201);
2706 ## Stay in the state
2707 !!!next-input-character;
2708 redo A;
2709 } elsif ($self->{next_char} == 0x0022) { # "
2710 !!!cp (202);
2711 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2712 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2713 !!!next-input-character;
2714 redo A;
2715 } elsif ($self->{next_char} == 0x0027) { # '
2716 !!!cp (203);
2717 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2718 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2719 !!!next-input-character;
2720 redo A;
2721 } elsif ($self->{next_char} == 0x003E) { # >
2722 !!!cp (204);
2723 !!!parse-error (type => 'no SYSTEM literal');
2724 $self->{state} = DATA_STATE;
2725 !!!next-input-character;
2726
2727 $self->{current_token}->{quirks} = 1;
2728 !!!emit ($self->{current_token}); # DOCTYPE
2729
2730 redo A;
2731 } elsif ($self->{next_char} == -1) {
2732 !!!cp (205);
2733 !!!parse-error (type => 'unclosed DOCTYPE');
2734
2735 $self->{state} = DATA_STATE;
2736 ## reconsume
2737
2738 $self->{current_token}->{quirks} = 1;
2739 !!!emit ($self->{current_token}); # DOCTYPE
2740
2741 redo A;
2742 } else {
2743 !!!cp (206);
2744 !!!parse-error (type => 'string after SYSTEM');
2745 $self->{current_token}->{quirks} = 1;
2746
2747 $self->{state} = BOGUS_DOCTYPE_STATE;
2748 !!!next-input-character;
2749 redo A;
2750 }
2751 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2752 if ($self->{next_char} == 0x0022) { # "
2753 !!!cp (207);
2754 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2755 !!!next-input-character;
2756 redo A;
2757 } elsif ($self->{next_char} == 0x003E) { # >
2758 !!!cp (208);
2759 !!!parse-error (type => 'unclosed SYSTEM literal');
2760
2761 $self->{state} = DATA_STATE;
2762 !!!next-input-character;
2763
2764 $self->{current_token}->{quirks} = 1;
2765 !!!emit ($self->{current_token}); # DOCTYPE
2766
2767 redo A;
2768 } elsif ($self->{next_char} == -1) {
2769 !!!cp (209);
2770 !!!parse-error (type => 'unclosed SYSTEM literal');
2771
2772 $self->{state} = DATA_STATE;
2773 ## reconsume
2774
2775 $self->{current_token}->{quirks} = 1;
2776 !!!emit ($self->{current_token}); # DOCTYPE
2777
2778 redo A;
2779 } else {
2780 !!!cp (210);
2781 $self->{current_token}->{system_identifier} # DOCTYPE
2782 .= chr $self->{next_char};
2783 ## Stay in the state
2784 !!!next-input-character;
2785 redo A;
2786 }
2787 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2788 if ($self->{next_char} == 0x0027) { # '
2789 !!!cp (211);
2790 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2791 !!!next-input-character;
2792 redo A;
2793 } elsif ($self->{next_char} == 0x003E) { # >
2794 !!!cp (212);
2795 !!!parse-error (type => 'unclosed SYSTEM literal');
2796
2797 $self->{state} = DATA_STATE;
2798 !!!next-input-character;
2799
2800 $self->{current_token}->{quirks} = 1;
2801 !!!emit ($self->{current_token}); # DOCTYPE
2802
2803 redo A;
2804 } elsif ($self->{next_char} == -1) {
2805 !!!cp (213);
2806 !!!parse-error (type => 'unclosed SYSTEM literal');
2807
2808 $self->{state} = DATA_STATE;
2809 ## reconsume
2810
2811 $self->{current_token}->{quirks} = 1;
2812 !!!emit ($self->{current_token}); # DOCTYPE
2813
2814 redo A;
2815 } else {
2816 !!!cp (214);
2817 $self->{current_token}->{system_identifier} # DOCTYPE
2818 .= chr $self->{next_char};
2819 ## Stay in the state
2820 !!!next-input-character;
2821 redo A;
2822 }
2823 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2824 if ({
2825 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2826 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2827 }->{$self->{next_char}}) {
2828 !!!cp (215);
2829 ## Stay in the state
2830 !!!next-input-character;
2831 redo A;
2832 } elsif ($self->{next_char} == 0x003E) { # >
2833 !!!cp (216);
2834 $self->{state} = DATA_STATE;
2835 !!!next-input-character;
2836
2837 !!!emit ($self->{current_token}); # DOCTYPE
2838
2839 redo A;
2840 } elsif ($self->{next_char} == -1) {
2841 !!!cp (217);
2842 !!!parse-error (type => 'unclosed DOCTYPE');
2843 $self->{state} = DATA_STATE;
2844 ## reconsume
2845
2846 $self->{current_token}->{quirks} = 1;
2847 !!!emit ($self->{current_token}); # DOCTYPE
2848
2849 redo A;
2850 } else {
2851 !!!cp (218);
2852 !!!parse-error (type => 'string after SYSTEM literal');
2853 #$self->{current_token}->{quirks} = 1;
2854
2855 $self->{state} = BOGUS_DOCTYPE_STATE;
2856 !!!next-input-character;
2857 redo A;
2858 }
2859 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2860 if ($self->{next_char} == 0x003E) { # >
2861 !!!cp (219);
2862 $self->{state} = DATA_STATE;
2863 !!!next-input-character;
2864
2865 !!!emit ($self->{current_token}); # DOCTYPE
2866
2867 redo A;
2868 } elsif ($self->{next_char} == -1) {
2869 !!!cp (220);
2870 !!!parse-error (type => 'unclosed DOCTYPE');
2871 $self->{state} = DATA_STATE;
2872 ## reconsume
2873
2874 !!!emit ($self->{current_token}); # DOCTYPE
2875
2876 redo A;
2877 } else {
2878 !!!cp (221);
2879 ## Stay in the state
2880 !!!next-input-character;
2881 redo A;
2882 }
2883 } elsif ($self->{state} == CDATA_SECTION_STATE) {
2884 ## NOTE: "CDATA section state" in the spec is jointly implemented
2885 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2886 ## and |CDATA_SECTION_MSE2_STATE|.
## This state accumulates CDATA section content into the data of
## the current character token.  A "]" may begin the "]]>" end
## marker and is handed to |CDATA_SECTION_MSE1_STATE|.  At EOF the
## collected text, if any, is emitted and the EOF itself is left
## for |DATA_STATE| to handle.
2887
2888 if ($self->{next_char} == 0x005D) { # ]
2889 !!!cp (221.1);
2890 $self->{state} = CDATA_SECTION_MSE1_STATE;
2891 !!!next-input-character;
2892 redo A;
2893 } elsif ($self->{next_char} == -1) {
2894 $self->{state} = DATA_STATE;
2895 ## Reconsume.  (EOF is not consumed, as in every other EOF branch.)
2896 if (length $self->{current_token}->{data}) { # character
2897 !!!cp (221.2);
2898 !!!emit ($self->{current_token}); # character
2899 } else {
2900 !!!cp (221.3);
2901 ## No token to emit. $self->{current_token} is discarded.
2902 }
2903 redo A;
2904 } else {
2905 !!!cp (221.4);
2906 $self->{current_token}->{data} .= chr $self->{next_char};
2907 ## Stay in the state.
2908 !!!next-input-character;
2909 redo A;
2910 }
2911
2912 ## ISSUE: "text tokens" in spec.
2913 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
## Reached from CDATA_SECTION_STATE after one "]" has been consumed
## but not yet added to the token data.  A second "]" moves on to
## CDATA_SECTION_MSE2_STATE; otherwise the pending "]" was ordinary
## text, so it is appended to the data and the current character is
## reconsumed in CDATA_SECTION_STATE.
2914 if ($self->{next_char} == 0x005D) { # ]
2915 !!!cp (221.5);
2916 $self->{state} = CDATA_SECTION_MSE2_STATE;
2917 !!!next-input-character;
2918 redo A;
2919 } else {
2920 !!!cp (221.6);
2921 $self->{current_token}->{data} .= ']';
2922 $self->{state} = CDATA_SECTION_STATE;
2923 ## Reconsume.
2924 redo A;
2925 }
2926 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
## Reached after "]]" has been consumed, with neither bracket yet
## added to the token data:
##   - ">"           : the section is closed; emit the collected
##                     character token (unless it is empty) and
##                     return to DATA_STATE;
##   - "]"           : only the last two brackets can belong to the
##                     end marker, so the oldest one is appended to
##                     the data and two stay pending;
##   - anything else : both pending brackets were ordinary text;
##                     append "]]" and reconsume the current
##                     character in CDATA_SECTION_STATE (this is
##                     also the path taken at EOF).
2927 if ($self->{next_char} == 0x003E) { # >
2928 $self->{state} = DATA_STATE;
2929 !!!next-input-character;
2930 if (length $self->{current_token}->{data}) { # character
2931 !!!cp (221.7);
2932 !!!emit ($self->{current_token}); # character
2933 } else {
2934 !!!cp (221.8);
2935 ## No token to emit. $self->{current_token} is discarded.
2936 }
2937 redo A;
2938 } elsif ($self->{next_char} == 0x005D) { # ]
2939 !!!cp (221.9); # character
2940 $self->{current_token}->{data} .= ']'; ## Add first "]" of "]]]".
2941 ## Stay in the state.
2942 !!!next-input-character;
2943 redo A;
2944 } else {
2945 !!!cp (221.11);
2946 $self->{current_token}->{data} .= ']]'; # character
2947 $self->{state} = CDATA_SECTION_STATE;
2948 ## Reconsume.
2949 redo A;
2950 }
2951 } elsif ($self->{state} == ENTITY_STATE) {
2952 if ({
2953 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
2954 0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, &
2955 $self->{entity_additional} => 1,
2956 }->{$self->{next_char}}) {
2957 !!!cp (1001);
2958 ## Don't consume
2959 ## No error
2960 ## Return nothing.
2961 #
2962 } elsif ($self->{next_char} == 0x0023) { # #
2963 $self->{state} = ENTITY_HASH_STATE;
2964 $self->{state_keyword} = '#';
2965 !!!next-input-character;
2966 redo A;
2967 } elsif ((0x0041 <= $self->{next_char} and
2968 $self->{next_char} <= 0x005A) or # A..Z
2969 (0x0061 <= $self->{next_char} and
2970 $self->{next_char} <= 0x007A)) { # a..z
2971 require Whatpm::_NamedEntityList;
2972 $self->{state} = ENTITY_NAME_STATE;
2973 $self->{state_keyword} = chr $self->{next_char};
2974 $self->{entity__value} = $self->{state_keyword};
2975 $self->{entity__match} = 0;
2976 !!!next-input-character;
2977 redo A;
2978 } else {
2979 !!!cp (1027);
2980 !!!parse-error (type => 'bare ero');
2981 ## Return nothing.
2982 #
2983 }
2984
2985 ## NOTE: No character is consumed by the "consume a character
2986 ## reference" algorithm. In other words, there is an "&" character
2987 ## that does not introduce a character reference, which would be
2988 ## appended to the parent element or the attribute value in later
2989 ## process of the tokenizer.
2990
2991 if ($self->{entity_in_attr}) {
2992 $self->{current_attribute}->{value} .= '&';
2993 $self->{state} = $self->{last_attribute_value_state};
2994 ## Reconsume.
2995 redo A;
2996 } else {
2997 $self->{state} = DATA_STATE;
2998 ## Reconsume.
2999 !!!emit ({type => CHARACTER_TOKEN, data => '&',
3000 line => $self->{line_prev},
3001 column => $self->{column_prev},
3002 });
3003 redo A;
3004 }
3005 } elsif ($self->{state} == ENTITY_HASH_STATE) {
3006 if ($self->{next_char} == 0x0078 or # x
3007 $self->{next_char} == 0x0058) { # X
3008 $self->{state} = HEXREF_X_STATE;
3009 $self->{state_keyword} .= chr $self->{next_char};
3010 !!!next-input-character;
3011 redo A;
3012 } elsif (0x0030 <= $self->{next_char} and
3013 $self->{next_char} <= 0x0039) { # 0..9
3014 $self->{state} = NCR_NUM_STATE;
3015 $self->{state_keyword} = $self->{next_char} - 0x0030;
3016 !!!next-input-character;
3017 redo A;
3018 } else {
3019 !!!cp (1019);
3020 !!!parse-error (type => 'bare nero',
3021 line => $self->{line_prev},
3022 column => $self->{column_prev} - 1);
3023
3024 ## NOTE: According to the spec algorithm, nothing is returned,
3025 ## and then "&#" is appended to the parent element or the attribute
3026 ## value in the later processing.
3027
3028 if ($self->{entity_in_attr}) {
3029 $self->{current_attribute}->{value} .= '&#';
3030 $self->{state} = $self->{last_attribute_value_state};
3031 ## Reconsume.
3032 redo A;
3033 } else {
3034 $self->{state} = DATA_STATE;
3035 ## Reconsume.
3036 !!!emit ({type => CHARACTER_TOKEN,
3037 data => '&#',
3038 line => $self->{line_prev},
3039 column => $self->{column_prev} - 1,
3040 });
3041 redo A;
3042 }
3043 }
3044 } elsif ($self->{state} == NCR_NUM_STATE) {
3045 if (0x0030 <= $self->{next_char} and
3046 $self->{next_char} <= 0x0039) { # 0..9
3047 !!!cp (1012);
3048 $self->{state_keyword} *= 10;
3049 $self->{state_keyword} += $self->{next_char} - 0x0030;
3050
3051 ## Stay in the state.
3052 !!!next-input-character;
3053 redo A;
3054 } elsif ($self->{next_char} == 0x003B) { # ;
3055 !!!cp (1013);
3056 !!!next-input-character;
3057 #
3058 } else {
3059 !!!cp (1014);
3060 !!!parse-error (type => 'no refc');
3061 ## Reconsume.
3062 #
3063 }
3064
3065 my $code = $self->{state_keyword};
3066 my $l = $self->{line_prev};
3067 my $c = $self->{column_prev};
3068 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
3069 !!!cp (1015);
3070 !!!parse-error (type => 'invalid character reference',
3071 text => (sprintf 'U+%04X', $code),
3072 line => $l, column => $c);
3073 $code = 0xFFFD;
3074 } elsif ($code > 0x10FFFF) {
3075 !!!cp (1016);
3076 !!!parse-error (type => 'invalid character reference',
3077 text => (sprintf 'U-%08X', $code),
3078 line => $l, column => $c);
3079 $code = 0xFFFD;
3080 } elsif ($code == 0x000D) {
3081 !!!cp (1017);
3082 !!!parse-error (type => 'CR character reference',
3083 line => $l, column => $c);
3084 $code = 0x000A;
3085 } elsif (0x80 <= $code and $code <= 0x9F) {
3086 !!!cp (1018);
3087 !!!parse-error (type => 'C1 character reference',
3088 text => (sprintf 'U+%04X', $code),
3089 line => $l, column => $c);
3090 $code = $c1_entity_char->{$code};
3091 }
3092
3093 if ($self->{entity_in_attr}) {
3094 $self->{current_attribute}->{value} .= chr $code;
3095 $self->{current_attribute}->{has_reference} = 1;
3096 $self->{state} = $self->{last_attribute_value_state};
3097 ## Reconsume.
3098 redo A;
3099 } else {
3100 $self->{state} = DATA_STATE;
3101 ## Reconsume.
3102 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3103 has_reference => 1,
3104 line => $l, column => $c,
3105 });
3106 redo A;
3107 }
3108 } elsif ($self->{state} == HEXREF_X_STATE) {
3109 if ((0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) or
3110 (0x0041 <= $self->{next_char} and $self->{next_char} <= 0x0046) or
3111 (0x0061 <= $self->{next_char} and $self->{next_char} <= 0x0066)) {
3112 # 0..9, A..F, a..f
3113 $self->{state} = HEXREF_HEX_STATE;
3114 $self->{state_keyword} = 0;
3115 ## Reconsume.
3116 redo A;
3117 } else {
3118 !!!cp (1005);
3119 !!!parse-error (type => 'bare hcro',
3120 line => $self->{line_prev},
3121 column => $self->{column_prev} - 2);
3122
3123 ## NOTE: According to the spec algorithm, nothing is returned,
3124 ## and then "&#" followed by "X" or "x" is appended to the parent
3125 ## element or the attribute value in the later processing.
3126
3127 if ($self->{entity_in_attr}) {
3128 $self->{current_attribute}->{value} .= '&' . $self->{state_keyword};
3129 $self->{state} = $self->{last_attribute_value_state};
3130 ## Reconsume.
3131 redo A;
3132 } else {
3133 $self->{state} = DATA_STATE;
3134 ## Reconsume.
3135 !!!emit ({type => CHARACTER_TOKEN,
3136 data => '&' . $self->{state_keyword},
3137 line => $self->{line_prev},
3138 column => $self->{column_prev} - length $self->{state_keyword},
3139 });
3140 redo A;
3141 }
3142 }
3143 } elsif ($self->{state} == HEXREF_HEX_STATE) {
3144 if (0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) {
3145 # 0..9
3146 !!!cp (1002);
3147 $self->{state_keyword} *= 0x10;
3148 $self->{state_keyword} += $self->{next_char} - 0x0030;
3149 ## Stay in the state.
3150 !!!next-input-character;
3151 redo A;
3152 } elsif (0x0061 <= $self->{next_char} and
3153 $self->{next_char} <= 0x0066) { # a..f
3154 !!!cp (1003);
3155 $self->{state_keyword} *= 0x10;
3156 $self->{state_keyword} += $self->{next_char} - 0x0060 + 9;
3157 ## Stay in the state.
3158 !!!next-input-character;
3159 redo A;
3160 } elsif (0x0041 <= $self->{next_char} and
3161 $self->{next_char} <= 0x0046) { # A..F
3162 !!!cp (1004);
3163 $self->{state_keyword} *= 0x10;
3164 $self->{state_keyword} += $self->{next_char} - 0x0040 + 9;
3165 ## Stay in the state.
3166 !!!next-input-character;
3167 redo A;
3168 } elsif ($self->{next_char} == 0x003B) { # ;
3169 !!!cp (1006);
3170 !!!next-input-character;
3171 #
3172 } else {
3173 !!!cp (1007);
3174 !!!parse-error (type => 'no refc',
3175 line => $self->{line},
3176 column => $self->{column});
3177 ## Reconsume.
3178 #
3179 }
3180
3181 my $code = $self->{state_keyword};
3182 my $l = $self->{line_prev};
3183 my $c = $self->{column_prev};
3184 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
3185 !!!cp (1008);
3186 !!!parse-error (type => 'invalid character reference',
3187 text => (sprintf 'U+%04X', $code),
3188 line => $l, column => $c);
3189 $code = 0xFFFD;
3190 } elsif ($code > 0x10FFFF) {
3191 !!!cp (1009);
3192 !!!parse-error (type => 'invalid character reference',
3193 text => (sprintf 'U-%08X', $code),
3194 line => $l, column => $c);
3195 $code = 0xFFFD;
3196 } elsif ($code == 0x000D) {
3197 !!!cp (1010);
3198 !!!parse-error (type => 'CR character reference', line => $l, column => $c);
3199 $code = 0x000A;
3200 } elsif (0x80 <= $code and $code <= 0x9F) {
3201 !!!cp (1011);
3202 !!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c);
3203 $code = $c1_entity_char->{$code};
3204 }
3205
3206 if ($self->{entity_in_attr}) {
3207 $self->{current_attribute}->{value} .= chr $code;
3208 $self->{current_attribute}->{has_reference} = 1;
3209 $self->{state} = $self->{last_attribute_value_state};
3210 ## Reconsume.
3211 redo A;
3212 } else {
3213 $self->{state} = DATA_STATE;
3214 ## Reconsume.
3215 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3216 has_reference => 1,
3217 line => $l, column => $c,
3218 });
3219 redo A;
3220 }
3221 } elsif ($self->{state} == ENTITY_NAME_STATE) {
3222 if (length $self->{state_keyword} < 30 and
3223 ## NOTE: Some number greater than the maximum length of entity name
3224 ((0x0041 <= $self->{next_char} and # a
3225 $self->{next_char} <= 0x005A) or # x
3226 (0x0061 <= $self->{next_char} and # a
3227 $self->{next_char} <= 0x007A) or # z
3228 (0x0030 <= $self->{next_char} and # 0
3229 $self->{next_char} <= 0x0039) or # 9
3230 $self->{next_char} == 0x003B)) { # ;
3231 our $EntityChar;
3232 $self->{state_keyword} .= chr $self->{next_char};
3233 if (defined $EntityChar->{$self->{state_keyword}}) {
3234 if ($self->{next_char} == 0x003B) { # ;
3235 !!!cp (1020);
3236 $self->{entity__value} = $EntityChar->{$self->{state_keyword}};
3237 $self->{entity__match} = 1;
3238 !!!next-input-character;
3239 #
3240 } else {
3241 !!!cp (1021);
3242 $self->{entity__value} = $EntityChar->{$self->{state_keyword}};
3243 $self->{entity__match} = -1;
3244 ## Stay in the state.
3245 !!!next-input-character;
3246 redo A;
3247 }
3248 } else {
3249 !!!cp (1022);
3250 $self->{entity__value} .= chr $self->{next_char};
3251 $self->{entity__match} *= 2;
3252 ## Stay in the state.
3253 !!!next-input-character;
3254 redo A;
3255 }
3256 }
3257
3258 my $data;
3259 my $has_ref;
3260 if ($self->{entity__match} > 0) {
3261 !!!cp (1023);
3262 $data = $self->{entity__value};
3263 $has_ref = 1;
3264 #
3265 } elsif ($self->{entity__match} < 0) {
3266 !!!parse-error (type => 'no refc');
3267 if ($self->{entity_in_attr} and $self->{entity__match} < -1) {
3268 !!!cp (1024);
3269 $data = '&' . $self->{state_keyword};
3270 #
3271 } else {
3272 !!!cp (1025);
3273 $data = $self->{entity__value};
3274 $has_ref = 1;
3275 #
3276 }
3277 } else {
3278 !!!cp (1026);
3279 !!!parse-error (type => 'bare ero',
3280 line => $self->{line_prev},
3281 column => $self->{column_prev});
3282 $data = '&' . $self->{state_keyword};
3283 #
3284 }
3285
3286 ## NOTE: In these cases, when a character reference is found,
3287 ## it is consumed and a character token is returned, or, otherwise,
3288 ## nothing is consumed and returned, according to the spec algorithm.
3289 ## In this implementation, anything that has been examined by the
3290 ## tokenizer is appended to the parent element or the attribute value
3291 ## as string, either literal string when no character reference or
3292 ## entity-replaced string otherwise, in this stage, since any characters
3293 ## that would not be consumed are appended in the data state or in an
3294 ## appropriate attribute value state anyway.
3295
3296 if ($self->{entity_in_attr}) {
3297 $self->{current_attribute}->{value} .= $data;
3298 $self->{current_attribute}->{has_reference} = 1 if $has_ref;
3299 $self->{state} = $self->{last_attribute_value_state};
3300 ## Reconsume.
3301 redo A;
3302 } else {
3303 $self->{state} = DATA_STATE;
3304 ## Reconsume.
3305 !!!emit ({type => CHARACTER_TOKEN,
3306 data => $data, has_reference => $has_ref,
3307 line => $self->{line_prev},
3308 column => $self->{column_prev} + 1 - length $self->{state_keyword},
3309 });
3310 redo A;
3311 }
3312 } else {
3313 die "$0: $self->{state}: Unknown state";
3314 }
3315 } # A
3316
3317 die "$0: _get_next_token: unexpected case";
3318 } # _get_next_token
3319
## Prepare |$self->{document}| for the tree construction stage.
##
## The document object MUST already be stored in |$self->{document}|
## when this method is invoked.  The value of the final
## |set_user_data| call is passed through to the caller, exactly as
## the implicit return of the last statement would be.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  ## Relax DOM error checking while the parser builds the tree.
  $doc->strict_error_checking (0);

  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on

  ## Flag the document as an HTML document. # MUST
  $doc->manakai_is_html (1);

  ## Seed the source position user data with line 1, column 1.
  $doc->set_user_data (manakai_source_line => 1);
  return $doc->set_user_data (manakai_source_column => 1);
} # _initialize_tree_constructor
3330
## Undo the set-up done for the tree construction stage by turning
## strict DOM error checking on |$self->{document}| back on.  The
## value of that call is passed through to the caller.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  ## TODO: Turn mutation events on
  return $doc->strict_error_checking (1);
} # _terminate_tree_constructor
3336
3337 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
3338
3339 { # tree construction stage
3340 my $token;
3341
## Entry point of the tree construction stage: fetches the first
## token, resets the per-parse state kept on |$self|, and then runs
## the insertion mode handlers in order.  The value of
## |_tree_construction_main| is passed through to the caller.
sub _construct_tree ($) {
  my $self = shift;

  ## It is not defined when an interactive UA renders
  ## |$self->{document}| to the user, or when it begins accepting
  ## user input.

  ## Appending a character: collect it together with all consecutive
  ## characters that follow and insert a single Text node whose data
  ## is the concatenation of all of them. # MUST

  !!!next-token;

  ## Forget the element pointers and start with an empty stack of
  ## open elements.
  $self->{$_} = undef for qw(form_element head_element);
  $self->{open_elements} = [];
  $self->{inner_html_node} = undef;

  ## The "initial" insertion mode. # MUST
  $self->_tree_construction_initial;

  ## The "before html" insertion mode.
  $self->_tree_construction_root_element;

  ## The "before head" insertion mode and everything after it.
  $self->{insertion_mode} = BEFORE_HEAD_IM;
  return $self->_tree_construction_main;
} # _construct_tree
3370
## Implements the "initial" insertion mode of the tree construction
## stage.
##
## Works on the file-lexical |$token| (the current token).  Leading
## white space character tokens are dropped and comments are appended
## to |$self->{document}| until one of the following happens:
##
##   - A DOCTYPE token is seen: a document type definition node is
##     created and appended, the document's compat mode is derived
##     from the DOCTYPE name, the |quirks| flag of the token, and the
##     public and system identifiers, and the next token is fetched.
##   - Any other token is seen: a "no DOCTYPE" parse error is
##     reported, the compat mode is set to "quirks", and the token is
##     left in |$token| so that the caller's next insertion mode
##     handler reprocesses it.
##
## Returns nothing (bare |return|).  Dies on an unknown token type.
sub _tree_construction_initial ($) {
  my $self = shift;

  ## NOTE: "initial" insertion mode

  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/; # ASCII case-insensitive
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{system_identifier}) {
        !!!cp ('t1');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif ($doctype_name ne 'HTML') {
        !!!cp ('t2');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif (defined $token->{public_identifier}) {
        if ($token->{public_identifier} eq 'XSLT-compat') {
          !!!cp ('t1.2');
          !!!parse-error (type => 'XSLT-compat', token => $token,
                          level => $self->{level}->{should});
        } else {
          !!!parse-error (type => 'not HTML5', token => $token);
        }
      } else {
        !!!cp ('t3');
        #
      }

      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      ## NOTE: Default value for both |public_id| and |system_id| attributes
      ## are empty strings, so that we don't set any value in missing cases.
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if ($token->{quirks} or $doctype_name ne 'HTML') {
        !!!cp ('t4');
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## The public identifier is compared ASCII case-insensitively,
        ## so it is upper-cased once and every literal below is
        ## written in upper case.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;

        ## Public identifier prefixes that trigger the quirks mode.
        my $prefix = [
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//",
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//IETF//DTD HTML 2.0 LEVEL 1//",
          "-//IETF//DTD HTML 2.0 LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT//",
          "-//IETF//DTD HTML 2.0//",
          "-//IETF//DTD HTML 2.1E//",
          "-//IETF//DTD HTML 3.0//",
          "-//IETF//DTD HTML 3.2 FINAL//",
          "-//IETF//DTD HTML 3.2//",
          "-//IETF//DTD HTML 3//",
          "-//IETF//DTD HTML LEVEL 0//",
          "-//IETF//DTD HTML LEVEL 1//",
          "-//IETF//DTD HTML LEVEL 2//",
          "-//IETF//DTD HTML LEVEL 3//",
          "-//IETF//DTD HTML STRICT LEVEL 0//",
          "-//IETF//DTD HTML STRICT LEVEL 1//",
          "-//IETF//DTD HTML STRICT LEVEL 2//",
          "-//IETF//DTD HTML STRICT LEVEL 3//",
          "-//IETF//DTD HTML STRICT//",
          "-//IETF//DTD HTML//",
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//",
          "-//NETSCAPE COMM. CORP.//DTD HTML//",
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//",
          "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//",
          "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//",
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//",
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//",
          "-//W3C//DTD HTML 3 1995-03-24//",
          "-//W3C//DTD HTML 3.2 DRAFT//",
          "-//W3C//DTD HTML 3.2 FINAL//",
          "-//W3C//DTD HTML 3.2//",
          "-//W3C//DTD HTML 3.2S DRAFT//",
          "-//W3C//DTD HTML 4.0 FRAMESET//",
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//",
          "-//W3C//DTD HTML EXPERIMENTAL 19960712//",
          "-//W3C//DTD HTML EXPERIMENTAL 970421//",
          "-//W3C//DTD W3 HTML//",
          "-//W3O//DTD W3 HTML 3.0//",
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//",
          "-//WEBTECHS//DTD MOZILLA HTML//",
        ]; # $prefix

        ## Does the upper-cased public identifier start with any of the
        ## quirks prefixes?  (The comparison has to be made against
        ## |$pubid|, not against the |$prefix| array reference.)
        my $match;
        for my $quirks_prefix (@$prefix) {
          if (substr ($pubid, 0, length $quirks_prefix) eq $quirks_prefix) {
            $match = 1;
            last;
          }
        }

        if ($match or
            $pubid eq "-//W3O//DTD W3 HTML STRICT 3.0//EN//" or
            $pubid eq "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" or
            $pubid eq "HTML") {
          !!!cp ('t5');
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid =~ m[^-//W3C//DTD HTML 4.01 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD HTML 4.01 TRANSITIONAL//]) {
          if (defined $token->{system_identifier}) {
            !!!cp ('t6');
            $self->{document}->manakai_compat_mode ('quirks');
          } else {
            !!!cp ('t7');
            $self->{document}->manakai_compat_mode ('limited quirks');
          }
        } elsif ($pubid =~ m[^-//W3C//DTD XHTML 1.0 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD XHTML 1.0 TRANSITIONAL//]) {
          !!!cp ('t8');
          $self->{document}->manakai_compat_mode ('limited quirks');
        } else {
          !!!cp ('t9');
        }
      } else {
        !!!cp ('t10');
      }

      ## The system identifier is checked after the public identifier
      ## so that it can override a "limited quirks" decision.
      if (defined $token->{system_identifier}) {
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          ## NOTE: Ensure that |PUBLIC "(limited quirks)" "(quirks)"| is
          ## marked as quirks.
          $self->{document}->manakai_compat_mode ('quirks');
          !!!cp ('t11');
        } else {
          !!!cp ('t12');
        }
      } else {
        !!!cp ('t13');
      }

      ## Go to the "before html" insertion mode.
      !!!next-token;
      return;
    } elsif ({
              START_TAG_TOKEN, 1,
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t14');
      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      !!!ack-later;
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          !!!cp ('t15');
          ## Stay in the insertion mode.
          !!!next-token;
          redo INITIAL;
        } else {
          !!!cp ('t16');
        }
      } else {
        !!!cp ('t17');
      }

      ## Non-white-space character data is left in |$token|.
      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t18');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the insertion mode.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL

  die "$0: _tree_construction_initial: This should be never reached";
} # _tree_construction_initial
3577
## Implements the "before html" insertion mode.
##
## Works on the file-lexical |$token|.  DOCTYPE tokens (reported as
## parse errors), comments, and leading white space are consumed until
## a token is found that causes the root |html| element to be created.
## The element is appended to |$self->{document}| and pushed onto
## |$self->{open_elements}|, and the |application_cache_selection|
## callback is invoked.  Returns nothing (bare |return|); the caller
## then proceeds with the "before head" insertion mode.  Dies on an
## unknown token type.
sub _tree_construction_root_element ($) {
  my ($self) = @_;

  B: {
    my $type = $token->{type};

    if ($type == DOCTYPE_TOKEN) {
      !!!cp ('t19');
      !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
      ## The token is ignored; the insertion mode does not change.
      !!!next-token;
      redo B;
    }

    if ($type == COMMENT_TOKEN) {
      !!!cp ('t20');
      my $comment_node = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment_node);
      ## The insertion mode does not change.
      !!!next-token;
      redo B;
    }

    if ($type == CHARACTER_TOKEN) {
      ## Leading white space is dropped from the token.
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        if (length $token->{data}) {
          !!!cp ('t22');
        } else {
          !!!cp ('t21');
          ## Nothing but white space: stay in the insertion mode.
          !!!next-token;
          redo B;
        }
      } else {
        !!!cp ('t23');
      }

      ## NOTE(review): For a character token the callback is invoked
      ## here and once more after the root element has been created
      ## below -- confirm that the double invocation is intended.
      $self->{application_cache_selection}->(undef);
    } elsif ($type == START_TAG_TOKEN) {
      if ($token->{tag_name} ne 'html') {
        !!!cp ('t25.1');
      } else {
        ## An explicit |<html>| start tag: the root element takes the
        ## attributes of the token.
        my $html_element;
        !!!create-element ($html_element, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
        $self->{document}->append_child ($html_element);
        push @{$self->{open_elements}},
            [$html_element, $el_category->{html}];

        my $manifest = $token->{attributes}->{manifest};
        if ($manifest) {
          !!!cp ('t24');
          $self->{application_cache_selection}->($manifest->{value});
          ## ISSUE: Spec is unclear on relative references.
          ## According to Hixie (#whatwg 2008-03-19), it should be
          ## resolved against the base URI of the document in HTML
          ## or xml:base of the element in XHTML.
        } else {
          !!!cp ('t25');
          $self->{application_cache_selection}->(undef);
        }

        !!!nack ('t25c');

        !!!next-token;
        return; ## Go to the "before head" insertion mode.
      }
    } elsif ($type == END_TAG_TOKEN or $type == END_OF_FILE_TOKEN) {
      !!!cp ('t26');
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## Anything else: create the root element implicitly, without
    ## attributes.
    my $html_element;
    !!!create-element ($html_element, $HTML_NS, 'html',, $token);
    $self->{document}->append_child ($html_element);
    push @{$self->{open_elements}}, [$html_element, $el_category->{html}];

    $self->{application_cache_selection}->(undef);

    ## NOTE: Reprocess the token.
    !!!ack-later;
    return; ## Go to the "before head" insertion mode.

    ## ISSUE: There is an issue in the spec
  } # B

  die "$0: _tree_construction_root_element: This should never be reached";
} # _tree_construction_root_element
3672
## Implements the "reset the insertion mode appropriately" algorithm.
##
## Walks |$self->{open_elements}| from the last entry towards the
## first one and sets |$self->{insertion_mode}| according to the first
## entry that determines a mode.  Each entry is an array reference
## whose first item is the element node and whose second item is its
## element category bit set.  When the first entry of the stack is
## reached, |$self->{inner_html_node}| is examined in its place; the
## method dies if it is not defined in that case.
##
## Returns nothing (bare |return|).
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        !!!cp ('t28');
        $node = $self->{inner_html_node};
      } else {
        die "_reset_insertion_mode: t27";
      }
    }

    ## Step 4..14
    my $new_mode;
    if ($node->[1] & FOREIGN_EL) {
      !!!cp ('t28.1');
      ## NOTE: Strictly speaking, the line below only applies to MathML and
      ## SVG elements.  Currently the HTML syntax supports only MathML and
      ## SVG elements as foreigners.
      $new_mode = IN_BODY_IM | IN_FOREIGN_CONTENT_IM;
    } elsif ($node->[1] & TABLE_CELL_EL) {
      if ($last) {
        !!!cp ('t28.2');
        #
      } else {
        !!!cp ('t28.3');
        $new_mode = IN_CELL_IM;
      }
    } else {
      !!!cp ('t28.4');
      $new_mode = {
        select => IN_SELECT_IM,
        ## NOTE: |option| and |optgroup| do not set
        ## insertion mode to "in select" by themselves.
        tr => IN_ROW_IM,
        tbody => IN_TABLE_BODY_IM,
        thead => IN_TABLE_BODY_IM,
        tfoot => IN_TABLE_BODY_IM,
        caption => IN_CAPTION_IM,
        colgroup => IN_COLUMN_GROUP_IM,
        table => IN_TABLE_IM,
        head => IN_BODY_IM, # not in head!
        body => IN_BODY_IM,
        frameset => IN_FRAMESET_IM,
      }->{$node->[0]->manakai_local_name};
    }

    ## The assignment and the return are separate statements on
    ## purpose: in |$x = $mode and return| the return would be skipped
    ## whenever the assigned mode value happened to be false.
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 15
    if ($node->[1] & HTML_EL) {
      unless (defined $self->{head_element}) {
        !!!cp ('t29');
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        ## ISSUE: Can this state be reached?
        !!!cp ('t30');
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    } else {
      !!!cp ('t31');
    }

    ## Step 16
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 17
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 18
    redo S3;
  } # S3

  die "$0: _reset_insertion_mode: This line should never be reached";
} # _reset_insertion_mode
3759
3760 sub _tree_construction_main ($) {
3761 my $self = shift;
3762
3763 my $active_formatting_elements = [];
3764
3765 my $reconstruct_active_formatting_elements = sub { # MUST
3766 my $insert = shift;
3767
3768 ## Step 1
3769 return unless @$active_formatting_elements;
3770
3771 ## Step 3
3772 my $i = -1;
3773 my $entry = $active_formatting_elements->[$i];
3774
3775 ## Step 2
3776 return if $entry->[0] eq '#marker';
3777 for (@{$self->{open_elements}}) {
3778 if ($entry->[0] eq $_->[0]) {
3779 !!!cp ('t32');
3780 return;
3781 }
3782 }
3783
3784 S4: {
3785 ## Step 4
3786 last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
3787
3788 ## Step 5
3789 $i--;
3790 $entry = $active_formatting_elements->[$i];
3791
3792 ## Step 6
3793 if ($entry->[0] eq '#marker') {
3794 !!!cp ('t33_1');
3795 #
3796 } else {
3797 my $in_open_elements;
3798 OE: for (@{$self->{open_elements}}) {
3799 if ($entry->[0] eq $_->[0]) {
3800 !!!cp ('t33');
3801 $in_open_elements = 1;
3802 last OE;
3803 }
3804 }
3805 if ($in_open_elements) {
3806 !!!cp ('t34');
3807 #
3808 } else {
3809 ## NOTE: <!DOCTYPE HTML><p><b><i><u></p> <p>X
3810 !!!cp ('t35');
3811 redo S4;
3812 }
3813 }
3814
3815 ## Step 7
3816 $i++;
3817 $entry = $active_formatting_elements->[$i];
3818 } # S4
3819
3820 S7: {
3821 ## Step 8
3822 my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
3823
3824 ## Step 9
3825 $insert->($clone->[0]);
3826 push @{$self->{open_elements}}, $clone;
3827
3828 ## Step 10
3829 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
3830
3831 ## Step 11
3832 unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
3833 !!!cp ('t36');
3834 ## Step 7'
3835 $i++;
3836 $entry = $active_formatting_elements->[$i];
3837
3838 redo S7;
3839 }
3840
3841 !!!cp ('t37');
3842 } # S7
3843 }; # $reconstruct_active_formatting_elements
3844
3845 my $clear_up_to_marker = sub {
3846 for (reverse 0..$#$active_formatting_elements) {
3847 if ($active_formatting_elements->[$_]->[0] eq '#marker') {
3848 !!!cp ('t38');
3849 splice @$active_formatting_elements, $_;
3850 return;
3851 }
3852 }
3853
3854 !!!cp ('t39');
3855 }; # $clear_up_to_marker
3856
3857 my $insert;
3858
3859 my $parse_rcdata = sub ($) {
3860 my ($content_model_flag) = @_;
3861
3862 ## Step 1
3863 my $start_tag_name = $token->{tag_name};
3864 my $el;
3865 !!!create-element ($el, $HTML_NS, $start_tag_name, $token->{attributes}, $token);
3866
3867 ## Step 2
3868 $insert->($el);
3869
3870 ## Step 3
3871 $self->{content_model} = $content_model_flag; # CDATA or RCDATA
3872 delete $self->{escape}; # MUST
3873
3874 ## Step 4
3875 my $text = '';
3876 !!!nack ('t40.1');
3877 !!!next-token;
3878 while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
3879 !!!cp ('t40');
3880 $text .= $token->{data};
3881 !!!next-token;
3882 }
3883
3884 ## Step 5
3885 if (length $text) {
3886 !!!cp ('t41');
3887 my $text = $self->{document}->create_text_node ($text);
3888 $el->append_child ($text);
3889 }
3890
3891 ## Step 6
3892 $self->{content_model} = PCDATA_CONTENT_MODEL;
3893
3894 ## Step 7
3895 if ($token->{type} == END_TAG_TOKEN and
3896 $token->{tag_name} eq $start_tag_name) {
3897 !!!cp ('t42');
3898 ## Ignore the token
3899 } else {
3900 ## NOTE: An end-of-file token.
3901 if ($content_model_flag == CDATA_CONTENT_MODEL) {
3902 !!!cp ('t43');
3903 !!!parse-error (type => 'in CDATA:#eof', token => $token);
3904 } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
3905 !!!cp ('t44');
3906 !!!parse-error (type => 'in RCDATA:#eof', token => $token);
3907 } else {
3908 die "$0: $content_model_flag in parse_rcdata";
3909 }
3910 }
3911 !!!next-token;
3912 }; # $parse_rcdata
3913
my $script_start_tag = sub () {
  ## Handles a |script| start tag held in |$token|: creates the element,
  ## gathers the character tokens that follow as its text, checks for
  ## the matching end tag, and inserts the element through |$insert|
  ## unless |$self->{inner_html_node}| is defined.
  my $script_el;
  !!!create-element ($script_el, $HTML_NS, 'script', $token->{attributes}, $token);
  ## TODO: mark as "parser-inserted"

  ## Switch to the CDATA content model for the element's content.
  delete $self->{escape}; # MUST
  $self->{content_model} = CDATA_CONTENT_MODEL;

  ## Collect the data of every consecutive character token.
  my @chunks;
  !!!nack ('t45.1');
  !!!next-token;
  until ($token->{type} != CHARACTER_TOKEN) {
    !!!cp ('t45');
    push @chunks, $token->{data};
    !!!next-token;
  } # stop if non-character token or tokenizer stops tokenising
  my $script_text = join '', @chunks;
  if ($script_text ne '') {
    !!!cp ('t46');
    $script_el->manakai_append_text ($script_text);
  }

  ## Back to the PCDATA content model.
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  my $closed_by_end_tag = ($token->{type} == END_TAG_TOKEN and
                           $token->{tag_name} eq 'script');
  if ($closed_by_end_tag) {
    !!!cp ('t47');
    ## Ignore the token
  } else {
    ## Anything other than the matching end tag is reported as an
    ## unexpected end of file inside CDATA.
    !!!cp ('t48');
    !!!parse-error (type => 'in CDATA:#eof', token => $token);
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  }

  if (not defined $self->{inner_html_node}) {
    !!!cp ('t50');
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    $insert->($script_el);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the
    ## parser resumes, then...
  } else {
    !!!cp ('t49');
    ## TODO: mark as "already executed"
  }

  !!!next-token;
}; # $script_start_tag
3965
## NOTE: Each entry of |$open_tables| is an array reference, and the
## last entry describes the "current table":
##   $open_tables->[-1]->[0] - the "current table" element node.
##   $open_tables->[-1]->[1] - the "tainted" flag.
## The list starts with one entry holding the node of the first
## entry of the stack of open elements.
my $open_tables = [[$self->{open_elements}[0][0]]];
3969
## Processes an end tag for a formatting element using the adoption
## agency algorithm (AAA).
##
## Argument: the end tag token (a hash reference with |tag_name|).
##
## Works on the enclosing scope's |$active_formatting_elements|,
## |$self->{open_elements}| and |$open_tables|.  Steps 1-14 are repeated
## (|redo FET|) until one of the early exits in Step 1 or Step 3 is
## taken; every exit path runs |!!!next-token| and then returns nothing.
my $formatting_end_tag = sub {
  my $end_tag_token = shift;
  my $tag_name = $end_tag_token->{tag_name};

  ## NOTE: The adoption agency algorithm (AAA).

  FET: {
    ## Step 1
    ## Find the last entry after the last marker in the list of active
    ## formatting elements whose local name is the tag name.
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq '#marker') {
        !!!cp ('t52');
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0]->manakai_local_name
                   eq $tag_name) {
        !!!cp ('t51');
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!cp ('t53');
      !!!parse-error (type => 'unmatched end tag', text => $tag_name, token => $end_tag_token);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          !!!cp ('t54');
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!cp ('t55');
          ## Report the tag name of the token handed to this closure
          ## (same source as the 't53' error above).
          !!!parse-error (type => 'unmatched end tag',
                          text => $tag_name,
                          token => $end_tag_token);
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ($node->[1] & SCOPING_EL) {
        !!!cp ('t56');
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!cp ('t57');
      !!!parse-error (type => 'unmatched end tag',
                      text => $tag_name,
                      token => $end_tag_token);
      ## Remove $formatting_element itself from the list.  It is not
      ## necessarily the last entry, so |pop| could drop another one.
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!cp ('t58');
      !!!parse-error (type => 'not closed',
                      text => $self->{open_elements}->[-1]->[0]
                          ->manakai_local_name,
                      token => $end_tag_token);
    }

    ## Step 2
    ## Walk from the top of the stack down to the formatting element;
    ## the last non-formatting special/scoping entry seen is kept as
    ## the furthest block.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not ($node->[1] & FORMATTING_EL) and
          #not $phrasing_category->{$node->[1]} and
          ($node->[1] & SPECIAL_EL or
           $node->[1] & SCOPING_EL)) { ## Scoping is redundant, maybe
        !!!cp ('t59');
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        !!!cp ('t60');
        last OE;
      }
    } # OE

    ## Step 3
    ## No furthest block: drop the formatting element and everything
    ## above it from the stack, and remove it from the active list.
    unless (defined $furthest_block) { # MUST
      !!!cp ('t61');
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      !!!cp ('t62');
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## NOTE(review): if the formatting element is at index 0 the index
    ## below is -1, which selects the last entry of the list - confirm
    ## that this is intended.
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
            ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      ## If the node is not in the list of active formatting elements,
      ## remove it from the stack and restart Step 7.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            !!!cp ('t63');
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        !!!cp ('t64');
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      ## A node that already has children is replaced, in both lists,
      ## by a shallow clone.
      if ($node->[0]->has_child_nodes ()) {
        !!!cp ('t65');
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    ## When the common ancestor is a table-rows element, the last node
    ## is foster parented; otherwise it is appended to the ancestor.
    if ($common_ancestor_node->[1] & TABLE_ROWS_EL) {
      my $foster_parent_element;
      my $next_sibling;
      OE: for (reverse 0..$#{$self->{open_elements}}) {
        if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
          my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
          if (defined $parent and $parent->node_type == 1) {
            !!!cp ('t65.1');
            $foster_parent_element = $parent;
            $next_sibling = $self->{open_elements}->[$_]->[0];
          } else {
            !!!cp ('t65.2');
            $foster_parent_element
                = $self->{open_elements}->[$_ - 1]->[0];
          }
          last OE;
        }
      } # OE
      $foster_parent_element = $self->{open_elements}->[0]->[0]
          unless defined $foster_parent_element;
      $foster_parent_element->insert_before ($last_node->[0], $next_sibling);
      $open_tables->[-1]->[1] = 1; # tainted
    } else {
      !!!cp ('t65.3');
      $common_ancestor_node->[0]->append_child ($last_node->[0]);
    }

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## Copy the child list first; it is modified while moving nodes.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## Remove the formatting element from the active list and insert
    ## the clone after the bookmark.  If the bookmark index was already
    ## recorded, removing a lower entry shifts it down by one.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t66');
        splice @$active_formatting_elements, $_, 1;
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        !!!cp ('t67');
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t68');
        splice @{$self->{open_elements}}, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        !!!cp ('t69');
        $i = $_;
      }
    } # OE
    ## NOTE(review): LENGTH is 1 here, so an entry directly after the
    ## furthest block (if any) is replaced by the clone rather than
    ## kept - confirm against the specification.
    splice @{$self->{open_elements}}, $i + 1, 1, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
4204
## Default insertion routine (also stored in |$insert|): appends the
## given node to the node of the last entry of the stack of open
## elements.
$insert = my $insert_to_current = sub {
  my $child = $_[0];
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($child);
}; # $insert_to_current
4208
## Inserts a node with foster parenting.  If the last entry of the
## stack of open elements is not a table-rows element, the node is
## simply appended to it.  Otherwise the node is inserted relative to
## the nearest open |table| and the current table is marked as tainted.
my $insert_to_foster = sub {
  my $child = shift;
  my $stack = $self->{open_elements};

  unless ($stack->[-1]->[1] & TABLE_ROWS_EL) {
    !!!cp ('t72');
    return $stack->[-1]->[0]->append_child ($child);
  }

  # MUST
  my ($foster_parent_element, $next_sibling);
  OE: for my $i (reverse 0..$#$stack) {
    my $entry = $stack->[$i];
    next OE unless $entry->[1] & TABLE_EL;

    ## Nearest table found: use its parent if that is an element
    ## (inserting before the table), else the entry below the table.
    my $table_parent = $entry->[0]->parent_node;
    if (defined $table_parent and $table_parent->node_type == 1) {
      !!!cp ('t70');
      $foster_parent_element = $table_parent;
      $next_sibling = $entry->[0];
    } else {
      !!!cp ('t71');
      $foster_parent_element = $stack->[$i - 1]->[0];
    }
    last OE;
  } # OE

  ## No table on the stack: fall back to the first open element.
  $foster_parent_element = $stack->[0]->[0]
      unless defined $foster_parent_element;
  $foster_parent_element->insert_before ($child, $next_sibling);
  $open_tables->[-1]->[1] = 1; # tainted
}; # $insert_to_foster
4240
4241 B: while (1) {
4242 if ($token->{type} == DOCTYPE_TOKEN) {
4243 !!!cp ('t73');
4244 !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
4245 ## Ignore the token
4246 ## Stay in the phase
4247 !!!next-token;
4248 next B;
4249 } elsif ($token->{type} == START_TAG_TOKEN and
4250 $token->{tag_name} eq 'html') {
4251 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4252 !!!cp ('t79');
4253 !!!parse-error (type => 'after html', text => 'html', token => $token);
4254 $self->{insertion_mode} = AFTER_BODY_IM;
4255 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4256 !!!cp ('t80');
4257 !!!parse-error (type => 'after html', text => 'html', token => $token);
4258 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4259 } else {
4260 !!!cp ('t81');
4261 }
4262
4263 !!!cp ('t82');
4264 !!!parse-error (type => 'not first start tag', token => $token);
4265 my $top_el = $self->{open_elements}->[0]->[0];
4266 for my $attr_name (keys %{$token->{attributes}}) {
4267 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
4268 !!!cp ('t84');
4269 $top_el->set_attribute_ns
4270 (undef, [undef, $attr_name],
4271 $token->{attributes}->{$attr_name}->{value});
4272 }
4273 }
4274 !!!nack ('t84.1');
4275 !!!next-token;
4276 next B;
4277 } elsif ($token->{type} == COMMENT_TOKEN) {
4278 my $comment = $self->{document}->create_comment ($token->{data});
4279 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
4280 !!!cp ('t85');
4281 $self->{document}->append_child ($comment);
4282 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
4283 !!!cp ('t86');
4284 $self->{open_elements}->[0]->[0]->append_child ($comment);
4285 } else {
4286 !!!cp ('t87');
4287 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4288 }
4289 !!!next-token;
4290 next B;
4291 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
4292 if ($token->{type} == CHARACTER_TOKEN) {
4293 !!!cp ('t87.1');
4294 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4295 !!!next-token;
4296 next B;
4297 } elsif ($token->{type} == START_TAG_TOKEN) {
4298 if ((not {mglyph => 1, malignmark => 1}->{$token->{tag_name}} and
4299 $self->{open_elements}->[-1]->[1] & FOREIGN_FLOW_CONTENT_EL) or
4300 not ($self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
4301 ($token->{tag_name} eq 'svg' and
4302 $self->{open_elements}->[-1]->[1] & MML_AXML_EL)) {
4303 ## NOTE: "using the rules for secondary insertion mode" then "continue"
4304 !!!cp ('t87.2');
4305 #
4306 } elsif ({
4307 b => 1, big => 1, blockquote => 1, body => 1, br => 1,
4308 center => 1, code => 1, dd => 1, div => 1, dl => 1, dt => 1,
4309 em => 1, embed => 1, font => 1, h1 => 1, h2 => 1, h3 => 1,
4310 h4 => 1, h5 => 1, h6 => 1, head => 1, hr => 1, i => 1,
4311 img => 1, li => 1, listing => 1, menu => 1, meta => 1,
4312 nobr => 1, ol => 1, p => 1, pre => 1, ruby => 1, s => 1,
4313 small => 1, span => 1, strong => 1, strike => 1, sub => 1,
4314 sup => 1, table => 1, tt => 1, u => 1, ul => 1, var => 1,
4315 }->{$token->{tag_name}}) {
4316 !!!cp ('t87.2');
4317 !!!parse-error (type => 'not closed',
4318 text => $self->{open_elements}->[-1]->[0]
4319 ->manakai_local_name,
4320 token => $token);
4321
4322 pop @{$self->{open_elements}}
4323 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4324
4325 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4326 ## Reprocess.
4327 next B;
4328 } else {
4329 my $nsuri = $self->{open_elements}->[-1]->[0]->namespace_uri;
4330 my $tag_name = $token->{tag_name};
4331 if ($nsuri eq $SVG_NS) {
4332 $tag_name = {
4333 altglyph => 'altGlyph',
4334 altglyphdef => 'altGlyphDef',
4335 altglyphitem => 'altGlyphItem',
4336 animatecolor => 'animateColor',
4337 animatemotion => 'animateMotion',
4338 animatetransform => 'animateTransform',
4339 clippath => 'clipPath',
4340 feblend => 'feBlend',
4341 fecolormatrix => 'feColorMatrix',
4342 fecomponenttransfer => 'feComponentTransfer',
4343 fecomposite => 'feComposite',
4344 feconvolvematrix => 'feConvolveMatrix',
4345 fediffuselighting => 'feDiffuseLighting',
4346 fedisplacementmap => 'feDisplacementMap',
4347 fedistantlight => 'feDistantLight',
4348 feflood => 'feFlood',
4349 fefunca => 'feFuncA',
4350 fefuncb => 'feFuncB',
4351 fefuncg => 'feFuncG',
4352 fefuncr => 'feFuncR',
4353 fegaussianblur => 'feGaussianBlur',
4354 feimage => 'feImage',
4355 femerge => 'feMerge',
4356 femergenode => 'feMergeNode',
4357 femorphology => 'feMorphology',
4358 feoffset => 'feOffset',
4359 fepointlight => 'fePointLight',
4360 fespecularlighting => 'feSpecularLighting',
4361 fespotlight => 'feSpotLight',
4362 fetile => 'feTile',
4363 feturbulence => 'feTurbulence',
4364 foreignobject => 'foreignObject',
4365 glyphref => 'glyphRef',
4366 lineargradient => 'linearGradient',
4367 radialgradient => 'radialGradient',
4368 #solidcolor => 'solidColor', ## NOTE: Commented in spec (SVG1.2)
4369 textpath => 'textPath',
4370 }->{$tag_name} || $tag_name;
4371 }
4372
4373 ## "adjust SVG attributes" (SVG only) - done in insert-element-f
4374
4375 ## "adjust foreign attributes" - done in insert-element-f
4376
4377 !!!insert-element-f ($nsuri, $tag_name, $token->{attributes}, $token);
4378
4379 if ($self->{self_closing}) {
4380 pop @{$self->{open_elements}};
4381 !!!ack ('t87.3');
4382 } else {
4383 !!!cp ('t87.4');
4384 }
4385
4386 !!!next-token;
4387 next B;
4388 }
4389 } elsif ($token->{type} == END_TAG_TOKEN) {
4390 ## NOTE: "using the rules for secondary insertion mode" then "continue"
4391 !!!cp ('t87.5');
4392 #
4393 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4394 !!!cp ('t87.6');
4395 !!!parse-error (type => 'not closed',
4396 text => $self->{open_elements}->[-1]->[0]
4397 ->manakai_local_name,
4398 token => $token);
4399
4400 pop @{$self->{open_elements}}
4401 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4402
4403 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4404 ## Reprocess.
4405 next B;
4406 } else {
4407 die "$0: $token->{type}: Unknown token type";
4408 }
4409 }
4410
4411 if ($self->{insertion_mode} & HEAD_IMS) {
4412 if ($token->{type} == CHARACTER_TOKEN) {
4413 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4414 unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4415 !!!cp ('t88.2');
4416 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4417 } else {
4418 !!!cp ('t88.1');
4419 ## Ignore the token.
4420 !!!next-token;
4421 next B;
4422 }
4423 unless (length $token->{data}) {
4424 !!!cp ('t88');
4425 !!!next-token;
4426 next B;
4427 }
4428 }
4429
4430 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4431 !!!cp ('t89');
4432 ## As if <head>
4433 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4434 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4435 push @{$self->{open_elements}},
4436 [$self->{head_element}, $el_category->{head}];
4437
4438 ## Reprocess in the "in head" insertion mode...
4439 pop @{$self->{open_elements}};
4440
4441 ## Reprocess in the "after head" insertion mode...
4442 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4443 !!!cp ('t90');
4444 ## As if </noscript>
4445 pop @{$self->{open_elements}};
4446 !!!parse-error (type => 'in noscript:#text', token => $token);
4447
4448 ## Reprocess in the "in head" insertion mode...
4449 ## As if </head>
4450 pop @{$self->{open_elements}};
4451
4452 ## Reprocess in the "after head" insertion mode...
4453 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4454 !!!cp ('t91');
4455 pop @{$self->{open_elements}};
4456
4457 ## Reprocess in the "after head" insertion mode...
4458 } else {
4459 !!!cp ('t92');
4460 }
4461
4462 ## "after head" insertion mode
4463 ## As if <body>
4464 !!!insert-element ('body',, $token);
4465 $self->{insertion_mode} = IN_BODY_IM;
4466 ## reprocess
4467 next B;
4468 } elsif ($token->{type} == START_TAG_TOKEN) {
4469 if ($token->{tag_name} eq 'head') {
4470 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4471 !!!cp ('t93');
4472 !!!create-element ($self->{head_element}, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
4473 $self->{open_elements}->[-1]->[0]->append_child
4474 ($self->{head_element});
4475 push @{$self->{open_elements}},
4476 [$self->{head_element}, $el_category->{head}];
4477 $self->{insertion_mode} = IN_HEAD_IM;
4478 !!!nack ('t93.1');
4479 !!!next-token;
4480 next B;
4481 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4482 !!!cp ('t93.2');
4483 !!!parse-error (type => 'after head', text => 'head',
4484 token => $token);
4485 ## Ignore the token
4486 !!!nack ('t93.3');
4487 !!!next-token;
4488 next B;
4489 } else {
4490 !!!cp ('t95');
4491 !!!parse-error (type => 'in head:head',
4492 token => $token); # or in head noscript
4493 ## Ignore the token
4494 !!!nack ('t95.1');
4495 !!!next-token;
4496 next B;
4497 }
4498 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4499 !!!cp ('t96');
4500 ## As if <head>
4501 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4502 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4503 push @{$self->{open_elements}},
4504 [$self->{head_element}, $el_category->{head}];
4505
4506 $self->{insertion_mode} = IN_HEAD_IM;
4507 ## Reprocess in the "in head" insertion mode...
4508 } else {
4509 !!!cp ('t97');
4510 }
4511
4512 if ($token->{tag_name} eq 'base') {
4513 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4514 !!!cp ('t98');
4515 ## As if </noscript>
4516 pop @{$self->{open_elements}};
4517 !!!parse-error (type => 'in noscript', text => 'base',
4518 token => $token);
4519
4520 $self->{insertion_mode} = IN_HEAD_IM;
4521 ## Reprocess in the "in head" insertion mode...
4522 } else {
4523 !!!cp ('t99');
4524 }
4525
4526 ## NOTE: There is a "as if in head" code clone.
4527 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4528 !!!cp ('t100');
4529 !!!parse-error (type => 'after head',
4530 text => $token->{tag_name}, token => $token);
4531 push @{$self->{open_elements}},
4532 [$self->{head_element}, $el_category->{head}];
4533 } else {
4534 !!!cp ('t101');
4535 }
4536 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4537 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4538 pop @{$self->{open_elements}} # <head>
4539 if $self->{insertion_mode} == AFTER_HEAD_IM;
4540 !!!nack ('t101.1');
4541 !!!next-token;
4542 next B;
4543 } elsif ($token->{tag_name} eq 'link') {
4544 ## NOTE: There is a "as if in head" code clone.
4545 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4546 !!!cp ('t102');
4547 !!!parse-error (type => 'after head',
4548 text => $token->{tag_name}, token => $token);
4549 push @{$self->{open_elements}},
4550 [$self->{head_element}, $el_category->{head}];
4551 } else {
4552 !!!cp ('t103');
4553 }
4554 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4555 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4556 pop @{$self->{open_elements}} # <head>
4557 if $self->{insertion_mode} == AFTER_HEAD_IM;
4558 !!!ack ('t103.1');
4559 !!!next-token;
4560 next B;
4561 } elsif ($token->{tag_name} eq 'meta') {
4562 ## NOTE: There is a "as if in head" code clone.
4563 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4564 !!!cp ('t104');
4565 !!!parse-error (type => 'after head',
4566 text => $token->{tag_name}, token => $token);
4567 push @{$self->{open_elements}},
4568 [$self->{head_element}, $el_category->{head}];
4569 } else {
4570 !!!cp ('t105');
4571 }
4572 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4573 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4574
4575 unless ($self->{confident}) {
4576 if ($token->{attributes}->{charset}) {
4577 !!!cp ('t106');
4578 ## NOTE: Whether the encoding is supported or not is handled
4579 ## in the {change_encoding} callback.
4580 $self->{change_encoding}
4581 ->($self, $token->{attributes}->{charset}->{value},
4582 $token);
4583
4584 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4585 ->set_user_data (manakai_has_reference =>
4586 $token->{attributes}->{charset}
4587 ->{has_reference});
4588 } elsif ($token->{attributes}->{content}) {
4589 if ($token->{attributes}->{content}->{value}
4590 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4591 [\x09-\x0D\x20]*=
4592 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4593 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
4594 !!!cp ('t107');
4595 ## NOTE: Whether the encoding is supported or not is handled
4596 ## in the {change_encoding} callback.
4597 $self->{change_encoding}
4598 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
4599 $token);
4600 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4601 ->set_user_data (manakai_has_reference =>
4602 $token->{attributes}->{content}
4603 ->{has_reference});
4604 } else {
4605 !!!cp ('t108');
4606 }
4607 }
4608 } else {
4609 if ($token->{attributes}->{charset}) {
4610 !!!cp ('t109');
4611 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4612 ->set_user_data (manakai_has_reference =>
4613 $token->{attributes}->{charset}
4614 ->{has_reference});
4615 }
4616 if ($token->{attributes}->{content}) {
4617 !!!cp ('t110');
4618 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4619 ->set_user_data (manakai_has_reference =>
4620 $token->{attributes}->{content}
4621 ->{has_reference});
4622 }
4623 }
4624
4625 pop @{$self->{open_elements}} # <head>
4626 if $self->{insertion_mode} == AFTER_HEAD_IM;
4627 !!!ack ('t110.1');
4628 !!!next-token;
4629 next B;
4630 } elsif ($token->{tag_name} eq 'title') {
4631 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4632 !!!cp ('t111');
4633 ## As if </noscript>
4634 pop @{$self->{open_elements}};
4635 !!!parse-error (type => 'in noscript', text => 'title',
4636 token => $token);
4637
4638 $self->{insertion_mode} = IN_HEAD_IM;
4639 ## Reprocess in the "in head" insertion mode...
4640 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4641 !!!cp ('t112');
4642 !!!parse-error (type => 'after head',
4643 text => $token->{tag_name}, token => $token);
4644 push @{$self->{open_elements}},
4645 [$self->{head_element}, $el_category->{head}];
4646 } else {
4647 !!!cp ('t113');
4648 }
4649
4650 ## NOTE: There is a "as if in head" code clone.
4651 my $parent = defined $self->{head_element} ? $self->{head_element}
4652 : $self->{open_elements}->[-1]->[0];
4653 $parse_rcdata->(RCDATA_CONTENT_MODEL);
4654 pop @{$self->{open_elements}} # <head>
4655 if $self->{insertion_mode} == AFTER_HEAD_IM;
4656 next B;
4657 } elsif ($token->{tag_name} eq 'style' or
4658 $token->{tag_name} eq 'noframes') {
4659 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
4660 ## insertion mode IN_HEAD_IM)
4661 ## NOTE: There is a "as if in head" code clone.
4662 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4663 !!!cp ('t114');
4664 !!!parse-error (type => 'after head',
4665 text => $token->{tag_name}, token => $token);
4666 push @{$self->{open_elements}},
4667 [$self->{head_element}, $el_category->{head}];
4668 } else {
4669 !!!cp ('t115');
4670 }
4671 $parse_rcdata->(CDATA_CONTENT_MODEL);
4672 pop @{$self->{open_elements}} # <head>
4673 if $self->{insertion_mode} == AFTER_HEAD_IM;
4674 next B;
4675 } elsif ($token->{tag_name} eq 'noscript') {
4676 if ($self->{insertion_mode} == IN_HEAD_IM) {
4677 !!!cp ('t116');
4678 ## NOTE: and scripting is disabled
4679 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4680 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
4681 !!!nack ('t116.1');
4682 !!!next-token;
4683 next B;
4684 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4685 !!!cp ('t117');
4686 !!!parse-error (type => 'in noscript', text => 'noscript',
4687 token => $token);
4688 ## Ignore the token
4689 !!!nack ('t117.1');
4690 !!!next-token;
4691 next B;
4692 } else {
4693 !!!cp ('t118');
4694 #
4695 }
4696 } elsif ($token->{tag_name} eq 'script') {
4697 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4698 !!!cp ('t119');
4699 ## As if </noscript>
4700 pop @{$self->{open_elements}};
4701 !!!parse-error (type => 'in noscript', text => 'script',
4702 token => $token);
4703
4704 $self->{insertion_mode} = IN_HEAD_IM;
4705 ## Reprocess in the "in head" insertion mode...
4706 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4707 !!!cp ('t120');
4708 !!!parse-error (type => 'after head',
4709 text => $token->{tag_name}, token => $token);
4710 push @{$self->{open_elements}},
4711 [$self->{head_element}, $el_category->{head}];
4712 } else {
4713 !!!cp ('t121');
4714 }
4715
4716 ## NOTE: There is a "as if in head" code clone.
4717 $script_start_tag->();
4718 pop @{$self->{open_elements}} # <head>
4719 if $self->{insertion_mode} == AFTER_HEAD_IM;
4720 next B;
4721 } elsif ($token->{tag_name} eq 'body' or
4722 $token->{tag_name} eq 'frameset') {
4723 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4724 !!!cp ('t122');
4725 ## As if </noscript>
4726 pop @{$self->{open_elements}};
4727 !!!parse-error (type => 'in noscript',
4728 text => $token->{tag_name}, token => $token);
4729
4730 ## Reprocess in the "in head" insertion mode...
4731 ## As if </head>
4732 pop @{$self->{open_elements}};
4733
4734 ## Reprocess in the "after head" insertion mode...
4735 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4736 !!!cp ('t124');
4737 pop @{$self->{open_elements}};
4738
4739 ## Reprocess in the "after head" insertion mode...
4740 } else {
4741 !!!cp ('t125');
4742 }
4743
4744 ## "after head" insertion mode
4745 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4746 if ($token->{tag_name} eq 'body') {
4747 !!!cp ('t126');
4748 $self->{insertion_mode} = IN_BODY_IM;
4749 } elsif ($token->{tag_name} eq 'frameset') {
4750 !!!cp ('t127');
4751 $self->{insertion_mode} = IN_FRAMESET_IM;
4752 } else {
4753 die "$0: tag name: $self->{tag_name}";
4754 }
4755 !!!nack ('t127.1');
4756 !!!next-token;
4757 next B;
4758 } else {
4759 !!!cp ('t128');
4760 #
4761 }
4762
4763 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4764 !!!cp ('t129');
4765 ## As if </noscript>
4766 pop @{$self->{open_elements}};
4767 !!!parse-error (type => 'in noscript:/',
4768 text => $token->{tag_name}, token => $token);
4769
4770 ## Reprocess in the "in head" insertion mode...
4771 ## As if </head>
4772 pop @{$self->{open_elements}};
4773
4774 ## Reprocess in the "after head" insertion mode...
4775 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4776 !!!cp ('t130');
4777 ## As if </head>
4778 pop @{$self->{open_elements}};
4779
4780 ## Reprocess in the "after head" insertion mode...
4781 } else {
4782 !!!cp ('t131');
4783 }
4784
4785 ## "after head" insertion mode
4786 ## As if <body>
4787 !!!insert-element ('body',, $token);
4788 $self->{insertion_mode} = IN_BODY_IM;
4789 ## reprocess
4790 !!!ack-later;
4791 next B;
4792 } elsif ($token->{type} == END_TAG_TOKEN) {
4793 if ($token->{tag_name} eq 'head') {
4794 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4795 !!!cp ('t132');
4796 ## As if <head>
4797 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4798 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4799 push @{$self->{open_elements}},
4800 [$self->{head_element}, $el_category->{head}];
4801
4802 ## Reprocess in the "in head" insertion mode...
4803 pop @{$self->{open_elements}};
4804 $self->{insertion_mode} = AFTER_HEAD_IM;
4805 !!!next-token;
4806 next B;
4807 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4808 !!!cp ('t133');
4809 ## As if </noscript>
4810 pop @{$self->{open_elements}};
4811 !!!parse-error (type => 'in noscript:/',
4812 text => 'head', token => $token);
4813
4814 ## Reprocess in the "in head" insertion mode...
4815 pop @{$self->{open_elements}};
4816 $self->{insertion_mode} = AFTER_HEAD_IM;
4817 !!!next-token;
4818 next B;
4819 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4820 !!!cp ('t134');
4821 pop @{$self->{open_elements}};
4822 $self->{insertion_mode} = AFTER_HEAD_IM;
4823 !!!next-token;
4824 next B;
4825 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4826 !!!cp ('t134.1');
4827 !!!parse-error (type => 'unmatched end tag', text => 'head',
4828 token => $token);
4829 ## Ignore the token
4830 !!!next-token;
4831 next B;
4832 } else {
4833 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4834 }
4835 } elsif ($token->{tag_name} eq 'noscript') {
4836 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4837 !!!cp ('t136');
4838 pop @{$self->{open_elements}};
4839 $self->{insertion_mode} = IN_HEAD_IM;
4840 !!!next-token;
4841 next B;
4842 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM or
4843 $self->{insertion_mode} == AFTER_HEAD_IM) {
4844 !!!cp ('t137');
4845 !!!parse-error (type => 'unmatched end tag',
4846 text => 'noscript', token => $token);
4847 ## Ignore the token ## ISSUE: An issue in the spec.
4848 !!!next-token;
4849 next B;
4850 } else {
4851 !!!cp ('t138');
4852 #
4853 }
4854 } elsif ({
4855 body => 1, html => 1,
4856 }->{$token->{tag_name}}) {
4857 if ($self->{insertion_mode} == BEFORE_HEAD_IM or
4858 $self->{insertion_mode} == IN_HEAD_IM or
4859 $self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4860 !!!cp ('t140');
4861 !!!parse-error (type => 'unmatched end tag',
4862 text => $token->{tag_name}, token => $token);
4863 ## Ignore the token
4864 !!!next-token;
4865 next B;
4866 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4867 !!!cp ('t140.1');
4868 !!!parse-error (type => 'unmatched end tag',
4869 text => $token->{tag_name}, token => $token);
4870 ## Ignore the token
4871 !!!next-token;
4872 next B;
4873 } else {
4874 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4875 }
4876 } elsif ($token->{tag_name} eq 'p') {
4877 !!!cp ('t142');
4878 !!!parse-error (type => 'unmatched end tag',
4879 text => $token->{tag_name}, token => $token);
4880 ## Ignore the token
4881 !!!next-token;
4882 next B;
4883 } elsif ($token->{tag_name} eq 'br') {
4884 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4885 !!!cp ('t142.2');
4886 ## (before head) as if <head>, (in head) as if </head>
4887 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4888 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4889 $self->{insertion_mode} = AFTER_HEAD_IM;
4890
4891 ## Reprocess in the "after head" insertion mode...
4892 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4893 !!!cp ('t143.2');
4894 ## As if </head>
4895 pop @{$self->{open_elements}};
4896 $self->{insertion_mode} = AFTER_HEAD_IM;
4897
4898 ## Reprocess in the "after head" insertion mode...
4899 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4900 !!!cp ('t143.3');
4901 ## ISSUE: Two parse errors for <head><noscript></br>
4902 !!!parse-error (type => 'unmatched end tag',
4903 text => 'br', token => $token);
4904 ## As if </noscript>
4905 pop @{$self->{open_elements}};
4906 $self->{insertion_mode} = IN_HEAD_IM;
4907
4908 ## Reprocess in the "in head" insertion mode...
4909 ## As if </head>
4910 pop @{$self->{open_elements}};
4911 $self->{insertion_mode} = AFTER_HEAD_IM;
4912
4913 ## Reprocess in the "after head" insertion mode...
4914 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4915 !!!cp ('t143.4');
4916 #
4917 } else {
4918 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4919 }
4920
4921 ## ISSUE: does not agree with IE7 - it doesn't ignore </br>.
4922 !!!parse-error (type => 'unmatched end tag',
4923 text => 'br', token => $token);
4924 ## Ignore the token
4925 !!!next-token;
4926 next B;
4927 } else {
4928 !!!cp ('t145');
4929 !!!parse-error (type => 'unmatched end tag',
4930 text => $token->{tag_name}, token => $token);
4931 ## Ignore the token
4932 !!!next-token;
4933 next B;
4934 }
4935
4936 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4937 !!!cp ('t146');
4938 ## As if </noscript>
4939 pop @{$self->{open_elements}};
4940 !!!parse-error (type => 'in noscript:/',
4941 text => $token->{tag_name}, token => $token);
4942
4943 ## Reprocess in the "in head" insertion mode...
4944 ## As if </head>
4945 pop @{$self->{open_elements}};
4946
4947 ## Reprocess in the "after head" insertion mode...
4948 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4949 !!!cp ('t147');
4950 ## As if </head>
4951 pop @{$self->{open_elements}};
4952
4953 ## Reprocess in the "after head" insertion mode...
4954 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4955 ## ISSUE: This case cannot be reached?
4956 !!!cp ('t148');
4957 !!!parse-error (type => 'unmatched end tag',
4958 text => $token->{tag_name}, token => $token);
4959 ## Ignore the token ## ISSUE: An issue in the spec.
4960 !!!next-token;
4961 next B;
4962 } else {
4963 !!!cp ('t149');
4964 }
4965
4966 ## "after head" insertion mode
4967 ## As if <body>
4968 !!!insert-element ('body',, $token);
4969 $self->{insertion_mode} = IN_BODY_IM;
4970 ## reprocess
4971 next B;
4972 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4973 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4974 !!!cp ('t149.1');
4975
4976 ## NOTE: As if <head>
4977 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4978 $self->{open_elements}->[-1]->[0]->append_child
4979 ($self->{head_element});
4980 #push @{$self->{open_elements}},
4981 # [$self->{head_element}, $el_category->{head}];
4982 #$self->{insertion_mode} = IN_HEAD_IM;
4983 ## NOTE: Reprocess.
4984
4985 ## NOTE: As if </head>
4986 #pop @{$self->{open_elements}};
4987 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4988 ## NOTE: Reprocess.
4989
4990 #
4991 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4992 !!!cp ('t149.2');
4993
4994 ## NOTE: As if </head>
4995 pop @{$self->{open_elements}};
4996 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4997 ## NOTE: Reprocess.
4998
4999 #
5000 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
5001 !!!cp ('t149.3');
5002
5003 !!!parse-error (type => 'in noscript:#eof', token => $token);
5004
5005 ## As if </noscript>
5006 pop @{$self->{open_elements}};
5007 #$self->{insertion_mode} = IN_HEAD_IM;
5008 ## NOTE: Reprocess.
5009
5010 ## NOTE: As if </head>
5011 pop @{$self->{open_elements}};
5012 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
5013 ## NOTE: Reprocess.
5014
5015 #
5016 } else {
5017 !!!cp ('t149.4');
5018 #
5019 }
5020
5021 ## NOTE: As if <body>
5022 !!!insert-element ('body',, $token);
5023 $self->{insertion_mode} = IN_BODY_IM;
5024 ## NOTE: Reprocess.
5025 next B;
5026 } else {
5027 die "$0: $token->{type}: Unknown token type";
5028 }
5029
5030 ## ISSUE: An issue in the spec.
5031 } elsif ($self->{insertion_mode} & BODY_IMS) {
5032 if ($token->{type} == CHARACTER_TOKEN) {
5033 !!!cp ('t150');
5034 ## NOTE: There is a code clone of "character in body".
5035 $reconstruct_active_formatting_elements->($insert_to_current);
5036
5037 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5038
5039 !!!next-token;
5040 next B;
5041 } elsif ($token->{type} == START_TAG_TOKEN) {
5042 if ({
5043 caption => 1, col => 1, colgroup => 1, tbody => 1,
5044 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
5045 }->{$token->{tag_name}}) {
5046 if ($self->{insertion_mode} == IN_CELL_IM) {
5047 ## have an element in table scope
5048 for (reverse 0..$#{$self->{open_elements}}) {
5049 my $node = $self->{open_elements}->[$_];
5050 if ($node->[1] & TABLE_CELL_EL) {
5051 !!!cp ('t151');
5052
5053 ## Close the cell
5054 !!!back-token; # <x>
5055 $token = {type => END_TAG_TOKEN,
5056 tag_name => $node->[0]->manakai_local_name,
5057 line => $token->{line},
5058 column => $token->{column}};
5059 next B;
5060 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5061 !!!cp ('t152');
5062 ## ISSUE: This case can never be reached, maybe.
5063 last;
5064 }
5065 }
5066
5067 !!!cp ('t153');
5068 !!!parse-error (type => 'start tag not allowed',
5069 text => $token->{tag_name}, token => $token);
5070 ## Ignore the token
5071 !!!nack ('t153.1');
5072 !!!next-token;
5073 next B;
5074 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
5075 !!!parse-error (type => 'not closed', text => 'caption',
5076 token => $token);
5077
5078 ## NOTE: As if </caption>.
5079 ## have a table element in table scope
5080 my $i;
5081 INSCOPE: {
5082 for (reverse 0..$#{$self->{open_elements}}) {
5083 my $node = $self->{open_elements}->[$_];
5084 if ($node->[1] & CAPTION_EL) {
5085 !!!cp ('t155');
5086 $i = $_;
5087 last INSCOPE;
5088 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5089 !!!cp ('t156');
5090 last;
5091 }
5092 }
5093
5094 !!!cp ('t157');
5095 !!!parse-error (type => 'start tag not allowed',
5096 text => $token->{tag_name}, token => $token);
5097 ## Ignore the token
5098 !!!nack ('t157.1');
5099 !!!next-token;
5100 next B;
5101 } # INSCOPE
5102
5103 ## generate implied end tags
5104 while ($self->{open_elements}->[-1]->[1]
5105 & END_TAG_OPTIONAL_EL) {
5106 !!!cp ('t158');
5107 pop @{$self->{open_elements}};
5108 }
5109
5110 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5111 !!!cp ('t159');
5112 !!!parse-error (type => 'not closed',
5113 text => $self->{open_elements}->[-1]->[0]
5114 ->manakai_local_name,
5115 token => $token);
5116 } else {
5117 !!!cp ('t160');
5118 }
5119
5120 splice @{$self->{open_elements}}, $i;
5121
5122 $clear_up_to_marker->();
5123
5124 $self->{insertion_mode} = IN_TABLE_IM;
5125
5126 ## reprocess
5127 !!!ack-later;
5128 next B;
5129 } else {
5130 !!!cp ('t161');
5131 #
5132 }
5133 } else {
5134 !!!cp ('t162');
5135 #
5136 }
5137 } elsif ($token->{type} == END_TAG_TOKEN) {
5138 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
5139 if ($self->{insertion_mode} == IN_CELL_IM) {
5140 ## have an element in table scope
5141 my $i;
5142 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5143 my $node = $self->{open_elements}->[$_];
5144 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5145 !!!cp ('t163');
5146 $i = $_;
5147 last INSCOPE;
5148 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5149 !!!cp ('t164');
5150 last INSCOPE;
5151 }
5152 } # INSCOPE
5153 unless (defined $i) {
5154 !!!cp ('t165');
5155 !!!parse-error (type => 'unmatched end tag',
5156 text => $token->{tag_name},
5157 token => $token);
5158 ## Ignore the token
5159 !!!next-token;
5160 next B;
5161 }
5162
5163 ## generate implied end tags
5164 while ($self->{open_elements}->[-1]->[1]
5165 & END_TAG_OPTIONAL_EL) {
5166 !!!cp ('t166');
5167 pop @{$self->{open_elements}};
5168 }
5169
5170 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
5171 ne $token->{tag_name}) {
5172 !!!cp ('t167');
5173 !!!parse-error (type => 'not closed',
5174 text => $self->{open_elements}->[-1]->[0]
5175 ->manakai_local_name,
5176 token => $token);
5177 } else {
5178 !!!cp ('t168');
5179 }
5180
5181 splice @{$self->{open_elements}}, $i;
5182
5183 $clear_up_to_marker->();
5184
5185 $self->{insertion_mode} = IN_ROW_IM;
5186
5187 !!!next-token;
5188 next B;
5189 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
5190 !!!cp ('t169');
5191 !!!parse-error (type => 'unmatched end tag',
5192 text => $token->{tag_name}, token => $token);
5193 ## Ignore the token
5194 !!!next-token;
5195 next B;
5196 } else {
5197 !!!cp ('t170');
5198 #
5199 }
5200 } elsif ($token->{tag_name} eq 'caption') {
5201 if ($self->{insertion_mode} == IN_CAPTION_IM) {
5202 ## have a table element in table scope
5203 my $i;
5204 INSCOPE: {
5205 for (reverse 0..$#{$self->{open_elements}}) {
5206 my $node = $self->{open_elements}->[$_];
5207 if ($node->[1] & CAPTION_EL) {
5208 !!!cp ('t171');
5209 $i = $_;
5210 last INSCOPE;
5211 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5212 !!!cp ('t172');
5213 last;
5214 }
5215 }
5216
5217 !!!cp ('t173');
5218 !!!parse-error (type => 'unmatched end tag',
5219 text => $token->{tag_name}, token => $token);
5220 ## Ignore the token
5221 !!!next-token;
5222 next B;
5223 } # INSCOPE
5224
5225 ## generate implied end tags
5226 while ($self->{open_elements}->[-1]->[1]
5227 & END_TAG_OPTIONAL_EL) {
5228 !!!cp ('t174');
5229 pop @{$self->{open_elements}};
5230 }
5231
5232 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5233 !!!cp ('t175');
5234 !!!parse-error (type => 'not closed',
5235 text => $self->{open_elements}->[-1]->[0]
5236 ->manakai_local_name,
5237 token => $token);
5238 } else {
5239 !!!cp ('t176');
5240 }
5241
5242 splice @{$self->{open_elements}}, $i;
5243
5244 $clear_up_to_marker->();
5245
5246 $self->{insertion_mode} = IN_TABLE_IM;
5247
5248 !!!next-token;
5249 next B;
5250 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
5251 !!!cp ('t177');
5252 !!!parse-error (type => 'unmatched end tag',
5253 text => $token->{tag_name}, token => $token);
5254 ## Ignore the token
5255 !!!next-token;
5256 next B;
5257 } else {
5258 !!!cp ('t178');
5259 #
5260 }
5261 } elsif ({
5262 table => 1, tbody => 1, tfoot => 1,
5263 thead => 1, tr => 1,
5264 }->{$token->{tag_name}} and
5265 $self->{insertion_mode} == IN_CELL_IM) {
5266 ## have an element in table scope
5267 my $i;
5268 my $tn;
5269 INSCOPE: {
5270 for (reverse 0..$#{$self->{open_elements}}) {
5271 my $node = $self->{open_elements}->[$_];
5272 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5273 !!!cp ('t179');
5274 $i = $_;
5275
5276 ## Close the cell
5277 !!!back-token; # </x>
5278 $token = {type => END_TAG_TOKEN, tag_name => $tn,
5279 line => $token->{line},
5280 column => $token->{column}};
5281 next B;
5282 } elsif ($node->[1] & TABLE_CELL_EL) {
5283 !!!cp ('t180');
5284 $tn = $node->[0]->manakai_local_name;
5285 ## NOTE: There is exactly one |td| or |th| element
5286 ## in scope in the stack of open elements by definition.
5287 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5288 ## ISSUE: Can this be reached?
5289 !!!cp ('t181');
5290 last;
5291 }
5292 }
5293
5294 !!!cp ('t182');
5295 !!!parse-error (type => 'unmatched end tag',
5296 text => $token->{tag_name}, token => $token);
5297 ## Ignore the token
5298 !!!next-token;
5299 next B;
5300 } # INSCOPE
5301 } elsif ($token->{tag_name} eq 'table' and
5302 $self->{insertion_mode} == IN_CAPTION_IM) {
5303 !!!parse-error (type => 'not closed', text => 'caption',
5304 token => $token);
5305
5306 ## As if </caption>
5307 ## have a table element in table scope
5308 my $i;
5309 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5310 my $node = $self->{open_elements}->[$_];
5311 if ($node->[1] & CAPTION_EL) {
5312 !!!cp ('t184');
5313 $i = $_;
5314 last INSCOPE;
5315 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5316 !!!cp ('t185');
5317 last INSCOPE;
5318 }
5319 } # INSCOPE
5320 unless (defined $i) {
5321 !!!cp ('t186');
5322 !!!parse-error (type => 'unmatched end tag',
5323 text => 'caption', token => $token);
5324 ## Ignore the token
5325 !!!next-token;
5326 next B;
5327 }
5328
5329 ## generate implied end tags
5330 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5331 !!!cp ('t187');
5332 pop @{$self->{open_elements}};
5333 }
5334
5335 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5336 !!!cp ('t188');
5337 !!!parse-error (type => 'not closed',
5338 text => $self->{open_elements}->[-1]->[0]
5339 ->manakai_local_name,
5340 token => $token);
5341 } else {
5342 !!!cp ('t189');
5343 }
5344
5345 splice @{$self->{open_elements}}, $i;
5346
5347 $clear_up_to_marker->();
5348
5349 $self->{insertion_mode} = IN_TABLE_IM;
5350
5351 ## reprocess
5352 next B;
5353 } elsif ({
5354 body => 1, col => 1, colgroup => 1, html => 1,
5355 }->{$token->{tag_name}}) {
5356 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
5357 !!!cp ('t190');
5358 !!!parse-error (type => 'unmatched end tag',
5359 text => $token->{tag_name}, token => $token);
5360 ## Ignore the token
5361 !!!next-token;
5362 next B;
5363 } else {
5364 !!!cp ('t191');
5365 #
5366 }
5367 } elsif ({
5368 tbody => 1, tfoot => 1,
5369 thead => 1, tr => 1,
5370 }->{$token->{tag_name}} and
5371 $self->{insertion_mode} == IN_CAPTION_IM) {
5372 !!!cp ('t192');
5373 !!!parse-error (type => 'unmatched end tag',
5374 text => $token->{tag_name}, token => $token);
5375 ## Ignore the token
5376 !!!next-token;
5377 next B;
5378 } else {
5379 !!!cp ('t193');
5380 #
5381 }
5382 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5383 for my $entry (@{$self->{open_elements}}) {
5384 unless ($entry->[1] & ALL_END_TAG_OPTIONAL_EL) {
5385 !!!cp ('t75');
5386 !!!parse-error (type => 'in body:#eof', token => $token);
5387 last;
5388 }
5389 }
5390
5391 ## Stop parsing.
5392 last B;
5393 } else {
5394 die "$0: $token->{type}: Unknown token type";
5395 }
5396
5397 $insert = $insert_to_current;
5398 #
5399 } elsif ($self->{insertion_mode} & TABLE_IMS) {
5400 if ($token->{type} == CHARACTER_TOKEN) {
5401 if (not $open_tables->[-1]->[1] and # tainted
5402 $token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5403 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5404
5405 unless (length $token->{data}) {
5406 !!!cp ('t194');
5407 !!!next-token;
5408 next B;
5409 } else {
5410 !!!cp ('t195');
5411 }
5412 }
5413
5414 !!!parse-error (type => 'in table:#text', token => $token);
5415
5416 ## As if in body, but insert into foster parent element
5417 ## ISSUE: The spec says "whenever a node would be inserted
5418 ## into the current node", but inserting characters might not
5419 ## result in a new Text node.
5420 $reconstruct_active_formatting_elements->($insert_to_foster);
5421
5422 if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
5423 # MUST
5424 my $foster_parent_element;
5425 my $next_sibling;
5426 my $prev_sibling;
5427 OE: for (reverse 0..$#{$self->{open_elements}}) {
5428 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
5429 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
5430 if (defined $parent and $parent->node_type == 1) {
5431 !!!cp ('t196');
5432 $foster_parent_element = $parent;
5433 $next_sibling = $self->{open_elements}->[$_]->[0];
5434 $prev_sibling = $next_sibling->previous_sibling;
5435 } else {
5436 !!!cp ('t197');
5437 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
5438 $prev_sibling = $foster_parent_element->last_child;
5439 }
5440 last OE;
5441 }
5442 } # OE
5443 $foster_parent_element = $self->{open_elements}->[0]->[0] and
5444 $prev_sibling = $foster_parent_element->last_child
5445 unless defined $foster_parent_element;
5446 if (defined $prev_sibling and
5447 $prev_sibling->node_type == 3) {
5448 !!!cp ('t198');
5449 $prev_sibling->manakai_append_text ($token->{data});
5450 } else {
5451 !!!cp ('t199');
5452 $foster_parent_element->insert_before
5453 ($self->{document}->create_text_node ($token->{data}),
5454 $next_sibling);
5455 }
5456 $open_tables->[-1]->[1] = 1; # tainted
5457 } else {
5458 !!!cp ('t200');
5459 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5460 }
5461
5462 !!!next-token;
5463 next B;
5464 } elsif ($token->{type} == START_TAG_TOKEN) {
5465 if ({
5466 tr => ($self->{insertion_mode} != IN_ROW_IM),
5467 th => 1, td => 1,
5468 }->{$token->{tag_name}}) {
5469 if ($self->{insertion_mode} == IN_TABLE_IM) {
5470 ## Clear back to table context
5471 while (not ($self->{open_elements}->[-1]->[1]
5472 & TABLE_SCOPING_EL)) {
5473 !!!cp ('t201');
5474 pop @{$self->{open_elements}};
5475 }
5476
5477 !!!insert-element ('tbody',, $token);
5478 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5479 ## reprocess in the "in table body" insertion mode...
5480 }
5481
5482 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5483 unless ($token->{tag_name} eq 'tr') {
5484 !!!cp ('t202');
5485 !!!parse-error (type => 'missing start tag:tr', token => $token);
5486 }
5487
5488 ## Clear back to table body context
5489 while (not ($self->{open_elements}->[-1]->[1]
5490 & TABLE_ROWS_SCOPING_EL)) {
5491 !!!cp ('t203');
5492 ## ISSUE: Can this case be reached?
5493 pop @{$self->{open_elements}};
5494 }
5495
5496 $self->{insertion_mode} = IN_ROW_IM;
5497 if ($token->{tag_name} eq 'tr') {
5498 !!!cp ('t204');
5499 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5500 !!!nack ('t204');
5501 !!!next-token;
5502 next B;
5503 } else {
5504 !!!cp ('t205');
5505 !!!insert-element ('tr',, $token);
5506 ## reprocess in the "in row" insertion mode
5507 }
5508 } else {
5509 !!!cp ('t206');
5510 }
5511
5512 ## Clear back to table row context
5513 while (not ($self->{open_elements}->[-1]->[1]
5514 & TABLE_ROW_SCOPING_EL)) {
5515 !!!cp ('t207');
5516 pop @{$self->{open_elements}};
5517 }
5518
5519 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5520 $self->{insertion_mode} = IN_CELL_IM;
5521
5522 push @$active_formatting_elements, ['#marker', ''];
5523
5524 !!!nack ('t207.1');
5525 !!!next-token;
5526 next B;
5527 } elsif ({
5528 caption => 1, col => 1, colgroup => 1,
5529 tbody => 1, tfoot => 1, thead => 1,
5530 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5531 }->{$token->{tag_name}}) {
5532 if ($self->{insertion_mode} == IN_ROW_IM) {
5533 ## As if </tr>
5534 ## have an element in table scope
5535 my $i;
5536 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5537 my $node = $self->{open_elements}->[$_];
5538 if ($node->[1] & TABLE_ROW_EL) {
5539 !!!cp ('t208');
5540 $i = $_;
5541 last INSCOPE;
5542 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5543 !!!cp ('t209');
5544 last INSCOPE;
5545 }
5546 } # INSCOPE
5547 unless (defined $i) {
5548 !!!cp ('t210');
5549 ## No |tr| was found in table scope, so the implied </tr> cannot be applied.  TODO: This type is wrong (presumably because the token is a start tag, not an end tag).
5550 !!!parse-error (type => 'unmatched end tag',
5551 text => $token->{tag_name}, token => $token);
5552 ## Ignore the token
5553 !!!nack ('t210.1');
5554 !!!next-token;
5555 next B;
5556 }
5557
5558 ## Clear back to table row context
5559 while (not ($self->{open_elements}->[-1]->[1]
5560 & TABLE_ROW_SCOPING_EL)) {
5561 !!!cp ('t211');
5562 ## ISSUE: Can this case be reached?
5563 pop @{$self->{open_elements}};
5564 }
5565
5566 pop @{$self->{open_elements}}; # tr
5567 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5568 if ($token->{tag_name} eq 'tr') {
5569 !!!cp ('t212');
5570 ## reprocess
5571 !!!ack-later;
5572 next B;
5573 } else {
5574 !!!cp ('t213');
5575 ## reprocess in the "in table body" insertion mode...
5576 }
5577 }
5578
5579 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5580 ## have an element in table scope
5581 my $i;
5582 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5583 my $node = $self->{open_elements}->[$_];
5584 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5585 !!!cp ('t214');
5586 $i = $_;
5587 last INSCOPE;
5588 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5589 !!!cp ('t215');
5590 last INSCOPE;
5591 }
5592 } # INSCOPE
5593 unless (defined $i) {
5594 !!!cp ('t216');
5595 ## TODO: This error type is wrong.
5596 !!!parse-error (type => 'unmatched end tag',
5597 text => $token->{tag_name}, token => $token);
5598 ## Ignore the token
5599 !!!nack ('t216.1');
5600 !!!next-token;
5601 next B;
5602 }
5603
5604 ## Clear back to table body context
5605 while (not ($self->{open_elements}->[-1]->[1]
5606 & TABLE_ROWS_SCOPING_EL)) {
5607 !!!cp ('t217');
5608 ## ISSUE: Can this state be reached?
5609 pop @{$self->{open_elements}};
5610 }
5611
5612 ## As if <{current node}>
5613 ## have an element in table scope
5614 ## true by definition
5615
5616 ## Clear back to table body context
5617 ## nop by definition
5618
5619 pop @{$self->{open_elements}};
5620 $self->{insertion_mode} = IN_TABLE_IM;
5621 ## reprocess in "in table" insertion mode...
5622 } else {
5623 !!!cp ('t218');
5624 }
5625
5626 if ($token->{tag_name} eq 'col') {
5627 ## Clear back to table context
5628 while (not ($self->{open_elements}->[-1]->[1]
5629 & TABLE_SCOPING_EL)) {
5630 !!!cp ('t219');
5631 ## ISSUE: Can this state be reached?
5632 pop @{$self->{open_elements}};
5633 }
5634
5635 !!!insert-element ('colgroup',, $token);
5636 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
5637 ## reprocess
5638 !!!ack-later;
5639 next B;
5640 } elsif ({
5641 caption => 1,
5642 colgroup => 1,
5643 tbody => 1, tfoot => 1, thead => 1,
5644 }->{$token->{tag_name}}) {
5645 ## Clear back to table context
5646 while (not ($self->{open_elements}->[-1]->[1]
5647 & TABLE_SCOPING_EL)) {
5648 !!!cp ('t220');
5649 ## ISSUE: Can this state be reached?
5650 pop @{$self->{open_elements}};
5651 }
5652
5653 push @$active_formatting_elements, ['#marker', '']
5654 if $token->{tag_name} eq 'caption';
5655
5656 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5657 $self->{insertion_mode} = {
5658 caption => IN_CAPTION_IM,
5659 colgroup => IN_COLUMN_GROUP_IM,
5660 tbody => IN_TABLE_BODY_IM,
5661 tfoot => IN_TABLE_BODY_IM,
5662 thead => IN_TABLE_BODY_IM,
5663 }->{$token->{tag_name}};
5664 !!!next-token;
5665 !!!nack ('t220.1');
5666 next B;
5667 } else {
5668 die "$0: in table: <>: $token->{tag_name}";
5669 }
5670 } elsif ($token->{tag_name} eq 'table') {
5671 !!!parse-error (type => 'not closed',
5672 text => $self->{open_elements}->[-1]->[0]
5673 ->manakai_local_name,
5674 token => $token);
5675
5676 ## As if </table>
5677 ## have a table element in table scope
5678 my $i;
5679 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5680 my $node = $self->{open_elements}->[$_];
5681 if ($node->[1] & TABLE_EL) {
5682 !!!cp ('t221');
5683 $i = $_;
5684 last INSCOPE;
5685 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5686 !!!cp ('t222');
5687 last INSCOPE;
5688 }
5689 } # INSCOPE
5690 unless (defined $i) {
5691 !!!cp ('t223');
5692 ## TODO: The following is wrong, maybe.
5693 !!!parse-error (type => 'unmatched end tag', text => 'table',
5694 token => $token);
5695 ## Ignore tokens </table><table>
5696 !!!nack ('t223.1');
5697 !!!next-token;
5698 next B;
5699 }
5700
5701 ## TODO: Followings are removed from the latest spec.
5702 ## generate implied end tags
5703 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5704 !!!cp ('t224');
5705 pop @{$self->{open_elements}};
5706 }
5707
5708 unless ($self->{open_elements}->[-1]->[1] & TABLE_EL) {
5709 !!!cp ('t225');
5710 ## NOTE: |<table><tr><table>|
5711 !!!parse-error (type => 'not closed',
5712 text => $self->{open_elements}->[-1]->[0]
5713 ->manakai_local_name,
5714 token => $token);
5715 } else {
5716 !!!cp ('t226');
5717 }
5718
5719 splice @{$self->{open_elements}}, $i;
5720 pop @{$open_tables};
5721
5722 $self->_reset_insertion_mode;
5723
5724 ## reprocess
5725 !!!ack-later;
5726 next B;
5727 } elsif ($token->{tag_name} eq 'style') {
5728 if (not $open_tables->[-1]->[1]) { # tainted
5729 !!!cp ('t227.8');
5730 ## NOTE: This is a "as if in head" code clone.
5731 $parse_rcdata->(CDATA_CONTENT_MODEL);
5732 next B;
5733 } else {
5734 !!!cp ('t227.7');
5735 #
5736 }
5737 } elsif ($token->{tag_name} eq 'script') {
5738 if (not $open_tables->[-1]->[1]) { # tainted
5739 !!!cp ('t227.6');
5740 ## NOTE: This is a "as if in head" code clone.
5741 $script_start_tag->();
5742 next B;
5743 } else {
5744 !!!cp ('t227.5');
5745 #
5746 }
5747 } elsif ($token->{tag_name} eq 'input') {
5748 if (not $open_tables->[-1]->[1]) { # tainted
5749 if ($token->{attributes}->{type}) { ## TODO: case
5750 my $type = lc $token->{attributes}->{type}->{value};
5751 if ($type eq 'hidden') {
5752 !!!cp ('t227.3');
5753 !!!parse-error (type => 'in table',
5754 text => $token->{tag_name}, token => $token);
5755
5756 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5757
5758 ## TODO: form element pointer
5759
5760 pop @{$self->{open_elements}};
5761 ## NOTE(review): !!!ack is issued before !!!next-token, as in the <col> branch of the "in column group" mode; presumably it must act while the <input> token is still current - confirm against the macro definition.
5762 !!!ack ('t227.2.1');
5763 !!!next-token;
5764 next B;
5765 } else {
5766 !!!cp ('t227.2');
5767 #
5768 }
5769 } else {
5770 !!!cp ('t227.1');
5771 #
5772 }
5773 } else {
5774 !!!cp ('t227.4');
5775 #
5776 }
5777 } else {
5778 !!!cp ('t227');
5779 #
5780 }
5781
5782 !!!parse-error (type => 'in table', text => $token->{tag_name},
5783 token => $token);
5784
5785 $insert = $insert_to_foster;
5786 #
5787 } elsif ($token->{type} == END_TAG_TOKEN) {
5788 if ($token->{tag_name} eq 'tr' and
5789 $self->{insertion_mode} == IN_ROW_IM) {
5790 ## have an element in table scope
5791 my $i;
5792 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5793 my $node = $self->{open_elements}->[$_];
5794 if ($node->[1] & TABLE_ROW_EL) {
5795 !!!cp ('t228');
5796 $i = $_;
5797 last INSCOPE;
5798 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5799 !!!cp ('t229');
5800 last INSCOPE;
5801 }
5802 } # INSCOPE
5803 unless (defined $i) {
5804 !!!cp ('t230');
5805 !!!parse-error (type => 'unmatched end tag',
5806 text => $token->{tag_name}, token => $token);
5807 ## Ignore the token
5808 !!!nack ('t230.1');
5809 !!!next-token;
5810 next B;
5811 } else {
5812 !!!cp ('t232');
5813 }
5814
5815 ## Clear back to table row context
5816 while (not ($self->{open_elements}->[-1]->[1]
5817 & TABLE_ROW_SCOPING_EL)) {
5818 !!!cp ('t231');
5819 ## ISSUE: Can this state be reached?
5820 pop @{$self->{open_elements}};
5821 }
5822
5823 pop @{$self->{open_elements}}; # tr
5824 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5825 !!!next-token;
5826 !!!nack ('t231.1');
5827 next B;
5828 } elsif ($token->{tag_name} eq 'table') {
5829 if ($self->{insertion_mode} == IN_ROW_IM) {
5830 ## As if </tr>
5831 ## have an element in table scope
5832 my $i;
5833 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5834 my $node = $self->{open_elements}->[$_];
5835 if ($node->[1] & TABLE_ROW_EL) {
5836 !!!cp ('t233');
5837 $i = $_;
5838 last INSCOPE;
5839 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5840 !!!cp ('t234');
5841 last INSCOPE;
5842 }
5843 } # INSCOPE
5844 unless (defined $i) {
5845 !!!cp ('t235');
5846 ## NOTE: No |tr| was found in table scope, so the implied </tr> fails.  |text| carries the tag name of the ignored token; TODO: confirm whether 'tr' (the implied end tag) would be more appropriate.
5847 !!!parse-error (type => 'unmatched end tag',
5848 text => $token->{tag_name}, token => $token);
5849 ## Ignore the token
5850 !!!nack ('t236.1');
5851 !!!next-token;
5852 next B;
5853 }
5854
5855 ## Clear back to table row context
5856 while (not ($self->{open_elements}->[-1]->[1]
5857 & TABLE_ROW_SCOPING_EL)) {
5858 !!!cp ('t236');
5859 ## ISSUE: Can this state be reached?
5860 pop @{$self->{open_elements}};
5861 }
5862
5863 pop @{$self->{open_elements}}; # tr
5864 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5865 ## reprocess in the "in table body" insertion mode...
5866 }
5867
5868 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5869 ## have an element in table scope
5870 my $i;
5871 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5872 my $node = $self->{open_elements}->[$_];
5873 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5874 !!!cp ('t237');
5875 $i = $_;
5876 last INSCOPE;
5877 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5878 !!!cp ('t238');
5879 last INSCOPE;
5880 }
5881 } # INSCOPE
5882 unless (defined $i) {
5883 !!!cp ('t239');
5884 !!!parse-error (type => 'unmatched end tag',
5885 text => $token->{tag_name}, token => $token);
5886 ## Ignore the token
5887 !!!nack ('t239.1');
5888 !!!next-token;
5889 next B;
5890 }
5891
5892 ## Clear back to table body context
5893 while (not ($self->{open_elements}->[-1]->[1]
5894 & TABLE_ROWS_SCOPING_EL)) {
5895 !!!cp ('t240');
5896 pop @{$self->{open_elements}};
5897 }
5898
5899 ## As if <{current node}>
5900 ## have an element in table scope
5901 ## true by definition
5902
5903 ## Clear back to table body context
5904 ## nop by definition
5905
5906 pop @{$self->{open_elements}};
5907 $self->{insertion_mode} = IN_TABLE_IM;
5908 ## reprocess in the "in table" insertion mode...
5909 }
5910
5911 ## NOTE: </table> in the "in table" insertion mode.
5912 ## When you edit the code fragment below, please ensure that
5913 ## the code for <table> in the "in table" insertion mode
5914 ## is synced with it.
5915
5916 ## have a table element in table scope
5917 my $i;
5918 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5919 my $node = $self->{open_elements}->[$_];
5920 if ($node->[1] & TABLE_EL) {
5921 !!!cp ('t241');
5922 $i = $_;
5923 last INSCOPE;
5924 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5925 !!!cp ('t242');
5926 last INSCOPE;
5927 }
5928 } # INSCOPE
5929 unless (defined $i) {
5930 !!!cp ('t243');
5931 !!!parse-error (type => 'unmatched end tag',
5932 text => $token->{tag_name}, token => $token);
5933 ## Ignore the token
5934 !!!nack ('t243.1');
5935 !!!next-token;
5936 next B;
5937 }
5938
5939 splice @{$self->{open_elements}}, $i;
5940 pop @{$open_tables};
5941
5942 $self->_reset_insertion_mode;
5943
5944 !!!next-token;
5945 next B;
5946 } elsif ({
5947 tbody => 1, tfoot => 1, thead => 1,
5948 }->{$token->{tag_name}} and
5949 $self->{insertion_mode} & ROW_IMS) {
5950 if ($self->{insertion_mode} == IN_ROW_IM) {
5951 ## have an element in table scope
5952 my $i;
5953 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5954 my $node = $self->{open_elements}->[$_];
5955 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5956 !!!cp ('t247');
5957 $i = $_;
5958 last INSCOPE;
5959 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5960 !!!cp ('t248');
5961 last INSCOPE;
5962 }
5963 } # INSCOPE
5964 unless (defined $i) {
5965 !!!cp ('t249');
5966 !!!parse-error (type => 'unmatched end tag',
5967 text => $token->{tag_name}, token => $token);
5968 ## Ignore the token
5969 !!!nack ('t249.1');
5970 !!!next-token;
5971 next B;
5972 }
5973
5974 ## As if </tr>
5975 ## have an element in table scope
5976 my $i;
5977 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5978 my $node = $self->{open_elements}->[$_];
5979 if ($node->[1] & TABLE_ROW_EL) {
5980 !!!cp ('t250');
5981 $i = $_;
5982 last INSCOPE;
5983 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5984 !!!cp ('t251');
5985 last INSCOPE;
5986 }
5987 } # INSCOPE
5988 unless (defined $i) {
5989 !!!cp ('t252');
5990 !!!parse-error (type => 'unmatched end tag',
5991 text => 'tr', token => $token);
5992 ## Ignore the token
5993 !!!nack ('t252.1');
5994 !!!next-token;
5995 next B;
5996 }
5997
5998 ## Clear back to table row context
5999 while (not ($self->{open_elements}->[-1]->[1]
6000 & TABLE_ROW_SCOPING_EL)) {
6001 !!!cp ('t253');
6002 ## ISSUE: Can this case be reached?
6003 pop @{$self->{open_elements}};
6004 }
6005
6006 pop @{$self->{open_elements}}; # tr
6007 $self->{insertion_mode} = IN_TABLE_BODY_IM;
6008 ## reprocess in the "in table body" insertion mode...
6009 }
6010
6011 ## have an element in table scope
6012 my $i;
6013 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6014 my $node = $self->{open_elements}->[$_];
6015 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6016 !!!cp ('t254');
6017 $i = $_;
6018 last INSCOPE;
6019 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6020 !!!cp ('t255');
6021 last INSCOPE;
6022 }
6023 } # INSCOPE
6024 unless (defined $i) {
6025 !!!cp ('t256');
6026 !!!parse-error (type => 'unmatched end tag',
6027 text => $token->{tag_name}, token => $token);
6028 ## Ignore the token
6029 !!!nack ('t256.1');
6030 !!!next-token;
6031 next B;
6032 }
6033
6034 ## Clear back to table body context
6035 while (not ($self->{open_elements}->[-1]->[1]
6036 & TABLE_ROWS_SCOPING_EL)) {
6037 !!!cp ('t257');
6038 ## ISSUE: Can this case be reached?
6039 pop @{$self->{open_elements}};
6040 }
6041
6042 pop @{$self->{open_elements}};
6043 $self->{insertion_mode} = IN_TABLE_IM;
6044 !!!nack ('t257.1');
6045 !!!next-token;
6046 next B;
6047 } elsif ({
6048 body => 1, caption => 1, col => 1, colgroup => 1,
6049 html => 1, td => 1, th => 1,
6050 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
6051 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
6052 }->{$token->{tag_name}}) {
6053 !!!cp ('t258');
6054 !!!parse-error (type => 'unmatched end tag',
6055 text => $token->{tag_name}, token => $token);
6056 ## Ignore the token
6057 !!!nack ('t258.1');
6058 !!!next-token;
6059 next B;
6060 } else {
6061 !!!cp ('t259');
6062 !!!parse-error (type => 'in table:/',
6063 text => $token->{tag_name}, token => $token);
6064
6065 $insert = $insert_to_foster;
6066 #
6067 }
6068 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6069 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6070 @{$self->{open_elements}} == 1) { # redundant, maybe
6071 !!!parse-error (type => 'in body:#eof', token => $token);
6072 !!!cp ('t259.1');
6073 #
6074 } else {
6075 !!!cp ('t259.2');
6076 #
6077 }
6078
6079 ## Stop parsing
6080 last B;
6081 } else {
6082 die "$0: $token->{type}: Unknown token type";
6083 }
6084 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
6085 if ($token->{type} == CHARACTER_TOKEN) {
6086 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6087 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6088 unless (length $token->{data}) {
6089 !!!cp ('t260');
6090 !!!next-token;
6091 next B;
6092 }
6093 }
6094
6095 !!!cp ('t261');
6096 #
6097 } elsif ($token->{type} == START_TAG_TOKEN) {
6098 if ($token->{tag_name} eq 'col') {
6099 !!!cp ('t262');
6100 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6101 pop @{$self->{open_elements}};
6102 !!!ack ('t262.1');
6103 !!!next-token;
6104 next B;
6105 } else {
6106 !!!cp ('t263');
6107 #
6108 }
6109 } elsif ($token->{type} == END_TAG_TOKEN) {
6110 if ($token->{tag_name} eq 'colgroup') {
6111 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
6112 !!!cp ('t264');
6113 !!!parse-error (type => 'unmatched end tag',
6114 text => 'colgroup', token => $token);
6115 ## Ignore the token
6116 !!!next-token;
6117 next B;
6118 } else {
6119 !!!cp ('t265');
6120 pop @{$self->{open_elements}}; # colgroup
6121 $self->{insertion_mode} = IN_TABLE_IM;
6122 !!!next-token;
6123 next B;
6124 }
6125 } elsif ($token->{tag_name} eq 'col') {
6126 !!!cp ('t266');
6127 !!!parse-error (type => 'unmatched end tag',
6128 text => 'col', token => $token);
6129 ## Ignore the token
6130 !!!next-token;
6131 next B;
6132 } else {
6133 !!!cp ('t267');
6134 #
6135 }
6136 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6137 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
6138 @{$self->{open_elements}} == 1) { # redundant, maybe
6139 !!!cp ('t270.2');
6140 ## Stop parsing.
6141 last B;
6142 } else {
6143 ## NOTE: As if </colgroup>.
6144 !!!cp ('t270.1');
6145 pop @{$self->{open_elements}}; # colgroup
6146 $self->{insertion_mode} = IN_TABLE_IM;
6147 ## Reprocess.
6148 next B;
6149 }
6150 } else {
6151 die "$0: $token->{type}: Unknown token type";
6152 }
6153
6154 ## As if </colgroup>
6155 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
6156 !!!cp ('t269');
6157 ## TODO: Wrong error type?
6158 !!!parse-error (type => 'unmatched end tag',
6159 text => 'colgroup', token => $token);
6160 ## Ignore the token
6161 !!!nack ('t269.1');
6162 !!!next-token;
6163 next B;
6164 } else {
6165 !!!cp ('t270');
6166 pop @{$self->{open_elements}}; # colgroup
6167 $self->{insertion_mode} = IN_TABLE_IM;
6168 !!!ack-later;
6169 ## reprocess
6170 next B;
6171 }
6172 } elsif ($self->{insertion_mode} & SELECT_IMS) {
6173 if ($token->{type} == CHARACTER_TOKEN) {
6174 !!!cp ('t271');
6175 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
6176 !!!next-token;
6177 next B;
6178 } elsif ($token->{type} == START_TAG_TOKEN) {
6179 if ($token->{tag_name} eq 'option') {
6180 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6181 !!!cp ('t272');
6182 ## As if </option>
6183 pop @{$self->{open_elements}};
6184 } else {
6185 !!!cp ('t273');
6186 }
6187
6188 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6189 !!!nack ('t273.1');
6190 !!!next-token;
6191 next B;
6192 } elsif ($token->{tag_name} eq 'optgroup') {
6193 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6194 !!!cp ('t274');
6195 ## As if </option>
6196 pop @{$self->{open_elements}};
6197 } else {
6198 !!!cp ('t275');
6199 }
6200
6201 if ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
6202 !!!cp ('t276');
6203 ## As if </optgroup>
6204 pop @{$self->{open_elements}};
6205 } else {
6206 !!!cp ('t277');
6207 }
6208
6209 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6210 !!!nack ('t277.1');
6211 !!!next-token;
6212 next B;
6213 } elsif ({
6214 select => 1, input => 1, textarea => 1,
6215 }->{$token->{tag_name}} or
6216 ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
6217 {
6218 caption => 1, table => 1,
6219 tbody => 1, tfoot => 1, thead => 1,
6220 tr => 1, td => 1, th => 1,
6221 }->{$token->{tag_name}})) {
6222 ## TODO: The type below is not good - <select> is replaced by </select>
6223 !!!parse-error (type => 'not closed', text => 'select',
6224 token => $token);
6225 ## NOTE: As if the token were </select> (<select> case) or
6226 ## as if there were </select> (otherwise).
6227 ## have an element in table scope
6228 my $i;
6229 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6230 my $node = $self->{open_elements}->[$_];
6231 if ($node->[1] & SELECT_EL) {
6232 !!!cp ('t278');
6233 $i = $_;
6234 last INSCOPE;
6235 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6236 !!!cp ('t279');
6237 last INSCOPE;
6238 }
6239 } # INSCOPE
6240 unless (defined $i) {
6241 !!!cp ('t280');
6242 !!!parse-error (type => 'unmatched end tag',
6243 text => 'select', token => $token);
6244 ## Ignore the token
6245 !!!nack ('t280.1');
6246 !!!next-token;
6247 next B;
6248 }
6249
6250 !!!cp ('t281');
6251 splice @{$self->{open_elements}}, $i;
6252
6253 $self->_reset_insertion_mode;
6254
6255 if ($token->{tag_name} eq 'select') {
6256 !!!nack ('t281.2');
6257 !!!next-token;
6258 next B;
6259 } else {
6260 !!!cp ('t281.1');
6261 !!!ack-later;
6262 ## Reprocess the token.
6263 next B;
6264 }
6265 } else {
6266 !!!cp ('t282');
6267 !!!parse-error (type => 'in select',
6268 text => $token->{tag_name}, token => $token);
6269 ## Ignore the token
6270 !!!nack ('t282.1');
6271 !!!next-token;
6272 next B;
6273 }
6274 } elsif ($token->{type} == END_TAG_TOKEN) {
6275 if ($token->{tag_name} eq 'optgroup') {
6276 if ($self->{open_elements}->[-1]->[1] & OPTION_EL and
6277 $self->{open_elements}->[-2]->[1] & OPTGROUP_EL) {
6278 !!!cp ('t283');
6279 ## As if </option>
6280 splice @{$self->{open_elements}}, -2;
6281 } elsif ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
6282 !!!cp ('t284');
6283 pop @{$self->{open_elements}};
6284 } else {
6285 !!!cp ('t285');
6286 !!!parse-error (type => 'unmatched end tag',
6287 text => $token->{tag_name}, token => $token);
6288 ## Ignore the token
6289 }
6290 !!!nack ('t285.1');
6291 !!!next-token;
6292 next B;
6293 } elsif ($token->{tag_name} eq 'option') {
6294 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6295 !!!cp ('t286');
6296 pop @{$self->{open_elements}};
6297 } else {
6298 !!!cp ('t287');
6299 !!!parse-error (type => 'unmatched end tag',
6300 text => $token->{tag_name}, token => $token);
6301 ## Ignore the token
6302 }
6303 !!!nack ('t287.1');
6304 !!!next-token;
6305 next B;
6306 } elsif ($token->{tag_name} eq 'select') {
6307 ## have an element in table scope
6308 my $i;
6309 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6310 my $node = $self->{open_elements}->[$_];
6311 if ($node->[1] & SELECT_EL) {
6312 !!!cp ('t288');
6313 $i = $_;
6314 last INSCOPE;
6315 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6316 !!!cp ('t289');
6317 last INSCOPE;
6318 }
6319 } # INSCOPE
6320 unless (defined $i) {
6321 !!!cp ('t290');
6322 !!!parse-error (type => 'unmatched end tag',
6323 text => $token->{tag_name}, token => $token);
6324 ## Ignore the token
6325 !!!nack ('t290.1');
6326 !!!next-token;
6327 next B;
6328 }
6329
6330 !!!cp ('t291');
6331 splice @{$self->{open_elements}}, $i;
6332
6333 $self->_reset_insertion_mode;
6334
6335 !!!nack ('t291.1');
6336 !!!next-token;
6337 next B;
6338 } elsif ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
6339 {
6340 caption => 1, table => 1, tbody => 1,
6341 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
6342 }->{$token->{tag_name}}) {
6343 ## TODO: The following is wrong?
6344 !!!parse-error (type => 'unmatched end tag',
6345 text => $token->{tag_name}, token => $token);
6346
6347 ## have an element in table scope
6348 my $i;
6349 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6350 my $node = $self->{open_elements}->[$_];
6351 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6352 !!!cp ('t292');
6353 $i = $_;
6354 last INSCOPE;
6355 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6356 !!!cp ('t293');
6357 last INSCOPE;
6358 }
6359 } # INSCOPE
6360 unless (defined $i) {
6361 !!!cp ('t294');
6362 ## Ignore the token
6363 !!!nack ('t294.1');
6364 !!!next-token;
6365 next B;
6366 }
6367
6368 ## As if </select>
6369 ## have an element in table scope
6370 undef $i;
6371 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6372 my $node = $self->{open_elements}->[$_];
6373 if ($node->[1] & SELECT_EL) {
6374 !!!cp ('t295');
6375 $i = $_;
6376 last INSCOPE;
6377 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6378 ## ISSUE: Can this state be reached?
6379 !!!cp ('t296');
6380 last INSCOPE;
6381 }
6382 } # INSCOPE
6383 unless (defined $i) {
6384 !!!cp ('t297');
6385 ## TODO: The following error type is correct?
6386 !!!parse-error (type => 'unmatched end tag',
6387 text => 'select', token => $token);
6388 ## Ignore the </select> token
6389 !!!nack ('t297.1');
6390 !!!next-token; ## TODO: ok?
6391 next B;
6392 }
6393
6394 !!!cp ('t298');
6395 splice @{$self->{open_elements}}, $i;
6396
6397 $self->_reset_insertion_mode;
6398
6399 !!!ack-later;
6400 ## reprocess
6401 next B;
6402 } else {
6403 !!!cp ('t299');
6404 !!!parse-error (type => 'in select:/',
6405 text => $token->{tag_name}, token => $token);
6406 ## Ignore the token
6407 !!!nack ('t299.3');
6408 !!!next-token;
6409 next B;
6410 }
6411 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6412 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6413 @{$self->{open_elements}} == 1) { # redundant, maybe
6414 !!!cp ('t299.1');
6415 !!!parse-error (type => 'in body:#eof', token => $token);
6416 } else {
6417 !!!cp ('t299.2');
6418 }
6419
6420 ## Stop parsing.
6421 last B;
6422 } else {
6423 die "$0: $token->{type}: Unknown token type";
6424 }
6425 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
6426 if ($token->{type} == CHARACTER_TOKEN) {
6427 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6428 my $data = $1; # keep the stripped leading whitespace; do not rely on $1 after the call below
6429 ## As if in body: reconstruct the active formatting elements, then append the whitespace to the current node
6430 $reconstruct_active_formatting_elements->($insert_to_current);
6431
6432 $self->{open_elements}->[-1]->[0]->manakai_append_text ($data);
6433
6434 unless (length $token->{data}) { # the token consisted of whitespace only
6435 !!!cp ('t300');
6436 !!!next-token;
6437 next B;
6438 }
6439 }
6440
6441 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6442 !!!cp ('t301');
6443 !!!parse-error (type => 'after html:#text', token => $token);
6444
6445 ## Reprocess in the "after body" insertion mode.
6446 } else {
6447 !!!cp ('t302');
6448 }
6449
6450 ## "after body" insertion mode
6451 !!!parse-error (type => 'after body:#text', token => $token);
6452
6453 $self->{insertion_mode} = IN_BODY_IM;
6454 ## reprocess
6455 next B;
6456 } elsif ($token->{type} == START_TAG_TOKEN) {
6457 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6458 !!!cp ('t303');
6459 !!!parse-error (type => 'after html',
6460 text => $token->{tag_name}, token => $token);
6461
6462 ## Reprocess in the "after body" insertion mode.
6463 } else {
6464 !!!cp ('t304');
6465 }
6466
6467 ## "after body" insertion mode
6468 !!!parse-error (type => 'after body',
6469 text => $token->{tag_name}, token => $token);
6470
6471 $self->{insertion_mode} = IN_BODY_IM;
6472 !!!ack-later;
6473 ## reprocess
6474 next B;
6475 } elsif ($token->{type} == END_TAG_TOKEN) {
6476 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6477 !!!cp ('t305');
6478 !!!parse-error (type => 'after html:/',
6479 text => $token->{tag_name}, token => $token);
6480
6481 $self->{insertion_mode} = AFTER_BODY_IM;
6482 ## Reprocess in the "after body" insertion mode.
6483 } else {
6484 !!!cp ('t306');
6485 }
6486
6487 ## "after body" insertion mode
6488 if ($token->{tag_name} eq 'html') {
6489 if (defined $self->{inner_html_node}) {
6490 !!!cp ('t307');
6491 !!!parse-error (type => 'unmatched end tag',
6492 text => 'html', token => $token);
6493 ## Ignore the token
6494 !!!next-token;
6495 next B;
6496 } else {
6497 !!!cp ('t308');
6498 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
6499 !!!next-token;
6500 next B;
6501 }
6502 } else {
6503 !!!cp ('t309');
6504 !!!parse-error (type => 'after body:/',
6505 text => $token->{tag_name}, token => $token);
6506
6507 $self->{insertion_mode} = IN_BODY_IM;
6508 ## reprocess
6509 next B;
6510 }
6511 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6512 !!!cp ('t309.2');
6513 ## Stop parsing
6514 last B;
6515 } else {
6516 die "$0: $token->{type}: Unknown token type";
6517 }
6518 } elsif ($self->{insertion_mode} & FRAME_IMS) {
6519 if ($token->{type} == CHARACTER_TOKEN) {
6520 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6521 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6522
6523 unless (length $token->{data}) {
6524 !!!cp ('t310');
6525 !!!next-token;
6526 next B;
6527 }
6528 }
6529
6530 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
6531 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6532 !!!cp ('t311');
6533 !!!parse-error (type => 'in frameset:#text', token => $token);
6534 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6535 !!!cp ('t312');
6536 !!!parse-error (type => 'after frameset:#text', token => $token);
6537 } else { # "after after frameset"
6538 !!!cp ('t313');
6539 !!!parse-error (type => 'after html:#text', token => $token);
6540 }
6541
6542 ## Ignore the token.
6543 if (length $token->{data}) {
6544 !!!cp ('t314');
6545 ## reprocess the rest of characters
6546 } else {
6547 !!!cp ('t315');
6548 !!!next-token;
6549 }
6550 next B;
6551 }
6552
6553 die qq[$0: Character "$token->{data}"];
6554 } elsif ($token->{type} == START_TAG_TOKEN) {
6555 if ($token->{tag_name} eq 'frameset' and
6556 $self->{insertion_mode} == IN_FRAMESET_IM) {
6557 !!!cp ('t318');
6558 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6559 !!!nack ('t318.1');
6560 !!!next-token;
6561 next B;
6562 } elsif ($token->{tag_name} eq 'frame' and
6563 $self->{insertion_mode} == IN_FRAMESET_IM) {
6564 !!!cp ('t319');
6565 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6566 pop @{$self->{open_elements}};
6567 !!!ack ('t319.1');
6568 !!!next-token;
6569 next B;
6570 } elsif ($token->{tag_name} eq 'noframes') {
6571 !!!cp ('t320');
6572 ## NOTE: As if in head.
6573 $parse_rcdata->(CDATA_CONTENT_MODEL);
6574 next B;
6575
6576 ## NOTE: |<!DOCTYPE HTML><frameset></frameset></html><noframes></noframes>|
6577 ## has no parse error.
6578 } else {
6579 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6580 !!!cp ('t321');
6581 !!!parse-error (type => 'in frameset',
6582 text => $token->{tag_name}, token => $token);
6583 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6584 !!!cp ('t322');
6585 !!!parse-error (type => 'after frameset',
6586 text => $token->{tag_name}, token => $token);
6587 } else { # "after after frameset"
6588 !!!cp ('t322.2');
6589 !!!parse-error (type => 'after after frameset',
6590 text => $token->{tag_name}, token => $token);
6591 }
6592 ## Ignore the token
6593 !!!nack ('t322.1');
6594 !!!next-token;
6595 next B;
6596 }
6597 } elsif ($token->{type} == END_TAG_TOKEN) {
6598 if ($token->{tag_name} eq 'frameset' and
6599 $self->{insertion_mode} == IN_FRAMESET_IM) {
6600 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
6601 @{$self->{open_elements}} == 1) {
6602 !!!cp ('t325');
6603 !!!parse-error (type => 'unmatched end tag',
6604 text => $token->{tag_name}, token => $token);
6605 ## Ignore the token
6606 !!!next-token;
6607 } else {
6608 !!!cp ('t326');
6609 pop @{$self->{open_elements}};
6610 !!!next-token;
6611 }
6612
6613 if (not defined $self->{inner_html_node} and
6614 not ($self->{open_elements}->[-1]->[1] & FRAMESET_EL)) {
6615 !!!cp ('t327');
6616 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6617 } else {
6618 !!!cp ('t328');
6619 }
6620 next B;
6621 } elsif ($token->{tag_name} eq 'html' and
6622 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
6623 !!!cp ('t329');
6624 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
6625 !!!next-token;
6626 next B;
6627 } else {
6628 if ($self->{insertion_mode} == IN_FRAMESET_IM) { # "in frameset"
6629 !!!cp ('t330');
6630 !!!parse-error (type => 'in frameset:/',
6631 text => $token->{tag_name}, token => $token);
6632 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) { # "after frameset"
6633 !!!cp ('t330.1');
6634 !!!parse-error (type => 'after frameset:/',
6635 text => $token->{tag_name}, token => $token);
6636 } else { # "after after frameset"
6637 !!!cp ('t331');
6638 !!!parse-error (type => 'after after frameset:/',
6639 text => $token->{tag_name}, token => $token);
6640 }
6641 ## Ignore the token
6642 !!!next-token;
6643 next B;
6644 }
6645 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6646 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6647 @{$self->{open_elements}} == 1) { # redundant, maybe
6648 !!!cp ('t331.1');
6649 !!!parse-error (type => 'in body:#eof', token => $token);
6650 } else {
6651 !!!cp ('t331.2');
6652 }
6653
6654 ## Stop parsing
6655 last B;
6656 } else {
6657 die "$0: $token->{type}: Unknown token type";
6658 }
6659
6660 ## ISSUE: An issue in spec here
6661 } else {
6662 die "$0: $self->{insertion_mode}: Unknown insertion mode";
6663 }
6664
6665 ## "in body" insertion mode
6666 if ($token->{type} == START_TAG_TOKEN) {
6667 if ($token->{tag_name} eq 'script') {
6668 !!!cp ('t332');
6669 ## NOTE: This is an "as if in head" code clone
6670 $script_start_tag->();
6671 next B;
6672 } elsif ($token->{tag_name} eq 'style') {
6673 !!!cp ('t333');
6674 ## NOTE: This is an "as if in head" code clone
6675 $parse_rcdata->(CDATA_CONTENT_MODEL);
6676 next B;
6677 } elsif ({
6678 base => 1, link => 1,
6679 }->{$token->{tag_name}}) {
6680 !!!cp ('t334');
6681 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6682 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6683 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6684 !!!ack ('t334.1');
6685 !!!next-token;
6686 next B;
6687 } elsif ($token->{tag_name} eq 'meta') {
6688 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6689 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6690 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6691
6692 unless ($self->{confident}) {
6693 if ($token->{attributes}->{charset}) {
6694 !!!cp ('t335');
6695 ## NOTE: Whether the encoding is supported or not is handled
6696 ## in the {change_encoding} callback.
6697 $self->{change_encoding}
6698 ->($self, $token->{attributes}->{charset}->{value}, $token);
6699
6700 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6701 ->set_user_data (manakai_has_reference =>
6702 $token->{attributes}->{charset}
6703 ->{has_reference});
6704 } elsif ($token->{attributes}->{content}) {
6705 if ($token->{attributes}->{content}->{value}
6706 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
6707 [\x09-\x0D\x20]*=
6708 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
6709 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
6710 !!!cp ('t336');
6711 ## NOTE: Whether the encoding is supported or not is handled
6712 ## in the {change_encoding} callback.
6713 $self->{change_encoding}
6714 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
6715 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6716 ->set_user_data (manakai_has_reference =>
6717 $token->{attributes}->{content}
6718 ->{has_reference});
6719 }
6720 }
6721 } else {
6722 if ($token->{attributes}->{charset}) {
6723 !!!cp ('t337');
6724 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6725 ->set_user_data (manakai_has_reference =>
6726 $token->{attributes}->{charset}
6727 ->{has_reference});
6728 }
6729 if ($token->{attributes}->{content}) {
6730 !!!cp ('t338');
6731 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6732 ->set_user_data (manakai_has_reference =>
6733 $token->{attributes}->{content}
6734 ->{has_reference});
6735 }
6736 }
6737
6738 !!!ack ('t338.1');
6739 !!!next-token;
6740 next B;
6741 } elsif ($token->{tag_name} eq 'title') {
6742 !!!cp ('t341');
6743 ## NOTE: This is an "as if in head" code clone
6744 $parse_rcdata->(RCDATA_CONTENT_MODEL);
6745 next B;
6746 } elsif ($token->{tag_name} eq 'body') {
6747 !!!parse-error (type => 'in body', text => 'body', token => $token);
6748
6749 if (@{$self->{open_elements}} == 1 or
6750 not ($self->{open_elements}->[1]->[1] & BODY_EL)) {
6751 !!!cp ('t342');
6752 ## Ignore the token
6753 } else {
6754 my $body_el = $self->{open_elements}->[1]->[0];
6755 for my $attr_name (keys %{$token->{attributes}}) {
6756 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
6757 !!!cp ('t343');
6758 $body_el->set_attribute_ns
6759 (undef, [undef, $attr_name],
6760 $token->{attributes}->{$attr_name}->{value});
6761 }
6762 }
6763 }
6764 !!!nack ('t343.1');
6765 !!!next-token;
6766 next B;
6767 } elsif ({
6768 address => 1, blockquote => 1, center => 1, dir => 1,
6769 div => 1, dl => 1, fieldset => 1,
6770 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6771 menu => 1, ol => 1, p => 1, ul => 1,
6772 pre => 1, listing => 1,
6773 form => 1,
6774 table => 1,
6775 hr => 1,
6776 }->{$token->{tag_name}}) {
6777 if ($token->{tag_name} eq 'form' and defined $self->{form_element}) {
6778 !!!cp ('t350');
6779 !!!parse-error (type => 'in form:form', token => $token);
6780 ## Ignore the token
6781 !!!nack ('t350.1');
6782 !!!next-token;
6783 next B;
6784 }
6785
6786 ## has a p element in scope: walk the open element stack from its last entry backwards
6787 INSCOPE: for (reverse @{$self->{open_elements}}) {
6788 if ($_->[1] & P_EL) {
6789 !!!cp ('t344');
6790 !!!back-token; # <x> (whichever start tag selected this branch, not only <form>)
6791 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6792 line => $token->{line}, column => $token->{column}};
6793 next B; # process the synthesized </p> first
6794 } elsif ($_->[1] & SCOPING_EL) { # scoping element reached before any p: stop searching
6795 !!!cp ('t345');
6796 last INSCOPE;
6797 }
6798 } # INSCOPE
6799
6800 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6801 if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') {
6802 !!!nack ('t346.1');
6803 !!!next-token;
6804 if ($token->{type} == CHARACTER_TOKEN) {
6805 $token->{data} =~ s/^\x0A//;
6806 unless (length $token->{data}) {
6807 !!!cp ('t346');
6808 !!!next-token;
6809 } else {
6810 !!!cp ('t349');
6811 }
6812 } else {
6813 !!!cp ('t348');
6814 }
6815 } elsif ($token->{tag_name} eq 'form') {
6816 !!!cp ('t347.1');
6817 $self->{form_element} = $self->{open_elements}->[-1]->[0];
6818
6819 !!!nack ('t347.2');
6820 !!!next-token;
6821 } elsif ($token->{tag_name} eq 'table') {
6822 !!!cp ('t382');
6823 push @{$open_tables}, [$self->{open_elements}->[-1]->[0]];
6824
6825 $self->{insertion_mode} = IN_TABLE_IM;
6826
6827 !!!nack ('t382.1');
6828 !!!next-token;
6829 } elsif ($token->{tag_name} eq 'hr') {
6830 !!!cp ('t386');
6831 pop @{$self->{open_elements}};
6832
6833 !!!nack ('t386.1');
6834 !!!next-token;
6835 } else {
6836 !!!nack ('t347.1');
6837 !!!next-token;
6838 }
6839 next B;
6840 } elsif ({li => 1, dt => 1, dd => 1}->{$token->{tag_name}}) {
6841 ## has a p element in scope
6842 INSCOPE: for (reverse @{$self->{open_elements}}) {
6843 if ($_->[1] & P_EL) {
6844 !!!cp ('t353');
6845 !!!back-token; # <x>
6846 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6847 line => $token->{line}, column => $token->{column}};
6848 next B;
6849 } elsif ($_->[1] & SCOPING_EL) {
6850 !!!cp ('t354');
6851 last INSCOPE;
6852 }
6853 } # INSCOPE
6854
6855 ## Step 1: start at the last entry of the open element stack ($i is an index counted from the end)
6856 my $i = -1;
6857 my $node = $self->{open_elements}->[$i];
6858 my $li_or_dtdd = {li => {li => 1},
6859 dt => {dt => 1, dd => 1},
6860 dd => {dt => 1, dd => 1}}->{$token->{tag_name}}; # local names that the current start tag closes
6861 LI: {
6862 ## Step 2: if $node is one of those elements, cut it and every later entry off the stack
6863 if ($li_or_dtdd->{$node->[0]->manakai_local_name}) {
6864 if ($i != -1) { # other entries sit after $node; the error names the last entry of the stack
6865 !!!cp ('t355');
6866 !!!parse-error (type => 'not closed',
6867 text => $self->{open_elements}->[-1]->[0]
6868 ->manakai_local_name,
6869 token => $token);
6870 } else {
6871 !!!cp ('t356');
6872 }
6873 splice @{$self->{open_elements}}, $i;
6874 last LI;
6875 } else {
6876 !!!cp ('t357');
6877 }
6878
6879 ## Step 3: stop at a non-formatting special or scoping element, unless it is address or div
6880 if (not ($node->[1] & FORMATTING_EL) and
6881 #not $phrasing_category->{$node->[1]} and
6882 ($node->[1] & SPECIAL_EL or
6883 $node->[1] & SCOPING_EL) and
6884 not ($node->[1] & ADDRESS_EL) and
6885 not ($node->[1] & DIV_EL)) {
6886 !!!cp ('t358');
6887 last LI;
6888 }
6889
6890 !!!cp ('t359');
6891 ## Step 4: otherwise step back to the previous entry of the stack and repeat from step 2
6892 $i--;
6893 $node = $self->{open_elements}->[$i];
6894 redo LI;
6895 } # LI
6896
6897 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6898 !!!nack ('t359.1');
6899 !!!next-token;
6900 next B;
6901 } elsif ($token->{tag_name} eq 'plaintext') {
6902 ## has a p element in scope
6903 INSCOPE: for (reverse @{$self->{open_elements}}) {
6904 if ($_->[1] & P_EL) {
6905 !!!cp ('t367');
6906 !!!back-token; # <plaintext>
6907 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6908 line => $token->{line}, column => $token->{column}};
6909 next B;
6910 } elsif ($_->[1] & SCOPING_EL) {
6911 !!!cp ('t368');
6912 last INSCOPE;
6913 }
6914 } # INSCOPE
6915
6916 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6917
6918 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
6919
6920 !!!nack ('t368.1');
6921 !!!next-token;
6922 next B;
6923 } elsif ($token->{tag_name} eq 'a') {
6924 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
6925 my $node = $active_formatting_elements->[$i];
6926 if ($node->[1] & A_EL) {
6927 !!!cp ('t371');
6928 !!!parse-error (type => 'in a:a', token => $token);
6929
6930 !!!back-token; # <a>
6931 $token = {type => END_TAG_TOKEN, tag_name => 'a',
6932 line => $token->{line}, column => $token->{column}};
6933 $formatting_end_tag->($token);
6934
6935 AFE2: for (reverse 0..$#$active_formatting_elements) {
6936 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
6937 !!!cp ('t372');
6938 splice @$active_formatting_elements, $_, 1;
6939 last AFE2;
6940 }
6941 } # AFE2
6942 OE: for (reverse 0..$#{$self->{open_elements}}) {
6943 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
6944 !!!cp ('t373');
6945 splice @{$self->{open_elements}}, $_, 1;
6946 last OE;
6947 }
6948 } # OE
6949 last AFE;
6950 } elsif ($node->[0] eq '#marker') {
6951 !!!cp ('t374');
6952 last AFE;
6953 }
6954 } # AFE
6955
6956 $reconstruct_active_formatting_elements->($insert_to_current);
6957
6958 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6959 push @$active_formatting_elements, $self->{open_elements}->[-1];
6960
6961 !!!nack ('t374.1');
6962 !!!next-token;
6963 next B;
6964 } elsif ($token->{tag_name} eq 'nobr') {
6965 $reconstruct_active_formatting_elements->($insert_to_current);
6966
6967 ## has a |nobr| element in scope
6968 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6969 my $node = $self->{open_elements}->[$_];
6970 if ($node->[1] & NOBR_EL) {
6971 !!!cp ('t376');
6972 !!!parse-error (type => 'in nobr:nobr', token => $token);
6973 !!!back-token; # <nobr>
6974 $token = {type => END_TAG_TOKEN, tag_name => 'nobr',
6975 line => $token->{line}, column => $token->{column}};
6976 next B;
6977 } elsif ($node->[1] & SCOPING_EL) {
6978 !!!cp ('t377');
6979 last INSCOPE;
6980 }
6981 } # INSCOPE
6982
6983 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6984 push @$active_formatting_elements, $self->{open_elements}->[-1];
6985
6986 !!!nack ('t377.1');
6987 !!!next-token;
6988 next B;
6989 } elsif ($token->{tag_name} eq 'button') {
6990 ## has a button element in scope
6991 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6992 my $node = $self->{open_elements}->[$_];
6993 if ($node->[1] & BUTTON_EL) {
6994 !!!cp ('t378');
6995 !!!parse-error (type => 'in button:button', token => $token);
6996 !!!back-token; # <button>
6997 $token = {type => END_TAG_TOKEN, tag_name => 'button',
6998 line => $token->{line}, column => $token->{column}};
6999 next B;
7000 } elsif ($node->[1] & SCOPING_EL) {
7001 !!!cp ('t379');
7002 last INSCOPE;
7003 }
7004 } # INSCOPE
7005
7006 $reconstruct_active_formatting_elements->($insert_to_current);
7007
7008 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7009
7010 ## TODO: associate with $self->{form_element} if defined
7011
7012 push @$active_formatting_elements, ['#marker', ''];
7013
7014 !!!nack ('t379.1');
7015 !!!next-token;
7016 next B;
7017 } elsif ({
7018 xmp => 1,
7019 iframe => 1,
7020 noembed => 1,
7021 noframes => 1, ## NOTE: This is an "as if in head" code clone.
7022 noscript => 0, ## TODO: 1 if scripting is enabled
7023 }->{$token->{tag_name}}) {
7024 if ($token->{tag_name} eq 'xmp') {
7025 !!!cp ('t381');
7026 $reconstruct_active_formatting_elements->($insert_to_current);
7027 } else {
7028 !!!cp ('t399');
7029 }
7030 ## NOTE: There is an "as if in body" code clone.
7031 $parse_rcdata->(CDATA_CONTENT_MODEL);
7032 next B;
7033 } elsif ($token->{tag_name} eq 'isindex') {
7034 !!!parse-error (type => 'isindex', token => $token);
7035
7036 if (defined $self->{form_element}) {
7037 !!!cp ('t389');
7038 ## Ignore the token
7039 !!!nack ('t389'); ## NOTE: Not acknowledged.
7040 !!!next-token;
7041 next B;
7042 } else {
7043 !!!ack ('t391.1');
7044
7045 my $at = $token->{attributes};
7046 my $form_attrs;
7047 $form_attrs->{action} = $at->{action} if $at->{action};
7048 my $prompt_attr = $at->{prompt};
7049 $at->{name} = {name => 'name', value => 'isindex'};
7050 delete $at->{action};
7051 delete $at->{prompt};
7052 my @tokens = (
7053 {type => START_TAG_TOKEN, tag_name => 'form',
7054 attributes => $form_attrs,
7055 line => $token->{line}, column => $token->{column}},
7056 {type => START_TAG_TOKEN, tag_name => 'hr',
7057 line => $token->{line}, column => $token->{column}},
7058 {type => START_TAG_TOKEN, tag_name => 'p',
7059 line => $token->{line}, column => $token->{column}},
7060 {type => START_TAG_TOKEN, tag_name => 'label',
7061 line => $token->{line}, column => $token->{column}},
7062 );
7063 if ($prompt_attr) {
7064 !!!cp ('t390');
7065 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value},
7066 #line => $token->{line}, column => $token->{column},
7067 };
7068 } else {
7069 !!!cp ('t391');
7070 push @tokens, {type => CHARACTER_TOKEN,
7071 data => 'This is a searchable index. Insert your search keywords here: ',
7072 #line => $token->{line}, column => $token->{column},
7073 }; # SHOULD
7074 ## TODO: make this configurable
7075 }
7076 push @tokens,
7077 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at,
7078 line => $token->{line}, column => $token->{column}},
7079 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
7080 {type => END_TAG_TOKEN, tag_name => 'label',
7081 line => $token->{line}, column => $token->{column}},
7082 {type => END_TAG_TOKEN, tag_name => 'p',
7083 line => $token->{line}, column => $token->{column}},
7084 {type => START_TAG_TOKEN, tag_name => 'hr',
7085 line => $token->{line}, column => $token->{column}},
7086 {type => END_TAG_TOKEN, tag_name => 'form',
7087 line => $token->{line}, column => $token->{column}};
7088 !!!back-token (@tokens);
7089 !!!next-token;
7090 next B;
7091 }
7092 } elsif ($token->{tag_name} eq 'textarea') {
7093 my $tag_name = $token->{tag_name};
7094 my $el;
7095 !!!create-element ($el, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
7096
7097 ## TODO: $self->{form_element} if defined
7098 $self->{content_model} = RCDATA_CONTENT_MODEL;
7099 delete $self->{escape}; # MUST
7100
7101 $insert->($el);
7102
7103 my $text = '';
7104 !!!nack ('t392.1');
7105 !!!next-token;
7106 if ($token->{type} == CHARACTER_TOKEN) {
7107 $token->{data} =~ s/^\x0A//;
7108 unless (length $token->{data}) {
7109 !!!cp ('t392');
7110 !!!next-token;
7111 } else {
7112 !!!cp ('t393');
7113 }
7114 } else {
7115 !!!cp ('t394');
7116 }
7117 while ($token->{type} == CHARACTER_TOKEN) {
7118 !!!cp ('t395');
7119 $text .= $token->{data};
7120 !!!next-token;
7121 }
7122 if (length $text) {
7123 !!!cp ('t396');
7124 $el->manakai_append_text ($text);
7125 }
7126
7127 $self->{content_model} = PCDATA_CONTENT_MODEL;
7128
7129 if ($token->{type} == END_TAG_TOKEN and
7130 $token->{tag_name} eq $tag_name) {
7131 !!!cp ('t397');
7132 ## Ignore the token
7133 } else {
7134 !!!cp ('t398');
7135 !!!parse-error (type => 'in RCDATA:#eof', token => $token);
7136 }
7137 !!!next-token;
7138 next B;
7139 } elsif ($token->{tag_name} eq 'rt' or
7140 $token->{tag_name} eq 'rp') {
7141 ## has a |ruby| element in scope
7142 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7143 my $node = $self->{open_elements}->[$_];
7144 if ($node->[1] & RUBY_EL) {
7145 !!!cp ('t398.1');
7146 ## generate implied end tags
7147 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7148 !!!cp ('t398.2');
7149 pop @{$self->{open_elements}};
7150 }
7151 unless ($self->{open_elements}->[-1]->[1] & RUBY_EL) {
7152 !!!cp ('t398.3');
7153 !!!parse-error (type => 'not closed',
7154 text => $self->{open_elements}->[-1]->[0]
7155 ->manakai_local_name,
7156 token => $token);
7157 pop @{$self->{open_elements}}
7158 while not $self->{open_elements}->[-1]->[1] & RUBY_EL;
7159 }
7160 last INSCOPE;
7161 } elsif ($node->[1] & SCOPING_EL) {
7162 !!!cp ('t398.4');
7163 last INSCOPE;
7164 }
7165 } # INSCOPE
7166
7167 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7168
7169 !!!nack ('t398.5');
7170 !!!next-token;
7171 redo B;
7172 } elsif ($token->{tag_name} eq 'math' or
7173 $token->{tag_name} eq 'svg') {
7174 $reconstruct_active_formatting_elements->($insert_to_current);
7175
7176 ## "Adjust MathML attributes" ('math' only) - done in insert-element-f
7177
7178 ## "adjust SVG attributes" ('svg' only) - done in insert-element-f
7179
7180 ## "adjust foreign attributes" - done in insert-element-f
7181
7182 !!!insert-element-f ($token->{tag_name} eq 'math' ? $MML_NS : $SVG_NS, $token->{tag_name}, $token->{attributes}, $token);
7183
7184 if ($self->{self_closing}) {
7185 pop @{$self->{open_elements}};
7186 !!!ack ('t398.1');
7187 } else {
7188 !!!cp ('t398.2');
7189 $self->{insertion_mode} |= IN_FOREIGN_CONTENT_IM;
7190 ## NOTE: |<body><math><mi><svg>| -> "in foreign content" insertion
7191 ## mode, "in body" (not "in foreign content") secondary insertion
7192 ## mode, maybe.
7193 }
7194
7195 !!!next-token;
7196 next B;
7197 } elsif ({
7198 caption => 1, col => 1, colgroup => 1, frame => 1,
7199 frameset => 1, head => 1, option => 1, optgroup => 1,
7200 tbody => 1, td => 1, tfoot => 1, th => 1,
7201 thead => 1, tr => 1,
7202 }->{$token->{tag_name}}) {
7203 !!!cp ('t401');
7204 !!!parse-error (type => 'in body',
7205 text => $token->{tag_name}, token => $token);
7206 ## Ignore the token
7207 !!!nack ('t401.1'); ## NOTE: |<col/>| or |<frame/>| here is an error.
7208 !!!next-token;
7209 next B;
7210
7211 ## ISSUE: An issue on HTML5 new elements in the spec.
7212 } else {
7213 if ($token->{tag_name} eq 'image') {
7214 !!!cp ('t384');
7215 !!!parse-error (type => 'image', token => $token);
7216 $token->{tag_name} = 'img';
7217 } else {
7218 !!!cp ('t385');
7219 }
7220
7221 ## NOTE: There is an "as if <br>" code clone.
7222 $reconstruct_active_formatting_elements->($insert_to_current);
7223
7224 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7225
7226 if ({
7227 applet => 1, marquee => 1, object => 1,
7228 }->{$token->{tag_name}}) {
7229 !!!cp ('t380');
7230 push @$active_formatting_elements, ['#marker', ''];
7231 !!!nack ('t380.1');
7232 } elsif ({
7233 b => 1, big => 1, em => 1, font => 1, i => 1,
7234 s => 1, small => 1, strile => 1,
7235 strong => 1, tt => 1, u => 1,
7236 }->{$token->{tag_name}}) {
7237 !!!cp ('t375');
7238 push @$active_formatting_elements, $self->{open_elements}->[-1];
7239 !!!nack ('t375.1');
7240 } elsif ($token->{tag_name} eq 'input') {
7241 !!!cp ('t388');
7242 ## TODO: associate with $self->{form_element} if defined
7243 pop @{$self->{open_elements}};
7244 !!!ack ('t388.2');
7245 } elsif ({
7246 area => 1, basefont => 1, bgsound => 1, br => 1,
7247 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
7248 #image => 1,
7249 }->{$token->{tag_name}}) {
7250 !!!cp ('t388.1');
7251 pop @{$self->{open_elements}};
7252 !!!ack ('t388.3');
7253 } elsif ($token->{tag_name} eq 'select') {
7254 ## TODO: associate with $self->{form_element} if defined
7255
7256 if ($self->{insertion_mode} & TABLE_IMS or
7257 $self->{insertion_mode} & BODY_TABLE_IMS or
7258 $self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
7259 !!!cp ('t400.1');
7260 $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM;
7261 } else {
7262 !!!cp ('t400.2');
7263 $self->{insertion_mode} = IN_SELECT_IM;
7264 }
7265 !!!nack ('t400.3');
7266 } else {
7267 !!!nack ('t402');
7268 }
7269
7270 !!!next-token;
7271 next B;
7272 }
7273 } elsif ($token->{type} == END_TAG_TOKEN) {
7274 if ($token->{tag_name} eq 'body') {
7275 ## has a |body| element in scope
7276 my $i;
7277 INSCOPE: {
7278 for (reverse @{$self->{open_elements}}) {
7279 if ($_->[1] & BODY_EL) {
7280 !!!cp ('t405');
7281 $i = $_;
7282 last INSCOPE;
7283 } elsif ($_->[1] & SCOPING_EL) {
7284 !!!cp ('t405.1');
7285 last;
7286 }
7287 }
7288
7289 !!!parse-error (type => 'start tag not allowed',
7290 text => $token->{tag_name}, token => $token);
7291 ## NOTE: Ignore the token.
7292 !!!next-token;
7293 next B;
7294 } # INSCOPE
7295
7296 for (@{$self->{open_elements}}) {
7297 unless ($_->[1] & ALL_END_TAG_OPTIONAL_EL) {
7298 !!!cp ('t403');
7299 !!!parse-error (type => 'not closed',
7300 text => $_->[0]->manakai_local_name,
7301 token => $token);
7302 last;
7303 } else {
7304 !!!cp ('t404');
7305 }
7306 }
7307
7308 $self->{insertion_mode} = AFTER_BODY_IM;
7309 !!!next-token;
7310 next B;
7311 } elsif ($token->{tag_name} eq 'html') {
7312 ## TODO: Update this code. It seems that the code below is not
7313 ## up-to-date, though it has same effect as speced.
7314 if (@{$self->{open_elements}} > 1 and
7315 $self->{open_elements}->[1]->[1] & BODY_EL) {
7316 ## ISSUE: There is an issue in the spec.
7317 unless ($self->{open_elements}->[-1]->[1] & BODY_EL) {
7318 !!!cp ('t406');
7319 !!!parse-error (type => 'not closed',
7320 text => $self->{open_elements}->[1]->[0]
7321 ->manakai_local_name,
7322 token => $token);
7323 } else {
7324 !!!cp ('t407');
7325 }
7326 $self->{insertion_mode} = AFTER_BODY_IM;
7327 ## reprocess
7328 next B;
7329 } else {
7330 !!!cp ('t408');
7331 !!!parse-error (type => 'unmatched end tag',
7332 text => $token->{tag_name}, token => $token);
7333 ## Ignore the token
7334 !!!next-token;
7335 next B;
7336 }
7337 } elsif ({
7338 address => 1, blockquote => 1, center => 1, dir => 1,
7339 div => 1, dl => 1, fieldset => 1, listing => 1,
7340 menu => 1, ol => 1, pre => 1, ul => 1,
7341 dd => 1, dt => 1, li => 1,
7342 applet => 1, button => 1, marquee => 1, object => 1,
7343 }->{$token->{tag_name}}) {
7344 ## has an element in scope
7345 my $i;
7346 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7347 my $node = $self->{open_elements}->[$_];
7348 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7349 !!!cp ('t410');
7350 $i = $_;
7351 last INSCOPE;
7352 } elsif ($node->[1] & SCOPING_EL) {
7353 !!!cp ('t411');
7354 last INSCOPE;
7355 }
7356 } # INSCOPE
7357
7358 unless (defined $i) { # has an element in scope
7359 !!!cp ('t413');
7360 !!!parse-error (type => 'unmatched end tag',
7361 text => $token->{tag_name}, token => $token);
7362 ## NOTE: Ignore the token.
7363 } else {
7364 ## Step 1. generate implied end tags
7365 while ({
7366 ## END_TAG_OPTIONAL_EL
7367 dd => ($token->{tag_name} ne 'dd'),
7368 dt => ($token->{tag_name} ne 'dt'),
7369 li => ($token->{tag_name} ne 'li'),
7370 p => 1,
7371 rt => 1,
7372 rp => 1,
7373 }->{$self->{open_elements}->[-1]->[0]->manakai_local_name}) {
7374 !!!cp ('t409');
7375 pop @{$self->{open_elements}};
7376 }
7377
7378 ## Step 2.
7379 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7380 ne $token->{tag_name}) {
7381 !!!cp ('t412');
7382 !!!parse-error (type => 'not closed',
7383 text => $self->{open_elements}->[-1]->[0]
7384 ->manakai_local_name,
7385 token => $token);
7386 } else {
7387 !!!cp ('t414');
7388 }
7389
7390 ## Step 3.
7391 splice @{$self->{open_elements}}, $i;
7392
7393 ## Step 4.
7394 $clear_up_to_marker->()
7395 if {
7396 applet => 1, button => 1, marquee => 1, object => 1,
7397 }->{$token->{tag_name}};
7398 }
7399 !!!next-token;
7400 next B;
7401 } elsif ($token->{tag_name} eq 'form') {
7402 undef $self->{form_element};
7403
7404 ## has an element in scope
7405 my $i;
7406 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7407 my $node = $self->{open_elements}->[$_];
7408 if ($node->[1] & FORM_EL) {
7409 !!!cp ('t418');
7410 $i = $_;
7411 last INSCOPE;
7412 } elsif ($node->[1] & SCOPING_EL) {
7413 !!!cp ('t419');
7414 last INSCOPE;
7415 }
7416 } # INSCOPE
7417
7418 unless (defined $i) { # has an element in scope
7419 !!!cp ('t421');
7420 !!!parse-error (type => 'unmatched end tag',
7421 text => $token->{tag_name}, token => $token);
7422 ## NOTE: Ignore the token.
7423 } else {
7424 ## Step 1. generate implied end tags
7425 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7426 !!!cp ('t417');
7427 pop @{$self->{open_elements}};
7428 }
7429
7430 ## Step 2.
7431 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7432 ne $token->{tag_name}) {
7433 !!!cp ('t417.1');
7434 !!!parse-error (type => 'not closed',
7435 text => $self->{open_elements}->[-1]->[0]
7436 ->manakai_local_name,
7437 token => $token);
7438 } else {
7439 !!!cp ('t420');
7440 }
7441
7442 ## Step 3.
7443 splice @{$self->{open_elements}}, $i;
7444 }
7445
7446 !!!next-token;
7447 next B;
7448 } elsif ({
7449 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
7450 }->{$token->{tag_name}}) {
7451 ## has an element in scope
7452 my $i;
7453 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7454 my $node = $self->{open_elements}->[$_];
7455 if ($node->[1] & HEADING_EL) {
7456 !!!cp ('t423');
7457 $i = $_;
7458 last INSCOPE;
7459 } elsif ($node->[1] & SCOPING_EL) {
7460 !!!cp ('t424');
7461 last INSCOPE;
7462 }
7463 } # INSCOPE
7464
7465 unless (defined $i) { # has an element in scope
7466 !!!cp ('t425.1');
7467 !!!parse-error (type => 'unmatched end tag',
7468 text => $token->{tag_name}, token => $token);
7469 ## NOTE: Ignore the token.
7470 } else {
7471 ## Step 1. generate implied end tags
7472 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7473 !!!cp ('t422');
7474 pop @{$self->{open_elements}};
7475 }
7476
7477 ## Step 2.
7478 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7479 ne $token->{tag_name}) {
7480 !!!cp ('t425');
7481 !!!parse-error (type => 'unmatched end tag',
7482 text => $token->{tag_name}, token => $token);
7483 } else {
7484 !!!cp ('t426');
7485 }
7486
7487 ## Step 3.
7488 splice @{$self->{open_elements}}, $i;
7489 }
7490
7491 !!!next-token;
7492 next B;
7493 } elsif ($token->{tag_name} eq 'p') {
7494 ## has an element in scope
7495 my $i;
7496 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7497 my $node = $self->{open_elements}->[$_];
7498 if ($node->[1] & P_EL) {
7499 !!!cp ('t410.1');
7500 $i = $_;
7501 last INSCOPE;
7502 } elsif ($node->[1] & SCOPING_EL) {
7503 !!!cp ('t411.1');
7504 last INSCOPE;
7505 }
7506 } # INSCOPE
7507
7508 if (defined $i) {
7509 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7510 ne $token->{tag_name}) {
7511 !!!cp ('t412.1');
7512 !!!parse-error (type => 'not closed',
7513 text => $self->{open_elements}->[-1]->[0]
7514 ->manakai_local_name,
7515 token => $token);
7516 } else {
7517 !!!cp ('t414.1');
7518 }
7519
7520 splice @{$self->{open_elements}}, $i;
7521 } else {
7522 !!!cp ('t413.1');
7523 !!!parse-error (type => 'unmatched end tag',
7524 text => $token->{tag_name}, token => $token);
7525
7526 !!!cp ('t415.1');
7527 ## As if <p>, then reprocess the current token
7528 my $el;
7529 !!!create-element ($el, $HTML_NS, 'p',, $token);
7530 $insert->($el);
7531 ## NOTE: Not inserted into |$self->{open_elements}|.
7532 }
7533
7534 !!!next-token;
7535 next B;
7536 } elsif ({
7537 a => 1,
7538 b => 1, big => 1, em => 1, font => 1, i => 1,
7539 nobr => 1, s => 1, small => 1, strile => 1,
7540 strong => 1, tt => 1, u => 1,
7541 }->{$token->{tag_name}}) {
7542 !!!cp ('t427');
7543 $formatting_end_tag->($token);
7544 next B;
7545 } elsif ($token->{tag_name} eq 'br') {
7546 !!!cp ('t428');
7547 !!!parse-error (type => 'unmatched end tag',
7548 text => 'br', token => $token);
7549
7550 ## As if <br>
7551 $reconstruct_active_formatting_elements->($insert_to_current);
7552
7553 my $el;
7554 !!!create-element ($el, $HTML_NS, 'br',, $token);
7555 $insert->($el);
7556
7557 ## Ignore the token.
7558 !!!next-token;
7559 next B;
7560 } elsif ({
7561 caption => 1, col => 1, colgroup => 1, frame => 1,
7562 frameset => 1, head => 1, option => 1, optgroup => 1,
7563 tbody => 1, td => 1, tfoot => 1, th => 1,
7564 thead => 1, tr => 1,
7565 area => 1, basefont => 1, bgsound => 1,
7566 embed => 1, hr => 1, iframe => 1, image => 1,
7567 img => 1, input => 1, isindex => 1, noembed => 1,
7568 noframes => 1, param => 1, select => 1, spacer => 1,
7569 table => 1, textarea => 1, wbr => 1,
7570 noscript => 0, ## TODO: if scripting is enabled
7571 }->{$token->{tag_name}}) {
7572 !!!cp ('t429');
7573 !!!parse-error (type => 'unmatched end tag',
7574 text => $token->{tag_name}, token => $token);
7575 ## Ignore the token
7576 !!!next-token;
7577 next B;
7578
7579 ## ISSUE: Issue on HTML5 new elements in spec
7580
7581 } else {
7582 ## Step 1
7583 my $node_i = -1;
7584 my $node = $self->{open_elements}->[$node_i];
7585
7586 ## Step 2
7587 S2: {
7588 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7589 ## Step 1
7590 ## generate implied end tags
7591 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7592 !!!cp ('t430');
7593 ## NOTE: |<ruby><rt></ruby>|.
7594 ## ISSUE: <ruby><rt></rt> will also take this code path,
7595 ## which seems wrong.
7596 pop @{$self->{open_elements}};
7597 $node_i++;
7598 }
7599
7600 ## Step 2
7601 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7602 ne $token->{tag_name}) {
7603 !!!cp ('t431');
7604 ## NOTE: <x><y></x>
7605 !!!parse-error (type => 'not closed',
7606 text => $self->{open_elements}->[-1]->[0]
7607 ->manakai_local_name,
7608 token => $token);
7609 } else {
7610 !!!cp ('t432');
7611 }
7612
7613 ## Step 3
7614 splice @{$self->{open_elements}}, $node_i if $node_i < 0;
7615
7616 !!!next-token;
7617 last S2;
7618 } else {
7619 ## Step 3
7620 if (not ($node->[1] & FORMATTING_EL) and
7621 #not $phrasing_category->{$node->[1]} and
7622 ($node->[1] & SPECIAL_EL or
7623 $node->[1] & SCOPING_EL)) {
7624 !!!cp ('t433');
7625 !!!parse-error (type => 'unmatched end tag',
7626 text => $token->{tag_name}, token => $token);
7627 ## Ignore the token
7628 !!!next-token;
7629 last S2;
7630 }
7631
7632 !!!cp ('t434');
7633 }
7634
7635 ## Step 4
7636 $node_i--;
7637 $node = $self->{open_elements}->[$node_i];
7638
7639 ## Step 5;
7640 redo S2;
7641 } # S2
7642 next B;
7643 }
7644 }
7645 next B;
7646 } continue { # B
7647 if ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
7648 ## NOTE: The code below is executed in cases where it does not have
7649 ## to be, but it is harmless even in those cases.
7650 ## has an element in scope
7651 INSCOPE: {
7652 for (reverse 0..$#{$self->{open_elements}}) {
7653 my $node = $self->{open_elements}->[$_];
7654 if ($node->[1] & FOREIGN_EL) {
7655 last INSCOPE;
7656 } elsif ($node->[1] & SCOPING_EL) {
7657 last;
7658 }
7659 }
7660
7661 ## NOTE: No foreign element in scope.
7662 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
7663 } # INSCOPE
7664 }
7665 } # B
7666
7667 ## Stop parsing # MUST
7668
7669 ## TODO: script stuffs
7670 } # _tree_construct_main
7671
## set_inner_html ($class, $node, $string, $onerror, $get_wrapper)
##
## Replaces the children of |$node| with the result of parsing a string
## as HTML.
##
##   $class       - Invocant (taken with |shift|); used for
##                  |parse_char_string| and |new|.
##   $node        - A node whose |node_type| is 9 (document) or 1
##                  (element).
##   $string      - The markup.  It is read through a reference to the
##                  caller's argument, so it is not copied here.
##   $onerror     - Optional error callback.  In the element case a
##                  default that |warn|s is used when it is false.
##   $get_wrapper - Optional; defaults to a function returning its
##                  first argument.  Only passed on in the document
##                  case (see the TODO in the element branch).
##
## Dies if |$node| has any other node type.
##
## NOTE(review): The prototype allows at most four arguments while five
## (including the invocant) are read.  Presumably this sub is always
## invoked as a class method, where prototypes are not applied -
## confirm against the callers.
7672 sub set_inner_html ($$$;$) {
7673 my $class = shift;
7674 my $node = shift;
## After the two |shift|s, |$_[0]| is the string argument; keep a
## reference to it rather than a copy.
7675 my $s = \$_[0];
7676 my $onerror = $_[1];
7677 my $get_wrapper = $_[2] || sub ($) { return $_[0] };
7678
7679 ## ISSUE: Should {confident} be true?
7680
7681 my $nt = $node->node_type;
## Document node: empty the document, then parse the string into it.
7682 if ($nt == 9) {
7683 # MUST
7684
7685 ## Step 1 # MUST
7686 ## TODO: If the document has an active parser, ...
7687 ## ISSUE: There is an issue in the spec.
7688
7689 ## Step 2 # MUST
## The child list is copied into |@cn| before the removal loop.
7690 my @cn = @{$node->child_nodes};
7691 for (@cn) {
7692 $node->remove_child ($_);
7693 }
7694
7695 ## Step 3, 4, 5 # MUST
7696 $class->parse_char_string ($$s => $node, $onerror, $get_wrapper);
## Element node: parse the string as a fragment in a temporary document,
## using |$node| as the context element, then move the result into
## |$node|.
7697 } elsif ($nt == 1) {
7698 ## TODO: If non-html element
7699
7700 ## NOTE: Most of this code is copied from |parse_string|
7701
7702 ## TODO: Support for $get_wrapper
7703
7704 ## Step 1 # MUST
7705 my $this_doc = $node->owner_document;
7706 my $doc = $this_doc->implementation->create_document;
7707 $doc->manakai_is_html (1);
7708 my $p = $class->new;
7709 $p->{document} = $doc;
7710
7711 ## Step 8 # MUST
## |$i| is the offset of the next unread character in |$$s|.
7712 my $i = 0;
7713 $p->{line_prev} = $p->{line} = 1;
7714 $p->{column_prev} = $p->{column} = 0;
## Input reader.  Each call shifts the current |next_char| into the
## three-entry |prev_char| history and stores the code point of the next
## character of |$$s| in |next_char| (-1 at end of input), updating the
## line and column counters on |$p|.
7715 $p->{set_next_char} = sub {
7716 my $self = shift;
7717
7718 pop @{$self->{prev_char}};
7719 unshift @{$self->{prev_char}}, $self->{next_char};
7720
## The assigned value -1 is true, so the |return| always runs once the
## end of the string has been reached.
7721 $self->{next_char} = -1 and return if $i >= length $$s;
7722 $self->{next_char} = ord substr $$s, $i++, 1;
7723
7724 ($p->{line_prev}, $p->{column_prev}) = ($p->{line}, $p->{column});
7725 $p->{column}++;
7726
7727 if ($self->{next_char} == 0x000A) { # LF
7728 $p->{line}++;
7729 $p->{column} = 0;
7730 !!!cp ('i1');
7731 } elsif ($self->{next_char} == 0x000D) { # CR
## A CR, or a CR immediately followed by LF, is reported as a single LF.
7732 $i++ if substr ($$s, $i, 1) eq "\x0A";
7733 $self->{next_char} = 0x000A; # LF # MUST
7734 $p->{line}++;
7735 $p->{column} = 0;
7736 !!!cp ('i2');
7737 } elsif ($self->{next_char} > 0x10FFFF) {
7738 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
7739 !!!cp ('i3');
7740 } elsif ($self->{next_char} == 0x0000) { # NULL
7741 !!!cp ('i4');
7742 !!!parse-error (type => 'NULL');
7743 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
## The code points matched below are reported as a 'control char' parse
## error; |next_char| itself is left unchanged in this branch.
7744 } elsif ($self->{next_char} <= 0x0008 or
7745 (0x000E <= $self->{next_char} and
7746 $self->{next_char} <= 0x001F) or
7747 (0x007F <= $self->{next_char} and
7748 $self->{next_char} <= 0x009F) or
7749 (0xD800 <= $self->{next_char} and
7750 $self->{next_char} <= 0xDFFF) or
7751 (0xFDD0 <= $self->{next_char} and
7752 $self->{next_char} <= 0xFDDF) or
7753 {
7754 0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
7755 0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
7756 0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
7757 0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
7758 0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
7759 0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
7760 0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
7761 0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
7762 0x10FFFE => 1, 0x10FFFF => 1,
7763 }->{$self->{next_char}}) {
7764 !!!cp ('i4.1');
7765 if ($self->{next_char} < 0x10000) {
7766 !!!parse-error (type => 'control char',
7767 text => (sprintf 'U+%04X', $self->{next_char}));
7768 } else {
7769 !!!parse-error (type => 'control char',
7770 text => (sprintf 'U-%08X', $self->{next_char}));
7771 }
7772 }
7773 };
7774 $p->{prev_char} = [-1, -1, -1];
7775 $p->{next_char} = -1;
7776
## Error reporting: use the caller's |$onerror| if given, otherwise warn
## with the position of the token when it has one, falling back to the
## line and column passed in the options.
7777 my $ponerror = $onerror || sub {
7778 my (%opt) = @_;
7779 my $line = $opt{line};
7780 my $column = $opt{column};
7781 if (defined $opt{token} and defined $opt{token}->{line}) {
7782 $line = $opt{token}->{line};
7783 $column = $opt{token}->{column};
7784 }
7785 warn "Parse error ($opt{type}) at line $line column $column\n";
7786 };
## The parser's current line and column are passed first, followed by
## the arguments given by the reporting code.
7787 $p->{parse_error} = sub {
7788 $ponerror->(line => $p->{line}, column => $p->{column}, @_);
7789 };
7790
7791 $p->_initialize_tokenizer;
7792 $p->_initialize_tree_constructor;
7793
7794 ## Step 2
## The initial content model is chosen from the local name of the
## context element; names not listed get PCDATA.
7795 my $node_ln = $node->manakai_local_name;
7796 $p->{content_model} = {
7797 title => RCDATA_CONTENT_MODEL,
7798 textarea => RCDATA_CONTENT_MODEL,
7799 style => CDATA_CONTENT_MODEL,
7800 script => CDATA_CONTENT_MODEL,
7801 xmp => CDATA_CONTENT_MODEL,
7802 iframe => CDATA_CONTENT_MODEL,
7803 noembed => CDATA_CONTENT_MODEL,
7804 noframes => CDATA_CONTENT_MODEL,
7805 noscript => CDATA_CONTENT_MODEL,
7806 plaintext => PLAINTEXT_CONTENT_MODEL,
7807 }->{$node_ln};
7808 $p->{content_model} = PCDATA_CONTENT_MODEL
7809 unless defined $p->{content_model};
7810 ## ISSUE: What is "the name of the element"? local name?
7811
## Same [node, category] pair shape as the |open_elements| entry pushed
## in Step 5 below.
7812 $p->{inner_html_node} = [$node, $el_category->{$node_ln}];
7813 ## TODO: Foreign element OK?
7814
7815 ## Step 3
7816 my $root = $doc->create_element_ns
7817 ('http://www.w3.org/1999/xhtml', [undef, 'html']);
7818
7819 ## Step 4 # MUST
7820 $doc->append_child ($root);
7821
7822 ## Step 5 # MUST
7823 push @{$p->{open_elements}}, [$root, $el_category->{html}];
7824
7825 undef $p->{head_element};
7826
7827 ## Step 6 # MUST
7828 $p->_reset_insertion_mode;
7829
7830 ## Step 7 # MUST
## Walk from |$node| up through its ancestors; the first HTML |form|
## element found (possibly |$node| itself) becomes the form element.
7831 my $anode = $node;
7832 AN: while (defined $anode) {
7833 if ($anode->node_type == 1) {
7834 my $nsuri = $anode->namespace_uri;
7835 if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
7836 if ($anode->manakai_local_name eq 'form') {
7837 !!!cp ('i5');
7838 $p->{form_element} = $anode;
7839 last AN;
7840 }
7841 }
7842 }
7843 $anode = $anode->parent_node;
7844 } # AN
7845
7846 ## Step 9 # MUST
## NOTE(review): |!!!next-token| presumably expands to code that refers
## to |$self|, hence the local alias - confirm against the macro
## definition.
7847 {
7848 my $self = $p;
7849 !!!next-token;
7850 }
7851 $p->_tree_construction_main;
7852
7853 ## Step 10 # MUST
7854 my @cn = @{$node->child_nodes};
7855 for (@cn) {
7856 $node->remove_child ($_);
7857 }
7858 ## ISSUE: mutation events? read-only?
7859
7860 ## Step 11 # MUST
## Move the parsed children from the temporary root into |$node|,
## adopting each into |$node|'s owner document first.
7861 @cn = @{$root->child_nodes};
7862 for (@cn) {
7863 $this_doc->adopt_node ($_);
7864 $node->append_child ($_);
7865 }
7866 ## ISSUE: mutation events?
7867
7868 $p->_terminate_tree_constructor;
7869
## The |parse_error| closure refers to |$p|, which in turn holds the
## closure; deleting it breaks that reference cycle.
## NOTE(review): the |set_next_char| closure also refers to |$p| and is
## not deleted here - confirm that it is released elsewhere (e.g. in
## |_terminate_tree_constructor|).
7870 delete $p->{parse_error}; # delete loop
7871 } else {
7872 die "$0: |set_inner_html| is not defined for node of type $nt";
7873 }
7874 } # set_inner_html
7875
7876 } # tree construction stage
7877
## Whatpm::HTML::RestartParser - A class that inherits from |Error|
## (pushed onto its @ISA below).
## NOTE(review): presumably thrown as an exception to request that
## parsing be restarted - confirm against the throw sites elsewhere in
## this file.
7878 package Whatpm::HTML::RestartParser;
7879 push our @ISA, 'Error';
7880
## True value returned to |require| / |use|.
7881 1;
7882 # $Date: 2008/09/13 09:02:28 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24