/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.167 - (show annotations) (download) (as text)
Sat Sep 13 09:02:28 2008 UTC (17 years, 4 months ago) by wakaba
Branch: MAIN
Changes since 1.166: +101 -64 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/ChangeLog	13 Sep 2008 09:02:17 -0000
	* HTML.pm: "Consume a character reference" algorithm is
	now implemented as a tokenizer's state, rather than
	a method, with minimum changes (more changes will
	be made, in due course).  "Bogus comment state"'s inner
	loop gets removed.

2008-09-13  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.166 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 use Error qw(:try);
5
6 ## ISSUE:
7 ## var doc = implementation.createDocument (null, null, null);
8 ## doc.write ('');
9 ## alert (doc.compatMode);
10
11 require IO::Handle;
12
13 my $HTML_NS = q<http://www.w3.org/1999/xhtml>;
14 my $MML_NS = q<http://www.w3.org/1998/Math/MathML>;
15 my $SVG_NS = q<http://www.w3.org/2000/svg>;
16 my $XLINK_NS = q<http://www.w3.org/1999/xlink>;
17 my $XML_NS = q<http://www.w3.org/XML/1998/namespace>;
18 my $XMLNS_NS = q<http://www.w3.org/2000/xmlns/>;
19
## Element category flags. Each constant is a distinct single bit, so
## categories can be combined and tested with bitwise operators (as the
## combined sets such as |SCOPING_EL| and the |$el_category| table do).
20 sub A_EL () { 0b1 }
21 sub ADDRESS_EL () { 0b10 }
22 sub BODY_EL () { 0b100 }
23 sub BUTTON_EL () { 0b1000 }
24 sub CAPTION_EL () { 0b10000 }
25 sub DD_EL () { 0b100000 }
26 sub DIV_EL () { 0b1000000 }
27 sub DT_EL () { 0b10000000 }
28 sub FORM_EL () { 0b100000000 }
29 sub FORMATTING_EL () { 0b1000000000 }
30 sub FRAMESET_EL () { 0b10000000000 }
31 sub HEADING_EL () { 0b100000000000 }
32 sub HTML_EL () { 0b1000000000000 }
33 sub LI_EL () { 0b10000000000000 }
34 sub NOBR_EL () { 0b100000000000000 }
35 sub OPTION_EL () { 0b1000000000000000 }
36 sub OPTGROUP_EL () { 0b10000000000000000 }
37 sub P_EL () { 0b100000000000000000 }
38 sub SELECT_EL () { 0b1000000000000000000 }
39 sub TABLE_EL () { 0b10000000000000000000 }
40 sub TABLE_CELL_EL () { 0b100000000000000000000 }
41 sub TABLE_ROW_EL () { 0b1000000000000000000000 }
42 sub TABLE_ROW_GROUP_EL () { 0b10000000000000000000000 }
43 sub MISC_SCOPING_EL () { 0b100000000000000000000000 }
44 sub MISC_SPECIAL_EL () { 0b1000000000000000000000000 }
45 sub FOREIGN_EL () { 0b10000000000000000000000000 }
46 sub FOREIGN_FLOW_CONTENT_EL () { 0b100000000000000000000000000 }
47 sub MML_AXML_EL () { 0b1000000000000000000000000000 }
48 sub RUBY_EL () { 0b10000000000000000000000000000 }
49 sub RUBY_COMPONENT_EL () { 0b100000000000000000000000000000 }
50
51 sub TABLE_ROWS_EL () {
52 TABLE_EL |
53 TABLE_ROW_EL |
54 TABLE_ROW_GROUP_EL
55 }
56
57 ## NOTE: Used in "generate implied end tags" algorithm.
58 ## NOTE: There is a code where a modified version of END_TAG_OPTIONAL_EL
59 ## is used in "generate implied end tags" implementation (search for the
60 ## function mae).
61 sub END_TAG_OPTIONAL_EL () {
62 DD_EL |
63 DT_EL |
64 LI_EL |
65 P_EL |
66 RUBY_COMPONENT_EL
67 }
68
69 ## NOTE: Used in </body> and EOF algorithms.
70 sub ALL_END_TAG_OPTIONAL_EL () {
71 DD_EL |
72 DT_EL |
73 LI_EL |
74 P_EL |
75
76 BODY_EL |
77 HTML_EL |
78 TABLE_CELL_EL |
79 TABLE_ROW_EL |
80 TABLE_ROW_GROUP_EL
81 }
82
83 sub SCOPING_EL () {
84 BUTTON_EL |
85 CAPTION_EL |
86 HTML_EL |
87 TABLE_EL |
88 TABLE_CELL_EL |
89 MISC_SCOPING_EL
90 }
91
92 sub TABLE_SCOPING_EL () {
93 HTML_EL |
94 TABLE_EL
95 }
96
97 sub TABLE_ROWS_SCOPING_EL () {
98 HTML_EL |
99 TABLE_ROW_GROUP_EL
100 }
101
102 sub TABLE_ROW_SCOPING_EL () {
103 HTML_EL |
104 TABLE_ROW_EL
105 }
106
107 sub SPECIAL_EL () {
108 ADDRESS_EL |
109 BODY_EL |
110 DIV_EL |
111
112 DD_EL |
113 DT_EL |
114 LI_EL |
115 P_EL |
116
117 FORM_EL |
118 FRAMESET_EL |
119 HEADING_EL |
120 OPTION_EL |
121 OPTGROUP_EL |
122 SELECT_EL |
123 TABLE_ROW_EL |
124 TABLE_ROW_GROUP_EL |
125 MISC_SPECIAL_EL
126 }
127
128 my $el_category = {
129 a => A_EL | FORMATTING_EL,
130 address => ADDRESS_EL,
131 applet => MISC_SCOPING_EL,
132 area => MISC_SPECIAL_EL,
133 b => FORMATTING_EL,
134 base => MISC_SPECIAL_EL,
135 basefont => MISC_SPECIAL_EL,
136 bgsound => MISC_SPECIAL_EL,
137 big => FORMATTING_EL,
138 blockquote => MISC_SPECIAL_EL,
139 body => BODY_EL,
140 br => MISC_SPECIAL_EL,
141 button => BUTTON_EL,
142 caption => CAPTION_EL,
143 center => MISC_SPECIAL_EL,
144 col => MISC_SPECIAL_EL,
145 colgroup => MISC_SPECIAL_EL,
146 dd => DD_EL,
147 dir => MISC_SPECIAL_EL,
148 div => DIV_EL,
149 dl => MISC_SPECIAL_EL,
150 dt => DT_EL,
151 em => FORMATTING_EL,
152 embed => MISC_SPECIAL_EL,
153 fieldset => MISC_SPECIAL_EL,
154 font => FORMATTING_EL,
155 form => FORM_EL,
156 frame => MISC_SPECIAL_EL,
157 frameset => FRAMESET_EL,
158 h1 => HEADING_EL,
159 h2 => HEADING_EL,
160 h3 => HEADING_EL,
161 h4 => HEADING_EL,
162 h5 => HEADING_EL,
163 h6 => HEADING_EL,
164 head => MISC_SPECIAL_EL,
165 hr => MISC_SPECIAL_EL,
166 html => HTML_EL,
167 i => FORMATTING_EL,
168 iframe => MISC_SPECIAL_EL,
169 img => MISC_SPECIAL_EL,
170 input => MISC_SPECIAL_EL,
171 isindex => MISC_SPECIAL_EL,
172 li => LI_EL,
173 link => MISC_SPECIAL_EL,
174 listing => MISC_SPECIAL_EL,
175 marquee => MISC_SCOPING_EL,
176 menu => MISC_SPECIAL_EL,
177 meta => MISC_SPECIAL_EL,
178 nobr => NOBR_EL | FORMATTING_EL,
179 noembed => MISC_SPECIAL_EL,
180 noframes => MISC_SPECIAL_EL,
181 noscript => MISC_SPECIAL_EL,
182 object => MISC_SCOPING_EL,
183 ol => MISC_SPECIAL_EL,
184 optgroup => OPTGROUP_EL,
185 option => OPTION_EL,
186 p => P_EL,
187 param => MISC_SPECIAL_EL,
188 plaintext => MISC_SPECIAL_EL,
189 pre => MISC_SPECIAL_EL,
190 rp => RUBY_COMPONENT_EL,
191 rt => RUBY_COMPONENT_EL,
192 ruby => RUBY_EL,
193 s => FORMATTING_EL,
194 script => MISC_SPECIAL_EL,
195 select => SELECT_EL,
196 small => FORMATTING_EL,
197 spacer => MISC_SPECIAL_EL,
198 strike => FORMATTING_EL,
199 strong => FORMATTING_EL,
200 style => MISC_SPECIAL_EL,
201 table => TABLE_EL,
202 tbody => TABLE_ROW_GROUP_EL,
203 td => TABLE_CELL_EL,
204 textarea => MISC_SPECIAL_EL,
205 tfoot => TABLE_ROW_GROUP_EL,
206 th => TABLE_CELL_EL,
207 thead => TABLE_ROW_GROUP_EL,
208 title => MISC_SPECIAL_EL,
209 tr => TABLE_ROW_EL,
210 tt => FORMATTING_EL,
211 u => FORMATTING_EL,
212 ul => MISC_SPECIAL_EL,
213 wbr => MISC_SPECIAL_EL,
214 };
215
216 my $el_category_f = {
217 $MML_NS => {
218 'annotation-xml' => MML_AXML_EL,
219 mi => FOREIGN_FLOW_CONTENT_EL,
220 mo => FOREIGN_FLOW_CONTENT_EL,
221 mn => FOREIGN_FLOW_CONTENT_EL,
222 ms => FOREIGN_FLOW_CONTENT_EL,
223 mtext => FOREIGN_FLOW_CONTENT_EL,
224 },
225 $SVG_NS => {
226 foreignObject => FOREIGN_FLOW_CONTENT_EL,
227 desc => FOREIGN_FLOW_CONTENT_EL,
228 title => FOREIGN_FLOW_CONTENT_EL,
229 },
230 ## NOTE: In addition, FOREIGN_EL is set to non-HTML elements.
231 };
232
233 my $svg_attr_name = {
234 attributename => 'attributeName',
235 attributetype => 'attributeType',
236 basefrequency => 'baseFrequency',
237 baseprofile => 'baseProfile',
238 calcmode => 'calcMode',
239 clippathunits => 'clipPathUnits',
240 contentscripttype => 'contentScriptType',
241 contentstyletype => 'contentStyleType',
242 diffuseconstant => 'diffuseConstant',
243 edgemode => 'edgeMode',
244 externalresourcesrequired => 'externalResourcesRequired',
245 filterres => 'filterRes',
246 filterunits => 'filterUnits',
247 glyphref => 'glyphRef',
248 gradienttransform => 'gradientTransform',
249 gradientunits => 'gradientUnits',
250 kernelmatrix => 'kernelMatrix',
251 kernelunitlength => 'kernelUnitLength',
252 keypoints => 'keyPoints',
253 keysplines => 'keySplines',
254 keytimes => 'keyTimes',
255 lengthadjust => 'lengthAdjust',
256 limitingconeangle => 'limitingConeAngle',
257 markerheight => 'markerHeight',
258 markerunits => 'markerUnits',
259 markerwidth => 'markerWidth',
260 maskcontentunits => 'maskContentUnits',
261 maskunits => 'maskUnits',
262 numoctaves => 'numOctaves',
263 pathlength => 'pathLength',
264 patterncontentunits => 'patternContentUnits',
265 patterntransform => 'patternTransform',
266 patternunits => 'patternUnits',
267 pointsatx => 'pointsAtX',
268 pointsaty => 'pointsAtY',
269 pointsatz => 'pointsAtZ',
270 preservealpha => 'preserveAlpha',
271 preserveaspectratio => 'preserveAspectRatio',
272 primitiveunits => 'primitiveUnits',
273 refx => 'refX',
274 refy => 'refY',
275 repeatcount => 'repeatCount',
276 repeatdur => 'repeatDur',
277 requiredextensions => 'requiredExtensions',
278 requiredfeatures => 'requiredFeatures',
279 specularconstant => 'specularConstant',
280 specularexponent => 'specularExponent',
281 spreadmethod => 'spreadMethod',
282 startoffset => 'startOffset',
283 stddeviation => 'stdDeviation',
284 stitchtiles => 'stitchTiles',
285 surfacescale => 'surfaceScale',
286 systemlanguage => 'systemLanguage',
287 tablevalues => 'tableValues',
288 targetx => 'targetX',
289 targety => 'targetY',
290 textlength => 'textLength',
291 viewbox => 'viewBox',
292 viewtarget => 'viewTarget',
293 xchannelselector => 'xChannelSelector',
294 ychannelselector => 'yChannelSelector',
295 zoomandpan => 'zoomAndPan',
296 };
297
298 my $foreign_attr_xname = {
299 'xlink:actuate' => [$XLINK_NS, ['xlink', 'actuate']],
300 'xlink:arcrole' => [$XLINK_NS, ['xlink', 'arcrole']],
301 'xlink:href' => [$XLINK_NS, ['xlink', 'href']],
302 'xlink:role' => [$XLINK_NS, ['xlink', 'role']],
303 'xlink:show' => [$XLINK_NS, ['xlink', 'show']],
304 'xlink:title' => [$XLINK_NS, ['xlink', 'title']],
305 'xlink:type' => [$XLINK_NS, ['xlink', 'type']],
306 'xml:base' => [$XML_NS, ['xml', 'base']],
307 'xml:lang' => [$XML_NS, ['xml', 'lang']],
308 'xml:space' => [$XML_NS, ['xml', 'space']],
309 'xmlns' => [$XMLNS_NS, [undef, 'xmlns']],
310 'xmlns:xlink' => [$XMLNS_NS, ['xmlns', 'xlink']],
311 };
312
313 ## ISSUE: xmlns:xlink="non-xlink-ns" is not an error.
314
315 my $c1_entity_char = {
316 0x80 => 0x20AC,
317 0x81 => 0xFFFD,
318 0x82 => 0x201A,
319 0x83 => 0x0192,
320 0x84 => 0x201E,
321 0x85 => 0x2026,
322 0x86 => 0x2020,
323 0x87 => 0x2021,
324 0x88 => 0x02C6,
325 0x89 => 0x2030,
326 0x8A => 0x0160,
327 0x8B => 0x2039,
328 0x8C => 0x0152,
329 0x8D => 0xFFFD,
330 0x8E => 0x017D,
331 0x8F => 0xFFFD,
332 0x90 => 0xFFFD,
333 0x91 => 0x2018,
334 0x92 => 0x2019,
335 0x93 => 0x201C,
336 0x94 => 0x201D,
337 0x95 => 0x2022,
338 0x96 => 0x2013,
339 0x97 => 0x2014,
340 0x98 => 0x02DC,
341 0x99 => 0x2122,
342 0x9A => 0x0161,
343 0x9B => 0x203A,
344 0x9C => 0x0153,
345 0x9D => 0xFFFD,
346 0x9E => 0x017E,
347 0x9F => 0x0178,
348 }; # $c1_entity_char
349
## Parses an HTML document held in memory as a string of bytes.
## Arguments after the invocant: the charset name (may be |undef|), the
## byte string or a reference to it, then the arguments that
## |parse_byte_stream| takes after its byte stream (document, error
## callback, wrapper callback). Returns what |parse_byte_stream| returns.
## Dies if the in-memory handle cannot be opened.
350 sub parse_byte_string ($$$$;$) {
351 my $self = shift;
352 my $charset_name = shift;
353 open my $input, '<', ref $_[0] ? $_[0] : \($_[0]) or die "Can't open in-memory byte string: $!"; # fail loudly rather than parse an unopened handle
354 return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]);
355 } # parse_byte_string
356
## Parses an HTML document read from a byte stream handle.
## Arguments (see the commented-out list below): the charset name supplied
## by the caller (may be |undef|), the byte stream, the document, an
## optional error callback and an optional wrapper callback that is applied
## to the character stream before it is parsed. May be called on an
## instance or on the class name, in which case a new parser is created.
## The encoding is chosen in the SNIFFING block (caller-supplied name, BOM,
## Whatpm::Charset::UniversalCharDet, then windows-1252) and the decoded
## stream is handed to |parse_char_stream|. If Whatpm::HTML::RestartParser
## is thrown (by the |change_encoding| callback), the input is parsed again
## with the decoder selected there. Returns what |parse_char_stream| returns.
357 sub parse_byte_stream ($$$$;$$) {
358 # my ($self, $charset_name, $byte_stream, $doc, $onerror, $get_wrapper) = @_;
359 my $self = ref $_[0] ? shift : shift->new;
360 my $charset_name = shift;
361 my $byte_stream = $_[0];
362
363 my $onerror = $_[2] || sub {
364 my (%opt) = @_;
365 warn "Parse error ($opt{type})\n";
366 };
367 $self->{parse_error} = $onerror; # updated later by parse_char_string
368
369 my $get_wrapper = $_[3] || sub ($) {
370 return $_[0]; # $_[0] = byte stream handle, returned = arg to char handle
371 };
372
373 ## HTML5 encoding sniffing algorithm
374 require Message::Charset::Info;
375 my $charset;
376 my $buffer;
377 my ($char_stream, $e_status);
378
379 SNIFFING: {
380 ## NOTE: By setting |allow_fallback| option true when the
381 ## |get_decode_handle| method is invoked, we ignore what the HTML5
382 ## spec requires, i.e. unsupported encoding should be ignored.
383 ## TODO: We should not do this unless the parser is invoked
384 ## in the conformance checking mode, in which this behavior
385 ## would be useful.
386
387 ## Step 1
388 if (defined $charset_name) {
389 $charset = Message::Charset::Info->get_by_html_name ($charset_name);
390 ## TODO: Is this ok? Transfer protocol's parameter should be
391 ## interpreted in its semantics?
392
393 ## ISSUE: Unsupported encoding is not ignored according to the spec.
394 ($char_stream, $e_status) = $charset->get_decode_handle
395 ($byte_stream, allow_error_reporting => 1,
396 allow_fallback => 1);
397 if ($char_stream) {
398 $self->{confident} = 1;
399 last SNIFFING;
400 } else {
401 ## TODO: unsupported error
402 }
403 }
404
405 ## Step 2
406 my $byte_buffer = '';
407 for (1..1024) {
408 my $char = $byte_stream->getc;
409 last unless defined $char;
410 $byte_buffer .= $char;
411 } ## TODO: timeout
412
413 ## Step 3
414 if ($byte_buffer =~ /^\xFE\xFF/) {
415 $charset = Message::Charset::Info->get_by_html_name ('utf-16be');
416 ($char_stream, $e_status) = $charset->get_decode_handle
417 ($byte_stream, allow_error_reporting => 1,
418 allow_fallback => 1, byte_buffer => \$byte_buffer);
419 $self->{confident} = 1;
420 last SNIFFING;
421 } elsif ($byte_buffer =~ /^\xFF\xFE/) {
422 $charset = Message::Charset::Info->get_by_html_name ('utf-16le');
423 ($char_stream, $e_status) = $charset->get_decode_handle
424 ($byte_stream, allow_error_reporting => 1,
425 allow_fallback => 1, byte_buffer => \$byte_buffer);
426 $self->{confident} = 1;
427 last SNIFFING;
428 } elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) {
429 $charset = Message::Charset::Info->get_by_html_name ('utf-8');
430 ($char_stream, $e_status) = $charset->get_decode_handle
431 ($byte_stream, allow_error_reporting => 1,
432 allow_fallback => 1, byte_buffer => \$byte_buffer);
433 $self->{confident} = 1;
434 last SNIFFING;
435 }
436
437 ## Step 4
438 ## TODO: <meta charset>
439
440 ## Step 5
441 ## TODO: from history
442
443 ## Step 6
444 require Whatpm::Charset::UniversalCharDet;
445 $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string
446 ($byte_buffer);
447 if (defined $charset_name) {
448 $charset = Message::Charset::Info->get_by_html_name ($charset_name);
449
450 ## ISSUE: Unsupported encoding is not ignored according to the spec.
451 require Whatpm::Charset::DecodeHandle;
452 $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
453 ($byte_stream);
454 ($char_stream, $e_status) = $charset->get_decode_handle
455 ($buffer, allow_error_reporting => 1,
456 allow_fallback => 1, byte_buffer => \$byte_buffer);
457 if ($char_stream) {
458 $buffer->{buffer} = $byte_buffer;
459 !!!parse-error (type => 'sniffing:chardet',
460 text => $charset_name,
461 level => $self->{level}->{info},
462 layer => 'encode',
463 line => 1, column => 1);
464 $self->{confident} = 0;
465 last SNIFFING;
466 }
467 }
468
469 ## Step 7: default
470 ## TODO: Make this configurable.
471 $charset = Message::Charset::Info->get_by_html_name ('windows-1252');
472 ## NOTE: We choose |windows-1252| here, since |utf-8| should be
473 ## detectable in the step 6.
474 require Whatpm::Charset::DecodeHandle;
475 $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
476 ($byte_stream);
477 ($char_stream, $e_status)
478 = $charset->get_decode_handle ($buffer,
479 allow_error_reporting => 1,
480 allow_fallback => 1,
481 byte_buffer => \$byte_buffer);
482 $buffer->{buffer} = $byte_buffer;
483 !!!parse-error (type => 'sniffing:default',
484 text => 'windows-1252',
485 level => $self->{level}->{info},
486 line => 1, column => 1,
487 layer => 'encode');
488 $self->{confident} = 0;
489 } # SNIFFING
490
491 if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
492 $self->{input_encoding} = $charset->get_iana_name; ## TODO: Should we set actual charset decoder's encoding name?
493 !!!parse-error (type => 'chardecode:fallback',
494 #text => $self->{input_encoding},
495 level => $self->{level}->{uncertain},
496 line => 1, column => 1,
497 layer => 'encode');
498 } elsif (not ($e_status &
499 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
500 $self->{input_encoding} = $charset->get_iana_name;
501 !!!parse-error (type => 'chardecode:no error',
502 text => $self->{input_encoding},
503 level => $self->{level}->{uncertain},
504 line => 1, column => 1,
505 layer => 'encode');
506 } else {
507 $self->{input_encoding} = $charset->get_iana_name;
508 }
509
510 $self->{change_encoding} = sub {
511 my $self = shift;
512 $charset_name = shift;
513 my $token = shift;
514
515 $charset = Message::Charset::Info->get_by_html_name ($charset_name);
516 ($char_stream, $e_status) = $charset->get_decode_handle
517 ($byte_stream, allow_error_reporting => 1, allow_fallback => 1,
518 byte_buffer => \ $buffer->{buffer});
519
520 if ($char_stream) { # if supported
521 ## "Change the encoding" algorithm:
522
523 ## Step 1
524 if ($charset->{category} &
525 Message::Charset::Info::CHARSET_CATEGORY_UTF16 ()) {
526 $charset = Message::Charset::Info->get_by_html_name ('utf-8');
527 ($char_stream, $e_status) = $charset->get_decode_handle
528 ($byte_stream,
529 byte_buffer => \ $buffer->{buffer});
530 }
531 $charset_name = $charset->get_iana_name;
532
533 ## Step 2
534 if (defined $self->{input_encoding} and
535 $self->{input_encoding} eq $charset_name) {
536 !!!parse-error (type => 'charset label:matching',
537 text => $charset_name,
538 level => $self->{level}->{info});
539 $self->{confident} = 1;
540 return;
541 }
542
543 !!!parse-error (type => 'charset label detected',
544 text => $self->{input_encoding},
545 value => $charset_name,
546 level => $self->{level}->{warn},
547 token => $token);
548
549 ## Step 3
550 # if (can) {
551 ## change the encoding on the fly.
552 #$self->{confident} = 1;
553 #return;
554 # }
555
556 ## Step 4
557 throw Whatpm::HTML::RestartParser ();
558 }
559 }; # $self->{change_encoding}
560
561 my $char_onerror = sub {
562 my (undef, $type, %opt) = @_;
563 !!!parse-error (layer => 'encode',
564 %opt, type => $type,
565 line => $self->{line}, column => $self->{column} + 1);
566 if ($opt{octets}) {
567 ${$opt{octets}} = "\x{FFFD}"; # replacement character
568 }
569 };
570
571 my $wrapped_char_stream = $get_wrapper->($char_stream);
572 $wrapped_char_stream->onerror ($char_onerror);
573
574 my @args = @_; shift @args; # $s
575 my $return;
576 try {
577 $return = $self->parse_char_stream ($wrapped_char_stream, @args);
578 } catch Whatpm::HTML::RestartParser with {
579 ## NOTE: Invoked after {change_encoding}.
580
581 if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
582 $self->{input_encoding} = $charset->get_iana_name; ## TODO: Should we set actual charset decoder's encoding name?
583 !!!parse-error (type => 'chardecode:fallback',
584 level => $self->{level}->{uncertain},
585 #text => $self->{input_encoding},
586 line => 1, column => 1,
587 layer => 'encode');
588 } elsif (not ($e_status &
589 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
590 $self->{input_encoding} = $charset->get_iana_name;
591 !!!parse-error (type => 'chardecode:no error',
592 text => $self->{input_encoding},
593 level => $self->{level}->{uncertain},
594 line => 1, column => 1,
595 layer => 'encode');
596 } else {
597 $self->{input_encoding} = $charset->get_iana_name;
598 }
599 $self->{confident} = 1;
600
601 $wrapped_char_stream = $get_wrapper->($char_stream);
602 $wrapped_char_stream->onerror ($char_onerror);
603
604 $return = $self->parse_char_stream ($wrapped_char_stream, @args);
605 };
606 return $return;
607 } # parse_byte_stream
608
609 ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
610 ## and the HTML layer MUST ignore it. However, we do strip the BOM in
611 ## the encoding layer and the HTML layer does not ignore any U+FEFF,
612 ## because the core part of our HTML parser expects a string of characters,
613 ## not a string of bytes or code units or anything which might contain a BOM.
614 ## Therefore, any parser interface that accepts a string of bytes,
615 ## such as |parse_byte_string| in this module, must ensure that it does
616 ## strip the BOM and never strip any ZWNBSP.
617
## Parses an HTML document held in memory as a string of characters.
## Arguments after the invocant: the string or a reference to it, the
## document, an optional error callback and an optional wrapper callback
## that is applied to the input handle before parsing.
## Returns what |parse_char_stream| returns.
## Dies if the in-memory handle cannot be opened.
618 sub parse_char_string ($$$;$$) {
619 #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
620 my $self = shift;
621 require utf8;
622 my $s = ref $_[0] ? $_[0] : \($_[0]);
623 open my $input, '<' . (utf8::is_utf8 ($$s) ? ':utf8' : ''), $s or die "Can't open in-memory character string: $!"; # fail loudly rather than parse an unopened handle
624 if ($_[3]) {
625 $input = $_[3]->($input); # let the caller wrap the handle
626 }
627 return $self->parse_char_stream ($input, @_[1..$#_]);
628 } # parse_char_string
629 *parse_string = \&parse_char_string; ## NOTE: Alias for backward compatibility.
630
## Parses a stream of characters into the given document and returns it.
## Arguments: the character stream handle (read with |getc|), the document
## (its existing child nodes are removed first) and an optional error
## callback. May be called on an instance or on the class name, in which
## case a new parser is created.
## Installs the |set_next_char| and |parse_error| callbacks, then runs the
## tokenizer and tree constructor.
631 sub parse_char_stream ($$$;$) {
632 my $self = ref $_[0] ? shift : shift->new;
633 my $input = $_[0];
634 $self->{document} = $_[1];
635 @{$self->{document}->child_nodes} = ();
636
637 ## NOTE: |set_inner_html| copies most of this method's code
638
639 $self->{confident} = 1 unless exists $self->{confident};
640 $self->{document}->input_encoding ($self->{input_encoding})
641 if defined $self->{input_encoding};
642
643 my $i = 0; # NOTE(review): not referenced again in this sub - confirm it can go
644 $self->{line_prev} = $self->{line} = 1;
645 $self->{column_prev} = $self->{column} = 0;
646 $self->{set_next_char} = sub {
647 my $self = shift;
648
649 pop @{$self->{prev_char}}; # keep a fixed-length history, newest first
650 unshift @{$self->{prev_char}}, $self->{next_char};
651
652 my $char;
653 if (defined $self->{next_next_char}) { # character read ahead after a CR
654 $char = $self->{next_next_char};
655 delete $self->{next_next_char};
656 } else {
657 $char = $input->getc;
658 }
659 $self->{next_char} = -1 and return unless defined $char; # -1 marks end of input; it is true, so |return| runs
660 $self->{next_char} = ord $char;
661
662 ($self->{line_prev}, $self->{column_prev})
663 = ($self->{line}, $self->{column});
664 $self->{column}++;
665
666 if ($self->{next_char} == 0x000A) { # LF
667 !!!cp ('j1');
668 $self->{line}++;
669 $self->{column} = 0;
670 } elsif ($self->{next_char} == 0x000D) { # CR
671 !!!cp ('j2');
672 my $next = $input->getc; # look ahead so that CR LF counts as one newline
673 if (defined $next and $next ne "\x0A") {
674 $self->{next_next_char} = $next;
675 }
676 $self->{next_char} = 0x000A; # LF # MUST
677 $self->{line}++;
678 $self->{column} = 0;
679 } elsif ($self->{next_char} > 0x10FFFF) {
680 !!!cp ('j3');
681 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
682 } elsif ($self->{next_char} == 0x0000) { # NULL
683 !!!cp ('j4');
684 !!!parse-error (type => 'NULL');
685 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
686 } elsif ($self->{next_char} <= 0x0008 or
687 (0x000E <= $self->{next_char} and $self->{next_char} <= 0x001F) or
688 (0x007F <= $self->{next_char} and $self->{next_char} <= 0x009F) or
689 (0xD800 <= $self->{next_char} and $self->{next_char} <= 0xDFFF) or
690 (0xFDD0 <= $self->{next_char} and $self->{next_char} <= 0xFDDF) or
691 {
692 0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
693 0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
694 0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
695 0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
696 0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
697 0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
698 0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
699 0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
700 0x10FFFE => 1, 0x10FFFF => 1,
701 }->{$self->{next_char}}) {
702 !!!cp ('j5');
703 if ($self->{next_char} < 0x10000) {
704 !!!parse-error (type => 'control char',
705 text => (sprintf 'U+%04X', $self->{next_char}));
706 } else {
707 !!!parse-error (type => 'control char',
708 text => (sprintf 'U-%08X', $self->{next_char}));
709 }
710 }
711 };
712 $self->{prev_char} = [-1, -1, -1];
713 $self->{next_char} = -1;
714
715 my $onerror = $_[2] || sub {
716 my (%opt) = @_;
717 my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
718 my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
719 warn "Parse error ($opt{type}) at line $line column $column\n";
720 };
721 $self->{parse_error} = sub {
722 $onerror->(line => $self->{line}, column => $self->{column}, @_); # current position first; later pairs in @_ take precedence when read as a hash
723 };
724
725 $self->_initialize_tokenizer;
726 $self->_initialize_tree_constructor;
727 $self->_construct_tree;
728 $self->_terminate_tree_constructor;
729
730 delete $self->{parse_error}; # remove loop
731
732 return $self->{document};
733 } # parse_char_stream
734
## Creates a parser object holding the default error level codes and
## placeholder callbacks. |parse_char_stream| replaces |set_next_char| and
## |parse_error|; |parse_byte_stream| replaces |change_encoding|.
735 sub new ($) {
736 my $class = shift;
737 my $self = bless {
738 level => {must => 'm',
739 should => 's',
740 warn => 'w',
741 info => 'i',
742 uncertain => 'u'},
743 }, $class;
744 $self->{set_next_char} = sub {
745 $self->{next_char} = -1; # no input yet: always report end of input
746 };
747 $self->{parse_error} = sub {
748 #
749 };
750 $self->{change_encoding} = sub {
751 # if ($_[0] is a supported encoding) {
752 # run "change the encoding" algorithm;
753 # throw Whatpm::HTML::RestartParser (charset => $new_encoding);
754 # }
755 };
756 $self->{application_cache_selection} = sub {
757 #
758 };
759 return $self;
760 } # new
761
762 sub CM_ENTITY () { 0b001 } # & markup in data
763 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
764 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
765
766 sub PLAINTEXT_CONTENT_MODEL () { 0 }
767 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
768 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
769 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
770
## Tokenizer states, stored in |$self->{state}|. Where a trailing comment
## is given, it names the spec state or algorithm that the
## implementation-specific state belongs to.
771 sub DATA_STATE () { 0 }
772 sub ENTITY_DATA_STATE () { 1 }
773 sub TAG_OPEN_STATE () { 2 }
774 sub CLOSE_TAG_OPEN_STATE () { 3 }
775 sub TAG_NAME_STATE () { 4 }
776 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
777 sub ATTRIBUTE_NAME_STATE () { 6 }
778 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
779 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
780 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
781 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
782 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
783 sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
784 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
785 sub COMMENT_START_STATE () { 14 }
786 sub COMMENT_START_DASH_STATE () { 15 }
787 sub COMMENT_STATE () { 16 }
788 sub COMMENT_END_STATE () { 17 }
789 sub COMMENT_END_DASH_STATE () { 18 }
790 sub BOGUS_COMMENT_STATE () { 19 }
791 sub DOCTYPE_STATE () { 20 }
792 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
793 sub DOCTYPE_NAME_STATE () { 22 }
794 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
795 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
796 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
797 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
798 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
799 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
800 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
801 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
802 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
803 sub BOGUS_DOCTYPE_STATE () { 32 }
804 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
805 sub SELF_CLOSING_START_TAG_STATE () { 34 }
806 sub CDATA_SECTION_STATE () { 35 }
807 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
808 sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
809 sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
810 sub CDATA_PCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
811 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
812 sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
813 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
814 sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
815 sub ENTITY_STATE () { 44 } # "consume a character reference" in the spec
816
817 sub DOCTYPE_TOKEN () { 1 }
818 sub COMMENT_TOKEN () { 2 }
819 sub START_TAG_TOKEN () { 3 }
820 sub END_TAG_TOKEN () { 4 }
821 sub END_OF_FILE_TOKEN () { 5 }
822 sub CHARACTER_TOKEN () { 6 }
823
824 sub AFTER_HTML_IMS () { 0b100 }
825 sub HEAD_IMS () { 0b1000 }
826 sub BODY_IMS () { 0b10000 }
827 sub BODY_TABLE_IMS () { 0b100000 }
828 sub TABLE_IMS () { 0b1000000 }
829 sub ROW_IMS () { 0b10000000 }
830 sub BODY_AFTER_IMS () { 0b100000000 }
831 sub FRAME_IMS () { 0b1000000000 }
832 sub SELECT_IMS () { 0b10000000000 }
833 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
834 ## NOTE: "in foreign content" insertion mode is special; it is combined
835 ## with the secondary insertion mode. In this parser, they are stored
836 ## together in the bit-or'ed form.
837
838 ## NOTE: "initial" and "before html" insertion modes have no constants.
839
840 ## NOTE: "after after body" insertion mode.
841 sub AFTER_HTML_BODY_IM () { AFTER_HTML_IMS | BODY_AFTER_IMS }
842
843 ## NOTE: "after after frameset" insertion mode.
844 sub AFTER_HTML_FRAMESET_IM () { AFTER_HTML_IMS | FRAME_IMS }
845
846 sub IN_HEAD_IM () { HEAD_IMS | 0b00 }
847 sub IN_HEAD_NOSCRIPT_IM () { HEAD_IMS | 0b01 }
848 sub AFTER_HEAD_IM () { HEAD_IMS | 0b10 }
849 sub BEFORE_HEAD_IM () { HEAD_IMS | 0b11 }
850 sub IN_BODY_IM () { BODY_IMS }
851 sub IN_CELL_IM () { BODY_IMS | BODY_TABLE_IMS | 0b01 }
852 sub IN_CAPTION_IM () { BODY_IMS | BODY_TABLE_IMS | 0b10 }
853 sub IN_ROW_IM () { TABLE_IMS | ROW_IMS | 0b01 }
854 sub IN_TABLE_BODY_IM () { TABLE_IMS | ROW_IMS | 0b10 }
855 sub IN_TABLE_IM () { TABLE_IMS }
856 sub AFTER_BODY_IM () { BODY_AFTER_IMS }
857 sub IN_FRAMESET_IM () { FRAME_IMS | 0b01 }
858 sub AFTER_FRAMESET_IM () { FRAME_IMS | 0b10 }
859 sub IN_SELECT_IM () { SELECT_IMS | 0b01 }
860 sub IN_SELECT_IN_TABLE_IM () { SELECT_IMS | 0b10 }
861 sub IN_COLUMN_GROUP_IM () { 0b10 }
862
863 ## Implementations MUST act as if state machine in the spec
864
## Resets the tokenizer fields of the parser to their starting values
## (data state, PCDATA content model, no current token or attribute,
## empty token queue) and reads the first input character.
865 sub _initialize_tokenizer ($) {
866 my $self = shift;
867 $self->{state} = DATA_STATE; # MUST
868 #$self->{state_keyword}; # initialized when used
869 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
870 undef $self->{current_token};
871 undef $self->{current_attribute};
872 undef $self->{last_emitted_start_tag_name};
873 undef $self->{last_attribute_value_state};
874 delete $self->{self_closing};
875 $self->{char} = [];
876 # $self->{next_char}
877 !!!next-input-character; # NOTE(review): presumably a macro expanded when HTML.pm is generated from this .src file - confirm
878 $self->{token} = []; # queue of pushed-back tokens, consumed first by |_get_next_token|
879 # $self->{escape}
880 } # _initialize_tokenizer
881
882 ## A token has:
883 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
884 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
885 ## ->{name} (DOCTYPE_TOKEN)
886 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
887 ## ->{public_identifier} (DOCTYPE_TOKEN)
888 ## ->{system_identifier} (DOCTYPE_TOKEN)
889 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
890 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
891 ## ->{name}
892 ## ->{value}
893 ## ->{has_reference} == 1 or 0
894 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
895 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
896 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
897 ## while the token is pushed back to the stack.
898
899 ## Emitted token MUST immediately be handled by the tree construction state.
900
901 ## Before each step, UA MAY check to see if either one of the scripts in
902 ## "list of scripts that will execute as soon as possible" or the first
903 ## script in the "list of scripts that will execute asynchronously",
904 ## has completed loading. If one has, then it MUST be executed
905 ## and removed from the list.
906
907 ## NOTE: HTML5 "Writing HTML documents" section, applied to
908 ## documents and not to user agents and conformance checkers,
909 ## contains some requirements that are not detected by the
910 ## parsing algorithm:
911 ## - Some requirements on character encoding declarations. ## TODO
912 ## - "Elements MUST NOT contain content that their content model disallows."
913 ## ... Some are parse error, some are not (will be reported by c.c.).
914 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
915 ## - Text (in elements, attributes, and comments) SHOULD NOT contain
916 ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
917
918 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
919 ## be detected by the HTML5 parsing algorithm:
920 ## - Text,
921
922 sub _get_next_token ($) {
923 my $self = shift;
924
925 if ($self->{self_closing}) {
926 !!!parse-error (type => 'nestc', token => $self->{current_token});
927 ## NOTE: The |self_closing| flag is only set by start tag token.
928 ## In addition, when a start tag token is emitted, it is always set to
929 ## |current_token|.
930 delete $self->{self_closing};
931 }
932
933 if (@{$self->{token}}) {
934 $self->{self_closing} = $self->{token}->[0]->{self_closing};
935 return shift @{$self->{token}};
936 }
937
938 A: {
939 if ($self->{state} == DATA_STATE) {
940 if ($self->{next_char} == 0x0026) { # &
941 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
942 not $self->{escape}) {
943 !!!cp (1);
944 ## NOTE: In the spec, the tokenizer is switched to the
945 ## "entity data state". In this implementation, the tokenizer
946 ## is switched to the |ENTITY_STATE|, which is an implementation
947 ## of the "consume a character reference" algorithm.
948 #$self->{state} = ENTITY_DATA_STATE;
949 $self->{entity_in_attr} = 0;
950 $self->{entity_additional} = -1;
951 $self->{state} = ENTITY_STATE;
952 !!!next-input-character;
953 redo A;
954 } else {
955 !!!cp (2);
956 #
957 }
958 } elsif ($self->{next_char} == 0x002D) { # -
959 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
960 unless ($self->{escape}) {
961 if ($self->{prev_char}->[0] == 0x002D and # -
962 $self->{prev_char}->[1] == 0x0021 and # !
963 $self->{prev_char}->[2] == 0x003C) { # <
964 !!!cp (3);
965 $self->{escape} = 1;
966 } else {
967 !!!cp (4);
968 }
969 } else {
970 !!!cp (5);
971 }
972 }
973
974 #
975 } elsif ($self->{next_char} == 0x003C) { # <
976 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
977 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
978 not $self->{escape})) {
979 !!!cp (6);
980 $self->{state} = TAG_OPEN_STATE;
981 !!!next-input-character;
982 redo A;
983 } else {
984 !!!cp (7);
985 #
986 }
987 } elsif ($self->{next_char} == 0x003E) { # >
988 if ($self->{escape} and
989 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
990 if ($self->{prev_char}->[0] == 0x002D and # -
991 $self->{prev_char}->[1] == 0x002D) { # -
992 !!!cp (8);
993 delete $self->{escape};
994 } else {
995 !!!cp (9);
996 }
997 } else {
998 !!!cp (10);
999 }
1000
1001 #
1002 } elsif ($self->{next_char} == -1) {
1003 !!!cp (11);
1004 !!!emit ({type => END_OF_FILE_TOKEN,
1005 line => $self->{line}, column => $self->{column}});
1006 last A; ## TODO: ok?
1007 } else {
1008 !!!cp (12);
1009 }
1010 # Anything else
1011 my $token = {type => CHARACTER_TOKEN,
1012 data => chr $self->{next_char},
1013 line => $self->{line}, column => $self->{column},
1014 };
1015 ## Stay in the data state
1016 !!!next-input-character;
1017
1018 !!!emit ($token);
1019
1020 redo A;
1021 } elsif ($self->{state} == ENTITY_DATA_STATE) {
1022 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
1023
1024 my $token = $self->{entity_return};
1025
1026 $self->{state} = DATA_STATE;
1027 # next-input-character is already done
1028
1029 unless (defined $token) {
1030 !!!cp (13);
1031 !!!emit ({type => CHARACTER_TOKEN, data => '&',
1032 line => $l, column => $c,
1033 });
1034 } else {
1035 !!!cp (14);
1036 !!!emit ($token);
1037 }
1038
1039 redo A;
1040 } elsif ($self->{state} == TAG_OPEN_STATE) {
1041 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1042 if ($self->{next_char} == 0x002F) { # /
1043 !!!cp (15);
1044 !!!next-input-character;
1045 $self->{state} = CLOSE_TAG_OPEN_STATE;
1046 redo A;
1047 } else {
1048 !!!cp (16);
1049 ## reconsume
1050 $self->{state} = DATA_STATE;
1051
1052 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1053 line => $self->{line_prev},
1054 column => $self->{column_prev},
1055 });
1056
1057 redo A;
1058 }
1059 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
1060 if ($self->{next_char} == 0x0021) { # !
1061 !!!cp (17);
1062 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
1063 !!!next-input-character;
1064 redo A;
1065 } elsif ($self->{next_char} == 0x002F) { # /
1066 !!!cp (18);
1067 $self->{state} = CLOSE_TAG_OPEN_STATE;
1068 !!!next-input-character;
1069 redo A;
1070 } elsif (0x0041 <= $self->{next_char} and
1071 $self->{next_char} <= 0x005A) { # A..Z
1072 !!!cp (19);
1073 $self->{current_token}
1074 = {type => START_TAG_TOKEN,
1075 tag_name => chr ($self->{next_char} + 0x0020),
1076 line => $self->{line_prev},
1077 column => $self->{column_prev}};
1078 $self->{state} = TAG_NAME_STATE;
1079 !!!next-input-character;
1080 redo A;
1081 } elsif (0x0061 <= $self->{next_char} and
1082 $self->{next_char} <= 0x007A) { # a..z
1083 !!!cp (20);
1084 $self->{current_token} = {type => START_TAG_TOKEN,
1085 tag_name => chr ($self->{next_char}),
1086 line => $self->{line_prev},
1087 column => $self->{column_prev}};
1088 $self->{state} = TAG_NAME_STATE;
1089 !!!next-input-character;
1090 redo A;
1091 } elsif ($self->{next_char} == 0x003E) { # >
1092 !!!cp (21);
1093 !!!parse-error (type => 'empty start tag',
1094 line => $self->{line_prev},
1095 column => $self->{column_prev});
1096 $self->{state} = DATA_STATE;
1097 !!!next-input-character;
1098
1099 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
1100 line => $self->{line_prev},
1101 column => $self->{column_prev},
1102 });
1103
1104 redo A;
1105 } elsif ($self->{next_char} == 0x003F) { # ?
1106 !!!cp (22);
1107 !!!parse-error (type => 'pio',
1108 line => $self->{line_prev},
1109 column => $self->{column_prev});
1110 $self->{state} = BOGUS_COMMENT_STATE;
1111 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1112 line => $self->{line_prev},
1113 column => $self->{column_prev},
1114 };
1115 ## $self->{next_char} is intentionally left as is
1116 redo A;
1117 } else {
1118 !!!cp (23);
1119 !!!parse-error (type => 'bare stago',
1120 line => $self->{line_prev},
1121 column => $self->{column_prev});
1122 $self->{state} = DATA_STATE;
1123 ## reconsume
1124
1125 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1126 line => $self->{line_prev},
1127 column => $self->{column_prev},
1128 });
1129
1130 redo A;
1131 }
1132 } else {
1133 die "$0: $self->{content_model} in tag open";
1134 }
1135 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
1136 ## NOTE: The "close tag open state" in the spec is implemented as
1137 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_PCDATA_CLOSE_TAG_STATE|.
1138
1139 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
1140 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1141 if (defined $self->{last_emitted_start_tag_name}) {
1142 $self->{state} = CDATA_PCDATA_CLOSE_TAG_STATE;
1143 $self->{state_keyword} = '';
1144 ## Reconsume.
1145 redo A;
1146 } else {
1147 ## No start tag token has ever been emitted
1148 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
1149 !!!cp (28);
1150 $self->{state} = DATA_STATE;
1151 ## Reconsume.
1152 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1153 line => $l, column => $c,
1154 });
1155 redo A;
1156 }
1157 }
1158
1159 if (0x0041 <= $self->{next_char} and
1160 $self->{next_char} <= 0x005A) { # A..Z
1161 !!!cp (29);
1162 $self->{current_token}
1163 = {type => END_TAG_TOKEN,
1164 tag_name => chr ($self->{next_char} + 0x0020),
1165 line => $l, column => $c};
1166 $self->{state} = TAG_NAME_STATE;
1167 !!!next-input-character;
1168 redo A;
1169 } elsif (0x0061 <= $self->{next_char} and
1170 $self->{next_char} <= 0x007A) { # a..z
1171 !!!cp (30);
1172 $self->{current_token} = {type => END_TAG_TOKEN,
1173 tag_name => chr ($self->{next_char}),
1174 line => $l, column => $c};
1175 $self->{state} = TAG_NAME_STATE;
1176 !!!next-input-character;
1177 redo A;
1178 } elsif ($self->{next_char} == 0x003E) { # >
1179 !!!cp (31);
1180 !!!parse-error (type => 'empty end tag',
1181 line => $self->{line_prev}, ## "<" in "</>"
1182 column => $self->{column_prev} - 1);
1183 $self->{state} = DATA_STATE;
1184 !!!next-input-character;
1185 redo A;
1186 } elsif ($self->{next_char} == -1) {
1187 !!!cp (32);
1188 !!!parse-error (type => 'bare etago');
1189 $self->{state} = DATA_STATE;
1190 # reconsume
1191
1192 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1193 line => $l, column => $c,
1194 });
1195
1196 redo A;
1197 } else {
1198 !!!cp (33);
1199 !!!parse-error (type => 'bogus end tag');
1200 $self->{state} = BOGUS_COMMENT_STATE;
1201 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1202 line => $self->{line_prev}, # "<" of "</"
1203 column => $self->{column_prev} - 1,
1204 };
1205 ## NOTE: $self->{next_char} is intentionally left as is.
1206 ## Although the "anything else" case of the spec does not explicitly
1207 ## state that the next input character is to be reconsumed,
1208 ## it will be included in the |data| of the comment token
1209 ## generated from the bogus end tag, as defined in the
1210 ## "bogus comment state" entry.
1211 redo A;
1212 }
1213 } elsif ($self->{state} == CDATA_PCDATA_CLOSE_TAG_STATE) {
1214 my $ch = substr $self->{last_emitted_start_tag_name}, length $self->{state_keyword}, 1;
1215 if (length $ch) {
1216 my $CH = $ch;
1217 $ch =~ tr/a-z/A-Z/;
1218 my $nch = chr $self->{next_char};
1219 if ($nch eq $ch or $nch eq $CH) {
1220 !!!cp (24);
1221 ## Stay in the state.
1222 $self->{state_keyword} .= $nch;
1223 !!!next-input-character;
1224 redo A;
1225 } else {
1226 !!!cp (25);
1227 $self->{state} = DATA_STATE;
1228 ## Reconsume.
1229 !!!emit ({type => CHARACTER_TOKEN,
1230 data => '</' . $self->{state_keyword},
1231 line => $self->{line_prev},
1232 column => $self->{column_prev} - 1 - length $self->{state_keyword},
1233 });
1234 redo A;
1235 }
1236 } else { # after "</{tag-name}"
1237 unless ({
1238 0x0009 => 1, # HT
1239 0x000A => 1, # LF
1240 0x000B => 1, # VT
1241 0x000C => 1, # FF
1242 0x0020 => 1, # SP
1243 0x003E => 1, # >
1244 0x002F => 1, # /
1245 -1 => 1, # EOF
1246 }->{$self->{next_char}}) {
1247 !!!cp (26);
1248 ## Reconsume.
1249 $self->{state} = DATA_STATE;
1250 !!!emit ({type => CHARACTER_TOKEN,
1251 data => '</' . $self->{state_keyword},
1252 line => $self->{line_prev},
1253 column => $self->{column_prev} - 1 - length $self->{state_keyword},
1254 });
1255 redo A;
1256 } else {
1257 !!!cp (27);
1258 $self->{current_token}
1259 = {type => END_TAG_TOKEN,
1260 tag_name => $self->{last_emitted_start_tag_name},
1261 line => $self->{line_prev},
1262 column => $self->{column_prev} - 1 - length $self->{state_keyword}};
1263 $self->{state} = TAG_NAME_STATE;
1264 ## Reconsume.
1265 redo A;
1266 }
1267 }
1268 } elsif ($self->{state} == TAG_NAME_STATE) {
1269 if ($self->{next_char} == 0x0009 or # HT
1270 $self->{next_char} == 0x000A or # LF
1271 $self->{next_char} == 0x000B or # VT
1272 $self->{next_char} == 0x000C or # FF
1273 $self->{next_char} == 0x0020) { # SP
1274 !!!cp (34);
1275 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1276 !!!next-input-character;
1277 redo A;
1278 } elsif ($self->{next_char} == 0x003E) { # >
1279 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1280 !!!cp (35);
1281 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1282 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1283 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1284 #if ($self->{current_token}->{attributes}) {
1285 # ## NOTE: This should never be reached.
1286 # !!! cp (36);
1287 # !!! parse-error (type => 'end tag attribute');
1288 #} else {
1289 !!!cp (37);
1290 #}
1291 } else {
1292 die "$0: $self->{current_token}->{type}: Unknown token type";
1293 }
1294 $self->{state} = DATA_STATE;
1295 !!!next-input-character;
1296
1297 !!!emit ($self->{current_token}); # start tag or end tag
1298
1299 redo A;
1300 } elsif (0x0041 <= $self->{next_char} and
1301 $self->{next_char} <= 0x005A) { # A..Z
1302 !!!cp (38);
1303 $self->{current_token}->{tag_name} .= chr ($self->{next_char} + 0x0020);
1304 # start tag or end tag
1305 ## Stay in this state
1306 !!!next-input-character;
1307 redo A;
1308 } elsif ($self->{next_char} == -1) {
1309 !!!parse-error (type => 'unclosed tag');
1310 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1311 !!!cp (39);
1312 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1313 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1314 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1315 #if ($self->{current_token}->{attributes}) {
1316 # ## NOTE: This state should never be reached.
1317 # !!! cp (40);
1318 # !!! parse-error (type => 'end tag attribute');
1319 #} else {
1320 !!!cp (41);
1321 #}
1322 } else {
1323 die "$0: $self->{current_token}->{type}: Unknown token type";
1324 }
1325 $self->{state} = DATA_STATE;
1326 # reconsume
1327
1328 !!!emit ($self->{current_token}); # start tag or end tag
1329
1330 redo A;
1331 } elsif ($self->{next_char} == 0x002F) { # /
1332 !!!cp (42);
1333 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1334 !!!next-input-character;
1335 redo A;
1336 } else {
1337 !!!cp (44);
1338 $self->{current_token}->{tag_name} .= chr $self->{next_char};
1339 # start tag or end tag
1340 ## Stay in the state
1341 !!!next-input-character;
1342 redo A;
1343 }
1344 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1345 if ($self->{next_char} == 0x0009 or # HT
1346 $self->{next_char} == 0x000A or # LF
1347 $self->{next_char} == 0x000B or # VT
1348 $self->{next_char} == 0x000C or # FF
1349 $self->{next_char} == 0x0020) { # SP
1350 !!!cp (45);
1351 ## Stay in the state
1352 !!!next-input-character;
1353 redo A;
1354 } elsif ($self->{next_char} == 0x003E) { # >
1355 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1356 !!!cp (46);
1357 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1358 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1359 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1360 if ($self->{current_token}->{attributes}) {
1361 !!!cp (47);
1362 !!!parse-error (type => 'end tag attribute');
1363 } else {
1364 !!!cp (48);
1365 }
1366 } else {
1367 die "$0: $self->{current_token}->{type}: Unknown token type";
1368 }
1369 $self->{state} = DATA_STATE;
1370 !!!next-input-character;
1371
1372 !!!emit ($self->{current_token}); # start tag or end tag
1373
1374 redo A;
1375 } elsif (0x0041 <= $self->{next_char} and
1376 $self->{next_char} <= 0x005A) { # A..Z
1377 !!!cp (49);
1378 $self->{current_attribute}
1379 = {name => chr ($self->{next_char} + 0x0020),
1380 value => '',
1381 line => $self->{line}, column => $self->{column}};
1382 $self->{state} = ATTRIBUTE_NAME_STATE;
1383 !!!next-input-character;
1384 redo A;
1385 } elsif ($self->{next_char} == 0x002F) { # /
1386 !!!cp (50);
1387 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1388 !!!next-input-character;
1389 redo A;
1390 } elsif ($self->{next_char} == -1) {
1391 !!!parse-error (type => 'unclosed tag');
1392 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1393 !!!cp (52);
1394 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1395 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1396 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1397 if ($self->{current_token}->{attributes}) {
1398 !!!cp (53);
1399 !!!parse-error (type => 'end tag attribute');
1400 } else {
1401 !!!cp (54);
1402 }
1403 } else {
1404 die "$0: $self->{current_token}->{type}: Unknown token type";
1405 }
1406 $self->{state} = DATA_STATE;
1407 # reconsume
1408
1409 !!!emit ($self->{current_token}); # start tag or end tag
1410
1411 redo A;
1412 } else {
1413 if ({
1414 0x0022 => 1, # "
1415 0x0027 => 1, # '
1416 0x003D => 1, # =
1417 }->{$self->{next_char}}) {
1418 !!!cp (55);
1419 !!!parse-error (type => 'bad attribute name');
1420 } else {
1421 !!!cp (56);
1422 }
1423 $self->{current_attribute}
1424 = {name => chr ($self->{next_char}),
1425 value => '',
1426 line => $self->{line}, column => $self->{column}};
1427 $self->{state} = ATTRIBUTE_NAME_STATE;
1428 !!!next-input-character;
1429 redo A;
1430 }
1431 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1432 my $before_leave = sub {
1433 if (exists $self->{current_token}->{attributes} # start tag or end tag
1434 ->{$self->{current_attribute}->{name}}) { # MUST
1435 !!!cp (57);
1436 !!!parse-error (type => 'duplicate attribute', text => $self->{current_attribute}->{name}, line => $self->{current_attribute}->{line}, column => $self->{current_attribute}->{column});
1437 ## Discard $self->{current_attribute} # MUST
1438 } else {
1439 !!!cp (58);
1440 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
1441 = $self->{current_attribute};
1442 }
1443 }; # $before_leave
1444
1445 if ($self->{next_char} == 0x0009 or # HT
1446 $self->{next_char} == 0x000A or # LF
1447 $self->{next_char} == 0x000B or # VT
1448 $self->{next_char} == 0x000C or # FF
1449 $self->{next_char} == 0x0020) { # SP
1450 !!!cp (59);
1451 $before_leave->();
1452 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1453 !!!next-input-character;
1454 redo A;
1455 } elsif ($self->{next_char} == 0x003D) { # =
1456 !!!cp (60);
1457 $before_leave->();
1458 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1459 !!!next-input-character;
1460 redo A;
1461 } elsif ($self->{next_char} == 0x003E) { # >
1462 $before_leave->();
1463 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1464 !!!cp (61);
1465 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1466 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1467 !!!cp (62);
1468 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1469 if ($self->{current_token}->{attributes}) {
1470 !!!parse-error (type => 'end tag attribute');
1471 }
1472 } else {
1473 die "$0: $self->{current_token}->{type}: Unknown token type";
1474 }
1475 $self->{state} = DATA_STATE;
1476 !!!next-input-character;
1477
1478 !!!emit ($self->{current_token}); # start tag or end tag
1479
1480 redo A;
1481 } elsif (0x0041 <= $self->{next_char} and
1482 $self->{next_char} <= 0x005A) { # A..Z
1483 !!!cp (63);
1484 $self->{current_attribute}->{name} .= chr ($self->{next_char} + 0x0020);
1485 ## Stay in the state
1486 !!!next-input-character;
1487 redo A;
1488 } elsif ($self->{next_char} == 0x002F) { # /
1489 !!!cp (64);
1490 $before_leave->();
1491 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1492 !!!next-input-character;
1493 redo A;
1494 } elsif ($self->{next_char} == -1) {
1495 !!!parse-error (type => 'unclosed tag');
1496 $before_leave->();
1497 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1498 !!!cp (66);
1499 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1500 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1501 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1502 if ($self->{current_token}->{attributes}) {
1503 !!!cp (67);
1504 !!!parse-error (type => 'end tag attribute');
1505 } else {
1506 ## NOTE: This state should never be reached.
1507 !!!cp (68);
1508 }
1509 } else {
1510 die "$0: $self->{current_token}->{type}: Unknown token type";
1511 }
1512 $self->{state} = DATA_STATE;
1513 # reconsume
1514
1515 !!!emit ($self->{current_token}); # start tag or end tag
1516
1517 redo A;
1518 } else {
1519 if ($self->{next_char} == 0x0022 or # "
1520 $self->{next_char} == 0x0027) { # '
1521 !!!cp (69);
1522 !!!parse-error (type => 'bad attribute name');
1523 } else {
1524 !!!cp (70);
1525 }
1526 $self->{current_attribute}->{name} .= chr ($self->{next_char});
1527 ## Stay in the state
1528 !!!next-input-character;
1529 redo A;
1530 }
1531 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1532 if ($self->{next_char} == 0x0009 or # HT
1533 $self->{next_char} == 0x000A or # LF
1534 $self->{next_char} == 0x000B or # VT
1535 $self->{next_char} == 0x000C or # FF
1536 $self->{next_char} == 0x0020) { # SP
1537 !!!cp (71);
1538 ## Stay in the state
1539 !!!next-input-character;
1540 redo A;
1541 } elsif ($self->{next_char} == 0x003D) { # =
1542 !!!cp (72);
1543 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1544 !!!next-input-character;
1545 redo A;
1546 } elsif ($self->{next_char} == 0x003E) { # >
1547 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1548 !!!cp (73);
1549 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1550 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1551 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1552 if ($self->{current_token}->{attributes}) {
1553 !!!cp (74);
1554 !!!parse-error (type => 'end tag attribute');
1555 } else {
1556 ## NOTE: This state should never be reached.
1557 !!!cp (75);
1558 }
1559 } else {
1560 die "$0: $self->{current_token}->{type}: Unknown token type";
1561 }
1562 $self->{state} = DATA_STATE;
1563 !!!next-input-character;
1564
1565 !!!emit ($self->{current_token}); # start tag or end tag
1566
1567 redo A;
1568 } elsif (0x0041 <= $self->{next_char} and
1569 $self->{next_char} <= 0x005A) { # A..Z
1570 !!!cp (76);
1571 $self->{current_attribute}
1572 = {name => chr ($self->{next_char} + 0x0020),
1573 value => '',
1574 line => $self->{line}, column => $self->{column}};
1575 $self->{state} = ATTRIBUTE_NAME_STATE;
1576 !!!next-input-character;
1577 redo A;
1578 } elsif ($self->{next_char} == 0x002F) { # /
1579 !!!cp (77);
1580 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1581 !!!next-input-character;
1582 redo A;
1583 } elsif ($self->{next_char} == -1) {
1584 !!!parse-error (type => 'unclosed tag');
1585 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1586 !!!cp (79);
1587 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1588 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1589 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1590 if ($self->{current_token}->{attributes}) {
1591 !!!cp (80);
1592 !!!parse-error (type => 'end tag attribute');
1593 } else {
1594 ## NOTE: This state should never be reached.
1595 !!!cp (81);
1596 }
1597 } else {
1598 die "$0: $self->{current_token}->{type}: Unknown token type";
1599 }
1600 $self->{state} = DATA_STATE;
1601 # reconsume
1602
1603 !!!emit ($self->{current_token}); # start tag or end tag
1604
1605 redo A;
1606 } else {
1607 if ($self->{next_char} == 0x0022 or # "
1608 $self->{next_char} == 0x0027) { # '
1609 !!!cp (78);
1610 !!!parse-error (type => 'bad attribute name');
1611 } else {
1612 !!!cp (82);
1613 }
1614 $self->{current_attribute}
1615 = {name => chr ($self->{next_char}),
1616 value => '',
1617 line => $self->{line}, column => $self->{column}};
1618 $self->{state} = ATTRIBUTE_NAME_STATE;
1619 !!!next-input-character;
1620 redo A;
1621 }
1622 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1623 if ($self->{next_char} == 0x0009 or # HT
1624 $self->{next_char} == 0x000A or # LF
1625 $self->{next_char} == 0x000B or # VT
1626 $self->{next_char} == 0x000C or # FF
1627 $self->{next_char} == 0x0020) { # SP
1628 !!!cp (83);
1629 ## Stay in the state
1630 !!!next-input-character;
1631 redo A;
1632 } elsif ($self->{next_char} == 0x0022) { # "
1633 !!!cp (84);
1634 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1635 !!!next-input-character;
1636 redo A;
1637 } elsif ($self->{next_char} == 0x0026) { # &
1638 !!!cp (85);
1639 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1640 ## reconsume
1641 redo A;
1642 } elsif ($self->{next_char} == 0x0027) { # '
1643 !!!cp (86);
1644 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1645 !!!next-input-character;
1646 redo A;
1647 } elsif ($self->{next_char} == 0x003E) { # >
1648 !!!parse-error (type => 'empty unquoted attribute value');
1649 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1650 !!!cp (87);
1651 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1652 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1653 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1654 if ($self->{current_token}->{attributes}) {
1655 !!!cp (88);
1656 !!!parse-error (type => 'end tag attribute');
1657 } else {
1658 ## NOTE: This state should never be reached.
1659 !!!cp (89);
1660 }
1661 } else {
1662 die "$0: $self->{current_token}->{type}: Unknown token type";
1663 }
1664 $self->{state} = DATA_STATE;
1665 !!!next-input-character;
1666
1667 !!!emit ($self->{current_token}); # start tag or end tag
1668
1669 redo A;
1670 } elsif ($self->{next_char} == -1) {
1671 !!!parse-error (type => 'unclosed tag');
1672 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1673 !!!cp (90);
1674 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1675 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1676 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1677 if ($self->{current_token}->{attributes}) {
1678 !!!cp (91);
1679 !!!parse-error (type => 'end tag attribute');
1680 } else {
1681 ## NOTE: This state should never be reached.
1682 !!!cp (92);
1683 }
1684 } else {
1685 die "$0: $self->{current_token}->{type}: Unknown token type";
1686 }
1687 $self->{state} = DATA_STATE;
1688 ## reconsume
1689
1690 !!!emit ($self->{current_token}); # start tag or end tag
1691
1692 redo A;
1693 } else {
1694 if ($self->{next_char} == 0x003D) { # =
1695 !!!cp (93);
1696 !!!parse-error (type => 'bad attribute value');
1697 } else {
1698 !!!cp (94);
1699 }
1700 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1701 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1702 !!!next-input-character;
1703 redo A;
1704 }
1705 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1706 if ($self->{next_char} == 0x0022) { # "
1707 !!!cp (95);
1708 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1709 !!!next-input-character;
1710 redo A;
1711 } elsif ($self->{next_char} == 0x0026) { # &
1712 !!!cp (96);
1713 $self->{last_attribute_value_state} = $self->{state};
1714 ## NOTE: In the spec, the tokenizer is switched to the
1715 ## "entity in attribute value state". In this implementation, the
1716 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1717 ## implementation of the "consume a character reference" algorithm.
1718 #$self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1719 $self->{entity_in_attr} = 1;
1720 $self->{entity_additional} = 0x0022; # "
1721 $self->{state} = ENTITY_STATE;
1722 !!!next-input-character;
1723 redo A;
1724 } elsif ($self->{next_char} == -1) {
1725 !!!parse-error (type => 'unclosed attribute value');
1726 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1727 !!!cp (97);
1728 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1729 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1730 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1731 if ($self->{current_token}->{attributes}) {
1732 !!!cp (98);
1733 !!!parse-error (type => 'end tag attribute');
1734 } else {
1735 ## NOTE: This state should never be reached.
1736 !!!cp (99);
1737 }
1738 } else {
1739 die "$0: $self->{current_token}->{type}: Unknown token type";
1740 }
1741 $self->{state} = DATA_STATE;
1742 ## reconsume
1743
1744 !!!emit ($self->{current_token}); # start tag or end tag
1745
1746 redo A;
1747 } else {
1748 !!!cp (100);
1749 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1750 ## Stay in the state
1751 !!!next-input-character;
1752 redo A;
1753 }
1754 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1755 if ($self->{next_char} == 0x0027) { # '
1756 !!!cp (101);
1757 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1758 !!!next-input-character;
1759 redo A;
1760 } elsif ($self->{next_char} == 0x0026) { # &
1761 !!!cp (102);
1762 $self->{last_attribute_value_state} = $self->{state};
1763 ## NOTE: In the spec, the tokenizer is switched to the
1764 ## "entity in attribute value state". In this implementation, the
1765 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1766 ## implementation of the "consume a character reference" algorithm.
1767 #$self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1768 $self->{entity_in_attr} = 1;
1769 $self->{entity_additional} = 0x0027; # '
1770 $self->{state} = ENTITY_STATE;
1771 !!!next-input-character;
1772 redo A;
1773 } elsif ($self->{next_char} == -1) {
1774 !!!parse-error (type => 'unclosed attribute value');
1775 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1776 !!!cp (103);
1777 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1778 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1779 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1780 if ($self->{current_token}->{attributes}) {
1781 !!!cp (104);
1782 !!!parse-error (type => 'end tag attribute');
1783 } else {
1784 ## NOTE: This state should never be reached.
1785 !!!cp (105);
1786 }
1787 } else {
1788 die "$0: $self->{current_token}->{type}: Unknown token type";
1789 }
1790 $self->{state} = DATA_STATE;
1791 ## reconsume
1792
1793 !!!emit ($self->{current_token}); # start tag or end tag
1794
1795 redo A;
1796 } else {
1797 !!!cp (106);
1798 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1799 ## Stay in the state
1800 !!!next-input-character;
1801 redo A;
1802 }
1803 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1804 if ($self->{next_char} == 0x0009 or # HT
1805 $self->{next_char} == 0x000A or # LF
1806 $self->{next_char} == 0x000B or # VT
1807 $self->{next_char} == 0x000C or # FF
1808 $self->{next_char} == 0x0020) { # SP
1809 !!!cp (107);
1810 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1811 !!!next-input-character;
1812 redo A;
1813 } elsif ($self->{next_char} == 0x0026) { # &
1814 !!!cp (108);
1815 $self->{last_attribute_value_state} = $self->{state};
1816 ## NOTE: In the spec, the tokenizer is switched to the
1817 ## "entity in attribute value state". In this implementation, the
1818 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1819 ## implementation of the "consume a character reference" algorithm.
1820 #$self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1821 $self->{entity_in_attr} = 1;
1822 $self->{entity_additional} = -1;
1823 $self->{state} = ENTITY_STATE;
1824 !!!next-input-character;
1825 redo A;
1826 } elsif ($self->{next_char} == 0x003E) { # >
1827 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1828 !!!cp (109);
1829 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1830 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1831 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1832 if ($self->{current_token}->{attributes}) {
1833 !!!cp (110);
1834 !!!parse-error (type => 'end tag attribute');
1835 } else {
1836 ## NOTE: This state should never be reached.
1837 !!!cp (111);
1838 }
1839 } else {
1840 die "$0: $self->{current_token}->{type}: Unknown token type";
1841 }
1842 $self->{state} = DATA_STATE;
1843 !!!next-input-character;
1844
1845 !!!emit ($self->{current_token}); # start tag or end tag
1846
1847 redo A;
1848 } elsif ($self->{next_char} == -1) {
1849 !!!parse-error (type => 'unclosed tag');
1850 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1851 !!!cp (112);
1852 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1853 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1854 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1855 if ($self->{current_token}->{attributes}) {
1856 !!!cp (113);
1857 !!!parse-error (type => 'end tag attribute');
1858 } else {
1859 ## NOTE: This state should never be reached.
1860 !!!cp (114);
1861 }
1862 } else {
1863 die "$0: $self->{current_token}->{type}: Unknown token type";
1864 }
1865 $self->{state} = DATA_STATE;
1866 ## reconsume
1867
1868 !!!emit ($self->{current_token}); # start tag or end tag
1869
1870 redo A;
1871 } else {
1872 if ({
1873 0x0022 => 1, # "
1874 0x0027 => 1, # '
1875 0x003D => 1, # =
1876 }->{$self->{next_char}}) {
1877 !!!cp (115);
1878 !!!parse-error (type => 'bad attribute value');
1879 } else {
1880 !!!cp (116);
1881 }
1882 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1883 ## Stay in the state
1884 !!!next-input-character;
1885 redo A;
1886 }
1887 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1888 my $token = $self->{entity_return};
1889
1890 unless (defined $token) {
1891 !!!cp (117);
1892 $self->{current_attribute}->{value} .= '&';
1893 } else {
1894 !!!cp (118);
1895 $self->{current_attribute}->{value} .= $token->{data};
1896 $self->{current_attribute}->{has_reference} = $token->{has_reference};
1897 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1898 }
1899
1900 $self->{state} = $self->{last_attribute_value_state};
1901 # next-input-character is already done
1902 redo A;
} elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
  ## "After attribute value (quoted) state": decides what follows the
  ## closing quotation mark of an attribute value.
  ##   - white space -> before attribute name state
  ##   - ">"         -> emit the current tag, data state
  ##   - "/"         -> self-closing start tag state
  ##   - EOF         -> parse error, emit the current tag, reconsume EOF
  ##   - otherwise   -> parse error, reconsume in before attribute name state
  if ($self->{next_char} == 0x0009 or # HT
      $self->{next_char} == 0x000A or # LF
      $self->{next_char} == 0x000B or # VT
      $self->{next_char} == 0x000C or # FF
      $self->{next_char} == 0x0020) { # SP
    ## NOTE: Checkpoint id 118 is also used by
    ## |ENTITY_IN_ATTRIBUTE_VALUE_STATE|.
    !!!cp (118);
    $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
    !!!next-input-character;
    redo A;
  } elsif ($self->{next_char} == 0x003E) { # >
    if ($self->{current_token}->{type} == START_TAG_TOKEN) {
      !!!cp (119);
      ## Remember the tag name of the start tag being emitted.
      $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
    } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
      $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
      if ($self->{current_token}->{attributes}) {
        !!!cp (120);
        !!!parse-error (type => 'end tag attribute');
      } else {
        ## NOTE: This state should never be reached.
        !!!cp (121);
      }
    } else {
      die "$0: $self->{current_token}->{type}: Unknown token type";
    }
    $self->{state} = DATA_STATE;
    !!!next-input-character;

    !!!emit ($self->{current_token}); # start tag or end tag

    redo A;
  } elsif ($self->{next_char} == 0x002F) { # /
    !!!cp (122);
    $self->{state} = SELF_CLOSING_START_TAG_STATE;
    !!!next-input-character;
    redo A;
  } elsif ($self->{next_char} == -1) {
    ## EOF inside a tag: report it, then emit whatever tag was collected.
    !!!parse-error (type => 'unclosed tag');
    if ($self->{current_token}->{type} == START_TAG_TOKEN) {
      !!!cp (122.3);
      $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
    } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
      ## An end tag is about to be emitted, so the content model flag is
      ## reset here as well, as in the ">" branch above and in the EOF
      ## branch of the unquoted attribute value state.
      $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
      if ($self->{current_token}->{attributes}) {
        !!!cp (122.1);
        !!!parse-error (type => 'end tag attribute');
      } else {
        ## NOTE: This state should never be reached.
        !!!cp (122.2);
      }
    } else {
      die "$0: $self->{current_token}->{type}: Unknown token type";
    }
    $self->{state} = DATA_STATE;
    ## Reconsume.
    !!!emit ($self->{current_token}); # start tag or end tag
    redo A;
  } else {
    ## Any other character directly after the closing quote.
    !!!cp ('124.1');
    !!!parse-error (type => 'no space between attributes');
    $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
    ## reconsume
    redo A;
  }
} elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
  ## "Self-closing start tag state": entered after a "/" inside a tag.
  ##   - ">"       -> emit the current tag (marked self-closing when it is
  ##                  a start tag), data state
  ##   - EOF       -> parse error, emit the current tag, reconsume EOF
  ##   - otherwise -> parse error, reconsume in before attribute name state
  if ($self->{next_char} == 0x003E) { # >
    if ($self->{current_token}->{type} == END_TAG_TOKEN) {
      !!!cp ('124.2');
      !!!parse-error (type => 'nestc', token => $self->{current_token});
      ## TODO: Different type than slash in start tag
      $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
      if ($self->{current_token}->{attributes}) {
        ## NOTE: Checkpoint id '124.4' is also used by the final |else|
        ## branch of this state.
        !!!cp ('124.4');
        !!!parse-error (type => 'end tag attribute');
      } else {
        ## NOTE: Checkpoint id '124.5' is also used by the EOF branch of
        ## this state.
        !!!cp ('124.5');
      }
      ## TODO: Test |<title></title/>|
    } else {
      !!!cp ('124.3');
      $self->{self_closing} = 1;
    }

    $self->{state} = DATA_STATE;
    !!!next-input-character;

    !!!emit ($self->{current_token}); # start tag or end tag

    redo A;
  } elsif ($self->{next_char} == -1) {
    ## EOF inside a tag: report it, then emit whatever tag was collected.
    !!!parse-error (type => 'unclosed tag');
    if ($self->{current_token}->{type} == START_TAG_TOKEN) {
      !!!cp (124.7);
      $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
    } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
      ## An end tag is about to be emitted, so the content model flag is
      ## reset here as well, as in the ">" branch above.
      $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
      if ($self->{current_token}->{attributes}) {
        !!!cp (124.5);
        !!!parse-error (type => 'end tag attribute');
      } else {
        ## NOTE: This state should never be reached.
        !!!cp (124.6);
      }
    } else {
      die "$0: $self->{current_token}->{type}: Unknown token type";
    }
    $self->{state} = DATA_STATE;
    ## Reconsume.
    !!!emit ($self->{current_token}); # start tag or end tag
    redo A;
  } else {
    ## A "/" that is not immediately followed by ">".
    !!!cp ('124.4');
    !!!parse-error (type => 'nestc');
    ## TODO: This error type is wrong.
    $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
    ## Reconsume.
    redo A;
  }
2020 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2021 ## (only happen if PCDATA state)
2022
2023 ## NOTE: Unlike spec's "bogus comment state", this implementation
2024 ## consumes characters one-by-one basis.
2025
2026 if ($self->{next_char} == 0x003E) { # >
2027 !!!cp (124);
2028 $self->{state} = DATA_STATE;
2029 !!!next-input-character;
2030
2031 !!!emit ($self->{current_token}); # comment
2032 redo A;
2033 } elsif ($self->{next_char} == -1) {
2034 !!!cp (125);
2035 $self->{state} = DATA_STATE;
2036 ## reconsume
2037
2038 !!!emit ($self->{current_token}); # comment
2039 redo A;
2040 } else {
2041 !!!cp (126);
2042 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
2043 ## Stay in the state.
2044 !!!next-input-character;
2045 redo A;
2046 }
2047 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2048 ## (only happen if PCDATA state)
2049
2050 if ($self->{next_char} == 0x002D) { # -
2051 !!!cp (133);
2052 $self->{state} = MD_HYPHEN_STATE;
2053 !!!next-input-character;
2054 redo A;
2055 } elsif ($self->{next_char} == 0x0044 or # D
2056 $self->{next_char} == 0x0064) { # d
2057 ## ASCII case-insensitive.
2058 !!!cp (130);
2059 $self->{state} = MD_DOCTYPE_STATE;
2060 $self->{state_keyword} = chr $self->{next_char};
2061 !!!next-input-character;
2062 redo A;
2063 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2064 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
2065 $self->{next_char} == 0x005B) { # [
2066 !!!cp (135.4);
2067 $self->{state} = MD_CDATA_STATE;
2068 $self->{state_keyword} = '[';
2069 !!!next-input-character;
2070 redo A;
2071 } else {
2072 !!!cp (136);
2073 }
2074
2075 !!!parse-error (type => 'bogus comment',
2076 line => $self->{line_prev},
2077 column => $self->{column_prev} - 1);
2078 ## Reconsume.
2079 $self->{state} = BOGUS_COMMENT_STATE;
2080 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
2081 line => $self->{line_prev},
2082 column => $self->{column_prev} - 1,
2083 };
2084 redo A;
2085 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2086 if ($self->{next_char} == 0x002D) { # -
2087 !!!cp (127);
2088 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
2089 line => $self->{line_prev},
2090 column => $self->{column_prev} - 2,
2091 };
2092 $self->{state} = COMMENT_START_STATE;
2093 !!!next-input-character;
2094 redo A;
2095 } else {
2096 !!!cp (128);
2097 !!!parse-error (type => 'bogus comment',
2098 line => $self->{line_prev},
2099 column => $self->{column_prev} - 2);
2100 $self->{state} = BOGUS_COMMENT_STATE;
2101 ## Reconsume.
2102 $self->{current_token} = {type => COMMENT_TOKEN,
2103 data => '-',
2104 line => $self->{line_prev},
2105 column => $self->{column_prev} - 2,
2106 };
2107 redo A;
2108 }
2109 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2110 ## ASCII case-insensitive.
2111 if ($self->{next_char} == [
2112 undef,
2113 0x004F, # O
2114 0x0043, # C
2115 0x0054, # T
2116 0x0059, # Y
2117 0x0050, # P
2118 ]->[length $self->{state_keyword}] or
2119 $self->{next_char} == [
2120 undef,
2121 0x006F, # o
2122 0x0063, # c
2123 0x0074, # t
2124 0x0079, # y
2125 0x0070, # p
2126 ]->[length $self->{state_keyword}]) {
2127 !!!cp (131);
2128 ## Stay in the state.
2129 $self->{state_keyword} .= chr $self->{next_char};
2130 !!!next-input-character;
2131 redo A;
2132 } elsif ((length $self->{state_keyword}) == 6 and
2133 ($self->{next_char} == 0x0045 or # E
2134 $self->{next_char} == 0x0065)) { # e
2135 !!!cp (129);
2136 $self->{state} = DOCTYPE_STATE;
2137 $self->{current_token} = {type => DOCTYPE_TOKEN,
2138 quirks => 1,
2139 line => $self->{line_prev},
2140 column => $self->{column_prev} - 7,
2141 };
2142 !!!next-input-character;
2143 redo A;
2144 } else {
2145 !!!cp (132);
2146 !!!parse-error (type => 'bogus comment',
2147 line => $self->{line_prev},
2148 column => $self->{column_prev} - 1 - length $self->{state_keyword});
2149 $self->{state} = BOGUS_COMMENT_STATE;
2150 ## Reconsume.
2151 $self->{current_token} = {type => COMMENT_TOKEN,
2152 data => $self->{state_keyword},
2153 line => $self->{line_prev},
2154 column => $self->{column_prev} - 1 - length $self->{state_keyword},
2155 };
2156 redo A;
2157 }
2158 } elsif ($self->{state} == MD_CDATA_STATE) {
2159 if ($self->{next_char} == {
2160 '[' => 0x0043, # C
2161 '[C' => 0x0044, # D
2162 '[CD' => 0x0041, # A
2163 '[CDA' => 0x0054, # T
2164 '[CDAT' => 0x0041, # A
2165 }->{$self->{state_keyword}}) {
2166 !!!cp (135.1);
2167 ## Stay in the state.
2168 $self->{state_keyword} .= chr $self->{next_char};
2169 !!!next-input-character;
2170 redo A;
2171 } elsif ($self->{state_keyword} eq '[CDATA' and
2172 $self->{next_char} == 0x005B) { # [
2173 !!!cp (135.2);
2174 $self->{current_token} = {type => CHARACTER_TOKEN,
2175 data => '',
2176 line => $self->{line_prev},
2177 column => $self->{column_prev} - 7};
2178 $self->{state} = CDATA_SECTION_STATE;
2179 !!!next-input-character;
2180 redo A;
2181 } else {
2182 !!!cp (135.3);
2183 !!!parse-error (type => 'bogus comment',
2184 line => $self->{line_prev},
2185 column => $self->{column_prev} - 1 - length $self->{state_keyword});
2186 $self->{state} = BOGUS_COMMENT_STATE;
2187 ## Reconsume.
2188 $self->{current_token} = {type => COMMENT_TOKEN,
2189 data => $self->{state_keyword},
2190 line => $self->{line_prev},
2191 column => $self->{column_prev} - 1 - length $self->{state_keyword},
2192 };
2193 redo A;
2194 }
2195 } elsif ($self->{state} == COMMENT_START_STATE) {
2196 if ($self->{next_char} == 0x002D) { # -
2197 !!!cp (137);
2198 $self->{state} = COMMENT_START_DASH_STATE;
2199 !!!next-input-character;
2200 redo A;
2201 } elsif ($self->{next_char} == 0x003E) { # >
2202 !!!cp (138);
2203 !!!parse-error (type => 'bogus comment');
2204 $self->{state} = DATA_STATE;
2205 !!!next-input-character;
2206
2207 !!!emit ($self->{current_token}); # comment
2208
2209 redo A;
2210 } elsif ($self->{next_char} == -1) {
2211 !!!cp (139);
2212 !!!parse-error (type => 'unclosed comment');
2213 $self->{state} = DATA_STATE;
2214 ## reconsume
2215
2216 !!!emit ($self->{current_token}); # comment
2217
2218 redo A;
2219 } else {
2220 !!!cp (140);
2221 $self->{current_token}->{data} # comment
2222 .= chr ($self->{next_char});
2223 $self->{state} = COMMENT_STATE;
2224 !!!next-input-character;
2225 redo A;
2226 }
2227 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2228 if ($self->{next_char} == 0x002D) { # -
2229 !!!cp (141);
2230 $self->{state} = COMMENT_END_STATE;
2231 !!!next-input-character;
2232 redo A;
2233 } elsif ($self->{next_char} == 0x003E) { # >
2234 !!!cp (142);
2235 !!!parse-error (type => 'bogus comment');
2236 $self->{state} = DATA_STATE;
2237 !!!next-input-character;
2238
2239 !!!emit ($self->{current_token}); # comment
2240
2241 redo A;
2242 } elsif ($self->{next_char} == -1) {
2243 !!!cp (143);
2244 !!!parse-error (type => 'unclosed comment');
2245 $self->{state} = DATA_STATE;
2246 ## reconsume
2247
2248 !!!emit ($self->{current_token}); # comment
2249
2250 redo A;
2251 } else {
2252 !!!cp (144);
2253 $self->{current_token}->{data} # comment
2254 .= '-' . chr ($self->{next_char});
2255 $self->{state} = COMMENT_STATE;
2256 !!!next-input-character;
2257 redo A;
2258 }
2259 } elsif ($self->{state} == COMMENT_STATE) {
2260 if ($self->{next_char} == 0x002D) { # -
2261 !!!cp (145);
2262 $self->{state} = COMMENT_END_DASH_STATE;
2263 !!!next-input-character;
2264 redo A;
2265 } elsif ($self->{next_char} == -1) {
2266 !!!cp (146);
2267 !!!parse-error (type => 'unclosed comment');
2268 $self->{state} = DATA_STATE;
2269 ## reconsume
2270
2271 !!!emit ($self->{current_token}); # comment
2272
2273 redo A;
2274 } else {
2275 !!!cp (147);
2276 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
2277 ## Stay in the state
2278 !!!next-input-character;
2279 redo A;
2280 }
2281 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2282 if ($self->{next_char} == 0x002D) { # -
2283 !!!cp (148);
2284 $self->{state} = COMMENT_END_STATE;
2285 !!!next-input-character;
2286 redo A;
2287 } elsif ($self->{next_char} == -1) {
2288 !!!cp (149);
2289 !!!parse-error (type => 'unclosed comment');
2290 $self->{state} = DATA_STATE;
2291 ## reconsume
2292
2293 !!!emit ($self->{current_token}); # comment
2294
2295 redo A;
2296 } else {
2297 !!!cp (150);
2298 $self->{current_token}->{data} .= '-' . chr ($self->{next_char}); # comment
2299 $self->{state} = COMMENT_STATE;
2300 !!!next-input-character;
2301 redo A;
2302 }
2303 } elsif ($self->{state} == COMMENT_END_STATE) {
2304 if ($self->{next_char} == 0x003E) { # >
2305 !!!cp (151);
2306 $self->{state} = DATA_STATE;
2307 !!!next-input-character;
2308
2309 !!!emit ($self->{current_token}); # comment
2310
2311 redo A;
2312 } elsif ($self->{next_char} == 0x002D) { # -
2313 !!!cp (152);
2314 !!!parse-error (type => 'dash in comment',
2315 line => $self->{line_prev},
2316 column => $self->{column_prev});
2317 $self->{current_token}->{data} .= '-'; # comment
2318 ## Stay in the state
2319 !!!next-input-character;
2320 redo A;
2321 } elsif ($self->{next_char} == -1) {
2322 !!!cp (153);
2323 !!!parse-error (type => 'unclosed comment');
2324 $self->{state} = DATA_STATE;
2325 ## reconsume
2326
2327 !!!emit ($self->{current_token}); # comment
2328
2329 redo A;
2330 } else {
2331 !!!cp (154);
2332 !!!parse-error (type => 'dash in comment',
2333 line => $self->{line_prev},
2334 column => $self->{column_prev});
2335 $self->{current_token}->{data} .= '--' . chr ($self->{next_char}); # comment
2336 $self->{state} = COMMENT_STATE;
2337 !!!next-input-character;
2338 redo A;
2339 }
2340 } elsif ($self->{state} == DOCTYPE_STATE) {
2341 if ($self->{next_char} == 0x0009 or # HT
2342 $self->{next_char} == 0x000A or # LF
2343 $self->{next_char} == 0x000B or # VT
2344 $self->{next_char} == 0x000C or # FF
2345 $self->{next_char} == 0x0020) { # SP
2346 !!!cp (155);
2347 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2348 !!!next-input-character;
2349 redo A;
2350 } else {
2351 !!!cp (156);
2352 !!!parse-error (type => 'no space before DOCTYPE name');
2353 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2354 ## reconsume
2355 redo A;
2356 }
2357 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2358 if ($self->{next_char} == 0x0009 or # HT
2359 $self->{next_char} == 0x000A or # LF
2360 $self->{next_char} == 0x000B or # VT
2361 $self->{next_char} == 0x000C or # FF
2362 $self->{next_char} == 0x0020) { # SP
2363 !!!cp (157);
2364 ## Stay in the state
2365 !!!next-input-character;
2366 redo A;
2367 } elsif ($self->{next_char} == 0x003E) { # >
2368 !!!cp (158);
2369 !!!parse-error (type => 'no DOCTYPE name');
2370 $self->{state} = DATA_STATE;
2371 !!!next-input-character;
2372
2373 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2374
2375 redo A;
2376 } elsif ($self->{next_char} == -1) {
2377 !!!cp (159);
2378 !!!parse-error (type => 'no DOCTYPE name');
2379 $self->{state} = DATA_STATE;
2380 ## reconsume
2381
2382 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2383
2384 redo A;
2385 } else {
2386 !!!cp (160);
2387 $self->{current_token}->{name} = chr $self->{next_char};
2388 delete $self->{current_token}->{quirks};
2389 ## ISSUE: "Set the token's name name to the" in the spec
2390 $self->{state} = DOCTYPE_NAME_STATE;
2391 !!!next-input-character;
2392 redo A;
2393 }
2394 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2395 ## ISSUE: Redundant "First," in the spec.
2396 if ($self->{next_char} == 0x0009 or # HT
2397 $self->{next_char} == 0x000A or # LF
2398 $self->{next_char} == 0x000B or # VT
2399 $self->{next_char} == 0x000C or # FF
2400 $self->{next_char} == 0x0020) { # SP
2401 !!!cp (161);
2402 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2403 !!!next-input-character;
2404 redo A;
2405 } elsif ($self->{next_char} == 0x003E) { # >
2406 !!!cp (162);
2407 $self->{state} = DATA_STATE;
2408 !!!next-input-character;
2409
2410 !!!emit ($self->{current_token}); # DOCTYPE
2411
2412 redo A;
2413 } elsif ($self->{next_char} == -1) {
2414 !!!cp (163);
2415 !!!parse-error (type => 'unclosed DOCTYPE');
2416 $self->{state} = DATA_STATE;
2417 ## reconsume
2418
2419 $self->{current_token}->{quirks} = 1;
2420 !!!emit ($self->{current_token}); # DOCTYPE
2421
2422 redo A;
2423 } else {
2424 !!!cp (164);
2425 $self->{current_token}->{name}
2426 .= chr ($self->{next_char}); # DOCTYPE
2427 ## Stay in the state
2428 !!!next-input-character;
2429 redo A;
2430 }
2431 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2432 if ($self->{next_char} == 0x0009 or # HT
2433 $self->{next_char} == 0x000A or # LF
2434 $self->{next_char} == 0x000B or # VT
2435 $self->{next_char} == 0x000C or # FF
2436 $self->{next_char} == 0x0020) { # SP
2437 !!!cp (165);
2438 ## Stay in the state
2439 !!!next-input-character;
2440 redo A;
2441 } elsif ($self->{next_char} == 0x003E) { # >
2442 !!!cp (166);
2443 $self->{state} = DATA_STATE;
2444 !!!next-input-character;
2445
2446 !!!emit ($self->{current_token}); # DOCTYPE
2447
2448 redo A;
2449 } elsif ($self->{next_char} == -1) {
2450 !!!cp (167);
2451 !!!parse-error (type => 'unclosed DOCTYPE');
2452 $self->{state} = DATA_STATE;
2453 ## reconsume
2454
2455 $self->{current_token}->{quirks} = 1;
2456 !!!emit ($self->{current_token}); # DOCTYPE
2457
2458 redo A;
2459 } elsif ($self->{next_char} == 0x0050 or # P
2460 $self->{next_char} == 0x0070) { # p
2461 $self->{state} = PUBLIC_STATE;
2462 $self->{state_keyword} = chr $self->{next_char};
2463 !!!next-input-character;
2464 redo A;
2465 } elsif ($self->{next_char} == 0x0053 or # S
2466 $self->{next_char} == 0x0073) { # s
2467 $self->{state} = SYSTEM_STATE;
2468 $self->{state_keyword} = chr $self->{next_char};
2469 !!!next-input-character;
2470 redo A;
2471 } else {
2472 !!!cp (180);
2473 !!!parse-error (type => 'string after DOCTYPE name');
2474 $self->{current_token}->{quirks} = 1;
2475
2476 $self->{state} = BOGUS_DOCTYPE_STATE;
2477 !!!next-input-character;
2478 redo A;
2479 }
2480 } elsif ($self->{state} == PUBLIC_STATE) {
2481 ## ASCII case-insensitive
2482 if ($self->{next_char} == [
2483 undef,
2484 0x0055, # U
2485 0x0042, # B
2486 0x004C, # L
2487 0x0049, # I
2488 ]->[length $self->{state_keyword}] or
2489 $self->{next_char} == [
2490 undef,
2491 0x0075, # u
2492 0x0062, # b
2493 0x006C, # l
2494 0x0069, # i
2495 ]->[length $self->{state_keyword}]) {
2496 !!!cp (175);
2497 ## Stay in the state.
2498 $self->{state_keyword} .= chr $self->{next_char};
2499 !!!next-input-character;
2500 redo A;
2501 } elsif ((length $self->{state_keyword}) == 5 and
2502 ($self->{next_char} == 0x0043 or # C
2503 $self->{next_char} == 0x0063)) { # c
2504 !!!cp (168);
2505 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2506 !!!next-input-character;
2507 redo A;
2508 } else {
2509 !!!cp (169);
2510 !!!parse-error (type => 'string after DOCTYPE name',
2511 line => $self->{line_prev},
2512 column => $self->{column_prev} + 1 - length $self->{state_keyword});
2513 $self->{current_token}->{quirks} = 1;
2514
2515 $self->{state} = BOGUS_DOCTYPE_STATE;
2516 ## Reconsume.
2517 redo A;
2518 }
2519 } elsif ($self->{state} == SYSTEM_STATE) {
2520 ## ASCII case-insensitive
2521 if ($self->{next_char} == [
2522 undef,
2523 0x0059, # Y
2524 0x0053, # S
2525 0x0054, # T
2526 0x0045, # E
2527 ]->[length $self->{state_keyword}] or
2528 $self->{next_char} == [
2529 undef,
2530 0x0079, # y
2531 0x0073, # s
2532 0x0074, # t
2533 0x0065, # e
2534 ]->[length $self->{state_keyword}]) {
2535 !!!cp (170);
2536 ## Stay in the state.
2537 $self->{state_keyword} .= chr $self->{next_char};
2538 !!!next-input-character;
2539 redo A;
2540 } elsif ((length $self->{state_keyword}) == 5 and
2541 ($self->{next_char} == 0x004D or # M
2542 $self->{next_char} == 0x006D)) { # m
2543 !!!cp (171);
2544 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2545 !!!next-input-character;
2546 redo A;
2547 } else {
2548 !!!cp (172);
2549 !!!parse-error (type => 'string after DOCTYPE name',
2550 line => $self->{line_prev},
2551 column => $self->{column_prev} + 1 - length $self->{state_keyword});
2552 $self->{current_token}->{quirks} = 1;
2553
2554 $self->{state} = BOGUS_DOCTYPE_STATE;
2555 ## Reconsume.
2556 redo A;
2557 }
} elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
  ## "Before DOCTYPE public identifier state": entered once the keyword
  ## |PUBLIC| has been matched by |PUBLIC_STATE|.
  ##   - white space -> stay
  ##   - '"' or "'"  -> start an empty public identifier, go to the
  ##                    corresponding quoted state
  ##   - ">"         -> parse error, emit the DOCTYPE with quirks flag set
  ##   - EOF         -> parse error, emit the DOCTYPE with quirks flag set,
  ##                    reconsume EOF
  ##   - otherwise   -> parse error, set quirks flag, bogus DOCTYPE state
  ##
  ## NOTE: |next_char| holds a code point number (or -1 for EOF), so it
  ## is compared with the numeric operator |==| throughout.
  if ({
        0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
        #0x000D => 1, # HT, LF, VT, FF, SP, CR
      }->{$self->{next_char}}) {
    !!!cp (181);
    ## Stay in the state
    !!!next-input-character;
    redo A;
  } elsif ($self->{next_char} == 0x0022) { # "
    !!!cp (182);
    $self->{current_token}->{public_identifier} = ''; # DOCTYPE
    $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
    !!!next-input-character;
    redo A;
  } elsif ($self->{next_char} == 0x0027) { # '
    !!!cp (183);
    $self->{current_token}->{public_identifier} = ''; # DOCTYPE
    $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
    !!!next-input-character;
    redo A;
  } elsif ($self->{next_char} == 0x003E) { # >
    !!!cp (184);
    !!!parse-error (type => 'no PUBLIC literal');

    $self->{state} = DATA_STATE;
    !!!next-input-character;

    $self->{current_token}->{quirks} = 1;
    !!!emit ($self->{current_token}); # DOCTYPE

    redo A;
  } elsif ($self->{next_char} == -1) {
    !!!cp (185);
    !!!parse-error (type => 'unclosed DOCTYPE');

    $self->{state} = DATA_STATE;
    ## reconsume

    $self->{current_token}->{quirks} = 1;
    !!!emit ($self->{current_token}); # DOCTYPE

    redo A;
  } else {
    !!!cp (186);
    !!!parse-error (type => 'string after PUBLIC');
    $self->{current_token}->{quirks} = 1;

    $self->{state} = BOGUS_DOCTYPE_STATE;
    !!!next-input-character;
    redo A;
  }
2610 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2611 if ($self->{next_char} == 0x0022) { # "
2612 !!!cp (187);
2613 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2614 !!!next-input-character;
2615 redo A;
2616 } elsif ($self->{next_char} == 0x003E) { # >
2617 !!!cp (188);
2618 !!!parse-error (type => 'unclosed PUBLIC literal');
2619
2620 $self->{state} = DATA_STATE;
2621 !!!next-input-character;
2622
2623 $self->{current_token}->{quirks} = 1;
2624 !!!emit ($self->{current_token}); # DOCTYPE
2625
2626 redo A;
2627 } elsif ($self->{next_char} == -1) {
2628 !!!cp (189);
2629 !!!parse-error (type => 'unclosed PUBLIC literal');
2630
2631 $self->{state} = DATA_STATE;
2632 ## reconsume
2633
2634 $self->{current_token}->{quirks} = 1;
2635 !!!emit ($self->{current_token}); # DOCTYPE
2636
2637 redo A;
2638 } else {
2639 !!!cp (190);
2640 $self->{current_token}->{public_identifier} # DOCTYPE
2641 .= chr $self->{next_char};
2642 ## Stay in the state
2643 !!!next-input-character;
2644 redo A;
2645 }
2646 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2647 if ($self->{next_char} == 0x0027) { # '
2648 !!!cp (191);
2649 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2650 !!!next-input-character;
2651 redo A;
2652 } elsif ($self->{next_char} == 0x003E) { # >
2653 !!!cp (192);
2654 !!!parse-error (type => 'unclosed PUBLIC literal');
2655
2656 $self->{state} = DATA_STATE;
2657 !!!next-input-character;
2658
2659 $self->{current_token}->{quirks} = 1;
2660 !!!emit ($self->{current_token}); # DOCTYPE
2661
2662 redo A;
2663 } elsif ($self->{next_char} == -1) {
2664 !!!cp (193);
2665 !!!parse-error (type => 'unclosed PUBLIC literal');
2666
2667 $self->{state} = DATA_STATE;
2668 ## reconsume
2669
2670 $self->{current_token}->{quirks} = 1;
2671 !!!emit ($self->{current_token}); # DOCTYPE
2672
2673 redo A;
2674 } else {
2675 !!!cp (194);
2676 $self->{current_token}->{public_identifier} # DOCTYPE
2677 .= chr $self->{next_char};
2678 ## Stay in the state
2679 !!!next-input-character;
2680 redo A;
2681 }
2682 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2683 if ({
2684 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2685 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2686 }->{$self->{next_char}}) {
2687 !!!cp (195);
2688 ## Stay in the state
2689 !!!next-input-character;
2690 redo A;
2691 } elsif ($self->{next_char} == 0x0022) { # "
2692 !!!cp (196);
2693 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2694 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2695 !!!next-input-character;
2696 redo A;
2697 } elsif ($self->{next_char} == 0x0027) { # '
2698 !!!cp (197);
2699 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2700 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2701 !!!next-input-character;
2702 redo A;
2703 } elsif ($self->{next_char} == 0x003E) { # >
2704 !!!cp (198);
2705 $self->{state} = DATA_STATE;
2706 !!!next-input-character;
2707
2708 !!!emit ($self->{current_token}); # DOCTYPE
2709
2710 redo A;
2711 } elsif ($self->{next_char} == -1) {
2712 !!!cp (199);
2713 !!!parse-error (type => 'unclosed DOCTYPE');
2714
2715 $self->{state} = DATA_STATE;
2716 ## reconsume
2717
2718 $self->{current_token}->{quirks} = 1;
2719 !!!emit ($self->{current_token}); # DOCTYPE
2720
2721 redo A;
2722 } else {
2723 !!!cp (200);
2724 !!!parse-error (type => 'string after PUBLIC literal');
2725 $self->{current_token}->{quirks} = 1;
2726
2727 $self->{state} = BOGUS_DOCTYPE_STATE;
2728 !!!next-input-character;
2729 redo A;
2730 }
2731 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2732 if ({
2733 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2734 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2735 }->{$self->{next_char}}) {
2736 !!!cp (201);
2737 ## Stay in the state
2738 !!!next-input-character;
2739 redo A;
2740 } elsif ($self->{next_char} == 0x0022) { # "
2741 !!!cp (202);
2742 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2743 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2744 !!!next-input-character;
2745 redo A;
2746 } elsif ($self->{next_char} == 0x0027) { # '
2747 !!!cp (203);
2748 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2749 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2750 !!!next-input-character;
2751 redo A;
2752 } elsif ($self->{next_char} == 0x003E) { # >
2753 !!!cp (204);
2754 !!!parse-error (type => 'no SYSTEM literal');
2755 $self->{state} = DATA_STATE;
2756 !!!next-input-character;
2757
2758 $self->{current_token}->{quirks} = 1;
2759 !!!emit ($self->{current_token}); # DOCTYPE
2760
2761 redo A;
2762 } elsif ($self->{next_char} == -1) {
2763 !!!cp (205);
2764 !!!parse-error (type => 'unclosed DOCTYPE');
2765
2766 $self->{state} = DATA_STATE;
2767 ## reconsume
2768
2769 $self->{current_token}->{quirks} = 1;
2770 !!!emit ($self->{current_token}); # DOCTYPE
2771
2772 redo A;
2773 } else {
2774 !!!cp (206);
2775 !!!parse-error (type => 'string after SYSTEM');
2776 $self->{current_token}->{quirks} = 1;
2777
2778 $self->{state} = BOGUS_DOCTYPE_STATE;
2779 !!!next-input-character;
2780 redo A;
2781 }
2782 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2783 if ($self->{next_char} == 0x0022) { # "
2784 !!!cp (207);
2785 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2786 !!!next-input-character;
2787 redo A;
2788 } elsif ($self->{next_char} == 0x003E) { # >
2789 !!!cp (208);
2790 !!!parse-error (type => 'unclosed SYSTEM literal');
2791
2792 $self->{state} = DATA_STATE;
2793 !!!next-input-character;
2794
2795 $self->{current_token}->{quirks} = 1;
2796 !!!emit ($self->{current_token}); # DOCTYPE
2797
2798 redo A;
2799 } elsif ($self->{next_char} == -1) {
2800 !!!cp (209);
2801 !!!parse-error (type => 'unclosed SYSTEM literal');
2802
2803 $self->{state} = DATA_STATE;
2804 ## reconsume
2805
2806 $self->{current_token}->{quirks} = 1;
2807 !!!emit ($self->{current_token}); # DOCTYPE
2808
2809 redo A;
2810 } else {
2811 !!!cp (210);
2812 $self->{current_token}->{system_identifier} # DOCTYPE
2813 .= chr $self->{next_char};
2814 ## Stay in the state
2815 !!!next-input-character;
2816 redo A;
2817 }
2818 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2819 if ($self->{next_char} == 0x0027) { # '
2820 !!!cp (211);
2821 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2822 !!!next-input-character;
2823 redo A;
2824 } elsif ($self->{next_char} == 0x003E) { # >
2825 !!!cp (212);
2826 !!!parse-error (type => 'unclosed SYSTEM literal');
2827
2828 $self->{state} = DATA_STATE;
2829 !!!next-input-character;
2830
2831 $self->{current_token}->{quirks} = 1;
2832 !!!emit ($self->{current_token}); # DOCTYPE
2833
2834 redo A;
2835 } elsif ($self->{next_char} == -1) {
2836 !!!cp (213);
2837 !!!parse-error (type => 'unclosed SYSTEM literal');
2838
2839 $self->{state} = DATA_STATE;
2840 ## reconsume
2841
2842 $self->{current_token}->{quirks} = 1;
2843 !!!emit ($self->{current_token}); # DOCTYPE
2844
2845 redo A;
2846 } else {
2847 !!!cp (214);
2848 $self->{current_token}->{system_identifier} # DOCTYPE
2849 .= chr $self->{next_char};
2850 ## Stay in the state
2851 !!!next-input-character;
2852 redo A;
2853 }
2854 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2855 if ({
2856 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2857 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2858 }->{$self->{next_char}}) {
2859 !!!cp (215);
2860 ## Stay in the state
2861 !!!next-input-character;
2862 redo A;
2863 } elsif ($self->{next_char} == 0x003E) { # >
2864 !!!cp (216);
2865 $self->{state} = DATA_STATE;
2866 !!!next-input-character;
2867
2868 !!!emit ($self->{current_token}); # DOCTYPE
2869
2870 redo A;
2871 } elsif ($self->{next_char} == -1) {
2872 !!!cp (217);
2873 !!!parse-error (type => 'unclosed DOCTYPE');
2874 $self->{state} = DATA_STATE;
2875 ## reconsume
2876
2877 $self->{current_token}->{quirks} = 1;
2878 !!!emit ($self->{current_token}); # DOCTYPE
2879
2880 redo A;
2881 } else {
2882 !!!cp (218);
2883 !!!parse-error (type => 'string after SYSTEM literal');
2884 #$self->{current_token}->{quirks} = 1;
2885
2886 $self->{state} = BOGUS_DOCTYPE_STATE;
2887 !!!next-input-character;
2888 redo A;
2889 }
2890 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2891 if ($self->{next_char} == 0x003E) { # >
2892 !!!cp (219);
2893 $self->{state} = DATA_STATE;
2894 !!!next-input-character;
2895
2896 !!!emit ($self->{current_token}); # DOCTYPE
2897
2898 redo A;
2899 } elsif ($self->{next_char} == -1) {
2900 !!!cp (220);
2901 !!!parse-error (type => 'unclosed DOCTYPE');
2902 $self->{state} = DATA_STATE;
2903 ## reconsume
2904
2905 !!!emit ($self->{current_token}); # DOCTYPE
2906
2907 redo A;
2908 } else {
2909 !!!cp (221);
2910 ## Stay in the state
2911 !!!next-input-character;
2912 redo A;
2913 }
} elsif ($self->{state} == CDATA_SECTION_STATE) {
  ## NOTE: "CDATA section state" in the spec is jointly implemented
  ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
  ## and |CDATA_SECTION_MSE2_STATE|.
  ##
  ## This state accumulates the section's characters in the |data| of
  ## the current character token until a "]" (a possible start of the
  ## "]]>" terminator) or EOF is seen.

  if ($self->{next_char} == 0x005D) { # ]
    !!!cp (221.1);
    $self->{state} = CDATA_SECTION_MSE1_STATE;
    !!!next-input-character;
    redo A;
  } elsif ($self->{next_char} == -1) {
    $self->{state} = DATA_STATE;
    ## Reconsume the EOF marker in the data state, as the EOF branches
    ## of the other tokenizer states do, instead of requesting another
    ## input character after the end of the input.
    if (length $self->{current_token}->{data}) { # character
      !!!cp (221.2);
      !!!emit ($self->{current_token}); # character
    } else {
      !!!cp (221.3);
      ## No token to emit. $self->{current_token} is discarded.
    }
    redo A;
  } else {
    !!!cp (221.4);
    $self->{current_token}->{data} .= chr $self->{next_char};
    ## Stay in the state.
    !!!next-input-character;
    redo A;
  }

  ## ISSUE: "text tokens" in spec.
2944 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2945 if ($self->{next_char} == 0x005D) { # ]
2946 !!!cp (221.5);
2947 $self->{state} = CDATA_SECTION_MSE2_STATE;
2948 !!!next-input-character;
2949 redo A;
2950 } else {
2951 !!!cp (221.6);
2952 $self->{current_token}->{data} .= ']';
2953 $self->{state} = CDATA_SECTION_STATE;
2954 ## Reconsume.
2955 redo A;
2956 }
2957 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2958 if ($self->{next_char} == 0x003E) { # >
2959 $self->{state} = DATA_STATE;
2960 !!!next-input-character;
2961 if (length $self->{current_token}->{data}) { # character
2962 !!!cp (221.7);
2963 !!!emit ($self->{current_token}); # character
2964 } else {
2965 !!!cp (221.8);
2966 ## No token to emit. $self->{current_token} is discarded.
2967 }
2968 redo A;
2969 } elsif ($self->{next_char} == 0x005D) { # ]
2970 !!!cp (221.9); # character
2971 $self->{current_token}->{data} .= ']'; ## Add first "]" of "]]]".
2972 ## Stay in the state.
2973 !!!next-input-character;
2974 redo A;
2975 } else {
2976 !!!cp (221.11);
2977 $self->{current_token}->{data} .= ']]'; # character
2978 $self->{state} = CDATA_SECTION_STATE;
2979 ## Reconsume.
2980 redo A;
2981 }
2982
2983 } elsif ($self->{state} == ENTITY_STATE) {
2984 my $in_attr = $self->{entity_in_attr};
2985 my $additional = $self->{entity_additional};
2986
2987 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
2988
2989 if ({
2990 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
2991 0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
2992 $additional => 1,
2993 }->{$self->{next_char}}) {
2994 !!!cp (1001);
2995 ## Don't consume
2996 ## No error
2997 $self->{entity_return} = undef;
2998 $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;
2999 redo A;
3000 } elsif ($self->{next_char} == 0x0023) { # #
3001 !!!next-input-character;
3002 if ($self->{next_char} == 0x0078 or # x
3003 $self->{next_char} == 0x0058) { # X
3004 my $code;
3005 X: {
3006 my $x_char = $self->{next_char};
3007 !!!next-input-character;
3008 if (0x0030 <= $self->{next_char} and
3009 $self->{next_char} <= 0x0039) { # 0..9
3010 !!!cp (1002);
3011 $code ||= 0;
3012 $code *= 0x10;
3013 $code += $self->{next_char} - 0x0030;
3014 redo X;
3015 } elsif (0x0061 <= $self->{next_char} and
3016 $self->{next_char} <= 0x0066) { # a..f
3017 !!!cp (1003);
3018 $code ||= 0;
3019 $code *= 0x10;
3020 $code += $self->{next_char} - 0x0060 + 9;
3021 redo X;
3022 } elsif (0x0041 <= $self->{next_char} and
3023 $self->{next_char} <= 0x0046) { # A..F
3024 !!!cp (1004);
3025 $code ||= 0;
3026 $code *= 0x10;
3027 $code += $self->{next_char} - 0x0040 + 9;
3028 redo X;
3029 } elsif (not defined $code) { # no hexadecimal digit
3030 !!!cp (1005);
3031 !!!parse-error (type => 'bare hcro', line => $l, column => $c);
3032 !!!back-next-input-character ($x_char, $self->{next_char});
3033 $self->{next_char} = 0x0023; # #
3034 $self->{entity_return} = undef;
3035 $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;
3036 redo A;
3037 } elsif ($self->{next_char} == 0x003B) { # ;
3038 !!!cp (1006);
3039 !!!next-input-character;
3040 } else {
3041 !!!cp (1007);
3042 !!!parse-error (type => 'no refc', line => $l, column => $c);
3043 }
3044
3045 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
3046 !!!cp (1008);
3047 !!!parse-error (type => 'invalid character reference',
3048 text => (sprintf 'U+%04X', $code),
3049 line => $l, column => $c);
3050 $code = 0xFFFD;
3051 } elsif ($code > 0x10FFFF) {
3052 !!!cp (1009);
3053 !!!parse-error (type => 'invalid character reference',
3054 text => (sprintf 'U-%08X', $code),
3055 line => $l, column => $c);
3056 $code = 0xFFFD;
3057 } elsif ($code == 0x000D) {
3058 !!!cp (1010);
3059 !!!parse-error (type => 'CR character reference', line => $l, column => $c);
3060 $code = 0x000A;
3061 } elsif (0x80 <= $code and $code <= 0x9F) {
3062 !!!cp (1011);
3063 !!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c);
3064 $code = $c1_entity_char->{$code};
3065 }
3066
3067 $self->{entity_return} = {type => CHARACTER_TOKEN, data => chr $code,
3068 has_reference => 1,
3069 line => $l, column => $c,
3070 };
3071 $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;
3072 redo A;
3073 } # X
3074 } elsif (0x0030 <= $self->{next_char} and
3075 $self->{next_char} <= 0x0039) { # 0..9
3076 my $code = $self->{next_char} - 0x0030;
3077 !!!next-input-character;
3078
3079 while (0x0030 <= $self->{next_char} and
3080 $self->{next_char} <= 0x0039) { # 0..9
3081 !!!cp (1012);
3082 $code *= 10;
3083 $code += $self->{next_char} - 0x0030;
3084
3085 !!!next-input-character;
3086 }
3087
3088 if ($self->{next_char} == 0x003B) { # ;
3089 !!!cp (1013);
3090 !!!next-input-character;
3091 } else {
3092 !!!cp (1014);
3093 !!!parse-error (type => 'no refc', line => $l, column => $c);
3094 }
3095
3096 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
3097 !!!cp (1015);
3098 !!!parse-error (type => 'invalid character reference',
3099 text => (sprintf 'U+%04X', $code),
3100 line => $l, column => $c);
3101 $code = 0xFFFD;
3102 } elsif ($code > 0x10FFFF) {
3103 !!!cp (1016);
3104 !!!parse-error (type => 'invalid character reference',
3105 text => (sprintf 'U-%08X', $code),
3106 line => $l, column => $c);
3107 $code = 0xFFFD;
3108 } elsif ($code == 0x000D) {
3109 !!!cp (1017);
3110 !!!parse-error (type => 'CR character reference',
3111 line => $l, column => $c);
3112 $code = 0x000A;
3113 } elsif (0x80 <= $code and $code <= 0x9F) {
3114 !!!cp (1018);
3115 !!!parse-error (type => 'C1 character reference',
3116 text => (sprintf 'U+%04X', $code),
3117 line => $l, column => $c);
3118 $code = $c1_entity_char->{$code};
3119 }
3120
3121 $self->{entity_return} = {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1,
3122 line => $l, column => $c,
3123 };
3124 $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;
3125 redo A;
3126 } else {
3127 !!!cp (1019);
3128 !!!parse-error (type => 'bare nero', line => $l, column => $c);
3129 !!!back-next-input-character ($self->{next_char});
3130 $self->{next_char} = 0x0023; # #
3131 $self->{entity_return} = undef;
3132 $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;
3133 redo A;
3134 }
3135 } elsif ((0x0041 <= $self->{next_char} and
3136 $self->{next_char} <= 0x005A) or
3137 (0x0061 <= $self->{next_char} and
3138 $self->{next_char} <= 0x007A)) {
3139 my $entity_name = chr $self->{next_char};
3140 !!!next-input-character;
3141
3142 my $value = $entity_name;
3143 my $match = 0;
3144 require Whatpm::_NamedEntityList;
3145 our $EntityChar;
3146
3147 while (length $entity_name < 30 and
3148 ## NOTE: Some number greater than the maximum length of entity name
3149 ((0x0041 <= $self->{next_char} and # a
3150 $self->{next_char} <= 0x005A) or # x
3151 (0x0061 <= $self->{next_char} and # a
3152 $self->{next_char} <= 0x007A) or # z
3153 (0x0030 <= $self->{next_char} and # 0
3154 $self->{next_char} <= 0x0039) or # 9
3155 $self->{next_char} == 0x003B)) { # ;
3156 $entity_name .= chr $self->{next_char};
3157 if (defined $EntityChar->{$entity_name}) {
3158 if ($self->{next_char} == 0x003B) { # ;
3159 !!!cp (1020);
3160 $value = $EntityChar->{$entity_name};
3161 $match = 1;
3162 !!!next-input-character;
3163 last;
3164 } else {
3165 !!!cp (1021);
3166 $value = $EntityChar->{$entity_name};
3167 $match = -1;
3168 !!!next-input-character;
3169 }
3170 } else {
3171 !!!cp (1022);
3172 $value .= chr $self->{next_char};
3173 $match *= 2;
3174 !!!next-input-character;
3175 }
3176 }
3177
3178 if ($match > 0) {
3179 !!!cp (1023);
3180 $self->{entity_return} = {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
3181 line => $l, column => $c,
3182 };
3183 $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;
3184 redo A;
3185 } elsif ($match < 0) {
3186 !!!parse-error (type => 'no refc', line => $l, column => $c);
3187 if ($in_attr and $match < -1) {
3188 !!!cp (1024);
3189 $self->{entity_return} = {type => CHARACTER_TOKEN, data => '&'.$entity_name,
3190 line => $l, column => $c,
3191 };
3192 $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;
3193 redo A;
3194 } else {
3195 !!!cp (1025);
3196 $self->{entity_return} = {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
3197 line => $l, column => $c,
3198 };
3199 $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;
3200 redo A;
3201 }
3202 } else {
3203 !!!cp (1026);
3204 !!!parse-error (type => 'bare ero', line => $l, column => $c);
3205 ## NOTE: "No characters are consumed" in the spec.
3206 $self->{entity_return} = {type => CHARACTER_TOKEN, data => '&'.$value,
3207 line => $l, column => $c,
3208 };
3209 $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;
3210 redo A;
3211 }
3212 } else {
3213 !!!cp (1027);
3214 ## no characters are consumed
3215 !!!parse-error (type => 'bare ero', line => $l, column => $c);
3216 $self->{entity_return} = undef;
3217 $self->{state} = $self->{entity_in_attr} ? ENTITY_IN_ATTRIBUTE_VALUE_STATE : ENTITY_DATA_STATE;
3218 redo A;
3219 }
3220
3221 } else {
3222 die "$0: $self->{state}: Unknown state";
3223 }
3224 } # A
3225
3226 die "$0: _get_next_token: unexpected case";
3227 } # _get_next_token
3228
## Prepare the target document for the tree construction stage.
##
## NOTE: $self->{document} MUST be set before this method is called.
## Returns whatever the last |set_user_data| call returns.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  ## Turn strict error checking off while the parser builds the tree.
  $doc->strict_error_checking (0);

  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on

  ## Flag the document as an HTML document.
  $doc->manakai_is_html (1); # MUST

  ## Seed the source position user data with line 1, column 1.
  $doc->set_user_data (manakai_source_line => 1);
  return $doc->set_user_data (manakai_source_column => 1);
} # _initialize_tree_constructor
3239
## Finish the tree construction stage: turn strict error checking on the
## document back on.  Returns whatever |strict_error_checking| returns.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  ## TODO: Turn mutation events on
  return $doc->strict_error_checking (1);
} # _terminate_tree_constructor
3245
3246 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
3247
3248 { # tree construction stage
3249 my $token;
3250
## Entry point of the tree construction stage.
##
## Reads the first token, resets the per-parse tree construction state on
## |$self| and then runs the insertion modes in order: "initial",
## "before html", and finally the main loop starting at "before head".
## Returns whatever |_tree_construction_main| returns.
sub _construct_tree ($) {
  my $self = shift;

  ## NOTE: It is not defined when an interactive UA makes
  ## $self->{document} available to the user, nor when it begins
  ## accepting user input.

  ## NOTE: "Append a character" means: collect the character and all
  ## subsequent consecutive characters and insert a single Text node
  ## whose data is the concatenation of all those characters. # MUST

  !!!next-token;

  ## Reset the element pointers and the stack of open elements.
  @{$self}{qw(form_element head_element inner_html_node)} = ();
  $self->{open_elements} = [];

  ## NOTE: The "initial" insertion mode.
  $self->_tree_construction_initial; # MUST

  ## NOTE: The "before html" insertion mode.
  $self->_tree_construction_root_element;
  $self->{insertion_mode} = BEFORE_HEAD_IM;

  ## NOTE: The "before head" insertion mode and so on.
  return $self->_tree_construction_main;
} # _construct_tree
3279
## Process tokens in the "initial" insertion mode.
##
## Works on the lexical |$token| shared by the tree construction stage:
##
##   - A DOCTYPE token is checked for HTML5 conformance, turned into a
##     document type definition node appended to |$self->{document}|, and
##     used to choose the document's compat mode ("quirks",
##     "limited quirks", or unchanged).  The next token is then fetched.
##   - Leading white space in a character token is dropped.
##   - A comment token is appended to the document as a comment node.
##   - Any other token (or remaining character data) reports a
##     "no DOCTYPE" parse error, puts the document in quirks mode and
##     is left in |$token| so that the next insertion mode reprocesses it.
##
## Returns nothing.  Dies if the token type is unknown.
sub _tree_construction_initial ($) {
  my $self = shift;

  ## NOTE: "initial" insertion mode

  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/; # ASCII case-insensitive
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{system_identifier}) {
        !!!cp ('t1');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif ($doctype_name ne 'HTML') {
        !!!cp ('t2');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif (defined $token->{public_identifier}) {
        if ($token->{public_identifier} eq 'XSLT-compat') {
          !!!cp ('t1.2');
          !!!parse-error (type => 'XSLT-compat', token => $token,
                          level => $self->{level}->{should});
        } else {
          !!!parse-error (type => 'not HTML5', token => $token);
        }
      } else {
        !!!cp ('t3');
        #
      }

      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      ## NOTE: Default value for both |public_id| and |system_id| attributes
      ## are empty strings, so that we don't set any value in missing cases.
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if ($token->{quirks} or $doctype_name ne 'HTML') {
        !!!cp ('t4');
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## The public identifier is compared ASCII case-insensitively, so
        ## it is uppercased once and every literal below is uppercase.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;

        ## Public identifier prefixes that select quirks mode.
        my $quirks_prefixes = [
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//",
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//IETF//DTD HTML 2.0 LEVEL 1//",
          "-//IETF//DTD HTML 2.0 LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT//",
          "-//IETF//DTD HTML 2.0//",
          "-//IETF//DTD HTML 2.1E//",
          "-//IETF//DTD HTML 3.0//",
          "-//IETF//DTD HTML 3.2 FINAL//",
          "-//IETF//DTD HTML 3.2//",
          "-//IETF//DTD HTML 3//",
          "-//IETF//DTD HTML LEVEL 0//",
          "-//IETF//DTD HTML LEVEL 1//",
          "-//IETF//DTD HTML LEVEL 2//",
          "-//IETF//DTD HTML LEVEL 3//",
          "-//IETF//DTD HTML STRICT LEVEL 0//",
          "-//IETF//DTD HTML STRICT LEVEL 1//",
          "-//IETF//DTD HTML STRICT LEVEL 2//",
          "-//IETF//DTD HTML STRICT LEVEL 3//",
          "-//IETF//DTD HTML STRICT//",
          "-//IETF//DTD HTML//",
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//",
          "-//NETSCAPE COMM. CORP.//DTD HTML//",
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//",
          "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//",
          "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//",
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//",
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//",
          "-//W3C//DTD HTML 3 1995-03-24//",
          "-//W3C//DTD HTML 3.2 DRAFT//",
          "-//W3C//DTD HTML 3.2 FINAL//",
          "-//W3C//DTD HTML 3.2//",
          "-//W3C//DTD HTML 3.2S DRAFT//",
          "-//W3C//DTD HTML 4.0 FRAMESET//",
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//",
          ## NOTE: Previously misspelled as "EXPERIMETNAL", which could
          ## never match a real "Experimental" public identifier.
          "-//W3C//DTD HTML EXPERIMENTAL 19960712//",
          "-//W3C//DTD HTML EXPERIMENTAL 970421//",
          "-//W3C//DTD W3 HTML//",
          "-//W3O//DTD W3 HTML 3.0//",
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//",
          "-//WEBTECHS//DTD MOZILLA HTML//",
        ]; # $quirks_prefixes

        ## Does the (uppercased) public identifier start with any of the
        ## prefixes?  NOTE: The test must be made against |$pubid|, not
        ## against the list itself.
        my $match;
        for (@$quirks_prefixes) {
          if (substr ($pubid, 0, length $_) eq $_) {
            $match = 1;
            last;
          }
        }
        if ($match or
            $pubid eq "-//W3O//DTD W3 HTML STRICT 3.0//EN//" or
            $pubid eq "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" or
            $pubid eq "HTML") {
          !!!cp ('t5');
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid =~ m[^-//W3C//DTD HTML 4\.01 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD HTML 4\.01 TRANSITIONAL//]) {
          ## HTML 4.01 Frameset/Transitional: the mode depends on
          ## whether a system identifier is given.
          if (defined $token->{system_identifier}) {
            !!!cp ('t6');
            $self->{document}->manakai_compat_mode ('quirks');
          } else {
            !!!cp ('t7');
            $self->{document}->manakai_compat_mode ('limited quirks');
          }
        } elsif ($pubid =~ m[^-//W3C//DTD XHTML 1\.0 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD XHTML 1\.0 TRANSITIONAL//]) {
          !!!cp ('t8');
          $self->{document}->manakai_compat_mode ('limited quirks');
        } else {
          !!!cp ('t9');
        }
      } else {
        !!!cp ('t10');
      }
      if (defined $token->{system_identifier}) {
        ## The system identifier is compared in lowercase.
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          ## NOTE: Ensure that |PUBLIC "(limited quirks)" "(quirks)"| is
          ## marked as quirks.
          $self->{document}->manakai_compat_mode ('quirks');
          !!!cp ('t11');
        } else {
          !!!cp ('t12');
        }
      } else {
        !!!cp ('t13');
      }

      ## Go to the "before html" insertion mode.
      !!!next-token;
      return;
    } elsif ({
              START_TAG_TOKEN, 1,
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t14');
      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      !!!ack-later;
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          !!!cp ('t15');
          ## Stay in the insertion mode.
          !!!next-token;
          redo INITIAL;
        } else {
          !!!cp ('t16');
        }
      } else {
        !!!cp ('t17');
      }

      ## Non-white-space character data is left in the token.
      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t18');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the insertion mode.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL

  die "$0: _tree_construction_initial: This should be never reached";
} # _tree_construction_initial
3486
## Process tokens in the "before html" insertion mode.
##
## Works on the lexical |$token| shared by the tree construction stage.
## DOCTYPE tokens are reported and ignored, comments are appended to the
## document, and leading white space in character tokens is dropped.
## An |html| start tag creates the root element from the token; any
## other token creates an implied |html| root element and is left in
## |$token| for reprocessing.  In both cases the root element is
## appended to |$self->{document}| and pushed onto
## |$self->{open_elements}|, and the
## |$self->{application_cache_selection}| callback is invoked exactly
## once (with the |manifest| attribute value, or |undef|).
##
## Returns nothing.  Dies if the token type is unknown.
sub _tree_construction_root_element ($) {
  my $self = shift;

  ## NOTE: "before html" insertion mode.

  B: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      !!!cp ('t19');
      !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
      ## Ignore the token
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t20');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token.

        unless (length $token->{data}) {
          !!!cp ('t21');
          ## Stay in the insertion mode.
          !!!next-token;
          redo B;
        } else {
          !!!cp ('t22');
        }
      } else {
        !!!cp ('t23');
      }

      ## NOTE: The application cache selection callback is NOT invoked
      ## here; the shared "anything else" code below invokes it after
      ## the implied root element has been created.  Invoking it here
      ## as well would run the callback twice for the same document.

      #
    } elsif ($token->{type} == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html') {
        my $root_element;
        !!!create-element ($root_element, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}},
            [$root_element, $el_category->{html}];

        if ($token->{attributes}->{manifest}) {
          !!!cp ('t24');
          $self->{application_cache_selection}
              ->($token->{attributes}->{manifest}->{value});
          ## ISSUE: Spec is unclear on relative references.
          ## According to Hixie (#whatwg 2008-03-19), it should be
          ## resolved against the base URI of the document in HTML
          ## or xml:base of the element in XHTML.
        } else {
          !!!cp ('t25');
          $self->{application_cache_selection}->(undef);
        }

        !!!nack ('t25c');

        !!!next-token;
        return; ## Go to the "before head" insertion mode.
      } else {
        !!!cp ('t25.1');
        #
      }
    } elsif ({
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t26');
      #
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## "Anything else": create an implied |html| root element (with no
    ## attributes) and select the application cache with no manifest.
    my $root_element;
    !!!create-element ($root_element, $HTML_NS, 'html',, $token);
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, $el_category->{html}];

    $self->{application_cache_selection}->(undef);

    ## NOTE: Reprocess the token.
    !!!ack-later;
    return; ## Go to the "before head" insertion mode.

    ## ISSUE: There is an issue in the spec
  } # B

  die "$0: _tree_construction_root_element: This should never be reached";
} # _tree_construction_root_element
3581
## "Reset the insertion mode appropriately".
##
## Walks |$self->{open_elements}| from the last entry towards the first
## and sets |$self->{insertion_mode}| according to the first entry that
## determines a mode.  Each entry is a pair of a node and its element
## category bit flags.  When the first entry of the stack is reached,
## |$self->{inner_html_node}| is used as the node instead; the method
## dies if it is not defined at that point.
##
## Returns nothing.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        !!!cp ('t28');
        $node = $self->{inner_html_node};
      } else {
        die "_reset_insertion_mode: t27";
      }
    }

    ## Step 4..14
    my $new_mode;
    if ($node->[1] & FOREIGN_EL) {
      !!!cp ('t28.1');
      ## NOTE: Strictly speaking, the line below only applies to MathML and
      ## SVG elements.  Currently the HTML syntax supports only MathML and
      ## SVG elements as foreigners.
      $new_mode = IN_BODY_IM | IN_FOREIGN_CONTENT_IM;
    } elsif ($node->[1] & TABLE_CELL_EL) {
      if ($last) {
        !!!cp ('t28.2');
        #
      } else {
        !!!cp ('t28.3');
        $new_mode = IN_CELL_IM;
      }
    } else {
      !!!cp ('t28.4');
      $new_mode = {
        select => IN_SELECT_IM,
        ## NOTE: |option| and |optgroup| do not set
        ## insertion mode to "in select" by themselves.
        tr => IN_ROW_IM,
        tbody => IN_TABLE_BODY_IM,
        thead => IN_TABLE_BODY_IM,
        tfoot => IN_TABLE_BODY_IM,
        caption => IN_CAPTION_IM,
        colgroup => IN_COLUMN_GROUP_IM,
        table => IN_TABLE_IM,
        head => IN_BODY_IM, # not in head!
        body => IN_BODY_IM,
        frameset => IN_FRAMESET_IM,
      }->{$node->[0]->manakai_local_name};
    }
    ## NOTE: Assign and return in separate statements so that the return
    ## does not depend on the numeric value of the mode being true.
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 15
    if ($node->[1] & HTML_EL) {
      unless (defined $self->{head_element}) {
        !!!cp ('t29');
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        ## ISSUE: Can this state be reached?
        !!!cp ('t30');
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    } else {
      !!!cp ('t31');
    }

    ## Step 16
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 17
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 18
    redo S3;
  } # S3

  die "$0: _reset_insertion_mode: This line should never be reached";
} # _reset_insertion_mode
3668
3669 sub _tree_construction_main ($) {
3670 my $self = shift;
3671
3672 my $active_formatting_elements = [];
3673
3674 my $reconstruct_active_formatting_elements = sub { # MUST
3675 my $insert = shift;
3676
3677 ## Step 1
3678 return unless @$active_formatting_elements;
3679
3680 ## Step 3
3681 my $i = -1;
3682 my $entry = $active_formatting_elements->[$i];
3683
3684 ## Step 2
3685 return if $entry->[0] eq '#marker';
3686 for (@{$self->{open_elements}}) {
3687 if ($entry->[0] eq $_->[0]) {
3688 !!!cp ('t32');
3689 return;
3690 }
3691 }
3692
3693 S4: {
3694 ## Step 4
3695 last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
3696
3697 ## Step 5
3698 $i--;
3699 $entry = $active_formatting_elements->[$i];
3700
3701 ## Step 6
3702 if ($entry->[0] eq '#marker') {
3703 !!!cp ('t33_1');
3704 #
3705 } else {
3706 my $in_open_elements;
3707 OE: for (@{$self->{open_elements}}) {
3708 if ($entry->[0] eq $_->[0]) {
3709 !!!cp ('t33');
3710 $in_open_elements = 1;
3711 last OE;
3712 }
3713 }
3714 if ($in_open_elements) {
3715 !!!cp ('t34');
3716 #
3717 } else {
3718 ## NOTE: <!DOCTYPE HTML><p><b><i><u></p> <p>X
3719 !!!cp ('t35');
3720 redo S4;
3721 }
3722 }
3723
3724 ## Step 7
3725 $i++;
3726 $entry = $active_formatting_elements->[$i];
3727 } # S4
3728
3729 S7: {
3730 ## Step 8
3731 my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
3732
3733 ## Step 9
3734 $insert->($clone->[0]);
3735 push @{$self->{open_elements}}, $clone;
3736
3737 ## Step 10
3738 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
3739
3740 ## Step 11
3741 unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
3742 !!!cp ('t36');
3743 ## Step 7'
3744 $i++;
3745 $entry = $active_formatting_elements->[$i];
3746
3747 redo S7;
3748 }
3749
3750 !!!cp ('t37');
3751 } # S7
3752 }; # $reconstruct_active_formatting_elements
3753
3754 my $clear_up_to_marker = sub {
3755 for (reverse 0..$#$active_formatting_elements) {
3756 if ($active_formatting_elements->[$_]->[0] eq '#marker') {
3757 !!!cp ('t38');
3758 splice @$active_formatting_elements, $_;
3759 return;
3760 }
3761 }
3762
3763 !!!cp ('t39');
3764 }; # $clear_up_to_marker
3765
3766 my $insert;
3767
3768 my $parse_rcdata = sub ($) {
3769 my ($content_model_flag) = @_;
3770
3771 ## Step 1
3772 my $start_tag_name = $token->{tag_name};
3773 my $el;
3774 !!!create-element ($el, $HTML_NS, $start_tag_name, $token->{attributes}, $token);
3775
3776 ## Step 2
3777 $insert->($el);
3778
3779 ## Step 3
3780 $self->{content_model} = $content_model_flag; # CDATA or RCDATA
3781 delete $self->{escape}; # MUST
3782
3783 ## Step 4
3784 my $text = '';
3785 !!!nack ('t40.1');
3786 !!!next-token;
3787 while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
3788 !!!cp ('t40');
3789 $text .= $token->{data};
3790 !!!next-token;
3791 }
3792
3793 ## Step 5
3794 if (length $text) {
3795 !!!cp ('t41');
3796 my $text = $self->{document}->create_text_node ($text);
3797 $el->append_child ($text);
3798 }
3799
3800 ## Step 6
3801 $self->{content_model} = PCDATA_CONTENT_MODEL;
3802
3803 ## Step 7
3804 if ($token->{type} == END_TAG_TOKEN and
3805 $token->{tag_name} eq $start_tag_name) {
3806 !!!cp ('t42');
3807 ## Ignore the token
3808 } else {
3809 ## NOTE: An end-of-file token.
3810 if ($content_model_flag == CDATA_CONTENT_MODEL) {
3811 !!!cp ('t43');
3812 !!!parse-error (type => 'in CDATA:#eof', token => $token);
3813 } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
3814 !!!cp ('t44');
3815 !!!parse-error (type => 'in RCDATA:#eof', token => $token);
3816 } else {
3817 die "$0: $content_model_flag in parse_rcdata";
3818 }
3819 }
3820 !!!next-token;
3821 }; # $parse_rcdata
3822
3823 my $script_start_tag = sub () {
3824 my $script_el;
3825 !!!create-element ($script_el, $HTML_NS, 'script', $token->{attributes}, $token);
3826 ## TODO: mark as "parser-inserted"
3827
3828 $self->{content_model} = CDATA_CONTENT_MODEL;
3829 delete $self->{escape}; # MUST
3830
3831 my $text = '';
3832 !!!nack ('t45.1');
3833 !!!next-token;
3834 while ($token->{type} == CHARACTER_TOKEN) {
3835 !!!cp ('t45');
3836 $text .= $token->{data};
3837 !!!next-token;
3838 } # stop if non-character token or tokenizer stops tokenising
3839 if (length $text) {
3840 !!!cp ('t46');
3841 $script_el->manakai_append_text ($text);
3842 }
3843
3844 $self->{content_model} = PCDATA_CONTENT_MODEL;
3845
3846 if ($token->{type} == END_TAG_TOKEN and
3847 $token->{tag_name} eq 'script') {
3848 !!!cp ('t47');
3849 ## Ignore the token
3850 } else {
3851 !!!cp ('t48');
3852 !!!parse-error (type => 'in CDATA:#eof', token => $token);
3853 ## ISSUE: And ignore?
3854 ## TODO: mark as "already executed"
3855 }
3856
3857 if (defined $self->{inner_html_node}) {
3858 !!!cp ('t49');
3859 ## TODO: mark as "already executed"
3860 } else {
3861 !!!cp ('t50');
3862 ## TODO: $old_insertion_point = current insertion point
3863 ## TODO: insertion point = just before the next input character
3864
3865 $insert->($script_el);
3866
3867 ## TODO: insertion point = $old_insertion_point (might be "undefined")
3868
3869 ## TODO: if there is a script that will execute as soon as the parser resume, then...
3870 }
3871
3872 !!!next-token;
3873 }; # $script_start_tag
3874
3875 ## NOTE: $open_tables->[-1]->[0] is the "current table" element node.
3876 ## NOTE: $open_tables->[-1]->[1] is the "tainted" flag.
3877 my $open_tables = [[$self->{open_elements}->[0]->[0]]];
3878
3879 my $formatting_end_tag = sub {
3880 my $end_tag_token = shift;
3881 my $tag_name = $end_tag_token->{tag_name};
3882
3883 ## NOTE: The adoption agency algorithm (AAA).
3884
3885 FET: {
3886 ## Step 1
3887 my $formatting_element;
3888 my $formatting_element_i_in_active;
3889 AFE: for (reverse 0..$#$active_formatting_elements) {
3890 if ($active_formatting_elements->[$_]->[0] eq '#marker') {
3891 !!!cp ('t52');
3892 last AFE;
3893 } elsif ($active_formatting_elements->[$_]->[0]->manakai_local_name
3894 eq $tag_name) {
3895 !!!cp ('t51');
3896 $formatting_element = $active_formatting_elements->[$_];
3897 $formatting_element_i_in_active = $_;
3898 last AFE;
3899 }
3900 } # AFE
3901 unless (defined $formatting_element) {
3902 !!!cp ('t53');
3903 !!!parse-error (type => 'unmatched end tag', text => $tag_name, token => $end_tag_token);
3904 ## Ignore the token
3905 !!!next-token;
3906 return;
3907 }
3908 ## has an element in scope
3909 my $in_scope = 1;
3910 my $formatting_element_i_in_open;
3911 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3912 my $node = $self->{open_elements}->[$_];
3913 if ($node->[0] eq $formatting_element->[0]) {
3914 if ($in_scope) {
3915 !!!cp ('t54');
3916 $formatting_element_i_in_open = $_;
3917 last INSCOPE;
3918 } else { # in open elements but not in scope
3919 !!!cp ('t55');
3920 !!!parse-error (type => 'unmatched end tag',
3921 text => $token->{tag_name},
3922 token => $end_tag_token);
3923 ## Ignore the token
3924 !!!next-token;
3925 return;
3926 }
3927 } elsif ($node->[1] & SCOPING_EL) {
3928 !!!cp ('t56');
3929 $in_scope = 0;
3930 }
3931 } # INSCOPE
3932 unless (defined $formatting_element_i_in_open) {
3933 !!!cp ('t57');
3934 !!!parse-error (type => 'unmatched end tag',
3935 text => $token->{tag_name},
3936 token => $end_tag_token);
3937 pop @$active_formatting_elements; # $formatting_element
3938 !!!next-token; ## TODO: ok?
3939 return;
3940 }
3941 if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
3942 !!!cp ('t58');
3943 !!!parse-error (type => 'not closed',
3944 text => $self->{open_elements}->[-1]->[0]
3945 ->manakai_local_name,
3946 token => $end_tag_token);
3947 }
3948
3949 ## Step 2
3950 my $furthest_block;
3951 my $furthest_block_i_in_open;
3952 OE: for (reverse 0..$#{$self->{open_elements}}) {
3953 my $node = $self->{open_elements}->[$_];
3954 if (not ($node->[1] & FORMATTING_EL) and
3955 #not $phrasing_category->{$node->[1]} and
3956 ($node->[1] & SPECIAL_EL or
3957 $node->[1] & SCOPING_EL)) { ## Scoping is redundant, maybe
3958 !!!cp ('t59');
3959 $furthest_block = $node;
3960 $furthest_block_i_in_open = $_;
3961 } elsif ($node->[0] eq $formatting_element->[0]) {
3962 !!!cp ('t60');
3963 last OE;
3964 }
3965 } # OE
3966
3967 ## Step 3
3968 unless (defined $furthest_block) { # MUST
3969 !!!cp ('t61');
3970 splice @{$self->{open_elements}}, $formatting_element_i_in_open;
3971 splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
3972 !!!next-token;
3973 return;
3974 }
3975
3976 ## Step 4
3977 my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];
3978
3979 ## Step 5
3980 my $furthest_block_parent = $furthest_block->[0]->parent_node;
3981 if (defined $furthest_block_parent) {
3982 !!!cp ('t62');
3983 $furthest_block_parent->remove_child ($furthest_block->[0]);
3984 }
3985
3986 ## Step 6
3987 my $bookmark_prev_el
3988 = $active_formatting_elements->[$formatting_element_i_in_active - 1]
3989 ->[0];
3990
3991 ## Step 7
3992 my $node = $furthest_block;
3993 my $node_i_in_open = $furthest_block_i_in_open;
3994 my $last_node = $furthest_block;
3995 S7: {
3996 ## Step 1
3997 $node_i_in_open--;
3998 $node = $self->{open_elements}->[$node_i_in_open];
3999
4000 ## Step 2
4001 my $node_i_in_active;
4002 S7S2: {
4003 for (reverse 0..$#$active_formatting_elements) {
4004 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
4005 !!!cp ('t63');
4006 $node_i_in_active = $_;
4007 last S7S2;
4008 }
4009 }
4010 splice @{$self->{open_elements}}, $node_i_in_open, 1;
4011 redo S7;
4012 } # S7S2
4013
4014 ## Step 3
4015 last S7 if $node->[0] eq $formatting_element->[0];
4016
4017 ## Step 4
4018 if ($last_node->[0] eq $furthest_block->[0]) {
4019 !!!cp ('t64');
4020 $bookmark_prev_el = $node->[0];
4021 }
4022
4023 ## Step 5
4024 if ($node->[0]->has_child_nodes ()) {
4025 !!!cp ('t65');
4026 my $clone = [$node->[0]->clone_node (0), $node->[1]];
4027 $active_formatting_elements->[$node_i_in_active] = $clone;
4028 $self->{open_elements}->[$node_i_in_open] = $clone;
4029 $node = $clone;
4030 }
4031
4032 ## Step 6
4033 $node->[0]->append_child ($last_node->[0]);
4034
4035 ## Step 7
4036 $last_node = $node;
4037
4038 ## Step 8
4039 redo S7;
4040 } # S7
4041
4042 ## Step 8
4043 if ($common_ancestor_node->[1] & TABLE_ROWS_EL) {
4044 my $foster_parent_element;
4045 my $next_sibling;
4046 OE: for (reverse 0..$#{$self->{open_elements}}) {
4047 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
4048 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4049 if (defined $parent and $parent->node_type == 1) {
4050 !!!cp ('t65.1');
4051 $foster_parent_element = $parent;
4052 $next_sibling = $self->{open_elements}->[$_]->[0];
4053 } else {
4054 !!!cp ('t65.2');
4055 $foster_parent_element
4056 = $self->{open_elements}->[$_ - 1]->[0];
4057 }
4058 last OE;
4059 }
4060 } # OE
4061 $foster_parent_element = $self->{open_elements}->[0]->[0]
4062 unless defined $foster_parent_element;
4063 $foster_parent_element->insert_before ($last_node->[0], $next_sibling);
4064 $open_tables->[-1]->[1] = 1; # tainted
4065 } else {
4066 !!!cp ('t65.3');
4067 $common_ancestor_node->[0]->append_child ($last_node->[0]);
4068 }
4069
4070 ## Step 9
4071 my $clone = [$formatting_element->[0]->clone_node (0),
4072 $formatting_element->[1]];
4073
4074 ## Step 10
4075 my @cn = @{$furthest_block->[0]->child_nodes};
4076 $clone->[0]->append_child ($_) for @cn;
4077
4078 ## Step 11
4079 $furthest_block->[0]->append_child ($clone->[0]);
4080
4081 ## Step 12
4082 my $i;
4083 AFE: for (reverse 0..$#$active_formatting_elements) {
4084 if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
4085 !!!cp ('t66');
4086 splice @$active_formatting_elements, $_, 1;
4087 $i-- and last AFE if defined $i;
4088 } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
4089 !!!cp ('t67');
4090 $i = $_;
4091 }
4092 } # AFE
4093 splice @$active_formatting_elements, $i + 1, 0, $clone;
4094
4095 ## Step 13
4096 undef $i;
4097 OE: for (reverse 0..$#{$self->{open_elements}}) {
4098 if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
4099 !!!cp ('t68');
4100 splice @{$self->{open_elements}}, $_, 1;
4101 $i-- and last OE if defined $i;
4102 } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
4103 !!!cp ('t69');
4104 $i = $_;
4105 }
4106 } # OE
4107 splice @{$self->{open_elements}}, $i + 1, 1, $clone;
4108
4109 ## Step 14
4110 redo FET;
4111 } # FET
4112 }; # $formatting_end_tag
4113
## Default insertion routine: append the node given as the first
## argument as the last child of the current node (the last entry of
## the stack of open elements).  The same closure is also stored in
## |$insert|.
$insert = my $insert_to_current = sub {
  my $child = shift;
  my $current_node = $self->{open_elements}->[-1];
  $current_node->[0]->append_child ($child);
}; # $insert_to_current
4117
## Insert |$child| (the first argument) into the tree, foster-parenting
## it when the current node is flagged |TABLE_ROWS_EL|.
##
## - Current node is not |TABLE_ROWS_EL|: |$child| is simply appended
##   to the current node.
## - Otherwise the stack of open elements is searched from the top for
##   the nearest |TABLE_EL| entry:
##     - if that entry's node has a parent whose |node_type| is 1
##       (an element), |$child| is inserted into that parent just
##       before the |TABLE_EL| node;
##     - if not, |$child| is appended (|$next_sibling| stays undefined)
##       to the element one below it in the stack;
##     - if no |TABLE_EL| entry exists at all, |$child| is appended to
##       the first element of the stack.
##   The innermost entry of |$open_tables| is then marked as tainted.
my $insert_to_foster = sub {
  my $child = shift;
  my $open = $self->{open_elements};

  unless ($open->[-1]->[1] & TABLE_ROWS_EL) {
    !!!cp ('t72');
    return $open->[-1]->[0]->append_child ($child);
  }

  # MUST
  my ($foster_parent_element, $next_sibling);
  OE: for my $i (reverse 0..$#$open) {
    next OE unless $open->[$i]->[1] & TABLE_EL;

    my $table_el = $open->[$i]->[0];
    my $parent = $table_el->parent_node;
    if (defined $parent and $parent->node_type == 1) {
      !!!cp ('t70');
      $foster_parent_element = $parent;
      $next_sibling = $table_el;
    } else {
      !!!cp ('t71');
      $foster_parent_element = $open->[$i - 1]->[0];
    }
    last OE;
  } # OE

  ## No |TABLE_EL| entry in the stack: fall back to the first entry.
  unless (defined $foster_parent_element) {
    $foster_parent_element = $open->[0]->[0];
  }

  $foster_parent_element->insert_before ($child, $next_sibling);
  $open_tables->[-1]->[1] = 1; # tainted
}; # $insert_to_foster
4149
4150 B: while (1) {
4151 if ($token->{type} == DOCTYPE_TOKEN) {
4152 !!!cp ('t73');
4153 !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
4154 ## Ignore the token
4155 ## Stay in the phase
4156 !!!next-token;
4157 next B;
4158 } elsif ($token->{type} == START_TAG_TOKEN and
4159 $token->{tag_name} eq 'html') {
4160 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4161 !!!cp ('t79');
4162 !!!parse-error (type => 'after html', text => 'html', token => $token);
4163 $self->{insertion_mode} = AFTER_BODY_IM;
4164 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4165 !!!cp ('t80');
4166 !!!parse-error (type => 'after html', text => 'html', token => $token);
4167 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4168 } else {
4169 !!!cp ('t81');
4170 }
4171
4172 !!!cp ('t82');
4173 !!!parse-error (type => 'not first start tag', token => $token);
4174 my $top_el = $self->{open_elements}->[0]->[0];
4175 for my $attr_name (keys %{$token->{attributes}}) {
4176 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
4177 !!!cp ('t84');
4178 $top_el->set_attribute_ns
4179 (undef, [undef, $attr_name],
4180 $token->{attributes}->{$attr_name}->{value});
4181 }
4182 }
4183 !!!nack ('t84.1');
4184 !!!next-token;
4185 next B;
4186 } elsif ($token->{type} == COMMENT_TOKEN) {
4187 my $comment = $self->{document}->create_comment ($token->{data});
4188 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
4189 !!!cp ('t85');
4190 $self->{document}->append_child ($comment);
4191 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
4192 !!!cp ('t86');
4193 $self->{open_elements}->[0]->[0]->append_child ($comment);
4194 } else {
4195 !!!cp ('t87');
4196 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4197 }
4198 !!!next-token;
4199 next B;
4200 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
4201 if ($token->{type} == CHARACTER_TOKEN) {
4202 !!!cp ('t87.1');
4203 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4204 !!!next-token;
4205 next B;
4206 } elsif ($token->{type} == START_TAG_TOKEN) {
4207 if ((not {mglyph => 1, malignmark => 1}->{$token->{tag_name}} and
4208 $self->{open_elements}->[-1]->[1] & FOREIGN_FLOW_CONTENT_EL) or
4209 not ($self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
4210 ($token->{tag_name} eq 'svg' and
4211 $self->{open_elements}->[-1]->[1] & MML_AXML_EL)) {
4212 ## NOTE: "using the rules for secondary insertion mode"then"continue"
4213 !!!cp ('t87.2');
4214 #
4215 } elsif ({
4216 b => 1, big => 1, blockquote => 1, body => 1, br => 1,
4217 center => 1, code => 1, dd => 1, div => 1, dl => 1, dt => 1,
4218 em => 1, embed => 1, font => 1, h1 => 1, h2 => 1, h3 => 1,
4219 h4 => 1, h5 => 1, h6 => 1, head => 1, hr => 1, i => 1,
4220 img => 1, li => 1, listing => 1, menu => 1, meta => 1,
4221 nobr => 1, ol => 1, p => 1, pre => 1, ruby => 1, s => 1,
4222 small => 1, span => 1, strong => 1, strike => 1, sub => 1,
4223 sup => 1, table => 1, tt => 1, u => 1, ul => 1, var => 1,
4224 }->{$token->{tag_name}}) {
4225 !!!cp ('t87.2');
4226 !!!parse-error (type => 'not closed',
4227 text => $self->{open_elements}->[-1]->[0]
4228 ->manakai_local_name,
4229 token => $token);
4230
4231 pop @{$self->{open_elements}}
4232 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4233
4234 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4235 ## Reprocess.
4236 next B;
4237 } else {
4238 my $nsuri = $self->{open_elements}->[-1]->[0]->namespace_uri;
4239 my $tag_name = $token->{tag_name};
4240 if ($nsuri eq $SVG_NS) {
4241 $tag_name = {
4242 altglyph => 'altGlyph',
4243 altglyphdef => 'altGlyphDef',
4244 altglyphitem => 'altGlyphItem',
4245 animatecolor => 'animateColor',
4246 animatemotion => 'animateMotion',
4247 animatetransform => 'animateTransform',
4248 clippath => 'clipPath',
4249 feblend => 'feBlend',
4250 fecolormatrix => 'feColorMatrix',
4251 fecomponenttransfer => 'feComponentTransfer',
4252 fecomposite => 'feComposite',
4253 feconvolvematrix => 'feConvolveMatrix',
4254 fediffuselighting => 'feDiffuseLighting',
4255 fedisplacementmap => 'feDisplacementMap',
4256 fedistantlight => 'feDistantLight',
4257 feflood => 'feFlood',
4258 fefunca => 'feFuncA',
4259 fefuncb => 'feFuncB',
4260 fefuncg => 'feFuncG',
4261 fefuncr => 'feFuncR',
4262 fegaussianblur => 'feGaussianBlur',
4263 feimage => 'feImage',
4264 femerge => 'feMerge',
4265 femergenode => 'feMergeNode',
4266 femorphology => 'feMorphology',
4267 feoffset => 'feOffset',
4268 fepointlight => 'fePointLight',
4269 fespecularlighting => 'feSpecularLighting',
4270 fespotlight => 'feSpotLight',
4271 fetile => 'feTile',
4272 feturbulence => 'feTurbulence',
4273 foreignobject => 'foreignObject',
4274 glyphref => 'glyphRef',
4275 lineargradient => 'linearGradient',
4276 radialgradient => 'radialGradient',
4277 #solidcolor => 'solidColor', ## NOTE: Commented in spec (SVG1.2)
4278 textpath => 'textPath',
4279 }->{$tag_name} || $tag_name;
4280 }
4281
4282 ## "adjust SVG attributes" (SVG only) - done in insert-element-f
4283
4284 ## "adjust foreign attributes" - done in insert-element-f
4285
4286 !!!insert-element-f ($nsuri, $tag_name, $token->{attributes}, $token);
4287
4288 if ($self->{self_closing}) {
4289 pop @{$self->{open_elements}};
4290 !!!ack ('t87.3');
4291 } else {
4292 !!!cp ('t87.4');
4293 }
4294
4295 !!!next-token;
4296 next B;
4297 }
4298 } elsif ($token->{type} == END_TAG_TOKEN) {
4299 ## NOTE: "using the rules for secondary insertion mode" then "continue"
4300 !!!cp ('t87.5');
4301 #
4302 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4303 !!!cp ('t87.6');
4304 !!!parse-error (type => 'not closed',
4305 text => $self->{open_elements}->[-1]->[0]
4306 ->manakai_local_name,
4307 token => $token);
4308
4309 pop @{$self->{open_elements}}
4310 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4311
4312 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4313 ## Reprocess.
4314 next B;
4315 } else {
4316 die "$0: $token->{type}: Unknown token type";
4317 }
4318 }
4319
4320 if ($self->{insertion_mode} & HEAD_IMS) {
4321 if ($token->{type} == CHARACTER_TOKEN) {
4322 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4323 unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4324 !!!cp ('t88.2');
4325 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4326 } else {
4327 !!!cp ('t88.1');
4328 ## Ignore the token.
4329 !!!next-token;
4330 next B;
4331 }
4332 unless (length $token->{data}) {
4333 !!!cp ('t88');
4334 !!!next-token;
4335 next B;
4336 }
4337 }
4338
4339 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4340 !!!cp ('t89');
4341 ## As if <head>
4342 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4343 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4344 push @{$self->{open_elements}},
4345 [$self->{head_element}, $el_category->{head}];
4346
4347 ## Reprocess in the "in head" insertion mode...
4348 pop @{$self->{open_elements}};
4349
4350 ## Reprocess in the "after head" insertion mode...
4351 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4352 !!!cp ('t90');
4353 ## As if </noscript>
4354 pop @{$self->{open_elements}};
4355 !!!parse-error (type => 'in noscript:#text', token => $token);
4356
4357 ## Reprocess in the "in head" insertion mode...
4358 ## As if </head>
4359 pop @{$self->{open_elements}};
4360
4361 ## Reprocess in the "after head" insertion mode...
4362 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4363 !!!cp ('t91');
4364 pop @{$self->{open_elements}};
4365
4366 ## Reprocess in the "after head" insertion mode...
4367 } else {
4368 !!!cp ('t92');
4369 }
4370
4371 ## "after head" insertion mode
4372 ## As if <body>
4373 !!!insert-element ('body',, $token);
4374 $self->{insertion_mode} = IN_BODY_IM;
4375 ## reprocess
4376 next B;
4377 } elsif ($token->{type} == START_TAG_TOKEN) {
4378 if ($token->{tag_name} eq 'head') {
4379 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4380 !!!cp ('t93');
4381 !!!create-element ($self->{head_element}, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
4382 $self->{open_elements}->[-1]->[0]->append_child
4383 ($self->{head_element});
4384 push @{$self->{open_elements}},
4385 [$self->{head_element}, $el_category->{head}];
4386 $self->{insertion_mode} = IN_HEAD_IM;
4387 !!!nack ('t93.1');
4388 !!!next-token;
4389 next B;
4390 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4391 !!!cp ('t93.2');
4392 !!!parse-error (type => 'after head', text => 'head',
4393 token => $token);
4394 ## Ignore the token
4395 !!!nack ('t93.3');
4396 !!!next-token;
4397 next B;
4398 } else {
4399 !!!cp ('t95');
4400 !!!parse-error (type => 'in head:head',
4401 token => $token); # or in head noscript
4402 ## Ignore the token
4403 !!!nack ('t95.1');
4404 !!!next-token;
4405 next B;
4406 }
4407 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4408 !!!cp ('t96');
4409 ## As if <head>
4410 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4411 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4412 push @{$self->{open_elements}},
4413 [$self->{head_element}, $el_category->{head}];
4414
4415 $self->{insertion_mode} = IN_HEAD_IM;
4416 ## Reprocess in the "in head" insertion mode...
4417 } else {
4418 !!!cp ('t97');
4419 }
4420
4421 if ($token->{tag_name} eq 'base') {
4422 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4423 !!!cp ('t98');
4424 ## As if </noscript>
4425 pop @{$self->{open_elements}};
4426 !!!parse-error (type => 'in noscript', text => 'base',
4427 token => $token);
4428
4429 $self->{insertion_mode} = IN_HEAD_IM;
4430 ## Reprocess in the "in head" insertion mode...
4431 } else {
4432 !!!cp ('t99');
4433 }
4434
4435 ## NOTE: There is a "as if in head" code clone.
4436 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4437 !!!cp ('t100');
4438 !!!parse-error (type => 'after head',
4439 text => $token->{tag_name}, token => $token);
4440 push @{$self->{open_elements}},
4441 [$self->{head_element}, $el_category->{head}];
4442 } else {
4443 !!!cp ('t101');
4444 }
4445 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4446 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4447 pop @{$self->{open_elements}} # <head>
4448 if $self->{insertion_mode} == AFTER_HEAD_IM;
4449 !!!nack ('t101.1');
4450 !!!next-token;
4451 next B;
4452 } elsif ($token->{tag_name} eq 'link') {
4453 ## NOTE: There is a "as if in head" code clone.
4454 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4455 !!!cp ('t102');
4456 !!!parse-error (type => 'after head',
4457 text => $token->{tag_name}, token => $token);
4458 push @{$self->{open_elements}},
4459 [$self->{head_element}, $el_category->{head}];
4460 } else {
4461 !!!cp ('t103');
4462 }
4463 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4464 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4465 pop @{$self->{open_elements}} # <head>
4466 if $self->{insertion_mode} == AFTER_HEAD_IM;
4467 !!!ack ('t103.1');
4468 !!!next-token;
4469 next B;
4470 } elsif ($token->{tag_name} eq 'meta') {
4471 ## NOTE: There is a "as if in head" code clone.
4472 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4473 !!!cp ('t104');
4474 !!!parse-error (type => 'after head',
4475 text => $token->{tag_name}, token => $token);
4476 push @{$self->{open_elements}},
4477 [$self->{head_element}, $el_category->{head}];
4478 } else {
4479 !!!cp ('t105');
4480 }
4481 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4482 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4483
4484 unless ($self->{confident}) {
4485 if ($token->{attributes}->{charset}) {
4486 !!!cp ('t106');
4487 ## NOTE: Whether the encoding is supported or not is handled
4488 ## in the {change_encoding} callback.
4489 $self->{change_encoding}
4490 ->($self, $token->{attributes}->{charset}->{value},
4491 $token);
4492
4493 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4494 ->set_user_data (manakai_has_reference =>
4495 $token->{attributes}->{charset}
4496 ->{has_reference});
4497 } elsif ($token->{attributes}->{content}) {
4498 if ($token->{attributes}->{content}->{value}
4499 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4500 [\x09-\x0D\x20]*=
4501 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4502 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
4503 !!!cp ('t107');
4504 ## NOTE: Whether the encoding is supported or not is handled
4505 ## in the {change_encoding} callback.
4506 $self->{change_encoding}
4507 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
4508 $token);
4509 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4510 ->set_user_data (manakai_has_reference =>
4511 $token->{attributes}->{content}
4512 ->{has_reference});
4513 } else {
4514 !!!cp ('t108');
4515 }
4516 }
4517 } else {
4518 if ($token->{attributes}->{charset}) {
4519 !!!cp ('t109');
4520 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4521 ->set_user_data (manakai_has_reference =>
4522 $token->{attributes}->{charset}
4523 ->{has_reference});
4524 }
4525 if ($token->{attributes}->{content}) {
4526 !!!cp ('t110');
4527 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4528 ->set_user_data (manakai_has_reference =>
4529 $token->{attributes}->{content}
4530 ->{has_reference});
4531 }
4532 }
4533
4534 pop @{$self->{open_elements}} # <head>
4535 if $self->{insertion_mode} == AFTER_HEAD_IM;
4536 !!!ack ('t110.1');
4537 !!!next-token;
4538 next B;
4539 } elsif ($token->{tag_name} eq 'title') {
4540 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4541 !!!cp ('t111');
4542 ## As if </noscript>
4543 pop @{$self->{open_elements}};
4544 !!!parse-error (type => 'in noscript', text => 'title',
4545 token => $token);
4546
4547 $self->{insertion_mode} = IN_HEAD_IM;
4548 ## Reprocess in the "in head" insertion mode...
4549 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4550 !!!cp ('t112');
4551 !!!parse-error (type => 'after head',
4552 text => $token->{tag_name}, token => $token);
4553 push @{$self->{open_elements}},
4554 [$self->{head_element}, $el_category->{head}];
4555 } else {
4556 !!!cp ('t113');
4557 }
4558
4559 ## NOTE: There is a "as if in head" code clone.
4560 my $parent = defined $self->{head_element} ? $self->{head_element}
4561 : $self->{open_elements}->[-1]->[0];
4562 $parse_rcdata->(RCDATA_CONTENT_MODEL);
4563 pop @{$self->{open_elements}} # <head>
4564 if $self->{insertion_mode} == AFTER_HEAD_IM;
4565 next B;
4566 } elsif ($token->{tag_name} eq 'style' or
4567 $token->{tag_name} eq 'noframes') {
4568 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
4569 ## insertion mode IN_HEAD_IM)
4570 ## NOTE: There is a "as if in head" code clone.
4571 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4572 !!!cp ('t114');
4573 !!!parse-error (type => 'after head',
4574 text => $token->{tag_name}, token => $token);
4575 push @{$self->{open_elements}},
4576 [$self->{head_element}, $el_category->{head}];
4577 } else {
4578 !!!cp ('t115');
4579 }
4580 $parse_rcdata->(CDATA_CONTENT_MODEL);
4581 pop @{$self->{open_elements}} # <head>
4582 if $self->{insertion_mode} == AFTER_HEAD_IM;
4583 next B;
4584 } elsif ($token->{tag_name} eq 'noscript') {
4585 if ($self->{insertion_mode} == IN_HEAD_IM) {
4586 !!!cp ('t116');
4587 ## NOTE: and scripting is disabled
4588 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4589 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
4590 !!!nack ('t116.1');
4591 !!!next-token;
4592 next B;
4593 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4594 !!!cp ('t117');
4595 !!!parse-error (type => 'in noscript', text => 'noscript',
4596 token => $token);
4597 ## Ignore the token
4598 !!!nack ('t117.1');
4599 !!!next-token;
4600 next B;
4601 } else {
4602 !!!cp ('t118');
4603 #
4604 }
4605 } elsif ($token->{tag_name} eq 'script') {
4606 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4607 !!!cp ('t119');
4608 ## As if </noscript>
4609 pop @{$self->{open_elements}};
4610 !!!parse-error (type => 'in noscript', text => 'script',
4611 token => $token);
4612
4613 $self->{insertion_mode} = IN_HEAD_IM;
4614 ## Reprocess in the "in head" insertion mode...
4615 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4616 !!!cp ('t120');
4617 !!!parse-error (type => 'after head',
4618 text => $token->{tag_name}, token => $token);
4619 push @{$self->{open_elements}},
4620 [$self->{head_element}, $el_category->{head}];
4621 } else {
4622 !!!cp ('t121');
4623 }
4624
4625 ## NOTE: There is a "as if in head" code clone.
4626 $script_start_tag->();
4627 pop @{$self->{open_elements}} # <head>
4628 if $self->{insertion_mode} == AFTER_HEAD_IM;
4629 next B;
4630 } elsif ($token->{tag_name} eq 'body' or
4631 $token->{tag_name} eq 'frameset') {
4632 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4633 !!!cp ('t122');
4634 ## As if </noscript>
4635 pop @{$self->{open_elements}};
4636 !!!parse-error (type => 'in noscript',
4637 text => $token->{tag_name}, token => $token);
4638
4639 ## Reprocess in the "in head" insertion mode...
4640 ## As if </head>
4641 pop @{$self->{open_elements}};
4642
4643 ## Reprocess in the "after head" insertion mode...
4644 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4645 !!!cp ('t124');
4646 pop @{$self->{open_elements}};
4647
4648 ## Reprocess in the "after head" insertion mode...
4649 } else {
4650 !!!cp ('t125');
4651 }
4652
4653 ## "after head" insertion mode
4654 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4655 if ($token->{tag_name} eq 'body') {
4656 !!!cp ('t126');
4657 $self->{insertion_mode} = IN_BODY_IM;
4658 } elsif ($token->{tag_name} eq 'frameset') {
4659 !!!cp ('t127');
4660 $self->{insertion_mode} = IN_FRAMESET_IM;
4661 } else {
4662 die "$0: tag name: $self->{tag_name}";
4663 }
4664 !!!nack ('t127.1');
4665 !!!next-token;
4666 next B;
4667 } else {
4668 !!!cp ('t128');
4669 #
4670 }
4671
4672 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4673 !!!cp ('t129');
4674 ## As if </noscript>
4675 pop @{$self->{open_elements}};
4676 !!!parse-error (type => 'in noscript:/',
4677 text => $token->{tag_name}, token => $token);
4678
4679 ## Reprocess in the "in head" insertion mode...
4680 ## As if </head>
4681 pop @{$self->{open_elements}};
4682
4683 ## Reprocess in the "after head" insertion mode...
4684 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4685 !!!cp ('t130');
4686 ## As if </head>
4687 pop @{$self->{open_elements}};
4688
4689 ## Reprocess in the "after head" insertion mode...
4690 } else {
4691 !!!cp ('t131');
4692 }
4693
4694 ## "after head" insertion mode
4695 ## As if <body>
4696 !!!insert-element ('body',, $token);
4697 $self->{insertion_mode} = IN_BODY_IM;
4698 ## reprocess
4699 !!!ack-later;
4700 next B;
4701 } elsif ($token->{type} == END_TAG_TOKEN) {
4702 if ($token->{tag_name} eq 'head') {
4703 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4704 !!!cp ('t132');
4705 ## As if <head>
4706 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4707 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4708 push @{$self->{open_elements}},
4709 [$self->{head_element}, $el_category->{head}];
4710
4711 ## Reprocess in the "in head" insertion mode...
4712 pop @{$self->{open_elements}};
4713 $self->{insertion_mode} = AFTER_HEAD_IM;
4714 !!!next-token;
4715 next B;
4716 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4717 !!!cp ('t133');
4718 ## As if </noscript>
4719 pop @{$self->{open_elements}};
4720 !!!parse-error (type => 'in noscript:/',
4721 text => 'head', token => $token);
4722
4723 ## Reprocess in the "in head" insertion mode...
4724 pop @{$self->{open_elements}};
4725 $self->{insertion_mode} = AFTER_HEAD_IM;
4726 !!!next-token;
4727 next B;
4728 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4729 !!!cp ('t134');
4730 pop @{$self->{open_elements}};
4731 $self->{insertion_mode} = AFTER_HEAD_IM;
4732 !!!next-token;
4733 next B;
4734 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4735 !!!cp ('t134.1');
4736 !!!parse-error (type => 'unmatched end tag', text => 'head',
4737 token => $token);
4738 ## Ignore the token
4739 !!!next-token;
4740 next B;
4741 } else {
4742 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4743 }
4744 } elsif ($token->{tag_name} eq 'noscript') {
4745 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4746 !!!cp ('t136');
4747 pop @{$self->{open_elements}};
4748 $self->{insertion_mode} = IN_HEAD_IM;
4749 !!!next-token;
4750 next B;
4751 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM or
4752 $self->{insertion_mode} == AFTER_HEAD_IM) {
4753 !!!cp ('t137');
4754 !!!parse-error (type => 'unmatched end tag',
4755 text => 'noscript', token => $token);
4756 ## Ignore the token ## ISSUE: An issue in the spec.
4757 !!!next-token;
4758 next B;
4759 } else {
4760 !!!cp ('t138');
4761 #
4762 }
4763 } elsif ({
4764 body => 1, html => 1,
4765 }->{$token->{tag_name}}) {
4766 if ($self->{insertion_mode} == BEFORE_HEAD_IM or
4767 $self->{insertion_mode} == IN_HEAD_IM or
4768 $self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4769 !!!cp ('t140');
4770 !!!parse-error (type => 'unmatched end tag',
4771 text => $token->{tag_name}, token => $token);
4772 ## Ignore the token
4773 !!!next-token;
4774 next B;
4775 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4776 !!!cp ('t140.1');
4777 !!!parse-error (type => 'unmatched end tag',
4778 text => $token->{tag_name}, token => $token);
4779 ## Ignore the token
4780 !!!next-token;
4781 next B;
4782 } else {
4783 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4784 }
4785 } elsif ($token->{tag_name} eq 'p') {
4786 !!!cp ('t142');
4787 !!!parse-error (type => 'unmatched end tag',
4788 text => $token->{tag_name}, token => $token);
4789 ## Ignore the token
4790 !!!next-token;
4791 next B;
4792 } elsif ($token->{tag_name} eq 'br') {
4793 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4794 !!!cp ('t142.2');
4795 ## (before head) as if <head>, (in head) as if </head>
4796 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4797 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4798 $self->{insertion_mode} = AFTER_HEAD_IM;
4799
4800 ## Reprocess in the "after head" insertion mode...
4801 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4802 !!!cp ('t143.2');
4803 ## As if </head>
4804 pop @{$self->{open_elements}};
4805 $self->{insertion_mode} = AFTER_HEAD_IM;
4806
4807 ## Reprocess in the "after head" insertion mode...
4808 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4809 !!!cp ('t143.3');
4810 ## ISSUE: Two parse errors for <head><noscript></br>
4811 !!!parse-error (type => 'unmatched end tag',
4812 text => 'br', token => $token);
4813 ## As if </noscript>
4814 pop @{$self->{open_elements}};
4815 $self->{insertion_mode} = IN_HEAD_IM;
4816
4817 ## Reprocess in the "in head" insertion mode...
4818 ## As if </head>
4819 pop @{$self->{open_elements}};
4820 $self->{insertion_mode} = AFTER_HEAD_IM;
4821
4822 ## Reprocess in the "after head" insertion mode...
4823 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4824 !!!cp ('t143.4');
4825 #
4826 } else {
4827 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4828 }
4829
4830 ## ISSUE: does not agree with IE7 - it doesn't ignore </br>.
4831 !!!parse-error (type => 'unmatched end tag',
4832 text => 'br', token => $token);
4833 ## Ignore the token
4834 !!!next-token;
4835 next B;
4836 } else {
4837 !!!cp ('t145');
4838 !!!parse-error (type => 'unmatched end tag',
4839 text => $token->{tag_name}, token => $token);
4840 ## Ignore the token
4841 !!!next-token;
4842 next B;
4843 }
4844
4845 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4846 !!!cp ('t146');
4847 ## As if </noscript>
4848 pop @{$self->{open_elements}};
4849 !!!parse-error (type => 'in noscript:/',
4850 text => $token->{tag_name}, token => $token);
4851
4852 ## Reprocess in the "in head" insertion mode...
4853 ## As if </head>
4854 pop @{$self->{open_elements}};
4855
4856 ## Reprocess in the "after head" insertion mode...
4857 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4858 !!!cp ('t147');
4859 ## As if </head>
4860 pop @{$self->{open_elements}};
4861
4862 ## Reprocess in the "after head" insertion mode...
4863 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4864 ## ISSUE: This case cannot be reached?
4865 !!!cp ('t148');
4866 !!!parse-error (type => 'unmatched end tag',
4867 text => $token->{tag_name}, token => $token);
4868 ## Ignore the token ## ISSUE: An issue in the spec.
4869 !!!next-token;
4870 next B;
4871 } else {
4872 !!!cp ('t149');
4873 }
4874
4875 ## "after head" insertion mode
4876 ## As if <body>
4877 !!!insert-element ('body',, $token);
4878 $self->{insertion_mode} = IN_BODY_IM;
4879 ## reprocess
4880 next B;
4881 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4882 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4883 !!!cp ('t149.1');
4884
4885 ## NOTE: As if <head>
4886 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4887 $self->{open_elements}->[-1]->[0]->append_child
4888 ($self->{head_element});
4889 #push @{$self->{open_elements}},
4890 # [$self->{head_element}, $el_category->{head}];
4891 #$self->{insertion_mode} = IN_HEAD_IM;
4892 ## NOTE: Reprocess.
4893
4894 ## NOTE: As if </head>
4895 #pop @{$self->{open_elements}};
4896 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4897 ## NOTE: Reprocess.
4898
4899 #
4900 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4901 !!!cp ('t149.2');
4902
4903 ## NOTE: As if </head>
4904 pop @{$self->{open_elements}};
4905 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4906 ## NOTE: Reprocess.
4907
4908 #
4909 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4910 !!!cp ('t149.3');
4911
4912 !!!parse-error (type => 'in noscript:#eof', token => $token);
4913
4914 ## As if </noscript>
4915 pop @{$self->{open_elements}};
4916 #$self->{insertion_mode} = IN_HEAD_IM;
4917 ## NOTE: Reprocess.
4918
4919 ## NOTE: As if </head>
4920 pop @{$self->{open_elements}};
4921 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4922 ## NOTE: Reprocess.
4923
4924 #
4925 } else {
4926 !!!cp ('t149.4');
4927 #
4928 }
4929
4930 ## NOTE: As if <body>
4931 !!!insert-element ('body',, $token);
4932 $self->{insertion_mode} = IN_BODY_IM;
4933 ## NOTE: Reprocess.
4934 next B;
4935 } else {
4936 die "$0: $token->{type}: Unknown token type";
4937 }
4938
4939 ## ISSUE: An issue in the spec.
4940 } elsif ($self->{insertion_mode} & BODY_IMS) {
4941 if ($token->{type} == CHARACTER_TOKEN) {
4942 !!!cp ('t150');
4943 ## NOTE: There is a code clone of "character in body".
4944 $reconstruct_active_formatting_elements->($insert_to_current);
4945
4946 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4947
4948 !!!next-token;
4949 next B;
4950 } elsif ($token->{type} == START_TAG_TOKEN) {
4951 if ({
4952 caption => 1, col => 1, colgroup => 1, tbody => 1,
4953 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
4954 }->{$token->{tag_name}}) {
4955 if ($self->{insertion_mode} == IN_CELL_IM) {
4956 ## have an element in table scope
4957 for (reverse 0..$#{$self->{open_elements}}) {
4958 my $node = $self->{open_elements}->[$_];
4959 if ($node->[1] & TABLE_CELL_EL) {
4960 !!!cp ('t151');
4961
4962 ## Close the cell
4963 !!!back-token; # <x>
4964 $token = {type => END_TAG_TOKEN,
4965 tag_name => $node->[0]->manakai_local_name,
4966 line => $token->{line},
4967 column => $token->{column}};
4968 next B;
4969 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4970 !!!cp ('t152');
4971 ## ISSUE: This case can never be reached, maybe.
4972 last;
4973 }
4974 }
4975
4976 !!!cp ('t153');
4977 !!!parse-error (type => 'start tag not allowed',
4978 text => $token->{tag_name}, token => $token);
4979 ## Ignore the token
4980 !!!nack ('t153.1');
4981 !!!next-token;
4982 next B;
4983 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4984 !!!parse-error (type => 'not closed', text => 'caption',
4985 token => $token);
4986
4987 ## NOTE: As if </caption>.
4988 ## have a table element in table scope
4989 my $i;
4990 INSCOPE: {
4991 for (reverse 0..$#{$self->{open_elements}}) {
4992 my $node = $self->{open_elements}->[$_];
4993 if ($node->[1] & CAPTION_EL) {
4994 !!!cp ('t155');
4995 $i = $_;
4996 last INSCOPE;
4997 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4998 !!!cp ('t156');
4999 last;
5000 }
5001 }
5002
5003 !!!cp ('t157');
5004 !!!parse-error (type => 'start tag not allowed',
5005 text => $token->{tag_name}, token => $token);
5006 ## Ignore the token
5007 !!!nack ('t157.1');
5008 !!!next-token;
5009 next B;
5010 } # INSCOPE
5011
5012 ## generate implied end tags
5013 while ($self->{open_elements}->[-1]->[1]
5014 & END_TAG_OPTIONAL_EL) {
5015 !!!cp ('t158');
5016 pop @{$self->{open_elements}};
5017 }
5018
5019 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5020 !!!cp ('t159');
5021 !!!parse-error (type => 'not closed',
5022 text => $self->{open_elements}->[-1]->[0]
5023 ->manakai_local_name,
5024 token => $token);
5025 } else {
5026 !!!cp ('t160');
5027 }
5028
5029 splice @{$self->{open_elements}}, $i;
5030
5031 $clear_up_to_marker->();
5032
5033 $self->{insertion_mode} = IN_TABLE_IM;
5034
5035 ## reprocess
5036 !!!ack-later;
5037 next B;
5038 } else {
5039 !!!cp ('t161');
5040 #
5041 }
5042 } else {
5043 !!!cp ('t162');
5044 #
5045 }
5046 } elsif ($token->{type} == END_TAG_TOKEN) {
5047 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
5048 if ($self->{insertion_mode} == IN_CELL_IM) {
5049 ## have an element in table scope
5050 my $i;
5051 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5052 my $node = $self->{open_elements}->[$_];
5053 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5054 !!!cp ('t163');
5055 $i = $_;
5056 last INSCOPE;
5057 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5058 !!!cp ('t164');
5059 last INSCOPE;
5060 }
5061 } # INSCOPE
5062 unless (defined $i) {
5063 !!!cp ('t165');
5064 !!!parse-error (type => 'unmatched end tag',
5065 text => $token->{tag_name},
5066 token => $token);
5067 ## Ignore the token
5068 !!!next-token;
5069 next B;
5070 }
5071
5072 ## generate implied end tags
5073 while ($self->{open_elements}->[-1]->[1]
5074 & END_TAG_OPTIONAL_EL) {
5075 !!!cp ('t166');
5076 pop @{$self->{open_elements}};
5077 }
5078
5079 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
5080 ne $token->{tag_name}) {
5081 !!!cp ('t167');
5082 !!!parse-error (type => 'not closed',
5083 text => $self->{open_elements}->[-1]->[0]
5084 ->manakai_local_name,
5085 token => $token);
5086 } else {
5087 !!!cp ('t168');
5088 }
5089
5090 splice @{$self->{open_elements}}, $i;
5091
5092 $clear_up_to_marker->();
5093
5094 $self->{insertion_mode} = IN_ROW_IM;
5095
5096 !!!next-token;
5097 next B;
5098 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
5099 !!!cp ('t169');
5100 !!!parse-error (type => 'unmatched end tag',
5101 text => $token->{tag_name}, token => $token);
5102 ## Ignore the token
5103 !!!next-token;
5104 next B;
5105 } else {
5106 !!!cp ('t170');
5107 #
5108 }
5109 } elsif ($token->{tag_name} eq 'caption') {
5110 if ($self->{insertion_mode} == IN_CAPTION_IM) {
5111 ## have a table element in table scope
5112 my $i;
5113 INSCOPE: {
5114 for (reverse 0..$#{$self->{open_elements}}) {
5115 my $node = $self->{open_elements}->[$_];
5116 if ($node->[1] & CAPTION_EL) {
5117 !!!cp ('t171');
5118 $i = $_;
5119 last INSCOPE;
5120 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5121 !!!cp ('t172');
5122 last;
5123 }
5124 }
5125
5126 !!!cp ('t173');
5127 !!!parse-error (type => 'unmatched end tag',
5128 text => $token->{tag_name}, token => $token);
5129 ## Ignore the token
5130 !!!next-token;
5131 next B;
5132 } # INSCOPE
5133
5134 ## generate implied end tags
5135 while ($self->{open_elements}->[-1]->[1]
5136 & END_TAG_OPTIONAL_EL) {
5137 !!!cp ('t174');
5138 pop @{$self->{open_elements}};
5139 }
5140
5141 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5142 !!!cp ('t175');
5143 !!!parse-error (type => 'not closed',
5144 text => $self->{open_elements}->[-1]->[0]
5145 ->manakai_local_name,
5146 token => $token);
5147 } else {
5148 !!!cp ('t176');
5149 }
5150
5151 splice @{$self->{open_elements}}, $i;
5152
5153 $clear_up_to_marker->();
5154
5155 $self->{insertion_mode} = IN_TABLE_IM;
5156
5157 !!!next-token;
5158 next B;
5159 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
5160 !!!cp ('t177');
5161 !!!parse-error (type => 'unmatched end tag',
5162 text => $token->{tag_name}, token => $token);
5163 ## Ignore the token
5164 !!!next-token;
5165 next B;
5166 } else {
5167 !!!cp ('t178');
5168 #
5169 }
5170 } elsif ({
5171 table => 1, tbody => 1, tfoot => 1,
5172 thead => 1, tr => 1,
5173 }->{$token->{tag_name}} and
5174 $self->{insertion_mode} == IN_CELL_IM) {
5175 ## have an element in table scope
5176 my $i;
5177 my $tn;
5178 INSCOPE: {
5179 for (reverse 0..$#{$self->{open_elements}}) {
5180 my $node = $self->{open_elements}->[$_];
5181 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5182 !!!cp ('t179');
5183 $i = $_;
5184
5185 ## Close the cell
5186 !!!back-token; # </x>
5187 $token = {type => END_TAG_TOKEN, tag_name => $tn,
5188 line => $token->{line},
5189 column => $token->{column}};
5190 next B;
5191 } elsif ($node->[1] & TABLE_CELL_EL) {
5192 !!!cp ('t180');
5193 $tn = $node->[0]->manakai_local_name;
5194 ## NOTE: There is exactly one |td| or |th| element
5195 ## in scope in the stack of open elements by definition.
5196 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5197 ## ISSUE: Can this be reached?
5198 !!!cp ('t181');
5199 last;
5200 }
5201 }
5202
5203 !!!cp ('t182');
5204 !!!parse-error (type => 'unmatched end tag',
5205 text => $token->{tag_name}, token => $token);
5206 ## Ignore the token
5207 !!!next-token;
5208 next B;
5209 } # INSCOPE
5210 } elsif ($token->{tag_name} eq 'table' and
5211 $self->{insertion_mode} == IN_CAPTION_IM) {
5212 !!!parse-error (type => 'not closed', text => 'caption',
5213 token => $token);
5214
5215 ## As if </caption>
5216 ## have a table element in table scope
5217 my $i;
5218 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5219 my $node = $self->{open_elements}->[$_];
5220 if ($node->[1] & CAPTION_EL) {
5221 !!!cp ('t184');
5222 $i = $_;
5223 last INSCOPE;
5224 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5225 !!!cp ('t185');
5226 last INSCOPE;
5227 }
5228 } # INSCOPE
5229 unless (defined $i) {
5230 !!!cp ('t186');
5231 !!!parse-error (type => 'unmatched end tag',
5232 text => 'caption', token => $token);
5233 ## Ignore the token
5234 !!!next-token;
5235 next B;
5236 }
5237
5238 ## generate implied end tags
5239 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5240 !!!cp ('t187');
5241 pop @{$self->{open_elements}};
5242 }
5243
5244 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5245 !!!cp ('t188');
5246 !!!parse-error (type => 'not closed',
5247 text => $self->{open_elements}->[-1]->[0]
5248 ->manakai_local_name,
5249 token => $token);
5250 } else {
5251 !!!cp ('t189');
5252 }
5253
5254 splice @{$self->{open_elements}}, $i;
5255
5256 $clear_up_to_marker->();
5257
5258 $self->{insertion_mode} = IN_TABLE_IM;
5259
5260 ## reprocess
5261 next B;
5262 } elsif ({
5263 body => 1, col => 1, colgroup => 1, html => 1,
5264 }->{$token->{tag_name}}) {
5265 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
5266 !!!cp ('t190');
5267 !!!parse-error (type => 'unmatched end tag',
5268 text => $token->{tag_name}, token => $token);
5269 ## Ignore the token
5270 !!!next-token;
5271 next B;
5272 } else {
5273 !!!cp ('t191');
5274 #
5275 }
5276 } elsif ({
5277 tbody => 1, tfoot => 1,
5278 thead => 1, tr => 1,
5279 }->{$token->{tag_name}} and
5280 $self->{insertion_mode} == IN_CAPTION_IM) {
5281 !!!cp ('t192');
5282 !!!parse-error (type => 'unmatched end tag',
5283 text => $token->{tag_name}, token => $token);
5284 ## Ignore the token
5285 !!!next-token;
5286 next B;
5287 } else {
5288 !!!cp ('t193');
5289 #
5290 }
5291 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5292 for my $entry (@{$self->{open_elements}}) {
5293 unless ($entry->[1] & ALL_END_TAG_OPTIONAL_EL) {
5294 !!!cp ('t75');
5295 !!!parse-error (type => 'in body:#eof', token => $token);
5296 last;
5297 }
5298 }
5299
5300 ## Stop parsing.
5301 last B;
5302 } else {
5303 die "$0: $token->{type}: Unknown token type";
5304 }
5305
5306 $insert = $insert_to_current;
5307 #
5308 } elsif ($self->{insertion_mode} & TABLE_IMS) {
5309 if ($token->{type} == CHARACTER_TOKEN) {
5310 if (not $open_tables->[-1]->[1] and # tainted
5311 $token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5312 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5313
5314 unless (length $token->{data}) {
5315 !!!cp ('t194');
5316 !!!next-token;
5317 next B;
5318 } else {
5319 !!!cp ('t195');
5320 }
5321 }
5322
5323 !!!parse-error (type => 'in table:#text', token => $token);
5324
5325 ## As if in body, but insert into foster parent element
5326 ## ISSUE: Spec says that "whenever a node would be inserted
5327 ## into the current node" while characters might not
5328 ## result in a new Text node.
5329 $reconstruct_active_formatting_elements->($insert_to_foster);
5330
5331 if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
5332 # MUST
5333 my $foster_parent_element;
5334 my $next_sibling;
5335 my $prev_sibling;
5336 OE: for (reverse 0..$#{$self->{open_elements}}) {
5337 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
5338 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
5339 if (defined $parent and $parent->node_type == 1) {
5340 !!!cp ('t196');
5341 $foster_parent_element = $parent;
5342 $next_sibling = $self->{open_elements}->[$_]->[0];
5343 $prev_sibling = $next_sibling->previous_sibling;
5344 } else {
5345 !!!cp ('t197');
5346 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
5347 $prev_sibling = $foster_parent_element->last_child;
5348 }
5349 last OE;
5350 }
5351 } # OE
5352 $foster_parent_element = $self->{open_elements}->[0]->[0] and
5353 $prev_sibling = $foster_parent_element->last_child
5354 unless defined $foster_parent_element;
5355 if (defined $prev_sibling and
5356 $prev_sibling->node_type == 3) {
5357 !!!cp ('t198');
5358 $prev_sibling->manakai_append_text ($token->{data});
5359 } else {
5360 !!!cp ('t199');
5361 $foster_parent_element->insert_before
5362 ($self->{document}->create_text_node ($token->{data}),
5363 $next_sibling);
5364 }
5365 $open_tables->[-1]->[1] = 1; # tainted
5366 } else {
5367 !!!cp ('t200');
5368 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5369 }
5370
5371 !!!next-token;
5372 next B;
5373 } elsif ($token->{type} == START_TAG_TOKEN) {
5374 if ({
5375 tr => ($self->{insertion_mode} != IN_ROW_IM),
5376 th => 1, td => 1,
5377 }->{$token->{tag_name}}) {
5378 if ($self->{insertion_mode} == IN_TABLE_IM) {
5379 ## Clear back to table context
5380 while (not ($self->{open_elements}->[-1]->[1]
5381 & TABLE_SCOPING_EL)) {
5382 !!!cp ('t201');
5383 pop @{$self->{open_elements}};
5384 }
5385
5386 !!!insert-element ('tbody',, $token);
5387 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5388 ## reprocess in the "in table body" insertion mode...
5389 }
5390
5391 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5392 unless ($token->{tag_name} eq 'tr') {
5393 !!!cp ('t202');
5394 !!!parse-error (type => 'missing start tag:tr', token => $token);
5395 }
5396
5397 ## Clear back to table body context
5398 while (not ($self->{open_elements}->[-1]->[1]
5399 & TABLE_ROWS_SCOPING_EL)) {
5400 !!!cp ('t203');
5401 ## ISSUE: Can this case be reached?
5402 pop @{$self->{open_elements}};
5403 }
5404
5405 $self->{insertion_mode} = IN_ROW_IM;
5406 if ($token->{tag_name} eq 'tr') {
5407 !!!cp ('t204');
5408 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5409 !!!nack ('t204');
5410 !!!next-token;
5411 next B;
5412 } else {
5413 !!!cp ('t205');
5414 !!!insert-element ('tr',, $token);
5415 ## reprocess in the "in row" insertion mode
5416 }
5417 } else {
5418 !!!cp ('t206');
5419 }
5420
5421 ## Clear back to table row context
5422 while (not ($self->{open_elements}->[-1]->[1]
5423 & TABLE_ROW_SCOPING_EL)) {
5424 !!!cp ('t207');
5425 pop @{$self->{open_elements}};
5426 }
5427
5428 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5429 $self->{insertion_mode} = IN_CELL_IM;
5430
5431 push @$active_formatting_elements, ['#marker', ''];
5432
5433 !!!nack ('t207.1');
5434 !!!next-token;
5435 next B;
5436 } elsif ({
5437 caption => 1, col => 1, colgroup => 1,
5438 tbody => 1, tfoot => 1, thead => 1,
5439 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5440 }->{$token->{tag_name}}) {
5441 if ($self->{insertion_mode} == IN_ROW_IM) {
5442 ## As if </tr>
5443 ## have an element in table scope
5444 my $i;
5445 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5446 my $node = $self->{open_elements}->[$_];
5447 if ($node->[1] & TABLE_ROW_EL) {
5448 !!!cp ('t208');
5449 $i = $_;
5450 last INSCOPE;
5451 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5452 !!!cp ('t209');
5453 last INSCOPE;
5454 }
5455 } # INSCOPE
5456 unless (defined $i) {
5457 !!!cp ('t210');
5458 ## No |tr| element in table scope for the implied </tr>.  TODO: This type is wrong.
5459 !!!parse-error (type => 'unmatched end tag',
5460 text => $token->{tag_name}, token => $token);
5461 ## Ignore the token
5462 !!!nack ('t210.1');
5463 !!!next-token;
5464 next B;
5465 }
5466
5467 ## Clear back to table row context
5468 while (not ($self->{open_elements}->[-1]->[1]
5469 & TABLE_ROW_SCOPING_EL)) {
5470 !!!cp ('t211');
5471 ## ISSUE: Can this case be reached?
5472 pop @{$self->{open_elements}};
5473 }
5474
5475 pop @{$self->{open_elements}}; # tr
5476 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5477 if ($token->{tag_name} eq 'tr') {
5478 !!!cp ('t212');
5479 ## reprocess
5480 !!!ack-later;
5481 next B;
5482 } else {
5483 !!!cp ('t213');
5484 ## reprocess in the "in table body" insertion mode...
5485 }
5486 }
5487
5488 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5489 ## have an element in table scope
5490 my $i;
5491 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5492 my $node = $self->{open_elements}->[$_];
5493 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5494 !!!cp ('t214');
5495 $i = $_;
5496 last INSCOPE;
5497 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5498 !!!cp ('t215');
5499 last INSCOPE;
5500 }
5501 } # INSCOPE
5502 unless (defined $i) {
5503 !!!cp ('t216');
5504 ## TODO: This error type is wrong.
5505 !!!parse-error (type => 'unmatched end tag',
5506 text => $token->{tag_name}, token => $token);
5507 ## Ignore the token
5508 !!!nack ('t216.1');
5509 !!!next-token;
5510 next B;
5511 }
5512
5513 ## Clear back to table body context
5514 while (not ($self->{open_elements}->[-1]->[1]
5515 & TABLE_ROWS_SCOPING_EL)) {
5516 !!!cp ('t217');
5517 ## ISSUE: Can this state be reached?
5518 pop @{$self->{open_elements}};
5519 }
5520
5521 ## As if <{current node}>
5522 ## have an element in table scope
5523 ## true by definition
5524
5525 ## Clear back to table body context
5526 ## nop by definition
5527
5528 pop @{$self->{open_elements}};
5529 $self->{insertion_mode} = IN_TABLE_IM;
5530 ## reprocess in "in table" insertion mode...
5531 } else {
5532 !!!cp ('t218');
5533 }
5534
5535 if ($token->{tag_name} eq 'col') {
5536 ## Clear back to table context
5537 while (not ($self->{open_elements}->[-1]->[1]
5538 & TABLE_SCOPING_EL)) {
5539 !!!cp ('t219');
5540 ## ISSUE: Can this state be reached?
5541 pop @{$self->{open_elements}};
5542 }
5543
5544 !!!insert-element ('colgroup',, $token);
5545 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
5546 ## reprocess
5547 !!!ack-later;
5548 next B;
5549 } elsif ({
5550 caption => 1,
5551 colgroup => 1,
5552 tbody => 1, tfoot => 1, thead => 1,
5553 }->{$token->{tag_name}}) {
5554 ## Clear back to table context
5555 while (not ($self->{open_elements}->[-1]->[1]
5556 & TABLE_SCOPING_EL)) {
5557 !!!cp ('t220');
5558 ## ISSUE: Can this state be reached?
5559 pop @{$self->{open_elements}};
5560 }
5561
5562 push @$active_formatting_elements, ['#marker', '']
5563 if $token->{tag_name} eq 'caption';
5564
5565 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5566 $self->{insertion_mode} = {
5567 caption => IN_CAPTION_IM,
5568 colgroup => IN_COLUMN_GROUP_IM,
5569 tbody => IN_TABLE_BODY_IM,
5570 tfoot => IN_TABLE_BODY_IM,
5571 thead => IN_TABLE_BODY_IM,
5572 }->{$token->{tag_name}};
5573 !!!next-token;
5574 !!!nack ('t220.1');
5575 next B;
5576 } else {
5577 die "$0: in table: <>: $token->{tag_name}";
5578 }
5579 } elsif ($token->{tag_name} eq 'table') {
5580 !!!parse-error (type => 'not closed',
5581 text => $self->{open_elements}->[-1]->[0]
5582 ->manakai_local_name,
5583 token => $token);
5584
5585 ## As if </table>
5586 ## have a table element in table scope
5587 my $i;
5588 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5589 my $node = $self->{open_elements}->[$_];
5590 if ($node->[1] & TABLE_EL) {
5591 !!!cp ('t221');
5592 $i = $_;
5593 last INSCOPE;
5594 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5595 !!!cp ('t222');
5596 last INSCOPE;
5597 }
5598 } # INSCOPE
5599 unless (defined $i) {
5600 !!!cp ('t223');
5601 ## TODO: The following is wrong, maybe.
5602 !!!parse-error (type => 'unmatched end tag', text => 'table',
5603 token => $token);
5604 ## Ignore tokens </table><table>
5605 !!!nack ('t223.1');
5606 !!!next-token;
5607 next B;
5608 }
5609
5610 ## TODO: The following steps have been removed from the latest spec.
5611 ## generate implied end tags
5612 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5613 !!!cp ('t224');
5614 pop @{$self->{open_elements}};
5615 }
5616
5617 unless ($self->{open_elements}->[-1]->[1] & TABLE_EL) {
5618 !!!cp ('t225');
5619 ## NOTE: |<table><tr><table>|
5620 !!!parse-error (type => 'not closed',
5621 text => $self->{open_elements}->[-1]->[0]
5622 ->manakai_local_name,
5623 token => $token);
5624 } else {
5625 !!!cp ('t226');
5626 }
5627
5628 splice @{$self->{open_elements}}, $i;
5629 pop @{$open_tables};
5630
5631 $self->_reset_insertion_mode;
5632
5633 ## reprocess
5634 !!!ack-later;
5635 next B;
5636 } elsif ($token->{tag_name} eq 'style') {
5637 if (not $open_tables->[-1]->[1]) { # tainted
5638 !!!cp ('t227.8');
5639 ## NOTE: This is a "as if in head" code clone.
5640 $parse_rcdata->(CDATA_CONTENT_MODEL);
5641 next B;
5642 } else {
5643 !!!cp ('t227.7');
5644 #
5645 }
5646 } elsif ($token->{tag_name} eq 'script') {
5647 if (not $open_tables->[-1]->[1]) { # tainted
5648 !!!cp ('t227.6');
5649 ## NOTE: This is a "as if in head" code clone.
5650 $script_start_tag->();
5651 next B;
5652 } else {
5653 !!!cp ('t227.5');
5654 #
5655 }
5656 } elsif ($token->{tag_name} eq 'input') {
5657 if (not $open_tables->[-1]->[1]) { # tainted
5658 if ($token->{attributes}->{type}) { ## TODO: case
5659 my $type = lc $token->{attributes}->{type}->{value};
5660 if ($type eq 'hidden') {
5661 !!!cp ('t227.3');
5662 !!!parse-error (type => 'in table',
5663 text => $token->{tag_name}, token => $token);
5664
5665 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5666
5667 ## TODO: form element pointer
5668
5669 pop @{$self->{open_elements}};
5670
5671 !!!next-token;
5672 !!!ack ('t227.2.1');
5673 next B;
5674 } else {
5675 !!!cp ('t227.2');
5676 #
5677 }
5678 } else {
5679 !!!cp ('t227.1');
5680 #
5681 }
5682 } else {
5683 !!!cp ('t227.4');
5684 #
5685 }
5686 } else {
5687 !!!cp ('t227');
5688 #
5689 }
5690
5691 !!!parse-error (type => 'in table', text => $token->{tag_name},
5692 token => $token);
5693
5694 $insert = $insert_to_foster;
5695 #
5696 } elsif ($token->{type} == END_TAG_TOKEN) {
5697 if ($token->{tag_name} eq 'tr' and
5698 $self->{insertion_mode} == IN_ROW_IM) {
5699 ## have an element in table scope
5700 my $i;
5701 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5702 my $node = $self->{open_elements}->[$_];
5703 if ($node->[1] & TABLE_ROW_EL) {
5704 !!!cp ('t228');
5705 $i = $_;
5706 last INSCOPE;
5707 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5708 !!!cp ('t229');
5709 last INSCOPE;
5710 }
5711 } # INSCOPE
5712 unless (defined $i) {
5713 !!!cp ('t230');
5714 !!!parse-error (type => 'unmatched end tag',
5715 text => $token->{tag_name}, token => $token);
5716 ## Ignore the token
5717 !!!nack ('t230.1');
5718 !!!next-token;
5719 next B;
5720 } else {
5721 !!!cp ('t232');
5722 }
5723
5724 ## Clear back to table row context
5725 while (not ($self->{open_elements}->[-1]->[1]
5726 & TABLE_ROW_SCOPING_EL)) {
5727 !!!cp ('t231');
5728 ## ISSUE: Can this state be reached?
5729 pop @{$self->{open_elements}};
5730 }
5731
5732 pop @{$self->{open_elements}}; # tr
5733 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5734 !!!next-token;
5735 !!!nack ('t231.1');
5736 next B;
5737 } elsif ($token->{tag_name} eq 'table') {
5738 if ($self->{insertion_mode} == IN_ROW_IM) {
5739 ## As if </tr>
5740 ## have an element in table scope
5741 my $i;
5742 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5743 my $node = $self->{open_elements}->[$_];
5744 if ($node->[1] & TABLE_ROW_EL) {
5745 !!!cp ('t233');
5746 $i = $_;
5747 last INSCOPE;
5748 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5749 !!!cp ('t234');
5750 last INSCOPE;
5751 }
5752 } # INSCOPE
5753 unless (defined $i) {
5754 !!!cp ('t235');
5755 ## No |tr| element in table scope for the implied </tr>.  TODO: The error type might still be wrong.
5756 !!!parse-error (type => 'unmatched end tag',
5757 text => 'tr', token => $token);
5758 ## Ignore the token
5759 !!!nack ('t236.1');
5760 !!!next-token;
5761 next B;
5762 }
5763
5764 ## Clear back to table row context
5765 while (not ($self->{open_elements}->[-1]->[1]
5766 & TABLE_ROW_SCOPING_EL)) {
5767 !!!cp ('t236');
5768 ## ISSUE: Can this state be reached?
5769 pop @{$self->{open_elements}};
5770 }
5771
5772 pop @{$self->{open_elements}}; # tr
5773 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5774 ## reprocess in the "in table body" insertion mode...
5775 }
5776
5777 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5778 ## have an element in table scope
5779 my $i;
5780 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5781 my $node = $self->{open_elements}->[$_];
5782 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5783 !!!cp ('t237');
5784 $i = $_;
5785 last INSCOPE;
5786 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5787 !!!cp ('t238');
5788 last INSCOPE;
5789 }
5790 } # INSCOPE
5791 unless (defined $i) {
5792 !!!cp ('t239');
5793 !!!parse-error (type => 'unmatched end tag',
5794 text => $token->{tag_name}, token => $token);
5795 ## Ignore the token
5796 !!!nack ('t239.1');
5797 !!!next-token;
5798 next B;
5799 }
5800
5801 ## Clear back to table body context
5802 while (not ($self->{open_elements}->[-1]->[1]
5803 & TABLE_ROWS_SCOPING_EL)) {
5804 !!!cp ('t240');
5805 pop @{$self->{open_elements}};
5806 }
5807
5808 ## As if <{current node}>
5809 ## have an element in table scope
5810 ## true by definition
5811
5812 ## Clear back to table body context
5813 ## nop by definition
5814
5815 pop @{$self->{open_elements}};
5816 $self->{insertion_mode} = IN_TABLE_IM;
5817 ## reprocess in the "in table" insertion mode...
5818 }
5819
5820 ## NOTE: </table> in the "in table" insertion mode.
5821 ## When you edit the code fragment below, please ensure that
5822 ## the code for <table> in the "in table" insertion mode
5823 ## is synced with it.
5824
5825 ## have a table element in table scope
5826 my $i;
5827 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5828 my $node = $self->{open_elements}->[$_];
5829 if ($node->[1] & TABLE_EL) {
5830 !!!cp ('t241');
5831 $i = $_;
5832 last INSCOPE;
5833 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5834 !!!cp ('t242');
5835 last INSCOPE;
5836 }
5837 } # INSCOPE
5838 unless (defined $i) {
5839 !!!cp ('t243');
5840 !!!parse-error (type => 'unmatched end tag',
5841 text => $token->{tag_name}, token => $token);
5842 ## Ignore the token
5843 !!!nack ('t243.1');
5844 !!!next-token;
5845 next B;
5846 }
5847
5848 splice @{$self->{open_elements}}, $i;
5849 pop @{$open_tables};
5850
5851 $self->_reset_insertion_mode;
5852
5853 !!!next-token;
5854 next B;
5855 } elsif ({
5856 tbody => 1, tfoot => 1, thead => 1,
5857 }->{$token->{tag_name}} and
5858 $self->{insertion_mode} & ROW_IMS) {
5859 if ($self->{insertion_mode} == IN_ROW_IM) {
5860 ## have an element in table scope
5861 my $i;
5862 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5863 my $node = $self->{open_elements}->[$_];
5864 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5865 !!!cp ('t247');
5866 $i = $_;
5867 last INSCOPE;
5868 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5869 !!!cp ('t248');
5870 last INSCOPE;
5871 }
5872 } # INSCOPE
5873 unless (defined $i) {
5874 !!!cp ('t249');
5875 !!!parse-error (type => 'unmatched end tag',
5876 text => $token->{tag_name}, token => $token);
5877 ## Ignore the token
5878 !!!nack ('t249.1');
5879 !!!next-token;
5880 next B;
5881 }
5882
5883 ## As if </tr>
5884 ## have an element in table scope
5885 my $i;
5886 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5887 my $node = $self->{open_elements}->[$_];
5888 if ($node->[1] & TABLE_ROW_EL) {
5889 !!!cp ('t250');
5890 $i = $_;
5891 last INSCOPE;
5892 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5893 !!!cp ('t251');
5894 last INSCOPE;
5895 }
5896 } # INSCOPE
5897 unless (defined $i) {
5898 !!!cp ('t252');
5899 !!!parse-error (type => 'unmatched end tag',
5900 text => 'tr', token => $token);
5901 ## Ignore the token
5902 !!!nack ('t252.1');
5903 !!!next-token;
5904 next B;
5905 }
5906
5907 ## Clear back to table row context
5908 while (not ($self->{open_elements}->[-1]->[1]
5909 & TABLE_ROW_SCOPING_EL)) {
5910 !!!cp ('t253');
5911 ## ISSUE: Can this case be reached?
5912 pop @{$self->{open_elements}};
5913 }
5914
5915 pop @{$self->{open_elements}}; # tr
5916 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5917 ## reprocess in the "in table body" insertion mode...
5918 }
5919
5920 ## have an element in table scope
5921 my $i;
5922 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5923 my $node = $self->{open_elements}->[$_];
5924 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5925 !!!cp ('t254');
5926 $i = $_;
5927 last INSCOPE;
5928 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5929 !!!cp ('t255');
5930 last INSCOPE;
5931 }
5932 } # INSCOPE
5933 unless (defined $i) {
5934 !!!cp ('t256');
5935 !!!parse-error (type => 'unmatched end tag',
5936 text => $token->{tag_name}, token => $token);
5937 ## Ignore the token
5938 !!!nack ('t256.1');
5939 !!!next-token;
5940 next B;
5941 }
5942
5943 ## Clear back to table body context
5944 while (not ($self->{open_elements}->[-1]->[1]
5945 & TABLE_ROWS_SCOPING_EL)) {
5946 !!!cp ('t257');
5947 ## ISSUE: Can this case be reached?
5948 pop @{$self->{open_elements}};
5949 }
5950
5951 pop @{$self->{open_elements}};
5952 $self->{insertion_mode} = IN_TABLE_IM;
5953 !!!nack ('t257.1');
5954 !!!next-token;
5955 next B;
5956 } elsif ({
5957 body => 1, caption => 1, col => 1, colgroup => 1,
5958 html => 1, td => 1, th => 1,
5959 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5960 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
5961 }->{$token->{tag_name}}) {
5962 !!!cp ('t258');
5963 !!!parse-error (type => 'unmatched end tag',
5964 text => $token->{tag_name}, token => $token);
5965 ## Ignore the token
5966 !!!nack ('t258.1');
5967 !!!next-token;
5968 next B;
5969 } else {
5970 !!!cp ('t259');
5971 !!!parse-error (type => 'in table:/',
5972 text => $token->{tag_name}, token => $token);
5973
5974 $insert = $insert_to_foster;
5975 #
5976 }
5977 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5978 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
5979 @{$self->{open_elements}} == 1) { # redundant, maybe
5980 !!!parse-error (type => 'in body:#eof', token => $token);
5981 !!!cp ('t259.1');
5982 #
5983 } else {
5984 !!!cp ('t259.2');
5985 #
5986 }
5987
5988 ## Stop parsing
5989 last B;
5990 } else {
5991 die "$0: $token->{type}: Unknown token type";
5992 }
5993 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
5994 if ($token->{type} == CHARACTER_TOKEN) {
5995 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5996 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5997 unless (length $token->{data}) {
5998 !!!cp ('t260');
5999 !!!next-token;
6000 next B;
6001 }
6002 }
6003
6004 !!!cp ('t261');
6005 #
6006 } elsif ($token->{type} == START_TAG_TOKEN) {
6007 if ($token->{tag_name} eq 'col') {
6008 !!!cp ('t262');
6009 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6010 pop @{$self->{open_elements}};
6011 !!!ack ('t262.1');
6012 !!!next-token;
6013 next B;
6014 } else {
6015 !!!cp ('t263');
6016 #
6017 }
6018 } elsif ($token->{type} == END_TAG_TOKEN) {
6019 if ($token->{tag_name} eq 'colgroup') {
6020 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
6021 !!!cp ('t264');
6022 !!!parse-error (type => 'unmatched end tag',
6023 text => 'colgroup', token => $token);
6024 ## Ignore the token
6025 !!!next-token;
6026 next B;
6027 } else {
6028 !!!cp ('t265');
6029 pop @{$self->{open_elements}}; # colgroup
6030 $self->{insertion_mode} = IN_TABLE_IM;
6031 !!!next-token;
6032 next B;
6033 }
6034 } elsif ($token->{tag_name} eq 'col') {
6035 !!!cp ('t266');
6036 !!!parse-error (type => 'unmatched end tag',
6037 text => 'col', token => $token);
6038 ## Ignore the token
6039 !!!next-token;
6040 next B;
6041 } else {
6042 !!!cp ('t267');
6043 #
6044 }
6045 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6046 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
6047 @{$self->{open_elements}} == 1) { # redundant, maybe
6048 !!!cp ('t270.2');
6049 ## Stop parsing.
6050 last B;
6051 } else {
6052 ## NOTE: As if </colgroup>.
6053 !!!cp ('t270.1');
6054 pop @{$self->{open_elements}}; # colgroup
6055 $self->{insertion_mode} = IN_TABLE_IM;
6056 ## Reprocess.
6057 next B;
6058 }
6059 } else {
6060 die "$0: $token->{type}: Unknown token type";
6061 }
6062
6063 ## As if </colgroup>
6064 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
6065 !!!cp ('t269');
6066 ## TODO: Wrong error type?
6067 !!!parse-error (type => 'unmatched end tag',
6068 text => 'colgroup', token => $token);
6069 ## Ignore the token
6070 !!!nack ('t269.1');
6071 !!!next-token;
6072 next B;
6073 } else {
6074 !!!cp ('t270');
6075 pop @{$self->{open_elements}}; # colgroup
6076 $self->{insertion_mode} = IN_TABLE_IM;
6077 !!!ack-later;
6078 ## reprocess
6079 next B;
6080 }
6081 } elsif ($self->{insertion_mode} & SELECT_IMS) {
6082 if ($token->{type} == CHARACTER_TOKEN) {
6083 !!!cp ('t271');
6084 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
6085 !!!next-token;
6086 next B;
6087 } elsif ($token->{type} == START_TAG_TOKEN) {
6088 if ($token->{tag_name} eq 'option') {
6089 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6090 !!!cp ('t272');
6091 ## As if </option>
6092 pop @{$self->{open_elements}};
6093 } else {
6094 !!!cp ('t273');
6095 }
6096
6097 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6098 !!!nack ('t273.1');
6099 !!!next-token;
6100 next B;
6101 } elsif ($token->{tag_name} eq 'optgroup') {
6102 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6103 !!!cp ('t274');
6104 ## As if </option>
6105 pop @{$self->{open_elements}};
6106 } else {
6107 !!!cp ('t275');
6108 }
6109
6110 if ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
6111 !!!cp ('t276');
6112 ## As if </optgroup>
6113 pop @{$self->{open_elements}};
6114 } else {
6115 !!!cp ('t277');
6116 }
6117
6118 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6119 !!!nack ('t277.1');
6120 !!!next-token;
6121 next B;
6122 } elsif ({
6123 select => 1, input => 1, textarea => 1,
6124 }->{$token->{tag_name}} or
6125 ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
6126 {
6127 caption => 1, table => 1,
6128 tbody => 1, tfoot => 1, thead => 1,
6129 tr => 1, td => 1, th => 1,
6130 }->{$token->{tag_name}})) {
6131 ## TODO: The type below is not good - <select> is replaced by </select>
6132 !!!parse-error (type => 'not closed', text => 'select',
6133 token => $token);
6134 ## NOTE: As if the token were </select> (<select> case) or
6135 ## as if there were </select> (otherwise).
6136 ## have an element in table scope
6137 my $i;
6138 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6139 my $node = $self->{open_elements}->[$_];
6140 if ($node->[1] & SELECT_EL) {
6141 !!!cp ('t278');
6142 $i = $_;
6143 last INSCOPE;
6144 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6145 !!!cp ('t279');
6146 last INSCOPE;
6147 }
6148 } # INSCOPE
6149 unless (defined $i) {
6150 !!!cp ('t280');
6151 !!!parse-error (type => 'unmatched end tag',
6152 text => 'select', token => $token);
6153 ## Ignore the token
6154 !!!nack ('t280.1');
6155 !!!next-token;
6156 next B;
6157 }
6158
6159 !!!cp ('t281');
6160 splice @{$self->{open_elements}}, $i;
6161
6162 $self->_reset_insertion_mode;
6163
6164 if ($token->{tag_name} eq 'select') {
6165 !!!nack ('t281.2');
6166 !!!next-token;
6167 next B;
6168 } else {
6169 !!!cp ('t281.1');
6170 !!!ack-later;
6171 ## Reprocess the token.
6172 next B;
6173 }
6174 } else {
6175 !!!cp ('t282');
6176 !!!parse-error (type => 'in select',
6177 text => $token->{tag_name}, token => $token);
6178 ## Ignore the token
6179 !!!nack ('t282.1');
6180 !!!next-token;
6181 next B;
6182 }
6183 } elsif ($token->{type} == END_TAG_TOKEN) {
6184 if ($token->{tag_name} eq 'optgroup') {
6185 if ($self->{open_elements}->[-1]->[1] & OPTION_EL and
6186 $self->{open_elements}->[-2]->[1] & OPTGROUP_EL) {
6187 !!!cp ('t283');
6188 ## As if </option>
6189 splice @{$self->{open_elements}}, -2;
6190 } elsif ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
6191 !!!cp ('t284');
6192 pop @{$self->{open_elements}};
6193 } else {
6194 !!!cp ('t285');
6195 !!!parse-error (type => 'unmatched end tag',
6196 text => $token->{tag_name}, token => $token);
6197 ## Ignore the token
6198 }
6199 !!!nack ('t285.1');
6200 !!!next-token;
6201 next B;
6202 } elsif ($token->{tag_name} eq 'option') {
6203 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6204 !!!cp ('t286');
6205 pop @{$self->{open_elements}};
6206 } else {
6207 !!!cp ('t287');
6208 !!!parse-error (type => 'unmatched end tag',
6209 text => $token->{tag_name}, token => $token);
6210 ## Ignore the token
6211 }
6212 !!!nack ('t287.1');
6213 !!!next-token;
6214 next B;
6215 } elsif ($token->{tag_name} eq 'select') {
6216 ## have an element in table scope
6217 my $i;
6218 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6219 my $node = $self->{open_elements}->[$_];
6220 if ($node->[1] & SELECT_EL) {
6221 !!!cp ('t288');
6222 $i = $_;
6223 last INSCOPE;
6224 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6225 !!!cp ('t289');
6226 last INSCOPE;
6227 }
6228 } # INSCOPE
6229 unless (defined $i) {
6230 !!!cp ('t290');
6231 !!!parse-error (type => 'unmatched end tag',
6232 text => $token->{tag_name}, token => $token);
6233 ## Ignore the token
6234 !!!nack ('t290.1');
6235 !!!next-token;
6236 next B;
6237 }
6238
6239 !!!cp ('t291');
6240 splice @{$self->{open_elements}}, $i;
6241
6242 $self->_reset_insertion_mode;
6243
6244 !!!nack ('t291.1');
6245 !!!next-token;
6246 next B;
6247 } elsif ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
6248 {
6249 caption => 1, table => 1, tbody => 1,
6250 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
6251 }->{$token->{tag_name}}) {
6252 ## TODO: The following is wrong?
6253 !!!parse-error (type => 'unmatched end tag',
6254 text => $token->{tag_name}, token => $token);
6255
6256 ## have an element in table scope
6257 my $i;
6258 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6259 my $node = $self->{open_elements}->[$_];
6260 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6261 !!!cp ('t292');
6262 $i = $_;
6263 last INSCOPE;
6264 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6265 !!!cp ('t293');
6266 last INSCOPE;
6267 }
6268 } # INSCOPE
6269 unless (defined $i) {
6270 !!!cp ('t294');
6271 ## Ignore the token
6272 !!!nack ('t294.1');
6273 !!!next-token;
6274 next B;
6275 }
6276
6277 ## As if </select>
6278 ## have an element in table scope
6279 undef $i;
6280 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6281 my $node = $self->{open_elements}->[$_];
6282 if ($node->[1] & SELECT_EL) {
6283 !!!cp ('t295');
6284 $i = $_;
6285 last INSCOPE;
6286 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6287 ## ISSUE: Can this state be reached?
6288 !!!cp ('t296');
6289 last INSCOPE;
6290 }
6291 } # INSCOPE
6292 unless (defined $i) {
6293 !!!cp ('t297');
6294 ## TODO: Is the following error type correct?
6295 !!!parse-error (type => 'unmatched end tag',
6296 text => 'select', token => $token);
6297 ## Ignore the </select> token
6298 !!!nack ('t297.1');
6299 !!!next-token; ## TODO: ok?
6300 next B;
6301 }
6302
6303 !!!cp ('t298');
6304 splice @{$self->{open_elements}}, $i;
6305
6306 $self->_reset_insertion_mode;
6307
6308 !!!ack-later;
6309 ## reprocess
6310 next B;
6311 } else {
6312 !!!cp ('t299');
6313 !!!parse-error (type => 'in select:/',
6314 text => $token->{tag_name}, token => $token);
6315 ## Ignore the token
6316 !!!nack ('t299.3');
6317 !!!next-token;
6318 next B;
6319 }
6320 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6321 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6322 @{$self->{open_elements}} == 1) { # redundant, maybe
6323 !!!cp ('t299.1');
6324 !!!parse-error (type => 'in body:#eof', token => $token);
6325 } else {
6326 !!!cp ('t299.2');
6327 }
6328
6329 ## Stop parsing.
6330 last B;
6331 } else {
6332 die "$0: $token->{type}: Unknown token type";
6333 }
6334 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
6335 if ($token->{type} == CHARACTER_TOKEN) {
6336 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6337 my $data = $1;
6338 ## As if in body
6339 $reconstruct_active_formatting_elements->($insert_to_current);
6340
6341 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6342
6343 unless (length $token->{data}) {
6344 !!!cp ('t300');
6345 !!!next-token;
6346 next B;
6347 }
6348 }
6349
6350 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6351 !!!cp ('t301');
6352 !!!parse-error (type => 'after html:#text', token => $token);
6353
6354 ## Reprocess in the "after body" insertion mode.
6355 } else {
6356 !!!cp ('t302');
6357 }
6358
6359 ## "after body" insertion mode
6360 !!!parse-error (type => 'after body:#text', token => $token);
6361
6362 $self->{insertion_mode} = IN_BODY_IM;
6363 ## reprocess
6364 next B;
6365 } elsif ($token->{type} == START_TAG_TOKEN) {
6366 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6367 !!!cp ('t303');
6368 !!!parse-error (type => 'after html',
6369 text => $token->{tag_name}, token => $token);
6370
6371 ## Reprocess in the "after body" insertion mode.
6372 } else {
6373 !!!cp ('t304');
6374 }
6375
6376 ## "after body" insertion mode
6377 !!!parse-error (type => 'after body',
6378 text => $token->{tag_name}, token => $token);
6379
6380 $self->{insertion_mode} = IN_BODY_IM;
6381 !!!ack-later;
6382 ## reprocess
6383 next B;
6384 } elsif ($token->{type} == END_TAG_TOKEN) {
6385 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6386 !!!cp ('t305');
6387 !!!parse-error (type => 'after html:/',
6388 text => $token->{tag_name}, token => $token);
6389
6390 $self->{insertion_mode} = AFTER_BODY_IM;
6391 ## Reprocess in the "after body" insertion mode.
6392 } else {
6393 !!!cp ('t306');
6394 }
6395
6396 ## "after body" insertion mode
6397 if ($token->{tag_name} eq 'html') {
6398 if (defined $self->{inner_html_node}) {
6399 !!!cp ('t307');
6400 !!!parse-error (type => 'unmatched end tag',
6401 text => 'html', token => $token);
6402 ## Ignore the token
6403 !!!next-token;
6404 next B;
6405 } else {
6406 !!!cp ('t308');
6407 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
6408 !!!next-token;
6409 next B;
6410 }
6411 } else {
6412 !!!cp ('t309');
6413 !!!parse-error (type => 'after body:/',
6414 text => $token->{tag_name}, token => $token);
6415
6416 $self->{insertion_mode} = IN_BODY_IM;
6417 ## reprocess
6418 next B;
6419 }
6420 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6421 !!!cp ('t309.2');
6422 ## Stop parsing
6423 last B;
6424 } else {
6425 die "$0: $token->{type}: Unknown token type";
6426 }
6427 } elsif ($self->{insertion_mode} & FRAME_IMS) {
6428 if ($token->{type} == CHARACTER_TOKEN) {
6429 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6430 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6431
6432 unless (length $token->{data}) {
6433 !!!cp ('t310');
6434 !!!next-token;
6435 next B;
6436 }
6437 }
6438
6439 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
6440 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6441 !!!cp ('t311');
6442 !!!parse-error (type => 'in frameset:#text', token => $token);
6443 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6444 !!!cp ('t312');
6445 !!!parse-error (type => 'after frameset:#text', token => $token);
6446 } else { # "after after frameset"
6447 !!!cp ('t313');
6448 !!!parse-error (type => 'after html:#text', token => $token);
6449 }
6450
6451 ## Ignore the token.
6452 if (length $token->{data}) {
6453 !!!cp ('t314');
6454 ## reprocess the rest of characters
6455 } else {
6456 !!!cp ('t315');
6457 !!!next-token;
6458 }
6459 next B;
6460 }
6461
6462 die qq[$0: Character "$token->{data}"];
6463 } elsif ($token->{type} == START_TAG_TOKEN) {
6464 if ($token->{tag_name} eq 'frameset' and
6465 $self->{insertion_mode} == IN_FRAMESET_IM) {
6466 !!!cp ('t318');
6467 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6468 !!!nack ('t318.1');
6469 !!!next-token;
6470 next B;
6471 } elsif ($token->{tag_name} eq 'frame' and
6472 $self->{insertion_mode} == IN_FRAMESET_IM) {
6473 !!!cp ('t319');
6474 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6475 pop @{$self->{open_elements}};
6476 !!!ack ('t319.1');
6477 !!!next-token;
6478 next B;
6479 } elsif ($token->{tag_name} eq 'noframes') {
6480 !!!cp ('t320');
6481 ## NOTE: As if in head.
6482 $parse_rcdata->(CDATA_CONTENT_MODEL);
6483 next B;
6484
6485 ## NOTE: |<!DOCTYPE HTML><frameset></frameset></html><noframes></noframes>|
6486 ## has no parse error.
6487 } else {
6488 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6489 !!!cp ('t321');
6490 !!!parse-error (type => 'in frameset',
6491 text => $token->{tag_name}, token => $token);
6492 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6493 !!!cp ('t322');
6494 !!!parse-error (type => 'after frameset',
6495 text => $token->{tag_name}, token => $token);
6496 } else { # "after after frameset"
6497 !!!cp ('t322.2');
6498 !!!parse-error (type => 'after after frameset',
6499 text => $token->{tag_name}, token => $token);
6500 }
6501 ## Ignore the token
6502 !!!nack ('t322.1');
6503 !!!next-token;
6504 next B;
6505 }
6506 } elsif ($token->{type} == END_TAG_TOKEN) {
6507 if ($token->{tag_name} eq 'frameset' and
6508 $self->{insertion_mode} == IN_FRAMESET_IM) {
6509 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
6510 @{$self->{open_elements}} == 1) {
6511 !!!cp ('t325');
6512 !!!parse-error (type => 'unmatched end tag',
6513 text => $token->{tag_name}, token => $token);
6514 ## Ignore the token
6515 !!!next-token;
6516 } else {
6517 !!!cp ('t326');
6518 pop @{$self->{open_elements}};
6519 !!!next-token;
6520 }
6521
6522 if (not defined $self->{inner_html_node} and
6523 not ($self->{open_elements}->[-1]->[1] & FRAMESET_EL)) {
6524 !!!cp ('t327');
6525 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6526 } else {
6527 !!!cp ('t328');
6528 }
6529 next B;
6530 } elsif ($token->{tag_name} eq 'html' and
6531 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
6532 !!!cp ('t329');
6533 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
6534 !!!next-token;
6535 next B;
6536 } else {
6537 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6538 !!!cp ('t330');
6539 !!!parse-error (type => 'in frameset:/',
6540 text => $token->{tag_name}, token => $token);
6541 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6542 !!!cp ('t330.1');
6543 !!!parse-error (type => 'after frameset:/',
6544 text => $token->{tag_name}, token => $token);
6545 } else { # "after after frameset"
6546 !!!cp ('t331');
6547 !!!parse-error (type => 'after after frameset:/',
6548 text => $token->{tag_name}, token => $token);
6549 }
6550 ## Ignore the token
6551 !!!next-token;
6552 next B;
6553 }
6554 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6555 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6556 @{$self->{open_elements}} == 1) { # redundant, maybe
6557 !!!cp ('t331.1');
6558 !!!parse-error (type => 'in body:#eof', token => $token);
6559 } else {
6560 !!!cp ('t331.2');
6561 }
6562
6563 ## Stop parsing
6564 last B;
6565 } else {
6566 die "$0: $token->{type}: Unknown token type";
6567 }
6568
6569 ## ISSUE: An issue in spec here
6570 } else {
6571 die "$0: $self->{insertion_mode}: Unknown insertion mode";
6572 }
6573
6574 ## "in body" insertion mode
6575 if ($token->{type} == START_TAG_TOKEN) {
6576 if ($token->{tag_name} eq 'script') {
6577 !!!cp ('t332');
6578 ## NOTE: This is an "as if in head" code clone
6579 $script_start_tag->();
6580 next B;
6581 } elsif ($token->{tag_name} eq 'style') {
6582 !!!cp ('t333');
6583 ## NOTE: This is an "as if in head" code clone
6584 $parse_rcdata->(CDATA_CONTENT_MODEL);
6585 next B;
6586 } elsif ({
6587 base => 1, link => 1,
6588 }->{$token->{tag_name}}) {
6589 !!!cp ('t334');
6590 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6591 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6592 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6593 !!!ack ('t334.1');
6594 !!!next-token;
6595 next B;
6596 } elsif ($token->{tag_name} eq 'meta') {
6597 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6598 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6599 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6600
6601 unless ($self->{confident}) {
6602 if ($token->{attributes}->{charset}) {
6603 !!!cp ('t335');
6604 ## NOTE: Whether the encoding is supported or not is handled
6605 ## in the {change_encoding} callback.
6606 $self->{change_encoding}
6607 ->($self, $token->{attributes}->{charset}->{value}, $token);
6608
6609 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6610 ->set_user_data (manakai_has_reference =>
6611 $token->{attributes}->{charset}
6612 ->{has_reference});
6613 } elsif ($token->{attributes}->{content}) {
6614 if ($token->{attributes}->{content}->{value}
6615 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
6616 [\x09-\x0D\x20]*=
6617 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
6618 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
6619 !!!cp ('t336');
6620 ## NOTE: Whether the encoding is supported or not is handled
6621 ## in the {change_encoding} callback.
6622 $self->{change_encoding}
6623 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
6624 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6625 ->set_user_data (manakai_has_reference =>
6626 $token->{attributes}->{content}
6627 ->{has_reference});
6628 }
6629 }
6630 } else {
6631 if ($token->{attributes}->{charset}) {
6632 !!!cp ('t337');
6633 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6634 ->set_user_data (manakai_has_reference =>
6635 $token->{attributes}->{charset}
6636 ->{has_reference});
6637 }
6638 if ($token->{attributes}->{content}) {
6639 !!!cp ('t338');
6640 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6641 ->set_user_data (manakai_has_reference =>
6642 $token->{attributes}->{content}
6643 ->{has_reference});
6644 }
6645 }
6646
6647 !!!ack ('t338.1');
6648 !!!next-token;
6649 next B;
6650 } elsif ($token->{tag_name} eq 'title') {
6651 !!!cp ('t341');
6652 ## NOTE: This is an "as if in head" code clone
6653 $parse_rcdata->(RCDATA_CONTENT_MODEL);
6654 next B;
6655 } elsif ($token->{tag_name} eq 'body') {
6656 !!!parse-error (type => 'in body', text => 'body', token => $token);
6657
6658 if (@{$self->{open_elements}} == 1 or
6659 not ($self->{open_elements}->[1]->[1] & BODY_EL)) {
6660 !!!cp ('t342');
6661 ## Ignore the token
6662 } else {
6663 my $body_el = $self->{open_elements}->[1]->[0];
6664 for my $attr_name (keys %{$token->{attributes}}) {
6665 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
6666 !!!cp ('t343');
6667 $body_el->set_attribute_ns
6668 (undef, [undef, $attr_name],
6669 $token->{attributes}->{$attr_name}->{value});
6670 }
6671 }
6672 }
6673 !!!nack ('t343.1');
6674 !!!next-token;
6675 next B;
6676 } elsif ({
6677 address => 1, blockquote => 1, center => 1, dir => 1,
6678 div => 1, dl => 1, fieldset => 1,
6679 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6680 menu => 1, ol => 1, p => 1, ul => 1,
6681 pre => 1, listing => 1,
6682 form => 1,
6683 table => 1,
6684 hr => 1,
6685 }->{$token->{tag_name}}) {
6686 if ($token->{tag_name} eq 'form' and defined $self->{form_element}) {
6687 !!!cp ('t350');
6688 !!!parse-error (type => 'in form:form', token => $token);
6689 ## Ignore the token
6690 !!!nack ('t350.1');
6691 !!!next-token;
6692 next B;
6693 }
6694
6695 ## has a p element in scope
6696 INSCOPE: for (reverse @{$self->{open_elements}}) {
6697 if ($_->[1] & P_EL) {
6698 !!!cp ('t344');
6699 !!!back-token; # the current start tag (<form>, <p>, <table>, ...)
6700 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6701 line => $token->{line}, column => $token->{column}};
6702 next B;
6703 } elsif ($_->[1] & SCOPING_EL) {
6704 !!!cp ('t345');
6705 last INSCOPE;
6706 }
6707 } # INSCOPE
6708
6709 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6710 if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') {
6711 !!!nack ('t346.1');
6712 !!!next-token;
6713 if ($token->{type} == CHARACTER_TOKEN) {
6714 $token->{data} =~ s/^\x0A//;
6715 unless (length $token->{data}) {
6716 !!!cp ('t346');
6717 !!!next-token;
6718 } else {
6719 !!!cp ('t349');
6720 }
6721 } else {
6722 !!!cp ('t348');
6723 }
6724 } elsif ($token->{tag_name} eq 'form') {
6725 !!!cp ('t347.1');
6726 $self->{form_element} = $self->{open_elements}->[-1]->[0];
6727
6728 !!!nack ('t347.2');
6729 !!!next-token;
6730 } elsif ($token->{tag_name} eq 'table') {
6731 !!!cp ('t382');
6732 push @{$open_tables}, [$self->{open_elements}->[-1]->[0]];
6733
6734 $self->{insertion_mode} = IN_TABLE_IM;
6735
6736 !!!nack ('t382.1');
6737 !!!next-token;
6738 } elsif ($token->{tag_name} eq 'hr') {
6739 !!!cp ('t386');
6740 pop @{$self->{open_elements}};
6741
6742 !!!nack ('t386.1');
6743 !!!next-token;
6744 } else {
6745 !!!nack ('t347.1');
6746 !!!next-token;
6747 }
6748 next B;
6749 } elsif ({li => 1, dt => 1, dd => 1}->{$token->{tag_name}}) {
6750 ## has a p element in scope
6751 INSCOPE: for (reverse @{$self->{open_elements}}) {
6752 if ($_->[1] & P_EL) {
6753 !!!cp ('t353');
6754 !!!back-token; # <x>
6755 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6756 line => $token->{line}, column => $token->{column}};
6757 next B;
6758 } elsif ($_->[1] & SCOPING_EL) {
6759 !!!cp ('t354');
6760 last INSCOPE;
6761 }
6762 } # INSCOPE
6763
6764 ## Step 1
6765 my $i = -1;
6766 my $node = $self->{open_elements}->[$i];
6767 my $li_or_dtdd = {li => {li => 1},
6768 dt => {dt => 1, dd => 1},
6769 dd => {dt => 1, dd => 1}}->{$token->{tag_name}};
6770 LI: {
6771 ## Step 2
6772 if ($li_or_dtdd->{$node->[0]->manakai_local_name}) {
6773 if ($i != -1) {
6774 !!!cp ('t355');
6775 !!!parse-error (type => 'not closed',
6776 text => $self->{open_elements}->[-1]->[0]
6777 ->manakai_local_name,
6778 token => $token);
6779 } else {
6780 !!!cp ('t356');
6781 }
6782 splice @{$self->{open_elements}}, $i;
6783 last LI;
6784 } else {
6785 !!!cp ('t357');
6786 }
6787
6788 ## Step 3
6789 if (not ($node->[1] & FORMATTING_EL) and
6790 #not $phrasing_category->{$node->[1]} and
6791 ($node->[1] & SPECIAL_EL or
6792 $node->[1] & SCOPING_EL) and
6793 not ($node->[1] & ADDRESS_EL) and
6794 not ($node->[1] & DIV_EL)) {
6795 !!!cp ('t358');
6796 last LI;
6797 }
6798
6799 !!!cp ('t359');
6800 ## Step 4
6801 $i--;
6802 $node = $self->{open_elements}->[$i];
6803 redo LI;
6804 } # LI
6805
6806 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6807 !!!nack ('t359.1');
6808 !!!next-token;
6809 next B;
6810 } elsif ($token->{tag_name} eq 'plaintext') {
6811 ## has a p element in scope
6812 INSCOPE: for (reverse @{$self->{open_elements}}) {
6813 if ($_->[1] & P_EL) {
6814 !!!cp ('t367');
6815 !!!back-token; # <plaintext>
6816 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6817 line => $token->{line}, column => $token->{column}};
6818 next B;
6819 } elsif ($_->[1] & SCOPING_EL) {
6820 !!!cp ('t368');
6821 last INSCOPE;
6822 }
6823 } # INSCOPE
6824
6825 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6826
6827 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
6828
6829 !!!nack ('t368.1');
6830 !!!next-token;
6831 next B;
6832 } elsif ($token->{tag_name} eq 'a') {
6833 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
6834 my $node = $active_formatting_elements->[$i];
6835 if ($node->[1] & A_EL) {
6836 !!!cp ('t371');
6837 !!!parse-error (type => 'in a:a', token => $token);
6838
6839 !!!back-token; # <a>
6840 $token = {type => END_TAG_TOKEN, tag_name => 'a',
6841 line => $token->{line}, column => $token->{column}};
6842 $formatting_end_tag->($token);
6843
6844 AFE2: for (reverse 0..$#$active_formatting_elements) {
6845 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
6846 !!!cp ('t372');
6847 splice @$active_formatting_elements, $_, 1;
6848 last AFE2;
6849 }
6850 } # AFE2
6851 OE: for (reverse 0..$#{$self->{open_elements}}) {
6852 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
6853 !!!cp ('t373');
6854 splice @{$self->{open_elements}}, $_, 1;
6855 last OE;
6856 }
6857 } # OE
6858 last AFE;
6859 } elsif ($node->[0] eq '#marker') {
6860 !!!cp ('t374');
6861 last AFE;
6862 }
6863 } # AFE
6864
6865 $reconstruct_active_formatting_elements->($insert_to_current);
6866
6867 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6868 push @$active_formatting_elements, $self->{open_elements}->[-1];
6869
6870 !!!nack ('t374.1');
6871 !!!next-token;
6872 next B;
6873 } elsif ($token->{tag_name} eq 'nobr') {
6874 $reconstruct_active_formatting_elements->($insert_to_current);
6875
6876 ## has a |nobr| element in scope
6877 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6878 my $node = $self->{open_elements}->[$_];
6879 if ($node->[1] & NOBR_EL) {
6880 !!!cp ('t376');
6881 !!!parse-error (type => 'in nobr:nobr', token => $token);
6882 !!!back-token; # <nobr>
6883 $token = {type => END_TAG_TOKEN, tag_name => 'nobr',
6884 line => $token->{line}, column => $token->{column}};
6885 next B;
6886 } elsif ($node->[1] & SCOPING_EL) {
6887 !!!cp ('t377');
6888 last INSCOPE;
6889 }
6890 } # INSCOPE
6891
6892 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6893 push @$active_formatting_elements, $self->{open_elements}->[-1];
6894
6895 !!!nack ('t377.1');
6896 !!!next-token;
6897 next B;
6898 } elsif ($token->{tag_name} eq 'button') {
6899 ## has a button element in scope
6900 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6901 my $node = $self->{open_elements}->[$_];
6902 if ($node->[1] & BUTTON_EL) {
6903 !!!cp ('t378');
6904 !!!parse-error (type => 'in button:button', token => $token);
6905 !!!back-token; # <button>
6906 $token = {type => END_TAG_TOKEN, tag_name => 'button',
6907 line => $token->{line}, column => $token->{column}};
6908 next B;
6909 } elsif ($node->[1] & SCOPING_EL) {
6910 !!!cp ('t379');
6911 last INSCOPE;
6912 }
6913 } # INSCOPE
6914
6915 $reconstruct_active_formatting_elements->($insert_to_current);
6916
6917 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6918
6919 ## TODO: associate with $self->{form_element} if defined
6920
6921 push @$active_formatting_elements, ['#marker', ''];
6922
6923 !!!nack ('t379.1');
6924 !!!next-token;
6925 next B;
6926 } elsif ({
6927 xmp => 1,
6928 iframe => 1,
6929 noembed => 1,
6930 noframes => 1, ## NOTE: This is an "as if in head" code clone.
6931 noscript => 0, ## TODO: 1 if scripting is enabled
6932 }->{$token->{tag_name}}) {
6933 if ($token->{tag_name} eq 'xmp') {
6934 !!!cp ('t381');
6935 $reconstruct_active_formatting_elements->($insert_to_current);
6936 } else {
6937 !!!cp ('t399');
6938 }
6939 ## NOTE: There is an "as if in body" code clone.
6940 $parse_rcdata->(CDATA_CONTENT_MODEL);
6941 next B;
6942 } elsif ($token->{tag_name} eq 'isindex') {
6943 !!!parse-error (type => 'isindex', token => $token);
6944
6945 if (defined $self->{form_element}) {
6946 !!!cp ('t389');
6947 ## Ignore the token
6948 !!!nack ('t389'); ## NOTE: Not acknowledged.
6949 !!!next-token;
6950 next B;
6951 } else {
6952 !!!ack ('t391.1');
6953
6954 my $at = $token->{attributes};
6955 my $form_attrs;
6956 $form_attrs->{action} = $at->{action} if $at->{action};
6957 my $prompt_attr = $at->{prompt};
6958 $at->{name} = {name => 'name', value => 'isindex'};
6959 delete $at->{action};
6960 delete $at->{prompt};
6961 my @tokens = (
6962 {type => START_TAG_TOKEN, tag_name => 'form',
6963 attributes => $form_attrs,
6964 line => $token->{line}, column => $token->{column}},
6965 {type => START_TAG_TOKEN, tag_name => 'hr',
6966 line => $token->{line}, column => $token->{column}},
6967 {type => START_TAG_TOKEN, tag_name => 'p',
6968 line => $token->{line}, column => $token->{column}},
6969 {type => START_TAG_TOKEN, tag_name => 'label',
6970 line => $token->{line}, column => $token->{column}},
6971 );
6972 if ($prompt_attr) {
6973 !!!cp ('t390');
6974 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value},
6975 #line => $token->{line}, column => $token->{column},
6976 };
6977 } else {
6978 !!!cp ('t391');
6979 push @tokens, {type => CHARACTER_TOKEN,
6980 data => 'This is a searchable index. Insert your search keywords here: ',
6981 #line => $token->{line}, column => $token->{column},
6982 }; # SHOULD
6983 ## TODO: make this configurable
6984 }
6985 push @tokens,
6986 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at,
6987 line => $token->{line}, column => $token->{column}},
6988 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
6989 {type => END_TAG_TOKEN, tag_name => 'label',
6990 line => $token->{line}, column => $token->{column}},
6991 {type => END_TAG_TOKEN, tag_name => 'p',
6992 line => $token->{line}, column => $token->{column}},
6993 {type => START_TAG_TOKEN, tag_name => 'hr',
6994 line => $token->{line}, column => $token->{column}},
6995 {type => END_TAG_TOKEN, tag_name => 'form',
6996 line => $token->{line}, column => $token->{column}};
6997 !!!back-token (@tokens);
6998 !!!next-token;
6999 next B;
7000 }
7001 } elsif ($token->{tag_name} eq 'textarea') {
7002 my $tag_name = $token->{tag_name};
7003 my $el;
7004 !!!create-element ($el, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
7005
7006 ## TODO: $self->{form_element} if defined
7007 $self->{content_model} = RCDATA_CONTENT_MODEL;
7008 delete $self->{escape}; # MUST
7009
7010 $insert->($el);
7011
7012 my $text = '';
7013 !!!nack ('t392.1');
7014 !!!next-token;
7015 if ($token->{type} == CHARACTER_TOKEN) {
7016 $token->{data} =~ s/^\x0A//;
7017 unless (length $token->{data}) {
7018 !!!cp ('t392');
7019 !!!next-token;
7020 } else {
7021 !!!cp ('t393');
7022 }
7023 } else {
7024 !!!cp ('t394');
7025 }
7026 while ($token->{type} == CHARACTER_TOKEN) {
7027 !!!cp ('t395');
7028 $text .= $token->{data};
7029 !!!next-token;
7030 }
7031 if (length $text) {
7032 !!!cp ('t396');
7033 $el->manakai_append_text ($text);
7034 }
7035
7036 $self->{content_model} = PCDATA_CONTENT_MODEL;
7037
7038 if ($token->{type} == END_TAG_TOKEN and
7039 $token->{tag_name} eq $tag_name) {
7040 !!!cp ('t397');
7041 ## Ignore the token
7042 } else {
7043 !!!cp ('t398');
7044 !!!parse-error (type => 'in RCDATA:#eof', token => $token);
7045 }
7046 !!!next-token;
7047 next B;
7048 } elsif ($token->{tag_name} eq 'rt' or
7049 $token->{tag_name} eq 'rp') {
7050 ## has a |ruby| element in scope
7051 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7052 my $node = $self->{open_elements}->[$_];
7053 if ($node->[1] & RUBY_EL) {
7054 !!!cp ('t398.1');
7055 ## generate implied end tags
7056 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7057 !!!cp ('t398.2');
7058 pop @{$self->{open_elements}};
7059 }
7060 unless ($self->{open_elements}->[-1]->[1] & RUBY_EL) {
7061 !!!cp ('t398.3');
7062 !!!parse-error (type => 'not closed',
7063 text => $self->{open_elements}->[-1]->[0]
7064 ->manakai_local_name,
7065 token => $token);
7066 pop @{$self->{open_elements}}
7067 while not $self->{open_elements}->[-1]->[1] & RUBY_EL;
7068 }
7069 last INSCOPE;
7070 } elsif ($node->[1] & SCOPING_EL) {
7071 !!!cp ('t398.4');
7072 last INSCOPE;
7073 }
7074 } # INSCOPE
7075
7076 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7077
7078 !!!nack ('t398.5');
7079 !!!next-token;
7080 redo B;
7081 } elsif ($token->{tag_name} eq 'math' or
7082 $token->{tag_name} eq 'svg') {
7083 $reconstruct_active_formatting_elements->($insert_to_current);
7084
7085 ## "Adjust MathML attributes" ('math' only) - done in insert-element-f
7086
7087 ## "adjust SVG attributes" ('svg' only) - done in insert-element-f
7088
7089 ## "adjust foreign attributes" - done in insert-element-f
7090
7091 !!!insert-element-f ($token->{tag_name} eq 'math' ? $MML_NS : $SVG_NS, $token->{tag_name}, $token->{attributes}, $token);
7092
7093 if ($self->{self_closing}) {
7094 pop @{$self->{open_elements}};
7095 !!!ack ('t398.1');
7096 } else {
7097 !!!cp ('t398.2');
7098 $self->{insertion_mode} |= IN_FOREIGN_CONTENT_IM;
7099 ## NOTE: |<body><math><mi><svg>| -> "in foreign content" insertion
7100 ## mode, "in body" (not "in foreign content") secondary insertion
7101 ## mode, maybe.
7102 }
7103
7104 !!!next-token;
7105 next B;
7106 } elsif ({
7107 caption => 1, col => 1, colgroup => 1, frame => 1,
7108 frameset => 1, head => 1, option => 1, optgroup => 1,
7109 tbody => 1, td => 1, tfoot => 1, th => 1,
7110 thead => 1, tr => 1,
7111 }->{$token->{tag_name}}) {
7112 !!!cp ('t401');
7113 !!!parse-error (type => 'in body',
7114 text => $token->{tag_name}, token => $token);
7115 ## Ignore the token
7116 !!!nack ('t401.1'); ## NOTE: |<col/>| or |<frame/>| here is an error.
7117 !!!next-token;
7118 next B;
7119
7120 ## ISSUE: An issue on HTML5 new elements in the spec.
7121 } else {
7122 if ($token->{tag_name} eq 'image') {
7123 !!!cp ('t384');
7124 !!!parse-error (type => 'image', token => $token);
7125 $token->{tag_name} = 'img';
7126 } else {
7127 !!!cp ('t385');
7128 }
7129
7130 ## NOTE: There is an "as if <br>" code clone.
7131 $reconstruct_active_formatting_elements->($insert_to_current);
7132
7133 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7134
7135 if ({
7136 applet => 1, marquee => 1, object => 1,
7137 }->{$token->{tag_name}}) {
7138 !!!cp ('t380');
7139 push @$active_formatting_elements, ['#marker', ''];
7140 !!!nack ('t380.1');
7141 } elsif ({
7142 b => 1, big => 1, em => 1, font => 1, i => 1,
7143 s => 1, small => 1, strile => 1,
7144 strong => 1, tt => 1, u => 1,
7145 }->{$token->{tag_name}}) {
7146 !!!cp ('t375');
7147 push @$active_formatting_elements, $self->{open_elements}->[-1];
7148 !!!nack ('t375.1');
7149 } elsif ($token->{tag_name} eq 'input') {
7150 !!!cp ('t388');
7151 ## TODO: associate with $self->{form_element} if defined
7152 pop @{$self->{open_elements}};
7153 !!!ack ('t388.2');
7154 } elsif ({
7155 area => 1, basefont => 1, bgsound => 1, br => 1,
7156 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
7157 #image => 1,
7158 }->{$token->{tag_name}}) {
7159 !!!cp ('t388.1');
7160 pop @{$self->{open_elements}};
7161 !!!ack ('t388.3');
7162 } elsif ($token->{tag_name} eq 'select') {
7163 ## TODO: associate with $self->{form_element} if defined
7164
7165 if ($self->{insertion_mode} & TABLE_IMS or
7166 $self->{insertion_mode} & BODY_TABLE_IMS or
7167 $self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
7168 !!!cp ('t400.1');
7169 $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM;
7170 } else {
7171 !!!cp ('t400.2');
7172 $self->{insertion_mode} = IN_SELECT_IM;
7173 }
7174 !!!nack ('t400.3');
7175 } else {
7176 !!!nack ('t402');
7177 }
7178
7179 !!!next-token;
7180 next B;
7181 }
7182 } elsif ($token->{type} == END_TAG_TOKEN) {
7183 if ($token->{tag_name} eq 'body') {
7184 ## has a |body| element in scope
7185 my $i;
7186 INSCOPE: {
7187 for (reverse @{$self->{open_elements}}) {
7188 if ($_->[1] & BODY_EL) {
7189 !!!cp ('t405');
7190 $i = $_;
7191 last INSCOPE;
7192 } elsif ($_->[1] & SCOPING_EL) {
7193 !!!cp ('t405.1');
7194 last;
7195 }
7196 }
7197
7198 !!!parse-error (type => 'start tag not allowed',
7199 text => $token->{tag_name}, token => $token);
7200 ## NOTE: Ignore the token.
7201 !!!next-token;
7202 next B;
7203 } # INSCOPE
7204
7205 for (@{$self->{open_elements}}) {
7206 unless ($_->[1] & ALL_END_TAG_OPTIONAL_EL) {
7207 !!!cp ('t403');
7208 !!!parse-error (type => 'not closed',
7209 text => $_->[0]->manakai_local_name,
7210 token => $token);
7211 last;
7212 } else {
7213 !!!cp ('t404');
7214 }
7215 }
7216
7217 $self->{insertion_mode} = AFTER_BODY_IM;
7218 !!!next-token;
7219 next B;
7220 } elsif ($token->{tag_name} eq 'html') {
7221 ## TODO: Update this code. It seems that the code below is not
7222 ## up-to-date, though it has same effect as speced.
7223 if (@{$self->{open_elements}} > 1 and
7224 $self->{open_elements}->[1]->[1] & BODY_EL) {
7225 ## ISSUE: There is an issue in the spec.
7226 unless ($self->{open_elements}->[-1]->[1] & BODY_EL) {
7227 !!!cp ('t406');
7228 !!!parse-error (type => 'not closed',
7229 text => $self->{open_elements}->[1]->[0]
7230 ->manakai_local_name,
7231 token => $token);
7232 } else {
7233 !!!cp ('t407');
7234 }
7235 $self->{insertion_mode} = AFTER_BODY_IM;
7236 ## reprocess
7237 next B;
7238 } else {
7239 !!!cp ('t408');
7240 !!!parse-error (type => 'unmatched end tag',
7241 text => $token->{tag_name}, token => $token);
7242 ## Ignore the token
7243 !!!next-token;
7244 next B;
7245 }
7246 } elsif ({
7247 address => 1, blockquote => 1, center => 1, dir => 1,
7248 div => 1, dl => 1, fieldset => 1, listing => 1,
7249 menu => 1, ol => 1, pre => 1, ul => 1,
7250 dd => 1, dt => 1, li => 1,
7251 applet => 1, button => 1, marquee => 1, object => 1,
7252 }->{$token->{tag_name}}) {
7253 ## has an element in scope
7254 my $i;
7255 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7256 my $node = $self->{open_elements}->[$_];
7257 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7258 !!!cp ('t410');
7259 $i = $_;
7260 last INSCOPE;
7261 } elsif ($node->[1] & SCOPING_EL) {
7262 !!!cp ('t411');
7263 last INSCOPE;
7264 }
7265 } # INSCOPE
7266
7267 unless (defined $i) { # has an element in scope
7268 !!!cp ('t413');
7269 !!!parse-error (type => 'unmatched end tag',
7270 text => $token->{tag_name}, token => $token);
7271 ## NOTE: Ignore the token.
7272 } else {
7273 ## Step 1. generate implied end tags
7274 while ({
7275 ## END_TAG_OPTIONAL_EL
7276 dd => ($token->{tag_name} ne 'dd'),
7277 dt => ($token->{tag_name} ne 'dt'),
7278 li => ($token->{tag_name} ne 'li'),
7279 p => 1,
7280 rt => 1,
7281 rp => 1,
7282 }->{$self->{open_elements}->[-1]->[0]->manakai_local_name}) {
7283 !!!cp ('t409');
7284 pop @{$self->{open_elements}};
7285 }
7286
7287 ## Step 2.
7288 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7289 ne $token->{tag_name}) {
7290 !!!cp ('t412');
7291 !!!parse-error (type => 'not closed',
7292 text => $self->{open_elements}->[-1]->[0]
7293 ->manakai_local_name,
7294 token => $token);
7295 } else {
7296 !!!cp ('t414');
7297 }
7298
7299 ## Step 3.
7300 splice @{$self->{open_elements}}, $i;
7301
7302 ## Step 4.
7303 $clear_up_to_marker->()
7304 if {
7305 applet => 1, button => 1, marquee => 1, object => 1,
7306 }->{$token->{tag_name}};
7307 }
7308 !!!next-token;
7309 next B;
7310 } elsif ($token->{tag_name} eq 'form') {
7311 undef $self->{form_element};
7312
7313 ## has an element in scope
7314 my $i;
7315 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7316 my $node = $self->{open_elements}->[$_];
7317 if ($node->[1] & FORM_EL) {
7318 !!!cp ('t418');
7319 $i = $_;
7320 last INSCOPE;
7321 } elsif ($node->[1] & SCOPING_EL) {
7322 !!!cp ('t419');
7323 last INSCOPE;
7324 }
7325 } # INSCOPE
7326
7327 unless (defined $i) { # has an element in scope
7328 !!!cp ('t421');
7329 !!!parse-error (type => 'unmatched end tag',
7330 text => $token->{tag_name}, token => $token);
7331 ## NOTE: Ignore the token.
7332 } else {
7333 ## Step 1. generate implied end tags
7334 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7335 !!!cp ('t417');
7336 pop @{$self->{open_elements}};
7337 }
7338
7339 ## Step 2.
7340 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7341 ne $token->{tag_name}) {
7342 !!!cp ('t417.1');
7343 !!!parse-error (type => 'not closed',
7344 text => $self->{open_elements}->[-1]->[0]
7345 ->manakai_local_name,
7346 token => $token);
7347 } else {
7348 !!!cp ('t420');
7349 }
7350
7351 ## Step 3.
7352 splice @{$self->{open_elements}}, $i;
7353 }
7354
7355 !!!next-token;
7356 next B;
7357 } elsif ({
7358 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
7359 }->{$token->{tag_name}}) {
7360 ## has an element in scope
7361 my $i;
7362 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7363 my $node = $self->{open_elements}->[$_];
7364 if ($node->[1] & HEADING_EL) {
7365 !!!cp ('t423');
7366 $i = $_;
7367 last INSCOPE;
7368 } elsif ($node->[1] & SCOPING_EL) {
7369 !!!cp ('t424');
7370 last INSCOPE;
7371 }
7372 } # INSCOPE
7373
7374 unless (defined $i) { # has an element in scope
7375 !!!cp ('t425.1');
7376 !!!parse-error (type => 'unmatched end tag',
7377 text => $token->{tag_name}, token => $token);
7378 ## NOTE: Ignore the token.
7379 } else {
7380 ## Step 1. generate implied end tags
7381 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7382 !!!cp ('t422');
7383 pop @{$self->{open_elements}};
7384 }
7385
7386 ## Step 2.
7387 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7388 ne $token->{tag_name}) {
7389 !!!cp ('t425');
7390 !!!parse-error (type => 'unmatched end tag',
7391 text => $token->{tag_name}, token => $token);
7392 } else {
7393 !!!cp ('t426');
7394 }
7395
7396 ## Step 3.
7397 splice @{$self->{open_elements}}, $i;
7398 }
7399
7400 !!!next-token;
7401 next B;
7402 } elsif ($token->{tag_name} eq 'p') {
7403 ## has an element in scope
7404 my $i;
7405 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7406 my $node = $self->{open_elements}->[$_];
7407 if ($node->[1] & P_EL) {
7408 !!!cp ('t410.1');
7409 $i = $_;
7410 last INSCOPE;
7411 } elsif ($node->[1] & SCOPING_EL) {
7412 !!!cp ('t411.1');
7413 last INSCOPE;
7414 }
7415 } # INSCOPE
7416
7417 if (defined $i) {
7418 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7419 ne $token->{tag_name}) {
7420 !!!cp ('t412.1');
7421 !!!parse-error (type => 'not closed',
7422 text => $self->{open_elements}->[-1]->[0]
7423 ->manakai_local_name,
7424 token => $token);
7425 } else {
7426 !!!cp ('t414.1');
7427 }
7428
7429 splice @{$self->{open_elements}}, $i;
7430 } else {
7431 !!!cp ('t413.1');
7432 !!!parse-error (type => 'unmatched end tag',
7433 text => $token->{tag_name}, token => $token);
7434
7435 !!!cp ('t415.1');
7436 ## As if <p>, then reprocess the current token
7437 my $el;
7438 !!!create-element ($el, $HTML_NS, 'p',, $token);
7439 $insert->($el);
7440 ## NOTE: Not inserted into |$self->{open_elements}|.
7441 }
7442
7443 !!!next-token;
7444 next B;
7445 } elsif ({
7446 a => 1,
7447 b => 1, big => 1, em => 1, font => 1, i => 1,
7448 nobr => 1, s => 1, small => 1, strile => 1,
7449 strong => 1, tt => 1, u => 1,
7450 }->{$token->{tag_name}}) {
7451 !!!cp ('t427');
7452 $formatting_end_tag->($token);
7453 next B;
7454 } elsif ($token->{tag_name} eq 'br') {
7455 !!!cp ('t428');
7456 !!!parse-error (type => 'unmatched end tag',
7457 text => 'br', token => $token);
7458
7459 ## As if <br>
7460 $reconstruct_active_formatting_elements->($insert_to_current);
7461
7462 my $el;
7463 !!!create-element ($el, $HTML_NS, 'br',, $token);
7464 $insert->($el);
7465
7466 ## Ignore the token.
7467 !!!next-token;
7468 next B;
7469 } elsif ({
7470 caption => 1, col => 1, colgroup => 1, frame => 1,
7471 frameset => 1, head => 1, option => 1, optgroup => 1,
7472 tbody => 1, td => 1, tfoot => 1, th => 1,
7473 thead => 1, tr => 1,
7474 area => 1, basefont => 1, bgsound => 1,
7475 embed => 1, hr => 1, iframe => 1, image => 1,
7476 img => 1, input => 1, isindex => 1, noembed => 1,
7477 noframes => 1, param => 1, select => 1, spacer => 1,
7478 table => 1, textarea => 1, wbr => 1,
7479 noscript => 0, ## TODO: if scripting is enabled
7480 }->{$token->{tag_name}}) {
7481 !!!cp ('t429');
7482 !!!parse-error (type => 'unmatched end tag',
7483 text => $token->{tag_name}, token => $token);
7484 ## Ignore the token
7485 !!!next-token;
7486 next B;
7487
7488 ## ISSUE: Issue on HTML5 new elements in spec
7489
7490 } else {
7491 ## Step 1
7492 my $node_i = -1;
7493 my $node = $self->{open_elements}->[$node_i];
7494
7495 ## Step 2
7496 S2: {
7497 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7498 ## Step 1
7499 ## generate implied end tags
7500 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7501 !!!cp ('t430');
7502 ## NOTE: |<ruby><rt></ruby>|.
7503 ## ISSUE: <ruby><rt></rt> will also take this code path,
7504 ## which seems wrong.
7505 pop @{$self->{open_elements}};
7506 $node_i++;
7507 }
7508
7509 ## Step 2
7510 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7511 ne $token->{tag_name}) {
7512 !!!cp ('t431');
7513 ## NOTE: <x><y></x>
7514 !!!parse-error (type => 'not closed',
7515 text => $self->{open_elements}->[-1]->[0]
7516 ->manakai_local_name,
7517 token => $token);
7518 } else {
7519 !!!cp ('t432');
7520 }
7521
7522 ## Step 3
7523 splice @{$self->{open_elements}}, $node_i if $node_i < 0;
7524
7525 !!!next-token;
7526 last S2;
7527 } else {
7528 ## Step 3
7529 if (not ($node->[1] & FORMATTING_EL) and
7530 #not $phrasing_category->{$node->[1]} and
7531 ($node->[1] & SPECIAL_EL or
7532 $node->[1] & SCOPING_EL)) {
7533 !!!cp ('t433');
7534 !!!parse-error (type => 'unmatched end tag',
7535 text => $token->{tag_name}, token => $token);
7536 ## Ignore the token
7537 !!!next-token;
7538 last S2;
7539 }
7540
7541 !!!cp ('t434');
7542 }
7543
7544 ## Step 4
7545 $node_i--;
7546 $node = $self->{open_elements}->[$node_i];
7547
7548 ## Step 5;
7549 redo S2;
7550 } # S2
7551 next B;
7552 }
7553 }
7554 next B;
7555 } continue { # B
7556 if ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
7557 ## NOTE: The code below is executed in cases where it does not have
7558 ## to be, but it is harmless even in those cases.
7559 ## has an element in scope
7560 INSCOPE: {
7561 for (reverse 0..$#{$self->{open_elements}}) {
7562 my $node = $self->{open_elements}->[$_];
7563 if ($node->[1] & FOREIGN_EL) {
7564 last INSCOPE;
7565 } elsif ($node->[1] & SCOPING_EL) {
7566 last;
7567 }
7568 }
7569
7570 ## NOTE: No foreign element in scope.
7571 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
7572 } # INSCOPE
7573 }
7574 } # B
7575
7576 ## Stop parsing # MUST
7577
7578 ## TODO: script stuffs
7579 } # _tree_construct_main
7580
## Replaces the children of |$node| with the result of parsing a string
## as HTML.  The numbered "Step" comments below presumably refer to the
## |innerHTML| setter algorithm of the HTML5 specification.
##
## Invoked as a class method:
##
##   $class->set_inner_html ($node, $string, $onerror, $get_wrapper);
##
##   $node        - A node whose |node_type| is 9 (document) or 1 (element).
##   $string      - The markup to parse.  It is accessed through a
##                  reference to the caller's argument, so it is not copied.
##   $onerror     - Optional error handler.  In the document case it is
##                  handed to |parse_char_string|; in the element case it
##                  receives |line|, |column| and the parse error's own
##                  arguments, and when it is omitted errors are reported
##                  with |warn|.
##   $get_wrapper - Optional code reference (default: returns its first
##                  argument).  Only used in the document case, where it
##                  is handed to |parse_char_string|.
##
## Dies if |$node| has any other node type.
##
## NOTE: The prototype is kept for interface compatibility; Perl does not
## apply prototypes to method calls.
sub set_inner_html ($$$;$) {
  my $class = shift;
  my $node = shift;
  my $s = \$_[0];
  my $onerror = $_[1];
  my $get_wrapper = $_[2] || sub ($) { return $_[0] };

  ## ISSUE: Should {confident} be true?

  my $nt = $node->node_type;
  if ($nt == 9) {
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## NOTE: The child list is copied first so that removing nodes does
    ## not disturb the iteration.
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }

    ## Step 3, 4, 5 # MUST
    $class->parse_char_string ($$s => $node, $onerror, $get_wrapper);
  } elsif ($nt == 1) {
    ## TODO: If non-html element

    ## NOTE: Most of this code is copied from |parse_string|

    ## TODO: Support for $get_wrapper

    ## Step 1 # MUST
    ## A fresh document and a fresh parser are used; the parsed nodes are
    ## moved into |$node| at the end.
    my $this_doc = $node->owner_document;
    my $doc = $this_doc->implementation->create_document;
    $doc->manakai_is_html (1);
    my $p = $class->new;
    $p->{document} = $doc;

    ## Step 8 # MUST
    my $i = 0; # Index of the next character of |$$s| to be read.
    $p->{line_prev} = $p->{line} = 1;
    $p->{column_prev} = $p->{column} = 0;

    ## Input callback: each call advances by one character of |$$s|.  It
    ## shifts |prev_char|, stores the new code point in |next_char|,
    ## updates the line/column counters, turns CR and CR LF into LF,
    ## replaces NULL and code points above U+10FFFF by U+FFFD, and
    ## reports control characters, surrogates and noncharacters as parse
    ## errors.
    ## NOTE: This closure refers to |$p| and is stored in |$p|, which
    ## forms a reference cycle; it is broken at the end of this branch.
    $p->{set_next_char} = sub {
      my $self = shift;

      pop @{$self->{prev_char}};
      unshift @{$self->{prev_char}}, $self->{next_char};

      if ($i >= length $$s) {
        ## No more characters in the string: |next_char| becomes -1.
        $self->{next_char} = -1;
        return;
      }
      $self->{next_char} = ord substr $$s, $i++, 1;

      ($p->{line_prev}, $p->{column_prev}) = ($p->{line}, $p->{column});
      $p->{column}++;

      if ($self->{next_char} == 0x000A) { # LF
        $p->{line}++;
        $p->{column} = 0;
        !!!cp ('i1');
      } elsif ($self->{next_char} == 0x000D) { # CR
        ## A following LF is consumed together with the CR.
        $i++ if substr ($$s, $i, 1) eq "\x0A";
        $self->{next_char} = 0x000A; # LF # MUST
        $p->{line}++;
        $p->{column} = 0;
        !!!cp ('i2');
      } elsif ($self->{next_char} > 0x10FFFF) {
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
        !!!cp ('i3');
      } elsif ($self->{next_char} == 0x0000) { # NULL
        !!!cp ('i4');
        !!!parse-error (type => 'NULL');
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
      } elsif ($self->{next_char} <= 0x0008 or
               (0x000E <= $self->{next_char} and
                $self->{next_char} <= 0x001F) or
               (0x007F <= $self->{next_char} and
                $self->{next_char} <= 0x009F) or
               (0xD800 <= $self->{next_char} and
                $self->{next_char} <= 0xDFFF) or
               (0xFDD0 <= $self->{next_char} and
                $self->{next_char} <= 0xFDDF) or
               {
                0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
                0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
                0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
                0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
                0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
                0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
                0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
                0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
                0x10FFFE => 1, 0x10FFFF => 1,
               }->{$self->{next_char}}) {
        !!!cp ('i4.1');
        ## The character itself is kept; only an error is reported.
        if ($self->{next_char} < 0x10000) {
          !!!parse-error (type => 'control char',
                          text => (sprintf 'U+%04X', $self->{next_char}));
        } else {
          !!!parse-error (type => 'control char',
                          text => (sprintf 'U-%08X', $self->{next_char}));
        }
      }
    };
    $p->{prev_char} = [-1, -1, -1];
    $p->{next_char} = -1;

    ## Default error handler: |warn| with the position of the token if
    ## the error carries one, or the reported line/column otherwise.
    my $ponerror = $onerror || sub {
      my (%opt) = @_;
      my $line = $opt{line};
      my $column = $opt{column};
      if (defined $opt{token} and defined $opt{token}->{line}) {
        $line = $opt{token}->{line};
        $column = $opt{token}->{column};
      }
      warn "Parse error ($opt{type}) at line $line column $column\n";
    };
    ## NOTE: This closure also refers to |$p| (second reference cycle).
    $p->{parse_error} = sub {
      $ponerror->(line => $p->{line}, column => $p->{column}, @_);
    };

    $p->_initialize_tokenizer;
    $p->_initialize_tree_constructor;

    ## Step 2
    ## The initial content model is chosen from the local name of the
    ## context element; PCDATA is used for names not listed here.
    my $node_ln = $node->manakai_local_name;
    $p->{content_model} = {
      title => RCDATA_CONTENT_MODEL,
      textarea => RCDATA_CONTENT_MODEL,
      style => CDATA_CONTENT_MODEL,
      script => CDATA_CONTENT_MODEL,
      xmp => CDATA_CONTENT_MODEL,
      iframe => CDATA_CONTENT_MODEL,
      noembed => CDATA_CONTENT_MODEL,
      noframes => CDATA_CONTENT_MODEL,
      noscript => CDATA_CONTENT_MODEL,
      plaintext => PLAINTEXT_CONTENT_MODEL,
    }->{$node_ln};
    $p->{content_model} = PCDATA_CONTENT_MODEL
        unless defined $p->{content_model};
        ## ISSUE: What is "the name of the element"? local name?

    $p->{inner_html_node} = [$node, $el_category->{$node_ln}];
      ## TODO: Foreign element OK?

    ## Step 3
    my $root = $doc->create_element_ns
        ('http://www.w3.org/1999/xhtml', [undef, 'html']);

    ## Step 4 # MUST
    $doc->append_child ($root);

    ## Step 5 # MUST
    push @{$p->{open_elements}}, [$root, $el_category->{html}];

    undef $p->{head_element};

    ## Step 6 # MUST
    $p->_reset_insertion_mode;

    ## Step 7 # MUST
    ## Walk from the context node up through its ancestors; the nearest
    ## XHTML-namespace |form| element becomes the parser's form element.
    my $anode = $node;
    AN: while (defined $anode) {
      if ($anode->node_type == 1) {
        my $nsuri = $anode->namespace_uri;
        if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
          if ($anode->manakai_local_name eq 'form') {
            !!!cp ('i5');
            $p->{form_element} = $anode;
            last AN;
          }
        }
      }
      $anode = $anode->parent_node;
    } # AN

    ## Step 9 # MUST
    {
      ## NOTE: |$self| is declared here because the macro below is used
      ## outside of a method of the parser object (presumably the macro
      ## expands to code that refers to |$self|).
      my $self = $p;
      !!!next-token;
    }
    $p->_tree_construction_main;

    ## Step 10 # MUST
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }
    ## ISSUE: mutation events? read-only?

    ## Step 11 # MUST
    @cn = @{$root->child_nodes};
    for (@cn) {
      $this_doc->adopt_node ($_);
      $node->append_child ($_);
    }
    ## ISSUE: mutation events?

    $p->_terminate_tree_constructor;

    ## Both closures refer to |$p| while being stored in |$p| itself.
    ## Both have to be removed; otherwise the parser object would never
    ## be freed by reference counting.
    delete $p->{parse_error}; # delete loop
    delete $p->{set_next_char}; # delete loop
  } else {
    die "$0: |set_inner_html| is not defined for node of type $nt";
  }
} # set_inner_html
7784
7785 } # tree construction stage
7786
## Class |Whatpm::HTML::RestartParser|: made a subclass of |Error| by
## appending that class name to the package's |@ISA|.
package Whatpm::HTML::RestartParser;
our @ISA;
push @ISA, 'Error';

## True value returned to whoever loads this file.
1;
7791 # $Date: 2008/09/13 08:21:35 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24