/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.156 - (show annotations) (download) (as text)
Sat Aug 30 13:43:50 2008 UTC (16 years, 2 months ago) by wakaba
Branch: MAIN
Changes since 1.155: +10 -3 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	30 Aug 2008 13:30:24 -0000
	* tokenizer-test-1.dat: '"' and "'" at the end of attribute
	name (after another attribute) now raise parse error (HTML5
	revision 2123).  Empty unquoted attribute is no
	longer allowed (HTML5 revision 2122).

	* HTML-tokenizer.t: Hash keys were not sorted when dumped.

2008-08-30  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	30 Aug 2008 13:43:44 -0000
	* HTML.pm.src: '"' and "'" at the end of attribute
	name (after another attribute) now raise parse error (HTML5
	revision 2123).  Empty unquoted attribute values are no
	longer allowed (HTML5 revision 2122).

2008-08-30  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.155 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 use Error qw(:try);
5
6 ## ISSUE:
7 ## var doc = implementation.createDocument (null, null, null);
8 ## doc.write ('');
9 ## alert (doc.compatMode);
10
11 require IO::Handle;
12
## Namespace URIs used when creating elements and attributes.
my $HTML_NS  = 'http://www.w3.org/1999/xhtml';
my $MML_NS   = 'http://www.w3.org/1998/Math/MathML';
my $SVG_NS   = 'http://www.w3.org/2000/svg';
my $XLINK_NS = 'http://www.w3.org/1999/xlink';
my $XML_NS   = 'http://www.w3.org/XML/1998/namespace';
my $XMLNS_NS = 'http://www.w3.org/2000/xmlns/';
19
## Element category flags.  Each constant occupies exactly one bit, so
## categories can be combined with |\|| and tested with |&|.
sub A_EL ()                    { 1 << 0 }
sub ADDRESS_EL ()              { 1 << 1 }
sub BODY_EL ()                 { 1 << 2 }
sub BUTTON_EL ()               { 1 << 3 }
sub CAPTION_EL ()              { 1 << 4 }
sub DD_EL ()                   { 1 << 5 }
sub DIV_EL ()                  { 1 << 6 }
sub DT_EL ()                   { 1 << 7 }
sub FORM_EL ()                 { 1 << 8 }
sub FORMATTING_EL ()           { 1 << 9 }
sub FRAMESET_EL ()             { 1 << 10 }
sub HEADING_EL ()              { 1 << 11 }
sub HTML_EL ()                 { 1 << 12 }
sub LI_EL ()                   { 1 << 13 }
sub NOBR_EL ()                 { 1 << 14 }
sub OPTION_EL ()               { 1 << 15 }
sub OPTGROUP_EL ()             { 1 << 16 }
sub P_EL ()                    { 1 << 17 }
sub SELECT_EL ()               { 1 << 18 }
sub TABLE_EL ()                { 1 << 19 }
sub TABLE_CELL_EL ()           { 1 << 20 }
sub TABLE_ROW_EL ()            { 1 << 21 }
sub TABLE_ROW_GROUP_EL ()      { 1 << 22 }
sub MISC_SCOPING_EL ()         { 1 << 23 }
sub MISC_SPECIAL_EL ()         { 1 << 24 }
sub FOREIGN_EL ()              { 1 << 25 }
sub FOREIGN_FLOW_CONTENT_EL () { 1 << 26 }
sub MML_AXML_EL ()             { 1 << 27 }
sub RUBY_EL ()                 { 1 << 28 }
sub RUBY_COMPONENT_EL ()       { 1 << 29 }
50
## Composite category masks, each the bitwise OR of single-bit element
## category flags.

## |table|, |tr| and the row group elements.
sub TABLE_ROWS_EL () { TABLE_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL }

## NOTE: Used by the "generate implied end tags" algorithm.
## NOTE: One implementation of "generate implied end tags" uses a
## modified version of END_TAG_OPTIONAL_EL (search for the function mae).
sub END_TAG_OPTIONAL_EL () {
  DD_EL | DT_EL | LI_EL | P_EL | RUBY_COMPONENT_EL
}

## NOTE: Used by the </body> and EOF algorithms.
sub ALL_END_TAG_OPTIONAL_EL () {
  DD_EL | DT_EL | LI_EL | P_EL
      | BODY_EL | HTML_EL
      | TABLE_CELL_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL
}

sub SCOPING_EL () {
  BUTTON_EL | CAPTION_EL | HTML_EL
      | TABLE_EL | TABLE_CELL_EL
      | MISC_SCOPING_EL
}

sub TABLE_SCOPING_EL () { HTML_EL | TABLE_EL }

sub TABLE_ROWS_SCOPING_EL () { HTML_EL | TABLE_ROW_GROUP_EL }

sub TABLE_ROW_SCOPING_EL () { HTML_EL | TABLE_ROW_EL }

sub SPECIAL_EL () {
  ADDRESS_EL | BODY_EL | DIV_EL
      | DD_EL | DT_EL | LI_EL | P_EL
      | FORM_EL | FRAMESET_EL | HEADING_EL
      | OPTION_EL | OPTGROUP_EL | SELECT_EL
      | TABLE_ROW_EL | TABLE_ROW_GROUP_EL
      | MISC_SPECIAL_EL
}
127
## Maps an HTML element local name to its category flags.  Entries are
## grouped by category rather than listed alphabetically.
my $el_category = {
  ## Elements with a dedicated category flag.
  address  => ADDRESS_EL,
  body     => BODY_EL,
  button   => BUTTON_EL,
  caption  => CAPTION_EL,
  dd       => DD_EL,
  div      => DIV_EL,
  dt       => DT_EL,
  form     => FORM_EL,
  frameset => FRAMESET_EL,
  html     => HTML_EL,
  li       => LI_EL,
  optgroup => OPTGROUP_EL,
  option   => OPTION_EL,
  p        => P_EL,
  ruby     => RUBY_EL,
  select   => SELECT_EL,
  table    => TABLE_EL,
  tr       => TABLE_ROW_EL,

  ## Formatting elements; |a| and |nobr| carry an additional flag.
  a    => A_EL | FORMATTING_EL,
  nobr => NOBR_EL | FORMATTING_EL,
  (map { ($_ => FORMATTING_EL) } qw(
    b big em font i s small strike strong tt u
  )),

  (map { ($_ => HEADING_EL) } qw(h1 h2 h3 h4 h5 h6)),
  (map { ($_ => RUBY_COMPONENT_EL) } qw(rp rt)),
  (map { ($_ => TABLE_CELL_EL) } qw(td th)),
  (map { ($_ => TABLE_ROW_GROUP_EL) } qw(tbody tfoot thead)),
  (map { ($_ => MISC_SCOPING_EL) } qw(applet marquee object)),

  (map { ($_ => MISC_SPECIAL_EL) } qw(
    area base basefont bgsound blockquote br center col colgroup
    dir dl embed fieldset frame head hr iframe img input isindex
    link listing menu meta noembed noframes noscript ol param
    plaintext pre script spacer style textarea title ul wbr
  )),
};
215
## Category flags of foreign (MathML and SVG) elements, keyed by
## namespace URI and then by local name.
my $el_category_f = {
  $MML_NS => {
    'annotation-xml' => MML_AXML_EL,
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(mi mo mn ms mtext)),
  },
  $SVG_NS => {
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(foreignObject desc title)),
  },
  ## NOTE: In addition, FOREIGN_EL is set to non-HTML elements.
};
232
## Maps a lowercased SVG attribute name to its mixed-case spelling.
## Every key is the lowercase form of its value, so the table is derived
## from the list of mixed-case names.
my $svg_attr_name = {
  map { (lc ($_) => $_) } qw(
    attributeName attributeType baseFrequency baseProfile calcMode
    clipPathUnits contentScriptType contentStyleType diffuseConstant
    edgeMode externalResourcesRequired filterRes filterUnits glyphRef
    gradientTransform gradientUnits kernelMatrix kernelUnitLength
    keyPoints keySplines keyTimes lengthAdjust limitingConeAngle
    markerHeight markerUnits markerWidth maskContentUnits maskUnits
    numOctaves pathLength patternContentUnits patternTransform
    patternUnits pointsAtX pointsAtY pointsAtZ preserveAlpha
    preserveAspectRatio primitiveUnits refX refY repeatCount repeatDur
    requiredExtensions requiredFeatures specularConstant
    specularExponent spreadMethod startOffset stdDeviation stitchTiles
    surfaceScale systemLanguage tableValues targetX targetY textLength
    viewBox viewTarget xChannelSelector yChannelSelector zoomAndPan
  )
};
297
## Maps a qualified attribute name found on a foreign element to
## [namespace URI, [prefix, local name]].
my $foreign_attr_xname = {
  (map { ("xlink:$_" => [$XLINK_NS, ['xlink', $_]]) }
       qw(actuate arcrole href role show title type)),
  (map { ("xml:$_" => [$XML_NS, ['xml', $_]]) } qw(base lang space)),
  'xmlns' => [$XMLNS_NS, [undef, 'xmlns']],
  'xmlns:xlink' => [$XMLNS_NS, ['xmlns', 'xlink']],
};
312
313 ## ISSUE: xmlns:xlink="non-xlink-ns" is not an error.
314
## Replacement code points for the code points 0x80..0x9F, keyed by
## the original code point.  The list below is ordered, so its index
## $i corresponds to the key 0x80 + $i.
my $c1_entity_char = do {
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 80..87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 88..8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 90..97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 98..9F
  );
  +{ map { (0x80 + $_ => $replacement[$_]) } 0 .. $#replacement };
}; # $c1_entity_char
349
## Parses an HTML document held in a byte string.
##
## Arguments: the charset name (may be undef), the byte string or a
## reference to it, then the arguments that |parse_byte_stream| expects
## after its byte stream.  Returns whatever |parse_byte_stream| returns.
## Dies if the in-memory file handle cannot be opened.
sub parse_byte_string ($$$$;$) {
  my $self = shift;
  my $charset_name = shift;

  ## Accept either the string itself or a reference to it.  The handle
  ## is opened on the caller's scalar, so the string is not copied.
  my $bytes_ref = ref $_[0] ? $_[0] : \($_[0]);
  open my $input, '<', $bytes_ref
      or die "parse_byte_string: cannot open in-memory byte stream: $!";

  return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]);
} # parse_byte_string
356
## Parses an HTML document read from a byte stream, choosing the
## character encoding first.
##
## Invocant: a parser object, or a class name (in which case |new| is
## called).  Arguments: the charset name (may be undef), the byte stream
## (read with |getc|), then the arguments forwarded to
## |parse_char_stream| after its character stream.  The third remaining
## argument, if true, is used as the error handler; otherwise parse
## errors are reported with |warn|.
##
## The encoding is chosen in this order: the given charset name, a BOM
## within the first 1024 bytes, the result of
## |Whatpm::Charset::UniversalCharDet|, and finally |windows-1252|.
## |$self->{confident}| records whether the choice is considered
## certain and |$self->{input_encoding}| holds the chosen name.
##
## If |$self->{change_encoding}| throws |Whatpm::HTML::RestartParser|
## during parsing, the document is parsed again with the newly selected
## decode handle.  Returns the value returned by |parse_char_stream|.
357 sub parse_byte_stream ($$$$;$) {
358 my $self = ref $_[0] ? shift : shift->new;
359 my $charset_name = shift;
360 my $byte_stream = $_[0];
361
362 my $onerror = $_[2] || sub {
363 my (%opt) = @_;
364 warn "Parse error ($opt{type})\n";
365 };
366 $self->{parse_error} = $onerror; # updated later by parse_char_string
367
368 ## HTML5 encoding sniffing algorithm
369 require Message::Charset::Info;
370 my $charset;
371 my $buffer;
372 my ($char_stream, $e_status);
373
374 SNIFFING: {
375
376 ## Step 1
## Use the caller-supplied charset name if a decode handle is available
## for it; otherwise fall through to the following steps.
377 if (defined $charset_name) {
378 $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
379
380 ## ISSUE: Unsupported encoding is not ignored according to the spec.
381 ($char_stream, $e_status) = $charset->get_decode_handle
382 ($byte_stream, allow_error_reporting => 1,
383 allow_fallback => 1);
384 if ($char_stream) {
385 $self->{confident} = 1;
386 last SNIFFING;
387 } else {
388 ## TODO: unsupported error
389 }
390 }
391
392 ## Step 2
## Read up to 1024 bytes from the stream into |$byte_buffer|.
393 my $byte_buffer = '';
394 for (1..1024) {
395 my $char = $byte_stream->getc;
396 last unless defined $char;
397 $byte_buffer .= $char;
398 } ## TODO: timeout
399
400 ## Step 3
## A UTF-16BE, UTF-16LE or UTF-8 BOM at the start of the buffer selects
## the corresponding encoding with confidence.
401 if ($byte_buffer =~ /^\xFE\xFF/) {
402 $charset = Message::Charset::Info->get_by_iana_name ('utf-16be');
403 ($char_stream, $e_status) = $charset->get_decode_handle
404 ($byte_stream, allow_error_reporting => 1,
405 allow_fallback => 1, byte_buffer => \$byte_buffer);
406 $self->{confident} = 1;
407 last SNIFFING;
408 } elsif ($byte_buffer =~ /^\xFF\xFE/) {
409 $charset = Message::Charset::Info->get_by_iana_name ('utf-16le');
410 ($char_stream, $e_status) = $charset->get_decode_handle
411 ($byte_stream, allow_error_reporting => 1,
412 allow_fallback => 1, byte_buffer => \$byte_buffer);
413 $self->{confident} = 1;
414 last SNIFFING;
415 } elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) {
416 $charset = Message::Charset::Info->get_by_iana_name ('utf-8');
417 ($char_stream, $e_status) = $charset->get_decode_handle
418 ($byte_stream, allow_error_reporting => 1,
419 allow_fallback => 1, byte_buffer => \$byte_buffer);
420 $self->{confident} = 1;
421 last SNIFFING;
422 }
423
424 ## Step 4
425 ## TODO: <meta charset>
426
427 ## Step 5
428 ## TODO: from history
429
430 ## Step 6
## Guess the encoding from the buffered bytes.  The result is tentative
## (|confident| is 0), so |change_encoding| may later replace it.
431 require Whatpm::Charset::UniversalCharDet;
432 $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string
433 ($byte_buffer);
434 if (defined $charset_name) {
435 $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
436
437 ## ISSUE: Unsupported encoding is not ignored according to the spec.
438 require Whatpm::Charset::DecodeHandle;
439 $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
440 ($byte_stream);
441 ($char_stream, $e_status) = $charset->get_decode_handle
442 ($buffer, allow_error_reporting => 1,
443 allow_fallback => 1, byte_buffer => \$byte_buffer);
444 if ($char_stream) {
445 $buffer->{buffer} = $byte_buffer;
446 !!!parse-error (type => 'sniffing:chardet',
447 text => $charset_name,
448 level => $self->{level}->{info},
449 layer => 'encode',
450 line => 1, column => 1);
451 $self->{confident} = 0;
452 last SNIFFING;
453 }
454 }
455
456 ## Step 7: default
457 ## TODO: Make this configurable.
458 $charset = Message::Charset::Info->get_by_iana_name ('windows-1252');
459 ## NOTE: We choose |windows-1252| here, since |utf-8| should be
460 ## detectable in the step 6.
461 require Whatpm::Charset::DecodeHandle;
462 $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
463 ($byte_stream);
464 ($char_stream, $e_status)
465 = $charset->get_decode_handle ($buffer,
466 allow_error_reporting => 1,
467 allow_fallback => 1,
468 byte_buffer => \$byte_buffer);
469 $buffer->{buffer} = $byte_buffer;
470 !!!parse-error (type => 'sniffing:default',
471 text => 'windows-1252',
472 level => $self->{level}->{info},
473 line => 1, column => 1,
474 layer => 'encode');
475 $self->{confident} = 0;
476 } # SNIFFING
477
## Record the chosen encoding and report how the decode handle was
## obtained, based on the flags in |$e_status|.
478 $self->{input_encoding} = $charset->get_iana_name;
479 if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
480 !!!parse-error (type => 'chardecode:fallback',
481 text => $self->{input_encoding},
482 level => $self->{level}->{uncertain},
483 line => 1, column => 1,
484 layer => 'encode');
485 } elsif (not ($e_status &
486 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
487 !!!parse-error (type => 'chardecode:no error',
488 text => $self->{input_encoding},
489 level => $self->{level}->{uncertain},
490 line => 1, column => 1,
491 layer => 'encode');
492 }
493
## Callback taking the parser, the new charset name and a token.  It
## overwrites the enclosing |$charset_name|, |$charset|, |$char_stream|
## and |$e_status|, which the restart handler below reads.
494 $self->{change_encoding} = sub {
495 my $self = shift;
496 $charset_name = shift;
497 my $token = shift;
498
499 $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
500 ($char_stream, $e_status) = $charset->get_decode_handle
501 ($byte_stream, allow_error_reporting => 1, allow_fallback => 1,
502 byte_buffer => \ $buffer->{buffer});
503
504 if ($char_stream) { # if supported
505 ## "Change the encoding" algorithm:
506
507 ## Step 1
## A charset in the UTF-16 category is replaced by utf-8.
508 if ($charset->{category} &
509 Message::Charset::Info::CHARSET_CATEGORY_UTF16 ()) {
510 $charset = Message::Charset::Info->get_by_iana_name ('utf-8');
511 ($char_stream, $e_status) = $charset->get_decode_handle
512 ($byte_stream,
513 byte_buffer => \ $buffer->{buffer});
514 }
515 $charset_name = $charset->get_iana_name;
516
517 ## Step 2
## The new encoding equals the current one: mark it as confident and
## continue parsing without a restart.
518 if (defined $self->{input_encoding} and
519 $self->{input_encoding} eq $charset_name) {
520 !!!parse-error (type => 'charset label:matching',
521 text => $charset_name,
522 level => $self->{level}->{info});
523 $self->{confident} = 1;
524 return;
525 }
526
527 !!!parse-error (type => 'charset label detected',
528 text => $self->{input_encoding},
529 value => $charset_name,
530 level => $self->{level}->{warn},
531 token => $token);
532
533 ## Step 3
534 # if (can) {
535 ## change the encoding on the fly.
536 #$self->{confident} = 1;
537 #return;
538 # }
539
540 ## Step 4
541 throw Whatpm::HTML::RestartParser ();
542 }
543 }; # $self->{change_encoding}
544
## Error handler installed on the decode handle.  It reports the error
## at the current position and, when |octets| is supplied, overwrites
## the referenced value with U+FFFD.
545 my $char_onerror = sub {
546 my (undef, $type, %opt) = @_;
547 !!!parse-error (layer => 'encode',
548 %opt, type => $type,
549 line => $self->{line}, column => $self->{column} + 1);
550 if ($opt{octets}) {
551 ${$opt{octets}} = "\x{FFFD}"; # replacement character
552 }
553 };
554 $char_stream->onerror ($char_onerror);
555
## Drop the byte stream from the argument list; the remaining arguments
## are passed to |parse_char_stream| after the character stream.
556 my @args = @_; shift @args; # $s
557 my $return;
558 try {
559 $return = $self->parse_char_stream ($char_stream, @args);
560 } catch Whatpm::HTML::RestartParser with {
561 ## NOTE: Invoked after {change_encoding}.
562
563 $self->{input_encoding} = $charset->get_iana_name;
564 if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
565 !!!parse-error (type => 'chardecode:fallback',
566 text => $self->{input_encoding},
567 level => $self->{level}->{uncertain},
568 line => 1, column => 1,
569 layer => 'encode');
570 } elsif (not ($e_status &
571 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
572 !!!parse-error (type => 'chardecode:no error',
573 text => $self->{input_encoding},
574 level => $self->{level}->{uncertain},
575 line => 1, column => 1,
576 layer => 'encode');
577 }
578 $self->{confident} = 1;
579 $char_stream->onerror ($char_onerror);
580 $return = $self->parse_char_stream ($char_stream, @args);
581 };
582 return $return;
583 } # parse_byte_stream
584
585 ## NOTE: The HTML5 spec says that the encoding layer MUST NOT strip the
586 ## BOM and the HTML layer MUST ignore it. However, we do strip the BOM in
587 ## the encoding layer and the HTML layer does not ignore any U+FEFF,
588 ## because the core part of our HTML parser expects a string of characters,
589 ## not a string of bytes or code units or anything which might contain a BOM.
590 ## Therefore, any parser interface that accepts a string of bytes,
591 ## such as |parse_byte_string| in this module, must ensure that it
592 ## strips the BOM and never strips any ZWNBSP.
593
## Parses an HTML document held in a character string.
##
## Arguments: the string or a reference to it, then the arguments that
## |parse_char_stream| expects after its input stream.  Returns whatever
## |parse_char_stream| returns.  Dies if the in-memory file handle
## cannot be opened.  |parse_string| is an alias of this method.
sub parse_char_string ($$$;$) {
  my $self = shift;
  require utf8;
  my $s = ref $_[0] ? $_[0] : \($_[0]);

  my $input;
  my $octets; # must stay alive for as long as |$input| is read
  if (utf8::is_utf8 ($$s)) {
    ## In-memory file handles are byte-oriented: opening one directly on
    ## a character string fails for code points above 0xFF, and code
    ## points 0x80..0xFF would be read back as single bytes that the
    ## |:utf8| layer cannot decode.  Therefore read from an explicitly
    ## UTF-8 encoded copy instead of relying on the internal
    ## representation of the caller's scalar.
    $octets = $$s;
    utf8::encode ($octets);
    open $input, '<:utf8', \$octets
        or die "parse_char_string: cannot open in-memory stream: $!";
  } else {
    ## Every character is a single byte already; read the caller's
    ## scalar directly without copying it.
    open $input, '<', $s
        or die "parse_char_string: cannot open in-memory stream: $!";
  }

  return $self->parse_char_stream ($input, @_[1..$#_]);
} # parse_char_string
*parse_string = \&parse_char_string;
602
## Parses an HTML document read from a character stream.
##
## Invocant: a parser object, or a class name (in which case |new| is
## called).  Arguments: the input stream (read with |getc|), the
## document object to fill (its child nodes are removed first), and an
## optional error handler; without one, parse errors are reported with
## |warn|.  Returns the document object.
sub parse_char_stream ($$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $input = $_[0];
  $self->{document} = $_[1];
  @{$self->{document}->child_nodes} = ();

  ## NOTE: |set_inner_html| copies most of this method's code

  $self->{confident} = 1 unless exists $self->{confident};
  $self->{document}->input_encoding ($self->{input_encoding})
      if defined $self->{input_encoding};

  $self->{line_prev} = $self->{line} = 1;
  $self->{column_prev} = $self->{column} = 0;

  ## Reads the next input character into |$self->{next_char}| as a code
  ## point (-1 at the end of the input), normalizes newlines, replaces
  ## disallowed characters, and maintains the line and column counters
  ## and the three most recent characters in |$self->{prev_char}|.
  $self->{set_next_char} = sub {
    my $self = shift;

    pop @{$self->{prev_char}};
    unshift @{$self->{prev_char}}, $self->{next_char};

    my $char;
    if (defined $self->{next_next_char}) {
      ## A character read ahead while handling a CR.
      $char = $self->{next_next_char};
      delete $self->{next_next_char};
    } else {
      $char = $input->getc;
    }
    unless (defined $char) {
      $self->{next_char} = -1;
      return;
    }
    $self->{next_char} = ord $char;

    ($self->{line_prev}, $self->{column_prev})
        = ($self->{line}, $self->{column});
    $self->{column}++;

    if ($self->{next_char} == 0x000A) { # LF
      !!!cp ('j1');
      $self->{line}++;
      $self->{column} = 0;
    } elsif ($self->{next_char} == 0x000D) { # CR
      !!!cp ('j2');
      ## CR and CR LF are both turned into a single LF; a character
      ## other than LF following the CR is kept for the next call.
      my $next = $input->getc;
      if (defined $next and $next ne "\x0A") {
        $self->{next_next_char} = $next;
      }
      $self->{next_char} = 0x000A; # LF # MUST
      $self->{line}++;
      $self->{column} = 0;
    } elsif ($self->{next_char} > 0x10FFFF) {
      !!!cp ('j3');
      $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($self->{next_char} == 0x0000) { # NULL
      !!!cp ('j4');
      !!!parse-error (type => 'NULL');
      $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($self->{next_char} <= 0x0008 or
             (0x000E <= $self->{next_char} and $self->{next_char} <= 0x001F) or
             (0x007F <= $self->{next_char} and $self->{next_char} <= 0x009F) or
             (0xD800 <= $self->{next_char} and $self->{next_char} <= 0xDFFF) or
             (0xFDD0 <= $self->{next_char} and $self->{next_char} <= 0xFDDF) or
             ## U+nFFFE and U+nFFFF for every plane n = 0x00..0x10.
             ## Code points above U+10FFFF never reach this branch, so
             ## testing the low 16 bits is sufficient.
             ($self->{next_char} & 0xFFFE) == 0xFFFE) {
      !!!cp ('j5');
      if ($self->{next_char} < 0x10000) {
        !!!parse-error (type => 'control char',
                        text => (sprintf 'U+%04X', $self->{next_char}));
      } else {
        !!!parse-error (type => 'control char',
                        text => (sprintf 'U-%08X', $self->{next_char}));
      }
    }
  };
  $self->{prev_char} = [-1, -1, -1];
  $self->{next_char} = -1;

  my $onerror = $_[2] || sub {
    my (%opt) = @_;
    my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
    my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
    warn "Parse error ($opt{type}) at line $line column $column\n";
  };
  ## The current position is supplied first, so that |line| and |column|
  ## given by the reporter take precedence over it.
  $self->{parse_error} = sub {
    $onerror->(line => $self->{line}, column => $self->{column}, @_);
  };

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  delete $self->{parse_error}; # remove loop

  return $self->{document};
} # parse_char_stream
706
## Creates a new parser object.
##
## The object starts with the default error level codes and with
## placeholder callbacks: |set_next_char| sets |next_char| to -1, and
## the other callbacks do nothing.
sub new ($) {
  my $class = shift;
  my $self = bless {
    level => {must => 'm',
              warn => 'w',
              info => 'i',
              uncertain => 'u'},
  }, $class;

  ## The callback is stored in the object itself, so it must not hold a
  ## strong reference to the object; otherwise the two would form a
  ## reference cycle and the object would never be freed.
  require Scalar::Util;
  my $weak_self = $self;
  Scalar::Util::weaken ($weak_self);
  $self->{set_next_char} = sub {
    $weak_self->{next_char} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new
732
## Content model feature flags.
sub CM_ENTITY ()         { 1 << 0 } # & markup in data
sub CM_LIMITED_MARKUP () { 1 << 1 } # < markup in data (limited)
sub CM_FULL_MARKUP ()    { 1 << 2 } # < markup in data (any)

## Content models, expressed as combinations of the flags above.
sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL ()     { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL ()    { CM_LIMITED_MARKUP | CM_ENTITY }
sub PCDATA_CONTENT_MODEL ()    { CM_FULL_MARKUP | CM_ENTITY }
741
## Tokenizer states (values of |$self->{state}|).
use constant {
  DATA_STATE                                    => 0,
  ENTITY_DATA_STATE                             => 1,
  TAG_OPEN_STATE                                => 2,
  CLOSE_TAG_OPEN_STATE                          => 3,
  TAG_NAME_STATE                                => 4,
  BEFORE_ATTRIBUTE_NAME_STATE                   => 5,
  ATTRIBUTE_NAME_STATE                          => 6,
  AFTER_ATTRIBUTE_NAME_STATE                    => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE                  => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE                => 11,
  ENTITY_IN_ATTRIBUTE_VALUE_STATE               => 12,
  MARKUP_DECLARATION_OPEN_STATE                 => 13,
  COMMENT_START_STATE                           => 14,
  COMMENT_START_DASH_STATE                      => 15,
  COMMENT_STATE                                 => 16,
  COMMENT_END_STATE                             => 17,
  COMMENT_END_DASH_STATE                        => 18,
  BOGUS_COMMENT_STATE                           => 19,
  DOCTYPE_STATE                                 => 20,
  BEFORE_DOCTYPE_NAME_STATE                     => 21,
  DOCTYPE_NAME_STATE                            => 22,
  AFTER_DOCTYPE_NAME_STATE                      => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         => 31,
  BOGUS_DOCTYPE_STATE                           => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE            => 33,
  SELF_CLOSING_START_TAG_STATE                  => 34,
  CDATA_BLOCK_STATE                             => 35,
};
778
## Token types (values of |->{type}| of a token).
use constant {
  DOCTYPE_TOKEN     => 1,
  COMMENT_TOKEN     => 2,
  START_TAG_TOKEN   => 3,
  END_TAG_TOKEN     => 4,
  END_OF_FILE_TOKEN => 5,
  CHARACTER_TOKEN   => 6,
};
785
## Insertion mode group flags, one bit each.  The two lowest bits are
## left free; they distinguish the insertion modes within a group.
sub AFTER_HTML_IMS ()        { 1 << 2 }
sub HEAD_IMS ()              { 1 << 3 }
sub BODY_IMS ()              { 1 << 4 }
sub BODY_TABLE_IMS ()        { 1 << 5 }
sub TABLE_IMS ()             { 1 << 6 }
sub ROW_IMS ()               { 1 << 7 }
sub BODY_AFTER_IMS ()        { 1 << 8 }
sub FRAME_IMS ()             { 1 << 9 }
sub SELECT_IMS ()            { 1 << 10 }
sub IN_FOREIGN_CONTENT_IM () { 1 << 11 }
## NOTE: "in foreign content" insertion mode is special; it is combined
## with the secondary insertion mode. In this parser, they are stored
## together in the bit-or'ed form.

## NOTE: "initial" and "before html" insertion modes have no constants.

## NOTE: "after after body" insertion mode.
sub AFTER_HTML_BODY_IM ()     { AFTER_HTML_IMS | BODY_AFTER_IMS }

## NOTE: "after after frameset" insertion mode.
sub AFTER_HTML_FRAMESET_IM () { AFTER_HTML_IMS | FRAME_IMS }

sub IN_HEAD_IM ()            { HEAD_IMS | 0 }
sub IN_HEAD_NOSCRIPT_IM ()   { HEAD_IMS | 1 }
sub AFTER_HEAD_IM ()         { HEAD_IMS | 2 }
sub BEFORE_HEAD_IM ()        { HEAD_IMS | 3 }
sub IN_BODY_IM ()            { BODY_IMS }
sub IN_CELL_IM ()            { BODY_IMS | BODY_TABLE_IMS | 1 }
sub IN_CAPTION_IM ()         { BODY_IMS | BODY_TABLE_IMS | 2 }
sub IN_ROW_IM ()             { TABLE_IMS | ROW_IMS | 1 }
sub IN_TABLE_BODY_IM ()      { TABLE_IMS | ROW_IMS | 2 }
sub IN_TABLE_IM ()           { TABLE_IMS }
sub AFTER_BODY_IM ()         { BODY_AFTER_IMS }
sub IN_FRAMESET_IM ()        { FRAME_IMS | 1 }
sub AFTER_FRAMESET_IM ()     { FRAME_IMS | 2 }
sub IN_SELECT_IM ()          { SELECT_IMS | 1 }
sub IN_SELECT_IN_TABLE_IM () { SELECT_IMS | 2 }
sub IN_COLUMN_GROUP_IM ()    { 2 }
824
825 ## Implementations MUST act as if they used the state machine described in the spec.
826
## Resets the tokenizer state of the parser object and reads the first
## input character.
sub _initialize_tokenizer ($) {
  my ($self) = @_;

  $self->{state} = DATA_STATE; # MUST
  $self->{content_model} = PCDATA_CONTENT_MODEL; # be

  ## |current_token| is a start tag, end tag, comment, or DOCTYPE token.
  $self->{$_} = undef
      for qw(current_token current_attribute
             last_emitted_start_tag_name last_attribute_value_state);
  delete $self->{self_closing};

  $self->{char} = [];
  # $self->{next_char}
  !!!next-input-character;
  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
842
843 ## A token has:
844 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
845 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
846 ## ->{name} (DOCTYPE_TOKEN)
847 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
848 ## ->{public_identifier} (DOCTYPE_TOKEN)
849 ## ->{system_identifier} (DOCTYPE_TOKEN)
850 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
851 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
852 ## ->{name}
853 ## ->{value}
854 ## ->{has_reference} == 1 or 0
855 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
856 ## NOTE: The "self-closing flag" is held in |$self->{self_closing}|.
857 ## The token's own |->{self_closing}| is used to save the value of
858 ## |$self->{self_closing}| while the token is pushed back to the stack.
859
860 ## Emitted token MUST immediately be handled by the tree construction state.
861
862 ## Before each step, UA MAY check to see if either one of the scripts in
863 ## "list of scripts that will execute as soon as possible" or the first
864 ## script in the "list of scripts that will execute asynchronously",
865 ## has completed loading. If one has, then it MUST be executed
866 ## and removed from the list.
867
868 ## NOTE: HTML5 "Writing HTML documents" section, applied to
869 ## documents and not to user agents and conformance checkers,
870 ## contains some requirements that are not detected by the
871 ## parsing algorithm:
872 ## - Some requirements on character encoding declarations. ## TODO
873 ## - "Elements MUST NOT contain content that their content model disallows."
874 ## ... Some are parse error, some are not (will be reported by c.c.).
875 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
876 ## - Text (in elements, attributes, and comments) SHOULD NOT contain
877 ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
878
879 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
880 ## be detected by the HTML5 parsing algorithm:
881 ## - Text,
882
883 sub _get_next_token ($) {
884 my $self = shift;
885
886 if ($self->{self_closing}) {
887 !!!parse-error (type => 'nestc', token => $self->{current_token});
888 ## NOTE: The |self_closing| flag is only set by start tag token.
889 ## In addition, when a start tag token is emitted, it is always set to
890 ## |current_token|.
891 delete $self->{self_closing};
892 }
893
894 if (@{$self->{token}}) {
895 $self->{self_closing} = $self->{token}->[0]->{self_closing};
896 return shift @{$self->{token}};
897 }
898
899 A: {
900 if ($self->{state} == DATA_STATE) {
901 if ($self->{next_char} == 0x0026) { # &
902 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
903 not $self->{escape}) {
904 !!!cp (1);
905 $self->{state} = ENTITY_DATA_STATE;
906 !!!next-input-character;
907 redo A;
908 } else {
909 !!!cp (2);
910 #
911 }
912 } elsif ($self->{next_char} == 0x002D) { # -
913 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
914 unless ($self->{escape}) {
915 if ($self->{prev_char}->[0] == 0x002D and # -
916 $self->{prev_char}->[1] == 0x0021 and # !
917 $self->{prev_char}->[2] == 0x003C) { # <
918 !!!cp (3);
919 $self->{escape} = 1;
920 } else {
921 !!!cp (4);
922 }
923 } else {
924 !!!cp (5);
925 }
926 }
927
928 #
929 } elsif ($self->{next_char} == 0x003C) { # <
930 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
931 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
932 not $self->{escape})) {
933 !!!cp (6);
934 $self->{state} = TAG_OPEN_STATE;
935 !!!next-input-character;
936 redo A;
937 } else {
938 !!!cp (7);
939 #
940 }
941 } elsif ($self->{next_char} == 0x003E) { # >
942 if ($self->{escape} and
943 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
944 if ($self->{prev_char}->[0] == 0x002D and # -
945 $self->{prev_char}->[1] == 0x002D) { # -
946 !!!cp (8);
947 delete $self->{escape};
948 } else {
949 !!!cp (9);
950 }
951 } else {
952 !!!cp (10);
953 }
954
955 #
956 } elsif ($self->{next_char} == -1) {
957 !!!cp (11);
958 !!!emit ({type => END_OF_FILE_TOKEN,
959 line => $self->{line}, column => $self->{column}});
960 last A; ## TODO: ok?
961 } else {
962 !!!cp (12);
963 }
964 # Anything else
965 my $token = {type => CHARACTER_TOKEN,
966 data => chr $self->{next_char},
967 line => $self->{line}, column => $self->{column},
968 };
969 ## Stay in the data state
970 !!!next-input-character;
971
972 !!!emit ($token);
973
974 redo A;
975 } elsif ($self->{state} == ENTITY_DATA_STATE) {
976 ## (cannot happen in CDATA state)
977
978 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
979
980 my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
981
982 $self->{state} = DATA_STATE;
983 # next-input-character is already done
984
985 unless (defined $token) {
986 !!!cp (13);
987 !!!emit ({type => CHARACTER_TOKEN, data => '&',
988 line => $l, column => $c,
989 });
990 } else {
991 !!!cp (14);
992 !!!emit ($token);
993 }
994
995 redo A;
996 } elsif ($self->{state} == TAG_OPEN_STATE) {
997 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
998 if ($self->{next_char} == 0x002F) { # /
999 !!!cp (15);
1000 !!!next-input-character;
1001 $self->{state} = CLOSE_TAG_OPEN_STATE;
1002 redo A;
1003 } else {
1004 !!!cp (16);
1005 ## reconsume
1006 $self->{state} = DATA_STATE;
1007
1008 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1009 line => $self->{line_prev},
1010 column => $self->{column_prev},
1011 });
1012
1013 redo A;
1014 }
1015 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
1016 if ($self->{next_char} == 0x0021) { # !
1017 !!!cp (17);
1018 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
1019 !!!next-input-character;
1020 redo A;
1021 } elsif ($self->{next_char} == 0x002F) { # /
1022 !!!cp (18);
1023 $self->{state} = CLOSE_TAG_OPEN_STATE;
1024 !!!next-input-character;
1025 redo A;
1026 } elsif (0x0041 <= $self->{next_char} and
1027 $self->{next_char} <= 0x005A) { # A..Z
1028 !!!cp (19);
1029 $self->{current_token}
1030 = {type => START_TAG_TOKEN,
1031 tag_name => chr ($self->{next_char} + 0x0020),
1032 line => $self->{line_prev},
1033 column => $self->{column_prev}};
1034 $self->{state} = TAG_NAME_STATE;
1035 !!!next-input-character;
1036 redo A;
1037 } elsif (0x0061 <= $self->{next_char} and
1038 $self->{next_char} <= 0x007A) { # a..z
1039 !!!cp (20);
1040 $self->{current_token} = {type => START_TAG_TOKEN,
1041 tag_name => chr ($self->{next_char}),
1042 line => $self->{line_prev},
1043 column => $self->{column_prev}};
1044 $self->{state} = TAG_NAME_STATE;
1045 !!!next-input-character;
1046 redo A;
1047 } elsif ($self->{next_char} == 0x003E) { # >
1048 !!!cp (21);
1049 !!!parse-error (type => 'empty start tag',
1050 line => $self->{line_prev},
1051 column => $self->{column_prev});
1052 $self->{state} = DATA_STATE;
1053 !!!next-input-character;
1054
1055 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
1056 line => $self->{line_prev},
1057 column => $self->{column_prev},
1058 });
1059
1060 redo A;
1061 } elsif ($self->{next_char} == 0x003F) { # ?
1062 !!!cp (22);
1063 !!!parse-error (type => 'pio',
1064 line => $self->{line_prev},
1065 column => $self->{column_prev});
1066 $self->{state} = BOGUS_COMMENT_STATE;
1067 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1068 line => $self->{line_prev},
1069 column => $self->{column_prev},
1070 };
1071 ## $self->{next_char} is intentionally left as is
1072 redo A;
1073 } else {
1074 !!!cp (23);
1075 !!!parse-error (type => 'bare stago',
1076 line => $self->{line_prev},
1077 column => $self->{column_prev});
1078 $self->{state} = DATA_STATE;
1079 ## reconsume
1080
1081 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1082 line => $self->{line_prev},
1083 column => $self->{column_prev},
1084 });
1085
1086 redo A;
1087 }
1088 } else {
1089 die "$0: $self->{content_model} in tag open";
1090 }
1091 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
1092 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
1093 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1094 if (defined $self->{last_emitted_start_tag_name}) {
1095
1096 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
1097 my @next_char;
1098 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
1099 push @next_char, $self->{next_char};
1100 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
1101 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
1102 if ($self->{next_char} == $c or $self->{next_char} == $C) {
1103 !!!cp (24);
1104 !!!next-input-character;
1105 next TAGNAME;
1106 } else {
1107 !!!cp (25);
1108 $self->{next_char} = shift @next_char; # reconsume
1109 !!!back-next-input-character (@next_char);
1110 $self->{state} = DATA_STATE;
1111
1112 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1113 line => $l, column => $c,
1114 });
1115
1116 redo A;
1117 }
1118 }
1119 push @next_char, $self->{next_char};
1120
1121 unless ($self->{next_char} == 0x0009 or # HT
1122 $self->{next_char} == 0x000A or # LF
1123 $self->{next_char} == 0x000B or # VT
1124 $self->{next_char} == 0x000C or # FF
1125 $self->{next_char} == 0x0020 or # SP
1126 $self->{next_char} == 0x003E or # >
1127 $self->{next_char} == 0x002F or # /
1128 $self->{next_char} == -1) {
1129 !!!cp (26);
1130 $self->{next_char} = shift @next_char; # reconsume
1131 !!!back-next-input-character (@next_char);
1132 $self->{state} = DATA_STATE;
1133 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1134 line => $l, column => $c,
1135 });
1136 redo A;
1137 } else {
1138 !!!cp (27);
1139 $self->{next_char} = shift @next_char;
1140 !!!back-next-input-character (@next_char);
1141 # and consume...
1142 }
1143 } else {
1144 ## No start tag token has ever been emitted
1145 !!!cp (28);
1146 # next-input-character is already done
1147 $self->{state} = DATA_STATE;
1148 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1149 line => $l, column => $c,
1150 });
1151 redo A;
1152 }
1153 }
1154
1155 if (0x0041 <= $self->{next_char} and
1156 $self->{next_char} <= 0x005A) { # A..Z
1157 !!!cp (29);
1158 $self->{current_token}
1159 = {type => END_TAG_TOKEN,
1160 tag_name => chr ($self->{next_char} + 0x0020),
1161 line => $l, column => $c};
1162 $self->{state} = TAG_NAME_STATE;
1163 !!!next-input-character;
1164 redo A;
1165 } elsif (0x0061 <= $self->{next_char} and
1166 $self->{next_char} <= 0x007A) { # a..z
1167 !!!cp (30);
1168 $self->{current_token} = {type => END_TAG_TOKEN,
1169 tag_name => chr ($self->{next_char}),
1170 line => $l, column => $c};
1171 $self->{state} = TAG_NAME_STATE;
1172 !!!next-input-character;
1173 redo A;
1174 } elsif ($self->{next_char} == 0x003E) { # >
1175 !!!cp (31);
1176 !!!parse-error (type => 'empty end tag',
1177 line => $self->{line_prev}, ## "<" in "</>"
1178 column => $self->{column_prev} - 1);
1179 $self->{state} = DATA_STATE;
1180 !!!next-input-character;
1181 redo A;
1182 } elsif ($self->{next_char} == -1) {
1183 !!!cp (32);
1184 !!!parse-error (type => 'bare etago');
1185 $self->{state} = DATA_STATE;
1186 # reconsume
1187
1188 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1189 line => $l, column => $c,
1190 });
1191
1192 redo A;
1193 } else {
1194 !!!cp (33);
1195 !!!parse-error (type => 'bogus end tag');
1196 $self->{state} = BOGUS_COMMENT_STATE;
1197 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1198 line => $self->{line_prev}, # "<" of "</"
1199 column => $self->{column_prev} - 1,
1200 };
1201 ## $self->{next_char} is intentionally left as is
1202 redo A;
1203 }
1204 } elsif ($self->{state} == TAG_NAME_STATE) {
1205 if ($self->{next_char} == 0x0009 or # HT
1206 $self->{next_char} == 0x000A or # LF
1207 $self->{next_char} == 0x000B or # VT
1208 $self->{next_char} == 0x000C or # FF
1209 $self->{next_char} == 0x0020) { # SP
1210 !!!cp (34);
1211 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1212 !!!next-input-character;
1213 redo A;
1214 } elsif ($self->{next_char} == 0x003E) { # >
1215 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1216 !!!cp (35);
1217 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1218 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1219 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1220 #if ($self->{current_token}->{attributes}) {
1221 # ## NOTE: This should never be reached.
1222 # !!! cp (36);
1223 # !!! parse-error (type => 'end tag attribute');
1224 #} else {
1225 !!!cp (37);
1226 #}
1227 } else {
1228 die "$0: $self->{current_token}->{type}: Unknown token type";
1229 }
1230 $self->{state} = DATA_STATE;
1231 !!!next-input-character;
1232
1233 !!!emit ($self->{current_token}); # start tag or end tag
1234
1235 redo A;
1236 } elsif (0x0041 <= $self->{next_char} and
1237 $self->{next_char} <= 0x005A) { # A..Z
1238 !!!cp (38);
1239 $self->{current_token}->{tag_name} .= chr ($self->{next_char} + 0x0020);
1240 # start tag or end tag
1241 ## Stay in this state
1242 !!!next-input-character;
1243 redo A;
1244 } elsif ($self->{next_char} == -1) {
1245 !!!parse-error (type => 'unclosed tag');
1246 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1247 !!!cp (39);
1248 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1249 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1250 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1251 #if ($self->{current_token}->{attributes}) {
1252 # ## NOTE: This state should never be reached.
1253 # !!! cp (40);
1254 # !!! parse-error (type => 'end tag attribute');
1255 #} else {
1256 !!!cp (41);
1257 #}
1258 } else {
1259 die "$0: $self->{current_token}->{type}: Unknown token type";
1260 }
1261 $self->{state} = DATA_STATE;
1262 # reconsume
1263
1264 !!!emit ($self->{current_token}); # start tag or end tag
1265
1266 redo A;
1267 } elsif ($self->{next_char} == 0x002F) { # /
1268 !!!cp (42);
1269 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1270 !!!next-input-character;
1271 redo A;
1272 } else {
1273 !!!cp (44);
1274 $self->{current_token}->{tag_name} .= chr $self->{next_char};
1275 # start tag or end tag
1276 ## Stay in the state
1277 !!!next-input-character;
1278 redo A;
1279 }
1280 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1281 if ($self->{next_char} == 0x0009 or # HT
1282 $self->{next_char} == 0x000A or # LF
1283 $self->{next_char} == 0x000B or # VT
1284 $self->{next_char} == 0x000C or # FF
1285 $self->{next_char} == 0x0020) { # SP
1286 !!!cp (45);
1287 ## Stay in the state
1288 !!!next-input-character;
1289 redo A;
1290 } elsif ($self->{next_char} == 0x003E) { # >
1291 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1292 !!!cp (46);
1293 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1294 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1295 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1296 if ($self->{current_token}->{attributes}) {
1297 !!!cp (47);
1298 !!!parse-error (type => 'end tag attribute');
1299 } else {
1300 !!!cp (48);
1301 }
1302 } else {
1303 die "$0: $self->{current_token}->{type}: Unknown token type";
1304 }
1305 $self->{state} = DATA_STATE;
1306 !!!next-input-character;
1307
1308 !!!emit ($self->{current_token}); # start tag or end tag
1309
1310 redo A;
1311 } elsif (0x0041 <= $self->{next_char} and
1312 $self->{next_char} <= 0x005A) { # A..Z
1313 !!!cp (49);
1314 $self->{current_attribute}
1315 = {name => chr ($self->{next_char} + 0x0020),
1316 value => '',
1317 line => $self->{line}, column => $self->{column}};
1318 $self->{state} = ATTRIBUTE_NAME_STATE;
1319 !!!next-input-character;
1320 redo A;
1321 } elsif ($self->{next_char} == 0x002F) { # /
1322 !!!cp (50);
1323 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1324 !!!next-input-character;
1325 redo A;
1326 } elsif ($self->{next_char} == -1) {
1327 !!!parse-error (type => 'unclosed tag');
1328 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1329 !!!cp (52);
1330 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1331 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1332 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1333 if ($self->{current_token}->{attributes}) {
1334 !!!cp (53);
1335 !!!parse-error (type => 'end tag attribute');
1336 } else {
1337 !!!cp (54);
1338 }
1339 } else {
1340 die "$0: $self->{current_token}->{type}: Unknown token type";
1341 }
1342 $self->{state} = DATA_STATE;
1343 # reconsume
1344
1345 !!!emit ($self->{current_token}); # start tag or end tag
1346
1347 redo A;
1348 } else {
1349 if ({
1350 0x0022 => 1, # "
1351 0x0027 => 1, # '
1352 0x003D => 1, # =
1353 }->{$self->{next_char}}) {
1354 !!!cp (55);
1355 !!!parse-error (type => 'bad attribute name');
1356 } else {
1357 !!!cp (56);
1358 }
1359 $self->{current_attribute}
1360 = {name => chr ($self->{next_char}),
1361 value => '',
1362 line => $self->{line}, column => $self->{column}};
1363 $self->{state} = ATTRIBUTE_NAME_STATE;
1364 !!!next-input-character;
1365 redo A;
1366 }
1367 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1368 my $before_leave = sub {
1369 if (exists $self->{current_token}->{attributes} # start tag or end tag
1370 ->{$self->{current_attribute}->{name}}) { # MUST
1371 !!!cp (57);
1372 !!!parse-error (type => 'duplicate attribute', text => $self->{current_attribute}->{name}, line => $self->{current_attribute}->{line}, column => $self->{current_attribute}->{column});
1373 ## Discard $self->{current_attribute} # MUST
1374 } else {
1375 !!!cp (58);
1376 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
1377 = $self->{current_attribute};
1378 }
1379 }; # $before_leave
1380
1381 if ($self->{next_char} == 0x0009 or # HT
1382 $self->{next_char} == 0x000A or # LF
1383 $self->{next_char} == 0x000B or # VT
1384 $self->{next_char} == 0x000C or # FF
1385 $self->{next_char} == 0x0020) { # SP
1386 !!!cp (59);
1387 $before_leave->();
1388 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1389 !!!next-input-character;
1390 redo A;
1391 } elsif ($self->{next_char} == 0x003D) { # =
1392 !!!cp (60);
1393 $before_leave->();
1394 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1395 !!!next-input-character;
1396 redo A;
1397 } elsif ($self->{next_char} == 0x003E) { # >
1398 $before_leave->();
1399 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1400 !!!cp (61);
1401 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1402 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1403 !!!cp (62);
1404 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1405 if ($self->{current_token}->{attributes}) {
1406 !!!parse-error (type => 'end tag attribute');
1407 }
1408 } else {
1409 die "$0: $self->{current_token}->{type}: Unknown token type";
1410 }
1411 $self->{state} = DATA_STATE;
1412 !!!next-input-character;
1413
1414 !!!emit ($self->{current_token}); # start tag or end tag
1415
1416 redo A;
1417 } elsif (0x0041 <= $self->{next_char} and
1418 $self->{next_char} <= 0x005A) { # A..Z
1419 !!!cp (63);
1420 $self->{current_attribute}->{name} .= chr ($self->{next_char} + 0x0020);
1421 ## Stay in the state
1422 !!!next-input-character;
1423 redo A;
1424 } elsif ($self->{next_char} == 0x002F) { # /
1425 !!!cp (64);
1426 $before_leave->();
1427 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1428 !!!next-input-character;
1429 redo A;
1430 } elsif ($self->{next_char} == -1) {
1431 !!!parse-error (type => 'unclosed tag');
1432 $before_leave->();
1433 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1434 !!!cp (66);
1435 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1436 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1437 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1438 if ($self->{current_token}->{attributes}) {
1439 !!!cp (67);
1440 !!!parse-error (type => 'end tag attribute');
1441 } else {
1442 ## NOTE: This state should never be reached.
1443 !!!cp (68);
1444 }
1445 } else {
1446 die "$0: $self->{current_token}->{type}: Unknown token type";
1447 }
1448 $self->{state} = DATA_STATE;
1449 # reconsume
1450
1451 !!!emit ($self->{current_token}); # start tag or end tag
1452
1453 redo A;
1454 } else {
1455 if ($self->{next_char} == 0x0022 or # "
1456 $self->{next_char} == 0x0027) { # '
1457 !!!cp (69);
1458 !!!parse-error (type => 'bad attribute name');
1459 } else {
1460 !!!cp (70);
1461 }
1462 $self->{current_attribute}->{name} .= chr ($self->{next_char});
1463 ## Stay in the state
1464 !!!next-input-character;
1465 redo A;
1466 }
1467 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1468 if ($self->{next_char} == 0x0009 or # HT
1469 $self->{next_char} == 0x000A or # LF
1470 $self->{next_char} == 0x000B or # VT
1471 $self->{next_char} == 0x000C or # FF
1472 $self->{next_char} == 0x0020) { # SP
1473 !!!cp (71);
1474 ## Stay in the state
1475 !!!next-input-character;
1476 redo A;
1477 } elsif ($self->{next_char} == 0x003D) { # =
1478 !!!cp (72);
1479 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1480 !!!next-input-character;
1481 redo A;
1482 } elsif ($self->{next_char} == 0x003E) { # >
1483 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1484 !!!cp (73);
1485 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1486 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1487 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1488 if ($self->{current_token}->{attributes}) {
1489 !!!cp (74);
1490 !!!parse-error (type => 'end tag attribute');
1491 } else {
1492 ## NOTE: This state should never be reached.
1493 !!!cp (75);
1494 }
1495 } else {
1496 die "$0: $self->{current_token}->{type}: Unknown token type";
1497 }
1498 $self->{state} = DATA_STATE;
1499 !!!next-input-character;
1500
1501 !!!emit ($self->{current_token}); # start tag or end tag
1502
1503 redo A;
1504 } elsif (0x0041 <= $self->{next_char} and
1505 $self->{next_char} <= 0x005A) { # A..Z
1506 !!!cp (76);
1507 $self->{current_attribute}
1508 = {name => chr ($self->{next_char} + 0x0020),
1509 value => '',
1510 line => $self->{line}, column => $self->{column}};
1511 $self->{state} = ATTRIBUTE_NAME_STATE;
1512 !!!next-input-character;
1513 redo A;
1514 } elsif ($self->{next_char} == 0x002F) { # /
1515 !!!cp (77);
1516 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1517 !!!next-input-character;
1518 redo A;
1519 } elsif ($self->{next_char} == -1) {
1520 !!!parse-error (type => 'unclosed tag');
1521 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1522 !!!cp (79);
1523 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1524 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1525 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1526 if ($self->{current_token}->{attributes}) {
1527 !!!cp (80);
1528 !!!parse-error (type => 'end tag attribute');
1529 } else {
1530 ## NOTE: This state should never be reached.
1531 !!!cp (81);
1532 }
1533 } else {
1534 die "$0: $self->{current_token}->{type}: Unknown token type";
1535 }
1536 $self->{state} = DATA_STATE;
1537 # reconsume
1538
1539 !!!emit ($self->{current_token}); # start tag or end tag
1540
1541 redo A;
1542 } else {
1543 if ($self->{next_char} == 0x0022 or # "
1544 $self->{next_char} == 0x0027) { # '
1545 !!!cp (78);
1546 !!!parse-error (type => 'bad attribute name');
1547 } else {
1548 !!!cp (82);
1549 }
1550 $self->{current_attribute}
1551 = {name => chr ($self->{next_char}),
1552 value => '',
1553 line => $self->{line}, column => $self->{column}};
1554 $self->{state} = ATTRIBUTE_NAME_STATE;
1555 !!!next-input-character;
1556 redo A;
1557 }
1558 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1559 if ($self->{next_char} == 0x0009 or # HT
1560 $self->{next_char} == 0x000A or # LF
1561 $self->{next_char} == 0x000B or # VT
1562 $self->{next_char} == 0x000C or # FF
1563 $self->{next_char} == 0x0020) { # SP
1564 !!!cp (83);
1565 ## Stay in the state
1566 !!!next-input-character;
1567 redo A;
1568 } elsif ($self->{next_char} == 0x0022) { # "
1569 !!!cp (84);
1570 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1571 !!!next-input-character;
1572 redo A;
1573 } elsif ($self->{next_char} == 0x0026) { # &
1574 !!!cp (85);
1575 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1576 ## reconsume
1577 redo A;
1578 } elsif ($self->{next_char} == 0x0027) { # '
1579 !!!cp (86);
1580 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1581 !!!next-input-character;
1582 redo A;
1583 } elsif ($self->{next_char} == 0x003E) { # >
1584 !!!parse-error (type => 'empty unquoted attribute value');
1585 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1586 !!!cp (87);
1587 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1588 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1589 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1590 if ($self->{current_token}->{attributes}) {
1591 !!!cp (88);
1592 !!!parse-error (type => 'end tag attribute');
1593 } else {
1594 ## NOTE: This state should never be reached.
1595 !!!cp (89);
1596 }
1597 } else {
1598 die "$0: $self->{current_token}->{type}: Unknown token type";
1599 }
1600 $self->{state} = DATA_STATE;
1601 !!!next-input-character;
1602
1603 !!!emit ($self->{current_token}); # start tag or end tag
1604
1605 redo A;
1606 } elsif ($self->{next_char} == -1) {
1607 !!!parse-error (type => 'unclosed tag');
1608 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1609 !!!cp (90);
1610 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1611 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1612 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1613 if ($self->{current_token}->{attributes}) {
1614 !!!cp (91);
1615 !!!parse-error (type => 'end tag attribute');
1616 } else {
1617 ## NOTE: This state should never be reached.
1618 !!!cp (92);
1619 }
1620 } else {
1621 die "$0: $self->{current_token}->{type}: Unknown token type";
1622 }
1623 $self->{state} = DATA_STATE;
1624 ## reconsume
1625
1626 !!!emit ($self->{current_token}); # start tag or end tag
1627
1628 redo A;
1629 } else {
1630 if ($self->{next_char} == 0x003D) { # =
1631 !!!cp (93);
1632 !!!parse-error (type => 'bad attribute value');
1633 } else {
1634 !!!cp (94);
1635 }
1636 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1637 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1638 !!!next-input-character;
1639 redo A;
1640 }
1641 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1642 if ($self->{next_char} == 0x0022) { # "
1643 !!!cp (95);
1644 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1645 !!!next-input-character;
1646 redo A;
1647 } elsif ($self->{next_char} == 0x0026) { # &
1648 !!!cp (96);
1649 $self->{last_attribute_value_state} = $self->{state};
1650 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1651 !!!next-input-character;
1652 redo A;
1653 } elsif ($self->{next_char} == -1) {
1654 !!!parse-error (type => 'unclosed attribute value');
1655 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1656 !!!cp (97);
1657 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1658 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1659 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1660 if ($self->{current_token}->{attributes}) {
1661 !!!cp (98);
1662 !!!parse-error (type => 'end tag attribute');
1663 } else {
1664 ## NOTE: This state should never be reached.
1665 !!!cp (99);
1666 }
1667 } else {
1668 die "$0: $self->{current_token}->{type}: Unknown token type";
1669 }
1670 $self->{state} = DATA_STATE;
1671 ## reconsume
1672
1673 !!!emit ($self->{current_token}); # start tag or end tag
1674
1675 redo A;
1676 } else {
1677 !!!cp (100);
1678 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1679 ## Stay in the state
1680 !!!next-input-character;
1681 redo A;
1682 }
1683 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1684 if ($self->{next_char} == 0x0027) { # '
1685 !!!cp (101);
1686 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1687 !!!next-input-character;
1688 redo A;
1689 } elsif ($self->{next_char} == 0x0026) { # &
1690 !!!cp (102);
1691 $self->{last_attribute_value_state} = $self->{state};
1692 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1693 !!!next-input-character;
1694 redo A;
1695 } elsif ($self->{next_char} == -1) {
1696 !!!parse-error (type => 'unclosed attribute value');
1697 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1698 !!!cp (103);
1699 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1700 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1701 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1702 if ($self->{current_token}->{attributes}) {
1703 !!!cp (104);
1704 !!!parse-error (type => 'end tag attribute');
1705 } else {
1706 ## NOTE: This state should never be reached.
1707 !!!cp (105);
1708 }
1709 } else {
1710 die "$0: $self->{current_token}->{type}: Unknown token type";
1711 }
1712 $self->{state} = DATA_STATE;
1713 ## reconsume
1714
1715 !!!emit ($self->{current_token}); # start tag or end tag
1716
1717 redo A;
1718 } else {
1719 !!!cp (106);
1720 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1721 ## Stay in the state
1722 !!!next-input-character;
1723 redo A;
1724 }
1725 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1726 if ($self->{next_char} == 0x0009 or # HT
1727 $self->{next_char} == 0x000A or # LF
1728 $self->{next_char} == 0x000B or # VT
1729 $self->{next_char} == 0x000C or # FF
1730 $self->{next_char} == 0x0020) { # SP
1731 !!!cp (107);
1732 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1733 !!!next-input-character;
1734 redo A;
1735 } elsif ($self->{next_char} == 0x0026) { # &
1736 !!!cp (108);
1737 $self->{last_attribute_value_state} = $self->{state};
1738 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1739 !!!next-input-character;
1740 redo A;
1741 } elsif ($self->{next_char} == 0x003E) { # >
1742 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1743 !!!cp (109);
1744 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1745 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1746 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1747 if ($self->{current_token}->{attributes}) {
1748 !!!cp (110);
1749 !!!parse-error (type => 'end tag attribute');
1750 } else {
1751 ## NOTE: This state should never be reached.
1752 !!!cp (111);
1753 }
1754 } else {
1755 die "$0: $self->{current_token}->{type}: Unknown token type";
1756 }
1757 $self->{state} = DATA_STATE;
1758 !!!next-input-character;
1759
1760 !!!emit ($self->{current_token}); # start tag or end tag
1761
1762 redo A;
1763 } elsif ($self->{next_char} == -1) {
1764 !!!parse-error (type => 'unclosed tag');
1765 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1766 !!!cp (112);
1767 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1768 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1769 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1770 if ($self->{current_token}->{attributes}) {
1771 !!!cp (113);
1772 !!!parse-error (type => 'end tag attribute');
1773 } else {
1774 ## NOTE: This state should never be reached.
1775 !!!cp (114);
1776 }
1777 } else {
1778 die "$0: $self->{current_token}->{type}: Unknown token type";
1779 }
1780 $self->{state} = DATA_STATE;
1781 ## reconsume
1782
1783 !!!emit ($self->{current_token}); # start tag or end tag
1784
1785 redo A;
1786 } else {
1787 if ({
1788 0x0022 => 1, # "
1789 0x0027 => 1, # '
1790 0x003D => 1, # =
1791 }->{$self->{next_char}}) {
1792 !!!cp (115);
1793 !!!parse-error (type => 'bad attribute value');
1794 } else {
1795 !!!cp (116);
1796 }
1797 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1798 ## Stay in the state
1799 !!!next-input-character;
1800 redo A;
1801 }
1802 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1803 my $token = $self->_tokenize_attempt_to_consume_an_entity
1804 (1,
1805 $self->{last_attribute_value_state}
1806 == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
1807 $self->{last_attribute_value_state}
1808 == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
1809 -1);
1810
1811 unless (defined $token) {
1812 !!!cp (117);
1813 $self->{current_attribute}->{value} .= '&';
1814 } else {
1815 !!!cp (118);
1816 $self->{current_attribute}->{value} .= $token->{data};
1817 $self->{current_attribute}->{has_reference} = $token->{has_reference};
1818 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1819 }
1820
1821 $self->{state} = $self->{last_attribute_value_state};
1822 # next-input-character is already done
1823 redo A;
1824 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1825 if ($self->{next_char} == 0x0009 or # HT
1826 $self->{next_char} == 0x000A or # LF
1827 $self->{next_char} == 0x000B or # VT
1828 $self->{next_char} == 0x000C or # FF
1829 $self->{next_char} == 0x0020) { # SP
1830 !!!cp (118);
1831 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1832 !!!next-input-character;
1833 redo A;
1834 } elsif ($self->{next_char} == 0x003E) { # >
1835 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1836 !!!cp (119);
1837 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1838 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1839 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1840 if ($self->{current_token}->{attributes}) {
1841 !!!cp (120);
1842 !!!parse-error (type => 'end tag attribute');
1843 } else {
1844 ## NOTE: This state should never be reached.
1845 !!!cp (121);
1846 }
1847 } else {
1848 die "$0: $self->{current_token}->{type}: Unknown token type";
1849 }
1850 $self->{state} = DATA_STATE;
1851 !!!next-input-character;
1852
1853 !!!emit ($self->{current_token}); # start tag or end tag
1854
1855 redo A;
1856 } elsif ($self->{next_char} == 0x002F) { # /
1857 !!!cp (122);
1858 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1859 !!!next-input-character;
1860 redo A;
1861 } elsif ($self->{next_char} == -1) {
1862 !!!parse-error (type => 'unclosed tag');
1863 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1864 !!!cp (122.3);
1865 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1866 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1867 if ($self->{current_token}->{attributes}) {
1868 !!!cp (122.1);
1869 !!!parse-error (type => 'end tag attribute');
1870 } else {
1871 ## NOTE: This state should never be reached.
1872 !!!cp (122.2);
1873 }
1874 } else {
1875 die "$0: $self->{current_token}->{type}: Unknown token type";
1876 }
1877 $self->{state} = DATA_STATE;
1878 ## Reconsume.
1879 !!!emit ($self->{current_token}); # start tag or end tag
1880 redo A;
1881 } else {
1882 !!!cp ('124.1');
1883 !!!parse-error (type => 'no space between attributes');
1884 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1885 ## reconsume
1886 redo A;
1887 }
1888 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1889 if ($self->{next_char} == 0x003E) { # >
1890 if ($self->{current_token}->{type} == END_TAG_TOKEN) {
1891 !!!cp ('124.2');
1892 !!!parse-error (type => 'nestc', token => $self->{current_token});
1893 ## TODO: Different type than slash in start tag
1894 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1895 if ($self->{current_token}->{attributes}) {
1896 !!!cp ('124.4');
1897 !!!parse-error (type => 'end tag attribute');
1898 } else {
1899 !!!cp ('124.5');
1900 }
1901 ## TODO: Test |<title></title/>|
1902 } else {
1903 !!!cp ('124.3');
1904 $self->{self_closing} = 1;
1905 }
1906
1907 $self->{state} = DATA_STATE;
1908 !!!next-input-character;
1909
1910 !!!emit ($self->{current_token}); # start tag or end tag
1911
1912 redo A;
1913 } elsif ($self->{next_char} == -1) {
1914 !!!parse-error (type => 'unclosed tag');
1915 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1916 !!!cp (124.7);
1917 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1918 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1919 if ($self->{current_token}->{attributes}) {
1920 !!!cp (124.5);
1921 !!!parse-error (type => 'end tag attribute');
1922 } else {
1923 ## NOTE: This state should never be reached.
1924 !!!cp (124.6);
1925 }
1926 } else {
1927 die "$0: $self->{current_token}->{type}: Unknown token type";
1928 }
1929 $self->{state} = DATA_STATE;
1930 ## Reconsume.
1931 !!!emit ($self->{current_token}); # start tag or end tag
1932 redo A;
1933 } else {
1934 !!!cp ('124.4');
1935 !!!parse-error (type => 'nestc');
1936 ## TODO: This error type is wrong.
1937 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1938 ## Reconsume.
1939 redo A;
1940 }
1941 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1942 ## (only happens in the PCDATA state)
1943
1944 ## NOTE: Set by the previous state
1945 #my $token = {type => COMMENT_TOKEN, data => ''};
1946
1947 BC: {
1948 if ($self->{next_char} == 0x003E) { # >
1949 !!!cp (124);
1950 $self->{state} = DATA_STATE;
1951 !!!next-input-character;
1952
1953 !!!emit ($self->{current_token}); # comment
1954
1955 redo A;
1956 } elsif ($self->{next_char} == -1) {
1957 !!!cp (125);
1958 $self->{state} = DATA_STATE;
1959 ## reconsume
1960
1961 !!!emit ($self->{current_token}); # comment
1962
1963 redo A;
1964 } else {
1965 !!!cp (126);
1966 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
1967 !!!next-input-character;
1968 redo BC;
1969 }
1970 } # BC
1971
1972 die "$0: _get_next_token: unexpected case [BC]";
1973 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1974 ## (only happens in the PCDATA state)
1975
1976 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1);
1977
1978 my @next_char;
1979 push @next_char, $self->{next_char};
1980
1981 if ($self->{next_char} == 0x002D) { # -
1982 !!!next-input-character;
1983 push @next_char, $self->{next_char};
1984 if ($self->{next_char} == 0x002D) { # -
1985 !!!cp (127);
1986 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1987 line => $l, column => $c,
1988 };
1989 $self->{state} = COMMENT_START_STATE;
1990 !!!next-input-character;
1991 redo A;
1992 } else {
1993 !!!cp (128);
1994 }
1995 } elsif ($self->{next_char} == 0x0044 or # D
1996 $self->{next_char} == 0x0064) { # d
1997 !!!next-input-character;
1998 push @next_char, $self->{next_char};
1999 if ($self->{next_char} == 0x004F or # O
2000 $self->{next_char} == 0x006F) { # o
2001 !!!next-input-character;
2002 push @next_char, $self->{next_char};
2003 if ($self->{next_char} == 0x0043 or # C
2004 $self->{next_char} == 0x0063) { # c
2005 !!!next-input-character;
2006 push @next_char, $self->{next_char};
2007 if ($self->{next_char} == 0x0054 or # T
2008 $self->{next_char} == 0x0074) { # t
2009 !!!next-input-character;
2010 push @next_char, $self->{next_char};
2011 if ($self->{next_char} == 0x0059 or # Y
2012 $self->{next_char} == 0x0079) { # y
2013 !!!next-input-character;
2014 push @next_char, $self->{next_char};
2015 if ($self->{next_char} == 0x0050 or # P
2016 $self->{next_char} == 0x0070) { # p
2017 !!!next-input-character;
2018 push @next_char, $self->{next_char};
2019 if ($self->{next_char} == 0x0045 or # E
2020 $self->{next_char} == 0x0065) { # e
2021 !!!cp (129);
2022 ## TODO: This deeply nested character-by-character match is clumsy; restructure it.
2023 $self->{state} = DOCTYPE_STATE;
2024 $self->{current_token} = {type => DOCTYPE_TOKEN,
2025 quirks => 1,
2026 line => $l, column => $c,
2027 };
2028 !!!next-input-character;
2029 redo A;
2030 } else {
2031 !!!cp (130);
2032 }
2033 } else {
2034 !!!cp (131);
2035 }
2036 } else {
2037 !!!cp (132);
2038 }
2039 } else {
2040 !!!cp (133);
2041 }
2042 } else {
2043 !!!cp (134);
2044 }
2045 } else {
2046 !!!cp (135);
2047 }
2048 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2049 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
2050 $self->{next_char} == 0x005B) { # [
2051 !!!next-input-character;
2052 push @next_char, $self->{next_char};
2053 if ($self->{next_char} == 0x0043) { # C
2054 !!!next-input-character;
2055 push @next_char, $self->{next_char};
2056 if ($self->{next_char} == 0x0044) { # D
2057 !!!next-input-character;
2058 push @next_char, $self->{next_char};
2059 if ($self->{next_char} == 0x0041) { # A
2060 !!!next-input-character;
2061 push @next_char, $self->{next_char};
2062 if ($self->{next_char} == 0x0054) { # T
2063 !!!next-input-character;
2064 push @next_char, $self->{next_char};
2065 if ($self->{next_char} == 0x0041) { # A
2066 !!!next-input-character;
2067 push @next_char, $self->{next_char};
2068 if ($self->{next_char} == 0x005B) { # [
2069 !!!cp (135.1);
2070 $self->{state} = CDATA_BLOCK_STATE;
2071 !!!next-input-character;
2072 redo A;
2073 } else {
2074 !!!cp (135.2);
2075 }
2076 } else {
2077 !!!cp (135.3);
2078 }
2079 } else {
2080 !!!cp (135.4);
2081 }
2082 } else {
2083 !!!cp (135.5);
2084 }
2085 } else {
2086 !!!cp (135.6);
2087 }
2088 } else {
2089 !!!cp (135.7);
2090 }
2091 } else {
2092 !!!cp (136);
2093 }
2094
2095 !!!parse-error (type => 'bogus comment');
2096 $self->{next_char} = shift @next_char;
2097 !!!back-next-input-character (@next_char);
2098 $self->{state} = BOGUS_COMMENT_STATE;
2099 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
2100 line => $l, column => $c,
2101 };
2102 redo A;
2103
2104 ## ISSUE: typos in spec: chacacters, is is a parse error
2105 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
2106 } elsif ($self->{state} == COMMENT_START_STATE) {
2107 if ($self->{next_char} == 0x002D) { # -
2108 !!!cp (137);
2109 $self->{state} = COMMENT_START_DASH_STATE;
2110 !!!next-input-character;
2111 redo A;
2112 } elsif ($self->{next_char} == 0x003E) { # >
2113 !!!cp (138);
2114 !!!parse-error (type => 'bogus comment');
2115 $self->{state} = DATA_STATE;
2116 !!!next-input-character;
2117
2118 !!!emit ($self->{current_token}); # comment
2119
2120 redo A;
2121 } elsif ($self->{next_char} == -1) {
2122 !!!cp (139);
2123 !!!parse-error (type => 'unclosed comment');
2124 $self->{state} = DATA_STATE;
2125 ## reconsume
2126
2127 !!!emit ($self->{current_token}); # comment
2128
2129 redo A;
2130 } else {
2131 !!!cp (140);
2132 $self->{current_token}->{data} # comment
2133 .= chr ($self->{next_char});
2134 $self->{state} = COMMENT_STATE;
2135 !!!next-input-character;
2136 redo A;
2137 }
2138 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2139 if ($self->{next_char} == 0x002D) { # -
2140 !!!cp (141);
2141 $self->{state} = COMMENT_END_STATE;
2142 !!!next-input-character;
2143 redo A;
2144 } elsif ($self->{next_char} == 0x003E) { # >
2145 !!!cp (142);
2146 !!!parse-error (type => 'bogus comment');
2147 $self->{state} = DATA_STATE;
2148 !!!next-input-character;
2149
2150 !!!emit ($self->{current_token}); # comment
2151
2152 redo A;
2153 } elsif ($self->{next_char} == -1) {
2154 !!!cp (143);
2155 !!!parse-error (type => 'unclosed comment');
2156 $self->{state} = DATA_STATE;
2157 ## reconsume
2158
2159 !!!emit ($self->{current_token}); # comment
2160
2161 redo A;
2162 } else {
2163 !!!cp (144);
2164 $self->{current_token}->{data} # comment
2165 .= '-' . chr ($self->{next_char});
2166 $self->{state} = COMMENT_STATE;
2167 !!!next-input-character;
2168 redo A;
2169 }
2170 } elsif ($self->{state} == COMMENT_STATE) {
2171 if ($self->{next_char} == 0x002D) { # -
2172 !!!cp (145);
2173 $self->{state} = COMMENT_END_DASH_STATE;
2174 !!!next-input-character;
2175 redo A;
2176 } elsif ($self->{next_char} == -1) {
2177 !!!cp (146);
2178 !!!parse-error (type => 'unclosed comment');
2179 $self->{state} = DATA_STATE;
2180 ## reconsume
2181
2182 !!!emit ($self->{current_token}); # comment
2183
2184 redo A;
2185 } else {
2186 !!!cp (147);
2187 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
2188 ## Stay in the state
2189 !!!next-input-character;
2190 redo A;
2191 }
2192 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2193 if ($self->{next_char} == 0x002D) { # -
2194 !!!cp (148);
2195 $self->{state} = COMMENT_END_STATE;
2196 !!!next-input-character;
2197 redo A;
2198 } elsif ($self->{next_char} == -1) {
2199 !!!cp (149);
2200 !!!parse-error (type => 'unclosed comment');
2201 $self->{state} = DATA_STATE;
2202 ## reconsume
2203
2204 !!!emit ($self->{current_token}); # comment
2205
2206 redo A;
2207 } else {
2208 !!!cp (150);
2209 $self->{current_token}->{data} .= '-' . chr ($self->{next_char}); # comment
2210 $self->{state} = COMMENT_STATE;
2211 !!!next-input-character;
2212 redo A;
2213 }
2214 } elsif ($self->{state} == COMMENT_END_STATE) {
2215 if ($self->{next_char} == 0x003E) { # >
2216 !!!cp (151);
2217 $self->{state} = DATA_STATE;
2218 !!!next-input-character;
2219
2220 !!!emit ($self->{current_token}); # comment
2221
2222 redo A;
2223 } elsif ($self->{next_char} == 0x002D) { # -
2224 !!!cp (152);
2225 !!!parse-error (type => 'dash in comment',
2226 line => $self->{line_prev},
2227 column => $self->{column_prev});
2228 $self->{current_token}->{data} .= '-'; # comment
2229 ## Stay in the state
2230 !!!next-input-character;
2231 redo A;
2232 } elsif ($self->{next_char} == -1) {
2233 !!!cp (153);
2234 !!!parse-error (type => 'unclosed comment');
2235 $self->{state} = DATA_STATE;
2236 ## reconsume
2237
2238 !!!emit ($self->{current_token}); # comment
2239
2240 redo A;
2241 } else {
2242 !!!cp (154);
2243 !!!parse-error (type => 'dash in comment',
2244 line => $self->{line_prev},
2245 column => $self->{column_prev});
2246 $self->{current_token}->{data} .= '--' . chr ($self->{next_char}); # comment
2247 $self->{state} = COMMENT_STATE;
2248 !!!next-input-character;
2249 redo A;
2250 }
2251 } elsif ($self->{state} == DOCTYPE_STATE) {
2252 if ($self->{next_char} == 0x0009 or # HT
2253 $self->{next_char} == 0x000A or # LF
2254 $self->{next_char} == 0x000B or # VT
2255 $self->{next_char} == 0x000C or # FF
2256 $self->{next_char} == 0x0020) { # SP
2257 !!!cp (155);
2258 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2259 !!!next-input-character;
2260 redo A;
2261 } else {
2262 !!!cp (156);
2263 !!!parse-error (type => 'no space before DOCTYPE name');
2264 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2265 ## reconsume
2266 redo A;
2267 }
2268 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2269 if ($self->{next_char} == 0x0009 or # HT
2270 $self->{next_char} == 0x000A or # LF
2271 $self->{next_char} == 0x000B or # VT
2272 $self->{next_char} == 0x000C or # FF
2273 $self->{next_char} == 0x0020) { # SP
2274 !!!cp (157);
2275 ## Stay in the state
2276 !!!next-input-character;
2277 redo A;
2278 } elsif ($self->{next_char} == 0x003E) { # >
2279 !!!cp (158);
2280 !!!parse-error (type => 'no DOCTYPE name');
2281 $self->{state} = DATA_STATE;
2282 !!!next-input-character;
2283
2284 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2285
2286 redo A;
2287 } elsif ($self->{next_char} == -1) {
2288 !!!cp (159);
2289 !!!parse-error (type => 'no DOCTYPE name');
2290 $self->{state} = DATA_STATE;
2291 ## reconsume
2292
2293 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2294
2295 redo A;
2296 } else {
2297 !!!cp (160);
2298 $self->{current_token}->{name} = chr $self->{next_char};
2299 delete $self->{current_token}->{quirks};
2300 ## ISSUE: "Set the token's name name to the" in the spec
2301 $self->{state} = DOCTYPE_NAME_STATE;
2302 !!!next-input-character;
2303 redo A;
2304 }
2305 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2306 ## ISSUE: Redundant "First," in the spec.
2307 if ($self->{next_char} == 0x0009 or # HT
2308 $self->{next_char} == 0x000A or # LF
2309 $self->{next_char} == 0x000B or # VT
2310 $self->{next_char} == 0x000C or # FF
2311 $self->{next_char} == 0x0020) { # SP
2312 !!!cp (161);
2313 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2314 !!!next-input-character;
2315 redo A;
2316 } elsif ($self->{next_char} == 0x003E) { # >
2317 !!!cp (162);
2318 $self->{state} = DATA_STATE;
2319 !!!next-input-character;
2320
2321 !!!emit ($self->{current_token}); # DOCTYPE
2322
2323 redo A;
2324 } elsif ($self->{next_char} == -1) {
2325 !!!cp (163);
2326 !!!parse-error (type => 'unclosed DOCTYPE');
2327 $self->{state} = DATA_STATE;
2328 ## reconsume
2329
2330 $self->{current_token}->{quirks} = 1;
2331 !!!emit ($self->{current_token}); # DOCTYPE
2332
2333 redo A;
2334 } else {
2335 !!!cp (164);
2336 $self->{current_token}->{name}
2337 .= chr ($self->{next_char}); # DOCTYPE
2338 ## Stay in the state
2339 !!!next-input-character;
2340 redo A;
2341 }
2342 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2343 if ($self->{next_char} == 0x0009 or # HT
2344 $self->{next_char} == 0x000A or # LF
2345 $self->{next_char} == 0x000B or # VT
2346 $self->{next_char} == 0x000C or # FF
2347 $self->{next_char} == 0x0020) { # SP
2348 !!!cp (165);
2349 ## Stay in the state
2350 !!!next-input-character;
2351 redo A;
2352 } elsif ($self->{next_char} == 0x003E) { # >
2353 !!!cp (166);
2354 $self->{state} = DATA_STATE;
2355 !!!next-input-character;
2356
2357 !!!emit ($self->{current_token}); # DOCTYPE
2358
2359 redo A;
2360 } elsif ($self->{next_char} == -1) {
2361 !!!cp (167);
2362 !!!parse-error (type => 'unclosed DOCTYPE');
2363 $self->{state} = DATA_STATE;
2364 ## reconsume
2365
2366 $self->{current_token}->{quirks} = 1;
2367 !!!emit ($self->{current_token}); # DOCTYPE
2368
2369 redo A;
2370 } elsif ($self->{next_char} == 0x0050 or # P
2371 $self->{next_char} == 0x0070) { # p
2372 !!!next-input-character;
2373 if ($self->{next_char} == 0x0055 or # U
2374 $self->{next_char} == 0x0075) { # u
2375 !!!next-input-character;
2376 if ($self->{next_char} == 0x0042 or # B
2377 $self->{next_char} == 0x0062) { # b
2378 !!!next-input-character;
2379 if ($self->{next_char} == 0x004C or # L
2380 $self->{next_char} == 0x006C) { # l
2381 !!!next-input-character;
2382 if ($self->{next_char} == 0x0049 or # I
2383 $self->{next_char} == 0x0069) { # i
2384 !!!next-input-character;
2385 if ($self->{next_char} == 0x0043 or # C
2386 $self->{next_char} == 0x0063) { # c
2387 !!!cp (168);
2388 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2389 !!!next-input-character;
2390 redo A;
2391 } else {
2392 !!!cp (169);
2393 }
2394 } else {
2395 !!!cp (170);
2396 }
2397 } else {
2398 !!!cp (171);
2399 }
2400 } else {
2401 !!!cp (172);
2402 }
2403 } else {
2404 !!!cp (173);
2405 }
2406
2407 #
2408 } elsif ($self->{next_char} == 0x0053 or # S
2409 $self->{next_char} == 0x0073) { # s
2410 !!!next-input-character;
2411 if ($self->{next_char} == 0x0059 or # Y
2412 $self->{next_char} == 0x0079) { # y
2413 !!!next-input-character;
2414 if ($self->{next_char} == 0x0053 or # S
2415 $self->{next_char} == 0x0073) { # s
2416 !!!next-input-character;
2417 if ($self->{next_char} == 0x0054 or # T
2418 $self->{next_char} == 0x0074) { # t
2419 !!!next-input-character;
2420 if ($self->{next_char} == 0x0045 or # E
2421 $self->{next_char} == 0x0065) { # e
2422 !!!next-input-character;
2423 if ($self->{next_char} == 0x004D or # M
2424 $self->{next_char} == 0x006D) { # m
2425 !!!cp (174);
2426 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2427 !!!next-input-character;
2428 redo A;
2429 } else {
2430 !!!cp (175);
2431 }
2432 } else {
2433 !!!cp (176);
2434 }
2435 } else {
2436 !!!cp (177);
2437 }
2438 } else {
2439 !!!cp (178);
2440 }
2441 } else {
2442 !!!cp (179);
2443 }
2444
2445 #
2446 } else {
2447 !!!cp (180);
2448 !!!next-input-character;
2449 #
2450 }
2451
2452 !!!parse-error (type => 'string after DOCTYPE name');
2453 $self->{current_token}->{quirks} = 1;
2454
2455 $self->{state} = BOGUS_DOCTYPE_STATE;
2456 # next-input-character is already done
2457 redo A;
2458 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2459 if ({
2460 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2461 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2462 }->{$self->{next_char}}) {
2463 !!!cp (181);
2464 ## Stay in the state
2465 !!!next-input-character;
2466 redo A;
2467 } elsif ($self->{next_char} eq 0x0022) { # "
2468 !!!cp (182);
2469 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2470 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2471 !!!next-input-character;
2472 redo A;
2473 } elsif ($self->{next_char} eq 0x0027) { # '
2474 !!!cp (183);
2475 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2476 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2477 !!!next-input-character;
2478 redo A;
2479 } elsif ($self->{next_char} eq 0x003E) { # >
2480 !!!cp (184);
2481 !!!parse-error (type => 'no PUBLIC literal');
2482
2483 $self->{state} = DATA_STATE;
2484 !!!next-input-character;
2485
2486 $self->{current_token}->{quirks} = 1;
2487 !!!emit ($self->{current_token}); # DOCTYPE
2488
2489 redo A;
2490 } elsif ($self->{next_char} == -1) {
2491 !!!cp (185);
2492 !!!parse-error (type => 'unclosed DOCTYPE');
2493
2494 $self->{state} = DATA_STATE;
2495 ## reconsume
2496
2497 $self->{current_token}->{quirks} = 1;
2498 !!!emit ($self->{current_token}); # DOCTYPE
2499
2500 redo A;
2501 } else {
2502 !!!cp (186);
2503 !!!parse-error (type => 'string after PUBLIC');
2504 $self->{current_token}->{quirks} = 1;
2505
2506 $self->{state} = BOGUS_DOCTYPE_STATE;
2507 !!!next-input-character;
2508 redo A;
2509 }
2510 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2511 if ($self->{next_char} == 0x0022) { # "
2512 !!!cp (187);
2513 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2514 !!!next-input-character;
2515 redo A;
2516 } elsif ($self->{next_char} == 0x003E) { # >
2517 !!!cp (188);
2518 !!!parse-error (type => 'unclosed PUBLIC literal');
2519
2520 $self->{state} = DATA_STATE;
2521 !!!next-input-character;
2522
2523 $self->{current_token}->{quirks} = 1;
2524 !!!emit ($self->{current_token}); # DOCTYPE
2525
2526 redo A;
2527 } elsif ($self->{next_char} == -1) {
2528 !!!cp (189);
2529 !!!parse-error (type => 'unclosed PUBLIC literal');
2530
2531 $self->{state} = DATA_STATE;
2532 ## reconsume
2533
2534 $self->{current_token}->{quirks} = 1;
2535 !!!emit ($self->{current_token}); # DOCTYPE
2536
2537 redo A;
2538 } else {
2539 !!!cp (190);
2540 $self->{current_token}->{public_identifier} # DOCTYPE
2541 .= chr $self->{next_char};
2542 ## Stay in the state
2543 !!!next-input-character;
2544 redo A;
2545 }
2546 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2547 if ($self->{next_char} == 0x0027) { # '
2548 !!!cp (191);
2549 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2550 !!!next-input-character;
2551 redo A;
2552 } elsif ($self->{next_char} == 0x003E) { # >
2553 !!!cp (192);
2554 !!!parse-error (type => 'unclosed PUBLIC literal');
2555
2556 $self->{state} = DATA_STATE;
2557 !!!next-input-character;
2558
2559 $self->{current_token}->{quirks} = 1;
2560 !!!emit ($self->{current_token}); # DOCTYPE
2561
2562 redo A;
2563 } elsif ($self->{next_char} == -1) {
2564 !!!cp (193);
2565 !!!parse-error (type => 'unclosed PUBLIC literal');
2566
2567 $self->{state} = DATA_STATE;
2568 ## reconsume
2569
2570 $self->{current_token}->{quirks} = 1;
2571 !!!emit ($self->{current_token}); # DOCTYPE
2572
2573 redo A;
2574 } else {
2575 !!!cp (194);
2576 $self->{current_token}->{public_identifier} # DOCTYPE
2577 .= chr $self->{next_char};
2578 ## Stay in the state
2579 !!!next-input-character;
2580 redo A;
2581 }
2582 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2583 if ({
2584 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2585 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2586 }->{$self->{next_char}}) {
2587 !!!cp (195);
2588 ## Stay in the state
2589 !!!next-input-character;
2590 redo A;
2591 } elsif ($self->{next_char} == 0x0022) { # "
2592 !!!cp (196);
2593 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2594 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2595 !!!next-input-character;
2596 redo A;
2597 } elsif ($self->{next_char} == 0x0027) { # '
2598 !!!cp (197);
2599 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2600 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2601 !!!next-input-character;
2602 redo A;
2603 } elsif ($self->{next_char} == 0x003E) { # >
2604 !!!cp (198);
2605 $self->{state} = DATA_STATE;
2606 !!!next-input-character;
2607
2608 !!!emit ($self->{current_token}); # DOCTYPE
2609
2610 redo A;
2611 } elsif ($self->{next_char} == -1) {
2612 !!!cp (199);
2613 !!!parse-error (type => 'unclosed DOCTYPE');
2614
2615 $self->{state} = DATA_STATE;
2616 ## reconsume
2617
2618 $self->{current_token}->{quirks} = 1;
2619 !!!emit ($self->{current_token}); # DOCTYPE
2620
2621 redo A;
2622 } else {
2623 !!!cp (200);
2624 !!!parse-error (type => 'string after PUBLIC literal');
2625 $self->{current_token}->{quirks} = 1;
2626
2627 $self->{state} = BOGUS_DOCTYPE_STATE;
2628 !!!next-input-character;
2629 redo A;
2630 }
2631 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2632 if ({
2633 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2634 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2635 }->{$self->{next_char}}) {
2636 !!!cp (201);
2637 ## Stay in the state
2638 !!!next-input-character;
2639 redo A;
2640 } elsif ($self->{next_char} == 0x0022) { # "
2641 !!!cp (202);
2642 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2643 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2644 !!!next-input-character;
2645 redo A;
2646 } elsif ($self->{next_char} == 0x0027) { # '
2647 !!!cp (203);
2648 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2649 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2650 !!!next-input-character;
2651 redo A;
2652 } elsif ($self->{next_char} == 0x003E) { # >
2653 !!!cp (204);
2654 !!!parse-error (type => 'no SYSTEM literal');
2655 $self->{state} = DATA_STATE;
2656 !!!next-input-character;
2657
2658 $self->{current_token}->{quirks} = 1;
2659 !!!emit ($self->{current_token}); # DOCTYPE
2660
2661 redo A;
2662 } elsif ($self->{next_char} == -1) {
2663 !!!cp (205);
2664 !!!parse-error (type => 'unclosed DOCTYPE');
2665
2666 $self->{state} = DATA_STATE;
2667 ## reconsume
2668
2669 $self->{current_token}->{quirks} = 1;
2670 !!!emit ($self->{current_token}); # DOCTYPE
2671
2672 redo A;
2673 } else {
2674 !!!cp (206);
2675 !!!parse-error (type => 'string after SYSTEM');
2676 $self->{current_token}->{quirks} = 1;
2677
2678 $self->{state} = BOGUS_DOCTYPE_STATE;
2679 !!!next-input-character;
2680 redo A;
2681 }
2682 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2683 if ($self->{next_char} == 0x0022) { # "
2684 !!!cp (207);
2685 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2686 !!!next-input-character;
2687 redo A;
2688 } elsif ($self->{next_char} == 0x003E) { # >
2689 !!!cp (208);
2690 !!!parse-error (type => 'unclosed SYSTEM literal');
2691
2692 $self->{state} = DATA_STATE;
2693 !!!next-input-character;
2694
2695 $self->{current_token}->{quirks} = 1;
2696 !!!emit ($self->{current_token}); # DOCTYPE
2697
2698 redo A;
2699 } elsif ($self->{next_char} == -1) {
2700 !!!cp (209);
2701 !!!parse-error (type => 'unclosed SYSTEM literal');
2702
2703 $self->{state} = DATA_STATE;
2704 ## reconsume
2705
2706 $self->{current_token}->{quirks} = 1;
2707 !!!emit ($self->{current_token}); # DOCTYPE
2708
2709 redo A;
2710 } else {
2711 !!!cp (210);
2712 $self->{current_token}->{system_identifier} # DOCTYPE
2713 .= chr $self->{next_char};
2714 ## Stay in the state
2715 !!!next-input-character;
2716 redo A;
2717 }
2718 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2719 if ($self->{next_char} == 0x0027) { # '
2720 !!!cp (211);
2721 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2722 !!!next-input-character;
2723 redo A;
2724 } elsif ($self->{next_char} == 0x003E) { # >
2725 !!!cp (212);
2726 !!!parse-error (type => 'unclosed SYSTEM literal');
2727
2728 $self->{state} = DATA_STATE;
2729 !!!next-input-character;
2730
2731 $self->{current_token}->{quirks} = 1;
2732 !!!emit ($self->{current_token}); # DOCTYPE
2733
2734 redo A;
2735 } elsif ($self->{next_char} == -1) {
2736 !!!cp (213);
2737 !!!parse-error (type => 'unclosed SYSTEM literal');
2738
2739 $self->{state} = DATA_STATE;
2740 ## reconsume
2741
2742 $self->{current_token}->{quirks} = 1;
2743 !!!emit ($self->{current_token}); # DOCTYPE
2744
2745 redo A;
2746 } else {
2747 !!!cp (214);
2748 $self->{current_token}->{system_identifier} # DOCTYPE
2749 .= chr $self->{next_char};
2750 ## Stay in the state
2751 !!!next-input-character;
2752 redo A;
2753 }
2754 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2755 if ({
2756 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2757 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2758 }->{$self->{next_char}}) {
2759 !!!cp (215);
2760 ## Stay in the state
2761 !!!next-input-character;
2762 redo A;
2763 } elsif ($self->{next_char} == 0x003E) { # >
2764 !!!cp (216);
2765 $self->{state} = DATA_STATE;
2766 !!!next-input-character;
2767
2768 !!!emit ($self->{current_token}); # DOCTYPE
2769
2770 redo A;
2771 } elsif ($self->{next_char} == -1) {
2772 !!!cp (217);
2773 !!!parse-error (type => 'unclosed DOCTYPE');
2774 $self->{state} = DATA_STATE;
2775 ## reconsume
2776
2777 $self->{current_token}->{quirks} = 1;
2778 !!!emit ($self->{current_token}); # DOCTYPE
2779
2780 redo A;
2781 } else {
2782 !!!cp (218);
2783 !!!parse-error (type => 'string after SYSTEM literal');
2784 #$self->{current_token}->{quirks} = 1;
2785
2786 $self->{state} = BOGUS_DOCTYPE_STATE;
2787 !!!next-input-character;
2788 redo A;
2789 }
2790 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2791 if ($self->{next_char} == 0x003E) { # >
2792 !!!cp (219);
2793 $self->{state} = DATA_STATE;
2794 !!!next-input-character;
2795
2796 !!!emit ($self->{current_token}); # DOCTYPE
2797
2798 redo A;
2799 } elsif ($self->{next_char} == -1) {
2800 !!!cp (220);
2801 !!!parse-error (type => 'unclosed DOCTYPE');
2802 $self->{state} = DATA_STATE;
2803 ## reconsume
2804
2805 !!!emit ($self->{current_token}); # DOCTYPE
2806
2807 redo A;
2808 } else {
2809 !!!cp (221);
2810 ## Stay in the state
2811 !!!next-input-character;
2812 redo A;
2813 }
2814 } elsif ($self->{state} == CDATA_BLOCK_STATE) {
2815 my $s = '';
2816
2817 my ($l, $c) = ($self->{line}, $self->{column});
2818
2819 CS: while ($self->{next_char} != -1) {
2820 if ($self->{next_char} == 0x005D) { # ]
2821 !!!next-input-character;
2822 if ($self->{next_char} == 0x005D) { # ]
2823 !!!next-input-character;
2824 MDC: {
2825 if ($self->{next_char} == 0x003E) { # >
2826 !!!cp (221.1);
2827 !!!next-input-character;
2828 last CS;
2829 } elsif ($self->{next_char} == 0x005D) { # ]
2830 !!!cp (221.2);
2831 $s .= ']';
2832 !!!next-input-character;
2833 redo MDC;
2834 } else {
2835 !!!cp (221.3);
2836 $s .= ']]';
2837 #
2838 }
2839 } # MDC
2840 } else {
2841 !!!cp (221.4);
2842 $s .= ']';
2843 #
2844 }
2845 } else {
2846 !!!cp (221.5);
2847 #
2848 }
2849 $s .= chr $self->{next_char};
2850 !!!next-input-character;
2851 } # CS
2852
2853 $self->{state} = DATA_STATE;
2854 ## next-input-character done or EOF, which is reconsumed.
2855
2856 if (length $s) {
2857 !!!cp (221.6);
2858 !!!emit ({type => CHARACTER_TOKEN, data => $s,
2859 line => $l, column => $c});
2860 } else {
2861 !!!cp (221.7);
2862 }
2863
2864 redo A;
2865
2866 ## ISSUE: "text tokens" in spec.
2867 ## TODO: Streaming support
2868 } else {
2869 die "$0: $self->{state}: Unknown state";
2870 }
2871 } # A
2872
2873 die "$0: _get_next_token: unexpected case";
2874 } # _get_next_token
2875
2876 sub _tokenize_attempt_to_consume_an_entity ($$$) {
2877 my ($self, $in_attr, $additional) = @_;
2878
2879 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
2880
2881 if ({
2882 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
2883 0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
2884 $additional => 1,
2885 }->{$self->{next_char}}) {
2886 !!!cp (1001);
2887 ## Don't consume
2888 ## No error
2889 return undef;
2890 } elsif ($self->{next_char} == 0x0023) { # #
2891 !!!next-input-character;
2892 if ($self->{next_char} == 0x0078 or # x
2893 $self->{next_char} == 0x0058) { # X
2894 my $code;
2895 X: {
2896 my $x_char = $self->{next_char};
2897 !!!next-input-character;
2898 if (0x0030 <= $self->{next_char} and
2899 $self->{next_char} <= 0x0039) { # 0..9
2900 !!!cp (1002);
2901 $code ||= 0;
2902 $code *= 0x10;
2903 $code += $self->{next_char} - 0x0030;
2904 redo X;
2905 } elsif (0x0061 <= $self->{next_char} and
2906 $self->{next_char} <= 0x0066) { # a..f
2907 !!!cp (1003);
2908 $code ||= 0;
2909 $code *= 0x10;
2910 $code += $self->{next_char} - 0x0060 + 9;
2911 redo X;
2912 } elsif (0x0041 <= $self->{next_char} and
2913 $self->{next_char} <= 0x0046) { # A..F
2914 !!!cp (1004);
2915 $code ||= 0;
2916 $code *= 0x10;
2917 $code += $self->{next_char} - 0x0040 + 9;
2918 redo X;
2919 } elsif (not defined $code) { # no hexadecimal digit
2920 !!!cp (1005);
2921 !!!parse-error (type => 'bare hcro', line => $l, column => $c);
2922 !!!back-next-input-character ($x_char, $self->{next_char});
2923 $self->{next_char} = 0x0023; # #
2924 return undef;
2925 } elsif ($self->{next_char} == 0x003B) { # ;
2926 !!!cp (1006);
2927 !!!next-input-character;
2928 } else {
2929 !!!cp (1007);
2930 !!!parse-error (type => 'no refc', line => $l, column => $c);
2931 }
2932
2933 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2934 !!!cp (1008);
2935 !!!parse-error (type => 'invalid character reference',
2936 text => (sprintf 'U+%04X', $code),
2937 line => $l, column => $c);
2938 $code = 0xFFFD;
2939 } elsif ($code > 0x10FFFF) {
2940 !!!cp (1009);
2941 !!!parse-error (type => 'invalid character reference',
2942 text => (sprintf 'U-%08X', $code),
2943 line => $l, column => $c);
2944 $code = 0xFFFD;
2945 } elsif ($code == 0x000D) {
2946 !!!cp (1010);
2947 !!!parse-error (type => 'CR character reference', line => $l, column => $c);
2948 $code = 0x000A;
2949 } elsif (0x80 <= $code and $code <= 0x9F) {
2950 !!!cp (1011);
2951 !!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c);
2952 $code = $c1_entity_char->{$code};
2953 }
2954
2955 return {type => CHARACTER_TOKEN, data => chr $code,
2956 has_reference => 1,
2957 line => $l, column => $c,
2958 };
2959 } # X
2960 } elsif (0x0030 <= $self->{next_char} and
2961 $self->{next_char} <= 0x0039) { # 0..9
2962 my $code = $self->{next_char} - 0x0030;
2963 !!!next-input-character;
2964
2965 while (0x0030 <= $self->{next_char} and
2966 $self->{next_char} <= 0x0039) { # 0..9
2967 !!!cp (1012);
2968 $code *= 10;
2969 $code += $self->{next_char} - 0x0030;
2970
2971 !!!next-input-character;
2972 }
2973
2974 if ($self->{next_char} == 0x003B) { # ;
2975 !!!cp (1013);
2976 !!!next-input-character;
2977 } else {
2978 !!!cp (1014);
2979 !!!parse-error (type => 'no refc', line => $l, column => $c);
2980 }
2981
2982 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2983 !!!cp (1015);
2984 !!!parse-error (type => 'invalid character reference',
2985 text => (sprintf 'U+%04X', $code),
2986 line => $l, column => $c);
2987 $code = 0xFFFD;
2988 } elsif ($code > 0x10FFFF) {
2989 !!!cp (1016);
2990 !!!parse-error (type => 'invalid character reference',
2991 text => (sprintf 'U-%08X', $code),
2992 line => $l, column => $c);
2993 $code = 0xFFFD;
2994 } elsif ($code == 0x000D) {
2995 !!!cp (1017);
2996 !!!parse-error (type => 'CR character reference',
2997 line => $l, column => $c);
2998 $code = 0x000A;
2999 } elsif (0x80 <= $code and $code <= 0x9F) {
3000 !!!cp (1018);
3001 !!!parse-error (type => 'C1 character reference',
3002 text => (sprintf 'U+%04X', $code),
3003 line => $l, column => $c);
3004 $code = $c1_entity_char->{$code};
3005 }
3006
3007 return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1,
3008 line => $l, column => $c,
3009 };
3010 } else {
3011 !!!cp (1019);
3012 !!!parse-error (type => 'bare nero', line => $l, column => $c);
3013 !!!back-next-input-character ($self->{next_char});
3014 $self->{next_char} = 0x0023; # #
3015 return undef;
3016 }
3017 } elsif ((0x0041 <= $self->{next_char} and
3018 $self->{next_char} <= 0x005A) or
3019 (0x0061 <= $self->{next_char} and
3020 $self->{next_char} <= 0x007A)) {
3021 my $entity_name = chr $self->{next_char};
3022 !!!next-input-character;
3023
3024 my $value = $entity_name;
3025 my $match = 0;
3026 require Whatpm::_NamedEntityList;
3027 our $EntityChar;
3028
3029 while (length $entity_name < 30 and
3030 ## NOTE: Some number greater than the maximum length of entity name
3031 ((0x0041 <= $self->{next_char} and # a
3032 $self->{next_char} <= 0x005A) or # x
3033 (0x0061 <= $self->{next_char} and # a
3034 $self->{next_char} <= 0x007A) or # z
3035 (0x0030 <= $self->{next_char} and # 0
3036 $self->{next_char} <= 0x0039) or # 9
3037 $self->{next_char} == 0x003B)) { # ;
3038 $entity_name .= chr $self->{next_char};
3039 if (defined $EntityChar->{$entity_name}) {
3040 if ($self->{next_char} == 0x003B) { # ;
3041 !!!cp (1020);
3042 $value = $EntityChar->{$entity_name};
3043 $match = 1;
3044 !!!next-input-character;
3045 last;
3046 } else {
3047 !!!cp (1021);
3048 $value = $EntityChar->{$entity_name};
3049 $match = -1;
3050 !!!next-input-character;
3051 }
3052 } else {
3053 !!!cp (1022);
3054 $value .= chr $self->{next_char};
3055 $match *= 2;
3056 !!!next-input-character;
3057 }
3058 }
3059
3060 if ($match > 0) {
3061 !!!cp (1023);
3062 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
3063 line => $l, column => $c,
3064 };
3065 } elsif ($match < 0) {
3066 !!!parse-error (type => 'no refc', line => $l, column => $c);
3067 if ($in_attr and $match < -1) {
3068 !!!cp (1024);
3069 return {type => CHARACTER_TOKEN, data => '&'.$entity_name,
3070 line => $l, column => $c,
3071 };
3072 } else {
3073 !!!cp (1025);
3074 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
3075 line => $l, column => $c,
3076 };
3077 }
3078 } else {
3079 !!!cp (1026);
3080 !!!parse-error (type => 'bare ero', line => $l, column => $c);
3081 ## NOTE: "No characters are consumed" in the spec.
3082 return {type => CHARACTER_TOKEN, data => '&'.$value,
3083 line => $l, column => $c,
3084 };
3085 }
3086 } else {
3087 !!!cp (1027);
3088 ## no characters are consumed
3089 !!!parse-error (type => 'bare ero', line => $l, column => $c);
3090 return undef;
3091 }
3092 } # _tokenize_attempt_to_consume_an_entity
3093
## Put |$self->{document}| into the state used during tree construction:
## strict error checking is switched off, the document is flagged as
## an HTML document, and the |manakai_source_line| and
## |manakai_source_column| user data entries are both set to 1.
##
## NOTE: |$self->{document}| MUST be set before this method is called.
##
## Returns whatever the final |set_user_data| call returns.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  $doc->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $doc->manakai_is_html (1); # MUST

  $doc->set_user_data (manakai_source_line => 1);
  $doc->set_user_data (manakai_source_column => 1);
} # _initialize_tree_constructor
3104
## Undo the tree construction set-up: switch strict error checking on
## |$self->{document}| back on.
##
## Returns whatever |strict_error_checking| returns.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};
  ## TODO: Turn mutation events on
  $doc->strict_error_checking (1);
} # _terminate_tree_constructor
3110
3111 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
3112
3113 { # tree construction stage
3114 my $token;
3115
## Entry point of the tree construction stage.  Fetches the first
## token, resets the per-parse state kept on |$self|, and then runs
## the "initial", "before html" and remaining insertion modes in turn.
##
## Returns whatever |_tree_construction_main| returns.
sub _construct_tree ($) {
  my ($self) = @_;

  ## It is not defined when an interactive UA makes
  ## |$self->{document}| available to the user, nor when it starts
  ## accepting user input.

  ## Appending a character means: gather it together with every
  ## directly following character and insert a single Text node
  ## holding the concatenation of them all. # MUST

  !!!next-token;

  ## Start from an empty stack of open elements and forget the form
  ## element pointer, the head element pointer and the innerHTML
  ## context node.
  $self->{open_elements} = [];
  $self->{$_} = undef
      for qw(form_element head_element inner_html_node);

  ## NOTE: The "initial" insertion mode.
  $self->_tree_construction_initial (); # MUST

  ## NOTE: The "before html" insertion mode.
  $self->_tree_construction_root_element ();
  $self->{insertion_mode} = BEFORE_HEAD_IM;

  ## NOTE: The "before head" insertion mode and everything after it.
  $self->_tree_construction_main ();
} # _construct_tree
3144
## Process tokens in the "initial" insertion mode.
##
## Reads the file-scoped current |$token| and consumes tokens until
## the "before html" insertion mode has to take over:
##
##   - A DOCTYPE token is checked for HTML5 conformance, a document
##     type definition node is appended to |$self->{document}|, the
##     document's compat mode is chosen from the DOCTYPE name, public
##     identifier and system identifier, and the next token is fetched.
##   - A start tag, end tag or end-of-file token raises a "no DOCTYPE"
##     parse error and selects quirks mode; the token is left for
##     reprocessing.
##   - A character token has its leading white space dropped.  If
##     nothing remains the next token is examined; otherwise it is
##     handled like a missing DOCTYPE and the remainder is left for
##     reprocessing.
##   - A comment token is appended to the document and the next token
##     is examined.
##
## Dies on a token of an unknown type.
sub _tree_construction_initial ($) {
  my $self = shift;

  ## NOTE: "initial" insertion mode

  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!cp ('t1');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif ($doctype_name ne 'HTML') {
        !!!cp ('t2');
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5', token => $token);
      } else {
        !!!cp ('t3');
      }

      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      ## NOTE: Default value for both |public_id| and |system_id| attributes
      ## are empty strings, so that we don't set any value in missing cases.
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if ($token->{quirks} or $doctype_name ne 'HTML') {
        !!!cp ('t4');
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## Public identifiers are compared ASCII case-insensitively, so
        ## work on an upper-cased copy; the table below is upper case.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;

        ## Public identifier prefixes that select quirks mode.
        my $quirks_prefixes = [
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//",
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//IETF//DTD HTML 2.0 LEVEL 1//",
          "-//IETF//DTD HTML 2.0 LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT//",
          "-//IETF//DTD HTML 2.0//",
          "-//IETF//DTD HTML 2.1E//",
          "-//IETF//DTD HTML 3.0//",
          "-//IETF//DTD HTML 3.2 FINAL//",
          "-//IETF//DTD HTML 3.2//",
          "-//IETF//DTD HTML 3//",
          "-//IETF//DTD HTML LEVEL 0//",
          "-//IETF//DTD HTML LEVEL 1//",
          "-//IETF//DTD HTML LEVEL 2//",
          "-//IETF//DTD HTML LEVEL 3//",
          "-//IETF//DTD HTML STRICT LEVEL 0//",
          "-//IETF//DTD HTML STRICT LEVEL 1//",
          "-//IETF//DTD HTML STRICT LEVEL 2//",
          "-//IETF//DTD HTML STRICT LEVEL 3//",
          "-//IETF//DTD HTML STRICT//",
          "-//IETF//DTD HTML//",
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//",
          "-//NETSCAPE COMM. CORP.//DTD HTML//",
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//",
          "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//",
          "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//",
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//",
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//",
          "-//W3C//DTD HTML 3 1995-03-24//",
          "-//W3C//DTD HTML 3.2 DRAFT//",
          "-//W3C//DTD HTML 3.2 FINAL//",
          "-//W3C//DTD HTML 3.2//",
          "-//W3C//DTD HTML 3.2S DRAFT//",
          "-//W3C//DTD HTML 4.0 FRAMESET//",
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//",
          ## NOTE(review): "EXPERIMETNAL" looks like a transposition of
          ## "EXPERIMENTAL"; kept as is - confirm against the spec
          ## revision this table was copied from.
          "-//W3C//DTD HTML EXPERIMETNAL 19960712//",
          "-//W3C//DTD HTML EXPERIMENTAL 970421//",
          "-//W3C//DTD W3 HTML//",
          "-//W3O//DTD W3 HTML 3.0//",
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//",
          "-//WEBTECHS//DTD MOZILLA HTML//",
        ]; # $quirks_prefixes

        ## Does the public identifier start with any of the prefixes?
        ## (The comparison has to be made against |$pubid|, not against
        ## the table reference itself.)
        my $match;
        for (@$quirks_prefixes) {
          if (substr ($pubid, 0, length $_) eq $_) {
            $match = 1;
            last;
          }
        }
        if ($match or
            $pubid eq "-//W3O//DTD W3 HTML STRICT 3.0//EN//" or
            $pubid eq "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" or
            $pubid eq "HTML") {
          !!!cp ('t5');
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid =~ m[^-//W3C//DTD HTML 4.01 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD HTML 4.01 TRANSITIONAL//]) {
          ## HTML 4.01 Frameset/Transitional: quirks when the system
          ## identifier is missing, limited quirks when it is present.
          if (not defined $token->{system_identifier}) {
            !!!cp ('t6');
            $self->{document}->manakai_compat_mode ('quirks');
          } else {
            !!!cp ('t7');
            $self->{document}->manakai_compat_mode ('limited quirks');
          }
        } elsif ($pubid =~ m[^-//W3C//DTD XHTML 1.0 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD XHTML 1.0 TRANSITIONAL//]) {
          !!!cp ('t8');
          $self->{document}->manakai_compat_mode ('limited quirks');
        } else {
          !!!cp ('t9');
        }
      } else {
        !!!cp ('t10');
      }
      if (defined $token->{system_identifier}) {
        ## System identifiers are compared in lower case.
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          ## NOTE: Ensure that |PUBLIC "(limited quirks)" "(quirks)"| is
          ## marked as quirks.
          $self->{document}->manakai_compat_mode ('quirks');
          !!!cp ('t11');
        } else {
          !!!cp ('t12');
        }
      } else {
        !!!cp ('t13');
      }

      ## Go to the "before html" insertion mode.
      !!!next-token;
      return;
    } elsif ({
              START_TAG_TOKEN, 1,
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t14');
      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      !!!ack-later;
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          !!!cp ('t15');
          ## Stay in the insertion mode.
          !!!next-token;
          redo INITIAL;
        } else {
          !!!cp ('t16');
        }
      } else {
        !!!cp ('t17');
      }

      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t18');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the insertion mode.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL

  die "$0: _tree_construction_initial: This should be never reached";
} # _tree_construction_initial
3344
## Process tokens in the "before html" insertion mode.
##
## Reads the file-scoped current |$token| and consumes tokens until
## the root |html| element has been created, appended to
## |$self->{document}| and pushed onto |$self->{open_elements}|:
##
##   - DOCTYPE tokens raise a parse error and are ignored.
##   - Comment tokens are appended to the document.
##   - Leading white space of a character token is dropped; a token
##     consisting only of white space is skipped entirely.
##   - An |html| start tag creates the root element from the token
##     (with its attributes) and fetches the next token.
##   - Anything else creates an attribute-less |html| element and
##     leaves the current token for reprocessing.
##
## The |$self->{application_cache_selection}| callback is invoked
## exactly once per call: with the value of the |manifest| attribute
## when the |html| start tag carries one, and with |undef| otherwise.
##
## Dies on a token of an unknown type.
sub _tree_construction_root_element ($) {
  my $self = shift;

  ## NOTE: "before html" insertion mode.

  B: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      !!!cp ('t19');
      !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
      ## Ignore the token
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t20');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token.

        unless (length $token->{data}) {
          !!!cp ('t21');
          ## Stay in the insertion mode.
          !!!next-token;
          redo B;
        } else {
          !!!cp ('t22');
        }
      } else {
        !!!cp ('t23');
      }

      ## NOTE: The application cache selection callback is not invoked
      ## here; the common "anything else" code below does it, so that
      ## the callback runs only once for this token.

      #
    } elsif ($token->{type} == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html') {
        my $root_element;
        !!!create-element ($root_element, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}},
            [$root_element, $el_category->{html}];

        if ($token->{attributes}->{manifest}) {
          !!!cp ('t24');
          $self->{application_cache_selection}
              ->($token->{attributes}->{manifest}->{value});
          ## ISSUE: Spec is unclear on relative references.
          ## According to Hixie (#whatwg 2008-03-19), it should be
          ## resolved against the base URI of the document in HTML
          ## or xml:base of the element in XHTML.
        } else {
          !!!cp ('t25');
          $self->{application_cache_selection}->(undef);
        }

        !!!nack ('t25c');

        !!!next-token;
        return; ## Go to the "before head" insertion mode.
      } else {
        !!!cp ('t25.1');
        #
      }
    } elsif ({
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t26');
      #
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## "Anything else": create the |html| element without attributes.
    my $root_element;
    !!!create-element ($root_element, $HTML_NS, 'html',, $token);
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, $el_category->{html}];

    $self->{application_cache_selection}->(undef);

    ## NOTE: Reprocess the token.
    !!!ack-later;
    return; ## Go to the "before head" insertion mode.

    ## ISSUE: There is an issue in the spec
  } # B

  die "$0: _tree_construction_root_element: This should never be reached";
} # _tree_construction_root_element
3439
## Reset the insertion mode appropriately.
##
## Walks |$self->{open_elements}| from the last entry towards the
## first one and sets |$self->{insertion_mode}| according to the first
## entry that determines a mode.  When the first entry of the stack is
## reached, |$self->{inner_html_node}| is examined in its place; the
## method dies if no such node is defined at that point.
sub _reset_insertion_mode ($) {
  my ($self) = @_;

  ## Step 1: the "last" flag.
  my $is_last;

  ## Step 2: begin with the last entry of the stack.
  my $index = -1;
  my $node = $self->{open_elements}->[$index];

  NODE: while (1) {
    ## Step 3
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $is_last = 1;
      die "_reset_insertion_mode: t27"
          unless defined $self->{inner_html_node};
      !!!cp ('t28');
      $node = $self->{inner_html_node};
    }

    ## Step 4..14
    my $new_mode;
    if ($node->[1] & FOREIGN_EL) {
      !!!cp ('t28.1');
      ## NOTE: Strictly speaking, the line below applies to MathML
      ## and SVG elements only.  At present these are the only
      ## foreign elements the HTML syntax supports.
      $new_mode = IN_BODY_IM | IN_FOREIGN_CONTENT_IM;
    } elsif ($node->[1] & TABLE_CELL_EL) {
      if (not $is_last) {
        !!!cp ('t28.3');
        $new_mode = IN_CELL_IM;
      } else {
        !!!cp ('t28.2');
        #
      }
    } else {
      !!!cp ('t28.4');
      my $local_name = $node->[0]->manakai_local_name;
      $new_mode = {
        select => IN_SELECT_IM,
        ## NOTE: |option| and |optgroup| themselves do not switch the
        ## insertion mode to "in select".
        tr => IN_ROW_IM,
        tbody => IN_TABLE_BODY_IM,
        thead => IN_TABLE_BODY_IM,
        tfoot => IN_TABLE_BODY_IM,
        caption => IN_CAPTION_IM,
        colgroup => IN_COLUMN_GROUP_IM,
        table => IN_TABLE_IM,
        head => IN_BODY_IM, # not in head!
        body => IN_BODY_IM,
        frameset => IN_FRAMESET_IM,
      }->{$local_name};
    }
    if (defined $new_mode) {
      ## NOTE: Returns only when the assigned mode is a true value.
      $self->{insertion_mode} = $new_mode and return;
    }

    ## Step 15
    if ($node->[1] & HTML_EL) {
      if (defined $self->{head_element}) {
        ## ISSUE: Can this state be reached?
        !!!cp ('t30');
        $self->{insertion_mode} = AFTER_HEAD_IM;
      } else {
        !!!cp ('t29');
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      }
      return;
    }
    !!!cp ('t31');

    ## Step 16
    if ($is_last) {
      ## NOTE: Returns only when the assigned mode is a true value.
      $self->{insertion_mode} = IN_BODY_IM and return;
    }

    ## Step 17: move on to the previous entry of the stack.
    $node = $self->{open_elements}->[--$index];

    ## Step 18: go back to step 3.
  } # NODE

  die "$0: _reset_insertion_mode: This line should never be reached";
} # _reset_insertion_mode
3526
3527 sub _tree_construction_main ($) {
3528 my $self = shift;
3529
3530 my $active_formatting_elements = [];
3531
3532 my $reconstruct_active_formatting_elements = sub { # MUST
3533 my $insert = shift;
3534
3535 ## Step 1
3536 return unless @$active_formatting_elements;
3537
3538 ## Step 3
3539 my $i = -1;
3540 my $entry = $active_formatting_elements->[$i];
3541
3542 ## Step 2
3543 return if $entry->[0] eq '#marker';
3544 for (@{$self->{open_elements}}) {
3545 if ($entry->[0] eq $_->[0]) {
3546 !!!cp ('t32');
3547 return;
3548 }
3549 }
3550
3551 S4: {
3552 ## Step 4
3553 last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
3554
3555 ## Step 5
3556 $i--;
3557 $entry = $active_formatting_elements->[$i];
3558
3559 ## Step 6
3560 if ($entry->[0] eq '#marker') {
3561 !!!cp ('t33_1');
3562 #
3563 } else {
3564 my $in_open_elements;
3565 OE: for (@{$self->{open_elements}}) {
3566 if ($entry->[0] eq $_->[0]) {
3567 !!!cp ('t33');
3568 $in_open_elements = 1;
3569 last OE;
3570 }
3571 }
3572 if ($in_open_elements) {
3573 !!!cp ('t34');
3574 #
3575 } else {
3576 ## NOTE: <!DOCTYPE HTML><p><b><i><u></p> <p>X
3577 !!!cp ('t35');
3578 redo S4;
3579 }
3580 }
3581
3582 ## Step 7
3583 $i++;
3584 $entry = $active_formatting_elements->[$i];
3585 } # S4
3586
3587 S7: {
3588 ## Step 8
3589 my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
3590
3591 ## Step 9
3592 $insert->($clone->[0]);
3593 push @{$self->{open_elements}}, $clone;
3594
3595 ## Step 10
3596 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
3597
3598 ## Step 11
3599 unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
3600 !!!cp ('t36');
3601 ## Step 7'
3602 $i++;
3603 $entry = $active_formatting_elements->[$i];
3604
3605 redo S7;
3606 }
3607
3608 !!!cp ('t37');
3609 } # S7
3610 }; # $reconstruct_active_formatting_elements
3611
3612 my $clear_up_to_marker = sub {
3613 for (reverse 0..$#$active_formatting_elements) {
3614 if ($active_formatting_elements->[$_]->[0] eq '#marker') {
3615 !!!cp ('t38');
3616 splice @$active_formatting_elements, $_;
3617 return;
3618 }
3619 }
3620
3621 !!!cp ('t39');
3622 }; # $clear_up_to_marker
3623
3624 my $insert;
3625
3626 my $parse_rcdata = sub ($) {
3627 my ($content_model_flag) = @_;
3628
3629 ## Step 1
3630 my $start_tag_name = $token->{tag_name};
3631 my $el;
3632 !!!create-element ($el, $HTML_NS, $start_tag_name, $token->{attributes}, $token);
3633
3634 ## Step 2
3635 $insert->($el);
3636
3637 ## Step 3
3638 $self->{content_model} = $content_model_flag; # CDATA or RCDATA
3639 delete $self->{escape}; # MUST
3640
3641 ## Step 4
3642 my $text = '';
3643 !!!nack ('t40.1');
3644 !!!next-token;
3645 while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
3646 !!!cp ('t40');
3647 $text .= $token->{data};
3648 !!!next-token;
3649 }
3650
3651 ## Step 5
3652 if (length $text) {
3653 !!!cp ('t41');
3654 my $text = $self->{document}->create_text_node ($text);
3655 $el->append_child ($text);
3656 }
3657
3658 ## Step 6
3659 $self->{content_model} = PCDATA_CONTENT_MODEL;
3660
3661 ## Step 7
3662 if ($token->{type} == END_TAG_TOKEN and
3663 $token->{tag_name} eq $start_tag_name) {
3664 !!!cp ('t42');
3665 ## Ignore the token
3666 } else {
3667 ## NOTE: An end-of-file token.
3668 if ($content_model_flag == CDATA_CONTENT_MODEL) {
3669 !!!cp ('t43');
3670 !!!parse-error (type => 'in CDATA:#eof', token => $token);
3671 } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
3672 !!!cp ('t44');
3673 !!!parse-error (type => 'in RCDATA:#eof', token => $token);
3674 } else {
3675 die "$0: $content_model_flag in parse_rcdata";
3676 }
3677 }
3678 !!!next-token;
3679 }; # $parse_rcdata
3680
3681 my $script_start_tag = sub () {
3682 my $script_el;
3683 !!!create-element ($script_el, $HTML_NS, 'script', $token->{attributes}, $token);
3684 ## TODO: mark as "parser-inserted"
3685
3686 $self->{content_model} = CDATA_CONTENT_MODEL;
3687 delete $self->{escape}; # MUST
3688
3689 my $text = '';
3690 !!!nack ('t45.1');
3691 !!!next-token;
3692 while ($token->{type} == CHARACTER_TOKEN) {
3693 !!!cp ('t45');
3694 $text .= $token->{data};
3695 !!!next-token;
3696 } # stop if non-character token or tokenizer stops tokenising
3697 if (length $text) {
3698 !!!cp ('t46');
3699 $script_el->manakai_append_text ($text);
3700 }
3701
3702 $self->{content_model} = PCDATA_CONTENT_MODEL;
3703
3704 if ($token->{type} == END_TAG_TOKEN and
3705 $token->{tag_name} eq 'script') {
3706 !!!cp ('t47');
3707 ## Ignore the token
3708 } else {
3709 !!!cp ('t48');
3710 !!!parse-error (type => 'in CDATA:#eof', token => $token);
3711 ## ISSUE: And ignore?
3712 ## TODO: mark as "already executed"
3713 }
3714
3715 if (defined $self->{inner_html_node}) {
3716 !!!cp ('t49');
3717 ## TODO: mark as "already executed"
3718 } else {
3719 !!!cp ('t50');
3720 ## TODO: $old_insertion_point = current insertion point
3721 ## TODO: insertion point = just before the next input character
3722
3723 $insert->($script_el);
3724
3725 ## TODO: insertion point = $old_insertion_point (might be "undefined")
3726
3727 ## TODO: if there is a script that will execute as soon as the parser resume, then...
3728 }
3729
3730 !!!next-token;
3731 }; # $script_start_tag
3732
3733 ## NOTE: $open_tables->[-1]->[0] is the "current table" element node.
3734 ## NOTE: $open_tables->[-1]->[1] is the "tainted" flag.
3735 my $open_tables = [[$self->{open_elements}->[0]->[0]]];
3736
3737 my $formatting_end_tag = sub {
3738 my $end_tag_token = shift;
3739 my $tag_name = $end_tag_token->{tag_name};
3740
3741 ## NOTE: The adoption agency algorithm (AAA).
3742
3743 FET: {
3744 ## Step 1
3745 my $formatting_element;
3746 my $formatting_element_i_in_active;
3747 AFE: for (reverse 0..$#$active_formatting_elements) {
3748 if ($active_formatting_elements->[$_]->[0] eq '#marker') {
3749 !!!cp ('t52');
3750 last AFE;
3751 } elsif ($active_formatting_elements->[$_]->[0]->manakai_local_name
3752 eq $tag_name) {
3753 !!!cp ('t51');
3754 $formatting_element = $active_formatting_elements->[$_];
3755 $formatting_element_i_in_active = $_;
3756 last AFE;
3757 }
3758 } # AFE
3759 unless (defined $formatting_element) {
3760 !!!cp ('t53');
3761 !!!parse-error (type => 'unmatched end tag', text => $tag_name, token => $end_tag_token);
3762 ## Ignore the token
3763 !!!next-token;
3764 return;
3765 }
3766 ## has an element in scope
3767 my $in_scope = 1;
3768 my $formatting_element_i_in_open;
3769 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3770 my $node = $self->{open_elements}->[$_];
3771 if ($node->[0] eq $formatting_element->[0]) {
3772 if ($in_scope) {
3773 !!!cp ('t54');
3774 $formatting_element_i_in_open = $_;
3775 last INSCOPE;
3776 } else { # in open elements but not in scope
3777 !!!cp ('t55');
3778 !!!parse-error (type => 'unmatched end tag',
3779 text => $token->{tag_name},
3780 token => $end_tag_token);
3781 ## Ignore the token
3782 !!!next-token;
3783 return;
3784 }
3785 } elsif ($node->[1] & SCOPING_EL) {
3786 !!!cp ('t56');
3787 $in_scope = 0;
3788 }
3789 } # INSCOPE
3790 unless (defined $formatting_element_i_in_open) {
3791 !!!cp ('t57');
3792 !!!parse-error (type => 'unmatched end tag',
3793 text => $token->{tag_name},
3794 token => $end_tag_token);
3795 pop @$active_formatting_elements; # $formatting_element
3796 !!!next-token; ## TODO: ok?
3797 return;
3798 }
3799 if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
3800 !!!cp ('t58');
3801 !!!parse-error (type => 'not closed',
3802 text => $self->{open_elements}->[-1]->[0]
3803 ->manakai_local_name,
3804 token => $end_tag_token);
3805 }
3806
3807 ## Step 2
3808 my $furthest_block;
3809 my $furthest_block_i_in_open;
3810 OE: for (reverse 0..$#{$self->{open_elements}}) {
3811 my $node = $self->{open_elements}->[$_];
3812 if (not ($node->[1] & FORMATTING_EL) and
3813 #not $phrasing_category->{$node->[1]} and
3814 ($node->[1] & SPECIAL_EL or
3815 $node->[1] & SCOPING_EL)) { ## Scoping is redundant, maybe
3816 !!!cp ('t59');
3817 $furthest_block = $node;
3818 $furthest_block_i_in_open = $_;
3819 } elsif ($node->[0] eq $formatting_element->[0]) {
3820 !!!cp ('t60');
3821 last OE;
3822 }
3823 } # OE
3824
3825 ## Step 3
3826 unless (defined $furthest_block) { # MUST
3827 !!!cp ('t61');
3828 splice @{$self->{open_elements}}, $formatting_element_i_in_open;
3829 splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
3830 !!!next-token;
3831 return;
3832 }
3833
3834 ## Step 4
3835 my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];
3836
3837 ## Step 5
3838 my $furthest_block_parent = $furthest_block->[0]->parent_node;
3839 if (defined $furthest_block_parent) {
3840 !!!cp ('t62');
3841 $furthest_block_parent->remove_child ($furthest_block->[0]);
3842 }
3843
3844 ## Step 6
3845 my $bookmark_prev_el
3846 = $active_formatting_elements->[$formatting_element_i_in_active - 1]
3847 ->[0];
3848
3849 ## Step 7
3850 my $node = $furthest_block;
3851 my $node_i_in_open = $furthest_block_i_in_open;
3852 my $last_node = $furthest_block;
3853 S7: {
3854 ## Step 1
3855 $node_i_in_open--;
3856 $node = $self->{open_elements}->[$node_i_in_open];
3857
3858 ## Step 2
3859 my $node_i_in_active;
3860 S7S2: {
3861 for (reverse 0..$#$active_formatting_elements) {
3862 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
3863 !!!cp ('t63');
3864 $node_i_in_active = $_;
3865 last S7S2;
3866 }
3867 }
3868 splice @{$self->{open_elements}}, $node_i_in_open, 1;
3869 redo S7;
3870 } # S7S2
3871
3872 ## Step 3
3873 last S7 if $node->[0] eq $formatting_element->[0];
3874
3875 ## Step 4
3876 if ($last_node->[0] eq $furthest_block->[0]) {
3877 !!!cp ('t64');
3878 $bookmark_prev_el = $node->[0];
3879 }
3880
3881 ## Step 5
3882 if ($node->[0]->has_child_nodes ()) {
3883 !!!cp ('t65');
3884 my $clone = [$node->[0]->clone_node (0), $node->[1]];
3885 $active_formatting_elements->[$node_i_in_active] = $clone;
3886 $self->{open_elements}->[$node_i_in_open] = $clone;
3887 $node = $clone;
3888 }
3889
3890 ## Step 6
3891 $node->[0]->append_child ($last_node->[0]);
3892
3893 ## Step 7
3894 $last_node = $node;
3895
3896 ## Step 8
3897 redo S7;
3898 } # S7
3899
3900 ## Step 8
3901 if ($common_ancestor_node->[1] & TABLE_ROWS_EL) {
3902 my $foster_parent_element;
3903 my $next_sibling;
3904 OE: for (reverse 0..$#{$self->{open_elements}}) {
3905 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
3906 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3907 if (defined $parent and $parent->node_type == 1) {
3908 !!!cp ('t65.1');
3909 $foster_parent_element = $parent;
3910 $next_sibling = $self->{open_elements}->[$_]->[0];
3911 } else {
3912 !!!cp ('t65.2');
3913 $foster_parent_element
3914 = $self->{open_elements}->[$_ - 1]->[0];
3915 }
3916 last OE;
3917 }
3918 } # OE
3919 $foster_parent_element = $self->{open_elements}->[0]->[0]
3920 unless defined $foster_parent_element;
3921 $foster_parent_element->insert_before ($last_node->[0], $next_sibling);
3922 $open_tables->[-1]->[1] = 1; # tainted
3923 } else {
3924 !!!cp ('t65.3');
3925 $common_ancestor_node->[0]->append_child ($last_node->[0]);
3926 }
3927
3928 ## Step 9
3929 my $clone = [$formatting_element->[0]->clone_node (0),
3930 $formatting_element->[1]];
3931
3932 ## Step 10
3933 my @cn = @{$furthest_block->[0]->child_nodes};
3934 $clone->[0]->append_child ($_) for @cn;
3935
3936 ## Step 11
3937 $furthest_block->[0]->append_child ($clone->[0]);
3938
3939 ## Step 12
3940 my $i;
3941 AFE: for (reverse 0..$#$active_formatting_elements) {
3942 if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
3943 !!!cp ('t66');
3944 splice @$active_formatting_elements, $_, 1;
3945 $i-- and last AFE if defined $i;
3946 } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
3947 !!!cp ('t67');
3948 $i = $_;
3949 }
3950 } # AFE
3951 splice @$active_formatting_elements, $i + 1, 0, $clone;
3952
3953 ## Step 13
3954 undef $i;
3955 OE: for (reverse 0..$#{$self->{open_elements}}) {
3956 if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
3957 !!!cp ('t68');
3958 splice @{$self->{open_elements}}, $_, 1;
3959 $i-- and last OE if defined $i;
3960 } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
3961 !!!cp ('t69');
3962 $i = $_;
3963 }
3964 } # OE
3965 splice @{$self->{open_elements}}, $i + 1, 1, $clone;
3966
3967 ## Step 14
3968 redo FET;
3969 } # FET
3970 }; # $formatting_end_tag
3971
3972 $insert = my $insert_to_current = sub {
3973 $self->{open_elements}->[-1]->[0]->append_child ($_[0]);
3974 }; # $insert_to_current
3975
3976 my $insert_to_foster = sub {
3977 my $child = shift;
3978 if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
3979 # MUST
3980 my $foster_parent_element;
3981 my $next_sibling;
3982 OE: for (reverse 0..$#{$self->{open_elements}}) {
3983 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
3984 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3985 if (defined $parent and $parent->node_type == 1) {
3986 !!!cp ('t70');
3987 $foster_parent_element = $parent;
3988 $next_sibling = $self->{open_elements}->[$_]->[0];
3989 } else {
3990 !!!cp ('t71');
3991 $foster_parent_element
3992 = $self->{open_elements}->[$_ - 1]->[0];
3993 }
3994 last OE;
3995 }
3996 } # OE
3997 $foster_parent_element = $self->{open_elements}->[0]->[0]
3998 unless defined $foster_parent_element;
3999 $foster_parent_element->insert_before
4000 ($child, $next_sibling);
4001 $open_tables->[-1]->[1] = 1; # tainted
4002 } else {
4003 !!!cp ('t72');
4004 $self->{open_elements}->[-1]->[0]->append_child ($child);
4005 }
4006 }; # $insert_to_foster
4007
4008 B: while (1) {
4009 if ($token->{type} == DOCTYPE_TOKEN) {
4010 !!!cp ('t73');
4011 !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
4012 ## Ignore the token
4013 ## Stay in the phase
4014 !!!next-token;
4015 next B;
4016 } elsif ($token->{type} == START_TAG_TOKEN and
4017 $token->{tag_name} eq 'html') {
4018 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4019 !!!cp ('t79');
4020 !!!parse-error (type => 'after html', text => 'html', token => $token);
4021 $self->{insertion_mode} = AFTER_BODY_IM;
4022 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4023 !!!cp ('t80');
4024 !!!parse-error (type => 'after html', text => 'html', token => $token);
4025 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4026 } else {
4027 !!!cp ('t81');
4028 }
4029
4030 !!!cp ('t82');
4031 !!!parse-error (type => 'not first start tag', token => $token);
4032 my $top_el = $self->{open_elements}->[0]->[0];
4033 for my $attr_name (keys %{$token->{attributes}}) {
4034 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
4035 !!!cp ('t84');
4036 $top_el->set_attribute_ns
4037 (undef, [undef, $attr_name],
4038 $token->{attributes}->{$attr_name}->{value});
4039 }
4040 }
4041 !!!nack ('t84.1');
4042 !!!next-token;
4043 next B;
4044 } elsif ($token->{type} == COMMENT_TOKEN) {
4045 my $comment = $self->{document}->create_comment ($token->{data});
4046 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
4047 !!!cp ('t85');
4048 $self->{document}->append_child ($comment);
4049 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
4050 !!!cp ('t86');
4051 $self->{open_elements}->[0]->[0]->append_child ($comment);
4052 } else {
4053 !!!cp ('t87');
4054 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4055 }
4056 !!!next-token;
4057 next B;
4058 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
4059 if ($token->{type} == CHARACTER_TOKEN) {
4060 !!!cp ('t87.1');
4061 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4062 !!!next-token;
4063 next B;
4064 } elsif ($token->{type} == START_TAG_TOKEN) {
4065 if ((not {mglyph => 1, malignmark => 1}->{$token->{tag_name}} and
4066 $self->{open_elements}->[-1]->[1] & FOREIGN_FLOW_CONTENT_EL) or
4067 not ($self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
4068 ($token->{tag_name} eq 'svg' and
4069 $self->{open_elements}->[-1]->[1] & MML_AXML_EL)) {
4070 ## NOTE: "using the rules for secondary insertion mode" then "continue"
4071 !!!cp ('t87.2');
4072 #
4073 } elsif ({
4074 b => 1, big => 1, blockquote => 1, body => 1, br => 1,
4075 center => 1, code => 1, dd => 1, div => 1, dl => 1, dt => 1,
4076 em => 1, embed => 1, font => 1, h1 => 1, h2 => 1, h3 => 1,
4077 h4 => 1, h5 => 1, h6 => 1, head => 1, hr => 1, i => 1,
4078 img => 1, li => 1, listing => 1, menu => 1, meta => 1,
4079 nobr => 1, ol => 1, p => 1, pre => 1, ruby => 1, s => 1,
4080 small => 1, span => 1, strong => 1, strike => 1, sub => 1,
4081 sup => 1, table => 1, tt => 1, u => 1, ul => 1, var => 1,
4082 }->{$token->{tag_name}}) {
4083 !!!cp ('t87.2');
4084 !!!parse-error (type => 'not closed',
4085 text => $self->{open_elements}->[-1]->[0]
4086 ->manakai_local_name,
4087 token => $token);
4088
4089 pop @{$self->{open_elements}}
4090 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4091
4092 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4093 ## Reprocess.
4094 next B;
4095 } else {
4096 my $nsuri = $self->{open_elements}->[-1]->[0]->namespace_uri;
4097 my $tag_name = $token->{tag_name};
4098 if ($nsuri eq $SVG_NS) {
4099 $tag_name = {
4100 altglyph => 'altGlyph',
4101 altglyphdef => 'altGlyphDef',
4102 altglyphitem => 'altGlyphItem',
4103 animatecolor => 'animateColor',
4104 animatemotion => 'animateMotion',
4105 animatetransform => 'animateTransform',
4106 clippath => 'clipPath',
4107 feblend => 'feBlend',
4108 fecolormatrix => 'feColorMatrix',
4109 fecomponenttransfer => 'feComponentTransfer',
4110 fecomposite => 'feComposite',
4111 feconvolvematrix => 'feConvolveMatrix',
4112 fediffuselighting => 'feDiffuseLighting',
4113 fedisplacementmap => 'feDisplacementMap',
4114 fedistantlight => 'feDistantLight',
4115 feflood => 'feFlood',
4116 fefunca => 'feFuncA',
4117 fefuncb => 'feFuncB',
4118 fefuncg => 'feFuncG',
4119 fefuncr => 'feFuncR',
4120 fegaussianblur => 'feGaussianBlur',
4121 feimage => 'feImage',
4122 femerge => 'feMerge',
4123 femergenode => 'feMergeNode',
4124 femorphology => 'feMorphology',
4125 feoffset => 'feOffset',
4126 fepointlight => 'fePointLight',
4127 fespecularlighting => 'feSpecularLighting',
4128 fespotlight => 'feSpotLight',
4129 fetile => 'feTile',
4130 feturbulence => 'feTurbulence',
4131 foreignobject => 'foreignObject',
4132 glyphref => 'glyphRef',
4133 lineargradient => 'linearGradient',
4134 radialgradient => 'radialGradient',
4135 #solidcolor => 'solidColor', ## NOTE: Commented in spec (SVG1.2)
4136 textpath => 'textPath',
4137 }->{$tag_name} || $tag_name;
4138 }
4139
4140 ## "adjust SVG attributes" (SVG only) - done in insert-element-f
4141
4142 ## "adjust foreign attributes" - done in insert-element-f
4143
4144 !!!insert-element-f ($nsuri, $tag_name, $token->{attributes}, $token);
4145
4146 if ($self->{self_closing}) {
4147 pop @{$self->{open_elements}};
4148 !!!ack ('t87.3');
4149 } else {
4150 !!!cp ('t87.4');
4151 }
4152
4153 !!!next-token;
4154 next B;
4155 }
4156 } elsif ($token->{type} == END_TAG_TOKEN) {
4157 ## NOTE: "using the rules for secondary insertion mode" then "continue"
4158 !!!cp ('t87.5');
4159 #
4160 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4161 !!!cp ('t87.6');
4162 !!!parse-error (type => 'not closed',
4163 text => $self->{open_elements}->[-1]->[0]
4164 ->manakai_local_name,
4165 token => $token);
4166
4167 pop @{$self->{open_elements}}
4168 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4169
4170 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4171 ## Reprocess.
4172 next B;
4173 } else {
4174 die "$0: $token->{type}: Unknown token type";
4175 }
4176 }
4177
4178 if ($self->{insertion_mode} & HEAD_IMS) {
4179 if ($token->{type} == CHARACTER_TOKEN) {
4180 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4181 unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4182 !!!cp ('t88.2');
4183 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4184 } else {
4185 !!!cp ('t88.1');
4186 ## Ignore the token.
4187 !!!next-token;
4188 next B;
4189 }
4190 unless (length $token->{data}) {
4191 !!!cp ('t88');
4192 !!!next-token;
4193 next B;
4194 }
4195 }
4196
4197 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4198 !!!cp ('t89');
4199 ## As if <head>
4200 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4201 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4202 push @{$self->{open_elements}},
4203 [$self->{head_element}, $el_category->{head}];
4204
4205 ## Reprocess in the "in head" insertion mode...
4206 pop @{$self->{open_elements}};
4207
4208 ## Reprocess in the "after head" insertion mode...
4209 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4210 !!!cp ('t90');
4211 ## As if </noscript>
4212 pop @{$self->{open_elements}};
4213 !!!parse-error (type => 'in noscript:#text', token => $token);
4214
4215 ## Reprocess in the "in head" insertion mode...
4216 ## As if </head>
4217 pop @{$self->{open_elements}};
4218
4219 ## Reprocess in the "after head" insertion mode...
4220 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4221 !!!cp ('t91');
4222 pop @{$self->{open_elements}};
4223
4224 ## Reprocess in the "after head" insertion mode...
4225 } else {
4226 !!!cp ('t92');
4227 }
4228
4229 ## "after head" insertion mode
4230 ## As if <body>
4231 !!!insert-element ('body',, $token);
4232 $self->{insertion_mode} = IN_BODY_IM;
4233 ## reprocess
4234 next B;
4235 } elsif ($token->{type} == START_TAG_TOKEN) {
4236 if ($token->{tag_name} eq 'head') {
4237 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4238 !!!cp ('t93');
4239 !!!create-element ($self->{head_element}, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
4240 $self->{open_elements}->[-1]->[0]->append_child
4241 ($self->{head_element});
4242 push @{$self->{open_elements}},
4243 [$self->{head_element}, $el_category->{head}];
4244 $self->{insertion_mode} = IN_HEAD_IM;
4245 !!!nack ('t93.1');
4246 !!!next-token;
4247 next B;
4248 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4249 !!!cp ('t93.2');
4250 !!!parse-error (type => 'after head', text => 'head',
4251 token => $token);
4252 ## Ignore the token
4253 !!!nack ('t93.3');
4254 !!!next-token;
4255 next B;
4256 } else {
4257 !!!cp ('t95');
4258 !!!parse-error (type => 'in head:head',
4259 token => $token); # or in head noscript
4260 ## Ignore the token
4261 !!!nack ('t95.1');
4262 !!!next-token;
4263 next B;
4264 }
4265 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4266 !!!cp ('t96');
4267 ## As if <head>
4268 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4269 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4270 push @{$self->{open_elements}},
4271 [$self->{head_element}, $el_category->{head}];
4272
4273 $self->{insertion_mode} = IN_HEAD_IM;
4274 ## Reprocess in the "in head" insertion mode...
4275 } else {
4276 !!!cp ('t97');
4277 }
4278
4279 if ($token->{tag_name} eq 'base') {
4280 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4281 !!!cp ('t98');
4282 ## As if </noscript>
4283 pop @{$self->{open_elements}};
4284 !!!parse-error (type => 'in noscript', text => 'base',
4285 token => $token);
4286
4287 $self->{insertion_mode} = IN_HEAD_IM;
4288 ## Reprocess in the "in head" insertion mode...
4289 } else {
4290 !!!cp ('t99');
4291 }
4292
4293 ## NOTE: There is a "as if in head" code clone.
4294 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4295 !!!cp ('t100');
4296 !!!parse-error (type => 'after head',
4297 text => $token->{tag_name}, token => $token);
4298 push @{$self->{open_elements}},
4299 [$self->{head_element}, $el_category->{head}];
4300 } else {
4301 !!!cp ('t101');
4302 }
4303 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4304 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4305 pop @{$self->{open_elements}} # <head>
4306 if $self->{insertion_mode} == AFTER_HEAD_IM;
4307 !!!nack ('t101.1');
4308 !!!next-token;
4309 next B;
4310 } elsif ($token->{tag_name} eq 'link') {
4311 ## NOTE: There is a "as if in head" code clone.
4312 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4313 !!!cp ('t102');
4314 !!!parse-error (type => 'after head',
4315 text => $token->{tag_name}, token => $token);
4316 push @{$self->{open_elements}},
4317 [$self->{head_element}, $el_category->{head}];
4318 } else {
4319 !!!cp ('t103');
4320 }
4321 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4322 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4323 pop @{$self->{open_elements}} # <head>
4324 if $self->{insertion_mode} == AFTER_HEAD_IM;
4325 !!!ack ('t103.1');
4326 !!!next-token;
4327 next B;
4328 } elsif ($token->{tag_name} eq 'meta') {
4329 ## NOTE: There is a "as if in head" code clone.
4330 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4331 !!!cp ('t104');
4332 !!!parse-error (type => 'after head',
4333 text => $token->{tag_name}, token => $token);
4334 push @{$self->{open_elements}},
4335 [$self->{head_element}, $el_category->{head}];
4336 } else {
4337 !!!cp ('t105');
4338 }
4339 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4340 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4341
4342 unless ($self->{confident}) {
4343 if ($token->{attributes}->{charset}) {
4344 !!!cp ('t106');
4345 ## NOTE: Whether the encoding is supported or not is handled
4346 ## in the {change_encoding} callback.
4347 $self->{change_encoding}
4348 ->($self, $token->{attributes}->{charset}->{value},
4349 $token);
4350
4351 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4352 ->set_user_data (manakai_has_reference =>
4353 $token->{attributes}->{charset}
4354 ->{has_reference});
4355 } elsif ($token->{attributes}->{content}) {
4356 if ($token->{attributes}->{content}->{value}
4357 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4358 [\x09-\x0D\x20]*=
4359 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4360 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
4361 !!!cp ('t107');
4362 ## NOTE: Whether the encoding is supported or not is handled
4363 ## in the {change_encoding} callback.
4364 $self->{change_encoding}
4365 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
4366 $token);
4367 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4368 ->set_user_data (manakai_has_reference =>
4369 $token->{attributes}->{content}
4370 ->{has_reference});
4371 } else {
4372 !!!cp ('t108');
4373 }
4374 }
4375 } else {
4376 if ($token->{attributes}->{charset}) {
4377 !!!cp ('t109');
4378 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4379 ->set_user_data (manakai_has_reference =>
4380 $token->{attributes}->{charset}
4381 ->{has_reference});
4382 }
4383 if ($token->{attributes}->{content}) {
4384 !!!cp ('t110');
4385 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4386 ->set_user_data (manakai_has_reference =>
4387 $token->{attributes}->{content}
4388 ->{has_reference});
4389 }
4390 }
4391
4392 pop @{$self->{open_elements}} # <head>
4393 if $self->{insertion_mode} == AFTER_HEAD_IM;
4394 !!!ack ('t110.1');
4395 !!!next-token;
4396 next B;
4397 } elsif ($token->{tag_name} eq 'title') {
4398 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4399 !!!cp ('t111');
4400 ## As if </noscript>
4401 pop @{$self->{open_elements}};
4402 !!!parse-error (type => 'in noscript', text => 'title',
4403 token => $token);
4404
4405 $self->{insertion_mode} = IN_HEAD_IM;
4406 ## Reprocess in the "in head" insertion mode...
4407 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4408 !!!cp ('t112');
4409 !!!parse-error (type => 'after head',
4410 text => $token->{tag_name}, token => $token);
4411 push @{$self->{open_elements}},
4412 [$self->{head_element}, $el_category->{head}];
4413 } else {
4414 !!!cp ('t113');
4415 }
4416
4417 ## NOTE: There is a "as if in head" code clone.
4418 my $parent = defined $self->{head_element} ? $self->{head_element}
4419 : $self->{open_elements}->[-1]->[0];
4420 $parse_rcdata->(RCDATA_CONTENT_MODEL);
4421 pop @{$self->{open_elements}} # <head>
4422 if $self->{insertion_mode} == AFTER_HEAD_IM;
4423 next B;
4424 } elsif ($token->{tag_name} eq 'style' or
4425 $token->{tag_name} eq 'noframes') {
4426 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
4427 ## insertion mode IN_HEAD_IM)
4428 ## NOTE: There is a "as if in head" code clone.
4429 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4430 !!!cp ('t114');
4431 !!!parse-error (type => 'after head',
4432 text => $token->{tag_name}, token => $token);
4433 push @{$self->{open_elements}},
4434 [$self->{head_element}, $el_category->{head}];
4435 } else {
4436 !!!cp ('t115');
4437 }
4438 $parse_rcdata->(CDATA_CONTENT_MODEL);
4439 pop @{$self->{open_elements}} # <head>
4440 if $self->{insertion_mode} == AFTER_HEAD_IM;
4441 next B;
4442 } elsif ($token->{tag_name} eq 'noscript') {
4443 if ($self->{insertion_mode} == IN_HEAD_IM) {
4444 !!!cp ('t116');
4445 ## NOTE: and scripting is disabled
4446 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4447 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
4448 !!!nack ('t116.1');
4449 !!!next-token;
4450 next B;
4451 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4452 !!!cp ('t117');
4453 !!!parse-error (type => 'in noscript', text => 'noscript',
4454 token => $token);
4455 ## Ignore the token
4456 !!!nack ('t117.1');
4457 !!!next-token;
4458 next B;
4459 } else {
4460 !!!cp ('t118');
4461 #
4462 }
4463 } elsif ($token->{tag_name} eq 'script') {
4464 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4465 !!!cp ('t119');
4466 ## As if </noscript>
4467 pop @{$self->{open_elements}};
4468 !!!parse-error (type => 'in noscript', text => 'script',
4469 token => $token);
4470
4471 $self->{insertion_mode} = IN_HEAD_IM;
4472 ## Reprocess in the "in head" insertion mode...
4473 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4474 !!!cp ('t120');
4475 !!!parse-error (type => 'after head',
4476 text => $token->{tag_name}, token => $token);
4477 push @{$self->{open_elements}},
4478 [$self->{head_element}, $el_category->{head}];
4479 } else {
4480 !!!cp ('t121');
4481 }
4482
4483 ## NOTE: There is a "as if in head" code clone.
4484 $script_start_tag->();
4485 pop @{$self->{open_elements}} # <head>
4486 if $self->{insertion_mode} == AFTER_HEAD_IM;
4487 next B;
4488 } elsif ($token->{tag_name} eq 'body' or
4489 $token->{tag_name} eq 'frameset') {
4490 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4491 !!!cp ('t122');
4492 ## As if </noscript>
4493 pop @{$self->{open_elements}};
4494 !!!parse-error (type => 'in noscript',
4495 text => $token->{tag_name}, token => $token);
4496
4497 ## Reprocess in the "in head" insertion mode...
4498 ## As if </head>
4499 pop @{$self->{open_elements}};
4500
4501 ## Reprocess in the "after head" insertion mode...
4502 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4503 !!!cp ('t124');
4504 pop @{$self->{open_elements}};
4505
4506 ## Reprocess in the "after head" insertion mode...
4507 } else {
4508 !!!cp ('t125');
4509 }
4510
4511 ## "after head" insertion mode
4512 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4513 if ($token->{tag_name} eq 'body') {
4514 !!!cp ('t126');
4515 $self->{insertion_mode} = IN_BODY_IM;
4516 } elsif ($token->{tag_name} eq 'frameset') {
4517 !!!cp ('t127');
4518 $self->{insertion_mode} = IN_FRAMESET_IM;
4519 } else {
4520 die "$0: tag name: $self->{tag_name}";
4521 }
4522 !!!nack ('t127.1');
4523 !!!next-token;
4524 next B;
4525 } else {
4526 !!!cp ('t128');
4527 #
4528 }
4529
4530 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4531 !!!cp ('t129');
4532 ## As if </noscript>
4533 pop @{$self->{open_elements}};
4534 !!!parse-error (type => 'in noscript:/',
4535 text => $token->{tag_name}, token => $token);
4536
4537 ## Reprocess in the "in head" insertion mode...
4538 ## As if </head>
4539 pop @{$self->{open_elements}};
4540
4541 ## Reprocess in the "after head" insertion mode...
4542 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4543 !!!cp ('t130');
4544 ## As if </head>
4545 pop @{$self->{open_elements}};
4546
4547 ## Reprocess in the "after head" insertion mode...
4548 } else {
4549 !!!cp ('t131');
4550 }
4551
4552 ## "after head" insertion mode
4553 ## As if <body>
4554 !!!insert-element ('body',, $token);
4555 $self->{insertion_mode} = IN_BODY_IM;
4556 ## reprocess
4557 !!!ack-later;
4558 next B;
4559 } elsif ($token->{type} == END_TAG_TOKEN) {
4560 if ($token->{tag_name} eq 'head') {
4561 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4562 !!!cp ('t132');
4563 ## As if <head>
4564 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4565 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4566 push @{$self->{open_elements}},
4567 [$self->{head_element}, $el_category->{head}];
4568
4569 ## Reprocess in the "in head" insertion mode...
4570 pop @{$self->{open_elements}};
4571 $self->{insertion_mode} = AFTER_HEAD_IM;
4572 !!!next-token;
4573 next B;
4574 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4575 !!!cp ('t133');
4576 ## As if </noscript>
4577 pop @{$self->{open_elements}};
4578 !!!parse-error (type => 'in noscript:/',
4579 text => 'head', token => $token);
4580
4581 ## Reprocess in the "in head" insertion mode...
4582 pop @{$self->{open_elements}};
4583 $self->{insertion_mode} = AFTER_HEAD_IM;
4584 !!!next-token;
4585 next B;
4586 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4587 !!!cp ('t134');
4588 pop @{$self->{open_elements}};
4589 $self->{insertion_mode} = AFTER_HEAD_IM;
4590 !!!next-token;
4591 next B;
4592 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4593 !!!cp ('t134.1');
4594 !!!parse-error (type => 'unmatched end tag', text => 'head',
4595 token => $token);
4596 ## Ignore the token
4597 !!!next-token;
4598 next B;
4599 } else {
4600 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4601 }
4602 } elsif ($token->{tag_name} eq 'noscript') {
4603 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4604 !!!cp ('t136');
4605 pop @{$self->{open_elements}};
4606 $self->{insertion_mode} = IN_HEAD_IM;
4607 !!!next-token;
4608 next B;
4609 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM or
4610 $self->{insertion_mode} == AFTER_HEAD_IM) {
4611 !!!cp ('t137');
4612 !!!parse-error (type => 'unmatched end tag',
4613 text => 'noscript', token => $token);
4614 ## Ignore the token ## ISSUE: An issue in the spec.
4615 !!!next-token;
4616 next B;
4617 } else {
4618 !!!cp ('t138');
4619 #
4620 }
4621 } elsif ({
4622 body => 1, html => 1,
4623 }->{$token->{tag_name}}) {
4624 if ($self->{insertion_mode} == BEFORE_HEAD_IM or
4625 $self->{insertion_mode} == IN_HEAD_IM or
4626 $self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4627 !!!cp ('t140');
4628 !!!parse-error (type => 'unmatched end tag',
4629 text => $token->{tag_name}, token => $token);
4630 ## Ignore the token
4631 !!!next-token;
4632 next B;
4633 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4634 !!!cp ('t140.1');
4635 !!!parse-error (type => 'unmatched end tag',
4636 text => $token->{tag_name}, token => $token);
4637 ## Ignore the token
4638 !!!next-token;
4639 next B;
4640 } else {
4641 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4642 }
4643 } elsif ($token->{tag_name} eq 'p') {
4644 !!!cp ('t142');
4645 !!!parse-error (type => 'unmatched end tag',
4646 text => $token->{tag_name}, token => $token);
4647 ## Ignore the token
4648 !!!next-token;
4649 next B;
4650 } elsif ($token->{tag_name} eq 'br') {
4651 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4652 !!!cp ('t142.2');
4653 ## (before head) as if <head>, (in head) as if </head>
4654 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4655 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4656 $self->{insertion_mode} = AFTER_HEAD_IM;
4657
4658 ## Reprocess in the "after head" insertion mode...
4659 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4660 !!!cp ('t143.2');
4661 ## As if </head>
4662 pop @{$self->{open_elements}};
4663 $self->{insertion_mode} = AFTER_HEAD_IM;
4664
4665 ## Reprocess in the "after head" insertion mode...
4666 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4667 !!!cp ('t143.3');
4668 ## ISSUE: Two parse errors for <head><noscript></br>
4669 !!!parse-error (type => 'unmatched end tag',
4670 text => 'br', token => $token);
4671 ## As if </noscript>
4672 pop @{$self->{open_elements}};
4673 $self->{insertion_mode} = IN_HEAD_IM;
4674
4675 ## Reprocess in the "in head" insertion mode...
4676 ## As if </head>
4677 pop @{$self->{open_elements}};
4678 $self->{insertion_mode} = AFTER_HEAD_IM;
4679
4680 ## Reprocess in the "after head" insertion mode...
4681 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4682 !!!cp ('t143.4');
4683 #
4684 } else {
4685 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4686 }
4687
4688 ## ISSUE: does not agree with IE7 - it doesn't ignore </br>.
4689 !!!parse-error (type => 'unmatched end tag',
4690 text => 'br', token => $token);
4691 ## Ignore the token
4692 !!!next-token;
4693 next B;
4694 } else {
4695 !!!cp ('t145');
4696 !!!parse-error (type => 'unmatched end tag',
4697 text => $token->{tag_name}, token => $token);
4698 ## Ignore the token
4699 !!!next-token;
4700 next B;
4701 }
4702
4703 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4704 !!!cp ('t146');
4705 ## As if </noscript>
4706 pop @{$self->{open_elements}};
4707 !!!parse-error (type => 'in noscript:/',
4708 text => $token->{tag_name}, token => $token);
4709
4710 ## Reprocess in the "in head" insertion mode...
4711 ## As if </head>
4712 pop @{$self->{open_elements}};
4713
4714 ## Reprocess in the "after head" insertion mode...
4715 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4716 !!!cp ('t147');
4717 ## As if </head>
4718 pop @{$self->{open_elements}};
4719
4720 ## Reprocess in the "after head" insertion mode...
4721 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4722 ## ISSUE: This case cannot be reached?
4723 !!!cp ('t148');
4724 !!!parse-error (type => 'unmatched end tag',
4725 text => $token->{tag_name}, token => $token);
4726 ## Ignore the token ## ISSUE: An issue in the spec.
4727 !!!next-token;
4728 next B;
4729 } else {
4730 !!!cp ('t149');
4731 }
4732
4733 ## "after head" insertion mode
4734 ## As if <body>
4735 !!!insert-element ('body',, $token);
4736 $self->{insertion_mode} = IN_BODY_IM;
4737 ## reprocess
4738 next B;
4739 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4740 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4741 !!!cp ('t149.1');
4742
4743 ## NOTE: As if <head>
4744 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4745 $self->{open_elements}->[-1]->[0]->append_child
4746 ($self->{head_element});
4747 #push @{$self->{open_elements}},
4748 # [$self->{head_element}, $el_category->{head}];
4749 #$self->{insertion_mode} = IN_HEAD_IM;
4750 ## NOTE: Reprocess.
4751
4752 ## NOTE: As if </head>
4753 #pop @{$self->{open_elements}};
4754 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4755 ## NOTE: Reprocess.
4756
4757 #
4758 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4759 !!!cp ('t149.2');
4760
4761 ## NOTE: As if </head>
4762 pop @{$self->{open_elements}};
4763 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4764 ## NOTE: Reprocess.
4765
4766 #
4767 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4768 !!!cp ('t149.3');
4769
4770 !!!parse-error (type => 'in noscript:#eof', token => $token);
4771
4772 ## As if </noscript>
4773 pop @{$self->{open_elements}};
4774 #$self->{insertion_mode} = IN_HEAD_IM;
4775 ## NOTE: Reprocess.
4776
4777 ## NOTE: As if </head>
4778 pop @{$self->{open_elements}};
4779 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4780 ## NOTE: Reprocess.
4781
4782 #
4783 } else {
4784 !!!cp ('t149.4');
4785 #
4786 }
4787
4788 ## NOTE: As if <body>
4789 !!!insert-element ('body',, $token);
4790 $self->{insertion_mode} = IN_BODY_IM;
4791 ## NOTE: Reprocess.
4792 next B;
4793 } else {
4794 die "$0: $token->{type}: Unknown token type";
4795 }
4796
4797 ## ISSUE: An issue in the spec.
4798 } elsif ($self->{insertion_mode} & BODY_IMS) {
4799 if ($token->{type} == CHARACTER_TOKEN) {
4800 !!!cp ('t150');
4801 ## NOTE: There is a code clone of "character in body".
4802 $reconstruct_active_formatting_elements->($insert_to_current);
4803
4804 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4805
4806 !!!next-token;
4807 next B;
4808 } elsif ($token->{type} == START_TAG_TOKEN) {
4809 if ({
4810 caption => 1, col => 1, colgroup => 1, tbody => 1,
4811 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
4812 }->{$token->{tag_name}}) {
4813 if ($self->{insertion_mode} == IN_CELL_IM) {
4814 ## have an element in table scope
4815 for (reverse 0..$#{$self->{open_elements}}) {
4816 my $node = $self->{open_elements}->[$_];
4817 if ($node->[1] & TABLE_CELL_EL) {
4818 !!!cp ('t151');
4819
4820 ## Close the cell
4821 !!!back-token; # <x>
4822 $token = {type => END_TAG_TOKEN,
4823 tag_name => $node->[0]->manakai_local_name,
4824 line => $token->{line},
4825 column => $token->{column}};
4826 next B;
4827 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4828 !!!cp ('t152');
4829 ## ISSUE: This case can never be reached, maybe.
4830 last;
4831 }
4832 }
4833
4834 !!!cp ('t153');
4835 !!!parse-error (type => 'start tag not allowed',
4836 text => $token->{tag_name}, token => $token);
4837 ## Ignore the token
4838 !!!nack ('t153.1');
4839 !!!next-token;
4840 next B;
4841 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4842 !!!parse-error (type => 'not closed', text => 'caption',
4843 token => $token);
4844
4845 ## NOTE: As if </caption>.
4846 ## have a table element in table scope
4847 my $i;
4848 INSCOPE: {
4849 for (reverse 0..$#{$self->{open_elements}}) {
4850 my $node = $self->{open_elements}->[$_];
4851 if ($node->[1] & CAPTION_EL) {
4852 !!!cp ('t155');
4853 $i = $_;
4854 last INSCOPE;
4855 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4856 !!!cp ('t156');
4857 last;
4858 }
4859 }
4860
4861 !!!cp ('t157');
4862 !!!parse-error (type => 'start tag not allowed',
4863 text => $token->{tag_name}, token => $token);
4864 ## Ignore the token
4865 !!!nack ('t157.1');
4866 !!!next-token;
4867 next B;
4868 } # INSCOPE
4869
4870 ## generate implied end tags
4871 while ($self->{open_elements}->[-1]->[1]
4872 & END_TAG_OPTIONAL_EL) {
4873 !!!cp ('t158');
4874 pop @{$self->{open_elements}};
4875 }
4876
4877 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4878 !!!cp ('t159');
4879 !!!parse-error (type => 'not closed',
4880 text => $self->{open_elements}->[-1]->[0]
4881 ->manakai_local_name,
4882 token => $token);
4883 } else {
4884 !!!cp ('t160');
4885 }
4886
4887 splice @{$self->{open_elements}}, $i;
4888
4889 $clear_up_to_marker->();
4890
4891 $self->{insertion_mode} = IN_TABLE_IM;
4892
4893 ## reprocess
4894 !!!ack-later;
4895 next B;
4896 } else {
4897 !!!cp ('t161');
4898 #
4899 }
4900 } else {
4901 !!!cp ('t162');
4902 #
4903 }
4904 } elsif ($token->{type} == END_TAG_TOKEN) {
4905 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
4906 if ($self->{insertion_mode} == IN_CELL_IM) {
4907 ## have an element in table scope
4908 my $i;
4909 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4910 my $node = $self->{open_elements}->[$_];
4911 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
4912 !!!cp ('t163');
4913 $i = $_;
4914 last INSCOPE;
4915 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4916 !!!cp ('t164');
4917 last INSCOPE;
4918 }
4919 } # INSCOPE
4920 unless (defined $i) {
4921 !!!cp ('t165');
4922 !!!parse-error (type => 'unmatched end tag',
4923 text => $token->{tag_name},
4924 token => $token);
4925 ## Ignore the token
4926 !!!next-token;
4927 next B;
4928 }
4929
4930 ## generate implied end tags
4931 while ($self->{open_elements}->[-1]->[1]
4932 & END_TAG_OPTIONAL_EL) {
4933 !!!cp ('t166');
4934 pop @{$self->{open_elements}};
4935 }
4936
4937 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
4938 ne $token->{tag_name}) {
4939 !!!cp ('t167');
4940 !!!parse-error (type => 'not closed',
4941 text => $self->{open_elements}->[-1]->[0]
4942 ->manakai_local_name,
4943 token => $token);
4944 } else {
4945 !!!cp ('t168');
4946 }
4947
4948 splice @{$self->{open_elements}}, $i;
4949
4950 $clear_up_to_marker->();
4951
4952 $self->{insertion_mode} = IN_ROW_IM;
4953
4954 !!!next-token;
4955 next B;
4956 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4957 !!!cp ('t169');
4958 !!!parse-error (type => 'unmatched end tag',
4959 text => $token->{tag_name}, token => $token);
4960 ## Ignore the token
4961 !!!next-token;
4962 next B;
4963 } else {
4964 !!!cp ('t170');
4965 #
4966 }
4967 } elsif ($token->{tag_name} eq 'caption') {
4968 if ($self->{insertion_mode} == IN_CAPTION_IM) {
4969 ## have a table element in table scope
4970 my $i;
4971 INSCOPE: {
4972 for (reverse 0..$#{$self->{open_elements}}) {
4973 my $node = $self->{open_elements}->[$_];
4974 if ($node->[1] & CAPTION_EL) {
4975 !!!cp ('t171');
4976 $i = $_;
4977 last INSCOPE;
4978 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4979 !!!cp ('t172');
4980 last;
4981 }
4982 }
4983
4984 !!!cp ('t173');
4985 !!!parse-error (type => 'unmatched end tag',
4986 text => $token->{tag_name}, token => $token);
4987 ## Ignore the token
4988 !!!next-token;
4989 next B;
4990 } # INSCOPE
4991
4992 ## generate implied end tags
4993 while ($self->{open_elements}->[-1]->[1]
4994 & END_TAG_OPTIONAL_EL) {
4995 !!!cp ('t174');
4996 pop @{$self->{open_elements}};
4997 }
4998
4999 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5000 !!!cp ('t175');
5001 !!!parse-error (type => 'not closed',
5002 text => $self->{open_elements}->[-1]->[0]
5003 ->manakai_local_name,
5004 token => $token);
5005 } else {
5006 !!!cp ('t176');
5007 }
5008
5009 splice @{$self->{open_elements}}, $i;
5010
5011 $clear_up_to_marker->();
5012
5013 $self->{insertion_mode} = IN_TABLE_IM;
5014
5015 !!!next-token;
5016 next B;
5017 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
5018 !!!cp ('t177');
5019 !!!parse-error (type => 'unmatched end tag',
5020 text => $token->{tag_name}, token => $token);
5021 ## Ignore the token
5022 !!!next-token;
5023 next B;
5024 } else {
5025 !!!cp ('t178');
5026 #
5027 }
5028 } elsif ({
5029 table => 1, tbody => 1, tfoot => 1,
5030 thead => 1, tr => 1,
5031 }->{$token->{tag_name}} and
5032 $self->{insertion_mode} == IN_CELL_IM) {
5033 ## have an element in table scope
5034 my $i;
5035 my $tn;
5036 INSCOPE: {
5037 for (reverse 0..$#{$self->{open_elements}}) {
5038 my $node = $self->{open_elements}->[$_];
5039 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5040 !!!cp ('t179');
5041 $i = $_;
5042
5043 ## Close the cell
5044 !!!back-token; # </x>
5045 $token = {type => END_TAG_TOKEN, tag_name => $tn,
5046 line => $token->{line},
5047 column => $token->{column}};
5048 next B;
5049 } elsif ($node->[1] & TABLE_CELL_EL) {
5050 !!!cp ('t180');
5051 $tn = $node->[0]->manakai_local_name;
5052 ## NOTE: There is exactly one |td| or |th| element
5053 ## in scope in the stack of open elements by definition.
5054 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5055 ## ISSUE: Can this be reached?
5056 !!!cp ('t181');
5057 last;
5058 }
5059 }
5060
5061 !!!cp ('t182');
5062 !!!parse-error (type => 'unmatched end tag',
5063 text => $token->{tag_name}, token => $token);
5064 ## Ignore the token
5065 !!!next-token;
5066 next B;
5067 } # INSCOPE
5068 } elsif ($token->{tag_name} eq 'table' and
5069 $self->{insertion_mode} == IN_CAPTION_IM) {
5070 !!!parse-error (type => 'not closed', text => 'caption',
5071 token => $token);
5072
5073 ## As if </caption>
5074 ## have a table element in table scope
5075 my $i;
5076 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5077 my $node = $self->{open_elements}->[$_];
5078 if ($node->[1] & CAPTION_EL) {
5079 !!!cp ('t184');
5080 $i = $_;
5081 last INSCOPE;
5082 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5083 !!!cp ('t185');
5084 last INSCOPE;
5085 }
5086 } # INSCOPE
5087 unless (defined $i) {
5088 !!!cp ('t186');
5089 !!!parse-error (type => 'unmatched end tag',
5090 text => 'caption', token => $token);
5091 ## Ignore the token
5092 !!!next-token;
5093 next B;
5094 }
5095
5096 ## generate implied end tags
5097 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5098 !!!cp ('t187');
5099 pop @{$self->{open_elements}};
5100 }
5101
5102 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5103 !!!cp ('t188');
5104 !!!parse-error (type => 'not closed',
5105 text => $self->{open_elements}->[-1]->[0]
5106 ->manakai_local_name,
5107 token => $token);
5108 } else {
5109 !!!cp ('t189');
5110 }
5111
5112 splice @{$self->{open_elements}}, $i;
5113
5114 $clear_up_to_marker->();
5115
5116 $self->{insertion_mode} = IN_TABLE_IM;
5117
5118 ## reprocess
5119 next B;
5120 } elsif ({
5121 body => 1, col => 1, colgroup => 1, html => 1,
5122 }->{$token->{tag_name}}) {
5123 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
5124 !!!cp ('t190');
5125 !!!parse-error (type => 'unmatched end tag',
5126 text => $token->{tag_name}, token => $token);
5127 ## Ignore the token
5128 !!!next-token;
5129 next B;
5130 } else {
5131 !!!cp ('t191');
5132 #
5133 }
5134 } elsif ({
5135 tbody => 1, tfoot => 1,
5136 thead => 1, tr => 1,
5137 }->{$token->{tag_name}} and
5138 $self->{insertion_mode} == IN_CAPTION_IM) {
5139 !!!cp ('t192');
5140 !!!parse-error (type => 'unmatched end tag',
5141 text => $token->{tag_name}, token => $token);
5142 ## Ignore the token
5143 !!!next-token;
5144 next B;
5145 } else {
5146 !!!cp ('t193');
5147 #
5148 }
5149 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5150 for my $entry (@{$self->{open_elements}}) {
5151 unless ($entry->[1] & ALL_END_TAG_OPTIONAL_EL) {
5152 !!!cp ('t75');
5153 !!!parse-error (type => 'in body:#eof', token => $token);
5154 last;
5155 }
5156 }
5157
5158 ## Stop parsing.
5159 last B;
5160 } else {
5161 die "$0: $token->{type}: Unknown token type";
5162 }
5163
5164 $insert = $insert_to_current;
5165 #
5166 } elsif ($self->{insertion_mode} & TABLE_IMS) {
5167 if ($token->{type} == CHARACTER_TOKEN) {
5168 if (not $open_tables->[-1]->[1] and # tainted
5169 $token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5170 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5171
5172 unless (length $token->{data}) {
5173 !!!cp ('t194');
5174 !!!next-token;
5175 next B;
5176 } else {
5177 !!!cp ('t195');
5178 }
5179 }
5180
5181 !!!parse-error (type => 'in table:#text', token => $token);
5182
5183 ## As if in body, but insert into foster parent element
5184 ## ISSUE: The spec says "whenever a node would be inserted
5185 ## into the current node", but inserting characters might not
5186 ## result in a new Text node.
5187 $reconstruct_active_formatting_elements->($insert_to_foster);
5188
5189 if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
5190 # MUST
5191 my $foster_parent_element;
5192 my $next_sibling;
5193 my $prev_sibling;
5194 OE: for (reverse 0..$#{$self->{open_elements}}) {
5195 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
5196 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
5197 if (defined $parent and $parent->node_type == 1) {
5198 !!!cp ('t196');
5199 $foster_parent_element = $parent;
5200 $next_sibling = $self->{open_elements}->[$_]->[0];
5201 $prev_sibling = $next_sibling->previous_sibling;
5202 } else {
5203 !!!cp ('t197');
5204 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
5205 $prev_sibling = $foster_parent_element->last_child;
5206 }
5207 last OE;
5208 }
5209 } # OE
5210 $foster_parent_element = $self->{open_elements}->[0]->[0] and
5211 $prev_sibling = $foster_parent_element->last_child
5212 unless defined $foster_parent_element;
5213 if (defined $prev_sibling and
5214 $prev_sibling->node_type == 3) {
5215 !!!cp ('t198');
5216 $prev_sibling->manakai_append_text ($token->{data});
5217 } else {
5218 !!!cp ('t199');
5219 $foster_parent_element->insert_before
5220 ($self->{document}->create_text_node ($token->{data}),
5221 $next_sibling);
5222 }
5223 $open_tables->[-1]->[1] = 1; # tainted
5224 } else {
5225 !!!cp ('t200');
5226 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5227 }
5228
5229 !!!next-token;
5230 next B;
5231 } elsif ($token->{type} == START_TAG_TOKEN) {
5232 if ({
5233 tr => ($self->{insertion_mode} != IN_ROW_IM),
5234 th => 1, td => 1,
5235 }->{$token->{tag_name}}) {
5236 if ($self->{insertion_mode} == IN_TABLE_IM) {
5237 ## Clear back to table context
5238 while (not ($self->{open_elements}->[-1]->[1]
5239 & TABLE_SCOPING_EL)) {
5240 !!!cp ('t201');
5241 pop @{$self->{open_elements}};
5242 }
5243
5244 !!!insert-element ('tbody',, $token);
5245 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5246 ## reprocess in the "in table body" insertion mode...
5247 }
5248
5249 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5250 unless ($token->{tag_name} eq 'tr') {
5251 !!!cp ('t202');
5252 !!!parse-error (type => 'missing start tag:tr', token => $token);
5253 }
5254
5255 ## Clear back to table body context
5256 while (not ($self->{open_elements}->[-1]->[1]
5257 & TABLE_ROWS_SCOPING_EL)) {
5258 !!!cp ('t203');
5259 ## ISSUE: Can this case be reached?
5260 pop @{$self->{open_elements}};
5261 }
5262
5263 $self->{insertion_mode} = IN_ROW_IM;
5264 if ($token->{tag_name} eq 'tr') {
5265 !!!cp ('t204');
5266 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5267 !!!nack ('t204');
5268 !!!next-token;
5269 next B;
5270 } else {
5271 !!!cp ('t205');
5272 !!!insert-element ('tr',, $token);
5273 ## reprocess in the "in row" insertion mode
5274 }
5275 } else {
5276 !!!cp ('t206');
5277 }
5278
5279 ## Clear back to table row context
5280 while (not ($self->{open_elements}->[-1]->[1]
5281 & TABLE_ROW_SCOPING_EL)) {
5282 !!!cp ('t207');
5283 pop @{$self->{open_elements}};
5284 }
5285
5286 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5287 $self->{insertion_mode} = IN_CELL_IM;
5288
5289 push @$active_formatting_elements, ['#marker', ''];
5290
5291 !!!nack ('t207.1');
5292 !!!next-token;
5293 next B;
5294 } elsif ({
5295 caption => 1, col => 1, colgroup => 1,
5296 tbody => 1, tfoot => 1, thead => 1,
5297 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5298 }->{$token->{tag_name}}) {
5299 if ($self->{insertion_mode} == IN_ROW_IM) {
5300 ## As if </tr>
5301 ## have an element in table scope
5302 my $i;
5303 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5304 my $node = $self->{open_elements}->[$_];
5305 if ($node->[1] & TABLE_ROW_EL) {
5306 !!!cp ('t208');
5307 $i = $_;
5308 last INSCOPE;
5309 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5310 !!!cp ('t209');
5311 last INSCOPE;
5312 }
5313 } # INSCOPE
5314 unless (defined $i) {
5315 !!!cp ('t210');
5316 ## TODO: This type is wrong.
5317 !!!parse-error (type => 'unmacthed end tag',
5318 text => $token->{tag_name}, token => $token);
5319 ## Ignore the token
5320 !!!nack ('t210.1');
5321 !!!next-token;
5322 next B;
5323 }
5324
5325 ## Clear back to table row context
5326 while (not ($self->{open_elements}->[-1]->[1]
5327 & TABLE_ROW_SCOPING_EL)) {
5328 !!!cp ('t211');
5329 ## ISSUE: Can this case be reached?
5330 pop @{$self->{open_elements}};
5331 }
5332
5333 pop @{$self->{open_elements}}; # tr
5334 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5335 if ($token->{tag_name} eq 'tr') {
5336 !!!cp ('t212');
5337 ## reprocess
5338 !!!ack-later;
5339 next B;
5340 } else {
5341 !!!cp ('t213');
5342 ## reprocess in the "in table body" insertion mode...
5343 }
5344 }
5345
5346 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5347 ## have an element in table scope
5348 my $i;
5349 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5350 my $node = $self->{open_elements}->[$_];
5351 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5352 !!!cp ('t214');
5353 $i = $_;
5354 last INSCOPE;
5355 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5356 !!!cp ('t215');
5357 last INSCOPE;
5358 }
5359 } # INSCOPE
5360 unless (defined $i) {
5361 !!!cp ('t216');
5362 ## TODO: This error type is wrong.
5363 !!!parse-error (type => 'unmatched end tag',
5364 text => $token->{tag_name}, token => $token);
5365 ## Ignore the token
5366 !!!nack ('t216.1');
5367 !!!next-token;
5368 next B;
5369 }
5370
5371 ## Clear back to table body context
5372 while (not ($self->{open_elements}->[-1]->[1]
5373 & TABLE_ROWS_SCOPING_EL)) {
5374 !!!cp ('t217');
5375 ## ISSUE: Can this state be reached?
5376 pop @{$self->{open_elements}};
5377 }
5378
5379 ## As if <{current node}>
5380 ## have an element in table scope
5381 ## true by definition
5382
5383 ## Clear back to table body context
5384 ## nop by definition
5385
5386 pop @{$self->{open_elements}};
5387 $self->{insertion_mode} = IN_TABLE_IM;
5388 ## reprocess in "in table" insertion mode...
5389 } else {
5390 !!!cp ('t218');
5391 }
5392
5393 if ($token->{tag_name} eq 'col') {
5394 ## Clear back to table context
5395 while (not ($self->{open_elements}->[-1]->[1]
5396 & TABLE_SCOPING_EL)) {
5397 !!!cp ('t219');
5398 ## ISSUE: Can this state be reached?
5399 pop @{$self->{open_elements}};
5400 }
5401
5402 !!!insert-element ('colgroup',, $token);
5403 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
5404 ## reprocess
5405 !!!ack-later;
5406 next B;
5407 } elsif ({
5408 caption => 1,
5409 colgroup => 1,
5410 tbody => 1, tfoot => 1, thead => 1,
5411 }->{$token->{tag_name}}) {
5412 ## Clear back to table context
5413 while (not ($self->{open_elements}->[-1]->[1]
5414 & TABLE_SCOPING_EL)) {
5415 !!!cp ('t220');
5416 ## ISSUE: Can this state be reached?
5417 pop @{$self->{open_elements}};
5418 }
5419
5420 push @$active_formatting_elements, ['#marker', '']
5421 if $token->{tag_name} eq 'caption';
5422
5423 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5424 $self->{insertion_mode} = {
5425 caption => IN_CAPTION_IM,
5426 colgroup => IN_COLUMN_GROUP_IM,
5427 tbody => IN_TABLE_BODY_IM,
5428 tfoot => IN_TABLE_BODY_IM,
5429 thead => IN_TABLE_BODY_IM,
5430 }->{$token->{tag_name}};
5431 !!!next-token;
5432 !!!nack ('t220.1');
5433 next B;
5434 } else {
5435 die "$0: in table: <>: $token->{tag_name}";
5436 }
5437 } elsif ($token->{tag_name} eq 'table') {
5438 !!!parse-error (type => 'not closed',
5439 text => $self->{open_elements}->[-1]->[0]
5440 ->manakai_local_name,
5441 token => $token);
5442
5443 ## As if </table>
5444 ## have a table element in table scope
5445 my $i;
5446 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5447 my $node = $self->{open_elements}->[$_];
5448 if ($node->[1] & TABLE_EL) {
5449 !!!cp ('t221');
5450 $i = $_;
5451 last INSCOPE;
5452 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5453 !!!cp ('t222');
5454 last INSCOPE;
5455 }
5456 } # INSCOPE
5457 unless (defined $i) {
5458 !!!cp ('t223');
5459 ## TODO: The following is wrong, maybe.
5460 !!!parse-error (type => 'unmatched end tag', text => 'table',
5461 token => $token);
5462 ## Ignore tokens </table><table>
5463 !!!nack ('t223.1');
5464 !!!next-token;
5465 next B;
5466 }
5467
5468 ## TODO: The following steps have been removed from the latest spec.
5469 ## generate implied end tags
5470 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5471 !!!cp ('t224');
5472 pop @{$self->{open_elements}};
5473 }
5474
5475 unless ($self->{open_elements}->[-1]->[1] & TABLE_EL) {
5476 !!!cp ('t225');
5477 ## NOTE: |<table><tr><table>|
5478 !!!parse-error (type => 'not closed',
5479 text => $self->{open_elements}->[-1]->[0]
5480 ->manakai_local_name,
5481 token => $token);
5482 } else {
5483 !!!cp ('t226');
5484 }
5485
5486 splice @{$self->{open_elements}}, $i;
5487 pop @{$open_tables};
5488
5489 $self->_reset_insertion_mode;
5490
5491 ## reprocess
5492 !!!ack-later;
5493 next B;
5494 } elsif ($token->{tag_name} eq 'style') {
5495 if (not $open_tables->[-1]->[1]) { # tainted
5496 !!!cp ('t227.8');
5497 ## NOTE: This is a "as if in head" code clone.
5498 $parse_rcdata->(CDATA_CONTENT_MODEL);
5499 next B;
5500 } else {
5501 !!!cp ('t227.7');
5502 #
5503 }
5504 } elsif ($token->{tag_name} eq 'script') {
5505 if (not $open_tables->[-1]->[1]) { # tainted
5506 !!!cp ('t227.6');
5507 ## NOTE: This is a "as if in head" code clone.
5508 $script_start_tag->();
5509 next B;
5510 } else {
5511 !!!cp ('t227.5');
5512 #
5513 }
5514 } elsif ($token->{tag_name} eq 'input') {
5515 if (not $open_tables->[-1]->[1]) { # tainted
5516 if ($token->{attributes}->{type}) { ## TODO: case
5517 my $type = lc $token->{attributes}->{type}->{value};
5518 if ($type eq 'hidden') {
5519 !!!cp ('t227.3');
5520 !!!parse-error (type => 'in table',
5521 text => $token->{tag_name}, token => $token);
5522
5523 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5524
5525 ## TODO: form element pointer
5526
5527 pop @{$self->{open_elements}};
5528
5529 !!!next-token;
5530 !!!ack ('t227.2.1');
5531 next B;
5532 } else {
5533 !!!cp ('t227.2');
5534 #
5535 }
5536 } else {
5537 !!!cp ('t227.1');
5538 #
5539 }
5540 } else {
5541 !!!cp ('t227.4');
5542 #
5543 }
5544 } else {
5545 !!!cp ('t227');
5546 #
5547 }
5548
5549 !!!parse-error (type => 'in table', text => $token->{tag_name},
5550 token => $token);
5551
5552 $insert = $insert_to_foster;
5553 #
5554 } elsif ($token->{type} == END_TAG_TOKEN) {
5555 if ($token->{tag_name} eq 'tr' and
5556 $self->{insertion_mode} == IN_ROW_IM) {
5557 ## have an element in table scope
5558 my $i;
5559 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5560 my $node = $self->{open_elements}->[$_];
5561 if ($node->[1] & TABLE_ROW_EL) {
5562 !!!cp ('t228');
5563 $i = $_;
5564 last INSCOPE;
5565 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5566 !!!cp ('t229');
5567 last INSCOPE;
5568 }
5569 } # INSCOPE
5570 unless (defined $i) {
5571 !!!cp ('t230');
5572 !!!parse-error (type => 'unmatched end tag',
5573 text => $token->{tag_name}, token => $token);
5574 ## Ignore the token
5575 !!!nack ('t230.1');
5576 !!!next-token;
5577 next B;
5578 } else {
5579 !!!cp ('t232');
5580 }
5581
5582 ## Clear back to table row context
5583 while (not ($self->{open_elements}->[-1]->[1]
5584 & TABLE_ROW_SCOPING_EL)) {
5585 !!!cp ('t231');
5586 ## ISSUE: Can this state be reached?
5587 pop @{$self->{open_elements}};
5588 }
5589
5590 pop @{$self->{open_elements}}; # tr
5591 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5592 !!!next-token;
5593 !!!nack ('t231.1');
5594 next B;
5595 } elsif ($token->{tag_name} eq 'table') {
5596 if ($self->{insertion_mode} == IN_ROW_IM) {
5597 ## As if </tr>
5598 ## have an element in table scope
5599 my $i;
5600 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5601 my $node = $self->{open_elements}->[$_];
5602 if ($node->[1] & TABLE_ROW_EL) {
5603 !!!cp ('t233');
5604 $i = $_;
5605 last INSCOPE;
5606 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5607 !!!cp ('t234');
5608 last INSCOPE;
5609 }
5610 } # INSCOPE
5611 unless (defined $i) {
5612 !!!cp ('t235');
5613 ## TODO: The following is wrong.
5614 !!!parse-error (type => 'unmatched end tag',
5615 text => $token->{type}, token => $token);
5616 ## Ignore the token
5617 !!!nack ('t236.1');
5618 !!!next-token;
5619 next B;
5620 }
5621
5622 ## Clear back to table row context
5623 while (not ($self->{open_elements}->[-1]->[1]
5624 & TABLE_ROW_SCOPING_EL)) {
5625 !!!cp ('t236');
5626 ## ISSUE: Can this state be reached?
5627 pop @{$self->{open_elements}};
5628 }
5629
5630 pop @{$self->{open_elements}}; # tr
5631 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5632 ## reprocess in the "in table body" insertion mode...
5633 }
5634
5635 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5636 ## have an element in table scope
5637 my $i;
5638 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5639 my $node = $self->{open_elements}->[$_];
5640 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5641 !!!cp ('t237');
5642 $i = $_;
5643 last INSCOPE;
5644 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5645 !!!cp ('t238');
5646 last INSCOPE;
5647 }
5648 } # INSCOPE
5649 unless (defined $i) {
5650 !!!cp ('t239');
5651 !!!parse-error (type => 'unmatched end tag',
5652 text => $token->{tag_name}, token => $token);
5653 ## Ignore the token
5654 !!!nack ('t239.1');
5655 !!!next-token;
5656 next B;
5657 }
5658
5659 ## Clear back to table body context
5660 while (not ($self->{open_elements}->[-1]->[1]
5661 & TABLE_ROWS_SCOPING_EL)) {
5662 !!!cp ('t240');
5663 pop @{$self->{open_elements}};
5664 }
5665
5666 ## As if <{current node}>
5667 ## have an element in table scope
5668 ## true by definition
5669
5670 ## Clear back to table body context
5671 ## nop by definition
5672
5673 pop @{$self->{open_elements}};
5674 $self->{insertion_mode} = IN_TABLE_IM;
5675 ## reprocess in the "in table" insertion mode...
5676 }
5677
5678 ## NOTE: </table> in the "in table" insertion mode.
5679 ## When you edit the code fragment below, please ensure that
5680 ## the code for <table> in the "in table" insertion mode
5681 ## is synced with it.
5682
5683 ## have a table element in table scope
5684 my $i;
5685 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5686 my $node = $self->{open_elements}->[$_];
5687 if ($node->[1] & TABLE_EL) {
5688 !!!cp ('t241');
5689 $i = $_;
5690 last INSCOPE;
5691 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5692 !!!cp ('t242');
5693 last INSCOPE;
5694 }
5695 } # INSCOPE
5696 unless (defined $i) {
5697 !!!cp ('t243');
5698 !!!parse-error (type => 'unmatched end tag',
5699 text => $token->{tag_name}, token => $token);
5700 ## Ignore the token
5701 !!!nack ('t243.1');
5702 !!!next-token;
5703 next B;
5704 }
5705
5706 splice @{$self->{open_elements}}, $i;
5707 pop @{$open_tables};
5708
5709 $self->_reset_insertion_mode;
5710
5711 !!!next-token;
5712 next B;
5713 } elsif ({
5714 tbody => 1, tfoot => 1, thead => 1,
5715 }->{$token->{tag_name}} and
5716 $self->{insertion_mode} & ROW_IMS) {
5717 if ($self->{insertion_mode} == IN_ROW_IM) {
5718 ## have an element in table scope
5719 my $i;
5720 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5721 my $node = $self->{open_elements}->[$_];
5722 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5723 !!!cp ('t247');
5724 $i = $_;
5725 last INSCOPE;
5726 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5727 !!!cp ('t248');
5728 last INSCOPE;
5729 }
5730 } # INSCOPE
5731 unless (defined $i) {
5732 !!!cp ('t249');
5733 !!!parse-error (type => 'unmatched end tag',
5734 text => $token->{tag_name}, token => $token);
5735 ## Ignore the token
5736 !!!nack ('t249.1');
5737 !!!next-token;
5738 next B;
5739 }
5740
5741 ## As if </tr>
5742 ## have an element in table scope
5743 my $i;
5744 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5745 my $node = $self->{open_elements}->[$_];
5746 if ($node->[1] & TABLE_ROW_EL) {
5747 !!!cp ('t250');
5748 $i = $_;
5749 last INSCOPE;
5750 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5751 !!!cp ('t251');
5752 last INSCOPE;
5753 }
5754 } # INSCOPE
5755 unless (defined $i) {
5756 !!!cp ('t252');
5757 !!!parse-error (type => 'unmatched end tag',
5758 text => 'tr', token => $token);
5759 ## Ignore the token
5760 !!!nack ('t252.1');
5761 !!!next-token;
5762 next B;
5763 }
5764
5765 ## Clear back to table row context
5766 while (not ($self->{open_elements}->[-1]->[1]
5767 & TABLE_ROW_SCOPING_EL)) {
5768 !!!cp ('t253');
5769 ## ISSUE: Can this case be reached?
5770 pop @{$self->{open_elements}};
5771 }
5772
5773 pop @{$self->{open_elements}}; # tr
5774 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5775 ## reprocess in the "in table body" insertion mode...
5776 }
5777
5778 ## have an element in table scope
5779 my $i;
5780 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5781 my $node = $self->{open_elements}->[$_];
5782 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5783 !!!cp ('t254');
5784 $i = $_;
5785 last INSCOPE;
5786 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5787 !!!cp ('t255');
5788 last INSCOPE;
5789 }
5790 } # INSCOPE
5791 unless (defined $i) {
5792 !!!cp ('t256');
5793 !!!parse-error (type => 'unmatched end tag',
5794 text => $token->{tag_name}, token => $token);
5795 ## Ignore the token
5796 !!!nack ('t256.1');
5797 !!!next-token;
5798 next B;
5799 }
5800
5801 ## Clear back to table body context
5802 while (not ($self->{open_elements}->[-1]->[1]
5803 & TABLE_ROWS_SCOPING_EL)) {
5804 !!!cp ('t257');
5805 ## ISSUE: Can this case be reached?
5806 pop @{$self->{open_elements}};
5807 }
5808
5809 pop @{$self->{open_elements}};
5810 $self->{insertion_mode} = IN_TABLE_IM;
5811 !!!nack ('t257.1');
5812 !!!next-token;
5813 next B;
5814 } elsif ({
5815 body => 1, caption => 1, col => 1, colgroup => 1,
5816 html => 1, td => 1, th => 1,
5817 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5818 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
5819 }->{$token->{tag_name}}) {
5820 !!!cp ('t258');
5821 !!!parse-error (type => 'unmatched end tag',
5822 text => $token->{tag_name}, token => $token);
5823 ## Ignore the token
5824 !!!nack ('t258.1');
5825 !!!next-token;
5826 next B;
5827 } else {
5828 !!!cp ('t259');
5829 !!!parse-error (type => 'in table:/',
5830 text => $token->{tag_name}, token => $token);
5831
5832 $insert = $insert_to_foster;
5833 #
5834 }
5835 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5836 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
5837 @{$self->{open_elements}} == 1) { # redundant, maybe
5838 !!!parse-error (type => 'in body:#eof', token => $token);
5839 !!!cp ('t259.1');
5840 #
5841 } else {
5842 !!!cp ('t259.2');
5843 #
5844 }
5845
5846 ## Stop parsing
5847 last B;
5848 } else {
5849 die "$0: $token->{type}: Unknown token type";
5850 }
5851 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
5852 if ($token->{type} == CHARACTER_TOKEN) {
5853 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5854 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5855 unless (length $token->{data}) {
5856 !!!cp ('t260');
5857 !!!next-token;
5858 next B;
5859 }
5860 }
5861
5862 !!!cp ('t261');
5863 #
5864 } elsif ($token->{type} == START_TAG_TOKEN) {
5865 if ($token->{tag_name} eq 'col') {
5866 !!!cp ('t262');
5867 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5868 pop @{$self->{open_elements}};
5869 !!!ack ('t262.1');
5870 !!!next-token;
5871 next B;
5872 } else {
5873 !!!cp ('t263');
5874 #
5875 }
5876 } elsif ($token->{type} == END_TAG_TOKEN) {
5877 if ($token->{tag_name} eq 'colgroup') {
5878 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
5879 !!!cp ('t264');
5880 !!!parse-error (type => 'unmatched end tag',
5881 text => 'colgroup', token => $token);
5882 ## Ignore the token
5883 !!!next-token;
5884 next B;
5885 } else {
5886 !!!cp ('t265');
5887 pop @{$self->{open_elements}}; # colgroup
5888 $self->{insertion_mode} = IN_TABLE_IM;
5889 !!!next-token;
5890 next B;
5891 }
5892 } elsif ($token->{tag_name} eq 'col') {
5893 !!!cp ('t266');
5894 !!!parse-error (type => 'unmatched end tag',
5895 text => 'col', token => $token);
5896 ## Ignore the token
5897 !!!next-token;
5898 next B;
5899 } else {
5900 !!!cp ('t267');
5901 #
5902 }
5903 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5904 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
5905 @{$self->{open_elements}} == 1) { # redundant, maybe
5906 !!!cp ('t270.2');
5907 ## Stop parsing.
5908 last B;
5909 } else {
5910 ## NOTE: As if </colgroup>.
5911 !!!cp ('t270.1');
5912 pop @{$self->{open_elements}}; # colgroup
5913 $self->{insertion_mode} = IN_TABLE_IM;
5914 ## Reprocess.
5915 next B;
5916 }
5917 } else {
5918 die "$0: $token->{type}: Unknown token type";
5919 }
5920
5921 ## As if </colgroup>
5922 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
5923 !!!cp ('t269');
5924 ## TODO: Wrong error type?
5925 !!!parse-error (type => 'unmatched end tag',
5926 text => 'colgroup', token => $token);
5927 ## Ignore the token
5928 !!!nack ('t269.1');
5929 !!!next-token;
5930 next B;
5931 } else {
5932 !!!cp ('t270');
5933 pop @{$self->{open_elements}}; # colgroup
5934 $self->{insertion_mode} = IN_TABLE_IM;
5935 !!!ack-later;
5936 ## reprocess
5937 next B;
5938 }
5939 } elsif ($self->{insertion_mode} & SELECT_IMS) {
5940 if ($token->{type} == CHARACTER_TOKEN) {
5941 !!!cp ('t271');
5942 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5943 !!!next-token;
5944 next B;
5945 } elsif ($token->{type} == START_TAG_TOKEN) {
5946 if ($token->{tag_name} eq 'option') {
5947 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5948 !!!cp ('t272');
5949 ## As if </option>
5950 pop @{$self->{open_elements}};
5951 } else {
5952 !!!cp ('t273');
5953 }
5954
5955 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5956 !!!nack ('t273.1');
5957 !!!next-token;
5958 next B;
5959 } elsif ($token->{tag_name} eq 'optgroup') {
5960 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5961 !!!cp ('t274');
5962 ## As if </option>
5963 pop @{$self->{open_elements}};
5964 } else {
5965 !!!cp ('t275');
5966 }
5967
5968 if ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
5969 !!!cp ('t276');
5970 ## As if </optgroup>
5971 pop @{$self->{open_elements}};
5972 } else {
5973 !!!cp ('t277');
5974 }
5975
5976 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5977 !!!nack ('t277.1');
5978 !!!next-token;
5979 next B;
5980 } elsif ({
5981 select => 1, input => 1, textarea => 1,
5982 }->{$token->{tag_name}} or
5983 ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5984 {
5985 caption => 1, table => 1,
5986 tbody => 1, tfoot => 1, thead => 1,
5987 tr => 1, td => 1, th => 1,
5988 }->{$token->{tag_name}})) {
5989 ## TODO: The type below is not good - <select> is replaced by </select>
5990 !!!parse-error (type => 'not closed', text => 'select',
5991 token => $token);
5992 ## NOTE: As if the token were </select> (<select> case) or
5993 ## as if there were </select> (otherwise).
5994 ## have an element in table scope
5995 my $i;
5996 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5997 my $node = $self->{open_elements}->[$_];
5998 if ($node->[1] & SELECT_EL) {
5999 !!!cp ('t278');
6000 $i = $_;
6001 last INSCOPE;
6002 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6003 !!!cp ('t279');
6004 last INSCOPE;
6005 }
6006 } # INSCOPE
6007 unless (defined $i) {
6008 !!!cp ('t280');
6009 !!!parse-error (type => 'unmatched end tag',
6010 text => 'select', token => $token);
6011 ## Ignore the token
6012 !!!nack ('t280.1');
6013 !!!next-token;
6014 next B;
6015 }
6016
6017 !!!cp ('t281');
6018 splice @{$self->{open_elements}}, $i;
6019
6020 $self->_reset_insertion_mode;
6021
6022 if ($token->{tag_name} eq 'select') {
6023 !!!nack ('t281.2');
6024 !!!next-token;
6025 next B;
6026 } else {
6027 !!!cp ('t281.1');
6028 !!!ack-later;
6029 ## Reprocess the token.
6030 next B;
6031 }
6032 } else {
6033 !!!cp ('t282');
6034 !!!parse-error (type => 'in select',
6035 text => $token->{tag_name}, token => $token);
6036 ## Ignore the token
6037 !!!nack ('t282.1');
6038 !!!next-token;
6039 next B;
6040 }
6041 } elsif ($token->{type} == END_TAG_TOKEN) {
6042 if ($token->{tag_name} eq 'optgroup') {
6043 if ($self->{open_elements}->[-1]->[1] & OPTION_EL and
6044 $self->{open_elements}->[-2]->[1] & OPTGROUP_EL) {
6045 !!!cp ('t283');
6046 ## As if </option>
6047 splice @{$self->{open_elements}}, -2;
6048 } elsif ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
6049 !!!cp ('t284');
6050 pop @{$self->{open_elements}};
6051 } else {
6052 !!!cp ('t285');
6053 !!!parse-error (type => 'unmatched end tag',
6054 text => $token->{tag_name}, token => $token);
6055 ## Ignore the token
6056 }
6057 !!!nack ('t285.1');
6058 !!!next-token;
6059 next B;
6060 } elsif ($token->{tag_name} eq 'option') {
6061 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6062 !!!cp ('t286');
6063 pop @{$self->{open_elements}};
6064 } else {
6065 !!!cp ('t287');
6066 !!!parse-error (type => 'unmatched end tag',
6067 text => $token->{tag_name}, token => $token);
6068 ## Ignore the token
6069 }
6070 !!!nack ('t287.1');
6071 !!!next-token;
6072 next B;
6073 } elsif ($token->{tag_name} eq 'select') {
6074 ## have an element in table scope
6075 my $i;
6076 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6077 my $node = $self->{open_elements}->[$_];
6078 if ($node->[1] & SELECT_EL) {
6079 !!!cp ('t288');
6080 $i = $_;
6081 last INSCOPE;
6082 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6083 !!!cp ('t289');
6084 last INSCOPE;
6085 }
6086 } # INSCOPE
6087 unless (defined $i) {
6088 !!!cp ('t290');
6089 !!!parse-error (type => 'unmatched end tag',
6090 text => $token->{tag_name}, token => $token);
6091 ## Ignore the token
6092 !!!nack ('t290.1');
6093 !!!next-token;
6094 next B;
6095 }
6096
6097 !!!cp ('t291');
6098 splice @{$self->{open_elements}}, $i;
6099
6100 $self->_reset_insertion_mode;
6101
6102 !!!nack ('t291.1');
6103 !!!next-token;
6104 next B;
6105 } elsif ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
6106 {
6107 caption => 1, table => 1, tbody => 1,
6108 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
6109 }->{$token->{tag_name}}) {
6110 ## TODO: The following is wrong?
6111 !!!parse-error (type => 'unmatched end tag',
6112 text => $token->{tag_name}, token => $token);
6113
6114 ## have an element in table scope
6115 my $i;
6116 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6117 my $node = $self->{open_elements}->[$_];
6118 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6119 !!!cp ('t292');
6120 $i = $_;
6121 last INSCOPE;
6122 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6123 !!!cp ('t293');
6124 last INSCOPE;
6125 }
6126 } # INSCOPE
6127 unless (defined $i) {
6128 !!!cp ('t294');
6129 ## Ignore the token
6130 !!!nack ('t294.1');
6131 !!!next-token;
6132 next B;
6133 }
6134
6135 ## As if </select>
6136 ## have an element in table scope
6137 undef $i;
6138 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6139 my $node = $self->{open_elements}->[$_];
6140 if ($node->[1] & SELECT_EL) {
6141 !!!cp ('t295');
6142 $i = $_;
6143 last INSCOPE;
6144 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6145 ## ISSUE: Can this state be reached?
6146 !!!cp ('t296');
6147 last INSCOPE;
6148 }
6149 } # INSCOPE
6150 unless (defined $i) {
6151 !!!cp ('t297');
6152 ## TODO: Is the following error type correct?
6153 !!!parse-error (type => 'unmatched end tag',
6154 text => 'select', token => $token);
6155 ## Ignore the </select> token
6156 !!!nack ('t297.1');
6157 !!!next-token; ## TODO: ok?
6158 next B;
6159 }
6160
6161 !!!cp ('t298');
6162 splice @{$self->{open_elements}}, $i;
6163
6164 $self->_reset_insertion_mode;
6165
6166 !!!ack-later;
6167 ## reprocess
6168 next B;
6169 } else {
6170 !!!cp ('t299');
6171 !!!parse-error (type => 'in select:/',
6172 text => $token->{tag_name}, token => $token);
6173 ## Ignore the token
6174 !!!nack ('t299.3');
6175 !!!next-token;
6176 next B;
6177 }
6178 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6179 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6180 @{$self->{open_elements}} == 1) { # redundant, maybe
6181 !!!cp ('t299.1');
6182 !!!parse-error (type => 'in body:#eof', token => $token);
6183 } else {
6184 !!!cp ('t299.2');
6185 }
6186
6187 ## Stop parsing.
6188 last B;
6189 } else {
6190 die "$0: $token->{type}: Unknown token type";
6191 }
6192 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
6193 if ($token->{type} == CHARACTER_TOKEN) {
6194 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6195 my $data = $1; ## The leading white space just stripped from the token.
6196 ## As if in body
6197 $reconstruct_active_formatting_elements->($insert_to_current);
6198
6199 ## Append the saved copy rather than re-reading |$1| after the call above.
6200 $self->{open_elements}->[-1]->[0]->manakai_append_text ($data);
6201 unless (length $token->{data}) { ## Nothing but white space was in the token.
6202 !!!cp ('t300');
6203 !!!next-token;
6204 next B;
6205 }
6206 }
6207
6208 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6209 !!!cp ('t301');
6210 !!!parse-error (type => 'after html:#text', token => $token);
6211
6212 ## Reprocess in the "after body" insertion mode.
6213 } else {
6214 !!!cp ('t302');
6215 }
6216
6217 ## "after body" insertion mode
6218 !!!parse-error (type => 'after body:#text', token => $token);
6219
6220 $self->{insertion_mode} = IN_BODY_IM;
6221 ## reprocess
6222 next B;
6223 } elsif ($token->{type} == START_TAG_TOKEN) {
6224 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6225 !!!cp ('t303');
6226 !!!parse-error (type => 'after html',
6227 text => $token->{tag_name}, token => $token);
6228
6229 ## Reprocess in the "after body" insertion mode.
6230 } else {
6231 !!!cp ('t304');
6232 }
6233
6234 ## "after body" insertion mode
6235 !!!parse-error (type => 'after body',
6236 text => $token->{tag_name}, token => $token);
6237
6238 $self->{insertion_mode} = IN_BODY_IM;
6239 !!!ack-later;
6240 ## reprocess
6241 next B;
6242 } elsif ($token->{type} == END_TAG_TOKEN) {
6243 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6244 !!!cp ('t305');
6245 !!!parse-error (type => 'after html:/',
6246 text => $token->{tag_name}, token => $token);
6247
6248 $self->{insertion_mode} = AFTER_BODY_IM;
6249 ## Reprocess in the "after body" insertion mode.
6250 } else {
6251 !!!cp ('t306');
6252 }
6253
6254 ## "after body" insertion mode
6255 if ($token->{tag_name} eq 'html') {
6256 if (defined $self->{inner_html_node}) {
6257 !!!cp ('t307');
6258 !!!parse-error (type => 'unmatched end tag',
6259 text => 'html', token => $token);
6260 ## Ignore the token
6261 !!!next-token;
6262 next B;
6263 } else {
6264 !!!cp ('t308');
6265 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
6266 !!!next-token;
6267 next B;
6268 }
6269 } else {
6270 !!!cp ('t309');
6271 !!!parse-error (type => 'after body:/',
6272 text => $token->{tag_name}, token => $token);
6273
6274 $self->{insertion_mode} = IN_BODY_IM;
6275 ## reprocess
6276 next B;
6277 }
6278 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6279 !!!cp ('t309.2');
6280 ## Stop parsing
6281 last B;
6282 } else {
6283 die "$0: $token->{type}: Unknown token type";
6284 }
6285 } elsif ($self->{insertion_mode} & FRAME_IMS) {
6286 if ($token->{type} == CHARACTER_TOKEN) {
6287 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6288 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6289
6290 unless (length $token->{data}) {
6291 !!!cp ('t310');
6292 !!!next-token;
6293 next B;
6294 }
6295 }
6296
6297 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
6298 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6299 !!!cp ('t311');
6300 !!!parse-error (type => 'in frameset:#text', token => $token);
6301 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6302 !!!cp ('t312');
6303 !!!parse-error (type => 'after frameset:#text', token => $token);
6304 } else { # "after html frameset"
6305 !!!cp ('t313');
6306 !!!parse-error (type => 'after html:#text', token => $token);
6307
6308 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6309 ## Reprocess in the "after frameset" insertion mode.
6310 !!!parse-error (type => 'after frameset:#text', token => $token);
6311 }
6312
6313 ## Ignore the token.
6314 if (length $token->{data}) {
6315 !!!cp ('t314');
6316 ## reprocess the rest of characters
6317 } else {
6318 !!!cp ('t315');
6319 !!!next-token;
6320 }
6321 next B;
6322 }
6323
6324 die qq[$0: Character "$token->{data}"];
6325 } elsif ($token->{type} == START_TAG_TOKEN) {
6326 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
6327 !!!cp ('t316');
6328 !!!parse-error (type => 'after html',
6329 text => $token->{tag_name}, token => $token);
6330
6331 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6332 ## Process in the "after frameset" insertion mode.
6333 } else {
6334 !!!cp ('t317');
6335 }
6336
6337 if ($token->{tag_name} eq 'frameset' and
6338 $self->{insertion_mode} == IN_FRAMESET_IM) {
6339 !!!cp ('t318');
6340 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6341 !!!nack ('t318.1');
6342 !!!next-token;
6343 next B;
6344 } elsif ($token->{tag_name} eq 'frame' and
6345 $self->{insertion_mode} == IN_FRAMESET_IM) {
6346 !!!cp ('t319');
6347 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6348 pop @{$self->{open_elements}};
6349 !!!ack ('t319.1');
6350 !!!next-token;
6351 next B;
6352 } elsif ($token->{tag_name} eq 'noframes') {
6353 !!!cp ('t320');
6354 ## NOTE: As if in head.
6355 $parse_rcdata->(CDATA_CONTENT_MODEL);
6356 next B;
6357 } else {
6358 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6359 !!!cp ('t321');
6360 !!!parse-error (type => 'in frameset',
6361 text => $token->{tag_name}, token => $token);
6362 } else {
6363 !!!cp ('t322');
6364 !!!parse-error (type => 'after frameset',
6365 text => $token->{tag_name}, token => $token);
6366 }
6367 ## Ignore the token
6368 !!!nack ('t322.1');
6369 !!!next-token;
6370 next B;
6371 }
6372 } elsif ($token->{type} == END_TAG_TOKEN) {
6373 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
6374 !!!cp ('t323');
6375 !!!parse-error (type => 'after html:/',
6376 text => $token->{tag_name}, token => $token);
6377
6378 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6379 ## Process in the "after frameset" insertion mode.
6380 } else {
6381 !!!cp ('t324');
6382 }
6383
6384 if ($token->{tag_name} eq 'frameset' and
6385 $self->{insertion_mode} == IN_FRAMESET_IM) {
6386 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
6387 @{$self->{open_elements}} == 1) {
6388 !!!cp ('t325');
6389 !!!parse-error (type => 'unmatched end tag',
6390 text => $token->{tag_name}, token => $token);
6391 ## Ignore the token
6392 !!!next-token;
6393 } else {
6394 !!!cp ('t326');
6395 pop @{$self->{open_elements}};
6396 !!!next-token;
6397 }
6398
6399 if (not defined $self->{inner_html_node} and
6400 not ($self->{open_elements}->[-1]->[1] & FRAMESET_EL)) {
6401 !!!cp ('t327');
6402 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6403 } else {
6404 !!!cp ('t328');
6405 }
6406 next B;
6407 } elsif ($token->{tag_name} eq 'html' and
6408 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
6409 !!!cp ('t329');
6410 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
6411 !!!next-token;
6412 next B;
6413 } else {
6414 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6415 !!!cp ('t330');
6416 !!!parse-error (type => 'in frameset:/',
6417 text => $token->{tag_name}, token => $token);
6418 } else {
6419 !!!cp ('t331');
6420 !!!parse-error (type => 'after frameset:/',
6421 text => $token->{tag_name}, token => $token);
6422 }
6423 ## Ignore the token
6424 !!!next-token;
6425 next B;
6426 }
6427 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6428 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6429 @{$self->{open_elements}} == 1) { # redundant, maybe
6430 !!!cp ('t331.1');
6431 !!!parse-error (type => 'in body:#eof', token => $token);
6432 } else {
6433 !!!cp ('t331.2');
6434 }
6435
6436 ## Stop parsing
6437 last B;
6438 } else {
6439 die "$0: $token->{type}: Unknown token type";
6440 }
6441
6442 ## ISSUE: An issue in spec here
6443 } else {
6444 die "$0: $self->{insertion_mode}: Unknown insertion mode";
6445 }
6446
6447 ## "in body" insertion mode
6448 if ($token->{type} == START_TAG_TOKEN) {
6449 if ($token->{tag_name} eq 'script') {
6450 !!!cp ('t332');
6451 ## NOTE: This is an "as if in head" code clone
6452 $script_start_tag->();
6453 next B;
6454 } elsif ($token->{tag_name} eq 'style') {
6455 !!!cp ('t333');
6456 ## NOTE: This is an "as if in head" code clone
6457 $parse_rcdata->(CDATA_CONTENT_MODEL);
6458 next B;
6459 } elsif ({
6460 base => 1, link => 1,
6461 }->{$token->{tag_name}}) {
6462 !!!cp ('t334');
6463 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6464 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6465 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6466 !!!ack ('t334.1');
6467 !!!next-token;
6468 next B;
6469 } elsif ($token->{tag_name} eq 'meta') {
6470 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6471 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6472 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6473
6474 unless ($self->{confident}) {
6475 if ($token->{attributes}->{charset}) {
6476 !!!cp ('t335');
6477 ## NOTE: Whether the encoding is supported or not is handled
6478 ## in the {change_encoding} callback.
6479 $self->{change_encoding}
6480 ->($self, $token->{attributes}->{charset}->{value}, $token);
6481
6482 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6483 ->set_user_data (manakai_has_reference =>
6484 $token->{attributes}->{charset}
6485 ->{has_reference});
6486 } elsif ($token->{attributes}->{content}) {
6487 if ($token->{attributes}->{content}->{value}
6488 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
6489 [\x09-\x0D\x20]*=
6490 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
6491 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
6492 !!!cp ('t336');
6493 ## NOTE: Whether the encoding is supported or not is handled
6494 ## in the {change_encoding} callback.
6495 $self->{change_encoding}
6496 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
6497 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6498 ->set_user_data (manakai_has_reference =>
6499 $token->{attributes}->{content}
6500 ->{has_reference});
6501 }
6502 }
6503 } else {
6504 if ($token->{attributes}->{charset}) {
6505 !!!cp ('t337');
6506 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6507 ->set_user_data (manakai_has_reference =>
6508 $token->{attributes}->{charset}
6509 ->{has_reference});
6510 }
6511 if ($token->{attributes}->{content}) {
6512 !!!cp ('t338');
6513 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6514 ->set_user_data (manakai_has_reference =>
6515 $token->{attributes}->{content}
6516 ->{has_reference});
6517 }
6518 }
6519
6520 !!!ack ('t338.1');
6521 !!!next-token;
6522 next B;
6523 } elsif ($token->{tag_name} eq 'title') {
6524 !!!cp ('t341');
6525 ## NOTE: This is an "as if in head" code clone
6526 $parse_rcdata->(RCDATA_CONTENT_MODEL);
6527 next B;
6528 } elsif ($token->{tag_name} eq 'body') {
6529 !!!parse-error (type => 'in body', text => 'body', token => $token);
6530
6531 if (@{$self->{open_elements}} == 1 or
6532 not ($self->{open_elements}->[1]->[1] & BODY_EL)) {
6533 !!!cp ('t342');
6534 ## Ignore the token
6535 } else {
6536 my $body_el = $self->{open_elements}->[1]->[0];
6537 for my $attr_name (keys %{$token->{attributes}}) {
6538 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
6539 !!!cp ('t343');
6540 $body_el->set_attribute_ns
6541 (undef, [undef, $attr_name],
6542 $token->{attributes}->{$attr_name}->{value});
6543 }
6544 }
6545 }
6546 !!!nack ('t343.1');
6547 !!!next-token;
6548 next B;
6549 } elsif ({
6550 address => 1, blockquote => 1, center => 1, dir => 1,
6551 div => 1, dl => 1, fieldset => 1,
6552 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6553 menu => 1, ol => 1, p => 1, ul => 1,
6554 pre => 1, listing => 1,
6555 form => 1,
6556 table => 1,
6557 hr => 1,
6558 }->{$token->{tag_name}}) {
6559 if ($token->{tag_name} eq 'form' and defined $self->{form_element}) {
6560 !!!cp ('t350');
6561 !!!parse-error (type => 'in form:form', token => $token);
6562 ## Ignore the token
6563 !!!nack ('t350.1');
6564 !!!next-token;
6565 next B;
6566 }
6567
6568 ## has a p element in scope
6569 INSCOPE: for (reverse @{$self->{open_elements}}) {
6570 if ($_->[1] & P_EL) {
6571 !!!cp ('t344');
6572 !!!back-token; # <form>
6573 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6574 line => $token->{line}, column => $token->{column}};
6575 next B;
6576 } elsif ($_->[1] & SCOPING_EL) {
6577 !!!cp ('t345');
6578 last INSCOPE;
6579 }
6580 } # INSCOPE
6581
6582 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6583 if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') {
6584 !!!nack ('t346.1');
6585 !!!next-token;
6586 if ($token->{type} == CHARACTER_TOKEN) {
6587 $token->{data} =~ s/^\x0A//;
6588 unless (length $token->{data}) {
6589 !!!cp ('t346');
6590 !!!next-token;
6591 } else {
6592 !!!cp ('t349');
6593 }
6594 } else {
6595 !!!cp ('t348');
6596 }
6597 } elsif ($token->{tag_name} eq 'form') {
6598 !!!cp ('t347.1');
6599 $self->{form_element} = $self->{open_elements}->[-1]->[0];
6600
6601 !!!nack ('t347.2');
6602 !!!next-token;
6603 } elsif ($token->{tag_name} eq 'table') {
6604 !!!cp ('t382');
6605 push @{$open_tables}, [$self->{open_elements}->[-1]->[0]];
6606
6607 $self->{insertion_mode} = IN_TABLE_IM;
6608
6609 !!!nack ('t382.1');
6610 !!!next-token;
6611 } elsif ($token->{tag_name} eq 'hr') {
6612 !!!cp ('t386');
6613 pop @{$self->{open_elements}};
6614
6615 !!!nack ('t386.1');
6616 !!!next-token;
6617 } else {
6618 !!!nack ('t347.1');
6619 !!!next-token;
6620 }
6621 next B;
6622 } elsif ({li => 1, dt => 1, dd => 1}->{$token->{tag_name}}) {
6623 ## has a p element in scope
6624 INSCOPE: for (reverse @{$self->{open_elements}}) {
6625 if ($_->[1] & P_EL) {
6626 !!!cp ('t353');
6627 !!!back-token; # <x>
6628 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6629 line => $token->{line}, column => $token->{column}};
6630 next B;
6631 } elsif ($_->[1] & SCOPING_EL) {
6632 !!!cp ('t354');
6633 last INSCOPE;
6634 }
6635 } # INSCOPE
6636
6637 ## Step 1
6638 my $i = -1;
6639 my $node = $self->{open_elements}->[$i];
6640 my $li_or_dtdd = {li => {li => 1},
6641 dt => {dt => 1, dd => 1},
6642 dd => {dt => 1, dd => 1}}->{$token->{tag_name}};
6643 LI: {
6644 ## Step 2
6645 if ($li_or_dtdd->{$node->[0]->manakai_local_name}) {
6646 if ($i != -1) {
6647 !!!cp ('t355');
6648 !!!parse-error (type => 'not closed',
6649 text => $self->{open_elements}->[-1]->[0]
6650 ->manakai_local_name,
6651 token => $token);
6652 } else {
6653 !!!cp ('t356');
6654 }
6655 splice @{$self->{open_elements}}, $i;
6656 last LI;
6657 } else {
6658 !!!cp ('t357');
6659 }
6660
6661 ## Step 3
6662 if (not ($node->[1] & FORMATTING_EL) and
6663 #not $phrasing_category->{$node->[1]} and
6664 ($node->[1] & SPECIAL_EL or
6665 $node->[1] & SCOPING_EL) and
6666 not ($node->[1] & ADDRESS_EL) and
6667 not ($node->[1] & DIV_EL)) {
6668 !!!cp ('t358');
6669 last LI;
6670 }
6671
6672 !!!cp ('t359');
6673 ## Step 4
6674 $i--;
6675 $node = $self->{open_elements}->[$i];
6676 redo LI;
6677 } # LI
6678
6679 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6680 !!!nack ('t359.1');
6681 !!!next-token;
6682 next B;
6683 } elsif ($token->{tag_name} eq 'plaintext') {
6684 ## has a p element in scope
6685 INSCOPE: for (reverse @{$self->{open_elements}}) {
6686 if ($_->[1] & P_EL) {
6687 !!!cp ('t367');
6688 !!!back-token; # <plaintext>
6689 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6690 line => $token->{line}, column => $token->{column}};
6691 next B;
6692 } elsif ($_->[1] & SCOPING_EL) {
6693 !!!cp ('t368');
6694 last INSCOPE;
6695 }
6696 } # INSCOPE
6697
6698 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6699
6700 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
6701
6702 !!!nack ('t368.1');
6703 !!!next-token;
6704 next B;
6705 } elsif ($token->{tag_name} eq 'a') {
6706 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
6707 my $node = $active_formatting_elements->[$i];
6708 if ($node->[1] & A_EL) {
6709 !!!cp ('t371');
6710 !!!parse-error (type => 'in a:a', token => $token);
6711
6712 !!!back-token; # <a>
6713 $token = {type => END_TAG_TOKEN, tag_name => 'a',
6714 line => $token->{line}, column => $token->{column}};
6715 $formatting_end_tag->($token);
6716
6717 AFE2: for (reverse 0..$#$active_formatting_elements) {
6718 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
6719 !!!cp ('t372');
6720 splice @$active_formatting_elements, $_, 1;
6721 last AFE2;
6722 }
6723 } # AFE2
6724 OE: for (reverse 0..$#{$self->{open_elements}}) {
6725 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
6726 !!!cp ('t373');
6727 splice @{$self->{open_elements}}, $_, 1;
6728 last OE;
6729 }
6730 } # OE
6731 last AFE;
6732 } elsif ($node->[0] eq '#marker') {
6733 !!!cp ('t374');
6734 last AFE;
6735 }
6736 } # AFE
6737
6738 $reconstruct_active_formatting_elements->($insert_to_current);
6739
6740 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6741 push @$active_formatting_elements, $self->{open_elements}->[-1];
6742
6743 !!!nack ('t374.1');
6744 !!!next-token;
6745 next B;
6746 } elsif ($token->{tag_name} eq 'nobr') {
6747 $reconstruct_active_formatting_elements->($insert_to_current);
6748
6749 ## has a |nobr| element in scope
6750 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6751 my $node = $self->{open_elements}->[$_];
6752 if ($node->[1] & NOBR_EL) {
6753 !!!cp ('t376');
6754 !!!parse-error (type => 'in nobr:nobr', token => $token);
6755 !!!back-token; # <nobr>
6756 $token = {type => END_TAG_TOKEN, tag_name => 'nobr',
6757 line => $token->{line}, column => $token->{column}};
6758 next B;
6759 } elsif ($node->[1] & SCOPING_EL) {
6760 !!!cp ('t377');
6761 last INSCOPE;
6762 }
6763 } # INSCOPE
6764
6765 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6766 push @$active_formatting_elements, $self->{open_elements}->[-1];
6767
6768 !!!nack ('t377.1');
6769 !!!next-token;
6770 next B;
6771 } elsif ($token->{tag_name} eq 'button') {
6772 ## has a button element in scope
6773 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6774 my $node = $self->{open_elements}->[$_];
6775 if ($node->[1] & BUTTON_EL) {
6776 !!!cp ('t378');
6777 !!!parse-error (type => 'in button:button', token => $token);
6778 !!!back-token; # <button>
6779 $token = {type => END_TAG_TOKEN, tag_name => 'button',
6780 line => $token->{line}, column => $token->{column}};
6781 next B;
6782 } elsif ($node->[1] & SCOPING_EL) {
6783 !!!cp ('t379');
6784 last INSCOPE;
6785 }
6786 } # INSCOPE
6787
6788 $reconstruct_active_formatting_elements->($insert_to_current);
6789
6790 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6791
6792 ## TODO: associate with $self->{form_element} if defined
6793
6794 push @$active_formatting_elements, ['#marker', ''];
6795
6796 !!!nack ('t379.1');
6797 !!!next-token;
6798 next B;
6799 } elsif ({
6800 xmp => 1,
6801 iframe => 1,
6802 noembed => 1,
6803 noframes => 1, ## NOTE: This is an "as if in head" code clone.
6804 noscript => 0, ## TODO: 1 if scripting is enabled
6805 }->{$token->{tag_name}}) {
6806 if ($token->{tag_name} eq 'xmp') {
6807 !!!cp ('t381');
6808 $reconstruct_active_formatting_elements->($insert_to_current);
6809 } else {
6810 !!!cp ('t399');
6811 }
6812 ## NOTE: There is an "as if in body" code clone.
6813 $parse_rcdata->(CDATA_CONTENT_MODEL);
6814 next B;
6815 } elsif ($token->{tag_name} eq 'isindex') {
6816 !!!parse-error (type => 'isindex', token => $token);
6817
6818 if (defined $self->{form_element}) {
6819 !!!cp ('t389');
6820 ## Ignore the token
6821 !!!nack ('t389'); ## NOTE: Not acknowledged.
6822 !!!next-token;
6823 next B;
6824 } else {
6825 !!!ack ('t391.1');
6826
6827 my $at = $token->{attributes};
6828 my $form_attrs;
6829 $form_attrs->{action} = $at->{action} if $at->{action};
6830 my $prompt_attr = $at->{prompt};
6831 $at->{name} = {name => 'name', value => 'isindex'};
6832 delete $at->{action};
6833 delete $at->{prompt};
6834 my @tokens = (
6835 {type => START_TAG_TOKEN, tag_name => 'form',
6836 attributes => $form_attrs,
6837 line => $token->{line}, column => $token->{column}},
6838 {type => START_TAG_TOKEN, tag_name => 'hr',
6839 line => $token->{line}, column => $token->{column}},
6840 {type => START_TAG_TOKEN, tag_name => 'p',
6841 line => $token->{line}, column => $token->{column}},
6842 {type => START_TAG_TOKEN, tag_name => 'label',
6843 line => $token->{line}, column => $token->{column}},
6844 );
6845 if ($prompt_attr) {
6846 !!!cp ('t390');
6847 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value},
6848 #line => $token->{line}, column => $token->{column},
6849 };
6850 } else {
6851 !!!cp ('t391');
6852 push @tokens, {type => CHARACTER_TOKEN,
6853 data => 'This is a searchable index. Insert your search keywords here: ',
6854 #line => $token->{line}, column => $token->{column},
6855 }; # SHOULD
6856 ## TODO: make this configurable
6857 }
6858 push @tokens,
6859 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at,
6860 line => $token->{line}, column => $token->{column}},
6861 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
6862 {type => END_TAG_TOKEN, tag_name => 'label',
6863 line => $token->{line}, column => $token->{column}},
6864 {type => END_TAG_TOKEN, tag_name => 'p',
6865 line => $token->{line}, column => $token->{column}},
6866 {type => START_TAG_TOKEN, tag_name => 'hr',
6867 line => $token->{line}, column => $token->{column}},
6868 {type => END_TAG_TOKEN, tag_name => 'form',
6869 line => $token->{line}, column => $token->{column}};
6870 !!!back-token (@tokens);
6871 !!!next-token;
6872 next B;
6873 }
6874 } elsif ($token->{tag_name} eq 'textarea') {
6875 my $tag_name = $token->{tag_name};
6876 my $el;
6877 !!!create-element ($el, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
6878
6879 ## TODO: $self->{form_element} if defined
6880 $self->{content_model} = RCDATA_CONTENT_MODEL;
6881 delete $self->{escape}; # MUST
6882
6883 $insert->($el);
6884
6885 my $text = '';
6886 !!!nack ('t392.1');
6887 !!!next-token;
6888 if ($token->{type} == CHARACTER_TOKEN) {
6889 $token->{data} =~ s/^\x0A//;
6890 unless (length $token->{data}) {
6891 !!!cp ('t392');
6892 !!!next-token;
6893 } else {
6894 !!!cp ('t393');
6895 }
6896 } else {
6897 !!!cp ('t394');
6898 }
6899 while ($token->{type} == CHARACTER_TOKEN) {
6900 !!!cp ('t395');
6901 $text .= $token->{data};
6902 !!!next-token;
6903 }
6904 if (length $text) {
6905 !!!cp ('t396');
6906 $el->manakai_append_text ($text);
6907 }
6908
6909 $self->{content_model} = PCDATA_CONTENT_MODEL;
6910
6911 if ($token->{type} == END_TAG_TOKEN and
6912 $token->{tag_name} eq $tag_name) {
6913 !!!cp ('t397');
6914 ## Ignore the token
6915 } else {
6916 !!!cp ('t398');
6917 !!!parse-error (type => 'in RCDATA:#eof', token => $token);
6918 }
6919 !!!next-token;
6920 next B;
6921 } elsif ($token->{tag_name} eq 'rt' or
6922 $token->{tag_name} eq 'rp') {
6923 ## has a |ruby| element in scope
6924 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6925 my $node = $self->{open_elements}->[$_];
6926 if ($node->[1] & RUBY_EL) {
6927 !!!cp ('t398.1');
6928 ## generate implied end tags
6929 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
6930 !!!cp ('t398.2');
6931 pop @{$self->{open_elements}};
6932 }
6933 unless ($self->{open_elements}->[-1]->[1] & RUBY_EL) {
6934 !!!cp ('t398.3');
6935 !!!parse-error (type => 'not closed',
6936 text => $self->{open_elements}->[-1]->[0]
6937 ->manakai_local_name,
6938 token => $token);
6939 pop @{$self->{open_elements}}
6940 while not $self->{open_elements}->[-1]->[1] & RUBY_EL;
6941 }
6942 last INSCOPE;
6943 } elsif ($node->[1] & SCOPING_EL) {
6944 !!!cp ('t398.4');
6945 last INSCOPE;
6946 }
6947 } # INSCOPE
6948
6949 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6950
6951 !!!nack ('t398.5');
6952 !!!next-token;
6953 redo B;
6954 } elsif ($token->{tag_name} eq 'math' or
6955 $token->{tag_name} eq 'svg') {
6956 $reconstruct_active_formatting_elements->($insert_to_current);
6957
6958 ## "Adjust MathML attributes" ('math' only) - done in insert-element-f
6959
6960 ## "adjust SVG attributes" ('svg' only) - done in insert-element-f
6961
6962 ## "adjust foreign attributes" - done in insert-element-f
6963
6964 !!!insert-element-f ($token->{tag_name} eq 'math' ? $MML_NS : $SVG_NS, $token->{tag_name}, $token->{attributes}, $token);
6965
6966 if ($self->{self_closing}) {
6967 pop @{$self->{open_elements}};
6968 !!!ack ('t398.1');
6969 } else {
6970 !!!cp ('t398.2');
6971 $self->{insertion_mode} |= IN_FOREIGN_CONTENT_IM;
6972 ## NOTE: |<body><math><mi><svg>| -> "in foreign content" insertion
6973 ## mode, "in body" (not "in foreign content") secondary insertion
6974 ## mode, maybe.
6975 }
6976
6977 !!!next-token;
6978 next B;
6979 } elsif ({
6980 caption => 1, col => 1, colgroup => 1, frame => 1,
6981 frameset => 1, head => 1, option => 1, optgroup => 1,
6982 tbody => 1, td => 1, tfoot => 1, th => 1,
6983 thead => 1, tr => 1,
6984 }->{$token->{tag_name}}) {
6985 !!!cp ('t401');
6986 !!!parse-error (type => 'in body',
6987 text => $token->{tag_name}, token => $token);
6988 ## Ignore the token
6989 !!!nack ('t401.1'); ## NOTE: |<col/>| or |<frame/>| here is an error.
6990 !!!next-token;
6991 next B;
6992
6993 ## ISSUE: An issue on HTML5 new elements in the spec.
6994 } else {
6995 if ($token->{tag_name} eq 'image') {
6996 !!!cp ('t384');
6997 !!!parse-error (type => 'image', token => $token);
6998 $token->{tag_name} = 'img';
6999 } else {
7000 !!!cp ('t385');
7001 }
7002
7003 ## NOTE: There is an "as if <br>" code clone.
7004 $reconstruct_active_formatting_elements->($insert_to_current);
7005
7006 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7007
7008 if ({ ## Follow-up for the element just inserted, chosen by tag name.
7009 applet => 1, marquee => 1, object => 1,
7010 }->{$token->{tag_name}}) {
7011 !!!cp ('t380');
7012 push @$active_formatting_elements, ['#marker', '']; ## Marker entry.
7013 !!!nack ('t380.1');
7014 } elsif ({
7015 b => 1, big => 1, em => 1, font => 1, i => 1,
7016 s => 1, small => 1, strike => 1, ## Was misspelled |strile|, so <strike> never matched.
7017 strong => 1, tt => 1, u => 1,
7018 }->{$token->{tag_name}}) {
7019 !!!cp ('t375');
7020 push @$active_formatting_elements, $self->{open_elements}->[-1]; ## The element just inserted.
7021 !!!nack ('t375.1');
7022 } elsif ($token->{tag_name} eq 'input') {
7023 !!!cp ('t388');
7024 ## TODO: associate with $self->{form_element} if defined
7025 pop @{$self->{open_elements}}; ## Not kept on the stack of open elements.
7026 !!!ack ('t388.2');
7027 } elsif ({
7028 area => 1, basefont => 1, bgsound => 1, br => 1,
7029 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
7030 #image => 1,
7031 }->{$token->{tag_name}}) {
7032 !!!cp ('t388.1');
7033 pop @{$self->{open_elements}}; ## Not kept on the stack of open elements.
7034 !!!ack ('t388.3');
7035 } elsif ($token->{tag_name} eq 'select') {
7036 ## TODO: associate with $self->{form_element} if defined
7037
7038 if ($self->{insertion_mode} & TABLE_IMS or
7039 $self->{insertion_mode} & BODY_TABLE_IMS or
7040 $self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
7041 !!!cp ('t400.1');
7042 $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM;
7043 } else {
7044 !!!cp ('t400.2');
7045 $self->{insertion_mode} = IN_SELECT_IM;
7046 }
7047 !!!nack ('t400.3');
7048 } else {
7049 !!!nack ('t402');
7050 }
7051
7052 !!!next-token;
7053 next B;
7054 }
7055 } elsif ($token->{type} == END_TAG_TOKEN) {
7056 if ($token->{tag_name} eq 'body') {
7057 ## has a |body| element in scope
7058 my $i;
7059 INSCOPE: {
7060 for (reverse @{$self->{open_elements}}) {
7061 if ($_->[1] & BODY_EL) {
7062 !!!cp ('t405');
7063 $i = $_;
7064 last INSCOPE;
7065 } elsif ($_->[1] & SCOPING_EL) {
7066 !!!cp ('t405.1');
7067 last;
7068 }
7069 }
7070
7071 !!!parse-error (type => 'start tag not allowed',
7072 text => $token->{tag_name}, token => $token);
7073 ## NOTE: Ignore the token.
7074 !!!next-token;
7075 next B;
7076 } # INSCOPE
7077
7078 for (@{$self->{open_elements}}) {
7079 unless ($_->[1] & ALL_END_TAG_OPTIONAL_EL) {
7080 !!!cp ('t403');
7081 !!!parse-error (type => 'not closed',
7082 text => $_->[0]->manakai_local_name,
7083 token => $token);
7084 last;
7085 } else {
7086 !!!cp ('t404');
7087 }
7088 }
7089
7090 $self->{insertion_mode} = AFTER_BODY_IM;
7091 !!!next-token;
7092 next B;
7093 } elsif ($token->{tag_name} eq 'html') {
7094 ## TODO: Update this code. It seems that the code below is not
7095 ## up-to-date, though it has same effect as speced.
7096 if (@{$self->{open_elements}} > 1 and
7097 $self->{open_elements}->[1]->[1] & BODY_EL) {
7098 ## ISSUE: There is an issue in the spec.
7099 unless ($self->{open_elements}->[-1]->[1] & BODY_EL) {
7100 !!!cp ('t406');
7101 !!!parse-error (type => 'not closed',
7102 text => $self->{open_elements}->[1]->[0]
7103 ->manakai_local_name,
7104 token => $token);
7105 } else {
7106 !!!cp ('t407');
7107 }
7108 $self->{insertion_mode} = AFTER_BODY_IM;
7109 ## reprocess
7110 next B;
7111 } else {
7112 !!!cp ('t408');
7113 !!!parse-error (type => 'unmatched end tag',
7114 text => $token->{tag_name}, token => $token);
7115 ## Ignore the token
7116 !!!next-token;
7117 next B;
7118 }
7119 } elsif ({
7120 address => 1, blockquote => 1, center => 1, dir => 1,
7121 div => 1, dl => 1, fieldset => 1, listing => 1,
7122 menu => 1, ol => 1, pre => 1, ul => 1,
7123 dd => 1, dt => 1, li => 1,
7124 applet => 1, button => 1, marquee => 1, object => 1,
7125 }->{$token->{tag_name}}) {
7126 ## has an element in scope
7127 my $i;
7128 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7129 my $node = $self->{open_elements}->[$_];
7130 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7131 !!!cp ('t410');
7132 $i = $_;
7133 last INSCOPE;
7134 } elsif ($node->[1] & SCOPING_EL) {
7135 !!!cp ('t411');
7136 last INSCOPE;
7137 }
7138 } # INSCOPE
7139
7140 unless (defined $i) { # has an element in scope
7141 !!!cp ('t413');
7142 !!!parse-error (type => 'unmatched end tag',
7143 text => $token->{tag_name}, token => $token);
7144 } else {
7145 ## Step 1. generate implied end tags
7146 while ({
7147 ## END_TAG_OPTIONAL_EL
7148 dd => ($token->{tag_name} ne 'dd'),
7149 dt => ($token->{tag_name} ne 'dt'),
7150 li => ($token->{tag_name} ne 'li'),
7151 p => 1,
7152 rt => 1,
7153 rp => 1,
7154 }->{$self->{open_elements}->[-1]->[0]->manakai_local_name}) {
7155 !!!cp ('t409');
7156 pop @{$self->{open_elements}};
7157 }
7158
7159 ## Step 2.
7160 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7161 ne $token->{tag_name}) {
7162 !!!cp ('t412');
7163 !!!parse-error (type => 'not closed',
7164 text => $self->{open_elements}->[-1]->[0]
7165 ->manakai_local_name,
7166 token => $token);
7167 } else {
7168 !!!cp ('t414');
7169 }
7170
7171 ## Step 3.
7172 splice @{$self->{open_elements}}, $i;
7173
7174 ## Step 4.
7175 $clear_up_to_marker->()
7176 if {
7177 applet => 1, button => 1, marquee => 1, object => 1,
7178 }->{$token->{tag_name}};
7179 }
7180 !!!next-token;
7181 next B;
7182 } elsif ($token->{tag_name} eq 'form') {
7183 undef $self->{form_element};
7184
7185 ## has an element in scope
7186 my $i;
7187 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7188 my $node = $self->{open_elements}->[$_];
7189 if ($node->[1] & FORM_EL) {
7190 !!!cp ('t418');
7191 $i = $_;
7192 last INSCOPE;
7193 } elsif ($node->[1] & SCOPING_EL) {
7194 !!!cp ('t419');
7195 last INSCOPE;
7196 }
7197 } # INSCOPE
7198
7199 unless (defined $i) { # has an element in scope
7200 !!!cp ('t421');
7201 !!!parse-error (type => 'unmatched end tag',
7202 text => $token->{tag_name}, token => $token);
7203 } else {
7204 ## Step 1. generate implied end tags
7205 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7206 !!!cp ('t417');
7207 pop @{$self->{open_elements}};
7208 }
7209
7210 ## Step 2.
7211 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7212 ne $token->{tag_name}) {
7213 !!!cp ('t417.1');
7214 !!!parse-error (type => 'not closed',
7215 text => $self->{open_elements}->[-1]->[0]
7216 ->manakai_local_name,
7217 token => $token);
7218 } else {
7219 !!!cp ('t420');
7220 }
7221
7222 ## Step 3.
7223 splice @{$self->{open_elements}}, $i;
7224 }
7225
7226 !!!next-token;
7227 next B;
7228 } elsif ({
7229 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
7230 }->{$token->{tag_name}}) {
7231 ## has an element in scope
7232 my $i;
7233 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7234 my $node = $self->{open_elements}->[$_];
7235 if ($node->[1] & HEADING_EL) {
7236 !!!cp ('t423');
7237 $i = $_;
7238 last INSCOPE;
7239 } elsif ($node->[1] & SCOPING_EL) {
7240 !!!cp ('t424');
7241 last INSCOPE;
7242 }
7243 } # INSCOPE
7244
7245 unless (defined $i) { # has an element in scope
7246 !!!cp ('t425.1');
7247 !!!parse-error (type => 'unmatched end tag',
7248 text => $token->{tag_name}, token => $token);
7249 } else {
7250 ## Step 1. generate implied end tags
7251 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7252 !!!cp ('t422');
7253 pop @{$self->{open_elements}};
7254 }
7255
7256 ## Step 2.
7257 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7258 ne $token->{tag_name}) {
7259 !!!cp ('t425');
7260 !!!parse-error (type => 'unmatched end tag',
7261 text => $token->{tag_name}, token => $token);
7262 } else {
7263 !!!cp ('t426');
7264 }
7265
7266 ## Step 3.
7267 splice @{$self->{open_elements}}, $i;
7268 }
7269
7270 !!!next-token;
7271 next B;
7272 } elsif ($token->{tag_name} eq 'p') {
7273 ## has an element in scope
7274 my $i;
7275 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7276 my $node = $self->{open_elements}->[$_];
7277 if ($node->[1] & P_EL) {
7278 !!!cp ('t410.1');
7279 $i = $_;
7280 last INSCOPE;
7281 } elsif ($node->[1] & SCOPING_EL) {
7282 !!!cp ('t411.1');
7283 last INSCOPE;
7284 }
7285 } # INSCOPE
7286
7287 if (defined $i) {
7288 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7289 ne $token->{tag_name}) {
7290 !!!cp ('t412.1');
7291 !!!parse-error (type => 'not closed',
7292 text => $self->{open_elements}->[-1]->[0]
7293 ->manakai_local_name,
7294 token => $token);
7295 } else {
7296 !!!cp ('t414.1');
7297 }
7298
7299 splice @{$self->{open_elements}}, $i;
7300 } else {
7301 !!!cp ('t413.1');
7302 !!!parse-error (type => 'unmatched end tag',
7303 text => $token->{tag_name}, token => $token);
7304
7305 !!!cp ('t415.1');
7306 ## As if <p>, then reprocess the current token
7307 my $el;
7308 !!!create-element ($el, $HTML_NS, 'p',, $token);
7309 $insert->($el);
7310 ## NOTE: Not inserted into |$self->{open_elements}|.
7311 }
7312
7313 !!!next-token;
7314 next B;
7315 } elsif ({
7316 a => 1,
7317 b => 1, big => 1, em => 1, font => 1, i => 1,
7318 nobr => 1, s => 1, small => 1, strile => 1,
7319 strong => 1, tt => 1, u => 1,
7320 }->{$token->{tag_name}}) {
7321 !!!cp ('t427');
7322 $formatting_end_tag->($token);
7323 next B;
7324 } elsif ($token->{tag_name} eq 'br') {
7325 !!!cp ('t428');
7326 !!!parse-error (type => 'unmatched end tag',
7327 text => 'br', token => $token);
7328
7329 ## As if <br>
7330 $reconstruct_active_formatting_elements->($insert_to_current);
7331
7332 my $el;
7333 !!!create-element ($el, $HTML_NS, 'br',, $token);
7334 $insert->($el);
7335
7336 ## Ignore the token.
7337 !!!next-token;
7338 next B;
7339 } elsif ({
7340 caption => 1, col => 1, colgroup => 1, frame => 1,
7341 frameset => 1, head => 1, option => 1, optgroup => 1,
7342 tbody => 1, td => 1, tfoot => 1, th => 1,
7343 thead => 1, tr => 1,
7344 area => 1, basefont => 1, bgsound => 1,
7345 embed => 1, hr => 1, iframe => 1, image => 1,
7346 img => 1, input => 1, isindex => 1, noembed => 1,
7347 noframes => 1, param => 1, select => 1, spacer => 1,
7348 table => 1, textarea => 1, wbr => 1,
7349 noscript => 0, ## TODO: if scripting is enabled
7350 }->{$token->{tag_name}}) {
7351 !!!cp ('t429');
7352 !!!parse-error (type => 'unmatched end tag',
7353 text => $token->{tag_name}, token => $token);
7354 ## Ignore the token
7355 !!!next-token;
7356 next B;
7357
7358 ## ISSUE: Issue on HTML5 new elements in spec
7359
7360 } else {
7361 ## Step 1
7362 my $node_i = -1;
7363 my $node = $self->{open_elements}->[$node_i];
7364
7365 ## Step 2
7366 S2: {
7367 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7368 ## Step 1
7369 ## generate implied end tags
7370 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7371 !!!cp ('t430');
7372 ## NOTE: |<ruby><rt></ruby>|.
7373 ## ISSUE: <ruby><rt></rt> will also take this code path,
7374 ## which seems wrong.
7375 pop @{$self->{open_elements}};
7376 $node_i++;
7377 }
7378
7379 ## Step 2
7380 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7381 ne $token->{tag_name}) {
7382 !!!cp ('t431');
7383 ## NOTE: <x><y></x>
7384 !!!parse-error (type => 'not closed',
7385 text => $self->{open_elements}->[-1]->[0]
7386 ->manakai_local_name,
7387 token => $token);
7388 } else {
7389 !!!cp ('t432');
7390 }
7391
7392 ## Step 3
7393 splice @{$self->{open_elements}}, $node_i if $node_i < 0;
7394
7395 !!!next-token;
7396 last S2;
7397 } else {
7398 ## Step 3
7399 if (not ($node->[1] & FORMATTING_EL) and
7400 #not $phrasing_category->{$node->[1]} and
7401 ($node->[1] & SPECIAL_EL or
7402 $node->[1] & SCOPING_EL)) {
7403 !!!cp ('t433');
7404 !!!parse-error (type => 'unmatched end tag',
7405 text => $token->{tag_name}, token => $token);
7406 ## Ignore the token
7407 !!!next-token;
7408 last S2;
7409 }
7410
7411 !!!cp ('t434');
7412 }
7413
7414 ## Step 4
7415 $node_i--;
7416 $node = $self->{open_elements}->[$node_i];
7417
7418 ## Step 5;
7419 redo S2;
7420 } # S2
7421 next B;
7422 }
7423 }
7424 next B;
7425 } continue { # B
7426 if ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
7427 ## NOTE: The code below is executed in cases where it does not have
7428 ## to be, but it is harmless even in those cases.
7429 ## has an element in scope
7430 INSCOPE: {
7431 for (reverse 0..$#{$self->{open_elements}}) {
7432 my $node = $self->{open_elements}->[$_];
7433 if ($node->[1] & FOREIGN_EL) {
7434 last INSCOPE;
7435 } elsif ($node->[1] & SCOPING_EL) {
7436 last;
7437 }
7438 }
7439
7440 ## NOTE: No foreign element in scope.
7441 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
7442 } # INSCOPE
7443 }
7444 } # B
7445
7446 ## Stop parsing # MUST
7447
7448 ## TODO: script stuffs
7449 } # _tree_construct_main
7450
## set_inner_html ($class, $node, $s, $onerror)
##
## Replaces the children of |$node| with the nodes obtained by parsing
## the string |$s| as HTML.
##
##   $node    - The node to modify.  Its |node_type| must be 9 (handled
##              by the first branch) or 1 (handled by the second branch).
##   $s       - The markup to parse.  It is accessed through a reference
##              to the caller's argument, so the string is not copied.
##   $onerror - Optional code reference.  In the |node_type| 1 branch it
##              is invoked for each parse error with name/value pairs:
##              |line| and |column| of the parser, followed by whatever
##              the error site supplies.  If it is false, a default
##              handler reports the error with |warn|.  In the
##              |node_type| 9 branch it is passed on to |parse_string|.
##
## Dies if |$node| has any other node type.
##
## NOTE: Four arguments are read although the prototype declares three.
## Perl does not apply prototypes to method calls, so this only matters
## if the subroutine is ever called as a plain function.
##
## NOTE(review): The |!!!...| lines are macros expanded by a preprocessor
## that is not part of this section.  They appear to rely on a variable
## named |$self| being in scope (cf. the bare block around
## |!!!next-token| below) -- confirm before renaming anything.
sub set_inner_html ($$$) {
  my $class = shift;
  my $node = shift;
  my $s = \$_[0];
  my $onerror = $_[1];

  ## ISSUE: Should {confident} be true?

  my $nt = $node->node_type;
  if ($nt == 9) {
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## The child list is copied before removal; presumably |child_nodes|
    ## is a live list that must not be modified while iterating.
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }

    ## Step 3, 4, 5 # MUST
    $class->parse_string ($$s => $node, $onerror);
  } elsif ($nt == 1) {
    ## TODO: If non-html element

    ## NOTE: Most of this code is copied from |parse_string|

    ## Step 1 # MUST
    ## Parse into a fresh document created by the implementation of the
    ## node's owner document; the result is moved into |$node| later.
    my $this_doc = $node->owner_document;
    my $doc = $this_doc->implementation->create_document;
    $doc->manakai_is_html (1);
    my $p = $class->new;
    $p->{document} = $doc;

    ## Step 8 # MUST
    ## |$i| is the offset of the next character of |$$s| to be read.
    my $i = 0;
    $p->{line_prev} = $p->{line} = 1;
    $p->{column_prev} = $p->{column} = 0;
    ## Character feed: each call stores the code point of the next input
    ## character (or -1 at the end of the input) in |$self->{next_char}|
    ## and updates the line/column counters of |$p|.
    $p->{set_next_char} = sub {
      my $self = shift;

      ## Shift the history of previous characters (most recent first).
      pop @{$self->{prev_char}};
      unshift @{$self->{prev_char}}, $self->{next_char};

      ## End of input.  The assignment yields -1, which is true, so the
      ## |return| is always executed when the condition holds.
      $self->{next_char} = -1 and return if $i >= length $$s;
      $self->{next_char} = ord substr $$s, $i++, 1;

      ($p->{line_prev}, $p->{column_prev}) = ($p->{line}, $p->{column});
      $p->{column}++;

      if ($self->{next_char} == 0x000A) { # LF
        $p->{line}++;
        $p->{column} = 0;
        !!!cp ('i1');
      } elsif ($self->{next_char} == 0x000D) { # CR
        ## CR and CR LF are both reported as a single LF.
        $i++ if substr ($$s, $i, 1) eq "\x0A";
        $self->{next_char} = 0x000A; # LF # MUST
        $p->{line}++;
        $p->{column} = 0;
        !!!cp ('i2');
      } elsif ($self->{next_char} > 0x10FFFF) {
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
        !!!cp ('i3');
      } elsif ($self->{next_char} == 0x0000) { # NULL
        !!!cp ('i4');
        !!!parse-error (type => 'NULL');
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
      } elsif ($self->{next_char} <= 0x0008 or
               (0x000E <= $self->{next_char} and
                $self->{next_char} <= 0x001F) or
               (0x007F <= $self->{next_char} and
                $self->{next_char} <= 0x009F) or
               (0xD800 <= $self->{next_char} and
                $self->{next_char} <= 0xDFFF) or
               (0xFDD0 <= $self->{next_char} and
                $self->{next_char} <= 0xFDDF) or
               {
                0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
                0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
                0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
                0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
                0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
                0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
                0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
                0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
                0x10FFFE => 1, 0x10FFFF => 1,
               }->{$self->{next_char}}) {
        ## These code points are reported as errors; the character
        ## itself is left unchanged.
        !!!cp ('i4.1');
        if ($self->{next_char} < 0x10000) {
          !!!parse-error (type => 'control char',
                          text => (sprintf 'U+%04X', $self->{next_char}));
        } else {
          !!!parse-error (type => 'control char',
                          text => (sprintf 'U-%08X', $self->{next_char}));
        }
      }
    };
    $p->{prev_char} = [-1, -1, -1];
    $p->{next_char} = -1;

    ## Error reporting: use the caller's handler if there is one,
    ## otherwise |warn|.  The position of the token, if the error site
    ## supplies one, takes precedence over the parser's position.
    my $ponerror = $onerror || sub {
      my (%opt) = @_;
      my $line = $opt{line};
      my $column = $opt{column};
      if (defined $opt{token} and defined $opt{token}->{line}) {
        $line = $opt{token}->{line};
        $column = $opt{token}->{column};
      }
      warn "Parse error ($opt{type}) at line $line column $column\n";
    };
    $p->{parse_error} = sub {
      $ponerror->(line => $p->{line}, column => $p->{column}, @_);
    };

    $p->_initialize_tokenizer;
    $p->_initialize_tree_constructor;

    ## Step 2
    ## The initial content model is chosen by the local name of the
    ## context node; any name not listed here gets PCDATA.
    my $node_ln = $node->manakai_local_name;
    $p->{content_model} = {
      title => RCDATA_CONTENT_MODEL,
      textarea => RCDATA_CONTENT_MODEL,
      style => CDATA_CONTENT_MODEL,
      script => CDATA_CONTENT_MODEL,
      xmp => CDATA_CONTENT_MODEL,
      iframe => CDATA_CONTENT_MODEL,
      noembed => CDATA_CONTENT_MODEL,
      noframes => CDATA_CONTENT_MODEL,
      noscript => CDATA_CONTENT_MODEL,
      plaintext => PLAINTEXT_CONTENT_MODEL,
    }->{$node_ln};
    $p->{content_model} = PCDATA_CONTENT_MODEL
        unless defined $p->{content_model};
        ## ISSUE: What is "the name of the element"? local name?

    $p->{inner_html_node} = [$node, $el_category->{$node_ln}];
      ## TODO: Foreign element OK?

    ## Step 3
    my $root = $doc->create_element_ns
      ('http://www.w3.org/1999/xhtml', [undef, 'html']);

    ## Step 4 # MUST
    $doc->append_child ($root);

    ## Step 5 # MUST
    push @{$p->{open_elements}}, [$root, $el_category->{html}];

    undef $p->{head_element};

    ## Step 6 # MUST
    $p->_reset_insertion_mode;

    ## Step 7 # MUST
    ## Walk from the context node up through its ancestors and remember
    ## the first |form| element in the XHTML namespace, if any.
    my $anode = $node;
    AN: while (defined $anode) {
      if ($anode->node_type == 1) {
        my $nsuri = $anode->namespace_uri;
        if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
          if ($anode->manakai_local_name eq 'form') {
            !!!cp ('i5');
            $p->{form_element} = $anode;
            last AN;
          }
        }
      }
      $anode = $anode->parent_node;
    } # AN

    ## Step 9 # MUST
    {
      my $self = $p;
      !!!next-token;
    }
    $p->_tree_construction_main;

    ## Step 10 # MUST
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }
    ## ISSUE: mutation events? read-only?

    ## Step 11 # MUST
    ## Move the parsed nodes from the temporary root into |$node|,
    ## adopting each into the node's owner document first.
    @cn = @{$root->child_nodes};
    for (@cn) {
      $this_doc->adopt_node ($_);
      $node->append_child ($_);
    }
    ## ISSUE: mutation events?

    $p->_terminate_tree_constructor;

    ## Both closures stored in |$p| capture |$p| itself, so each forms a
    ## reference cycle.  Remove both so that the parser can be freed.
    delete $p->{parse_error}; # delete loop
    delete $p->{set_next_char}; # delete loop
  } else {
    die "$0: |set_inner_html| is not defined for node of type $nt";
  }
} # set_inner_html
7651
7652 } # tree construction stage
7653
## Exception class: a subclass of |Error| (it is appended to |@ISA| below).
## NOTE(review): judging by its name it is presumably thrown to make the
## parser start over; the throw/catch sites are not in this part of the
## file -- confirm.
7654 package Whatpm::HTML::RestartParser;
7655 push our @ISA, 'Error';
7656
7657 1; # The module file must end with a true value.
7658 # $Date: 2008/08/30 12:57:05 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24