/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.158 - (show annotations) (download) (as text)
Sun Aug 31 12:11:42 2008 UTC (17 years, 4 months ago) by wakaba
Branch: MAIN
Changes since 1.157: +17 -32 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	31 Aug 2008 11:27:22 -0000
	* tree-test-1.dat: Test data for after after frameset
	insertion mode are added (cf. HTML5 revision 1909).

2008-08-31  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	31 Aug 2008 11:28:41 -0000
2008-08-31  Wakaba  <wakaba@suika.fam.cx>

	* HTML.pm.src: Bug fix and sync with the spec with regard
	to after after frameset insertion mode processing (HTML5
	revision 1909).  Note that the implementation was wrong
	per the old spec before the r1909 changes.

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.157 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 use Error qw(:try);
5
6 ## ISSUE:
7 ## var doc = implementation.createDocument (null, null, null);
8 ## doc.write ('');
9 ## alert (doc.compatMode);
10
11 require IO::Handle;
12
## Namespace URIs referenced throughout this module.
my ($HTML_NS, $MML_NS, $SVG_NS, $XLINK_NS, $XML_NS, $XMLNS_NS) = (
  'http://www.w3.org/1999/xhtml',         # HTML
  'http://www.w3.org/1998/Math/MathML',   # MathML
  'http://www.w3.org/2000/svg',           # SVG
  'http://www.w3.org/1999/xlink',         # XLink
  'http://www.w3.org/XML/1998/namespace', # xml: prefix
  'http://www.w3.org/2000/xmlns/',        # xmlns declarations
);
19
## Element category flags.  Every flag owns exactly one bit, so a set of
## categories is the bitwise OR of its members and membership is tested
## with a bitwise AND.
sub A_EL () { 1 << 0 }
sub ADDRESS_EL () { 1 << 1 }
sub BODY_EL () { 1 << 2 }
sub BUTTON_EL () { 1 << 3 }
sub CAPTION_EL () { 1 << 4 }
sub DD_EL () { 1 << 5 }
sub DIV_EL () { 1 << 6 }
sub DT_EL () { 1 << 7 }
sub FORM_EL () { 1 << 8 }
sub FORMATTING_EL () { 1 << 9 }
sub FRAMESET_EL () { 1 << 10 }
sub HEADING_EL () { 1 << 11 }
sub HTML_EL () { 1 << 12 }
sub LI_EL () { 1 << 13 }
sub NOBR_EL () { 1 << 14 }
sub OPTION_EL () { 1 << 15 }
sub OPTGROUP_EL () { 1 << 16 }
sub P_EL () { 1 << 17 }
sub SELECT_EL () { 1 << 18 }
sub TABLE_EL () { 1 << 19 }
sub TABLE_CELL_EL () { 1 << 20 }
sub TABLE_ROW_EL () { 1 << 21 }
sub TABLE_ROW_GROUP_EL () { 1 << 22 }
sub MISC_SCOPING_EL () { 1 << 23 }
sub MISC_SPECIAL_EL () { 1 << 24 }
sub FOREIGN_EL () { 1 << 25 }
sub FOREIGN_FLOW_CONTENT_EL () { 1 << 26 }
sub MML_AXML_EL () { 1 << 27 }
sub RUBY_EL () { 1 << 28 }
sub RUBY_COMPONENT_EL () { 1 << 29 }

## Table, row and row group categories combined.
sub TABLE_ROWS_EL () { TABLE_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL }

## NOTE: Used in "generate implied end tags" algorithm.
## NOTE: There is a code where a modified version of END_TAG_OPTIONAL_EL
## is used in "generate implied end tags" implementation (search for the
## function mae).
sub END_TAG_OPTIONAL_EL () {
  DD_EL | DT_EL | LI_EL | P_EL | RUBY_COMPONENT_EL
}

## NOTE: Used in </body> and EOF algorithms.
sub ALL_END_TAG_OPTIONAL_EL () {
  DD_EL | DT_EL | LI_EL | P_EL |
  BODY_EL | HTML_EL |
  TABLE_CELL_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL
}

## Categories combined under the name "scoping".
sub SCOPING_EL () {
  BUTTON_EL | CAPTION_EL | HTML_EL |
  TABLE_EL | TABLE_CELL_EL | MISC_SCOPING_EL
}

## Scope boundary sets for table, row group and row lookups, respectively.
sub TABLE_SCOPING_EL () { HTML_EL | TABLE_EL }

sub TABLE_ROWS_SCOPING_EL () { HTML_EL | TABLE_ROW_GROUP_EL }

sub TABLE_ROW_SCOPING_EL () { HTML_EL | TABLE_ROW_EL }

## Categories combined under the name "special".
sub SPECIAL_EL () {
  ADDRESS_EL | BODY_EL | DIV_EL |
  DD_EL | DT_EL | LI_EL | P_EL |
  FORM_EL | FRAMESET_EL | HEADING_EL |
  OPTION_EL | OPTGROUP_EL | SELECT_EL |
  TABLE_ROW_EL | TABLE_ROW_GROUP_EL |
  MISC_SPECIAL_EL
}
127
## Maps an HTML element local name to its category flags.  Entries are
## grouped by category; names that are not listed have no entry.
my $el_category = {
  a => A_EL | FORMATTING_EL,
  nobr => NOBR_EL | FORMATTING_EL,
  (map { ($_ => FORMATTING_EL) } qw(
    b big em font i s small strike strong tt u
  )),

  (map { ($_ => MISC_SCOPING_EL) } qw(applet marquee object)),

  (map { ($_ => MISC_SPECIAL_EL) } qw(
    area base basefont bgsound blockquote br center col colgroup dir dl
    embed fieldset frame head hr iframe img input isindex link listing
    menu meta noembed noframes noscript ol param plaintext pre script
    spacer style textarea title ul wbr
  )),

  (map { ($_ => HEADING_EL) } qw(h1 h2 h3 h4 h5 h6)),
  (map { ($_ => RUBY_COMPONENT_EL) } qw(rp rt)),
  (map { ($_ => TABLE_ROW_GROUP_EL) } qw(tbody tfoot thead)),
  (map { ($_ => TABLE_CELL_EL) } qw(td th)),

  address => ADDRESS_EL,
  body => BODY_EL,
  button => BUTTON_EL,
  caption => CAPTION_EL,
  dd => DD_EL,
  div => DIV_EL,
  dt => DT_EL,
  form => FORM_EL,
  frameset => FRAMESET_EL,
  html => HTML_EL,
  li => LI_EL,
  optgroup => OPTGROUP_EL,
  option => OPTION_EL,
  p => P_EL,
  ruby => RUBY_EL,
  select => SELECT_EL,
  table => TABLE_EL,
  tr => TABLE_ROW_EL,
};
215
## Category flags for selected non-HTML elements, keyed by namespace URI
## and then by local name.
my $el_category_f = {
  $MML_NS => +{
    'annotation-xml' => MML_AXML_EL,
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(mi mo mn ms mtext)),
  },
  $SVG_NS => +{
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(foreignObject desc title)),
  },
  ## NOTE: In addition, FOREIGN_EL is set to non-HTML elements.
};
232
## Maps the all-lowercase form of an SVG attribute name to its mixed-case
## spelling.  Each key is simply the lowercased value, so the table is
## derived from the list of mixed-case names.
my $svg_attr_name = {
  map { (lc ($_) => $_) } qw(
    attributeName attributeType
    baseFrequency baseProfile
    calcMode clipPathUnits contentScriptType contentStyleType
    diffuseConstant
    edgeMode externalResourcesRequired
    filterRes filterUnits
    glyphRef gradientTransform gradientUnits
    kernelMatrix kernelUnitLength keyPoints keySplines keyTimes
    lengthAdjust limitingConeAngle
    markerHeight markerUnits markerWidth maskContentUnits maskUnits
    numOctaves
    pathLength patternContentUnits patternTransform patternUnits
    pointsAtX pointsAtY pointsAtZ
    preserveAlpha preserveAspectRatio primitiveUnits
    refX refY repeatCount repeatDur requiredExtensions requiredFeatures
    specularConstant specularExponent spreadMethod startOffset
    stdDeviation stitchTiles surfaceScale systemLanguage
    tableValues targetX targetY textLength
    viewBox viewTarget
    xChannelSelector yChannelSelector
    zoomAndPan
  )
};
297
## Maps a qualified attribute name to [namespace URI, [prefix, local name]].
my $foreign_attr_xname = {
  ## Names in the XLink and XML namespaces: prefix and local name are the
  ## two halves of the qualified name itself.
  (map {
    my ($prefix, $local_name) = split /:/, $_, 2;
    my $ns = $prefix eq 'xlink' ? $XLINK_NS : $XML_NS;
    ($_ => [$ns, [$prefix, $local_name]]);
  } qw(
    xlink:actuate xlink:arcrole xlink:href xlink:role
    xlink:show xlink:title xlink:type
    xml:base xml:lang xml:space
  )),

  ## Namespace declarations.
  'xmlns' => [$XMLNS_NS, [undef, 'xmlns']],
  'xmlns:xlink' => [$XMLNS_NS, ['xmlns', 'xlink']],
};
312
313 ## ISSUE: xmlns:xlink="non-xlink-ns" is not an error.
314
## Replacement code points for the code points 0x80..0x9F, keyed by the
## original code point.  The table is written positionally: the entry at
## index N belongs to code point 0x80 + N.
my $c1_entity_char = do {
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 80..87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 88..8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 90..97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 98..9F
  );
  +{ map { (0x80 + $_ => $replacement[$_]) } 0..$#replacement };
}; # $c1_entity_char
349
## Parses an HTML document held in a byte string.
##
## Arguments, after the invocant:
##   $charset_name - encoding label handed on to |parse_byte_stream|
##                   (may be undef)
##   $bytes        - the byte string, or a reference to it
##   ...           - any further arguments are handed on unchanged
##
## Returns whatever |parse_byte_stream| returns.  Dies if the string
## cannot be opened as an in-memory file.
sub parse_byte_string ($$$$;$) {
  my $self = shift;
  my $charset_name = shift;
  my $source = ref $_[0] ? $_[0] : \($_[0]);
  ## Fail here with a meaningful message instead of letting an undefined
  ## handle reach |parse_byte_stream|.
  open my $input, '<', $source
      or die "parse_byte_string: cannot open the byte string for reading: $!\n";
  return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]);
} # parse_byte_string
356
## Parses an HTML document read from a byte stream, choosing the character
## encoding first.
##
## Arguments, after the invocant (an instance, or a class name on which
## |new| is called):
##   $charset_name - encoding label to try first, or undef
##   $byte_stream  - object from which bytes are read with |getc|
##   $_[1]         - handed on to |parse_char_stream| (its document argument)
##   $_[2]         - optional error callback; a |warn|ing one is used if false
##   ...           - any further arguments are handed on unchanged
##
## The encoding is chosen by trying, in order: the given label, a byte
## order mark in the first 1024 bytes, |Whatpm::Charset::UniversalCharDet|,
## and finally |windows-1252|.  |$self->{confident}| records whether the
## choice is considered certain.
##
## While parsing, |$self->{change_encoding}| may throw
## |Whatpm::HTML::RestartParser|; the document is then parsed again with
## the newly selected decoder.
##
## Returns whatever |parse_char_stream| returns.
sub parse_byte_stream ($$$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $charset_name = shift;
  my $byte_stream = $_[0];

  my $onerror = $_[2] || sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type})\n";
  };
  $self->{parse_error} = $onerror; # updated later by parse_char_string

  ## HTML5 encoding sniffing algorithm
  require Message::Charset::Info;
  my $charset;
  my $buffer;
  my ($char_stream, $e_status);

  ## Records the encoding currently held in |$charset| and reports the
  ## limitations of its decoder as flagged in |$e_status|.  Used both
  ## after sniffing and after a parser restart.
  my $report_decoder_status = sub {
    $self->{input_encoding} = $charset->get_iana_name;
    if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
      !!!parse-error (type => 'chardecode:fallback',
                      text => $self->{input_encoding},
                      level => $self->{level}->{uncertain},
                      line => 1, column => 1,
                      layer => 'encode');
    } elsif (not ($e_status &
                  Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
      !!!parse-error (type => 'chardecode:no error',
                      text => $self->{input_encoding},
                      level => $self->{level}->{uncertain},
                      line => 1, column => 1,
                      layer => 'encode');
    }
  }; # $report_decoder_status

  SNIFFING: {

    ## Step 1
    if (defined $charset_name) {
      $charset = Message::Charset::Info->get_by_iana_name ($charset_name);

      ## ISSUE: Unsupported encoding is not ignored according to the spec.
      ($char_stream, $e_status) = $charset->get_decode_handle
          ($byte_stream, allow_error_reporting => 1,
           allow_fallback => 1);
      if ($char_stream) {
        $self->{confident} = 1;
        last SNIFFING;
      } else {
        ## TODO: unsupported error
      }
    }

    ## Step 2
    my $byte_buffer = '';
    for (1..1024) {
      my $char = $byte_stream->getc;
      last unless defined $char;
      $byte_buffer .= $char;
    } ## TODO: timeout

    ## Step 3: byte order marks.  The signatures cannot be prefixes of
    ## one another, so at most one of them matches.
    for my $bom (["\xFE\xFF" => 'utf-16be'],
                 ["\xFF\xFE" => 'utf-16le'],
                 ["\xEF\xBB\xBF" => 'utf-8']) {
      my ($signature, $bom_charset_name) = @$bom;
      next unless substr ($byte_buffer, 0, length $signature) eq $signature;

      $charset = Message::Charset::Info->get_by_iana_name ($bom_charset_name);
      ($char_stream, $e_status) = $charset->get_decode_handle
          ($byte_stream, allow_error_reporting => 1,
           allow_fallback => 1, byte_buffer => \$byte_buffer);
      $self->{confident} = 1;
      last SNIFFING;
    }

    ## Step 4
    ## TODO: <meta charset>

    ## Step 5
    ## TODO: from history

    ## Step 6
    require Whatpm::Charset::UniversalCharDet;
    $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string
        ($byte_buffer);
    if (defined $charset_name) {
      $charset = Message::Charset::Info->get_by_iana_name ($charset_name);

      ## ISSUE: Unsupported encoding is not ignored according to the spec.
      require Whatpm::Charset::DecodeHandle;
      $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
          ($byte_stream);
      ($char_stream, $e_status) = $charset->get_decode_handle
          ($buffer, allow_error_reporting => 1,
           allow_fallback => 1, byte_buffer => \$byte_buffer);
      if ($char_stream) {
        $buffer->{buffer} = $byte_buffer;
        !!!parse-error (type => 'sniffing:chardet',
                        text => $charset_name,
                        level => $self->{level}->{info},
                        layer => 'encode',
                        line => 1, column => 1);
        $self->{confident} = 0;
        last SNIFFING;
      }
    }

    ## Step 7: default
    ## TODO: Make this configurable.
    $charset = Message::Charset::Info->get_by_iana_name ('windows-1252');
    ## NOTE: We choose |windows-1252| here, since |utf-8| should be
    ## detectable in the step 6.
    require Whatpm::Charset::DecodeHandle;
    $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
        ($byte_stream);
    ($char_stream, $e_status)
        = $charset->get_decode_handle ($buffer,
                                       allow_error_reporting => 1,
                                       allow_fallback => 1,
                                       byte_buffer => \$byte_buffer);
    $buffer->{buffer} = $byte_buffer;
    !!!parse-error (type => 'sniffing:default',
                    text => 'windows-1252',
                    level => $self->{level}->{info},
                    line => 1, column => 1,
                    layer => 'encode');
    $self->{confident} = 0;
  } # SNIFFING

  $report_decoder_status->();

  ## Invoked with the parser, the new encoding label and the token that
  ## carried it.  It shares |$charset|, |$char_stream| and |$e_status| with
  ## the enclosing sub, so that the restart handler below sees its choice.
  $self->{change_encoding} = sub {
    my $self = shift;
    $charset_name = shift;
    my $token = shift;

    $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
    ($char_stream, $e_status) = $charset->get_decode_handle
        ($byte_stream, allow_error_reporting => 1, allow_fallback => 1,
         byte_buffer => \ $buffer->{buffer});

    if ($char_stream) { # if supported
      ## "Change the encoding" algorithm:

      ## Step 1
      if ($charset->{category} &
          Message::Charset::Info::CHARSET_CATEGORY_UTF16 ()) {
        $charset = Message::Charset::Info->get_by_iana_name ('utf-8');
        ($char_stream, $e_status) = $charset->get_decode_handle
            ($byte_stream,
             byte_buffer => \ $buffer->{buffer});
      }
      $charset_name = $charset->get_iana_name;

      ## Step 2
      if (defined $self->{input_encoding} and
          $self->{input_encoding} eq $charset_name) {
        !!!parse-error (type => 'charset label:matching',
                        text => $charset_name,
                        level => $self->{level}->{info});
        $self->{confident} = 1;
        return;
      }

      !!!parse-error (type => 'charset label detected',
                      text => $self->{input_encoding},
                      value => $charset_name,
                      level => $self->{level}->{warn},
                      token => $token);

      ## Step 3
      # if (can) {
        ## change the encoding on the fly.
        #$self->{confident} = 1;
        #return;
      # }

      ## Step 4
      throw Whatpm::HTML::RestartParser ();
    }
  }; # $self->{change_encoding}

  ## Decoder error callback: reports the error at the current position and,
  ## if the offending octets are supplied, substitutes U+FFFD for them.
  my $char_onerror = sub {
    my (undef, $type, %opt) = @_;
    !!!parse-error (layer => 'encode',
                    %opt, type => $type,
                    line => $self->{line}, column => $self->{column} + 1);
    if ($opt{octets}) {
      ${$opt{octets}} = "\x{FFFD}"; # replacement character
    }
  };
  $char_stream->onerror ($char_onerror);

  my @args = @_; shift @args; # $s
  my $return;
  try {
    $return = $self->parse_char_stream ($char_stream, @args);
  } catch Whatpm::HTML::RestartParser with {
    ## NOTE: Invoked after {change_encoding}.

    $report_decoder_status->();
    $self->{confident} = 1;
    $char_stream->onerror ($char_onerror);
    $return = $self->parse_char_stream ($char_stream, @args);
  };
  return $return;
} # parse_byte_stream
584
585 ## NOTE: The HTML5 spec says that the encoding layer MUST NOT strip the BOM
586 ## and the HTML layer MUST ignore it. However, we do strip the BOM in
587 ## the encoding layer and the HTML layer does not ignore any U+FEFF,
588 ## because the core part of our HTML parser expects a string of characters,
589 ## not a string of bytes or code units or anything which might contain a BOM.
590 ## Therefore, any parser interface that accepts a string of bytes,
591 ## such as |parse_byte_string| in this module, must ensure that it does
592 ## strip the BOM and never strips any ZWNBSP.
593
## Parses an HTML document held in a string of characters.
##
## Arguments, after the invocant:
##   $chars - the character string, or a reference to it
##   ...    - any further arguments are handed on to |parse_char_stream|
##
## Returns whatever |parse_char_stream| returns.  Dies if the in-memory
## file cannot be opened.
##
## An in-memory file is a sequence of bytes, and opening one directly on
## a string that holds code points above 0xFF fails.  Therefore a copy of
## the string is encoded to UTF-8 and read back through the |:utf8| layer,
## which yields the same characters whatever the internal representation
## of the caller's string is.
sub parse_char_string ($$$;$) {
  my $self = shift;
  require utf8;
  my $s = ref $_[0] ? $_[0] : \($_[0]);
  my $octets = defined $$s ? $$s : '';
  utf8::encode ($octets);
  open my $input, '<:utf8', \$octets
      or die "parse_char_string: cannot open the string for reading: $!\n";
  return $self->parse_char_stream ($input, @_[1..$#_]);
} # parse_char_string
*parse_string = \&parse_char_string;
602
## Parses an HTML document read from a stream of characters.
##
## Arguments, after the invocant (an instance, or a class name on which
## |new| is called):
##   $input    - object from which characters are read with |getc|
##   $document - document object; its child node list is emptied first
##   $onerror  - optional error callback; if false, errors are reported
##               with |warn|
##
## Installs the |set_next_char| and |parse_error| callbacks, runs the
## tokenizer and the tree constructor, and returns the document.
sub parse_char_stream ($$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $input = $_[0];
  $self->{document} = $_[1];
  @{$self->{document}->child_nodes} = ();

  ## NOTE: |set_inner_html| copies most of this method's code

  $self->{confident} = 1 unless exists $self->{confident};
  $self->{document}->input_encoding ($self->{input_encoding})
      if defined $self->{input_encoding};

  $self->{line_prev} = $self->{line} = 1;
  $self->{column_prev} = $self->{column} = 0;

  ## Reads the next input character into |$self->{next_char}| as a code
  ## point (-1 at the end of the input), keeps the three previous values
  ## in |$self->{prev_char}| and maintains the line and column counters.
  $self->{set_next_char} = sub {
    my $self = shift;

    pop @{$self->{prev_char}};
    unshift @{$self->{prev_char}}, $self->{next_char};

    ## A character read ahead while handling a CR is consumed first.
    my $char;
    if (defined $self->{next_next_char}) {
      $char = delete $self->{next_next_char};
    } else {
      $char = $input->getc;
    }
    unless (defined $char) {
      $self->{next_char} = -1;
      return;
    }
    my $code = $self->{next_char} = ord $char;

    ($self->{line_prev}, $self->{column_prev})
        = ($self->{line}, $self->{column});
    $self->{column}++;

    if ($code == 0x000A) { # LF
      !!!cp ('j1');
      $self->{line}++;
      $self->{column} = 0;
    } elsif ($code == 0x000D) { # CR
      !!!cp ('j2');
      ## CR and CR LF are both reported as a single LF.
      my $next = $input->getc;
      if (defined $next and $next ne "\x0A") {
        $self->{next_next_char} = $next;
      }
      $self->{next_char} = 0x000A; # LF # MUST
      $self->{line}++;
      $self->{column} = 0;
    } elsif ($code > 0x10FFFF) {
      !!!cp ('j3');
      $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($code == 0x0000) { # NULL
      !!!cp ('j4');
      !!!parse-error (type => 'NULL');
      $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($code <= 0x0008 or
             (0x000E <= $code and $code <= 0x001F) or
             (0x007F <= $code and $code <= 0x009F) or
             (0xD800 <= $code and $code <= 0xDFFF) or
             (0xFDD0 <= $code and $code <= 0xFDDF) or
             ## U+xxFFFE and U+xxFFFF of every plane.  Code points above
             ## U+10FFFF have been handled by an earlier branch, so this
             ## matches exactly U+FFFE, U+FFFF, ..., U+10FFFE, U+10FFFF.
             ($code & 0xFFFE) == 0xFFFE) {
      !!!cp ('j5');
      if ($code < 0x10000) {
        !!!parse-error (type => 'control char',
                        text => (sprintf 'U+%04X', $code));
      } else {
        !!!parse-error (type => 'control char',
                        text => (sprintf 'U-%08X', $code));
      }
    }
  };
  $self->{prev_char} = [-1, -1, -1];
  $self->{next_char} = -1;

  my $onerror = $_[2] || sub {
    my (%opt) = @_;
    my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
    my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
    warn "Parse error ($opt{type}) at line $line column $column\n";
  };
  ## The current position is supplied first, so that a |line| or |column|
  ## given by the reporter takes precedence.
  $self->{parse_error} = sub {
    $onerror->(line => $self->{line}, column => $self->{column}, @_);
  };

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  delete $self->{parse_error}; # remove loop

  return $self->{document};
} # parse_char_stream
706
## Creates a parser object.
##
## The object starts with the default error level codes and with
## placeholder callbacks.  The default |set_next_char| callback only
## reports the end of the input; the other callbacks do nothing.
sub new ($) {
  my $class = shift;
  my $self = bless {
    level => {must => 'm',
              warn => 'w',
              info => 'i',
              uncertain => 'u'},
  }, $class;

  ## The callback is stored in the object itself, so it must not hold a
  ## strong reference to the object; otherwise the two would keep each
  ## other alive and the object would never be destroyed.
  require Scalar::Util;
  my $weak_self = $self;
  Scalar::Util::weaken ($weak_self);
  $self->{set_next_char} = sub {
    $weak_self->{next_char} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new
732
## Content model feature bits.
use constant {
  CM_ENTITY => 0b001,         # & markup in data
  CM_LIMITED_MARKUP => 0b010, # < markup in data (limited)
  CM_FULL_MARKUP => 0b100,    # < markup in data (any)
};

## Content models, expressed as combinations of the feature bits.
use constant {
  PLAINTEXT_CONTENT_MODEL => 0,
  CDATA_CONTENT_MODEL => CM_LIMITED_MARKUP,
  RCDATA_CONTENT_MODEL => CM_ENTITY | CM_LIMITED_MARKUP,
  PCDATA_CONTENT_MODEL => CM_ENTITY | CM_FULL_MARKUP,
};
741
## Tokenizer states (values of |$self->{state}|).
use constant {
  DATA_STATE => 0,
  ENTITY_DATA_STATE => 1,
  TAG_OPEN_STATE => 2,
  CLOSE_TAG_OPEN_STATE => 3,
  TAG_NAME_STATE => 4,
  BEFORE_ATTRIBUTE_NAME_STATE => 5,
  ATTRIBUTE_NAME_STATE => 6,
  AFTER_ATTRIBUTE_NAME_STATE => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE => 11,
  ENTITY_IN_ATTRIBUTE_VALUE_STATE => 12,
  MARKUP_DECLARATION_OPEN_STATE => 13,
  COMMENT_START_STATE => 14,
  COMMENT_START_DASH_STATE => 15,
  COMMENT_STATE => 16,
  COMMENT_END_STATE => 17,
  COMMENT_END_DASH_STATE => 18,
  BOGUS_COMMENT_STATE => 19,
  DOCTYPE_STATE => 20,
  BEFORE_DOCTYPE_NAME_STATE => 21,
  DOCTYPE_NAME_STATE => 22,
  AFTER_DOCTYPE_NAME_STATE => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE => 31,
  BOGUS_DOCTYPE_STATE => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE => 33,
  SELF_CLOSING_START_TAG_STATE => 34,
  CDATA_BLOCK_STATE => 35,
};
778
## Token types (values of a token's |type| member).
use constant {
  DOCTYPE_TOKEN => 1,
  COMMENT_TOKEN => 2,
  START_TAG_TOKEN => 3,
  END_TAG_TOKEN => 4,
  END_OF_FILE_TOKEN => 5,
  CHARACTER_TOKEN => 6,
};
785
## Insertion mode group bits.
use constant {
  AFTER_HTML_IMS => 1 << 2,
  HEAD_IMS => 1 << 3,
  BODY_IMS => 1 << 4,
  BODY_TABLE_IMS => 1 << 5,
  TABLE_IMS => 1 << 6,
  ROW_IMS => 1 << 7,
  BODY_AFTER_IMS => 1 << 8,
  FRAME_IMS => 1 << 9,
  SELECT_IMS => 1 << 10,
  IN_FOREIGN_CONTENT_IM => 1 << 11,
};
## NOTE: "in foreign content" insertion mode is special; it is combined
## with the secondary insertion mode.  In this parser, they are stored
## together in the bit-or'ed form.

## NOTE: "initial" and "before html" insertion modes have no constants.

## Insertion modes: a group bit pattern, plus a small number where several
## modes belong to the same group.
use constant {
  ## NOTE: "after after body" insertion mode.
  AFTER_HTML_BODY_IM => AFTER_HTML_IMS | BODY_AFTER_IMS,

  ## NOTE: "after after frameset" insertion mode.
  AFTER_HTML_FRAMESET_IM => AFTER_HTML_IMS | FRAME_IMS,

  IN_HEAD_IM => HEAD_IMS | 0b00,
  IN_HEAD_NOSCRIPT_IM => HEAD_IMS | 0b01,
  AFTER_HEAD_IM => HEAD_IMS | 0b10,
  BEFORE_HEAD_IM => HEAD_IMS | 0b11,
  IN_BODY_IM => BODY_IMS,
  IN_CELL_IM => BODY_IMS | BODY_TABLE_IMS | 0b01,
  IN_CAPTION_IM => BODY_IMS | BODY_TABLE_IMS | 0b10,
  IN_ROW_IM => TABLE_IMS | ROW_IMS | 0b01,
  IN_TABLE_BODY_IM => TABLE_IMS | ROW_IMS | 0b10,
  IN_TABLE_IM => TABLE_IMS,
  AFTER_BODY_IM => BODY_AFTER_IMS,
  IN_FRAMESET_IM => FRAME_IMS | 0b01,
  AFTER_FRAMESET_IM => FRAME_IMS | 0b10,
  IN_SELECT_IM => SELECT_IMS | 0b01,
  IN_SELECT_IN_TABLE_IM => SELECT_IMS | 0b10,
  IN_COLUMN_GROUP_IM => 0b10,
};
824
825 ## Implementations MUST act as if they used the state machine described in the spec.
826
## Puts the tokenizer into its starting condition: data state, PCDATA
## content model, no token or attribute under construction, empty queues.
## The first input character is read as part of the initialization.
sub _initialize_tokenizer ($) {
  my $self = shift;
  $self->{state} = DATA_STATE; # MUST
  $self->{content_model} = PCDATA_CONTENT_MODEL; # be
  ## |current_token| is a start tag, end tag, comment, or DOCTYPE token.
  $self->{$_} = undef
      for qw(current_token current_attribute
             last_emitted_start_tag_name last_attribute_value_state);
  delete $self->{self_closing};
  $self->{char} = [];
  # $self->{next_char}
  !!!next-input-character;
  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
842
843 ## A token has:
844 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
845 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
846 ## ->{name} (DOCTYPE_TOKEN)
847 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
848 ## ->{public_identifier} (DOCTYPE_TOKEN)
849 ## ->{system_identifier} (DOCTYPE_TOKEN)
850 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
851 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
852 ## ->{name}
853 ## ->{value}
854 ## ->{has_reference} == 1 or 0
855 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
856 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
857 ## A token's |->{self_closing}| member is used to save the value of
858 ## |$self->{self_closing}| while the token is pushed back to the stack.
859
860 ## Emitted token MUST immediately be handled by the tree construction state.
861
862 ## Before each step, UA MAY check to see if either one of the scripts in
863 ## "list of scripts that will execute as soon as possible" or the first
864 ## script in the "list of scripts that will execute asynchronously",
865 ## has completed loading. If one has, then it MUST be executed
866 ## and removed from the list.
867
868 ## NOTE: HTML5 "Writing HTML documents" section, applied to
869 ## documents and not to user agents and conformance checkers,
870 ## contains some requirements that are not detected by the
871 ## parsing algorithm:
872 ## - Some requirements on character encoding declarations. ## TODO
873 ## - "Elements MUST NOT contain content that their content model disallows."
874 ## ... Some are parse error, some are not (will be reported by c.c.).
875 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
876 ## - Text (in elements, attributes, and comments) SHOULD NOT contain
877 ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
878
879 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
880 ## be detected by the HTML5 parsing algorithm:
881 ## - Text,
882
883 sub _get_next_token ($) {
884 my $self = shift;
885
886 if ($self->{self_closing}) {
887 !!!parse-error (type => 'nestc', token => $self->{current_token});
888 ## NOTE: The |self_closing| flag is only set by start tag token.
889 ## In addition, when a start tag token is emitted, it is always set to
890 ## |current_token|.
891 delete $self->{self_closing};
892 }
893
894 if (@{$self->{token}}) {
895 $self->{self_closing} = $self->{token}->[0]->{self_closing};
896 return shift @{$self->{token}};
897 }
898
899 A: {
900 if ($self->{state} == DATA_STATE) {
901 if ($self->{next_char} == 0x0026) { # &
902 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
903 not $self->{escape}) {
904 !!!cp (1);
905 $self->{state} = ENTITY_DATA_STATE;
906 !!!next-input-character;
907 redo A;
908 } else {
909 !!!cp (2);
910 #
911 }
912 } elsif ($self->{next_char} == 0x002D) { # -
913 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
914 unless ($self->{escape}) {
915 if ($self->{prev_char}->[0] == 0x002D and # -
916 $self->{prev_char}->[1] == 0x0021 and # !
917 $self->{prev_char}->[2] == 0x003C) { # <
918 !!!cp (3);
919 $self->{escape} = 1;
920 } else {
921 !!!cp (4);
922 }
923 } else {
924 !!!cp (5);
925 }
926 }
927
928 #
929 } elsif ($self->{next_char} == 0x003C) { # <
930 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
931 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
932 not $self->{escape})) {
933 !!!cp (6);
934 $self->{state} = TAG_OPEN_STATE;
935 !!!next-input-character;
936 redo A;
937 } else {
938 !!!cp (7);
939 #
940 }
941 } elsif ($self->{next_char} == 0x003E) { # >
942 if ($self->{escape} and
943 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
944 if ($self->{prev_char}->[0] == 0x002D and # -
945 $self->{prev_char}->[1] == 0x002D) { # -
946 !!!cp (8);
947 delete $self->{escape};
948 } else {
949 !!!cp (9);
950 }
951 } else {
952 !!!cp (10);
953 }
954
955 #
956 } elsif ($self->{next_char} == -1) {
957 !!!cp (11);
958 !!!emit ({type => END_OF_FILE_TOKEN,
959 line => $self->{line}, column => $self->{column}});
960 last A; ## TODO: ok?
961 } else {
962 !!!cp (12);
963 }
964 # Anything else
965 my $token = {type => CHARACTER_TOKEN,
966 data => chr $self->{next_char},
967 line => $self->{line}, column => $self->{column},
968 };
969 ## Stay in the data state
970 !!!next-input-character;
971
972 !!!emit ($token);
973
974 redo A;
975 } elsif ($self->{state} == ENTITY_DATA_STATE) {
976 ## (cannot happen in CDATA state)
977
978 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
979
980 my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
981
982 $self->{state} = DATA_STATE;
983 # next-input-character is already done
984
985 unless (defined $token) {
986 !!!cp (13);
987 !!!emit ({type => CHARACTER_TOKEN, data => '&',
988 line => $l, column => $c,
989 });
990 } else {
991 !!!cp (14);
992 !!!emit ($token);
993 }
994
995 redo A;
996 } elsif ($self->{state} == TAG_OPEN_STATE) {
997 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
998 if ($self->{next_char} == 0x002F) { # /
999 !!!cp (15);
1000 !!!next-input-character;
1001 $self->{state} = CLOSE_TAG_OPEN_STATE;
1002 redo A;
1003 } else {
1004 !!!cp (16);
1005 ## reconsume
1006 $self->{state} = DATA_STATE;
1007
1008 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1009 line => $self->{line_prev},
1010 column => $self->{column_prev},
1011 });
1012
1013 redo A;
1014 }
1015 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
1016 if ($self->{next_char} == 0x0021) { # !
1017 !!!cp (17);
1018 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
1019 !!!next-input-character;
1020 redo A;
1021 } elsif ($self->{next_char} == 0x002F) { # /
1022 !!!cp (18);
1023 $self->{state} = CLOSE_TAG_OPEN_STATE;
1024 !!!next-input-character;
1025 redo A;
1026 } elsif (0x0041 <= $self->{next_char} and
1027 $self->{next_char} <= 0x005A) { # A..Z
1028 !!!cp (19);
1029 $self->{current_token}
1030 = {type => START_TAG_TOKEN,
1031 tag_name => chr ($self->{next_char} + 0x0020),
1032 line => $self->{line_prev},
1033 column => $self->{column_prev}};
1034 $self->{state} = TAG_NAME_STATE;
1035 !!!next-input-character;
1036 redo A;
1037 } elsif (0x0061 <= $self->{next_char} and
1038 $self->{next_char} <= 0x007A) { # a..z
1039 !!!cp (20);
1040 $self->{current_token} = {type => START_TAG_TOKEN,
1041 tag_name => chr ($self->{next_char}),
1042 line => $self->{line_prev},
1043 column => $self->{column_prev}};
1044 $self->{state} = TAG_NAME_STATE;
1045 !!!next-input-character;
1046 redo A;
1047 } elsif ($self->{next_char} == 0x003E) { # >
1048 !!!cp (21);
1049 !!!parse-error (type => 'empty start tag',
1050 line => $self->{line_prev},
1051 column => $self->{column_prev});
1052 $self->{state} = DATA_STATE;
1053 !!!next-input-character;
1054
1055 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
1056 line => $self->{line_prev},
1057 column => $self->{column_prev},
1058 });
1059
1060 redo A;
1061 } elsif ($self->{next_char} == 0x003F) { # ?
1062 !!!cp (22);
1063 !!!parse-error (type => 'pio',
1064 line => $self->{line_prev},
1065 column => $self->{column_prev});
1066 $self->{state} = BOGUS_COMMENT_STATE;
1067 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1068 line => $self->{line_prev},
1069 column => $self->{column_prev},
1070 };
1071 ## $self->{next_char} is intentionally left as is
1072 redo A;
1073 } else {
1074 !!!cp (23);
1075 !!!parse-error (type => 'bare stago',
1076 line => $self->{line_prev},
1077 column => $self->{column_prev});
1078 $self->{state} = DATA_STATE;
1079 ## reconsume
1080
1081 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1082 line => $self->{line_prev},
1083 column => $self->{column_prev},
1084 });
1085
1086 redo A;
1087 }
1088 } else {
1089 die "$0: $self->{content_model} in tag open";
1090 }
1091 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
1092 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
1093 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1094 if (defined $self->{last_emitted_start_tag_name}) {
1095
1096 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
1097 my @next_char;
1098 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
1099 push @next_char, $self->{next_char};
1100 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
1101 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
1102 if ($self->{next_char} == $c or $self->{next_char} == $C) {
1103 !!!cp (24);
1104 !!!next-input-character;
1105 next TAGNAME;
1106 } else {
1107 !!!cp (25);
1108 $self->{next_char} = shift @next_char; # reconsume
1109 !!!back-next-input-character (@next_char);
1110 $self->{state} = DATA_STATE;
1111
1112 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1113 line => $l, column => $c,
1114 });
1115
1116 redo A;
1117 }
1118 }
1119 push @next_char, $self->{next_char};
1120
1121 unless ($self->{next_char} == 0x0009 or # HT
1122 $self->{next_char} == 0x000A or # LF
1123 $self->{next_char} == 0x000B or # VT
1124 $self->{next_char} == 0x000C or # FF
1125 $self->{next_char} == 0x0020 or # SP
1126 $self->{next_char} == 0x003E or # >
1127 $self->{next_char} == 0x002F or # /
1128 $self->{next_char} == -1) {
1129 !!!cp (26);
1130 $self->{next_char} = shift @next_char; # reconsume
1131 !!!back-next-input-character (@next_char);
1132 $self->{state} = DATA_STATE;
1133 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1134 line => $l, column => $c,
1135 });
1136 redo A;
1137 } else {
1138 !!!cp (27);
1139 $self->{next_char} = shift @next_char;
1140 !!!back-next-input-character (@next_char);
1141 # and consume...
1142 }
1143 } else {
1144 ## No start tag token has ever been emitted
1145 !!!cp (28);
1146 # next-input-character is already done
1147 $self->{state} = DATA_STATE;
1148 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1149 line => $l, column => $c,
1150 });
1151 redo A;
1152 }
1153 }
1154
1155 if (0x0041 <= $self->{next_char} and
1156 $self->{next_char} <= 0x005A) { # A..Z
1157 !!!cp (29);
1158 $self->{current_token}
1159 = {type => END_TAG_TOKEN,
1160 tag_name => chr ($self->{next_char} + 0x0020),
1161 line => $l, column => $c};
1162 $self->{state} = TAG_NAME_STATE;
1163 !!!next-input-character;
1164 redo A;
1165 } elsif (0x0061 <= $self->{next_char} and
1166 $self->{next_char} <= 0x007A) { # a..z
1167 !!!cp (30);
1168 $self->{current_token} = {type => END_TAG_TOKEN,
1169 tag_name => chr ($self->{next_char}),
1170 line => $l, column => $c};
1171 $self->{state} = TAG_NAME_STATE;
1172 !!!next-input-character;
1173 redo A;
1174 } elsif ($self->{next_char} == 0x003E) { # >
1175 !!!cp (31);
1176 !!!parse-error (type => 'empty end tag',
1177 line => $self->{line_prev}, ## "<" in "</>"
1178 column => $self->{column_prev} - 1);
1179 $self->{state} = DATA_STATE;
1180 !!!next-input-character;
1181 redo A;
1182 } elsif ($self->{next_char} == -1) {
1183 !!!cp (32);
1184 !!!parse-error (type => 'bare etago');
1185 $self->{state} = DATA_STATE;
1186 # reconsume
1187
1188 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1189 line => $l, column => $c,
1190 });
1191
1192 redo A;
1193 } else {
1194 !!!cp (33);
1195 !!!parse-error (type => 'bogus end tag');
1196 $self->{state} = BOGUS_COMMENT_STATE;
1197 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1198 line => $self->{line_prev}, # "<" of "</"
1199 column => $self->{column_prev} - 1,
1200 };
1201 ## $self->{next_char} is intentionally left as is
1202 redo A;
1203 }
1204 } elsif ($self->{state} == TAG_NAME_STATE) {
1205 if ($self->{next_char} == 0x0009 or # HT
1206 $self->{next_char} == 0x000A or # LF
1207 $self->{next_char} == 0x000B or # VT
1208 $self->{next_char} == 0x000C or # FF
1209 $self->{next_char} == 0x0020) { # SP
1210 !!!cp (34);
1211 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1212 !!!next-input-character;
1213 redo A;
1214 } elsif ($self->{next_char} == 0x003E) { # >
1215 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1216 !!!cp (35);
1217 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1218 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1219 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1220 #if ($self->{current_token}->{attributes}) {
1221 # ## NOTE: This should never be reached.
1222 # !!! cp (36);
1223 # !!! parse-error (type => 'end tag attribute');
1224 #} else {
1225 !!!cp (37);
1226 #}
1227 } else {
1228 die "$0: $self->{current_token}->{type}: Unknown token type";
1229 }
1230 $self->{state} = DATA_STATE;
1231 !!!next-input-character;
1232
1233 !!!emit ($self->{current_token}); # start tag or end tag
1234
1235 redo A;
1236 } elsif (0x0041 <= $self->{next_char} and
1237 $self->{next_char} <= 0x005A) { # A..Z
1238 !!!cp (38);
1239 $self->{current_token}->{tag_name} .= chr ($self->{next_char} + 0x0020);
1240 # start tag or end tag
1241 ## Stay in this state
1242 !!!next-input-character;
1243 redo A;
1244 } elsif ($self->{next_char} == -1) {
1245 !!!parse-error (type => 'unclosed tag');
1246 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1247 !!!cp (39);
1248 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1249 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1250 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1251 #if ($self->{current_token}->{attributes}) {
1252 # ## NOTE: This state should never be reached.
1253 # !!! cp (40);
1254 # !!! parse-error (type => 'end tag attribute');
1255 #} else {
1256 !!!cp (41);
1257 #}
1258 } else {
1259 die "$0: $self->{current_token}->{type}: Unknown token type";
1260 }
1261 $self->{state} = DATA_STATE;
1262 # reconsume
1263
1264 !!!emit ($self->{current_token}); # start tag or end tag
1265
1266 redo A;
1267 } elsif ($self->{next_char} == 0x002F) { # /
1268 !!!cp (42);
1269 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1270 !!!next-input-character;
1271 redo A;
1272 } else {
1273 !!!cp (44);
1274 $self->{current_token}->{tag_name} .= chr $self->{next_char};
1275 # start tag or end tag
1276 ## Stay in the state
1277 !!!next-input-character;
1278 redo A;
1279 }
1280 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1281 if ($self->{next_char} == 0x0009 or # HT
1282 $self->{next_char} == 0x000A or # LF
1283 $self->{next_char} == 0x000B or # VT
1284 $self->{next_char} == 0x000C or # FF
1285 $self->{next_char} == 0x0020) { # SP
1286 !!!cp (45);
1287 ## Stay in the state
1288 !!!next-input-character;
1289 redo A;
1290 } elsif ($self->{next_char} == 0x003E) { # >
1291 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1292 !!!cp (46);
1293 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1294 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1295 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1296 if ($self->{current_token}->{attributes}) {
1297 !!!cp (47);
1298 !!!parse-error (type => 'end tag attribute');
1299 } else {
1300 !!!cp (48);
1301 }
1302 } else {
1303 die "$0: $self->{current_token}->{type}: Unknown token type";
1304 }
1305 $self->{state} = DATA_STATE;
1306 !!!next-input-character;
1307
1308 !!!emit ($self->{current_token}); # start tag or end tag
1309
1310 redo A;
1311 } elsif (0x0041 <= $self->{next_char} and
1312 $self->{next_char} <= 0x005A) { # A..Z
1313 !!!cp (49);
1314 $self->{current_attribute}
1315 = {name => chr ($self->{next_char} + 0x0020),
1316 value => '',
1317 line => $self->{line}, column => $self->{column}};
1318 $self->{state} = ATTRIBUTE_NAME_STATE;
1319 !!!next-input-character;
1320 redo A;
1321 } elsif ($self->{next_char} == 0x002F) { # /
1322 !!!cp (50);
1323 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1324 !!!next-input-character;
1325 redo A;
1326 } elsif ($self->{next_char} == -1) {
1327 !!!parse-error (type => 'unclosed tag');
1328 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1329 !!!cp (52);
1330 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1331 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1332 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1333 if ($self->{current_token}->{attributes}) {
1334 !!!cp (53);
1335 !!!parse-error (type => 'end tag attribute');
1336 } else {
1337 !!!cp (54);
1338 }
1339 } else {
1340 die "$0: $self->{current_token}->{type}: Unknown token type";
1341 }
1342 $self->{state} = DATA_STATE;
1343 # reconsume
1344
1345 !!!emit ($self->{current_token}); # start tag or end tag
1346
1347 redo A;
1348 } else {
1349 if ({
1350 0x0022 => 1, # "
1351 0x0027 => 1, # '
1352 0x003D => 1, # =
1353 }->{$self->{next_char}}) {
1354 !!!cp (55);
1355 !!!parse-error (type => 'bad attribute name');
1356 } else {
1357 !!!cp (56);
1358 }
1359 $self->{current_attribute}
1360 = {name => chr ($self->{next_char}),
1361 value => '',
1362 line => $self->{line}, column => $self->{column}};
1363 $self->{state} = ATTRIBUTE_NAME_STATE;
1364 !!!next-input-character;
1365 redo A;
1366 }
1367 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1368 my $before_leave = sub {
1369 if (exists $self->{current_token}->{attributes} # start tag or end tag
1370 ->{$self->{current_attribute}->{name}}) { # MUST
1371 !!!cp (57);
1372 !!!parse-error (type => 'duplicate attribute', text => $self->{current_attribute}->{name}, line => $self->{current_attribute}->{line}, column => $self->{current_attribute}->{column});
1373 ## Discard $self->{current_attribute} # MUST
1374 } else {
1375 !!!cp (58);
1376 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
1377 = $self->{current_attribute};
1378 }
1379 }; # $before_leave
1380
1381 if ($self->{next_char} == 0x0009 or # HT
1382 $self->{next_char} == 0x000A or # LF
1383 $self->{next_char} == 0x000B or # VT
1384 $self->{next_char} == 0x000C or # FF
1385 $self->{next_char} == 0x0020) { # SP
1386 !!!cp (59);
1387 $before_leave->();
1388 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1389 !!!next-input-character;
1390 redo A;
1391 } elsif ($self->{next_char} == 0x003D) { # =
1392 !!!cp (60);
1393 $before_leave->();
1394 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1395 !!!next-input-character;
1396 redo A;
1397 } elsif ($self->{next_char} == 0x003E) { # >
1398 $before_leave->();
1399 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1400 !!!cp (61);
1401 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1402 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1403 !!!cp (62);
1404 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1405 if ($self->{current_token}->{attributes}) {
1406 !!!parse-error (type => 'end tag attribute');
1407 }
1408 } else {
1409 die "$0: $self->{current_token}->{type}: Unknown token type";
1410 }
1411 $self->{state} = DATA_STATE;
1412 !!!next-input-character;
1413
1414 !!!emit ($self->{current_token}); # start tag or end tag
1415
1416 redo A;
1417 } elsif (0x0041 <= $self->{next_char} and
1418 $self->{next_char} <= 0x005A) { # A..Z
1419 !!!cp (63);
1420 $self->{current_attribute}->{name} .= chr ($self->{next_char} + 0x0020);
1421 ## Stay in the state
1422 !!!next-input-character;
1423 redo A;
1424 } elsif ($self->{next_char} == 0x002F) { # /
1425 !!!cp (64);
1426 $before_leave->();
1427 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1428 !!!next-input-character;
1429 redo A;
1430 } elsif ($self->{next_char} == -1) {
1431 !!!parse-error (type => 'unclosed tag');
1432 $before_leave->();
1433 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1434 !!!cp (66);
1435 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1436 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1437 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1438 if ($self->{current_token}->{attributes}) {
1439 !!!cp (67);
1440 !!!parse-error (type => 'end tag attribute');
1441 } else {
1442 ## NOTE: This state should never be reached.
1443 !!!cp (68);
1444 }
1445 } else {
1446 die "$0: $self->{current_token}->{type}: Unknown token type";
1447 }
1448 $self->{state} = DATA_STATE;
1449 # reconsume
1450
1451 !!!emit ($self->{current_token}); # start tag or end tag
1452
1453 redo A;
1454 } else {
1455 if ($self->{next_char} == 0x0022 or # "
1456 $self->{next_char} == 0x0027) { # '
1457 !!!cp (69);
1458 !!!parse-error (type => 'bad attribute name');
1459 } else {
1460 !!!cp (70);
1461 }
1462 $self->{current_attribute}->{name} .= chr ($self->{next_char});
1463 ## Stay in the state
1464 !!!next-input-character;
1465 redo A;
1466 }
1467 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1468 if ($self->{next_char} == 0x0009 or # HT
1469 $self->{next_char} == 0x000A or # LF
1470 $self->{next_char} == 0x000B or # VT
1471 $self->{next_char} == 0x000C or # FF
1472 $self->{next_char} == 0x0020) { # SP
1473 !!!cp (71);
1474 ## Stay in the state
1475 !!!next-input-character;
1476 redo A;
1477 } elsif ($self->{next_char} == 0x003D) { # =
1478 !!!cp (72);
1479 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1480 !!!next-input-character;
1481 redo A;
1482 } elsif ($self->{next_char} == 0x003E) { # >
1483 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1484 !!!cp (73);
1485 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1486 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1487 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1488 if ($self->{current_token}->{attributes}) {
1489 !!!cp (74);
1490 !!!parse-error (type => 'end tag attribute');
1491 } else {
1492 ## NOTE: This state should never be reached.
1493 !!!cp (75);
1494 }
1495 } else {
1496 die "$0: $self->{current_token}->{type}: Unknown token type";
1497 }
1498 $self->{state} = DATA_STATE;
1499 !!!next-input-character;
1500
1501 !!!emit ($self->{current_token}); # start tag or end tag
1502
1503 redo A;
1504 } elsif (0x0041 <= $self->{next_char} and
1505 $self->{next_char} <= 0x005A) { # A..Z
1506 !!!cp (76);
1507 $self->{current_attribute}
1508 = {name => chr ($self->{next_char} + 0x0020),
1509 value => '',
1510 line => $self->{line}, column => $self->{column}};
1511 $self->{state} = ATTRIBUTE_NAME_STATE;
1512 !!!next-input-character;
1513 redo A;
1514 } elsif ($self->{next_char} == 0x002F) { # /
1515 !!!cp (77);
1516 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1517 !!!next-input-character;
1518 redo A;
1519 } elsif ($self->{next_char} == -1) {
1520 !!!parse-error (type => 'unclosed tag');
1521 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1522 !!!cp (79);
1523 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1524 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1525 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1526 if ($self->{current_token}->{attributes}) {
1527 !!!cp (80);
1528 !!!parse-error (type => 'end tag attribute');
1529 } else {
1530 ## NOTE: This state should never be reached.
1531 !!!cp (81);
1532 }
1533 } else {
1534 die "$0: $self->{current_token}->{type}: Unknown token type";
1535 }
1536 $self->{state} = DATA_STATE;
1537 # reconsume
1538
1539 !!!emit ($self->{current_token}); # start tag or end tag
1540
1541 redo A;
1542 } else {
1543 if ($self->{next_char} == 0x0022 or # "
1544 $self->{next_char} == 0x0027) { # '
1545 !!!cp (78);
1546 !!!parse-error (type => 'bad attribute name');
1547 } else {
1548 !!!cp (82);
1549 }
1550 $self->{current_attribute}
1551 = {name => chr ($self->{next_char}),
1552 value => '',
1553 line => $self->{line}, column => $self->{column}};
1554 $self->{state} = ATTRIBUTE_NAME_STATE;
1555 !!!next-input-character;
1556 redo A;
1557 }
1558 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1559 if ($self->{next_char} == 0x0009 or # HT
1560 $self->{next_char} == 0x000A or # LF
1561 $self->{next_char} == 0x000B or # VT
1562 $self->{next_char} == 0x000C or # FF
1563 $self->{next_char} == 0x0020) { # SP
1564 !!!cp (83);
1565 ## Stay in the state
1566 !!!next-input-character;
1567 redo A;
1568 } elsif ($self->{next_char} == 0x0022) { # "
1569 !!!cp (84);
1570 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1571 !!!next-input-character;
1572 redo A;
1573 } elsif ($self->{next_char} == 0x0026) { # &
1574 !!!cp (85);
1575 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1576 ## reconsume
1577 redo A;
1578 } elsif ($self->{next_char} == 0x0027) { # '
1579 !!!cp (86);
1580 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1581 !!!next-input-character;
1582 redo A;
1583 } elsif ($self->{next_char} == 0x003E) { # >
1584 !!!parse-error (type => 'empty unquoted attribute value');
1585 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1586 !!!cp (87);
1587 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1588 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1589 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1590 if ($self->{current_token}->{attributes}) {
1591 !!!cp (88);
1592 !!!parse-error (type => 'end tag attribute');
1593 } else {
1594 ## NOTE: This state should never be reached.
1595 !!!cp (89);
1596 }
1597 } else {
1598 die "$0: $self->{current_token}->{type}: Unknown token type";
1599 }
1600 $self->{state} = DATA_STATE;
1601 !!!next-input-character;
1602
1603 !!!emit ($self->{current_token}); # start tag or end tag
1604
1605 redo A;
1606 } elsif ($self->{next_char} == -1) {
1607 !!!parse-error (type => 'unclosed tag');
1608 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1609 !!!cp (90);
1610 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1611 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1612 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1613 if ($self->{current_token}->{attributes}) {
1614 !!!cp (91);
1615 !!!parse-error (type => 'end tag attribute');
1616 } else {
1617 ## NOTE: This state should never be reached.
1618 !!!cp (92);
1619 }
1620 } else {
1621 die "$0: $self->{current_token}->{type}: Unknown token type";
1622 }
1623 $self->{state} = DATA_STATE;
1624 ## reconsume
1625
1626 !!!emit ($self->{current_token}); # start tag or end tag
1627
1628 redo A;
1629 } else {
1630 if ($self->{next_char} == 0x003D) { # =
1631 !!!cp (93);
1632 !!!parse-error (type => 'bad attribute value');
1633 } else {
1634 !!!cp (94);
1635 }
1636 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1637 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1638 !!!next-input-character;
1639 redo A;
1640 }
1641 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1642 if ($self->{next_char} == 0x0022) { # "
1643 !!!cp (95);
1644 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1645 !!!next-input-character;
1646 redo A;
1647 } elsif ($self->{next_char} == 0x0026) { # &
1648 !!!cp (96);
1649 $self->{last_attribute_value_state} = $self->{state};
1650 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1651 !!!next-input-character;
1652 redo A;
1653 } elsif ($self->{next_char} == -1) {
1654 !!!parse-error (type => 'unclosed attribute value');
1655 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1656 !!!cp (97);
1657 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1658 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1659 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1660 if ($self->{current_token}->{attributes}) {
1661 !!!cp (98);
1662 !!!parse-error (type => 'end tag attribute');
1663 } else {
1664 ## NOTE: This state should never be reached.
1665 !!!cp (99);
1666 }
1667 } else {
1668 die "$0: $self->{current_token}->{type}: Unknown token type";
1669 }
1670 $self->{state} = DATA_STATE;
1671 ## reconsume
1672
1673 !!!emit ($self->{current_token}); # start tag or end tag
1674
1675 redo A;
1676 } else {
1677 !!!cp (100);
1678 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1679 ## Stay in the state
1680 !!!next-input-character;
1681 redo A;
1682 }
1683 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1684 if ($self->{next_char} == 0x0027) { # '
1685 !!!cp (101);
1686 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1687 !!!next-input-character;
1688 redo A;
1689 } elsif ($self->{next_char} == 0x0026) { # &
1690 !!!cp (102);
1691 $self->{last_attribute_value_state} = $self->{state};
1692 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1693 !!!next-input-character;
1694 redo A;
1695 } elsif ($self->{next_char} == -1) {
1696 !!!parse-error (type => 'unclosed attribute value');
1697 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1698 !!!cp (103);
1699 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1700 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1701 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1702 if ($self->{current_token}->{attributes}) {
1703 !!!cp (104);
1704 !!!parse-error (type => 'end tag attribute');
1705 } else {
1706 ## NOTE: This state should never be reached.
1707 !!!cp (105);
1708 }
1709 } else {
1710 die "$0: $self->{current_token}->{type}: Unknown token type";
1711 }
1712 $self->{state} = DATA_STATE;
1713 ## reconsume
1714
1715 !!!emit ($self->{current_token}); # start tag or end tag
1716
1717 redo A;
1718 } else {
1719 !!!cp (106);
1720 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1721 ## Stay in the state
1722 !!!next-input-character;
1723 redo A;
1724 }
1725 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1726 if ($self->{next_char} == 0x0009 or # HT
1727 $self->{next_char} == 0x000A or # LF
1728 $self->{next_char} == 0x000B or # VT
1729 $self->{next_char} == 0x000C or # FF
1730 $self->{next_char} == 0x0020) { # SP
1731 !!!cp (107);
1732 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1733 !!!next-input-character;
1734 redo A;
1735 } elsif ($self->{next_char} == 0x0026) { # &
1736 !!!cp (108);
1737 $self->{last_attribute_value_state} = $self->{state};
1738 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1739 !!!next-input-character;
1740 redo A;
1741 } elsif ($self->{next_char} == 0x003E) { # >
1742 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1743 !!!cp (109);
1744 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1745 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1746 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1747 if ($self->{current_token}->{attributes}) {
1748 !!!cp (110);
1749 !!!parse-error (type => 'end tag attribute');
1750 } else {
1751 ## NOTE: This state should never be reached.
1752 !!!cp (111);
1753 }
1754 } else {
1755 die "$0: $self->{current_token}->{type}: Unknown token type";
1756 }
1757 $self->{state} = DATA_STATE;
1758 !!!next-input-character;
1759
1760 !!!emit ($self->{current_token}); # start tag or end tag
1761
1762 redo A;
1763 } elsif ($self->{next_char} == -1) {
1764 !!!parse-error (type => 'unclosed tag');
1765 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1766 !!!cp (112);
1767 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1768 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1769 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1770 if ($self->{current_token}->{attributes}) {
1771 !!!cp (113);
1772 !!!parse-error (type => 'end tag attribute');
1773 } else {
1774 ## NOTE: This state should never be reached.
1775 !!!cp (114);
1776 }
1777 } else {
1778 die "$0: $self->{current_token}->{type}: Unknown token type";
1779 }
1780 $self->{state} = DATA_STATE;
1781 ## reconsume
1782
1783 !!!emit ($self->{current_token}); # start tag or end tag
1784
1785 redo A;
1786 } else {
1787 if ({
1788 0x0022 => 1, # "
1789 0x0027 => 1, # '
1790 0x003D => 1, # =
1791 }->{$self->{next_char}}) {
1792 !!!cp (115);
1793 !!!parse-error (type => 'bad attribute value');
1794 } else {
1795 !!!cp (116);
1796 }
1797 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1798 ## Stay in the state
1799 !!!next-input-character;
1800 redo A;
1801 }
1802 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1803 my $token = $self->_tokenize_attempt_to_consume_an_entity
1804 (1,
1805 $self->{last_attribute_value_state}
1806 == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
1807 $self->{last_attribute_value_state}
1808 == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
1809 -1);
1810
1811 unless (defined $token) {
1812 !!!cp (117);
1813 $self->{current_attribute}->{value} .= '&';
1814 } else {
1815 !!!cp (118);
1816 $self->{current_attribute}->{value} .= $token->{data};
1817 $self->{current_attribute}->{has_reference} = $token->{has_reference};
1818 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1819 }
1820
1821 $self->{state} = $self->{last_attribute_value_state};
1822 # next-input-character is already done
1823 redo A;
1824 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1825 if ($self->{next_char} == 0x0009 or # HT
1826 $self->{next_char} == 0x000A or # LF
1827 $self->{next_char} == 0x000B or # VT
1828 $self->{next_char} == 0x000C or # FF
1829 $self->{next_char} == 0x0020) { # SP
1830 !!!cp (118);
1831 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1832 !!!next-input-character;
1833 redo A;
1834 } elsif ($self->{next_char} == 0x003E) { # >
1835 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1836 !!!cp (119);
1837 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1838 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1839 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1840 if ($self->{current_token}->{attributes}) {
1841 !!!cp (120);
1842 !!!parse-error (type => 'end tag attribute');
1843 } else {
1844 ## NOTE: This state should never be reached.
1845 !!!cp (121);
1846 }
1847 } else {
1848 die "$0: $self->{current_token}->{type}: Unknown token type";
1849 }
1850 $self->{state} = DATA_STATE;
1851 !!!next-input-character;
1852
1853 !!!emit ($self->{current_token}); # start tag or end tag
1854
1855 redo A;
1856 } elsif ($self->{next_char} == 0x002F) { # /
1857 !!!cp (122);
1858 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1859 !!!next-input-character;
1860 redo A;
1861 } elsif ($self->{next_char} == -1) {
1862 !!!parse-error (type => 'unclosed tag');
1863 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1864 !!!cp (122.3);
1865 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1866 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1867 if ($self->{current_token}->{attributes}) {
1868 !!!cp (122.1);
1869 !!!parse-error (type => 'end tag attribute');
1870 } else {
1871 ## NOTE: This state should never be reached.
1872 !!!cp (122.2);
1873 }
1874 } else {
1875 die "$0: $self->{current_token}->{type}: Unknown token type";
1876 }
1877 $self->{state} = DATA_STATE;
1878 ## Reconsume.
1879 !!!emit ($self->{current_token}); # start tag or end tag
1880 redo A;
1881 } else {
1882 !!!cp ('124.1');
1883 !!!parse-error (type => 'no space between attributes');
1884 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1885 ## reconsume
1886 redo A;
1887 }
1888 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1889 if ($self->{next_char} == 0x003E) { # >
1890 if ($self->{current_token}->{type} == END_TAG_TOKEN) {
1891 !!!cp ('124.2');
1892 !!!parse-error (type => 'nestc', token => $self->{current_token});
1893 ## TODO: Different type than slash in start tag
1894 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1895 if ($self->{current_token}->{attributes}) {
1896 !!!cp ('124.4');
1897 !!!parse-error (type => 'end tag attribute');
1898 } else {
1899 !!!cp ('124.5');
1900 }
1901 ## TODO: Test |<title></title/>|
1902 } else {
1903 !!!cp ('124.3');
1904 $self->{self_closing} = 1;
1905 }
1906
1907 $self->{state} = DATA_STATE;
1908 !!!next-input-character;
1909
1910 !!!emit ($self->{current_token}); # start tag or end tag
1911
1912 redo A;
1913 } elsif ($self->{next_char} == -1) {
1914 !!!parse-error (type => 'unclosed tag');
1915 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1916 !!!cp (124.7);
1917 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1918 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1919 if ($self->{current_token}->{attributes}) {
1920 !!!cp (124.5);
1921 !!!parse-error (type => 'end tag attribute');
1922 } else {
1923 ## NOTE: This state should never be reached.
1924 !!!cp (124.6);
1925 }
1926 } else {
1927 die "$0: $self->{current_token}->{type}: Unknown token type";
1928 }
1929 $self->{state} = DATA_STATE;
1930 ## Reconsume.
1931 !!!emit ($self->{current_token}); # start tag or end tag
1932 redo A;
1933 } else {
1934 !!!cp ('124.4');
1935 !!!parse-error (type => 'nestc');
1936 ## TODO: This error type is wrong.
1937 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1938 ## Reconsume.
1939 redo A;
1940 }
1941 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1942 ## (only happen if PCDATA state)
1943
1944 ## NOTE: Set by the previous state
1945 #my $token = {type => COMMENT_TOKEN, data => ''};
1946
1947 BC: {
1948 if ($self->{next_char} == 0x003E) { # >
1949 !!!cp (124);
1950 $self->{state} = DATA_STATE;
1951 !!!next-input-character;
1952
1953 !!!emit ($self->{current_token}); # comment
1954
1955 redo A;
1956 } elsif ($self->{next_char} == -1) {
1957 !!!cp (125);
1958 $self->{state} = DATA_STATE;
1959 ## reconsume
1960
1961 !!!emit ($self->{current_token}); # comment
1962
1963 redo A;
1964 } else {
1965 !!!cp (126);
1966 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
1967 !!!next-input-character;
1968 redo BC;
1969 }
1970 } # BC
1971
1972 die "$0: _get_next_token: unexpected case [BC]";
1973 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1974 ## (only happen if PCDATA state)
1975
1976 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1);
1977
1978 my @next_char;
1979 push @next_char, $self->{next_char};
1980
1981 if ($self->{next_char} == 0x002D) { # -
1982 !!!next-input-character;
1983 push @next_char, $self->{next_char};
1984 if ($self->{next_char} == 0x002D) { # -
1985 !!!cp (127);
1986 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1987 line => $l, column => $c,
1988 };
1989 $self->{state} = COMMENT_START_STATE;
1990 !!!next-input-character;
1991 redo A;
1992 } else {
1993 !!!cp (128);
1994 }
1995 } elsif ($self->{next_char} == 0x0044 or # D
1996 $self->{next_char} == 0x0064) { # d
1997 !!!next-input-character;
1998 push @next_char, $self->{next_char};
1999 if ($self->{next_char} == 0x004F or # O
2000 $self->{next_char} == 0x006F) { # o
2001 !!!next-input-character;
2002 push @next_char, $self->{next_char};
2003 if ($self->{next_char} == 0x0043 or # C
2004 $self->{next_char} == 0x0063) { # c
2005 !!!next-input-character;
2006 push @next_char, $self->{next_char};
2007 if ($self->{next_char} == 0x0054 or # T
2008 $self->{next_char} == 0x0074) { # t
2009 !!!next-input-character;
2010 push @next_char, $self->{next_char};
2011 if ($self->{next_char} == 0x0059 or # Y
2012 $self->{next_char} == 0x0079) { # y
2013 !!!next-input-character;
2014 push @next_char, $self->{next_char};
2015 if ($self->{next_char} == 0x0050 or # P
2016 $self->{next_char} == 0x0070) { # p
2017 !!!next-input-character;
2018 push @next_char, $self->{next_char};
2019 if ($self->{next_char} == 0x0045 or # E
2020 $self->{next_char} == 0x0065) { # e
2021 !!!cp (129);
2022 ## TODO: Replace this hand-unrolled, case-insensitive match of "DOCTYPE" with a loop or a table.
2023 $self->{state} = DOCTYPE_STATE;
2024 $self->{current_token} = {type => DOCTYPE_TOKEN,
2025 quirks => 1,
2026 line => $l, column => $c,
2027 };
2028 !!!next-input-character;
2029 redo A;
2030 } else {
2031 !!!cp (130);
2032 }
2033 } else {
2034 !!!cp (131);
2035 }
2036 } else {
2037 !!!cp (132);
2038 }
2039 } else {
2040 !!!cp (133);
2041 }
2042 } else {
2043 !!!cp (134);
2044 }
2045 } else {
2046 !!!cp (135);
2047 }
2048 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2049 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
2050 $self->{next_char} == 0x005B) { # [
2051 !!!next-input-character;
2052 push @next_char, $self->{next_char};
2053 if ($self->{next_char} == 0x0043) { # C
2054 !!!next-input-character;
2055 push @next_char, $self->{next_char};
2056 if ($self->{next_char} == 0x0044) { # D
2057 !!!next-input-character;
2058 push @next_char, $self->{next_char};
2059 if ($self->{next_char} == 0x0041) { # A
2060 !!!next-input-character;
2061 push @next_char, $self->{next_char};
2062 if ($self->{next_char} == 0x0054) { # T
2063 !!!next-input-character;
2064 push @next_char, $self->{next_char};
2065 if ($self->{next_char} == 0x0041) { # A
2066 !!!next-input-character;
2067 push @next_char, $self->{next_char};
2068 if ($self->{next_char} == 0x005B) { # [
2069 !!!cp (135.1);
2070 $self->{state} = CDATA_BLOCK_STATE;
2071 !!!next-input-character;
2072 redo A;
2073 } else {
2074 !!!cp (135.2);
2075 }
2076 } else {
2077 !!!cp (135.3);
2078 }
2079 } else {
2080 !!!cp (135.4);
2081 }
2082 } else {
2083 !!!cp (135.5);
2084 }
2085 } else {
2086 !!!cp (135.6);
2087 }
2088 } else {
2089 !!!cp (135.7);
2090 }
2091 } else {
2092 !!!cp (136);
2093 }
2094
2095 !!!parse-error (type => 'bogus comment');
2096 $self->{next_char} = shift @next_char;
2097 !!!back-next-input-character (@next_char);
2098 $self->{state} = BOGUS_COMMENT_STATE;
2099 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
2100 line => $l, column => $c,
2101 };
2102 redo A;
2103
2104 ## ISSUE: typos in spec: chacacters, is is a parse error
2105 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
2106 } elsif ($self->{state} == COMMENT_START_STATE) {
2107 if ($self->{next_char} == 0x002D) { # -
2108 !!!cp (137);
2109 $self->{state} = COMMENT_START_DASH_STATE;
2110 !!!next-input-character;
2111 redo A;
2112 } elsif ($self->{next_char} == 0x003E) { # >
2113 !!!cp (138);
2114 !!!parse-error (type => 'bogus comment');
2115 $self->{state} = DATA_STATE;
2116 !!!next-input-character;
2117
2118 !!!emit ($self->{current_token}); # comment
2119
2120 redo A;
2121 } elsif ($self->{next_char} == -1) {
2122 !!!cp (139);
2123 !!!parse-error (type => 'unclosed comment');
2124 $self->{state} = DATA_STATE;
2125 ## reconsume
2126
2127 !!!emit ($self->{current_token}); # comment
2128
2129 redo A;
2130 } else {
2131 !!!cp (140);
2132 $self->{current_token}->{data} # comment
2133 .= chr ($self->{next_char});
2134 $self->{state} = COMMENT_STATE;
2135 !!!next-input-character;
2136 redo A;
2137 }
2138 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2139 if ($self->{next_char} == 0x002D) { # -
2140 !!!cp (141);
2141 $self->{state} = COMMENT_END_STATE;
2142 !!!next-input-character;
2143 redo A;
2144 } elsif ($self->{next_char} == 0x003E) { # >
2145 !!!cp (142);
2146 !!!parse-error (type => 'bogus comment');
2147 $self->{state} = DATA_STATE;
2148 !!!next-input-character;
2149
2150 !!!emit ($self->{current_token}); # comment
2151
2152 redo A;
2153 } elsif ($self->{next_char} == -1) {
2154 !!!cp (143);
2155 !!!parse-error (type => 'unclosed comment');
2156 $self->{state} = DATA_STATE;
2157 ## reconsume
2158
2159 !!!emit ($self->{current_token}); # comment
2160
2161 redo A;
2162 } else {
2163 !!!cp (144);
2164 $self->{current_token}->{data} # comment
2165 .= '-' . chr ($self->{next_char});
2166 $self->{state} = COMMENT_STATE;
2167 !!!next-input-character;
2168 redo A;
2169 }
2170 } elsif ($self->{state} == COMMENT_STATE) {
2171 if ($self->{next_char} == 0x002D) { # -
2172 !!!cp (145);
2173 $self->{state} = COMMENT_END_DASH_STATE;
2174 !!!next-input-character;
2175 redo A;
2176 } elsif ($self->{next_char} == -1) {
2177 !!!cp (146);
2178 !!!parse-error (type => 'unclosed comment');
2179 $self->{state} = DATA_STATE;
2180 ## reconsume
2181
2182 !!!emit ($self->{current_token}); # comment
2183
2184 redo A;
2185 } else {
2186 !!!cp (147);
2187 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
2188 ## Stay in the state
2189 !!!next-input-character;
2190 redo A;
2191 }
2192 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2193 if ($self->{next_char} == 0x002D) { # -
2194 !!!cp (148);
2195 $self->{state} = COMMENT_END_STATE;
2196 !!!next-input-character;
2197 redo A;
2198 } elsif ($self->{next_char} == -1) {
2199 !!!cp (149);
2200 !!!parse-error (type => 'unclosed comment');
2201 $self->{state} = DATA_STATE;
2202 ## reconsume
2203
2204 !!!emit ($self->{current_token}); # comment
2205
2206 redo A;
2207 } else {
2208 !!!cp (150);
2209 $self->{current_token}->{data} .= '-' . chr ($self->{next_char}); # comment
2210 $self->{state} = COMMENT_STATE;
2211 !!!next-input-character;
2212 redo A;
2213 }
2214 } elsif ($self->{state} == COMMENT_END_STATE) {
2215 if ($self->{next_char} == 0x003E) { # >
2216 !!!cp (151);
2217 $self->{state} = DATA_STATE;
2218 !!!next-input-character;
2219
2220 !!!emit ($self->{current_token}); # comment
2221
2222 redo A;
2223 } elsif ($self->{next_char} == 0x002D) { # -
2224 !!!cp (152);
2225 !!!parse-error (type => 'dash in comment',
2226 line => $self->{line_prev},
2227 column => $self->{column_prev});
2228 $self->{current_token}->{data} .= '-'; # comment
2229 ## Stay in the state
2230 !!!next-input-character;
2231 redo A;
2232 } elsif ($self->{next_char} == -1) {
2233 !!!cp (153);
2234 !!!parse-error (type => 'unclosed comment');
2235 $self->{state} = DATA_STATE;
2236 ## reconsume
2237
2238 !!!emit ($self->{current_token}); # comment
2239
2240 redo A;
2241 } else {
2242 !!!cp (154);
2243 !!!parse-error (type => 'dash in comment',
2244 line => $self->{line_prev},
2245 column => $self->{column_prev});
2246 $self->{current_token}->{data} .= '--' . chr ($self->{next_char}); # comment
2247 $self->{state} = COMMENT_STATE;
2248 !!!next-input-character;
2249 redo A;
2250 }
2251 } elsif ($self->{state} == DOCTYPE_STATE) {
2252 if ($self->{next_char} == 0x0009 or # HT
2253 $self->{next_char} == 0x000A or # LF
2254 $self->{next_char} == 0x000B or # VT
2255 $self->{next_char} == 0x000C or # FF
2256 $self->{next_char} == 0x0020) { # SP
2257 !!!cp (155);
2258 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2259 !!!next-input-character;
2260 redo A;
2261 } else {
2262 !!!cp (156);
2263 !!!parse-error (type => 'no space before DOCTYPE name');
2264 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2265 ## reconsume
2266 redo A;
2267 }
2268 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2269 if ($self->{next_char} == 0x0009 or # HT
2270 $self->{next_char} == 0x000A or # LF
2271 $self->{next_char} == 0x000B or # VT
2272 $self->{next_char} == 0x000C or # FF
2273 $self->{next_char} == 0x0020) { # SP
2274 !!!cp (157);
2275 ## Stay in the state
2276 !!!next-input-character;
2277 redo A;
2278 } elsif ($self->{next_char} == 0x003E) { # >
2279 !!!cp (158);
2280 !!!parse-error (type => 'no DOCTYPE name');
2281 $self->{state} = DATA_STATE;
2282 !!!next-input-character;
2283
2284 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2285
2286 redo A;
2287 } elsif ($self->{next_char} == -1) {
2288 !!!cp (159);
2289 !!!parse-error (type => 'no DOCTYPE name');
2290 $self->{state} = DATA_STATE;
2291 ## reconsume
2292
2293 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2294
2295 redo A;
2296 } else {
2297 !!!cp (160);
2298 $self->{current_token}->{name} = chr $self->{next_char};
2299 delete $self->{current_token}->{quirks};
2300 ## ISSUE: "Set the token's name name to the" in the spec
2301 $self->{state} = DOCTYPE_NAME_STATE;
2302 !!!next-input-character;
2303 redo A;
2304 }
2305 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2306 ## ISSUE: Redundant "First," in the spec.
2307 if ($self->{next_char} == 0x0009 or # HT
2308 $self->{next_char} == 0x000A or # LF
2309 $self->{next_char} == 0x000B or # VT
2310 $self->{next_char} == 0x000C or # FF
2311 $self->{next_char} == 0x0020) { # SP
2312 !!!cp (161);
2313 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2314 !!!next-input-character;
2315 redo A;
2316 } elsif ($self->{next_char} == 0x003E) { # >
2317 !!!cp (162);
2318 $self->{state} = DATA_STATE;
2319 !!!next-input-character;
2320
2321 !!!emit ($self->{current_token}); # DOCTYPE
2322
2323 redo A;
2324 } elsif ($self->{next_char} == -1) {
2325 !!!cp (163);
2326 !!!parse-error (type => 'unclosed DOCTYPE');
2327 $self->{state} = DATA_STATE;
2328 ## reconsume
2329
2330 $self->{current_token}->{quirks} = 1;
2331 !!!emit ($self->{current_token}); # DOCTYPE
2332
2333 redo A;
2334 } else {
2335 !!!cp (164);
2336 $self->{current_token}->{name}
2337 .= chr ($self->{next_char}); # DOCTYPE
2338 ## Stay in the state
2339 !!!next-input-character;
2340 redo A;
2341 }
2342 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2343 if ($self->{next_char} == 0x0009 or # HT
2344 $self->{next_char} == 0x000A or # LF
2345 $self->{next_char} == 0x000B or # VT
2346 $self->{next_char} == 0x000C or # FF
2347 $self->{next_char} == 0x0020) { # SP
2348 !!!cp (165);
2349 ## Stay in the state
2350 !!!next-input-character;
2351 redo A;
2352 } elsif ($self->{next_char} == 0x003E) { # >
2353 !!!cp (166);
2354 $self->{state} = DATA_STATE;
2355 !!!next-input-character;
2356
2357 !!!emit ($self->{current_token}); # DOCTYPE
2358
2359 redo A;
2360 } elsif ($self->{next_char} == -1) {
2361 !!!cp (167);
2362 !!!parse-error (type => 'unclosed DOCTYPE');
2363 $self->{state} = DATA_STATE;
2364 ## reconsume
2365
2366 $self->{current_token}->{quirks} = 1;
2367 !!!emit ($self->{current_token}); # DOCTYPE
2368
2369 redo A;
2370 } elsif ($self->{next_char} == 0x0050 or # P
2371 $self->{next_char} == 0x0070) { # p
2372 !!!next-input-character;
2373 if ($self->{next_char} == 0x0055 or # U
2374 $self->{next_char} == 0x0075) { # u
2375 !!!next-input-character;
2376 if ($self->{next_char} == 0x0042 or # B
2377 $self->{next_char} == 0x0062) { # b
2378 !!!next-input-character;
2379 if ($self->{next_char} == 0x004C or # L
2380 $self->{next_char} == 0x006C) { # l
2381 !!!next-input-character;
2382 if ($self->{next_char} == 0x0049 or # I
2383 $self->{next_char} == 0x0069) { # i
2384 !!!next-input-character;
2385 if ($self->{next_char} == 0x0043 or # C
2386 $self->{next_char} == 0x0063) { # c
2387 !!!cp (168);
2388 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2389 !!!next-input-character;
2390 redo A;
2391 } else {
2392 !!!cp (169);
2393 }
2394 } else {
2395 !!!cp (170);
2396 }
2397 } else {
2398 !!!cp (171);
2399 }
2400 } else {
2401 !!!cp (172);
2402 }
2403 } else {
2404 !!!cp (173);
2405 }
2406
2407 #
2408 } elsif ($self->{next_char} == 0x0053 or # S
2409 $self->{next_char} == 0x0073) { # s
2410 !!!next-input-character;
2411 if ($self->{next_char} == 0x0059 or # Y
2412 $self->{next_char} == 0x0079) { # y
2413 !!!next-input-character;
2414 if ($self->{next_char} == 0x0053 or # S
2415 $self->{next_char} == 0x0073) { # s
2416 !!!next-input-character;
2417 if ($self->{next_char} == 0x0054 or # T
2418 $self->{next_char} == 0x0074) { # t
2419 !!!next-input-character;
2420 if ($self->{next_char} == 0x0045 or # E
2421 $self->{next_char} == 0x0065) { # e
2422 !!!next-input-character;
2423 if ($self->{next_char} == 0x004D or # M
2424 $self->{next_char} == 0x006D) { # m
2425 !!!cp (174);
2426 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2427 !!!next-input-character;
2428 redo A;
2429 } else {
2430 !!!cp (175);
2431 }
2432 } else {
2433 !!!cp (176);
2434 }
2435 } else {
2436 !!!cp (177);
2437 }
2438 } else {
2439 !!!cp (178);
2440 }
2441 } else {
2442 !!!cp (179);
2443 }
2444
2445 #
2446 } else {
2447 !!!cp (180);
2448 !!!next-input-character;
2449 #
2450 }
2451
2452 !!!parse-error (type => 'string after DOCTYPE name');
2453 $self->{current_token}->{quirks} = 1;
2454
2455 $self->{state} = BOGUS_DOCTYPE_STATE;
2456 # next-input-character is already done
2457 redo A;
2458 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2459 if ({
2460 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2461 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2462 }->{$self->{next_char}}) {
2463 !!!cp (181);
2464 ## Stay in the state
2465 !!!next-input-character;
2466 redo A;
2467 } elsif ($self->{next_char} eq 0x0022) { # "
2468 !!!cp (182);
2469 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2470 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2471 !!!next-input-character;
2472 redo A;
2473 } elsif ($self->{next_char} eq 0x0027) { # '
2474 !!!cp (183);
2475 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2476 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2477 !!!next-input-character;
2478 redo A;
2479 } elsif ($self->{next_char} eq 0x003E) { # >
2480 !!!cp (184);
2481 !!!parse-error (type => 'no PUBLIC literal');
2482
2483 $self->{state} = DATA_STATE;
2484 !!!next-input-character;
2485
2486 $self->{current_token}->{quirks} = 1;
2487 !!!emit ($self->{current_token}); # DOCTYPE
2488
2489 redo A;
2490 } elsif ($self->{next_char} == -1) {
2491 !!!cp (185);
2492 !!!parse-error (type => 'unclosed DOCTYPE');
2493
2494 $self->{state} = DATA_STATE;
2495 ## reconsume
2496
2497 $self->{current_token}->{quirks} = 1;
2498 !!!emit ($self->{current_token}); # DOCTYPE
2499
2500 redo A;
2501 } else {
2502 !!!cp (186);
2503 !!!parse-error (type => 'string after PUBLIC');
2504 $self->{current_token}->{quirks} = 1;
2505
2506 $self->{state} = BOGUS_DOCTYPE_STATE;
2507 !!!next-input-character;
2508 redo A;
2509 }
2510 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2511 if ($self->{next_char} == 0x0022) { # "
2512 !!!cp (187);
2513 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2514 !!!next-input-character;
2515 redo A;
2516 } elsif ($self->{next_char} == 0x003E) { # >
2517 !!!cp (188);
2518 !!!parse-error (type => 'unclosed PUBLIC literal');
2519
2520 $self->{state} = DATA_STATE;
2521 !!!next-input-character;
2522
2523 $self->{current_token}->{quirks} = 1;
2524 !!!emit ($self->{current_token}); # DOCTYPE
2525
2526 redo A;
2527 } elsif ($self->{next_char} == -1) {
2528 !!!cp (189);
2529 !!!parse-error (type => 'unclosed PUBLIC literal');
2530
2531 $self->{state} = DATA_STATE;
2532 ## reconsume
2533
2534 $self->{current_token}->{quirks} = 1;
2535 !!!emit ($self->{current_token}); # DOCTYPE
2536
2537 redo A;
2538 } else {
2539 !!!cp (190);
2540 $self->{current_token}->{public_identifier} # DOCTYPE
2541 .= chr $self->{next_char};
2542 ## Stay in the state
2543 !!!next-input-character;
2544 redo A;
2545 }
2546 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2547 if ($self->{next_char} == 0x0027) { # '
2548 !!!cp (191);
2549 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2550 !!!next-input-character;
2551 redo A;
2552 } elsif ($self->{next_char} == 0x003E) { # >
2553 !!!cp (192);
2554 !!!parse-error (type => 'unclosed PUBLIC literal');
2555
2556 $self->{state} = DATA_STATE;
2557 !!!next-input-character;
2558
2559 $self->{current_token}->{quirks} = 1;
2560 !!!emit ($self->{current_token}); # DOCTYPE
2561
2562 redo A;
2563 } elsif ($self->{next_char} == -1) {
2564 !!!cp (193);
2565 !!!parse-error (type => 'unclosed PUBLIC literal');
2566
2567 $self->{state} = DATA_STATE;
2568 ## reconsume
2569
2570 $self->{current_token}->{quirks} = 1;
2571 !!!emit ($self->{current_token}); # DOCTYPE
2572
2573 redo A;
2574 } else {
2575 !!!cp (194);
2576 $self->{current_token}->{public_identifier} # DOCTYPE
2577 .= chr $self->{next_char};
2578 ## Stay in the state
2579 !!!next-input-character;
2580 redo A;
2581 }
2582 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2583 if ({
2584 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2585 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2586 }->{$self->{next_char}}) {
2587 !!!cp (195);
2588 ## Stay in the state
2589 !!!next-input-character;
2590 redo A;
2591 } elsif ($self->{next_char} == 0x0022) { # "
2592 !!!cp (196);
2593 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2594 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2595 !!!next-input-character;
2596 redo A;
2597 } elsif ($self->{next_char} == 0x0027) { # '
2598 !!!cp (197);
2599 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2600 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2601 !!!next-input-character;
2602 redo A;
2603 } elsif ($self->{next_char} == 0x003E) { # >
2604 !!!cp (198);
2605 $self->{state} = DATA_STATE;
2606 !!!next-input-character;
2607
2608 !!!emit ($self->{current_token}); # DOCTYPE
2609
2610 redo A;
2611 } elsif ($self->{next_char} == -1) {
2612 !!!cp (199);
2613 !!!parse-error (type => 'unclosed DOCTYPE');
2614
2615 $self->{state} = DATA_STATE;
2616 ## reconsume
2617
2618 $self->{current_token}->{quirks} = 1;
2619 !!!emit ($self->{current_token}); # DOCTYPE
2620
2621 redo A;
2622 } else {
2623 !!!cp (200);
2624 !!!parse-error (type => 'string after PUBLIC literal');
2625 $self->{current_token}->{quirks} = 1;
2626
2627 $self->{state} = BOGUS_DOCTYPE_STATE;
2628 !!!next-input-character;
2629 redo A;
2630 }
2631 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2632 if ({
2633 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2634 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2635 }->{$self->{next_char}}) {
2636 !!!cp (201);
2637 ## Stay in the state
2638 !!!next-input-character;
2639 redo A;
2640 } elsif ($self->{next_char} == 0x0022) { # "
2641 !!!cp (202);
2642 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2643 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2644 !!!next-input-character;
2645 redo A;
2646 } elsif ($self->{next_char} == 0x0027) { # '
2647 !!!cp (203);
2648 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2649 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2650 !!!next-input-character;
2651 redo A;
2652 } elsif ($self->{next_char} == 0x003E) { # >
2653 !!!cp (204);
2654 !!!parse-error (type => 'no SYSTEM literal');
2655 $self->{state} = DATA_STATE;
2656 !!!next-input-character;
2657
2658 $self->{current_token}->{quirks} = 1;
2659 !!!emit ($self->{current_token}); # DOCTYPE
2660
2661 redo A;
2662 } elsif ($self->{next_char} == -1) {
2663 !!!cp (205);
2664 !!!parse-error (type => 'unclosed DOCTYPE');
2665
2666 $self->{state} = DATA_STATE;
2667 ## reconsume
2668
2669 $self->{current_token}->{quirks} = 1;
2670 !!!emit ($self->{current_token}); # DOCTYPE
2671
2672 redo A;
2673 } else {
2674 !!!cp (206);
2675 !!!parse-error (type => 'string after SYSTEM');
2676 $self->{current_token}->{quirks} = 1;
2677
2678 $self->{state} = BOGUS_DOCTYPE_STATE;
2679 !!!next-input-character;
2680 redo A;
2681 }
2682 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2683 if ($self->{next_char} == 0x0022) { # "
2684 !!!cp (207);
2685 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2686 !!!next-input-character;
2687 redo A;
2688 } elsif ($self->{next_char} == 0x003E) { # >
2689 !!!cp (208);
2690 !!!parse-error (type => 'unclosed SYSTEM literal');
2691
2692 $self->{state} = DATA_STATE;
2693 !!!next-input-character;
2694
2695 $self->{current_token}->{quirks} = 1;
2696 !!!emit ($self->{current_token}); # DOCTYPE
2697
2698 redo A;
2699 } elsif ($self->{next_char} == -1) {
2700 !!!cp (209);
2701 !!!parse-error (type => 'unclosed SYSTEM literal');
2702
2703 $self->{state} = DATA_STATE;
2704 ## reconsume
2705
2706 $self->{current_token}->{quirks} = 1;
2707 !!!emit ($self->{current_token}); # DOCTYPE
2708
2709 redo A;
2710 } else {
2711 !!!cp (210);
2712 $self->{current_token}->{system_identifier} # DOCTYPE
2713 .= chr $self->{next_char};
2714 ## Stay in the state
2715 !!!next-input-character;
2716 redo A;
2717 }
2718 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2719 if ($self->{next_char} == 0x0027) { # '
2720 !!!cp (211);
2721 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2722 !!!next-input-character;
2723 redo A;
2724 } elsif ($self->{next_char} == 0x003E) { # >
2725 !!!cp (212);
2726 !!!parse-error (type => 'unclosed SYSTEM literal');
2727
2728 $self->{state} = DATA_STATE;
2729 !!!next-input-character;
2730
2731 $self->{current_token}->{quirks} = 1;
2732 !!!emit ($self->{current_token}); # DOCTYPE
2733
2734 redo A;
2735 } elsif ($self->{next_char} == -1) {
2736 !!!cp (213);
2737 !!!parse-error (type => 'unclosed SYSTEM literal');
2738
2739 $self->{state} = DATA_STATE;
2740 ## reconsume
2741
2742 $self->{current_token}->{quirks} = 1;
2743 !!!emit ($self->{current_token}); # DOCTYPE
2744
2745 redo A;
2746 } else {
2747 !!!cp (214);
2748 $self->{current_token}->{system_identifier} # DOCTYPE
2749 .= chr $self->{next_char};
2750 ## Stay in the state
2751 !!!next-input-character;
2752 redo A;
2753 }
2754 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2755 if ({
2756 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2757 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2758 }->{$self->{next_char}}) {
2759 !!!cp (215);
2760 ## Stay in the state
2761 !!!next-input-character;
2762 redo A;
2763 } elsif ($self->{next_char} == 0x003E) { # >
2764 !!!cp (216);
2765 $self->{state} = DATA_STATE;
2766 !!!next-input-character;
2767
2768 !!!emit ($self->{current_token}); # DOCTYPE
2769
2770 redo A;
2771 } elsif ($self->{next_char} == -1) {
2772 !!!cp (217);
2773 !!!parse-error (type => 'unclosed DOCTYPE');
2774 $self->{state} = DATA_STATE;
2775 ## reconsume
2776
2777 $self->{current_token}->{quirks} = 1;
2778 !!!emit ($self->{current_token}); # DOCTYPE
2779
2780 redo A;
2781 } else {
2782 !!!cp (218);
2783 !!!parse-error (type => 'string after SYSTEM literal');
2784 #$self->{current_token}->{quirks} = 1;
2785
2786 $self->{state} = BOGUS_DOCTYPE_STATE;
2787 !!!next-input-character;
2788 redo A;
2789 }
2790 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2791 if ($self->{next_char} == 0x003E) { # >
2792 !!!cp (219);
2793 $self->{state} = DATA_STATE;
2794 !!!next-input-character;
2795
2796 !!!emit ($self->{current_token}); # DOCTYPE
2797
2798 redo A;
2799 } elsif ($self->{next_char} == -1) {
2800 !!!cp (220);
2801 !!!parse-error (type => 'unclosed DOCTYPE');
2802 $self->{state} = DATA_STATE;
2803 ## reconsume
2804
2805 !!!emit ($self->{current_token}); # DOCTYPE
2806
2807 redo A;
2808 } else {
2809 !!!cp (221);
2810 ## Stay in the state
2811 !!!next-input-character;
2812 redo A;
2813 }
2814 } elsif ($self->{state} == CDATA_BLOCK_STATE) {
2815 my $s = '';
2816
2817 my ($l, $c) = ($self->{line}, $self->{column});
2818
2819 CS: while ($self->{next_char} != -1) {
2820 if ($self->{next_char} == 0x005D) { # ]
2821 !!!next-input-character;
2822 if ($self->{next_char} == 0x005D) { # ]
2823 !!!next-input-character;
2824 MDC: {
2825 if ($self->{next_char} == 0x003E) { # >
2826 !!!cp (221.1);
2827 !!!next-input-character;
2828 last CS;
2829 } elsif ($self->{next_char} == 0x005D) { # ]
2830 !!!cp (221.2);
2831 $s .= ']';
2832 !!!next-input-character;
2833 redo MDC;
2834 } else {
2835 !!!cp (221.3);
2836 $s .= ']]';
2837 #
2838 }
2839 } # MDC
2840 } else {
2841 !!!cp (221.4);
2842 $s .= ']';
2843 #
2844 }
2845 } else {
2846 !!!cp (221.5);
2847 #
2848 }
2849 $s .= chr $self->{next_char};
2850 !!!next-input-character;
2851 } # CS
2852
2853 $self->{state} = DATA_STATE;
2854 ## next-input-character done or EOF, which is reconsumed.
2855
2856 if (length $s) {
2857 !!!cp (221.6);
2858 !!!emit ({type => CHARACTER_TOKEN, data => $s,
2859 line => $l, column => $c});
2860 } else {
2861 !!!cp (221.7);
2862 }
2863
2864 redo A;
2865
2866 ## ISSUE: "text tokens" in spec.
2867 ## TODO: Streaming support
2868 } else {
2869 die "$0: $self->{state}: Unknown state";
2870 }
2871 } # A
2872
2873 die "$0: _get_next_token: unexpected case";
2874 } # _get_next_token
2875
2876 sub _tokenize_attempt_to_consume_an_entity ($$$) {
2877 my ($self, $in_attr, $additional) = @_;
2878
2879 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
2880
2881 if ({
2882 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
2883 0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
2884 $additional => 1,
2885 }->{$self->{next_char}}) {
2886 !!!cp (1001);
2887 ## Don't consume
2888 ## No error
2889 return undef;
2890 } elsif ($self->{next_char} == 0x0023) { # #
2891 !!!next-input-character;
2892 if ($self->{next_char} == 0x0078 or # x
2893 $self->{next_char} == 0x0058) { # X
2894 my $code;
2895 X: {
2896 my $x_char = $self->{next_char};
2897 !!!next-input-character;
2898 if (0x0030 <= $self->{next_char} and
2899 $self->{next_char} <= 0x0039) { # 0..9
2900 !!!cp (1002);
2901 $code ||= 0;
2902 $code *= 0x10;
2903 $code += $self->{next_char} - 0x0030;
2904 redo X;
2905 } elsif (0x0061 <= $self->{next_char} and
2906 $self->{next_char} <= 0x0066) { # a..f
2907 !!!cp (1003);
2908 $code ||= 0;
2909 $code *= 0x10;
2910 $code += $self->{next_char} - 0x0060 + 9;
2911 redo X;
2912 } elsif (0x0041 <= $self->{next_char} and
2913 $self->{next_char} <= 0x0046) { # A..F
2914 !!!cp (1004);
2915 $code ||= 0;
2916 $code *= 0x10;
2917 $code += $self->{next_char} - 0x0040 + 9;
2918 redo X;
2919 } elsif (not defined $code) { # no hexadecimal digit
2920 !!!cp (1005);
2921 !!!parse-error (type => 'bare hcro', line => $l, column => $c);
2922 !!!back-next-input-character ($x_char, $self->{next_char});
2923 $self->{next_char} = 0x0023; # #
2924 return undef;
2925 } elsif ($self->{next_char} == 0x003B) { # ;
2926 !!!cp (1006);
2927 !!!next-input-character;
2928 } else {
2929 !!!cp (1007);
2930 !!!parse-error (type => 'no refc', line => $l, column => $c);
2931 }
2932
2933 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2934 !!!cp (1008);
2935 !!!parse-error (type => 'invalid character reference',
2936 text => (sprintf 'U+%04X', $code),
2937 line => $l, column => $c);
2938 $code = 0xFFFD;
2939 } elsif ($code > 0x10FFFF) {
2940 !!!cp (1009);
2941 !!!parse-error (type => 'invalid character reference',
2942 text => (sprintf 'U-%08X', $code),
2943 line => $l, column => $c);
2944 $code = 0xFFFD;
2945 } elsif ($code == 0x000D) {
2946 !!!cp (1010);
2947 !!!parse-error (type => 'CR character reference', line => $l, column => $c);
2948 $code = 0x000A;
2949 } elsif (0x80 <= $code and $code <= 0x9F) {
2950 !!!cp (1011);
2951 !!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c);
2952 $code = $c1_entity_char->{$code};
2953 }
2954
2955 return {type => CHARACTER_TOKEN, data => chr $code,
2956 has_reference => 1,
2957 line => $l, column => $c,
2958 };
2959 } # X
2960 } elsif (0x0030 <= $self->{next_char} and
2961 $self->{next_char} <= 0x0039) { # 0..9
2962 my $code = $self->{next_char} - 0x0030;
2963 !!!next-input-character;
2964
2965 while (0x0030 <= $self->{next_char} and
2966 $self->{next_char} <= 0x0039) { # 0..9
2967 !!!cp (1012);
2968 $code *= 10;
2969 $code += $self->{next_char} - 0x0030;
2970
2971 !!!next-input-character;
2972 }
2973
2974 if ($self->{next_char} == 0x003B) { # ;
2975 !!!cp (1013);
2976 !!!next-input-character;
2977 } else {
2978 !!!cp (1014);
2979 !!!parse-error (type => 'no refc', line => $l, column => $c);
2980 }
2981
2982 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2983 !!!cp (1015);
2984 !!!parse-error (type => 'invalid character reference',
2985 text => (sprintf 'U+%04X', $code),
2986 line => $l, column => $c);
2987 $code = 0xFFFD;
2988 } elsif ($code > 0x10FFFF) {
2989 !!!cp (1016);
2990 !!!parse-error (type => 'invalid character reference',
2991 text => (sprintf 'U-%08X', $code),
2992 line => $l, column => $c);
2993 $code = 0xFFFD;
2994 } elsif ($code == 0x000D) {
2995 !!!cp (1017);
2996 !!!parse-error (type => 'CR character reference',
2997 line => $l, column => $c);
2998 $code = 0x000A;
2999 } elsif (0x80 <= $code and $code <= 0x9F) {
3000 !!!cp (1018);
3001 !!!parse-error (type => 'C1 character reference',
3002 text => (sprintf 'U+%04X', $code),
3003 line => $l, column => $c);
3004 $code = $c1_entity_char->{$code};
3005 }
3006
3007 return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1,
3008 line => $l, column => $c,
3009 };
3010 } else {
3011 !!!cp (1019);
3012 !!!parse-error (type => 'bare nero', line => $l, column => $c);
3013 !!!back-next-input-character ($self->{next_char});
3014 $self->{next_char} = 0x0023; # #
3015 return undef;
3016 }
3017 } elsif ((0x0041 <= $self->{next_char} and
3018 $self->{next_char} <= 0x005A) or
3019 (0x0061 <= $self->{next_char} and
3020 $self->{next_char} <= 0x007A)) {
3021 my $entity_name = chr $self->{next_char};
3022 !!!next-input-character;
3023
3024 my $value = $entity_name;
3025 my $match = 0;
3026 require Whatpm::_NamedEntityList;
3027 our $EntityChar;
3028
3029 while (length $entity_name < 30 and
3030 ## NOTE: Some number greater than the maximum length of entity name
3031 ((0x0041 <= $self->{next_char} and # a
3032 $self->{next_char} <= 0x005A) or # x
3033 (0x0061 <= $self->{next_char} and # a
3034 $self->{next_char} <= 0x007A) or # z
3035 (0x0030 <= $self->{next_char} and # 0
3036 $self->{next_char} <= 0x0039) or # 9
3037 $self->{next_char} == 0x003B)) { # ;
3038 $entity_name .= chr $self->{next_char};
3039 if (defined $EntityChar->{$entity_name}) {
3040 if ($self->{next_char} == 0x003B) { # ;
3041 !!!cp (1020);
3042 $value = $EntityChar->{$entity_name};
3043 $match = 1;
3044 !!!next-input-character;
3045 last;
3046 } else {
3047 !!!cp (1021);
3048 $value = $EntityChar->{$entity_name};
3049 $match = -1;
3050 !!!next-input-character;
3051 }
3052 } else {
3053 !!!cp (1022);
3054 $value .= chr $self->{next_char};
3055 $match *= 2;
3056 !!!next-input-character;
3057 }
3058 }
3059
3060 if ($match > 0) {
3061 !!!cp (1023);
3062 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
3063 line => $l, column => $c,
3064 };
3065 } elsif ($match < 0) {
3066 !!!parse-error (type => 'no refc', line => $l, column => $c);
3067 if ($in_attr and $match < -1) {
3068 !!!cp (1024);
3069 return {type => CHARACTER_TOKEN, data => '&'.$entity_name,
3070 line => $l, column => $c,
3071 };
3072 } else {
3073 !!!cp (1025);
3074 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
3075 line => $l, column => $c,
3076 };
3077 }
3078 } else {
3079 !!!cp (1026);
3080 !!!parse-error (type => 'bare ero', line => $l, column => $c);
3081 ## NOTE: "No characters are consumed" in the spec.
3082 return {type => CHARACTER_TOKEN, data => '&'.$value,
3083 line => $l, column => $c,
3084 };
3085 }
3086 } else {
3087 !!!cp (1027);
3088 ## no characters are consumed
3089 !!!parse-error (type => 'bare ero', line => $l, column => $c);
3090 return undef;
3091 }
3092 } # _tokenize_attempt_to_consume_an_entity
3093
## Prepares the document object for the tree construction stage.
##
## NOTE: |$self->{document}| MUST be set before this method is invoked.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  ## Strict error checking is turned off on the document.
  $doc->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $doc->manakai_is_html (1); # MUST

  ## Source position (line 1, column 1) attached to the document node.
  $doc->set_user_data (manakai_source_line => 1);
  $doc->set_user_data (manakai_source_column => 1);
} # _initialize_tree_constructor
3104
## Turns strict error checking on the document back on.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};
  $doc->strict_error_checking (1);
  ## TODO: Turn mutation events on
} # _terminate_tree_constructor
3110
3111 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
3112
3113 { # tree construction stage
3114 my $token;
3115
## Entry point of the tree construction stage: fetches the first
## token, resets the tree construction state kept on |$self| and then
## runs the insertion modes one after another.
sub _construct_tree ($) {
  my $self = shift;

  ## NOTE: It is not defined when an interactive UA makes
  ## |$self->{document}| available to the user, nor when it starts
  ## accepting user input.

  ## NOTE: To "append a character", collect the character together
  ## with all subsequent consecutive characters and insert a single
  ## Text node whose data is their concatenation. # MUST

  !!!next-token;

  $self->{form_element} = undef;
  $self->{head_element} = undef;
  $self->{inner_html_node} = undef;
  $self->{open_elements} = [];

  ## NOTE: The "initial" insertion mode.
  $self->_tree_construction_initial; # MUST

  ## NOTE: The "before html" insertion mode.
  $self->_tree_construction_root_element;
  $self->{insertion_mode} = BEFORE_HEAD_IM;

  ## NOTE: The "before head" insertion mode and so on.
  $self->_tree_construction_main;
} # _construct_tree
3144
## Processes tokens in the "initial" insertion mode.
##
## A DOCTYPE token is converted into a document type definition node
## appended to |$self->{document}|, and its name, public identifier
## and system identifier select the compat mode of the document.
## Leading white space is dropped and comments are appended to the
## document.  Any other token puts the document into the quirks mode
## and is left in |$token| so that the caller reprocesses it in the
## "before html" insertion mode.
sub _tree_construction_initial ($) {
  my $self = shift;

  ## NOTE: "initial" insertion mode

  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!cp ('t1');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif ($doctype_name ne 'HTML') {
        !!!cp ('t2');
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5', token => $token);
      } else {
        !!!cp ('t3');
      }

      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      ## NOTE: Default value for both |public_id| and |system_id| attributes
      ## are empty strings, so that we don't set any value in missing cases.
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if ($token->{quirks} or $doctype_name ne 'HTML') {
        !!!cp ('t4');
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## The public identifier is compared ASCII case-insensitively,
        ## so it is uppercased to match the literals below.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        my $prefix = [
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//",
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//IETF//DTD HTML 2.0 LEVEL 1//",
          "-//IETF//DTD HTML 2.0 LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT//",
          "-//IETF//DTD HTML 2.0//",
          "-//IETF//DTD HTML 2.1E//",
          "-//IETF//DTD HTML 3.0//",
          "-//IETF//DTD HTML 3.2 FINAL//",
          "-//IETF//DTD HTML 3.2//",
          "-//IETF//DTD HTML 3//",
          "-//IETF//DTD HTML LEVEL 0//",
          "-//IETF//DTD HTML LEVEL 1//",
          "-//IETF//DTD HTML LEVEL 2//",
          "-//IETF//DTD HTML LEVEL 3//",
          "-//IETF//DTD HTML STRICT LEVEL 0//",
          "-//IETF//DTD HTML STRICT LEVEL 1//",
          "-//IETF//DTD HTML STRICT LEVEL 2//",
          "-//IETF//DTD HTML STRICT LEVEL 3//",
          "-//IETF//DTD HTML STRICT//",
          "-//IETF//DTD HTML//",
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//",
          "-//NETSCAPE COMM. CORP.//DTD HTML//",
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//",
          "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//",
          "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//",
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//",
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//",
          "-//W3C//DTD HTML 3 1995-03-24//",
          "-//W3C//DTD HTML 3.2 DRAFT//",
          "-//W3C//DTD HTML 3.2 FINAL//",
          "-//W3C//DTD HTML 3.2//",
          "-//W3C//DTD HTML 3.2S DRAFT//",
          "-//W3C//DTD HTML 4.0 FRAMESET//",
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//",
          "-//W3C//DTD HTML EXPERIMETNAL 19960712//",
          "-//W3C//DTD HTML EXPERIMENTAL 970421//",
          "-//W3C//DTD W3 HTML//",
          "-//W3O//DTD W3 HTML 3.0//",
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//",
          "-//WEBTECHS//DTD MOZILLA HTML//",
        ]; # $prefix
        ## Does the public identifier begin with one of the prefixes?
        my $match;
        for (@$prefix) {
          if (substr ($pubid, 0, length $_) eq $_) {
            $match = 1;
            last;
          }
        }
        if ($match or
            $pubid eq "-//W3O//DTD W3 HTML STRICT 3.0//EN//" or
            $pubid eq "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" or
            $pubid eq "HTML") {
          !!!cp ('t5');
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid =~ m[^-//W3C//DTD HTML 4.01 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD HTML 4.01 TRANSITIONAL//]) {
          ## NOTE: For these two public identifiers the document is in
          ## the quirks mode only when the system identifier is
          ## missing; with a system identifier it is in the limited
          ## quirks mode.
          if (not defined $token->{system_identifier}) {
            !!!cp ('t6');
            $self->{document}->manakai_compat_mode ('quirks');
          } else {
            !!!cp ('t7');
            $self->{document}->manakai_compat_mode ('limited quirks');
          }
        } elsif ($pubid =~ m[^-//W3C//DTD XHTML 1.0 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD XHTML 1.0 TRANSITIONAL//]) {
          !!!cp ('t8');
          $self->{document}->manakai_compat_mode ('limited quirks');
        } else {
          !!!cp ('t9');
        }
      } else {
        !!!cp ('t10');
      }
      if (defined $token->{system_identifier}) {
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          ## NOTE: Ensure that |PUBLIC "(limited quirks)" "(quirks)"| is
          ## marked as quirks.
          $self->{document}->manakai_compat_mode ('quirks');
          !!!cp ('t11');
        } else {
          !!!cp ('t12');
        }
      } else {
        !!!cp ('t13');
      }

      ## Go to the "before html" insertion mode.
      !!!next-token;
      return;
    } elsif ({
              START_TAG_TOKEN, 1,
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t14');
      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      !!!ack-later;
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          !!!cp ('t15');
          ## Stay in the insertion mode.
          !!!next-token;
          redo INITIAL;
        } else {
          !!!cp ('t16');
        }
      } else {
        !!!cp ('t17');
      }

      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t18');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the insertion mode.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL

  die "$0: _tree_construction_initial: This should be never reached";
} # _tree_construction_initial
3344
## Processes tokens in the "before html" insertion mode.
##
## DOCTYPE tokens are reported and ignored, comments are appended to
## the document and leading white space is dropped.  An |html| start
## tag creates the root element from the token; any other token
## creates an implied |html| element and is left in |$token| to be
## reprocessed by the caller.  In both cases the root element is
## pushed onto the stack of open elements and the
## |application_cache_selection| callback is invoked exactly once.
sub _tree_construction_root_element ($) {
  my $self = shift;

  ## NOTE: "before html" insertion mode.

  B: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      !!!cp ('t19');
      !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
      ## Ignore the token
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t20');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token.

        unless (length $token->{data}) {
          !!!cp ('t21');
          ## Stay in the insertion mode.
          !!!next-token;
          redo B;
        } else {
          !!!cp ('t22');
        }
      } else {
        !!!cp ('t23');
      }

      ## NOTE: The application cache selection callback is invoked
      ## after the implied root element has been created below, as
      ## for the other tokens handled by the "anything else" steps.

      #
    } elsif ($token->{type} == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html') {
        my $root_element;
        !!!create-element ($root_element, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}},
            [$root_element, $el_category->{html}];

        if ($token->{attributes}->{manifest}) {
          !!!cp ('t24');
          $self->{application_cache_selection}
              ->($token->{attributes}->{manifest}->{value});
          ## ISSUE: Spec is unclear on relative references.
          ## According to Hixie (#whatwg 2008-03-19), it should be
          ## resolved against the base URI of the document in HTML
          ## or xml:base of the element in XHTML.
        } else {
          !!!cp ('t25');
          $self->{application_cache_selection}->(undef);
        }

        !!!nack ('t25c');

        !!!next-token;
        return; ## Go to the "before head" insertion mode.
      } else {
        !!!cp ('t25.1');
        #
      }
    } elsif ({
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t26');
      #
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## "Anything else": create the implied |html| element.
    my $root_element;
    !!!create-element ($root_element, $HTML_NS, 'html',, $token);
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, $el_category->{html}];

    ## No manifest is available for an implied root element.
    $self->{application_cache_selection}->(undef);

    ## NOTE: Reprocess the token.
    !!!ack-later;
    return; ## Go to the "before head" insertion mode.

    ## ISSUE: There is an issue in the spec
  } # B

  die "$0: _tree_construction_root_element: This should never be reached";
} # _tree_construction_root_element
3439
## Resets the insertion mode appropriately: walks the stack of open
## elements from the current node upwards and sets
## |$self->{insertion_mode}| according to the first node that
## determines a mode.
##
## Dies if the bottom of the stack is reached while
## |$self->{inner_html_node}| is not defined.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        !!!cp ('t28');
        $node = $self->{inner_html_node};
      } else {
        die "_reset_insertion_mode: t27";
      }
    }

    ## Step 4..14
    my $new_mode;
    if ($node->[1] & FOREIGN_EL) {
      !!!cp ('t28.1');
      ## NOTE: Strictly speaking, the line below only applies to MathML and
      ## SVG elements.  Currently the HTML syntax supports only MathML and
      ## SVG elements as foreigners.
      $new_mode = IN_BODY_IM | IN_FOREIGN_CONTENT_IM;
    } elsif ($node->[1] & TABLE_CELL_EL) {
      if ($last) {
        !!!cp ('t28.2');
        #
      } else {
        !!!cp ('t28.3');
        $new_mode = IN_CELL_IM;
      }
    } else {
      !!!cp ('t28.4');
      $new_mode = {
        select => IN_SELECT_IM,
        ## NOTE: |option| and |optgroup| do not set
        ## insertion mode to "in select" by themselves.
        tr => IN_ROW_IM,
        tbody => IN_TABLE_BODY_IM,
        thead => IN_TABLE_BODY_IM,
        tfoot => IN_TABLE_BODY_IM,
        caption => IN_CAPTION_IM,
        colgroup => IN_COLUMN_GROUP_IM,
        table => IN_TABLE_IM,
        head => IN_BODY_IM, # not in head!
        body => IN_BODY_IM,
        frameset => IN_FRAMESET_IM,
      }->{$node->[0]->manakai_local_name};
    }
    ## NOTE: Assignment and return are separate statements so that
    ## the return does not depend on the truth value of the mode.
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 15
    if ($node->[1] & HTML_EL) {
      unless (defined $self->{head_element}) {
        !!!cp ('t29');
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        ## ISSUE: Can this state be reached?
        !!!cp ('t30');
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    } else {
      !!!cp ('t31');
    }

    ## Step 16
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 17
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 18
    redo S3;
  } # S3

  die "$0: _reset_insertion_mode: This line should never be reached";
} # _reset_insertion_mode
3526
3527 sub _tree_construction_main ($) {
3528 my $self = shift;
3529
3530 my $active_formatting_elements = [];
3531
## Reconstructs the active formatting elements. # MUST
##
## The only argument is a code reference that is called with each
## newly cloned element node to insert it into the tree.
my $reconstruct_active_formatting_elements = sub {
  my ($insert_clone) = @_;

  ## Step 1: An empty list needs no work.
  return unless @$active_formatting_elements;

  ## Step 3: Begin with the last entry of the list.
  my $pos = -1;
  my $entry = $active_formatting_elements->[$pos];

  ## Step 2: Nothing to do if that entry is a marker or is found in
  ## the stack of open elements.
  return if $entry->[0] eq '#marker';
  for my $open (@{$self->{open_elements}}) {
    if ($entry->[0] eq $open->[0]) {
      !!!cp ('t32');
      return;
    }
  }

  REWIND: {
    ## Step 4: Stop rewinding at the first entry of the list.
    last REWIND if $active_formatting_elements->[0]->[0] eq $entry->[0];

    ## Step 5: Move to the previous entry.
    $pos--;
    $entry = $active_formatting_elements->[$pos];

    ## Step 6: Keep rewinding while the entry is neither a marker
    ## nor in the stack of open elements.
    if ($entry->[0] eq '#marker') {
      !!!cp ('t33_1');
    } else {
      my $still_open = 0;
      for my $open (@{$self->{open_elements}}) {
        next unless $entry->[0] eq $open->[0];
        !!!cp ('t33');
        $still_open = 1;
        last;
      }
      unless ($still_open) {
        ## NOTE: <!DOCTYPE HTML><p><b><i><u></p> <p>X
        !!!cp ('t35');
        redo REWIND;
      }
      !!!cp ('t34');
    }

    ## Step 7: Advance to the entry following the one just examined.
    $pos++;
    $entry = $active_formatting_elements->[$pos];
  } # REWIND

  RECREATE: {
    ## Step 8: Shallow clone of the element; the category is shared.
    my $clone = [$entry->[0]->clone_node (0), $entry->[1]];

    ## Step 9: Insert the clone and make it the current node.
    $insert_clone->($clone->[0]);
    push @{$self->{open_elements}}, $clone;

    ## Step 10: The list entry now refers to the clone.
    $active_formatting_elements->[$pos] = $clone;

    ## Step 11: Repeat until the last entry has been recreated.
    if ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
      !!!cp ('t37');
    } else {
      !!!cp ('t36');
      ## Step 7'
      $pos++;
      $entry = $active_formatting_elements->[$pos];

      redo RECREATE;
    }
  } # RECREATE
}; # $reconstruct_active_formatting_elements
3611
## Clears the list of active formatting elements up to the last
## marker: the last marker and every entry after it are removed.  The
## list is left untouched if it contains no marker.
my $clear_up_to_marker = sub {
  my $idx = $#$active_formatting_elements;
  while ($idx >= 0) {
    if ($active_formatting_elements->[$idx]->[0] eq '#marker') {
      !!!cp ('t38');
      splice @$active_formatting_elements, $idx;
      return;
    }
    $idx--;
  }

  !!!cp ('t39');
}; # $clear_up_to_marker
3623
3624 my $insert;
3625
## Generic CDATA/RCDATA element parsing.  The argument is the content
## model flag (|CDATA_CONTENT_MODEL| or |RCDATA_CONTENT_MODEL|) used
## while the content of the element for the current start tag token
## is tokenized.
my $parse_rcdata = sub ($) {
  my $content_model_flag = shift;

  ## Step 1: Create the element for the current start tag token.
  my $start_tag_name = $token->{tag_name};
  my $el;
  !!!create-element ($el, $HTML_NS, $start_tag_name, $token->{attributes}, $token);

  ## Step 2
  $insert->($el);

  ## Step 3
  $self->{content_model} = $content_model_flag; # CDATA or RCDATA
  delete $self->{escape}; # MUST

  ## Step 4: Gather the data of all consecutive character tokens.
  my @chunks;
  !!!nack ('t40.1');
  !!!next-token;
  while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
    !!!cp ('t40');
    push @chunks, $token->{data};
    !!!next-token;
  }
  my $text = join '', @chunks;

  ## Step 5: A Text node is appended only for non-empty content.
  if (length $text) {
    !!!cp ('t41');
    my $text_node = $self->{document}->create_text_node ($text);
    $el->append_child ($text_node);
  }

  ## Step 6
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## Step 7
  if ($token->{type} == END_TAG_TOKEN and
      $token->{tag_name} eq $start_tag_name) {
    !!!cp ('t42');
    ## Ignore the token
  } elsif ($content_model_flag == CDATA_CONTENT_MODEL) {
    ## NOTE: An end-of-file token.
    !!!cp ('t43');
    !!!parse-error (type => 'in CDATA:#eof', token => $token);
  } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
    ## NOTE: An end-of-file token.
    !!!cp ('t44');
    !!!parse-error (type => 'in RCDATA:#eof', token => $token);
  } else {
    die "$0: $content_model_flag in parse_rcdata";
  }
  !!!next-token;
}; # $parse_rcdata
3680
## Handles a |script| start tag: creates the element from the current
## token, collects the following character tokens as its text in the
## CDATA content model and, unless |$self->{inner_html_node}| is
## defined, inserts the element with |$insert|.
my $script_start_tag = sub () {
  my $script_el;
  !!!create-element ($script_el, $HTML_NS, 'script', $token->{attributes}, $token);
  ## TODO: mark as "parser-inserted"

  $self->{content_model} = CDATA_CONTENT_MODEL;
  delete $self->{escape}; # MUST

  ## Gather character data until a non-character token arrives or
  ## the tokenizer stops tokenising.
  my @chunks;
  !!!nack ('t45.1');
  !!!next-token;
  while ($token->{type} == CHARACTER_TOKEN) {
    !!!cp ('t45');
    push @chunks, $token->{data};
    !!!next-token;
  }
  my $text = join '', @chunks;
  if (length $text) {
    !!!cp ('t46');
    $script_el->manakai_append_text ($text);
  }

  $self->{content_model} = PCDATA_CONTENT_MODEL;

  unless ($token->{type} == END_TAG_TOKEN and
          $token->{tag_name} eq 'script') {
    !!!cp ('t48');
    !!!parse-error (type => 'in CDATA:#eof', token => $token);
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  } else {
    !!!cp ('t47');
    ## Ignore the token
  }

  unless (defined $self->{inner_html_node}) {
    !!!cp ('t50');
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    $insert->($script_el);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  } else {
    !!!cp ('t49');
    ## TODO: mark as "already executed"
  }

  !!!next-token;
}; # $script_start_tag
3732
3733 ## NOTE: $open_tables->[-1]->[0] is the "current table" element node.
3734 ## NOTE: $open_tables->[-1]->[1] is the "tainted" flag.
3735 my $open_tables = [[$self->{open_elements}->[0]->[0]]];
3736
## Handles the end tag of a formatting element.  The argument is the
## end tag token.
##
## NOTE: This is the adoption agency algorithm (AAA).  The algorithm
## is repeated (|redo FET|) until one of its steps returns.
my $formatting_end_tag = sub {
  my $end_tag_token = shift;
  my $tag_name = $end_tag_token->{tag_name};

  FET: {
    ## Step 1: Find the formatting element, i.e. the last entry after
    ## the last marker in the list of active formatting elements
    ## whose local name is the tag name.
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq '#marker') {
        !!!cp ('t52');
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0]->manakai_local_name
                   eq $tag_name) {
        !!!cp ('t51');
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!cp ('t53');
      !!!parse-error (type => 'unmatched end tag', text => $tag_name, token => $end_tag_token);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          !!!cp ('t54');
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!cp ('t55');
          !!!parse-error (type => 'unmatched end tag',
                          text => $tag_name,
                          token => $end_tag_token);
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ($node->[1] & SCOPING_EL) {
        !!!cp ('t56');
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!cp ('t57');
      !!!parse-error (type => 'unmatched end tag',
                      text => $tag_name,
                      token => $end_tag_token);
      ## The formatting element is in the list but not in the stack
      ## of open elements: remove that very entry from the list (it
      ## is not necessarily the last entry).
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!cp ('t58');
      !!!parse-error (type => 'not closed',
                      text => $self->{open_elements}->[-1]->[0]
                          ->manakai_local_name,
                      token => $end_tag_token);
    }

    ## Step 2: The furthest block is the special or scoping
    ## non-formatting element that follows the formatting element
    ## most closely in the stack of open elements.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not ($node->[1] & FORMATTING_EL) and
          #not $phrasing_category->{$node->[1]} and
          ($node->[1] & SPECIAL_EL or
           $node->[1] & SCOPING_EL)) { ## Scoping is redundant, maybe
        !!!cp ('t59');
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        !!!cp ('t60');
        last OE;
      }
    } # OE

    ## Step 3: Without a furthest block, pop the stack up to and
    ## including the formatting element and drop it from the list.
    unless (defined $furthest_block) { # MUST
      !!!cp ('t61');
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      !!!cp ('t62');
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## NOTE(review): When the formatting element is the first entry
    ## of the list, the index below is -1 and thus designates the
    ## last entry of the list -- confirm that this is intended.
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
            ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2: A node that is not in the list of active formatting
      ## elements is removed from the stack of open elements.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            !!!cp ('t63');
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        !!!cp ('t64');
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5: A node with children is replaced by a shallow clone
      ## in both the list and the stack.
      if ($node->[0]->has_child_nodes ()) {
        !!!cp ('t65');
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8: Foster-parent the last node if the common ancestor is
    ## a table, table section or row element.
    if ($common_ancestor_node->[1] & TABLE_ROWS_EL) {
      my $foster_parent_element;
      my $next_sibling;
      OE: for (reverse 0..$#{$self->{open_elements}}) {
        if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
          my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
          if (defined $parent and $parent->node_type == 1) {
            !!!cp ('t65.1');
            $foster_parent_element = $parent;
            $next_sibling = $self->{open_elements}->[$_]->[0];
          } else {
            !!!cp ('t65.2');
            $foster_parent_element
                = $self->{open_elements}->[$_ - 1]->[0];
          }
          last OE;
        }
      } # OE
      $foster_parent_element = $self->{open_elements}->[0]->[0]
          unless defined $foster_parent_element;
      $foster_parent_element->insert_before ($last_node->[0], $next_sibling);
      $open_tables->[-1]->[1] = 1; # tainted
    } else {
      !!!cp ('t65.3');
      $common_ancestor_node->[0]->append_child ($last_node->[0]);
    }

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10: Move the children of the furthest block to the clone.
    ## The child list is copied first since it changes while moving.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12: Remove the formatting element from the list of active
    ## formatting elements and insert the clone at the bookmark.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t66');
        splice @$active_formatting_elements, $_, 1;
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        !!!cp ('t67');
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13: Remove the formatting element from the stack of open
    ## elements and insert the clone immediately after the furthest
    ## block.  No existing entry of the stack is replaced.
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t68');
        splice @{$self->{open_elements}}, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        !!!cp ('t69');
        $i = $_;
      }
    } # OE
    splice @{$self->{open_elements}}, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
3971
## Default insertion: appends the given node to the current node
## (the last entry of the stack of open elements).
$insert = my $insert_to_current = sub {
  my ($child) = @_;
  my $current = $self->{open_elements}->[-1];
  $current->[0]->append_child ($child);
}; # $insert_to_current
3975
## Inserts the given node with foster parenting: if the current node
## is flagged |TABLE_ROWS_EL|, the node is inserted at the foster
## parent location and the current table is marked as tainted;
## otherwise it is simply appended to the current node.
my $insert_to_foster = sub {
  my ($child) = @_;
  my $open = $self->{open_elements};

  unless ($open->[-1]->[1] & TABLE_ROWS_EL) {
    !!!cp ('t72');
    return $open->[-1]->[0]->append_child ($child);
  }

  # MUST
  my $foster_parent_element;
  my $next_sibling;
  ## Locate the last |TABLE_EL| entry in the stack of open elements.
  for (my $j = $#$open; $j >= 0; $j--) {
    next unless $open->[$j]->[1] & TABLE_EL;

    my $parent = $open->[$j]->[0]->parent_node;
    if (defined $parent and $parent->node_type == 1) {
      ## The table has an element parent: insert just before the table.
      !!!cp ('t70');
      $foster_parent_element = $parent;
      $next_sibling = $open->[$j]->[0];
    } else {
      ## Otherwise use the stack entry preceding the table.
      !!!cp ('t71');
      $foster_parent_element = $open->[$j - 1]->[0];
    }
    last;
  }
  ## No table in the stack: fall back to the first stack entry.
  $foster_parent_element = $open->[0]->[0]
      unless defined $foster_parent_element;
  $foster_parent_element->insert_before ($child, $next_sibling);
  $open_tables->[-1]->[1] = 1; # tainted
}; # $insert_to_foster
4007
4008 B: while (1) {
4009 if ($token->{type} == DOCTYPE_TOKEN) {
4010 !!!cp ('t73');
4011 !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
4012 ## Ignore the token
4013 ## Stay in the phase
4014 !!!next-token;
4015 next B;
4016 } elsif ($token->{type} == START_TAG_TOKEN and
4017 $token->{tag_name} eq 'html') {
4018 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4019 !!!cp ('t79');
4020 !!!parse-error (type => 'after html', text => 'html', token => $token);
4021 $self->{insertion_mode} = AFTER_BODY_IM;
4022 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4023 !!!cp ('t80');
4024 !!!parse-error (type => 'after html', text => 'html', token => $token);
4025 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4026 } else {
4027 !!!cp ('t81');
4028 }
4029
4030 !!!cp ('t82');
4031 !!!parse-error (type => 'not first start tag', token => $token);
4032 my $top_el = $self->{open_elements}->[0]->[0];
4033 for my $attr_name (keys %{$token->{attributes}}) {
4034 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
4035 !!!cp ('t84');
4036 $top_el->set_attribute_ns
4037 (undef, [undef, $attr_name],
4038 $token->{attributes}->{$attr_name}->{value});
4039 }
4040 }
4041 !!!nack ('t84.1');
4042 !!!next-token;
4043 next B;
4044 } elsif ($token->{type} == COMMENT_TOKEN) {
4045 my $comment = $self->{document}->create_comment ($token->{data});
4046 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
4047 !!!cp ('t85');
4048 $self->{document}->append_child ($comment);
4049 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
4050 !!!cp ('t86');
4051 $self->{open_elements}->[0]->[0]->append_child ($comment);
4052 } else {
4053 !!!cp ('t87');
4054 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4055 }
4056 !!!next-token;
4057 next B;
4058 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
4059 if ($token->{type} == CHARACTER_TOKEN) {
4060 !!!cp ('t87.1');
4061 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4062 !!!next-token;
4063 next B;
4064 } elsif ($token->{type} == START_TAG_TOKEN) {
4065 if ((not {mglyph => 1, malignmark => 1}->{$token->{tag_name}} and
4066 $self->{open_elements}->[-1]->[1] & FOREIGN_FLOW_CONTENT_EL) or
4067 not ($self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
4068 ($token->{tag_name} eq 'svg' and
4069 $self->{open_elements}->[-1]->[1] & MML_AXML_EL)) {
4070 ## NOTE: "using the rules for secondary insertion mode" then "continue"
4071 !!!cp ('t87.2');
4072 #
4073 } elsif ({
4074 b => 1, big => 1, blockquote => 1, body => 1, br => 1,
4075 center => 1, code => 1, dd => 1, div => 1, dl => 1, dt => 1,
4076 em => 1, embed => 1, font => 1, h1 => 1, h2 => 1, h3 => 1,
4077 h4 => 1, h5 => 1, h6 => 1, head => 1, hr => 1, i => 1,
4078 img => 1, li => 1, listing => 1, menu => 1, meta => 1,
4079 nobr => 1, ol => 1, p => 1, pre => 1, ruby => 1, s => 1,
4080 small => 1, span => 1, strong => 1, strike => 1, sub => 1,
4081 sup => 1, table => 1, tt => 1, u => 1, ul => 1, var => 1,
4082 }->{$token->{tag_name}}) {
4083 !!!cp ('t87.2');
4084 !!!parse-error (type => 'not closed',
4085 text => $self->{open_elements}->[-1]->[0]
4086 ->manakai_local_name,
4087 token => $token);
4088
4089 pop @{$self->{open_elements}}
4090 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4091
4092 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4093 ## Reprocess.
4094 next B;
4095 } else {
4096 my $nsuri = $self->{open_elements}->[-1]->[0]->namespace_uri;
4097 my $tag_name = $token->{tag_name};
4098 if ($nsuri eq $SVG_NS) {
4099 $tag_name = {
4100 altglyph => 'altGlyph',
4101 altglyphdef => 'altGlyphDef',
4102 altglyphitem => 'altGlyphItem',
4103 animatecolor => 'animateColor',
4104 animatemotion => 'animateMotion',
4105 animatetransform => 'animateTransform',
4106 clippath => 'clipPath',
4107 feblend => 'feBlend',
4108 fecolormatrix => 'feColorMatrix',
4109 fecomponenttransfer => 'feComponentTransfer',
4110 fecomposite => 'feComposite',
4111 feconvolvematrix => 'feConvolveMatrix',
4112 fediffuselighting => 'feDiffuseLighting',
4113 fedisplacementmap => 'feDisplacementMap',
4114 fedistantlight => 'feDistantLight',
4115 feflood => 'feFlood',
4116 fefunca => 'feFuncA',
4117 fefuncb => 'feFuncB',
4118 fefuncg => 'feFuncG',
4119 fefuncr => 'feFuncR',
4120 fegaussianblur => 'feGaussianBlur',
4121 feimage => 'feImage',
4122 femerge => 'feMerge',
4123 femergenode => 'feMergeNode',
4124 femorphology => 'feMorphology',
4125 feoffset => 'feOffset',
4126 fepointlight => 'fePointLight',
4127 fespecularlighting => 'feSpecularLighting',
4128 fespotlight => 'feSpotLight',
4129 fetile => 'feTile',
4130 feturbulence => 'feTurbulence',
4131 foreignobject => 'foreignObject',
4132 glyphref => 'glyphRef',
4133 lineargradient => 'linearGradient',
4134 radialgradient => 'radialGradient',
4135 #solidcolor => 'solidColor', ## NOTE: Commented in spec (SVG1.2)
4136 textpath => 'textPath',
4137 }->{$tag_name} || $tag_name;
4138 }
4139
4140 ## "adjust SVG attributes" (SVG only) - done in insert-element-f
4141
4142 ## "adjust foreign attributes" - done in insert-element-f
4143
4144 !!!insert-element-f ($nsuri, $tag_name, $token->{attributes}, $token);
4145
4146 if ($self->{self_closing}) {
4147 pop @{$self->{open_elements}};
4148 !!!ack ('t87.3');
4149 } else {
4150 !!!cp ('t87.4');
4151 }
4152
4153 !!!next-token;
4154 next B;
4155 }
4156 } elsif ($token->{type} == END_TAG_TOKEN) {
4157 ## NOTE: "using the rules for secondary insertion mode" then "continue"
4158 !!!cp ('t87.5');
4159 #
4160 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4161 !!!cp ('t87.6');
4162 !!!parse-error (type => 'not closed',
4163 text => $self->{open_elements}->[-1]->[0]
4164 ->manakai_local_name,
4165 token => $token);
4166
4167 pop @{$self->{open_elements}}
4168 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4169
4170 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4171 ## Reprocess.
4172 next B;
4173 } else {
4174 die "$0: $token->{type}: Unknown token type";
4175 }
4176 }
4177
4178 if ($self->{insertion_mode} & HEAD_IMS) {
4179 if ($token->{type} == CHARACTER_TOKEN) {
4180 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4181 unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4182 !!!cp ('t88.2');
4183 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4184 } else {
4185 !!!cp ('t88.1');
4186 ## Ignore the token.
4187 !!!next-token;
4188 next B;
4189 }
4190 unless (length $token->{data}) {
4191 !!!cp ('t88');
4192 !!!next-token;
4193 next B;
4194 }
4195 }
4196
4197 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4198 !!!cp ('t89');
4199 ## As if <head>
4200 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4201 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4202 push @{$self->{open_elements}},
4203 [$self->{head_element}, $el_category->{head}];
4204
4205 ## Reprocess in the "in head" insertion mode...
4206 pop @{$self->{open_elements}};
4207
4208 ## Reprocess in the "after head" insertion mode...
4209 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4210 !!!cp ('t90');
4211 ## As if </noscript>
4212 pop @{$self->{open_elements}};
4213 !!!parse-error (type => 'in noscript:#text', token => $token);
4214
4215 ## Reprocess in the "in head" insertion mode...
4216 ## As if </head>
4217 pop @{$self->{open_elements}};
4218
4219 ## Reprocess in the "after head" insertion mode...
4220 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4221 !!!cp ('t91');
4222 pop @{$self->{open_elements}};
4223
4224 ## Reprocess in the "after head" insertion mode...
4225 } else {
4226 !!!cp ('t92');
4227 }
4228
4229 ## "after head" insertion mode
4230 ## As if <body>
4231 !!!insert-element ('body',, $token);
4232 $self->{insertion_mode} = IN_BODY_IM;
4233 ## reprocess
4234 next B;
4235 } elsif ($token->{type} == START_TAG_TOKEN) {
4236 if ($token->{tag_name} eq 'head') {
4237 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4238 !!!cp ('t93');
4239 !!!create-element ($self->{head_element}, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
4240 $self->{open_elements}->[-1]->[0]->append_child
4241 ($self->{head_element});
4242 push @{$self->{open_elements}},
4243 [$self->{head_element}, $el_category->{head}];
4244 $self->{insertion_mode} = IN_HEAD_IM;
4245 !!!nack ('t93.1');
4246 !!!next-token;
4247 next B;
4248 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4249 !!!cp ('t93.2');
4250 !!!parse-error (type => 'after head', text => 'head',
4251 token => $token);
4252 ## Ignore the token
4253 !!!nack ('t93.3');
4254 !!!next-token;
4255 next B;
4256 } else {
4257 !!!cp ('t95');
4258 !!!parse-error (type => 'in head:head',
4259 token => $token); # or in head noscript
4260 ## Ignore the token
4261 !!!nack ('t95.1');
4262 !!!next-token;
4263 next B;
4264 }
4265 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4266 !!!cp ('t96');
4267 ## As if <head>
4268 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4269 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4270 push @{$self->{open_elements}},
4271 [$self->{head_element}, $el_category->{head}];
4272
4273 $self->{insertion_mode} = IN_HEAD_IM;
4274 ## Reprocess in the "in head" insertion mode...
4275 } else {
4276 !!!cp ('t97');
4277 }
4278
4279 if ($token->{tag_name} eq 'base') {
4280 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4281 !!!cp ('t98');
4282 ## As if </noscript>
4283 pop @{$self->{open_elements}};
4284 !!!parse-error (type => 'in noscript', text => 'base',
4285 token => $token);
4286
4287 $self->{insertion_mode} = IN_HEAD_IM;
4288 ## Reprocess in the "in head" insertion mode...
4289 } else {
4290 !!!cp ('t99');
4291 }
4292
4293 ## NOTE: There is a "as if in head" code clone.
4294 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4295 !!!cp ('t100');
4296 !!!parse-error (type => 'after head',
4297 text => $token->{tag_name}, token => $token);
4298 push @{$self->{open_elements}},
4299 [$self->{head_element}, $el_category->{head}];
4300 } else {
4301 !!!cp ('t101');
4302 }
4303 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4304 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4305 pop @{$self->{open_elements}} # <head>
4306 if $self->{insertion_mode} == AFTER_HEAD_IM;
4307 !!!nack ('t101.1');
4308 !!!next-token;
4309 next B;
4310 } elsif ($token->{tag_name} eq 'link') {
4311 ## NOTE: There is a "as if in head" code clone.
4312 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4313 !!!cp ('t102');
4314 !!!parse-error (type => 'after head',
4315 text => $token->{tag_name}, token => $token);
4316 push @{$self->{open_elements}},
4317 [$self->{head_element}, $el_category->{head}];
4318 } else {
4319 !!!cp ('t103');
4320 }
4321 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4322 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4323 pop @{$self->{open_elements}} # <head>
4324 if $self->{insertion_mode} == AFTER_HEAD_IM;
4325 !!!ack ('t103.1');
4326 !!!next-token;
4327 next B;
4328 } elsif ($token->{tag_name} eq 'meta') {
4329 ## NOTE: There is a "as if in head" code clone.
4330 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4331 !!!cp ('t104');
4332 !!!parse-error (type => 'after head',
4333 text => $token->{tag_name}, token => $token);
4334 push @{$self->{open_elements}},
4335 [$self->{head_element}, $el_category->{head}];
4336 } else {
4337 !!!cp ('t105');
4338 }
4339 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4340 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4341
4342 unless ($self->{confident}) {
4343 if ($token->{attributes}->{charset}) {
4344 !!!cp ('t106');
4345 ## NOTE: Whether the encoding is supported or not is handled
4346 ## in the {change_encoding} callback.
4347 $self->{change_encoding}
4348 ->($self, $token->{attributes}->{charset}->{value},
4349 $token);
4350
4351 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4352 ->set_user_data (manakai_has_reference =>
4353 $token->{attributes}->{charset}
4354 ->{has_reference});
4355 } elsif ($token->{attributes}->{content}) {
4356 if ($token->{attributes}->{content}->{value}
4357 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4358 [\x09-\x0D\x20]*=
4359 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4360 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
4361 !!!cp ('t107');
4362 ## NOTE: Whether the encoding is supported or not is handled
4363 ## in the {change_encoding} callback.
4364 $self->{change_encoding}
4365 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
4366 $token);
4367 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4368 ->set_user_data (manakai_has_reference =>
4369 $token->{attributes}->{content}
4370 ->{has_reference});
4371 } else {
4372 !!!cp ('t108');
4373 }
4374 }
4375 } else {
4376 if ($token->{attributes}->{charset}) {
4377 !!!cp ('t109');
4378 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4379 ->set_user_data (manakai_has_reference =>
4380 $token->{attributes}->{charset}
4381 ->{has_reference});
4382 }
4383 if ($token->{attributes}->{content}) {
4384 !!!cp ('t110');
4385 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4386 ->set_user_data (manakai_has_reference =>
4387 $token->{attributes}->{content}
4388 ->{has_reference});
4389 }
4390 }
4391
4392 pop @{$self->{open_elements}} # <head>
4393 if $self->{insertion_mode} == AFTER_HEAD_IM;
4394 !!!ack ('t110.1');
4395 !!!next-token;
4396 next B;
4397 } elsif ($token->{tag_name} eq 'title') {
4398 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4399 !!!cp ('t111');
4400 ## As if </noscript>
4401 pop @{$self->{open_elements}};
4402 !!!parse-error (type => 'in noscript', text => 'title',
4403 token => $token);
4404
4405 $self->{insertion_mode} = IN_HEAD_IM;
4406 ## Reprocess in the "in head" insertion mode...
4407 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4408 !!!cp ('t112');
4409 !!!parse-error (type => 'after head',
4410 text => $token->{tag_name}, token => $token);
4411 push @{$self->{open_elements}},
4412 [$self->{head_element}, $el_category->{head}];
4413 } else {
4414 !!!cp ('t113');
4415 }
4416
4417 ## NOTE: There is a "as if in head" code clone.
4418 my $parent = defined $self->{head_element} ? $self->{head_element}
4419 : $self->{open_elements}->[-1]->[0];
4420 $parse_rcdata->(RCDATA_CONTENT_MODEL);
4421 pop @{$self->{open_elements}} # <head>
4422 if $self->{insertion_mode} == AFTER_HEAD_IM;
4423 next B;
4424 } elsif ($token->{tag_name} eq 'style' or
4425 $token->{tag_name} eq 'noframes') {
4426 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
4427 ## insertion mode IN_HEAD_IM)
4428 ## NOTE: There is a "as if in head" code clone.
4429 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4430 !!!cp ('t114');
4431 !!!parse-error (type => 'after head',
4432 text => $token->{tag_name}, token => $token);
4433 push @{$self->{open_elements}},
4434 [$self->{head_element}, $el_category->{head}];
4435 } else {
4436 !!!cp ('t115');
4437 }
4438 $parse_rcdata->(CDATA_CONTENT_MODEL);
4439 pop @{$self->{open_elements}} # <head>
4440 if $self->{insertion_mode} == AFTER_HEAD_IM;
4441 next B;
4442 } elsif ($token->{tag_name} eq 'noscript') {
4443 if ($self->{insertion_mode} == IN_HEAD_IM) {
4444 !!!cp ('t116');
4445 ## NOTE: and scripting is disabled
4446 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4447 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
4448 !!!nack ('t116.1');
4449 !!!next-token;
4450 next B;
4451 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4452 !!!cp ('t117');
4453 !!!parse-error (type => 'in noscript', text => 'noscript',
4454 token => $token);
4455 ## Ignore the token
4456 !!!nack ('t117.1');
4457 !!!next-token;
4458 next B;
4459 } else {
4460 !!!cp ('t118');
4461 #
4462 }
4463 } elsif ($token->{tag_name} eq 'script') {
4464 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4465 !!!cp ('t119');
4466 ## As if </noscript>
4467 pop @{$self->{open_elements}};
4468 !!!parse-error (type => 'in noscript', text => 'script',
4469 token => $token);
4470
4471 $self->{insertion_mode} = IN_HEAD_IM;
4472 ## Reprocess in the "in head" insertion mode...
4473 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4474 !!!cp ('t120');
4475 !!!parse-error (type => 'after head',
4476 text => $token->{tag_name}, token => $token);
4477 push @{$self->{open_elements}},
4478 [$self->{head_element}, $el_category->{head}];
4479 } else {
4480 !!!cp ('t121');
4481 }
4482
4483 ## NOTE: There is a "as if in head" code clone.
4484 $script_start_tag->();
4485 pop @{$self->{open_elements}} # <head>
4486 if $self->{insertion_mode} == AFTER_HEAD_IM;
4487 next B;
4488 } elsif ($token->{tag_name} eq 'body' or
4489 $token->{tag_name} eq 'frameset') {
4490 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4491 !!!cp ('t122');
4492 ## As if </noscript>
4493 pop @{$self->{open_elements}};
4494 !!!parse-error (type => 'in noscript',
4495 text => $token->{tag_name}, token => $token);
4496
4497 ## Reprocess in the "in head" insertion mode...
4498 ## As if </head>
4499 pop @{$self->{open_elements}};
4500
4501 ## Reprocess in the "after head" insertion mode...
4502 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4503 !!!cp ('t124');
4504 pop @{$self->{open_elements}};
4505
4506 ## Reprocess in the "after head" insertion mode...
4507 } else {
4508 !!!cp ('t125');
4509 }
4510
4511 ## "after head" insertion mode
4512 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4513 if ($token->{tag_name} eq 'body') {
4514 !!!cp ('t126');
4515 $self->{insertion_mode} = IN_BODY_IM;
4516 } elsif ($token->{tag_name} eq 'frameset') {
4517 !!!cp ('t127');
4518 $self->{insertion_mode} = IN_FRAMESET_IM;
4519 } else {
4520 die "$0: tag name: $self->{tag_name}";
4521 }
4522 !!!nack ('t127.1');
4523 !!!next-token;
4524 next B;
4525 } else {
4526 !!!cp ('t128');
4527 #
4528 }
4529
4530 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4531 !!!cp ('t129');
4532 ## As if </noscript>
4533 pop @{$self->{open_elements}};
4534 !!!parse-error (type => 'in noscript:/',
4535 text => $token->{tag_name}, token => $token);
4536
4537 ## Reprocess in the "in head" insertion mode...
4538 ## As if </head>
4539 pop @{$self->{open_elements}};
4540
4541 ## Reprocess in the "after head" insertion mode...
4542 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4543 !!!cp ('t130');
4544 ## As if </head>
4545 pop @{$self->{open_elements}};
4546
4547 ## Reprocess in the "after head" insertion mode...
4548 } else {
4549 !!!cp ('t131');
4550 }
4551
4552 ## "after head" insertion mode
4553 ## As if <body>
4554 !!!insert-element ('body',, $token);
4555 $self->{insertion_mode} = IN_BODY_IM;
4556 ## reprocess
4557 !!!ack-later;
4558 next B;
4559 } elsif ($token->{type} == END_TAG_TOKEN) {
4560 if ($token->{tag_name} eq 'head') {
4561 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4562 !!!cp ('t132');
4563 ## As if <head>
4564 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4565 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4566 push @{$self->{open_elements}},
4567 [$self->{head_element}, $el_category->{head}];
4568
4569 ## Reprocess in the "in head" insertion mode...
4570 pop @{$self->{open_elements}};
4571 $self->{insertion_mode} = AFTER_HEAD_IM;
4572 !!!next-token;
4573 next B;
4574 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4575 !!!cp ('t133');
4576 ## As if </noscript>
4577 pop @{$self->{open_elements}};
4578 !!!parse-error (type => 'in noscript:/',
4579 text => 'head', token => $token);
4580
4581 ## Reprocess in the "in head" insertion mode...
4582 pop @{$self->{open_elements}};
4583 $self->{insertion_mode} = AFTER_HEAD_IM;
4584 !!!next-token;
4585 next B;
4586 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4587 !!!cp ('t134');
4588 pop @{$self->{open_elements}};
4589 $self->{insertion_mode} = AFTER_HEAD_IM;
4590 !!!next-token;
4591 next B;
4592 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4593 !!!cp ('t134.1');
4594 !!!parse-error (type => 'unmatched end tag', text => 'head',
4595 token => $token);
4596 ## Ignore the token
4597 !!!next-token;
4598 next B;
4599 } else {
4600 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4601 }
4602 } elsif ($token->{tag_name} eq 'noscript') {
4603 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4604 !!!cp ('t136');
4605 pop @{$self->{open_elements}};
4606 $self->{insertion_mode} = IN_HEAD_IM;
4607 !!!next-token;
4608 next B;
4609 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM or
4610 $self->{insertion_mode} == AFTER_HEAD_IM) {
4611 !!!cp ('t137');
4612 !!!parse-error (type => 'unmatched end tag',
4613 text => 'noscript', token => $token);
4614 ## Ignore the token ## ISSUE: An issue in the spec.
4615 !!!next-token;
4616 next B;
4617 } else {
4618 !!!cp ('t138');
4619 #
4620 }
4621 } elsif ({
4622 body => 1, html => 1,
4623 }->{$token->{tag_name}}) {
4624 if ($self->{insertion_mode} == BEFORE_HEAD_IM or
4625 $self->{insertion_mode} == IN_HEAD_IM or
4626 $self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4627 !!!cp ('t140');
4628 !!!parse-error (type => 'unmatched end tag',
4629 text => $token->{tag_name}, token => $token);
4630 ## Ignore the token
4631 !!!next-token;
4632 next B;
4633 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4634 !!!cp ('t140.1');
4635 !!!parse-error (type => 'unmatched end tag',
4636 text => $token->{tag_name}, token => $token);
4637 ## Ignore the token
4638 !!!next-token;
4639 next B;
4640 } else {
4641 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4642 }
4643 } elsif ($token->{tag_name} eq 'p') {
4644 !!!cp ('t142');
4645 !!!parse-error (type => 'unmatched end tag',
4646 text => $token->{tag_name}, token => $token);
4647 ## Ignore the token
4648 !!!next-token;
4649 next B;
4650 } elsif ($token->{tag_name} eq 'br') {
4651 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4652 !!!cp ('t142.2');
4653 ## (before head) as if <head>, (in head) as if </head>
4654 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4655 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4656 $self->{insertion_mode} = AFTER_HEAD_IM;
4657
4658 ## Reprocess in the "after head" insertion mode...
4659 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4660 !!!cp ('t143.2');
4661 ## As if </head>
4662 pop @{$self->{open_elements}};
4663 $self->{insertion_mode} = AFTER_HEAD_IM;
4664
4665 ## Reprocess in the "after head" insertion mode...
4666 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4667 !!!cp ('t143.3');
4668 ## ISSUE: Two parse errors for <head><noscript></br>
4669 !!!parse-error (type => 'unmatched end tag',
4670 text => 'br', token => $token);
4671 ## As if </noscript>
4672 pop @{$self->{open_elements}};
4673 $self->{insertion_mode} = IN_HEAD_IM;
4674
4675 ## Reprocess in the "in head" insertion mode...
4676 ## As if </head>
4677 pop @{$self->{open_elements}};
4678 $self->{insertion_mode} = AFTER_HEAD_IM;
4679
4680 ## Reprocess in the "after head" insertion mode...
4681 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4682 !!!cp ('t143.4');
4683 #
4684 } else {
4685 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4686 }
4687
4688 ## ISSUE: does not agree with IE7 - it doesn't ignore </br>.
4689 !!!parse-error (type => 'unmatched end tag',
4690 text => 'br', token => $token);
4691 ## Ignore the token
4692 !!!next-token;
4693 next B;
4694 } else {
4695 !!!cp ('t145');
4696 !!!parse-error (type => 'unmatched end tag',
4697 text => $token->{tag_name}, token => $token);
4698 ## Ignore the token
4699 !!!next-token;
4700 next B;
4701 }
4702
4703 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4704 !!!cp ('t146');
4705 ## As if </noscript>
4706 pop @{$self->{open_elements}};
4707 !!!parse-error (type => 'in noscript:/',
4708 text => $token->{tag_name}, token => $token);
4709
4710 ## Reprocess in the "in head" insertion mode...
4711 ## As if </head>
4712 pop @{$self->{open_elements}};
4713
4714 ## Reprocess in the "after head" insertion mode...
4715 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4716 !!!cp ('t147');
4717 ## As if </head>
4718 pop @{$self->{open_elements}};
4719
4720 ## Reprocess in the "after head" insertion mode...
4721 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4722 ## ISSUE: This case cannot be reached?
4723 !!!cp ('t148');
4724 !!!parse-error (type => 'unmatched end tag',
4725 text => $token->{tag_name}, token => $token);
4726 ## Ignore the token ## ISSUE: An issue in the spec.
4727 !!!next-token;
4728 next B;
4729 } else {
4730 !!!cp ('t149');
4731 }
4732
4733 ## "after head" insertion mode
4734 ## As if <body>
4735 !!!insert-element ('body',, $token);
4736 $self->{insertion_mode} = IN_BODY_IM;
4737 ## reprocess
4738 next B;
4739 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4740 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4741 !!!cp ('t149.1');
4742
4743 ## NOTE: As if <head>
4744 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4745 $self->{open_elements}->[-1]->[0]->append_child
4746 ($self->{head_element});
4747 #push @{$self->{open_elements}},
4748 # [$self->{head_element}, $el_category->{head}];
4749 #$self->{insertion_mode} = IN_HEAD_IM;
4750 ## NOTE: Reprocess.
4751
4752 ## NOTE: As if </head>
4753 #pop @{$self->{open_elements}};
4754 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4755 ## NOTE: Reprocess.
4756
4757 #
4758 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4759 !!!cp ('t149.2');
4760
4761 ## NOTE: As if </head>
4762 pop @{$self->{open_elements}};
4763 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4764 ## NOTE: Reprocess.
4765
4766 #
4767 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4768 !!!cp ('t149.3');
4769
4770 !!!parse-error (type => 'in noscript:#eof', token => $token);
4771
4772 ## As if </noscript>
4773 pop @{$self->{open_elements}};
4774 #$self->{insertion_mode} = IN_HEAD_IM;
4775 ## NOTE: Reprocess.
4776
4777 ## NOTE: As if </head>
4778 pop @{$self->{open_elements}};
4779 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4780 ## NOTE: Reprocess.
4781
4782 #
4783 } else {
4784 !!!cp ('t149.4');
4785 #
4786 }
4787
4788 ## NOTE: As if <body>
4789 !!!insert-element ('body',, $token);
4790 $self->{insertion_mode} = IN_BODY_IM;
4791 ## NOTE: Reprocess.
4792 next B;
4793 } else {
4794 die "$0: $token->{type}: Unknown token type";
4795 }
4796
4797 ## ISSUE: An issue in the spec.
4798 } elsif ($self->{insertion_mode} & BODY_IMS) {
4799 if ($token->{type} == CHARACTER_TOKEN) {
4800 !!!cp ('t150');
4801 ## NOTE: There is a code clone of "character in body".
4802 $reconstruct_active_formatting_elements->($insert_to_current);
4803
4804 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4805
4806 !!!next-token;
4807 next B;
4808 } elsif ($token->{type} == START_TAG_TOKEN) {
4809 if ({
4810 caption => 1, col => 1, colgroup => 1, tbody => 1,
4811 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
4812 }->{$token->{tag_name}}) {
4813 if ($self->{insertion_mode} == IN_CELL_IM) {
4814 ## have an element in table scope
4815 for (reverse 0..$#{$self->{open_elements}}) {
4816 my $node = $self->{open_elements}->[$_];
4817 if ($node->[1] & TABLE_CELL_EL) {
4818 !!!cp ('t151');
4819
4820 ## Close the cell
4821 !!!back-token; # <x>
4822 $token = {type => END_TAG_TOKEN,
4823 tag_name => $node->[0]->manakai_local_name,
4824 line => $token->{line},
4825 column => $token->{column}};
4826 next B;
4827 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4828 !!!cp ('t152');
4829 ## ISSUE: This case can never be reached, maybe.
4830 last;
4831 }
4832 }
4833
4834 !!!cp ('t153');
4835 !!!parse-error (type => 'start tag not allowed',
4836 text => $token->{tag_name}, token => $token);
4837 ## Ignore the token
4838 !!!nack ('t153.1');
4839 !!!next-token;
4840 next B;
4841 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4842 !!!parse-error (type => 'not closed', text => 'caption',
4843 token => $token);
4844
4845 ## NOTE: As if </caption>.
4846 ## have a caption element in table scope
4847 my $i;
4848 INSCOPE: {
4849 for (reverse 0..$#{$self->{open_elements}}) {
4850 my $node = $self->{open_elements}->[$_];
4851 if ($node->[1] & CAPTION_EL) {
4852 !!!cp ('t155');
4853 $i = $_;
4854 last INSCOPE;
4855 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4856 !!!cp ('t156');
4857 last;
4858 }
4859 }
4860
4861 !!!cp ('t157');
4862 !!!parse-error (type => 'start tag not allowed',
4863 text => $token->{tag_name}, token => $token);
4864 ## Ignore the token
4865 !!!nack ('t157.1');
4866 !!!next-token;
4867 next B;
4868 } # INSCOPE
4869
4870 ## generate implied end tags
4871 while ($self->{open_elements}->[-1]->[1]
4872 & END_TAG_OPTIONAL_EL) {
4873 !!!cp ('t158');
4874 pop @{$self->{open_elements}};
4875 }
4876
4877 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4878 !!!cp ('t159');
4879 !!!parse-error (type => 'not closed',
4880 text => $self->{open_elements}->[-1]->[0]
4881 ->manakai_local_name,
4882 token => $token);
4883 } else {
4884 !!!cp ('t160');
4885 }
4886
4887 splice @{$self->{open_elements}}, $i;
4888
4889 $clear_up_to_marker->();
4890
4891 $self->{insertion_mode} = IN_TABLE_IM;
4892
4893 ## reprocess
4894 !!!ack-later;
4895 next B;
4896 } else {
4897 !!!cp ('t161');
4898 #
4899 }
4900 } else {
4901 !!!cp ('t162');
4902 #
4903 }
4904 } elsif ($token->{type} == END_TAG_TOKEN) {
4905 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
4906 if ($self->{insertion_mode} == IN_CELL_IM) {
4907 ## have an element in table scope
4908 my $i;
4909 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4910 my $node = $self->{open_elements}->[$_];
4911 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
4912 !!!cp ('t163');
4913 $i = $_;
4914 last INSCOPE;
4915 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4916 !!!cp ('t164');
4917 last INSCOPE;
4918 }
4919 } # INSCOPE
4920 unless (defined $i) {
4921 !!!cp ('t165');
4922 !!!parse-error (type => 'unmatched end tag',
4923 text => $token->{tag_name},
4924 token => $token);
4925 ## Ignore the token
4926 !!!next-token;
4927 next B;
4928 }
4929
4930 ## generate implied end tags
4931 while ($self->{open_elements}->[-1]->[1]
4932 & END_TAG_OPTIONAL_EL) {
4933 !!!cp ('t166');
4934 pop @{$self->{open_elements}};
4935 }
4936
4937 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
4938 ne $token->{tag_name}) {
4939 !!!cp ('t167');
4940 !!!parse-error (type => 'not closed',
4941 text => $self->{open_elements}->[-1]->[0]
4942 ->manakai_local_name,
4943 token => $token);
4944 } else {
4945 !!!cp ('t168');
4946 }
4947
4948 splice @{$self->{open_elements}}, $i;
4949
4950 $clear_up_to_marker->();
4951
4952 $self->{insertion_mode} = IN_ROW_IM;
4953
4954 !!!next-token;
4955 next B;
4956 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4957 !!!cp ('t169');
4958 !!!parse-error (type => 'unmatched end tag',
4959 text => $token->{tag_name}, token => $token);
4960 ## Ignore the token
4961 !!!next-token;
4962 next B;
4963 } else {
4964 !!!cp ('t170');
4965 #
4966 }
4967 } elsif ($token->{tag_name} eq 'caption') {
4968 if ($self->{insertion_mode} == IN_CAPTION_IM) {
4969 ## have a caption element in table scope
4970 my $i;
4971 INSCOPE: {
4972 for (reverse 0..$#{$self->{open_elements}}) {
4973 my $node = $self->{open_elements}->[$_];
4974 if ($node->[1] & CAPTION_EL) {
4975 !!!cp ('t171');
4976 $i = $_;
4977 last INSCOPE;
4978 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4979 !!!cp ('t172');
4980 last;
4981 }
4982 }
4983
4984 !!!cp ('t173');
4985 !!!parse-error (type => 'unmatched end tag',
4986 text => $token->{tag_name}, token => $token);
4987 ## Ignore the token
4988 !!!next-token;
4989 next B;
4990 } # INSCOPE
4991
4992 ## generate implied end tags
4993 while ($self->{open_elements}->[-1]->[1]
4994 & END_TAG_OPTIONAL_EL) {
4995 !!!cp ('t174');
4996 pop @{$self->{open_elements}};
4997 }
4998
4999 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5000 !!!cp ('t175');
5001 !!!parse-error (type => 'not closed',
5002 text => $self->{open_elements}->[-1]->[0]
5003 ->manakai_local_name,
5004 token => $token);
5005 } else {
5006 !!!cp ('t176');
5007 }
5008
5009 splice @{$self->{open_elements}}, $i;
5010
5011 $clear_up_to_marker->();
5012
5013 $self->{insertion_mode} = IN_TABLE_IM;
5014
5015 !!!next-token;
5016 next B;
5017 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
5018 !!!cp ('t177');
5019 !!!parse-error (type => 'unmatched end tag',
5020 text => $token->{tag_name}, token => $token);
5021 ## Ignore the token
5022 !!!next-token;
5023 next B;
5024 } else {
5025 !!!cp ('t178');
5026 #
5027 }
5028 } elsif ({
5029 table => 1, tbody => 1, tfoot => 1,
5030 thead => 1, tr => 1,
5031 }->{$token->{tag_name}} and
5032 $self->{insertion_mode} == IN_CELL_IM) {
5033 ## have an element in table scope
5034 my $i;
5035 my $tn;
5036 INSCOPE: {
5037 for (reverse 0..$#{$self->{open_elements}}) {
5038 my $node = $self->{open_elements}->[$_];
5039 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5040 !!!cp ('t179');
5041 $i = $_;
5042
5043 ## Close the cell
5044 !!!back-token; # </x>
5045 $token = {type => END_TAG_TOKEN, tag_name => $tn,
5046 line => $token->{line},
5047 column => $token->{column}};
5048 next B;
5049 } elsif ($node->[1] & TABLE_CELL_EL) {
5050 !!!cp ('t180');
5051 $tn = $node->[0]->manakai_local_name;
5052 ## NOTE: There is exactly one |td| or |th| element
5053 ## in scope in the stack of open elements by definition.
5054 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5055 ## ISSUE: Can this be reached?
5056 !!!cp ('t181');
5057 last;
5058 }
5059 }
5060
5061 !!!cp ('t182');
5062 !!!parse-error (type => 'unmatched end tag',
5063 text => $token->{tag_name}, token => $token);
5064 ## Ignore the token
5065 !!!next-token;
5066 next B;
5067 } # INSCOPE
5068 } elsif ($token->{tag_name} eq 'table' and
5069 $self->{insertion_mode} == IN_CAPTION_IM) {
5070 !!!parse-error (type => 'not closed', text => 'caption',
5071 token => $token);
5072
5073 ## As if </caption>
5074 ## have a table element in table scope
5075 my $i;
5076 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5077 my $node = $self->{open_elements}->[$_];
5078 if ($node->[1] & CAPTION_EL) {
5079 !!!cp ('t184');
5080 $i = $_;
5081 last INSCOPE;
5082 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5083 !!!cp ('t185');
5084 last INSCOPE;
5085 }
5086 } # INSCOPE
5087 unless (defined $i) {
5088 !!!cp ('t186');
5089 !!!parse-error (type => 'unmatched end tag',
5090 text => 'caption', token => $token);
5091 ## Ignore the token
5092 !!!next-token;
5093 next B;
5094 }
5095
5096 ## generate implied end tags
5097 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5098 !!!cp ('t187');
5099 pop @{$self->{open_elements}};
5100 }
5101
5102 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5103 !!!cp ('t188');
5104 !!!parse-error (type => 'not closed',
5105 text => $self->{open_elements}->[-1]->[0]
5106 ->manakai_local_name,
5107 token => $token);
5108 } else {
5109 !!!cp ('t189');
5110 }
5111
5112 splice @{$self->{open_elements}}, $i;
5113
5114 $clear_up_to_marker->();
5115
5116 $self->{insertion_mode} = IN_TABLE_IM;
5117
5118 ## reprocess
5119 next B;
5120 } elsif ({
5121 body => 1, col => 1, colgroup => 1, html => 1,
5122 }->{$token->{tag_name}}) {
5123 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
5124 !!!cp ('t190');
5125 !!!parse-error (type => 'unmatched end tag',
5126 text => $token->{tag_name}, token => $token);
5127 ## Ignore the token
5128 !!!next-token;
5129 next B;
5130 } else {
5131 !!!cp ('t191');
5132 #
5133 }
5134 } elsif ({
5135 tbody => 1, tfoot => 1,
5136 thead => 1, tr => 1,
5137 }->{$token->{tag_name}} and
5138 $self->{insertion_mode} == IN_CAPTION_IM) {
5139 !!!cp ('t192');
5140 !!!parse-error (type => 'unmatched end tag',
5141 text => $token->{tag_name}, token => $token);
5142 ## Ignore the token
5143 !!!next-token;
5144 next B;
5145 } else {
5146 !!!cp ('t193');
5147 #
5148 }
5149 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5150 for my $entry (@{$self->{open_elements}}) {
5151 unless ($entry->[1] & ALL_END_TAG_OPTIONAL_EL) {
5152 !!!cp ('t75');
5153 !!!parse-error (type => 'in body:#eof', token => $token);
5154 last;
5155 }
5156 }
5157
5158 ## Stop parsing.
5159 last B;
5160 } else {
5161 die "$0: $token->{type}: Unknown token type";
5162 }
5163
5164 $insert = $insert_to_current;
5165 #
5166 } elsif ($self->{insertion_mode} & TABLE_IMS) {
5167 if ($token->{type} == CHARACTER_TOKEN) {
5168 if (not $open_tables->[-1]->[1] and # tainted
5169 $token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5170 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5171
5172 unless (length $token->{data}) {
5173 !!!cp ('t194');
5174 !!!next-token;
5175 next B;
5176 } else {
5177 !!!cp ('t195');
5178 }
5179 }
5180
5181 !!!parse-error (type => 'in table:#text', token => $token);
5182
5183 ## As if in body, but insert into foster parent element
5184 ## ISSUE: The spec says "whenever a node would be inserted
5185 ## into the current node", but character data might not
5186 ## result in a new Text node.
5187 $reconstruct_active_formatting_elements->($insert_to_foster);
5188
5189 if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
5190 # MUST
5191 my $foster_parent_element;
5192 my $next_sibling;
5193 my $prev_sibling;
5194 OE: for (reverse 0..$#{$self->{open_elements}}) {
5195 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
5196 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
5197 if (defined $parent and $parent->node_type == 1) {
5198 !!!cp ('t196');
5199 $foster_parent_element = $parent;
5200 $next_sibling = $self->{open_elements}->[$_]->[0];
5201 $prev_sibling = $next_sibling->previous_sibling;
5202 } else {
5203 !!!cp ('t197');
5204 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
5205 $prev_sibling = $foster_parent_element->last_child;
5206 }
5207 last OE;
5208 }
5209 } # OE
5210 $foster_parent_element = $self->{open_elements}->[0]->[0] and
5211 $prev_sibling = $foster_parent_element->last_child
5212 unless defined $foster_parent_element;
5213 if (defined $prev_sibling and
5214 $prev_sibling->node_type == 3) {
5215 !!!cp ('t198');
5216 $prev_sibling->manakai_append_text ($token->{data});
5217 } else {
5218 !!!cp ('t199');
5219 $foster_parent_element->insert_before
5220 ($self->{document}->create_text_node ($token->{data}),
5221 $next_sibling);
5222 }
5223 $open_tables->[-1]->[1] = 1; # tainted
5224 } else {
5225 !!!cp ('t200');
5226 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5227 }
5228
5229 !!!next-token;
5230 next B;
5231 } elsif ($token->{type} == START_TAG_TOKEN) {
5232 if ({
5233 tr => ($self->{insertion_mode} != IN_ROW_IM),
5234 th => 1, td => 1,
5235 }->{$token->{tag_name}}) {
5236 if ($self->{insertion_mode} == IN_TABLE_IM) {
5237 ## Clear back to table context
5238 while (not ($self->{open_elements}->[-1]->[1]
5239 & TABLE_SCOPING_EL)) {
5240 !!!cp ('t201');
5241 pop @{$self->{open_elements}};
5242 }
5243
5244 !!!insert-element ('tbody',, $token);
5245 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5246 ## reprocess in the "in table body" insertion mode...
5247 }
5248
5249 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5250 unless ($token->{tag_name} eq 'tr') {
5251 !!!cp ('t202');
5252 !!!parse-error (type => 'missing start tag:tr', token => $token);
5253 }
5254
5255 ## Clear back to table body context
5256 while (not ($self->{open_elements}->[-1]->[1]
5257 & TABLE_ROWS_SCOPING_EL)) {
5258 !!!cp ('t203');
5259 ## ISSUE: Can this case be reached?
5260 pop @{$self->{open_elements}};
5261 }
5262
5263 $self->{insertion_mode} = IN_ROW_IM;
5264 if ($token->{tag_name} eq 'tr') {
5265 !!!cp ('t204');
5266 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5267 !!!nack ('t204');
5268 !!!next-token;
5269 next B;
5270 } else {
5271 !!!cp ('t205');
5272 !!!insert-element ('tr',, $token);
5273 ## reprocess in the "in row" insertion mode
5274 }
5275 } else {
5276 !!!cp ('t206');
5277 }
5278
5279 ## Clear back to table row context
5280 while (not ($self->{open_elements}->[-1]->[1]
5281 & TABLE_ROW_SCOPING_EL)) {
5282 !!!cp ('t207');
5283 pop @{$self->{open_elements}};
5284 }
5285
5286 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5287 $self->{insertion_mode} = IN_CELL_IM;
5288
5289 push @$active_formatting_elements, ['#marker', ''];
5290
5291 !!!nack ('t207.1');
5292 !!!next-token;
5293 next B;
5294 } elsif ({
5295 caption => 1, col => 1, colgroup => 1,
5296 tbody => 1, tfoot => 1, thead => 1,
5297 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5298 }->{$token->{tag_name}}) {
5299 if ($self->{insertion_mode} == IN_ROW_IM) {
5300 ## As if </tr>
5301 ## have an element in table scope
5302 my $i;
5303 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5304 my $node = $self->{open_elements}->[$_];
5305 if ($node->[1] & TABLE_ROW_EL) {
5306 !!!cp ('t208');
5307 $i = $_;
5308 last INSCOPE;
5309 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5310 !!!cp ('t209');
5311 last INSCOPE;
5312 }
5313 } # INSCOPE
5314 unless (defined $i) {
5315 !!!cp ('t210');
5316 ## TODO: This type is wrong (the token is a start tag here).
5317 !!!parse-error (type => 'unmatched end tag',
5318 text => $token->{tag_name}, token => $token);
5319 ## Ignore the token
5320 !!!nack ('t210.1');
5321 !!!next-token;
5322 next B;
5323 }
5324
5325 ## Clear back to table row context
5326 while (not ($self->{open_elements}->[-1]->[1]
5327 & TABLE_ROW_SCOPING_EL)) {
5328 !!!cp ('t211');
5329 ## ISSUE: Can this case be reached?
5330 pop @{$self->{open_elements}};
5331 }
5332
5333 pop @{$self->{open_elements}}; # tr
5334 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5335 if ($token->{tag_name} eq 'tr') {
5336 !!!cp ('t212');
5337 ## reprocess
5338 !!!ack-later;
5339 next B;
5340 } else {
5341 !!!cp ('t213');
5342 ## reprocess in the "in table body" insertion mode...
5343 }
5344 }
5345
5346 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5347 ## have an element in table scope
5348 my $i;
5349 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5350 my $node = $self->{open_elements}->[$_];
5351 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5352 !!!cp ('t214');
5353 $i = $_;
5354 last INSCOPE;
5355 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5356 !!!cp ('t215');
5357 last INSCOPE;
5358 }
5359 } # INSCOPE
5360 unless (defined $i) {
5361 !!!cp ('t216');
5362 ## TODO: This error type is wrong.
5363 !!!parse-error (type => 'unmatched end tag',
5364 text => $token->{tag_name}, token => $token);
5365 ## Ignore the token
5366 !!!nack ('t216.1');
5367 !!!next-token;
5368 next B;
5369 }
5370
5371 ## Clear back to table body context
5372 while (not ($self->{open_elements}->[-1]->[1]
5373 & TABLE_ROWS_SCOPING_EL)) {
5374 !!!cp ('t217');
5375 ## ISSUE: Can this state be reached?
5376 pop @{$self->{open_elements}};
5377 }
5378
5379 ## As if <{current node}>
5380 ## have an element in table scope
5381 ## true by definition
5382
5383 ## Clear back to table body context
5384 ## nop by definition
5385
5386 pop @{$self->{open_elements}};
5387 $self->{insertion_mode} = IN_TABLE_IM;
5388 ## reprocess in "in table" insertion mode...
5389 } else {
5390 !!!cp ('t218');
5391 }
5392
5393 if ($token->{tag_name} eq 'col') {
5394 ## Clear back to table context
5395 while (not ($self->{open_elements}->[-1]->[1]
5396 & TABLE_SCOPING_EL)) {
5397 !!!cp ('t219');
5398 ## ISSUE: Can this state be reached?
5399 pop @{$self->{open_elements}};
5400 }
5401
5402 !!!insert-element ('colgroup',, $token);
5403 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
5404 ## reprocess
5405 !!!ack-later;
5406 next B;
5407 } elsif ({
5408 caption => 1,
5409 colgroup => 1,
5410 tbody => 1, tfoot => 1, thead => 1,
5411 }->{$token->{tag_name}}) {
5412 ## Clear back to table context
5413 while (not ($self->{open_elements}->[-1]->[1]
5414 & TABLE_SCOPING_EL)) {
5415 !!!cp ('t220');
5416 ## ISSUE: Can this state be reached?
5417 pop @{$self->{open_elements}};
5418 }
5419
5420 push @$active_formatting_elements, ['#marker', '']
5421 if $token->{tag_name} eq 'caption';
5422
5423 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5424 $self->{insertion_mode} = {
5425 caption => IN_CAPTION_IM,
5426 colgroup => IN_COLUMN_GROUP_IM,
5427 tbody => IN_TABLE_BODY_IM,
5428 tfoot => IN_TABLE_BODY_IM,
5429 thead => IN_TABLE_BODY_IM,
5430 }->{$token->{tag_name}};
5431 !!!next-token;
5432 !!!nack ('t220.1');
5433 next B;
5434 } else {
5435 die "$0: in table: <>: $token->{tag_name}";
5436 }
5437 } elsif ($token->{tag_name} eq 'table') {
5438 !!!parse-error (type => 'not closed',
5439 text => $self->{open_elements}->[-1]->[0]
5440 ->manakai_local_name,
5441 token => $token);
5442
5443 ## As if </table>
5444 ## have a table element in table scope
5445 my $i;
5446 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5447 my $node = $self->{open_elements}->[$_];
5448 if ($node->[1] & TABLE_EL) {
5449 !!!cp ('t221');
5450 $i = $_;
5451 last INSCOPE;
5452 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5453 !!!cp ('t222');
5454 last INSCOPE;
5455 }
5456 } # INSCOPE
5457 unless (defined $i) {
5458 !!!cp ('t223');
5459 ## TODO: The following is wrong, maybe.
5460 !!!parse-error (type => 'unmatched end tag', text => 'table',
5461 token => $token);
5462 ## Ignore tokens </table><table>
5463 !!!nack ('t223.1');
5464 !!!next-token;
5465 next B;
5466 }
5467
5468 ## TODO: The following steps have been removed from the latest spec.
5469 ## generate implied end tags
5470 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5471 !!!cp ('t224');
5472 pop @{$self->{open_elements}};
5473 }
5474
5475 unless ($self->{open_elements}->[-1]->[1] & TABLE_EL) {
5476 !!!cp ('t225');
5477 ## NOTE: |<table><tr><table>|
5478 !!!parse-error (type => 'not closed',
5479 text => $self->{open_elements}->[-1]->[0]
5480 ->manakai_local_name,
5481 token => $token);
5482 } else {
5483 !!!cp ('t226');
5484 }
5485
5486 splice @{$self->{open_elements}}, $i;
5487 pop @{$open_tables};
5488
5489 $self->_reset_insertion_mode;
5490
5491 ## reprocess
5492 !!!ack-later;
5493 next B;
5494 } elsif ($token->{tag_name} eq 'style') {
5495 if (not $open_tables->[-1]->[1]) { # tainted
5496 !!!cp ('t227.8');
5497 ## NOTE: This is a "as if in head" code clone.
5498 $parse_rcdata->(CDATA_CONTENT_MODEL);
5499 next B;
5500 } else {
5501 !!!cp ('t227.7');
5502 #
5503 }
5504 } elsif ($token->{tag_name} eq 'script') {
5505 if (not $open_tables->[-1]->[1]) { # tainted
5506 !!!cp ('t227.6');
5507 ## NOTE: This is a "as if in head" code clone.
5508 $script_start_tag->();
5509 next B;
5510 } else {
5511 !!!cp ('t227.5');
5512 #
5513 }
5514 } elsif ($token->{tag_name} eq 'input') {
5515 if (not $open_tables->[-1]->[1]) { # tainted
5516 if ($token->{attributes}->{type}) { ## TODO: case
5517 my $type = lc $token->{attributes}->{type}->{value};
5518 if ($type eq 'hidden') {
5519 !!!cp ('t227.3');
5520 !!!parse-error (type => 'in table',
5521 text => $token->{tag_name}, token => $token);
5522
5523 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5524
5525 ## TODO: form element pointer
5526
5527 pop @{$self->{open_elements}};
5528
5529 !!!next-token;
5530 !!!ack ('t227.2.1');
5531 next B;
5532 } else {
5533 !!!cp ('t227.2');
5534 #
5535 }
5536 } else {
5537 !!!cp ('t227.1');
5538 #
5539 }
5540 } else {
5541 !!!cp ('t227.4');
5542 #
5543 }
5544 } else {
5545 !!!cp ('t227');
5546 #
5547 }
5548
5549 !!!parse-error (type => 'in table', text => $token->{tag_name},
5550 token => $token);
5551
5552 $insert = $insert_to_foster;
5553 #
5554 } elsif ($token->{type} == END_TAG_TOKEN) {
5555 if ($token->{tag_name} eq 'tr' and
5556 $self->{insertion_mode} == IN_ROW_IM) {
5557 ## have an element in table scope
5558 my $i;
5559 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5560 my $node = $self->{open_elements}->[$_];
5561 if ($node->[1] & TABLE_ROW_EL) {
5562 !!!cp ('t228');
5563 $i = $_;
5564 last INSCOPE;
5565 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5566 !!!cp ('t229');
5567 last INSCOPE;
5568 }
5569 } # INSCOPE
5570 unless (defined $i) {
5571 !!!cp ('t230');
5572 !!!parse-error (type => 'unmatched end tag',
5573 text => $token->{tag_name}, token => $token);
5574 ## Ignore the token
5575 !!!nack ('t230.1');
5576 !!!next-token;
5577 next B;
5578 } else {
5579 !!!cp ('t232');
5580 }
5581
5582 ## Clear back to table row context
5583 while (not ($self->{open_elements}->[-1]->[1]
5584 & TABLE_ROW_SCOPING_EL)) {
5585 !!!cp ('t231');
5586 ## ISSUE: Can this state be reached?
5587 pop @{$self->{open_elements}};
5588 }
5589
5590 pop @{$self->{open_elements}}; # tr
5591 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5592 !!!next-token;
5593 !!!nack ('t231.1');
5594 next B;
5595 } elsif ($token->{tag_name} eq 'table') {
5596 if ($self->{insertion_mode} == IN_ROW_IM) {
5597 ## As if </tr>
5598 ## have an element in table scope
5599 my $i;
5600 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5601 my $node = $self->{open_elements}->[$_];
5602 if ($node->[1] & TABLE_ROW_EL) {
5603 !!!cp ('t233');
5604 $i = $_;
5605 last INSCOPE;
5606 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5607 !!!cp ('t234');
5608 last INSCOPE;
5609 }
5610 } # INSCOPE
5611 unless (defined $i) {
5612 !!!cp ('t235');
5613 ## TODO: The following is wrong.
5614 !!!parse-error (type => 'unmatched end tag',
5615 text => $token->{tag_name}, token => $token);
5616 ## Ignore the token
5617 !!!nack ('t236.1');
5618 !!!next-token;
5619 next B;
5620 }
5621
5622 ## Clear back to table row context
5623 while (not ($self->{open_elements}->[-1]->[1]
5624 & TABLE_ROW_SCOPING_EL)) {
5625 !!!cp ('t236');
5626 ## ISSUE: Can this state be reached?
5627 pop @{$self->{open_elements}};
5628 }
5629
5630 pop @{$self->{open_elements}}; # tr
5631 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5632 ## reprocess in the "in table body" insertion mode...
5633 }
5634
5635 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5636 ## have an element in table scope
5637 my $i;
5638 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5639 my $node = $self->{open_elements}->[$_];
5640 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5641 !!!cp ('t237');
5642 $i = $_;
5643 last INSCOPE;
5644 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5645 !!!cp ('t238');
5646 last INSCOPE;
5647 }
5648 } # INSCOPE
5649 unless (defined $i) {
5650 !!!cp ('t239');
5651 !!!parse-error (type => 'unmatched end tag',
5652 text => $token->{tag_name}, token => $token);
5653 ## Ignore the token
5654 !!!nack ('t239.1');
5655 !!!next-token;
5656 next B;
5657 }
5658
5659 ## Clear back to table body context
5660 while (not ($self->{open_elements}->[-1]->[1]
5661 & TABLE_ROWS_SCOPING_EL)) {
5662 !!!cp ('t240');
5663 pop @{$self->{open_elements}};
5664 }
5665
5666 ## As if <{current node}>
5667 ## have an element in table scope
5668 ## true by definition
5669
5670 ## Clear back to table body context
5671 ## nop by definition
5672
5673 pop @{$self->{open_elements}};
5674 $self->{insertion_mode} = IN_TABLE_IM;
5675 ## reprocess in the "in table" insertion mode...
5676 }
5677
5678 ## NOTE: </table> in the "in table" insertion mode.
5679 ## When you edit the code fragment below, please ensure that
5680 ## the code for <table> in the "in table" insertion mode
5681 ## is synced with it.
5682
5683 ## have a table element in table scope
5684 my $i;
5685 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5686 my $node = $self->{open_elements}->[$_];
5687 if ($node->[1] & TABLE_EL) {
5688 !!!cp ('t241');
5689 $i = $_;
5690 last INSCOPE;
5691 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5692 !!!cp ('t242');
5693 last INSCOPE;
5694 }
5695 } # INSCOPE
5696 unless (defined $i) {
5697 !!!cp ('t243');
5698 !!!parse-error (type => 'unmatched end tag',
5699 text => $token->{tag_name}, token => $token);
5700 ## Ignore the token
5701 !!!nack ('t243.1');
5702 !!!next-token;
5703 next B;
5704 }
5705
5706 splice @{$self->{open_elements}}, $i;
5707 pop @{$open_tables};
5708
5709 $self->_reset_insertion_mode;
5710
5711 !!!next-token;
5712 next B;
5713 } elsif ({
5714 tbody => 1, tfoot => 1, thead => 1,
5715 }->{$token->{tag_name}} and
5716 $self->{insertion_mode} & ROW_IMS) {
5717 if ($self->{insertion_mode} == IN_ROW_IM) {
5718 ## have an element in table scope
5719 my $i;
5720 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5721 my $node = $self->{open_elements}->[$_];
5722 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5723 !!!cp ('t247');
5724 $i = $_;
5725 last INSCOPE;
5726 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5727 !!!cp ('t248');
5728 last INSCOPE;
5729 }
5730 } # INSCOPE
5731 unless (defined $i) {
5732 !!!cp ('t249');
5733 !!!parse-error (type => 'unmatched end tag',
5734 text => $token->{tag_name}, token => $token);
5735 ## Ignore the token
5736 !!!nack ('t249.1');
5737 !!!next-token;
5738 next B;
5739 }
5740
5741 ## As if </tr>
5742 ## have an element in table scope
5743 undef $i; # reset and reuse the $i declared earlier in this block
5744 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5745 my $node = $self->{open_elements}->[$_];
5746 if ($node->[1] & TABLE_ROW_EL) {
5747 !!!cp ('t250');
5748 $i = $_;
5749 last INSCOPE;
5750 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5751 !!!cp ('t251');
5752 last INSCOPE;
5753 }
5754 } # INSCOPE
5755 unless (defined $i) {
5756 !!!cp ('t252');
5757 !!!parse-error (type => 'unmatched end tag',
5758 text => 'tr', token => $token);
5759 ## Ignore the token
5760 !!!nack ('t252.1');
5761 !!!next-token;
5762 next B;
5763 }
5764
5765 ## Clear back to table row context
5766 while (not ($self->{open_elements}->[-1]->[1]
5767 & TABLE_ROW_SCOPING_EL)) {
5768 !!!cp ('t253');
5769 ## ISSUE: Can this case be reached?
5770 pop @{$self->{open_elements}};
5771 }
5772
5773 pop @{$self->{open_elements}}; # tr
5774 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5775 ## reprocess in the "in table body" insertion mode...
5776 }
5777
5778 ## have an element in table scope
5779 my $i;
5780 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5781 my $node = $self->{open_elements}->[$_];
5782 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5783 !!!cp ('t254');
5784 $i = $_;
5785 last INSCOPE;
5786 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5787 !!!cp ('t255');
5788 last INSCOPE;
5789 }
5790 } # INSCOPE
5791 unless (defined $i) {
5792 !!!cp ('t256');
5793 !!!parse-error (type => 'unmatched end tag',
5794 text => $token->{tag_name}, token => $token);
5795 ## Ignore the token
5796 !!!nack ('t256.1');
5797 !!!next-token;
5798 next B;
5799 }
5800
5801 ## Clear back to table body context
5802 while (not ($self->{open_elements}->[-1]->[1]
5803 & TABLE_ROWS_SCOPING_EL)) {
5804 !!!cp ('t257');
5805 ## ISSUE: Can this case be reached?
5806 pop @{$self->{open_elements}};
5807 }
5808
5809 pop @{$self->{open_elements}};
5810 $self->{insertion_mode} = IN_TABLE_IM;
5811 !!!nack ('t257.1');
5812 !!!next-token;
5813 next B;
5814 } elsif ({
5815 body => 1, caption => 1, col => 1, colgroup => 1,
5816 html => 1, td => 1, th => 1,
5817 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5818 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
5819 }->{$token->{tag_name}}) {
5820 !!!cp ('t258');
5821 !!!parse-error (type => 'unmatched end tag',
5822 text => $token->{tag_name}, token => $token);
5823 ## Ignore the token
5824 !!!nack ('t258.1');
5825 !!!next-token;
5826 next B;
5827 } else {
5828 !!!cp ('t259');
5829 !!!parse-error (type => 'in table:/',
5830 text => $token->{tag_name}, token => $token);
5831
5832 $insert = $insert_to_foster;
5833 #
5834 }
5835 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5836 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
5837 @{$self->{open_elements}} == 1) { # redundant, maybe
5838 !!!parse-error (type => 'in body:#eof', token => $token);
5839 !!!cp ('t259.1');
5840 #
5841 } else {
5842 !!!cp ('t259.2');
5843 #
5844 }
5845
5846 ## Stop parsing
5847 last B;
5848 } else {
5849 die "$0: $token->{type}: Unknown token type";
5850 }
5851 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
5852 if ($token->{type} == CHARACTER_TOKEN) {
5853 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5854 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5855 unless (length $token->{data}) {
5856 !!!cp ('t260');
5857 !!!next-token;
5858 next B;
5859 }
5860 }
5861
5862 !!!cp ('t261');
5863 #
5864 } elsif ($token->{type} == START_TAG_TOKEN) {
5865 if ($token->{tag_name} eq 'col') {
5866 !!!cp ('t262');
5867 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5868 pop @{$self->{open_elements}};
5869 !!!ack ('t262.1');
5870 !!!next-token;
5871 next B;
5872 } else {
5873 !!!cp ('t263');
5874 #
5875 }
5876 } elsif ($token->{type} == END_TAG_TOKEN) {
5877 if ($token->{tag_name} eq 'colgroup') {
5878 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
5879 !!!cp ('t264');
5880 !!!parse-error (type => 'unmatched end tag',
5881 text => 'colgroup', token => $token);
5882 ## Ignore the token
5883 !!!next-token;
5884 next B;
5885 } else {
5886 !!!cp ('t265');
5887 pop @{$self->{open_elements}}; # colgroup
5888 $self->{insertion_mode} = IN_TABLE_IM;
5889 !!!next-token;
5890 next B;
5891 }
5892 } elsif ($token->{tag_name} eq 'col') {
5893 !!!cp ('t266');
5894 !!!parse-error (type => 'unmatched end tag',
5895 text => 'col', token => $token);
5896 ## Ignore the token
5897 !!!next-token;
5898 next B;
5899 } else {
5900 !!!cp ('t267');
5901 #
5902 }
5903 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5904 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
5905 @{$self->{open_elements}} == 1) { # redundant, maybe
5906 !!!cp ('t270.2');
5907 ## Stop parsing.
5908 last B;
5909 } else {
5910 ## NOTE: As if </colgroup>.
5911 !!!cp ('t270.1');
5912 pop @{$self->{open_elements}}; # colgroup
5913 $self->{insertion_mode} = IN_TABLE_IM;
5914 ## Reprocess.
5915 next B;
5916 }
5917 } else {
5918 die "$0: $token->{type}: Unknown token type";
5919 }
5920
5921 ## As if </colgroup>
5922 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
5923 !!!cp ('t269');
5924 ## TODO: Wrong error type?
5925 !!!parse-error (type => 'unmatched end tag',
5926 text => 'colgroup', token => $token);
5927 ## Ignore the token
5928 !!!nack ('t269.1');
5929 !!!next-token;
5930 next B;
5931 } else {
5932 !!!cp ('t270');
5933 pop @{$self->{open_elements}}; # colgroup
5934 $self->{insertion_mode} = IN_TABLE_IM;
5935 !!!ack-later;
5936 ## reprocess
5937 next B;
5938 }
5939 } elsif ($self->{insertion_mode} & SELECT_IMS) {
5940 if ($token->{type} == CHARACTER_TOKEN) {
5941 !!!cp ('t271');
5942 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5943 !!!next-token;
5944 next B;
5945 } elsif ($token->{type} == START_TAG_TOKEN) {
5946 if ($token->{tag_name} eq 'option') {
5947 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5948 !!!cp ('t272');
5949 ## As if </option>
5950 pop @{$self->{open_elements}};
5951 } else {
5952 !!!cp ('t273');
5953 }
5954
5955 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5956 !!!nack ('t273.1');
5957 !!!next-token;
5958 next B;
5959 } elsif ($token->{tag_name} eq 'optgroup') {
5960 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5961 !!!cp ('t274');
5962 ## As if </option>
5963 pop @{$self->{open_elements}};
5964 } else {
5965 !!!cp ('t275');
5966 }
5967
5968 if ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
5969 !!!cp ('t276');
5970 ## As if </optgroup>
5971 pop @{$self->{open_elements}};
5972 } else {
5973 !!!cp ('t277');
5974 }
5975
5976 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5977 !!!nack ('t277.1');
5978 !!!next-token;
5979 next B;
5980 } elsif ({
5981 select => 1, input => 1, textarea => 1,
5982 }->{$token->{tag_name}} or
5983 ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5984 {
5985 caption => 1, table => 1,
5986 tbody => 1, tfoot => 1, thead => 1,
5987 tr => 1, td => 1, th => 1,
5988 }->{$token->{tag_name}})) {
5989 ## TODO: The type below is not good - <select> is replaced by </select>
5990 !!!parse-error (type => 'not closed', text => 'select',
5991 token => $token);
5992 ## NOTE: As if the token were </select> (<select> case) or
5993 ## as if there were </select> (otherwise).
5994 ## have an element in table scope
5995 my $i;
5996 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5997 my $node = $self->{open_elements}->[$_];
5998 if ($node->[1] & SELECT_EL) {
5999 !!!cp ('t278');
6000 $i = $_;
6001 last INSCOPE;
6002 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6003 !!!cp ('t279');
6004 last INSCOPE;
6005 }
6006 } # INSCOPE
6007 unless (defined $i) {
6008 !!!cp ('t280');
6009 !!!parse-error (type => 'unmatched end tag',
6010 text => 'select', token => $token);
6011 ## Ignore the token
6012 !!!nack ('t280.1');
6013 !!!next-token;
6014 next B;
6015 }
6016
6017 !!!cp ('t281');
6018 splice @{$self->{open_elements}}, $i;
6019
6020 $self->_reset_insertion_mode;
6021
6022 if ($token->{tag_name} eq 'select') {
6023 !!!nack ('t281.2');
6024 !!!next-token;
6025 next B;
6026 } else {
6027 !!!cp ('t281.1');
6028 !!!ack-later;
6029 ## Reprocess the token.
6030 next B;
6031 }
6032 } else {
6033 !!!cp ('t282');
6034 !!!parse-error (type => 'in select',
6035 text => $token->{tag_name}, token => $token);
6036 ## Ignore the token
6037 !!!nack ('t282.1');
6038 !!!next-token;
6039 next B;
6040 }
6041 } elsif ($token->{type} == END_TAG_TOKEN) {
6042 if ($token->{tag_name} eq 'optgroup') {
6043 if ($self->{open_elements}->[-1]->[1] & OPTION_EL and
6044 $self->{open_elements}->[-2]->[1] & OPTGROUP_EL) {
6045 !!!cp ('t283');
6046 ## As if </option>
6047 splice @{$self->{open_elements}}, -2;
6048 } elsif ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
6049 !!!cp ('t284');
6050 pop @{$self->{open_elements}};
6051 } else {
6052 !!!cp ('t285');
6053 !!!parse-error (type => 'unmatched end tag',
6054 text => $token->{tag_name}, token => $token);
6055 ## Ignore the token
6056 }
6057 !!!nack ('t285.1');
6058 !!!next-token;
6059 next B;
6060 } elsif ($token->{tag_name} eq 'option') {
6061 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6062 !!!cp ('t286');
6063 pop @{$self->{open_elements}};
6064 } else {
6065 !!!cp ('t287');
6066 !!!parse-error (type => 'unmatched end tag',
6067 text => $token->{tag_name}, token => $token);
6068 ## Ignore the token
6069 }
6070 !!!nack ('t287.1');
6071 !!!next-token;
6072 next B;
6073 } elsif ($token->{tag_name} eq 'select') {
6074 ## have an element in table scope
6075 my $i;
6076 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6077 my $node = $self->{open_elements}->[$_];
6078 if ($node->[1] & SELECT_EL) {
6079 !!!cp ('t288');
6080 $i = $_;
6081 last INSCOPE;
6082 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6083 !!!cp ('t289');
6084 last INSCOPE;
6085 }
6086 } # INSCOPE
6087 unless (defined $i) {
6088 !!!cp ('t290');
6089 !!!parse-error (type => 'unmatched end tag',
6090 text => $token->{tag_name}, token => $token);
6091 ## Ignore the token
6092 !!!nack ('t290.1');
6093 !!!next-token;
6094 next B;
6095 }
6096
6097 !!!cp ('t291');
6098 splice @{$self->{open_elements}}, $i;
6099
6100 $self->_reset_insertion_mode;
6101
6102 !!!nack ('t291.1');
6103 !!!next-token;
6104 next B;
6105 } elsif ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
6106 {
6107 caption => 1, table => 1, tbody => 1,
6108 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
6109 }->{$token->{tag_name}}) {
6110 ## TODO: The following is wrong?
6111 !!!parse-error (type => 'unmatched end tag',
6112 text => $token->{tag_name}, token => $token);
6113
6114 ## have an element in table scope
6115 my $i;
6116 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6117 my $node = $self->{open_elements}->[$_];
6118 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6119 !!!cp ('t292');
6120 $i = $_;
6121 last INSCOPE;
6122 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6123 !!!cp ('t293');
6124 last INSCOPE;
6125 }
6126 } # INSCOPE
6127 unless (defined $i) {
6128 !!!cp ('t294');
6129 ## Ignore the token
6130 !!!nack ('t294.1');
6131 !!!next-token;
6132 next B;
6133 }
6134
6135 ## As if </select>
6136 ## have an element in table scope
6137 undef $i;
6138 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6139 my $node = $self->{open_elements}->[$_];
6140 if ($node->[1] & SELECT_EL) {
6141 !!!cp ('t295');
6142 $i = $_;
6143 last INSCOPE;
6144 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6145 ## ISSUE: Can this state be reached?
6146 !!!cp ('t296');
6147 last INSCOPE;
6148 }
6149 } # INSCOPE
6150 unless (defined $i) {
6151 !!!cp ('t297');
6152 ## TODO: The following error type is correct?
6153 !!!parse-error (type => 'unmatched end tag',
6154 text => 'select', token => $token);
6155 ## Ignore the </select> token
6156 !!!nack ('t297.1');
6157 !!!next-token; ## TODO: ok?
6158 next B;
6159 }
6160
6161 !!!cp ('t298');
6162 splice @{$self->{open_elements}}, $i;
6163
6164 $self->_reset_insertion_mode;
6165
6166 !!!ack-later;
6167 ## reprocess
6168 next B;
6169 } else {
6170 !!!cp ('t299');
6171 !!!parse-error (type => 'in select:/',
6172 text => $token->{tag_name}, token => $token);
6173 ## Ignore the token
6174 !!!nack ('t299.3');
6175 !!!next-token;
6176 next B;
6177 }
6178 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6179 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6180 @{$self->{open_elements}} == 1) { # redundant, maybe
6181 !!!cp ('t299.1');
6182 !!!parse-error (type => 'in body:#eof', token => $token);
6183 } else {
6184 !!!cp ('t299.2');
6185 }
6186
6187 ## Stop parsing.
6188 last B;
6189 } else {
6190 die "$0: $token->{type}: Unknown token type";
6191 }
6192 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
6193 if ($token->{type} == CHARACTER_TOKEN) {
6194 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6195 my $data = $1; # save the capture before calling other code
6196 ## As if in body
6197 $reconstruct_active_formatting_elements->($insert_to_current);
6198
6199 $self->{open_elements}->[-1]->[0]->manakai_append_text ($data);
6200
6201 unless (length $token->{data}) {
6202 !!!cp ('t300');
6203 !!!next-token;
6204 next B;
6205 }
6206 }
6207
6208 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6209 !!!cp ('t301');
6210 !!!parse-error (type => 'after html:#text', token => $token);
6211
6212 ## Reprocess in the "after body" insertion mode.
6213 } else {
6214 !!!cp ('t302');
6215 }
6216
6217 ## "after body" insertion mode
6218 !!!parse-error (type => 'after body:#text', token => $token);
6219
6220 $self->{insertion_mode} = IN_BODY_IM;
6221 ## reprocess
6222 next B;
6223 } elsif ($token->{type} == START_TAG_TOKEN) {
6224 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6225 !!!cp ('t303');
6226 !!!parse-error (type => 'after html',
6227 text => $token->{tag_name}, token => $token);
6228
6229 ## Reprocess in the "after body" insertion mode.
6230 } else {
6231 !!!cp ('t304');
6232 }
6233
6234 ## "after body" insertion mode
6235 !!!parse-error (type => 'after body',
6236 text => $token->{tag_name}, token => $token);
6237
6238 $self->{insertion_mode} = IN_BODY_IM;
6239 !!!ack-later;
6240 ## reprocess
6241 next B;
6242 } elsif ($token->{type} == END_TAG_TOKEN) {
6243 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6244 !!!cp ('t305');
6245 !!!parse-error (type => 'after html:/',
6246 text => $token->{tag_name}, token => $token);
6247
6248 $self->{insertion_mode} = AFTER_BODY_IM;
6249 ## Reprocess in the "after body" insertion mode.
6250 } else {
6251 !!!cp ('t306');
6252 }
6253
6254 ## "after body" insertion mode
6255 if ($token->{tag_name} eq 'html') {
6256 if (defined $self->{inner_html_node}) {
6257 !!!cp ('t307');
6258 !!!parse-error (type => 'unmatched end tag',
6259 text => 'html', token => $token);
6260 ## Ignore the token
6261 !!!next-token;
6262 next B;
6263 } else {
6264 !!!cp ('t308');
6265 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
6266 !!!next-token;
6267 next B;
6268 }
6269 } else {
6270 !!!cp ('t309');
6271 !!!parse-error (type => 'after body:/',
6272 text => $token->{tag_name}, token => $token);
6273
6274 $self->{insertion_mode} = IN_BODY_IM;
6275 ## reprocess
6276 next B;
6277 }
6278 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6279 !!!cp ('t309.2');
6280 ## Stop parsing
6281 last B;
6282 } else {
6283 die "$0: $token->{type}: Unknown token type";
6284 }
6285 } elsif ($self->{insertion_mode} & FRAME_IMS) {
6286 if ($token->{type} == CHARACTER_TOKEN) {
6287 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6288 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6289
6290 unless (length $token->{data}) {
6291 !!!cp ('t310');
6292 !!!next-token;
6293 next B;
6294 }
6295 }
6296
6297 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
6298 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6299 !!!cp ('t311');
6300 !!!parse-error (type => 'in frameset:#text', token => $token);
6301 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6302 !!!cp ('t312');
6303 !!!parse-error (type => 'after frameset:#text', token => $token);
6304 } else { # "after after frameset"
6305 !!!cp ('t313');
6306 !!!parse-error (type => 'after html:#text', token => $token);
6307 }
6308
6309 ## Ignore the token.
6310 if (length $token->{data}) {
6311 !!!cp ('t314');
6312 ## reprocess the rest of characters
6313 } else {
6314 !!!cp ('t315');
6315 !!!next-token;
6316 }
6317 next B;
6318 }
6319
6320 die qq[$0: Character "$token->{data}"];
6321 } elsif ($token->{type} == START_TAG_TOKEN) {
6322 if ($token->{tag_name} eq 'frameset' and
6323 $self->{insertion_mode} == IN_FRAMESET_IM) {
6324 !!!cp ('t318');
6325 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6326 !!!nack ('t318.1');
6327 !!!next-token;
6328 next B;
6329 } elsif ($token->{tag_name} eq 'frame' and
6330 $self->{insertion_mode} == IN_FRAMESET_IM) {
6331 !!!cp ('t319');
6332 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6333 pop @{$self->{open_elements}};
6334 !!!ack ('t319.1');
6335 !!!next-token;
6336 next B;
6337 } elsif ($token->{tag_name} eq 'noframes') {
6338 !!!cp ('t320');
6339 ## NOTE: As if in head.
6340 $parse_rcdata->(CDATA_CONTENT_MODEL);
6341 next B;
6342
6343 ## NOTE: |<!DOCTYPE HTML><frameset></frameset></html><noframes></noframes>|
6344 ## has no parse error.
6345 } else {
6346 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6347 !!!cp ('t321');
6348 !!!parse-error (type => 'in frameset',
6349 text => $token->{tag_name}, token => $token);
6350 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6351 !!!cp ('t322');
6352 !!!parse-error (type => 'after frameset',
6353 text => $token->{tag_name}, token => $token);
6354 } else { # "after after frameset"
6355 !!!cp ('t322.2');
6356 !!!parse-error (type => 'after after frameset',
6357 text => $token->{tag_name}, token => $token);
6358 }
6359 ## Ignore the token
6360 !!!nack ('t322.1');
6361 !!!next-token;
6362 next B;
6363 }
6364 } elsif ($token->{type} == END_TAG_TOKEN) {
6365 if ($token->{tag_name} eq 'frameset' and
6366 $self->{insertion_mode} == IN_FRAMESET_IM) {
6367 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
6368 @{$self->{open_elements}} == 1) {
6369 !!!cp ('t325');
6370 !!!parse-error (type => 'unmatched end tag',
6371 text => $token->{tag_name}, token => $token);
6372 ## Ignore the token
6373 !!!next-token;
6374 } else {
6375 !!!cp ('t326');
6376 pop @{$self->{open_elements}};
6377 !!!next-token;
6378 }
6379
6380 if (not defined $self->{inner_html_node} and
6381 not ($self->{open_elements}->[-1]->[1] & FRAMESET_EL)) {
6382 !!!cp ('t327');
6383 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6384 } else {
6385 !!!cp ('t328');
6386 }
6387 next B;
6388 } elsif ($token->{tag_name} eq 'html' and
6389 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
6390 !!!cp ('t329');
6391 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
6392 !!!next-token;
6393 next B;
6394 } else {
6395 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6396 !!!cp ('t330');
6397 !!!parse-error (type => 'in frameset:/',
6398 text => $token->{tag_name}, token => $token);
6399 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6400 !!!cp ('t330.1');
6401 !!!parse-error (type => 'after frameset:/',
6402 text => $token->{tag_name}, token => $token);
6403 } else { # "after after frameset"
6404 !!!cp ('t331');
6405 !!!parse-error (type => 'after after frameset:/',
6406 text => $token->{tag_name}, token => $token);
6407 }
6408 ## Ignore the token
6409 !!!next-token;
6410 next B;
6411 }
6412 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6413 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6414 @{$self->{open_elements}} == 1) { # redundant, maybe
6415 !!!cp ('t331.1');
6416 !!!parse-error (type => 'in body:#eof', token => $token);
6417 } else {
6418 !!!cp ('t331.2');
6419 }
6420
6421 ## Stop parsing
6422 last B;
6423 } else {
6424 die "$0: $token->{type}: Unknown token type";
6425 }
6426
6427 ## ISSUE: An issue in spec here
6428 } else {
6429 die "$0: $self->{insertion_mode}: Unknown insertion mode";
6430 }
6431
6432 ## "in body" insertion mode
6433 if ($token->{type} == START_TAG_TOKEN) {
6434 if ($token->{tag_name} eq 'script') {
6435 !!!cp ('t332');
6436 ## NOTE: This is an "as if in head" code clone
6437 $script_start_tag->();
6438 next B;
6439 } elsif ($token->{tag_name} eq 'style') {
6440 !!!cp ('t333');
6441 ## NOTE: This is an "as if in head" code clone
6442 $parse_rcdata->(CDATA_CONTENT_MODEL);
6443 next B;
6444 } elsif ({
6445 base => 1, link => 1,
6446 }->{$token->{tag_name}}) {
6447 !!!cp ('t334');
6448 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6449 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6450 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6451 !!!ack ('t334.1');
6452 !!!next-token;
6453 next B;
6454 } elsif ($token->{tag_name} eq 'meta') {
6455 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6456 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6457 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6458
6459 unless ($self->{confident}) {
6460 if ($token->{attributes}->{charset}) {
6461 !!!cp ('t335');
6462 ## NOTE: Whether the encoding is supported or not is handled
6463 ## in the {change_encoding} callback.
6464 $self->{change_encoding}
6465 ->($self, $token->{attributes}->{charset}->{value}, $token);
6466
6467 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6468 ->set_user_data (manakai_has_reference =>
6469 $token->{attributes}->{charset}
6470 ->{has_reference});
6471 } elsif ($token->{attributes}->{content}) {
6472 if ($token->{attributes}->{content}->{value}
6473 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
6474 [\x09-\x0D\x20]*=
6475 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
6476 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
6477 !!!cp ('t336');
6478 ## NOTE: Whether the encoding is supported or not is handled
6479 ## in the {change_encoding} callback.
6480 $self->{change_encoding}
6481 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
6482 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6483 ->set_user_data (manakai_has_reference =>
6484 $token->{attributes}->{content}
6485 ->{has_reference});
6486 }
6487 }
6488 } else {
6489 if ($token->{attributes}->{charset}) {
6490 !!!cp ('t337');
6491 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6492 ->set_user_data (manakai_has_reference =>
6493 $token->{attributes}->{charset}
6494 ->{has_reference});
6495 }
6496 if ($token->{attributes}->{content}) {
6497 !!!cp ('t338');
6498 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6499 ->set_user_data (manakai_has_reference =>
6500 $token->{attributes}->{content}
6501 ->{has_reference});
6502 }
6503 }
6504
6505 !!!ack ('t338.1');
6506 !!!next-token;
6507 next B;
6508 } elsif ($token->{tag_name} eq 'title') {
6509 !!!cp ('t341');
6510 ## NOTE: This is an "as if in head" code clone
6511 $parse_rcdata->(RCDATA_CONTENT_MODEL);
6512 next B;
6513 } elsif ($token->{tag_name} eq 'body') {
6514 !!!parse-error (type => 'in body', text => 'body', token => $token);
6515
6516 if (@{$self->{open_elements}} == 1 or
6517 not ($self->{open_elements}->[1]->[1] & BODY_EL)) {
6518 !!!cp ('t342');
6519 ## Ignore the token
6520 } else {
6521 my $body_el = $self->{open_elements}->[1]->[0];
6522 for my $attr_name (keys %{$token->{attributes}}) {
6523 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
6524 !!!cp ('t343');
6525 $body_el->set_attribute_ns
6526 (undef, [undef, $attr_name],
6527 $token->{attributes}->{$attr_name}->{value});
6528 }
6529 }
6530 }
6531 !!!nack ('t343.1');
6532 !!!next-token;
6533 next B;
6534 } elsif ({
6535 address => 1, blockquote => 1, center => 1, dir => 1,
6536 div => 1, dl => 1, fieldset => 1,
6537 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6538 menu => 1, ol => 1, p => 1, ul => 1,
6539 pre => 1, listing => 1,
6540 form => 1,
6541 table => 1,
6542 hr => 1,
6543 }->{$token->{tag_name}}) {
6544 if ($token->{tag_name} eq 'form' and defined $self->{form_element}) {
6545 !!!cp ('t350');
6546 !!!parse-error (type => 'in form:form', token => $token);
6547 ## Ignore the token
6548 !!!nack ('t350.1');
6549 !!!next-token;
6550 next B;
6551 }
6552
6553 ## has a p element in scope
6554 INSCOPE: for (reverse @{$self->{open_elements}}) {
6555 if ($_->[1] & P_EL) {
6556 !!!cp ('t344');
6557 !!!back-token; # <form>
6558 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6559 line => $token->{line}, column => $token->{column}};
6560 next B;
6561 } elsif ($_->[1] & SCOPING_EL) {
6562 !!!cp ('t345');
6563 last INSCOPE;
6564 }
6565 } # INSCOPE
6566
6567 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6568 if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') {
6569 !!!nack ('t346.1');
6570 !!!next-token;
6571 if ($token->{type} == CHARACTER_TOKEN) {
6572 $token->{data} =~ s/^\x0A//;
6573 unless (length $token->{data}) {
6574 !!!cp ('t346');
6575 !!!next-token;
6576 } else {
6577 !!!cp ('t349');
6578 }
6579 } else {
6580 !!!cp ('t348');
6581 }
6582 } elsif ($token->{tag_name} eq 'form') {
6583 !!!cp ('t347.1');
6584 $self->{form_element} = $self->{open_elements}->[-1]->[0];
6585
6586 !!!nack ('t347.2');
6587 !!!next-token;
6588 } elsif ($token->{tag_name} eq 'table') {
6589 !!!cp ('t382');
6590 push @{$open_tables}, [$self->{open_elements}->[-1]->[0]];
6591
6592 $self->{insertion_mode} = IN_TABLE_IM;
6593
6594 !!!nack ('t382.1');
6595 !!!next-token;
6596 } elsif ($token->{tag_name} eq 'hr') {
6597 !!!cp ('t386');
6598 pop @{$self->{open_elements}};
6599
6600 !!!nack ('t386.1');
6601 !!!next-token;
6602 } else {
6603 !!!nack ('t347.1');
6604 !!!next-token;
6605 }
6606 next B;
6607 } elsif ({li => 1, dt => 1, dd => 1}->{$token->{tag_name}}) {
6608 ## has a p element in scope
6609 INSCOPE: for (reverse @{$self->{open_elements}}) {
6610 if ($_->[1] & P_EL) {
6611 !!!cp ('t353');
6612 !!!back-token; # <x>
6613 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6614 line => $token->{line}, column => $token->{column}};
6615 next B;
6616 } elsif ($_->[1] & SCOPING_EL) {
6617 !!!cp ('t354');
6618 last INSCOPE;
6619 }
6620 } # INSCOPE
6621
6622 ## Step 1
6623 my $i = -1;
6624 my $node = $self->{open_elements}->[$i];
6625 my $li_or_dtdd = {li => {li => 1},
6626 dt => {dt => 1, dd => 1},
6627 dd => {dt => 1, dd => 1}}->{$token->{tag_name}};
6628 LI: {
6629 ## Step 2
6630 if ($li_or_dtdd->{$node->[0]->manakai_local_name}) {
6631 if ($i != -1) {
6632 !!!cp ('t355');
6633 !!!parse-error (type => 'not closed',
6634 text => $self->{open_elements}->[-1]->[0]
6635 ->manakai_local_name,
6636 token => $token);
6637 } else {
6638 !!!cp ('t356');
6639 }
6640 splice @{$self->{open_elements}}, $i;
6641 last LI;
6642 } else {
6643 !!!cp ('t357');
6644 }
6645
6646 ## Step 3
6647 if (not ($node->[1] & FORMATTING_EL) and
6648 #not $phrasing_category->{$node->[1]} and
6649 ($node->[1] & SPECIAL_EL or
6650 $node->[1] & SCOPING_EL) and
6651 not ($node->[1] & ADDRESS_EL) and
6652 not ($node->[1] & DIV_EL)) {
6653 !!!cp ('t358');
6654 last LI;
6655 }
6656
6657 !!!cp ('t359');
6658 ## Step 4
6659 $i--;
6660 $node = $self->{open_elements}->[$i];
6661 redo LI;
6662 } # LI
6663
6664 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6665 !!!nack ('t359.1');
6666 !!!next-token;
6667 next B;
6668 } elsif ($token->{tag_name} eq 'plaintext') {
6669 ## has a p element in scope
6670 INSCOPE: for (reverse @{$self->{open_elements}}) {
6671 if ($_->[1] & P_EL) {
6672 !!!cp ('t367');
6673 !!!back-token; # <plaintext>
6674 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6675 line => $token->{line}, column => $token->{column}};
6676 next B;
6677 } elsif ($_->[1] & SCOPING_EL) {
6678 !!!cp ('t368');
6679 last INSCOPE;
6680 }
6681 } # INSCOPE
6682
6683 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6684
6685 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
6686
6687 !!!nack ('t368.1');
6688 !!!next-token;
6689 next B;
6690 } elsif ($token->{tag_name} eq 'a') {
6691 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
6692 my $node = $active_formatting_elements->[$i];
6693 if ($node->[1] & A_EL) {
6694 !!!cp ('t371');
6695 !!!parse-error (type => 'in a:a', token => $token);
6696
6697 !!!back-token; # <a>
6698 $token = {type => END_TAG_TOKEN, tag_name => 'a',
6699 line => $token->{line}, column => $token->{column}};
6700 $formatting_end_tag->($token);
6701
6702 AFE2: for (reverse 0..$#$active_formatting_elements) {
6703 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
6704 !!!cp ('t372');
6705 splice @$active_formatting_elements, $_, 1;
6706 last AFE2;
6707 }
6708 } # AFE2
6709 OE: for (reverse 0..$#{$self->{open_elements}}) {
6710 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
6711 !!!cp ('t373');
6712 splice @{$self->{open_elements}}, $_, 1;
6713 last OE;
6714 }
6715 } # OE
6716 last AFE;
6717 } elsif ($node->[0] eq '#marker') {
6718 !!!cp ('t374');
6719 last AFE;
6720 }
6721 } # AFE
6722
6723 $reconstruct_active_formatting_elements->($insert_to_current);
6724
6725 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6726 push @$active_formatting_elements, $self->{open_elements}->[-1];
6727
6728 !!!nack ('t374.1');
6729 !!!next-token;
6730 next B;
6731 } elsif ($token->{tag_name} eq 'nobr') {
6732 $reconstruct_active_formatting_elements->($insert_to_current);
6733
6734 ## has a |nobr| element in scope
6735 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6736 my $node = $self->{open_elements}->[$_];
6737 if ($node->[1] & NOBR_EL) {
6738 !!!cp ('t376');
6739 !!!parse-error (type => 'in nobr:nobr', token => $token);
6740 !!!back-token; # <nobr>
6741 $token = {type => END_TAG_TOKEN, tag_name => 'nobr',
6742 line => $token->{line}, column => $token->{column}};
6743 next B;
6744 } elsif ($node->[1] & SCOPING_EL) {
6745 !!!cp ('t377');
6746 last INSCOPE;
6747 }
6748 } # INSCOPE
6749
6750 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6751 push @$active_formatting_elements, $self->{open_elements}->[-1];
6752
6753 !!!nack ('t377.1');
6754 !!!next-token;
6755 next B;
6756 } elsif ($token->{tag_name} eq 'button') {
6757 ## has a button element in scope
6758 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6759 my $node = $self->{open_elements}->[$_];
6760 if ($node->[1] & BUTTON_EL) {
6761 !!!cp ('t378');
6762 !!!parse-error (type => 'in button:button', token => $token);
6763 !!!back-token; # <button>
6764 $token = {type => END_TAG_TOKEN, tag_name => 'button',
6765 line => $token->{line}, column => $token->{column}};
6766 next B;
6767 } elsif ($node->[1] & SCOPING_EL) {
6768 !!!cp ('t379');
6769 last INSCOPE;
6770 }
6771 } # INSCOPE
6772
6773 $reconstruct_active_formatting_elements->($insert_to_current);
6774
6775 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6776
6777 ## TODO: associate with $self->{form_element} if defined
6778
6779 push @$active_formatting_elements, ['#marker', ''];
6780
6781 !!!nack ('t379.1');
6782 !!!next-token;
6783 next B;
6784 } elsif ({
6785 xmp => 1,
6786 iframe => 1,
6787 noembed => 1,
6788 noframes => 1, ## NOTE: This is an "as if in head" code clone.
6789 noscript => 0, ## TODO: 1 if scripting is enabled
6790 }->{$token->{tag_name}}) {
6791 if ($token->{tag_name} eq 'xmp') {
6792 !!!cp ('t381');
6793 $reconstruct_active_formatting_elements->($insert_to_current);
6794 } else {
6795 !!!cp ('t399');
6796 }
6797 ## NOTE: There is an "as if in body" code clone.
6798 $parse_rcdata->(CDATA_CONTENT_MODEL);
6799 next B;
6800 } elsif ($token->{tag_name} eq 'isindex') {
6801 !!!parse-error (type => 'isindex', token => $token);
6802
6803 if (defined $self->{form_element}) {
6804 !!!cp ('t389');
6805 ## Ignore the token
6806 !!!nack ('t389'); ## NOTE: Not acknowledged.
6807 !!!next-token;
6808 next B;
6809 } else {
6810 !!!ack ('t391.1');
6811
6812 my $at = $token->{attributes};
6813 my $form_attrs;
6814 $form_attrs->{action} = $at->{action} if $at->{action};
6815 my $prompt_attr = $at->{prompt};
6816 $at->{name} = {name => 'name', value => 'isindex'};
6817 delete $at->{action};
6818 delete $at->{prompt};
6819 my @tokens = (
6820 {type => START_TAG_TOKEN, tag_name => 'form',
6821 attributes => $form_attrs,
6822 line => $token->{line}, column => $token->{column}},
6823 {type => START_TAG_TOKEN, tag_name => 'hr',
6824 line => $token->{line}, column => $token->{column}},
6825 {type => START_TAG_TOKEN, tag_name => 'p',
6826 line => $token->{line}, column => $token->{column}},
6827 {type => START_TAG_TOKEN, tag_name => 'label',
6828 line => $token->{line}, column => $token->{column}},
6829 );
6830 if ($prompt_attr) {
6831 !!!cp ('t390');
6832 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value},
6833 #line => $token->{line}, column => $token->{column},
6834 };
6835 } else {
6836 !!!cp ('t391');
6837 push @tokens, {type => CHARACTER_TOKEN,
6838 data => 'This is a searchable index. Insert your search keywords here: ',
6839 #line => $token->{line}, column => $token->{column},
6840 }; # SHOULD
6841 ## TODO: make this configurable
6842 }
6843 push @tokens,
6844 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at,
6845 line => $token->{line}, column => $token->{column}},
6846 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
6847 {type => END_TAG_TOKEN, tag_name => 'label',
6848 line => $token->{line}, column => $token->{column}},
6849 {type => END_TAG_TOKEN, tag_name => 'p',
6850 line => $token->{line}, column => $token->{column}},
6851 {type => START_TAG_TOKEN, tag_name => 'hr',
6852 line => $token->{line}, column => $token->{column}},
6853 {type => END_TAG_TOKEN, tag_name => 'form',
6854 line => $token->{line}, column => $token->{column}};
6855 !!!back-token (@tokens);
6856 !!!next-token;
6857 next B;
6858 }
6859 } elsif ($token->{tag_name} eq 'textarea') {
6860 my $tag_name = $token->{tag_name};
6861 my $el;
6862 !!!create-element ($el, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
6863
6864 ## TODO: $self->{form_element} if defined
6865 $self->{content_model} = RCDATA_CONTENT_MODEL;
6866 delete $self->{escape}; # MUST
6867
6868 $insert->($el);
6869
6870 my $text = '';
6871 !!!nack ('t392.1');
6872 !!!next-token;
6873 if ($token->{type} == CHARACTER_TOKEN) {
6874 $token->{data} =~ s/^\x0A//;
6875 unless (length $token->{data}) {
6876 !!!cp ('t392');
6877 !!!next-token;
6878 } else {
6879 !!!cp ('t393');
6880 }
6881 } else {
6882 !!!cp ('t394');
6883 }
6884 while ($token->{type} == CHARACTER_TOKEN) {
6885 !!!cp ('t395');
6886 $text .= $token->{data};
6887 !!!next-token;
6888 }
6889 if (length $text) {
6890 !!!cp ('t396');
6891 $el->manakai_append_text ($text);
6892 }
6893
6894 $self->{content_model} = PCDATA_CONTENT_MODEL;
6895
6896 if ($token->{type} == END_TAG_TOKEN and
6897 $token->{tag_name} eq $tag_name) {
6898 !!!cp ('t397');
6899 ## Ignore the token
6900 } else {
6901 !!!cp ('t398');
6902 !!!parse-error (type => 'in RCDATA:#eof', token => $token);
6903 }
6904 !!!next-token;
6905 next B;
6906 } elsif ($token->{tag_name} eq 'rt' or
6907 $token->{tag_name} eq 'rp') {
6908 ## has a |ruby| element in scope
6909 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6910 my $node = $self->{open_elements}->[$_];
6911 if ($node->[1] & RUBY_EL) {
6912 !!!cp ('t398.1');
6913 ## generate implied end tags
6914 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
6915 !!!cp ('t398.2');
6916 pop @{$self->{open_elements}};
6917 }
6918 unless ($self->{open_elements}->[-1]->[1] & RUBY_EL) {
6919 !!!cp ('t398.3');
6920 !!!parse-error (type => 'not closed',
6921 text => $self->{open_elements}->[-1]->[0]
6922 ->manakai_local_name,
6923 token => $token);
6924 pop @{$self->{open_elements}}
6925 while not $self->{open_elements}->[-1]->[1] & RUBY_EL;
6926 }
6927 last INSCOPE;
6928 } elsif ($node->[1] & SCOPING_EL) {
6929 !!!cp ('t398.4');
6930 last INSCOPE;
6931 }
6932 } # INSCOPE
6933
6934 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6935
6936 !!!nack ('t398.5');
6937 !!!next-token;
6938 redo B;
6939 } elsif ($token->{tag_name} eq 'math' or
6940 $token->{tag_name} eq 'svg') {
6941 $reconstruct_active_formatting_elements->($insert_to_current);
6942
6943 ## "Adjust MathML attributes" ('math' only) - done in insert-element-f
6944
6945 ## "adjust SVG attributes" ('svg' only) - done in insert-element-f
6946
6947 ## "adjust foreign attributes" - done in insert-element-f
6948
6949 !!!insert-element-f ($token->{tag_name} eq 'math' ? $MML_NS : $SVG_NS, $token->{tag_name}, $token->{attributes}, $token);
6950
6951 if ($self->{self_closing}) {
6952 pop @{$self->{open_elements}};
6953 !!!ack ('t398.1');
6954 } else {
6955 !!!cp ('t398.2');
6956 $self->{insertion_mode} |= IN_FOREIGN_CONTENT_IM;
6957 ## NOTE: |<body><math><mi><svg>| -> "in foreign content" insertion
6958 ## mode, "in body" (not "in foreign content") secondary insertion
6959 ## mode, maybe.
6960 }
6961
6962 !!!next-token;
6963 next B;
6964 } elsif ({
6965 caption => 1, col => 1, colgroup => 1, frame => 1,
6966 frameset => 1, head => 1, option => 1, optgroup => 1,
6967 tbody => 1, td => 1, tfoot => 1, th => 1,
6968 thead => 1, tr => 1,
6969 }->{$token->{tag_name}}) {
6970 !!!cp ('t401');
6971 !!!parse-error (type => 'in body',
6972 text => $token->{tag_name}, token => $token);
6973 ## Ignore the token
6974 !!!nack ('t401.1'); ## NOTE: |<col/>| or |<frame/>| here is an error.
6975 !!!next-token;
6976 next B;
6977
6978 ## ISSUE: An issue on HTML5 new elements in the spec.
6979 } else {
6980 if ($token->{tag_name} eq 'image') {
6981 !!!cp ('t384');
6982 !!!parse-error (type => 'image', token => $token);
6983 $token->{tag_name} = 'img';
6984 } else {
6985 !!!cp ('t385');
6986 }
6987
6988 ## NOTE: There is an "as if <br>" code clone.
6989 $reconstruct_active_formatting_elements->($insert_to_current);
6990
6991 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6992
6993 if ({
6994 applet => 1, marquee => 1, object => 1,
6995 }->{$token->{tag_name}}) {
6996 !!!cp ('t380');
6997 push @$active_formatting_elements, ['#marker', ''];
6998 !!!nack ('t380.1');
6999 } elsif ({
7000 b => 1, big => 1, em => 1, font => 1, i => 1,
7001 s => 1, small => 1, strile => 1,
7002 strong => 1, tt => 1, u => 1,
7003 }->{$token->{tag_name}}) {
7004 !!!cp ('t375');
7005 push @$active_formatting_elements, $self->{open_elements}->[-1];
7006 !!!nack ('t375.1');
7007 } elsif ($token->{tag_name} eq 'input') {
7008 !!!cp ('t388');
7009 ## TODO: associate with $self->{form_element} if defined
7010 pop @{$self->{open_elements}};
7011 !!!ack ('t388.2');
7012 } elsif ({
7013 area => 1, basefont => 1, bgsound => 1, br => 1,
7014 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
7015 #image => 1,
7016 }->{$token->{tag_name}}) {
7017 !!!cp ('t388.1');
7018 pop @{$self->{open_elements}};
7019 !!!ack ('t388.3');
7020 } elsif ($token->{tag_name} eq 'select') {
7021 ## TODO: associate with $self->{form_element} if defined
7022
7023 if ($self->{insertion_mode} & TABLE_IMS or
7024 $self->{insertion_mode} & BODY_TABLE_IMS or
7025 $self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
7026 !!!cp ('t400.1');
7027 $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM;
7028 } else {
7029 !!!cp ('t400.2');
7030 $self->{insertion_mode} = IN_SELECT_IM;
7031 }
7032 !!!nack ('t400.3');
7033 } else {
7034 !!!nack ('t402');
7035 }
7036
7037 !!!next-token;
7038 next B;
7039 }
7040 } elsif ($token->{type} == END_TAG_TOKEN) {
7041 if ($token->{tag_name} eq 'body') {
7042 ## has a |body| element in scope
7043 my $i;
7044 INSCOPE: {
7045 for (reverse @{$self->{open_elements}}) {
7046 if ($_->[1] & BODY_EL) {
7047 !!!cp ('t405');
7048 $i = $_;
7049 last INSCOPE;
7050 } elsif ($_->[1] & SCOPING_EL) {
7051 !!!cp ('t405.1');
7052 last;
7053 }
7054 }
7055
7056 !!!parse-error (type => 'start tag not allowed',
7057 text => $token->{tag_name}, token => $token);
7058 ## NOTE: Ignore the token.
7059 !!!next-token;
7060 next B;
7061 } # INSCOPE
7062
7063 for (@{$self->{open_elements}}) {
7064 unless ($_->[1] & ALL_END_TAG_OPTIONAL_EL) {
7065 !!!cp ('t403');
7066 !!!parse-error (type => 'not closed',
7067 text => $_->[0]->manakai_local_name,
7068 token => $token);
7069 last;
7070 } else {
7071 !!!cp ('t404');
7072 }
7073 }
7074
7075 $self->{insertion_mode} = AFTER_BODY_IM;
7076 !!!next-token;
7077 next B;
7078 } elsif ($token->{tag_name} eq 'html') {
7079 ## TODO: Update this code. It seems that the code below is not
7080 ## up-to-date, though it has the same effect as specified.
7081 if (@{$self->{open_elements}} > 1 and
7082 $self->{open_elements}->[1]->[1] & BODY_EL) {
7083 ## ISSUE: There is an issue in the spec.
7084 unless ($self->{open_elements}->[-1]->[1] & BODY_EL) {
7085 !!!cp ('t406');
7086 !!!parse-error (type => 'not closed',
7087 text => $self->{open_elements}->[1]->[0]
7088 ->manakai_local_name,
7089 token => $token);
7090 } else {
7091 !!!cp ('t407');
7092 }
7093 $self->{insertion_mode} = AFTER_BODY_IM;
7094 ## reprocess
7095 next B;
7096 } else {
7097 !!!cp ('t408');
7098 !!!parse-error (type => 'unmatched end tag',
7099 text => $token->{tag_name}, token => $token);
7100 ## Ignore the token
7101 !!!next-token;
7102 next B;
7103 }
7104 } elsif ({
7105 address => 1, blockquote => 1, center => 1, dir => 1,
7106 div => 1, dl => 1, fieldset => 1, listing => 1,
7107 menu => 1, ol => 1, pre => 1, ul => 1,
7108 dd => 1, dt => 1, li => 1,
7109 applet => 1, button => 1, marquee => 1, object => 1,
7110 }->{$token->{tag_name}}) {
7111 ## has an element in scope
7112 my $i;
7113 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7114 my $node = $self->{open_elements}->[$_];
7115 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7116 !!!cp ('t410');
7117 $i = $_;
7118 last INSCOPE;
7119 } elsif ($node->[1] & SCOPING_EL) {
7120 !!!cp ('t411');
7121 last INSCOPE;
7122 }
7123 } # INSCOPE
7124
7125 unless (defined $i) { # has an element in scope
7126 !!!cp ('t413');
7127 !!!parse-error (type => 'unmatched end tag',
7128 text => $token->{tag_name}, token => $token);
7129 ## NOTE: Ignore the token.
7130 } else {
7131 ## Step 1. generate implied end tags
7132 while ({
7133 ## END_TAG_OPTIONAL_EL
7134 dd => ($token->{tag_name} ne 'dd'),
7135 dt => ($token->{tag_name} ne 'dt'),
7136 li => ($token->{tag_name} ne 'li'),
7137 p => 1,
7138 rt => 1,
7139 rp => 1,
7140 }->{$self->{open_elements}->[-1]->[0]->manakai_local_name}) {
7141 !!!cp ('t409');
7142 pop @{$self->{open_elements}};
7143 }
7144
7145 ## Step 2.
7146 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7147 ne $token->{tag_name}) {
7148 !!!cp ('t412');
7149 !!!parse-error (type => 'not closed',
7150 text => $self->{open_elements}->[-1]->[0]
7151 ->manakai_local_name,
7152 token => $token);
7153 } else {
7154 !!!cp ('t414');
7155 }
7156
7157 ## Step 3.
7158 splice @{$self->{open_elements}}, $i;
7159
7160 ## Step 4.
7161 $clear_up_to_marker->()
7162 if {
7163 applet => 1, button => 1, marquee => 1, object => 1,
7164 }->{$token->{tag_name}};
7165 }
7166 !!!next-token;
7167 next B;
7168 } elsif ($token->{tag_name} eq 'form') {
7169 undef $self->{form_element};
7170
7171 ## has an element in scope
7172 my $i;
7173 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7174 my $node = $self->{open_elements}->[$_];
7175 if ($node->[1] & FORM_EL) {
7176 !!!cp ('t418');
7177 $i = $_;
7178 last INSCOPE;
7179 } elsif ($node->[1] & SCOPING_EL) {
7180 !!!cp ('t419');
7181 last INSCOPE;
7182 }
7183 } # INSCOPE
7184
7185 unless (defined $i) { # has an element in scope
7186 !!!cp ('t421');
7187 !!!parse-error (type => 'unmatched end tag',
7188 text => $token->{tag_name}, token => $token);
7189 ## NOTE: Ignore the token.
7190 } else {
7191 ## Step 1. generate implied end tags
7192 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7193 !!!cp ('t417');
7194 pop @{$self->{open_elements}};
7195 }
7196
7197 ## Step 2.
7198 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7199 ne $token->{tag_name}) {
7200 !!!cp ('t417.1');
7201 !!!parse-error (type => 'not closed',
7202 text => $self->{open_elements}->[-1]->[0]
7203 ->manakai_local_name,
7204 token => $token);
7205 } else {
7206 !!!cp ('t420');
7207 }
7208
7209 ## Step 3.
7210 splice @{$self->{open_elements}}, $i;
7211 }
7212
7213 !!!next-token;
7214 next B;
7215 } elsif ({
7216 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
7217 }->{$token->{tag_name}}) {
7218 ## has an element in scope
7219 my $i;
7220 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7221 my $node = $self->{open_elements}->[$_];
7222 if ($node->[1] & HEADING_EL) {
7223 !!!cp ('t423');
7224 $i = $_;
7225 last INSCOPE;
7226 } elsif ($node->[1] & SCOPING_EL) {
7227 !!!cp ('t424');
7228 last INSCOPE;
7229 }
7230 } # INSCOPE
7231
7232 unless (defined $i) { # has an element in scope
7233 !!!cp ('t425.1');
7234 !!!parse-error (type => 'unmatched end tag',
7235 text => $token->{tag_name}, token => $token);
7236 ## NOTE: Ignore the token.
7237 } else {
7238 ## Step 1. generate implied end tags
7239 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7240 !!!cp ('t422');
7241 pop @{$self->{open_elements}};
7242 }
7243
7244 ## Step 2.
7245 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7246 ne $token->{tag_name}) {
7247 !!!cp ('t425');
7248 !!!parse-error (type => 'unmatched end tag',
7249 text => $token->{tag_name}, token => $token);
7250 } else {
7251 !!!cp ('t426');
7252 }
7253
7254 ## Step 3.
7255 splice @{$self->{open_elements}}, $i;
7256 }
7257
7258 !!!next-token;
7259 next B;
7260 } elsif ($token->{tag_name} eq 'p') {
7261 ## has an element in scope
7262 my $i;
7263 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7264 my $node = $self->{open_elements}->[$_];
7265 if ($node->[1] & P_EL) {
7266 !!!cp ('t410.1');
7267 $i = $_;
7268 last INSCOPE;
7269 } elsif ($node->[1] & SCOPING_EL) {
7270 !!!cp ('t411.1');
7271 last INSCOPE;
7272 }
7273 } # INSCOPE
7274
7275 if (defined $i) {
7276 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7277 ne $token->{tag_name}) {
7278 !!!cp ('t412.1');
7279 !!!parse-error (type => 'not closed',
7280 text => $self->{open_elements}->[-1]->[0]
7281 ->manakai_local_name,
7282 token => $token);
7283 } else {
7284 !!!cp ('t414.1');
7285 }
7286
7287 splice @{$self->{open_elements}}, $i;
7288 } else {
7289 !!!cp ('t413.1');
7290 !!!parse-error (type => 'unmatched end tag',
7291 text => $token->{tag_name}, token => $token);
7292
7293 !!!cp ('t415.1');
7294 ## As if <p>, then reprocess the current token
7295 my $el;
7296 !!!create-element ($el, $HTML_NS, 'p',, $token);
7297 $insert->($el);
7298 ## NOTE: Not inserted into |$self->{open_elements}|.
7299 }
7300
7301 !!!next-token;
7302 next B;
7303 } elsif ({
7304 a => 1,
7305 b => 1, big => 1, em => 1, font => 1, i => 1,
7306 nobr => 1, s => 1, small => 1, strile => 1,
7307 strong => 1, tt => 1, u => 1,
7308 }->{$token->{tag_name}}) {
7309 !!!cp ('t427');
7310 $formatting_end_tag->($token);
7311 next B;
7312 } elsif ($token->{tag_name} eq 'br') {
7313 !!!cp ('t428');
7314 !!!parse-error (type => 'unmatched end tag',
7315 text => 'br', token => $token);
7316
7317 ## As if <br>
7318 $reconstruct_active_formatting_elements->($insert_to_current);
7319
7320 my $el;
7321 !!!create-element ($el, $HTML_NS, 'br',, $token);
7322 $insert->($el);
7323
7324 ## Ignore the token.
7325 !!!next-token;
7326 next B;
7327 } elsif ({
7328 caption => 1, col => 1, colgroup => 1, frame => 1,
7329 frameset => 1, head => 1, option => 1, optgroup => 1,
7330 tbody => 1, td => 1, tfoot => 1, th => 1,
7331 thead => 1, tr => 1,
7332 area => 1, basefont => 1, bgsound => 1,
7333 embed => 1, hr => 1, iframe => 1, image => 1,
7334 img => 1, input => 1, isindex => 1, noembed => 1,
7335 noframes => 1, param => 1, select => 1, spacer => 1,
7336 table => 1, textarea => 1, wbr => 1,
7337 noscript => 0, ## TODO: if scripting is enabled
7338 }->{$token->{tag_name}}) {
7339 !!!cp ('t429');
7340 !!!parse-error (type => 'unmatched end tag',
7341 text => $token->{tag_name}, token => $token);
7342 ## Ignore the token
7343 !!!next-token;
7344 next B;
7345
7346 ## ISSUE: Issue on HTML5 new elements in spec
7347
7348 } else {
7349 ## Step 1
7350 my $node_i = -1;
7351 my $node = $self->{open_elements}->[$node_i];
7352
7353 ## Step 2
7354 S2: {
7355 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7356 ## Step 1
7357 ## generate implied end tags
7358 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7359 !!!cp ('t430');
7360 ## NOTE: |<ruby><rt></ruby>|.
7361 ## ISSUE: <ruby><rt></rt> will also take this code path,
7362 ## which seems wrong.
7363 pop @{$self->{open_elements}};
7364 $node_i++;
7365 }
7366
7367 ## Step 2
7368 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7369 ne $token->{tag_name}) {
7370 !!!cp ('t431');
7371 ## NOTE: <x><y></x>
7372 !!!parse-error (type => 'not closed',
7373 text => $self->{open_elements}->[-1]->[0]
7374 ->manakai_local_name,
7375 token => $token);
7376 } else {
7377 !!!cp ('t432');
7378 }
7379
7380 ## Step 3
7381 splice @{$self->{open_elements}}, $node_i if $node_i < 0;
7382
7383 !!!next-token;
7384 last S2;
7385 } else {
7386 ## Step 3
7387 if (not ($node->[1] & FORMATTING_EL) and
7388 #not $phrasing_category->{$node->[1]} and
7389 ($node->[1] & SPECIAL_EL or
7390 $node->[1] & SCOPING_EL)) {
7391 !!!cp ('t433');
7392 !!!parse-error (type => 'unmatched end tag',
7393 text => $token->{tag_name}, token => $token);
7394 ## Ignore the token
7395 !!!next-token;
7396 last S2;
7397 }
7398
7399 !!!cp ('t434');
7400 }
7401
7402 ## Step 4
7403 $node_i--;
7404 $node = $self->{open_elements}->[$node_i];
7405
7406 ## Step 5;
7407 redo S2;
7408 } # S2
7409 next B;
7410 }
7411 }
7412 next B;
7413 } continue { # B
7414 if ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
7415 ## NOTE: The code below is executed in cases where it does not have
7416 ## to be, but it is harmless even in those cases.
7417 ## has an element in scope
7418 INSCOPE: {
7419 for (reverse 0..$#{$self->{open_elements}}) {
7420 my $node = $self->{open_elements}->[$_];
7421 if ($node->[1] & FOREIGN_EL) {
7422 last INSCOPE;
7423 } elsif ($node->[1] & SCOPING_EL) {
7424 last;
7425 }
7426 }
7427
7428 ## NOTE: No foreign element in scope.
7429 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
7430 } # INSCOPE
7431 }
7432 } # B
7433
7434 ## Stop parsing # MUST
7435
7436 ## TODO: script stuffs
7437 } # _tree_construct_main
7438
## Sets the inner HTML of a node: parses a string and replaces the
## children of |$node| with the result.
##
## Arguments (after the invocant class):
##   $node  - a node whose |node_type| is 9 (document) or 1 (element).
##   $_[0]  - the string to parse.  It is accessed through a reference
##            and is not copied.
##   $_[1]  - optional error handler.  It is called with key/value pairs
##            that include |line| and |column| followed by whatever the
##            parser reports.  If omitted, errors are reported by |warn|.
##
## Dies if |$node| has any other node type.
sub set_inner_html ($$$) {
  my $class = shift;
  my $node = shift;
  my $s = \$_[0];
  my $onerror = $_[1];

  ## ISSUE: Should {confident} be true?

  my $nt = $node->node_type;
  if ($nt == 9) {
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## NOTE: The child list is copied first so that removing nodes does
    ## not disturb the iteration.
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }

    ## Step 3, 4, 5 # MUST
    $class->parse_string ($$s => $node, $onerror);
  } elsif ($nt == 1) {
    ## TODO: If non-html element

    ## NOTE: Most of this code is copied from |parse_string|

    ## Step 1 # MUST
    ## The fragment is parsed into a scratch document created from the
    ## implementation of the context node's owner document.
    my $this_doc = $node->owner_document;
    my $doc = $this_doc->implementation->create_document;
    $doc->manakai_is_html (1);
    my $p = $class->new;
    $p->{document} = $doc;

    ## Step 8 # MUST
    my $i = 0; # index of the next character of |$$s| to be read
    $p->{line_prev} = $p->{line} = 1;
    $p->{column_prev} = $p->{column} = 0;
    ## Input stream reader: shifts the previous-character window, stores
    ## the next code point (or -1 at the end of input) in |next_char|,
    ## and maintains the line/column counters.
    $p->{set_next_char} = sub {
      my $self = shift;

      pop @{$self->{prev_char}};
      unshift @{$self->{prev_char}}, $self->{next_char};

      if ($i >= length $$s) {
        ## End of input.
        $self->{next_char} = -1;
        return;
      }
      $self->{next_char} = ord substr $$s, $i++, 1;

      ($p->{line_prev}, $p->{column_prev}) = ($p->{line}, $p->{column});
      $p->{column}++;

      if ($self->{next_char} == 0x000A) { # LF
        $p->{line}++;
        $p->{column} = 0;
        !!!cp ('i1');
      } elsif ($self->{next_char} == 0x000D) { # CR
        ## A CR LF pair is consumed as a single character.
        $i++ if substr ($$s, $i, 1) eq "\x0A";
        $self->{next_char} = 0x000A; # LF # MUST
        $p->{line}++;
        $p->{column} = 0;
        !!!cp ('i2');
      } elsif ($self->{next_char} > 0x10FFFF) {
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
        !!!cp ('i3');
      } elsif ($self->{next_char} == 0x0000) { # NULL
        !!!cp ('i4');
        !!!parse-error (type => 'NULL');
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
      } elsif ($self->{next_char} <= 0x0008 or
               (0x000E <= $self->{next_char} and
                $self->{next_char} <= 0x001F) or
               (0x007F <= $self->{next_char} and
                $self->{next_char} <= 0x009F) or
               (0xD800 <= $self->{next_char} and
                $self->{next_char} <= 0xDFFF) or
               (0xFDD0 <= $self->{next_char} and
                $self->{next_char} <= 0xFDDF) or
               {
                0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
                0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
                0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
                0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
                0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
                0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
                0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
                0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
                0x10FFFE => 1, 0x10FFFF => 1,
               }->{$self->{next_char}}) {
        ## The character is reported but left unchanged.
        !!!cp ('i4.1');
        if ($self->{next_char} < 0x10000) {
          !!!parse-error (type => 'control char',
                          text => (sprintf 'U+%04X', $self->{next_char}));
        } else {
          !!!parse-error (type => 'control char',
                          text => (sprintf 'U-%08X', $self->{next_char}));
        }
      }
    };
    $p->{prev_char} = [-1, -1, -1];
    $p->{next_char} = -1;

    ## Default error handler: prefers the position recorded in the token,
    ## if any, over the current position of the input stream.
    my $ponerror = $onerror || sub {
      my (%opt) = @_;
      my $line = $opt{line};
      my $column = $opt{column};
      if (defined $opt{token} and defined $opt{token}->{line}) {
        $line = $opt{token}->{line};
        $column = $opt{token}->{column};
      }
      warn "Parse error ($opt{type}) at line $line column $column\n";
    };
    $p->{parse_error} = sub {
      $ponerror->(line => $p->{line}, column => $p->{column}, @_);
    };

    $p->_initialize_tokenizer;
    $p->_initialize_tree_constructor;

    ## Step 2
    ## The initial content model is chosen by the context element's name.
    my $node_ln = $node->manakai_local_name;
    $p->{content_model} = {
      title => RCDATA_CONTENT_MODEL,
      textarea => RCDATA_CONTENT_MODEL,
      style => CDATA_CONTENT_MODEL,
      script => CDATA_CONTENT_MODEL,
      xmp => CDATA_CONTENT_MODEL,
      iframe => CDATA_CONTENT_MODEL,
      noembed => CDATA_CONTENT_MODEL,
      noframes => CDATA_CONTENT_MODEL,
      noscript => CDATA_CONTENT_MODEL,
      plaintext => PLAINTEXT_CONTENT_MODEL,
    }->{$node_ln};
    $p->{content_model} = PCDATA_CONTENT_MODEL
        unless defined $p->{content_model};
    ## ISSUE: What is "the name of the element"? local name?

    $p->{inner_html_node} = [$node, $el_category->{$node_ln}];
    ## TODO: Foreign element OK?

    ## Step 3
    my $root = $doc->create_element_ns ($HTML_NS, [undef, 'html']);

    ## Step 4 # MUST
    $doc->append_child ($root);

    ## Step 5 # MUST
    push @{$p->{open_elements}}, [$root, $el_category->{html}];

    undef $p->{head_element};

    ## Step 6 # MUST
    $p->_reset_insertion_mode;

    ## Step 7 # MUST
    ## The nearest HTML |form| element among the context node and its
    ## ancestors, if any, becomes the form element pointer.
    my $anode = $node;
    AN: while (defined $anode) {
      if ($anode->node_type == 1) {
        my $nsuri = $anode->namespace_uri;
        if (defined $nsuri and $nsuri eq $HTML_NS) {
          if ($anode->manakai_local_name eq 'form') {
            !!!cp ('i5');
            $p->{form_element} = $anode;
            last AN;
          }
        }
      }
      $anode = $anode->parent_node;
    } # AN

    ## Step 9 # MUST
    {
      ## NOTE: |$self| is declared here for the macro below.
      my $self = $p;
      !!!next-token;
    }
    $p->_tree_construction_main;

    ## Step 10 # MUST
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }
    ## ISSUE: mutation events? read-only?

    ## Step 11 # MUST
    @cn = @{$root->child_nodes};
    for (@cn) {
      $this_doc->adopt_node ($_);
      $node->append_child ($_);
    }
    ## ISSUE: mutation events?

    $p->_terminate_tree_constructor;

    ## Both closures capture |$p| while being stored in |$p|.  Remove
    ## them so that the parser object can be freed.
    delete $p->{parse_error}; # delete loop
    delete $p->{set_next_char}; # delete loop
  } else {
    die "$0: |set_inner_html| is not defined for node of type $nt";
  }
} # set_inner_html
7639
7640 } # tree construction stage
7641
## A subclass of |Error|.
## NOTE(review): presumably thrown to request that parsing be restarted;
## confirm at the place where it is thrown.
package Whatpm::HTML::RestartParser;
our @ISA;
push @ISA, 'Error';

1;
7646 # $Date: 2008/08/31 09:12:30 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24