/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.155 - (show annotations) (download) (as text)
Sat Aug 30 12:57:05 2008 UTC (17 years, 4 months ago) by wakaba
Branch: MAIN
Changes since 1.154: +4 -2 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	30 Aug 2008 12:56:52 -0000
	* HTML-tree.dat: tree-test-3.dat added.

	* tree-test-3.dat: Test data for definitionURL="" are added (cf.
	HTML5 revision 2130).

2008-08-30  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	30 Aug 2008 12:55:11 -0000
	* mkhtmlparser.pl: Support for MathML |definitionURL| attribute (HTML5
	revision 2130).

2008-08-30  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.154 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 use Error qw(:try);
5
6 ## ISSUE:
7 ## var doc = implementation.createDocument (null, null, null);
8 ## doc.write ('');
9 ## alert (doc.compatMode);
10
11 require IO::Handle;
12
## Namespace URIs of the vocabularies this parser creates nodes in.
my $HTML_NS  = 'http://www.w3.org/1999/xhtml';
my $MML_NS   = 'http://www.w3.org/1998/Math/MathML';
my $SVG_NS   = 'http://www.w3.org/2000/svg';
## Namespaces of the prefixed attributes recognized on foreign elements.
my $XLINK_NS = 'http://www.w3.org/1999/xlink';
my $XML_NS   = 'http://www.w3.org/XML/1998/namespace';
my $XMLNS_NS = 'http://www.w3.org/2000/xmlns/';
19
## Element category flags.  Each constant occupies one distinct bit so
## that categories can be combined with |||, as the masks and the
## |$el_category| table in this file do.
sub A_EL                    () { 1 << 0 }
sub ADDRESS_EL              () { 1 << 1 }
sub BODY_EL                 () { 1 << 2 }
sub BUTTON_EL               () { 1 << 3 }
sub CAPTION_EL              () { 1 << 4 }
sub DD_EL                   () { 1 << 5 }
sub DIV_EL                  () { 1 << 6 }
sub DT_EL                   () { 1 << 7 }
sub FORM_EL                 () { 1 << 8 }
sub FORMATTING_EL           () { 1 << 9 }
sub FRAMESET_EL             () { 1 << 10 }
sub HEADING_EL              () { 1 << 11 }
sub HTML_EL                 () { 1 << 12 }
sub LI_EL                   () { 1 << 13 }
sub NOBR_EL                 () { 1 << 14 }
sub OPTION_EL               () { 1 << 15 }
sub OPTGROUP_EL             () { 1 << 16 }
sub P_EL                    () { 1 << 17 }
sub SELECT_EL               () { 1 << 18 }
sub TABLE_EL                () { 1 << 19 }
sub TABLE_CELL_EL           () { 1 << 20 }
sub TABLE_ROW_EL            () { 1 << 21 }
sub TABLE_ROW_GROUP_EL      () { 1 << 22 }
sub MISC_SCOPING_EL         () { 1 << 23 }
sub MISC_SPECIAL_EL         () { 1 << 24 }
sub FOREIGN_EL              () { 1 << 25 }
sub FOREIGN_FLOW_CONTENT_EL () { 1 << 26 }
sub MML_AXML_EL             () { 1 << 27 }
sub RUBY_EL                 () { 1 << 28 }
sub RUBY_COMPONENT_EL       () { 1 << 29 }
50
## Mask covering the table, table row and table row group categories.
sub TABLE_ROWS_EL () { TABLE_ROW_GROUP_EL | TABLE_ROW_EL | TABLE_EL }
56
## NOTE: This mask is used by the "generate implied end tags" algorithm.
## NOTE: One place in the "generate implied end tags" implementation uses
## a modified version of END_TAG_OPTIONAL_EL instead (search for the
## function mae).
sub END_TAG_OPTIONAL_EL () {
  RUBY_COMPONENT_EL
    | P_EL
    | LI_EL
    | DT_EL
    | DD_EL
}
68
## NOTE: This mask is used by the </body> and EOF algorithms.
sub ALL_END_TAG_OPTIONAL_EL () {
  ## The |dd|, |dt|, |li| and |p| categories ...
  DD_EL | DT_EL | LI_EL | P_EL
    ## ... plus the document skeleton and table structure categories.
    | BODY_EL | HTML_EL
    | TABLE_CELL_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL
}
82
## Mask of the categories treated as scoping elements.
sub SCOPING_EL () {
  MISC_SCOPING_EL
    | TABLE_CELL_EL
    | TABLE_EL
    | HTML_EL
    | CAPTION_EL
    | BUTTON_EL
}
91
## Mask combining the |html| and table categories.
sub TABLE_SCOPING_EL () { TABLE_EL | HTML_EL }
96
## Mask combining the |html| and table row group categories.
sub TABLE_ROWS_SCOPING_EL () { TABLE_ROW_GROUP_EL | HTML_EL }
101
## Mask combining the |html| and table row categories.
sub TABLE_ROW_SCOPING_EL () { TABLE_ROW_EL | HTML_EL }
106
## Mask of the categories treated as special elements.
sub SPECIAL_EL () {
  ## Block containers
  ADDRESS_EL | BODY_EL | DIV_EL
    ## Elements whose end tag may be implied
    | DD_EL | DT_EL | LI_EL | P_EL
    ## Forms, frames, headings and table structure
    | FORM_EL | FRAMESET_EL | HEADING_EL
    | OPTION_EL | OPTGROUP_EL | SELECT_EL
    | TABLE_ROW_EL | TABLE_ROW_GROUP_EL
    ## Everything listed as MISC_SPECIAL_EL in the category table
    | MISC_SPECIAL_EL
}
127
## Maps an HTML element local name to its category flag(s).  Elements that
## share a category are listed together; the resulting hash has exactly
## one entry per element name.
my $el_category = {
  ## Elements with a dedicated category bit
  address  => ADDRESS_EL,
  body     => BODY_EL,
  button   => BUTTON_EL,
  caption  => CAPTION_EL,
  dd       => DD_EL,
  div      => DIV_EL,
  dt       => DT_EL,
  form     => FORM_EL,
  frameset => FRAMESET_EL,
  html     => HTML_EL,
  li       => LI_EL,
  optgroup => OPTGROUP_EL,
  option   => OPTION_EL,
  p        => P_EL,
  ruby     => RUBY_EL,
  select   => SELECT_EL,
  table    => TABLE_EL,
  tr       => TABLE_ROW_EL,

  ## Formatting elements; |a| and |nobr| carry an additional bit of
  ## their own.
  a    => A_EL | FORMATTING_EL,
  nobr => NOBR_EL | FORMATTING_EL,
  (map { ($_ => FORMATTING_EL) }
       qw(b big em font i s small strike strong tt u)),

  ## Headings
  (map { ($_ => HEADING_EL) } qw(h1 h2 h3 h4 h5 h6)),

  ## Ruby components
  (map { ($_ => RUBY_COMPONENT_EL) } qw(rp rt)),

  ## Table cells and row groups
  (map { ($_ => TABLE_CELL_EL) } qw(td th)),
  (map { ($_ => TABLE_ROW_GROUP_EL) } qw(tbody tfoot thead)),

  ## Scoping elements without a dedicated bit
  (map { ($_ => MISC_SCOPING_EL) } qw(applet marquee object)),

  ## Special elements without a dedicated bit
  (map { ($_ => MISC_SPECIAL_EL) } qw(
    area base basefont bgsound blockquote br center col colgroup dir dl
    embed fieldset frame head hr iframe img input isindex link listing
    menu meta noembed noframes noscript ol param plaintext pre script
    spacer style textarea title ul wbr
  )),
};
215
## Category flags of foreign (MathML and SVG) elements, keyed first by
## namespace URI and then by local name.
## NOTE: In addition, FOREIGN_EL is set to non-HTML elements.
my $el_category_f = {
  $MML_NS => {
    'annotation-xml' => MML_AXML_EL,
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(mi mo mn ms mtext)),
  },
  $SVG_NS => {
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(foreignObject desc title)),
  },
};
232
## Maps an all-lowercase SVG attribute name to its mixed-case spelling.
## Every key is simply the lowercased form of its value, so the table is
## generated from the list of mixed-case names.
my $svg_attr_name = {
  map { (lc ($_) => $_) } qw(
    attributeName attributeType
    baseFrequency baseProfile
    calcMode clipPathUnits contentScriptType contentStyleType
    diffuseConstant
    edgeMode externalResourcesRequired
    filterRes filterUnits
    glyphRef gradientTransform gradientUnits
    kernelMatrix kernelUnitLength keyPoints keySplines keyTimes
    lengthAdjust limitingConeAngle
    markerHeight markerUnits markerWidth maskContentUnits maskUnits
    numOctaves
    pathLength patternContentUnits patternTransform patternUnits
    pointsAtX pointsAtY pointsAtZ
    preserveAlpha preserveAspectRatio primitiveUnits
    refX refY repeatCount repeatDur requiredExtensions requiredFeatures
    specularConstant specularExponent spreadMethod startOffset
    stdDeviation stitchTiles surfaceScale systemLanguage
    tableValues targetX targetY textLength
    viewBox viewTarget
    xChannelSelector
    yChannelSelector
    zoomAndPan
  )
};
297
## Maps the qualified name of a namespaced attribute on a foreign element
## to |[namespace URI, [prefix, local name]]|.
my $foreign_attr_xname = {
  (map { ("xlink:$_" => [$XLINK_NS, ['xlink', $_]]) }
       qw(actuate arcrole href role show title type)),
  (map { ("xml:$_" => [$XML_NS, ['xml', $_]]) }
       qw(base lang space)),
  ## The namespace declarations themselves: default (no prefix) and xlink.
  'xmlns' => [$XMLNS_NS, [undef, 'xmlns']],
  'xmlns:xlink' => [$XMLNS_NS, ['xmlns', 'xlink']],
};
312
313 ## ISSUE: xmlns:xlink="non-xlink-ns" is not an error.
314
## Replacement code points for the code points 0x80..0x9F, keyed by the
## original code point.
my $c1_entity_char = do {
  ## One replacement per code point, in order from 0x80 to 0x9F.
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 80-87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 88-8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 90-97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 98-9F
  );
  my %char;
  @char{0x80 .. 0x9F} = @replacement;
  \%char;
}; # $c1_entity_char
349
## Parse a string of bytes.
##
## Arguments (after the invocant): the charset name (or undef), the byte
## string or a reference to it, and any further arguments, which are
## forwarded unchanged to |parse_byte_stream| together with an in-memory
## read handle on the string.
## Returns whatever |parse_byte_stream| returns.
## Dies if the in-memory handle cannot be opened, instead of silently
## parsing an unreadable handle as an empty document.
sub parse_byte_string ($$$$;$) {
  my $self = shift;
  my $charset_name = shift;
  ## Accept both a plain string and a reference to one.
  my $source = ref $_[0] ? $_[0] : \($_[0]);
  open (my $input, '<', $source)
      or die "parse_byte_string: can't open the byte string for reading: $!";
  return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]);
} # parse_byte_string
356
## Parse a stream of bytes.  An encoding is chosen (an explicitly given
## charset name, a byte order mark, detection by UniversalCharDet, or the
## |windows-1252| default), a decoding handle is obtained for it, and the
## decoded characters are handed to |parse_char_stream|.
##
## The invocant may be an object or a class name; a class name is turned
## into a fresh object via |new|.  Arguments after the invocant:
##   - the charset name, or undef to sniff the encoding;
##   - the byte stream (|getc| is called on it);
##   - further arguments, forwarded unchanged to |parse_char_stream|; the
##     second of them, if true, also serves as the error handler here.
## Returns whatever |parse_char_stream| returns.
357 sub parse_byte_stream ($$$$;$) {
358 my $self = ref $_[0] ? shift : shift->new;
359 my $charset_name = shift;
360 my $byte_stream = $_[0];
361
## Default error handler: just warn with the error type.
362 my $onerror = $_[2] || sub {
363 my (%opt) = @_;
364 warn "Parse error ($opt{type})\n";
365 };
366 $self->{parse_error} = $onerror; # replaced later by parse_char_stream
367
368 ## HTML5 encoding sniffing algorithm
369 require Message::Charset::Info;
370 my $charset;
371 my $buffer;
372 my ($char_stream, $e_status);
373
## Every successful step leaves the block with |last SNIFFING|; falling
## off the end means the default encoding of step 7 was used.
374 SNIFFING: {
375
376 ## Step 1
## An explicitly given charset name is used, and the result is marked as
## confident, if a decoding handle can be obtained for it.
377 if (defined $charset_name) {
378 $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
379
380 ## ISSUE: Unsupported encoding is not ignored according to the spec.
381 ($char_stream, $e_status) = $charset->get_decode_handle
382 ($byte_stream, allow_error_reporting => 1,
383 allow_fallback => 1);
384 if ($char_stream) {
385 $self->{confident} = 1;
386 last SNIFFING;
387 } else {
388 ## TODO: unsupported error
389 }
390 }
391
392 ## Step 2
## Read at most 1024 bytes ahead; they are examined by the steps below
## and passed on to the decoding handle as |byte_buffer|.
393 my $byte_buffer = '';
394 for (1..1024) {
395 my $char = $byte_stream->getc;
396 last unless defined $char;
397 $byte_buffer .= $char;
398 } ## TODO: timeout
399
400 ## Step 3
## A byte order mark at the start of the buffer selects UTF-16BE,
## UTF-16LE or UTF-8 and marks the result as confident.
401 if ($byte_buffer =~ /^\xFE\xFF/) {
402 $charset = Message::Charset::Info->get_by_iana_name ('utf-16be');
403 ($char_stream, $e_status) = $charset->get_decode_handle
404 ($byte_stream, allow_error_reporting => 1,
405 allow_fallback => 1, byte_buffer => \$byte_buffer);
406 $self->{confident} = 1;
407 last SNIFFING;
408 } elsif ($byte_buffer =~ /^\xFF\xFE/) {
409 $charset = Message::Charset::Info->get_by_iana_name ('utf-16le');
410 ($char_stream, $e_status) = $charset->get_decode_handle
411 ($byte_stream, allow_error_reporting => 1,
412 allow_fallback => 1, byte_buffer => \$byte_buffer);
413 $self->{confident} = 1;
414 last SNIFFING;
415 } elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) {
416 $charset = Message::Charset::Info->get_by_iana_name ('utf-8');
417 ($char_stream, $e_status) = $charset->get_decode_handle
418 ($byte_stream, allow_error_reporting => 1,
419 allow_fallback => 1, byte_buffer => \$byte_buffer);
420 $self->{confident} = 1;
421 last SNIFFING;
422 }
423
424 ## Step 4
425 ## TODO: <meta charset>
426
427 ## Step 5
428 ## TODO: from history
429
430 ## Step 6
## Guess the encoding from the buffered bytes.  A guess is reported as
## 'sniffing:chardet' and is not marked as confident.
431 require Whatpm::Charset::UniversalCharDet;
432 $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string
433 ($byte_buffer);
434 if (defined $charset_name) {
435 $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
436
437 ## ISSUE: Unsupported encoding is not ignored according to the spec.
438 require Whatpm::Charset::DecodeHandle;
439 $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
440 ($byte_stream);
441 ($char_stream, $e_status) = $charset->get_decode_handle
442 ($buffer, allow_error_reporting => 1,
443 allow_fallback => 1, byte_buffer => \$byte_buffer);
444 if ($char_stream) {
## Keep the bytes read so far in the ByteBuffer object; the
## |change_encoding| callback below passes them to the new decoder.
445 $buffer->{buffer} = $byte_buffer;
446 !!!parse-error (type => 'sniffing:chardet',
447 text => $charset_name,
448 level => $self->{level}->{info},
449 layer => 'encode',
450 line => 1, column => 1);
451 $self->{confident} = 0;
452 last SNIFFING;
453 }
454 }
455
456 ## Step 7: default
457 ## TODO: Make this configurable.
458 $charset = Message::Charset::Info->get_by_iana_name ('windows-1252');
459 ## NOTE: We choose |windows-1252| here, since |utf-8| should be
460 ## detectable in the step 6.
461 require Whatpm::Charset::DecodeHandle;
462 $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
463 ($byte_stream);
464 ($char_stream, $e_status)
465 = $charset->get_decode_handle ($buffer,
466 allow_error_reporting => 1,
467 allow_fallback => 1,
468 byte_buffer => \$byte_buffer);
469 $buffer->{buffer} = $byte_buffer;
470 !!!parse-error (type => 'sniffing:default',
471 text => 'windows-1252',
472 level => $self->{level}->{info},
473 line => 1, column => 1,
474 layer => 'encode');
475 $self->{confident} = 0;
476 } # SNIFFING
477
## Record the chosen encoding, then report when the decoder is a fallback
## implementation or lacks the error-reporting flag in its status.
478 $self->{input_encoding} = $charset->get_iana_name;
479 if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
480 !!!parse-error (type => 'chardecode:fallback',
481 text => $self->{input_encoding},
482 level => $self->{level}->{uncertain},
483 line => 1, column => 1,
484 layer => 'encode');
485 } elsif (not ($e_status &
486 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
487 !!!parse-error (type => 'chardecode:no error',
488 text => $self->{input_encoding},
489 level => $self->{level}->{uncertain},
490 line => 1, column => 1,
491 layer => 'encode');
492 }
493
## Callback taking (parser, new charset name, token).  It overwrites the
## enclosing |$charset_name|, |$charset|, |$char_stream| and |$e_status|,
## which the restart handler at the end of this method then picks up.
494 $self->{change_encoding} = sub {
495 my $self = shift;
496 $charset_name = shift;
497 my $token = shift;
498
499 $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
500 ($char_stream, $e_status) = $charset->get_decode_handle
501 ($byte_stream, allow_error_reporting => 1, allow_fallback => 1,
502 byte_buffer => \ $buffer->{buffer});
503
504 if ($char_stream) { # if supported
505 ## "Change the encoding" algorithm:
506
507 ## Step 1
## A charset in the UTF-16 category is replaced by UTF-8.
508 if ($charset->{category} &
509 Message::Charset::Info::CHARSET_CATEGORY_UTF16 ()) {
510 $charset = Message::Charset::Info->get_by_iana_name ('utf-8');
511 ($char_stream, $e_status) = $charset->get_decode_handle
512 ($byte_stream,
513 byte_buffer => \ $buffer->{buffer});
514 }
515 $charset_name = $charset->get_iana_name;
516
517 ## Step 2
## Nothing to change if the new encoding is the one already in use.
518 if (defined $self->{input_encoding} and
519 $self->{input_encoding} eq $charset_name) {
520 !!!parse-error (type => 'charset label:matching',
521 text => $charset_name,
522 level => $self->{level}->{info});
523 $self->{confident} = 1;
524 return;
525 }
526
527 !!!parse-error (type => 'charset label detected',
528 text => $self->{input_encoding},
529 value => $charset_name,
530 level => $self->{level}->{warn},
531 token => $token);
532
533 ## Step 3
534 # if (can) {
535 ## change the encoding on the fly.
536 #$self->{confident} = 1;
537 #return;
538 # }
539
540 ## Step 4
## Abort the current parse; the |catch| block below parses again using
## the new decoding handle.
541 throw Whatpm::HTML::RestartParser ();
542 }
543 }; # $self->{change_encoding}
544
## Error callback installed on the decoding handle: report the error one
## column after the current position and, if a reference to the offending
## octets is given, replace them by U+FFFD.
545 my $char_onerror = sub {
546 my (undef, $type, %opt) = @_;
547 !!!parse-error (layer => 'encode',
548 %opt, type => $type,
549 line => $self->{line}, column => $self->{column} + 1);
550 if ($opt{octets}) {
551 ${$opt{octets}} = "\x{FFFD}"; # replacement character
552 }
553 };
554 $char_stream->onerror ($char_onerror);
555
## Forward everything after the byte stream to |parse_char_stream|.
556 my @args = @_; shift @args; # $s
557 my $return;
558 try {
559 $return = $self->parse_char_stream ($char_stream, @args);
560 } catch Whatpm::HTML::RestartParser with {
561 ## NOTE: Invoked after {change_encoding}.
562
## Same bookkeeping as after the sniffing block, now for the encoding
## chosen by |change_encoding|, followed by a second parse that is marked
## as confident.
563 $self->{input_encoding} = $charset->get_iana_name;
564 if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
565 !!!parse-error (type => 'chardecode:fallback',
566 text => $self->{input_encoding},
567 level => $self->{level}->{uncertain},
568 line => 1, column => 1,
569 layer => 'encode');
570 } elsif (not ($e_status &
571 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
572 !!!parse-error (type => 'chardecode:no error',
573 text => $self->{input_encoding},
574 level => $self->{level}->{uncertain},
575 line => 1, column => 1,
576 layer => 'encode');
577 }
578 $self->{confident} = 1;
579 $char_stream->onerror ($char_onerror);
580 $return = $self->parse_char_stream ($char_stream, @args);
581 };
582 return $return;
583 } # parse_byte_stream
584
585 ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
586 ## and the HTML layer MUST ignore it. However, we do strip BOM in
587 ## the encoding layer and the HTML layer does not ignore any U+FEFF,
588 ## because the core part of our HTML parser expects a string of character,
589 ## not a string of bytes or code units or anything which might contain a BOM.
590 ## Therefore, any parser interface that accepts a string of bytes,
591 ## such as |parse_byte_string| in this module, must ensure that it does
592 ## strip the BOM and never strip any ZWNBSP.
593
## Parse a string of characters.
##
## Arguments (after the invocant): the character string or a reference to
## it, and any further arguments, which are forwarded unchanged to
## |parse_char_stream| together with an in-memory read handle on the
## string.
## Returns whatever |parse_char_stream| returns.
## Dies if the in-memory handle cannot be opened.
##
## NOTE: An in-memory handle is a handle on bytes.  Opening one directly
## on a scalar that has the UTF8 flag set depends on the scalar's
## internal representation, and perl refuses to open a scalar containing
## code points over 0xFF at all.  A flagged string is therefore copied
## and encoded into UTF-8 octets explicitly, and the |:utf8| layer turns
## those octets back into the original characters.  An unflagged string
## is read as it is, one character per byte, without a copy.
sub parse_char_string ($$$;$) {
  my $self = shift;
  require utf8;
  my $s = ref $_[0] ? $_[0] : \($_[0]);
  my $layer = '';
  if (utf8::is_utf8 ($$s)) {
    ## Work on a copy so that the caller's string is left untouched.
    my $octets = $$s;
    utf8::encode ($octets);
    $s = \$octets;
    $layer = ':utf8';
  }
  open (my $input, '<' . $layer, $s)
      or die "parse_char_string: can't open the string for reading: $!";
  return $self->parse_char_stream ($input, @_[1..$#_]);
} # parse_char_string
*parse_string = \&parse_char_string;
602
## Parse a stream of characters into a document.
##
## The invocant may be an object or a class name; a class name is turned
## into a fresh object via |new|.  Arguments after the invocant:
##   - the character stream (|getc| is called on it);
##   - the document object to fill; its existing child nodes are removed;
##   - optionally an error handler, called with a list of name/value
##     pairs that starts with the current |line| and |column|.
## Returns the document object.
603 sub parse_char_stream ($$$;$) {
604 my $self = ref $_[0] ? shift : shift->new;
605 my $input = $_[0];
606 $self->{document} = $_[1];
607 @{$self->{document}->child_nodes} = ();
608
609 ## NOTE: |set_inner_html| copies most of this method's code
610
## |confident| is left alone if a caller (e.g. |parse_byte_stream|) has
## already set it.
611 $self->{confident} = 1 unless exists $self->{confident};
612 $self->{document}->input_encoding ($self->{input_encoding})
613 if defined $self->{input_encoding};
614
## NOTE(review): |$i| is not used anywhere in the visible part of this
## method.
615 my $i = 0;
616 $self->{line_prev} = $self->{line} = 1;
617 $self->{column_prev} = $self->{column} = 0;
## Reader callback: advances |next_char| to the code point of the next
## input character (-1 at the end of the input) and keeps the previous
## three values of |next_char| in |prev_char|, most recent first.
618 $self->{set_next_char} = sub {
619 my $self = shift;
620
621 pop @{$self->{prev_char}};
622 unshift @{$self->{prev_char}}, $self->{next_char};
623
## Use the character read ahead while handling a CR, if there is one.
624 my $char;
625 if (defined $self->{next_next_char}) {
626 $char = $self->{next_next_char};
627 delete $self->{next_next_char};
628 } else {
629 $char = $input->getc;
630 }
631 $self->{next_char} = -1 and return unless defined $char;
632 $self->{next_char} = ord $char;
633
634 ($self->{line_prev}, $self->{column_prev})
635 = ($self->{line}, $self->{column});
636 $self->{column}++;
637
638 if ($self->{next_char} == 0x000A) { # LF
639 !!!cp ('j1');
640 $self->{line}++;
641 $self->{column} = 0;
642 } elsif ($self->{next_char} == 0x000D) { # CR
643 !!!cp ('j2');
## CR and CR LF both become a single LF: an LF directly after the CR is
## dropped, any other character is kept for the next call.
644 my $next = $input->getc;
645 if (defined $next and $next ne "\x0A") {
646 $self->{next_next_char} = $next;
647 }
648 $self->{next_char} = 0x000A; # LF # MUST
649 $self->{line}++;
650 $self->{column} = 0;
651 } elsif ($self->{next_char} > 0x10FFFF) {
652 !!!cp ('j3');
653 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
654 } elsif ($self->{next_char} == 0x0000) { # NULL
655 !!!cp ('j4');
656 !!!parse-error (type => 'NULL');
657 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
## The code points below are reported as 'control char' errors but are
## passed on unchanged.
658 } elsif ($self->{next_char} <= 0x0008 or
659 (0x000E <= $self->{next_char} and $self->{next_char} <= 0x001F) or
660 (0x007F <= $self->{next_char} and $self->{next_char} <= 0x009F) or
661 (0xD800 <= $self->{next_char} and $self->{next_char} <= 0xDFFF) or
662 (0xFDD0 <= $self->{next_char} and $self->{next_char} <= 0xFDDF) or
663 {
664 0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
665 0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
666 0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
667 0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
668 0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
669 0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
670 0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
671 0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
672 0x10FFFE => 1, 0x10FFFF => 1,
673 }->{$self->{next_char}}) {
674 !!!cp ('j5');
675 if ($self->{next_char} < 0x10000) {
676 !!!parse-error (type => 'control char',
677 text => (sprintf 'U+%04X', $self->{next_char}));
678 } else {
679 !!!parse-error (type => 'control char',
680 text => (sprintf 'U-%08X', $self->{next_char}));
681 }
682 }
683 };
## Nothing has been read yet.
684 $self->{prev_char} = [-1, -1, -1];
685 $self->{next_char} = -1;
686
## Default error handler: warn with the error type and the position,
## which is taken from the token if one is given.
687 my $onerror = $_[2] || sub {
688 my (%opt) = @_;
689 my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
690 my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
691 warn "Parse error ($opt{type}) at line $line column $column\n";
692 };
## The current position comes first in the argument list, so a |line| or
## |column| supplied with the error overrides it when the handler reads
## its arguments into a hash.
693 $self->{parse_error} = sub {
694 $onerror->(line => $self->{line}, column => $self->{column}, @_);
695 };
696
697 $self->_initialize_tokenizer;
698 $self->_initialize_tree_constructor;
699 $self->_construct_tree;
700 $self->_terminate_tree_constructor;
701
## The handler above refers to |$self| and is stored in |$self|.
702 delete $self->{parse_error}; # remove loop
703
704 return $self->{document};
705 } # parse_char_stream
706
## Create a parser object.
##
## The object starts out with the error level table and with default
## callbacks:
##   - |set_next_char| sets the object's |next_char| to -1;
##   - |parse_error|, |change_encoding| and |application_cache_selection|
##     do nothing.
## Returns the new object, blessed into the invocant class.
sub new ($) {
  my $class = shift;
  my $self = bless {
    level => {
      must => 'm',
      warn => 'w',
      info => 'i',
      uncertain => 'u',
    },
  }, $class;

  ## The default |set_next_char| refers to the object it is stored in.
  ## It must do so through a weak reference; otherwise the object and the
  ## callback keep each other alive and the object is never freed.
  require Scalar::Util;
  Scalar::Util::weaken (my $weak_self = $self);
  $self->{set_next_char} = sub {
    $weak_self->{next_char} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new
732
## Content model flags: which kinds of markup are recognized in data.
sub CM_ENTITY         () { 1 << 0 } # & markup in data
sub CM_LIMITED_MARKUP () { 1 << 1 } # < markup in data (limited)
sub CM_FULL_MARKUP    () { 1 << 2 } # < markup in data (any)

## Content models, expressed as combinations of the flags above.
sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL     () { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL    () { CM_LIMITED_MARKUP | CM_ENTITY }
sub PCDATA_CONTENT_MODEL    () { CM_FULL_MARKUP | CM_ENTITY }
741
## Tokenizer states; |$self->{state}| holds one of these values.

## Data and tags
sub DATA_STATE                  () { 0 }
sub ENTITY_DATA_STATE           () { 1 }
sub TAG_OPEN_STATE              () { 2 }
sub CLOSE_TAG_OPEN_STATE        () { 3 }
sub TAG_NAME_STATE              () { 4 }

## Attributes
sub BEFORE_ATTRIBUTE_NAME_STATE         () { 5 }
sub ATTRIBUTE_NAME_STATE                () { 6 }
sub AFTER_ATTRIBUTE_NAME_STATE          () { 7 }
sub BEFORE_ATTRIBUTE_VALUE_STATE        () { 8 }
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
sub ATTRIBUTE_VALUE_UNQUOTED_STATE      () { 11 }
sub ENTITY_IN_ATTRIBUTE_VALUE_STATE     () { 12 }

## Markup declarations and comments
sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
sub COMMENT_START_STATE           () { 14 }
sub COMMENT_START_DASH_STATE      () { 15 }
sub COMMENT_STATE                 () { 16 }
sub COMMENT_END_STATE             () { 17 }
sub COMMENT_END_DASH_STATE        () { 18 }
sub BOGUS_COMMENT_STATE           () { 19 }

## DOCTYPE
sub DOCTYPE_STATE                                 () { 20 }
sub BEFORE_DOCTYPE_NAME_STATE                     () { 21 }
sub DOCTYPE_NAME_STATE                            () { 22 }
sub AFTER_DOCTYPE_NAME_STATE                      () { 23 }
sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        () { 24 }
sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         () { 27 }
sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        () { 28 }
sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         () { 31 }
sub BOGUS_DOCTYPE_STATE                           () { 32 }

## Remaining states
sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
sub SELF_CLOSING_START_TAG_STATE       () { 34 }
sub CDATA_BLOCK_STATE                  () { 35 }
778
## Token types; the |type| member of a token holds one of these values.
sub DOCTYPE_TOKEN     () { 1 }
sub COMMENT_TOKEN     () { 2 }
sub START_TAG_TOKEN   () { 3 }
sub END_TAG_TOKEN     () { 4 }
sub END_OF_FILE_TOKEN () { 5 }
sub CHARACTER_TOKEN   () { 6 }
785
## Insertion mode groups.  Each group occupies one bit; the two lowest
## bits distinguish the insertion modes within a group.
sub AFTER_HTML_IMS () { 1 << 2 }
sub HEAD_IMS       () { 1 << 3 }
sub BODY_IMS       () { 1 << 4 }
sub BODY_TABLE_IMS () { 1 << 5 }
sub TABLE_IMS      () { 1 << 6 }
sub ROW_IMS        () { 1 << 7 }
sub BODY_AFTER_IMS () { 1 << 8 }
sub FRAME_IMS      () { 1 << 9 }
sub SELECT_IMS     () { 1 << 10 }

## NOTE: The "in foreign content" insertion mode is special: it is
## combined with the secondary insertion mode.  This parser stores the
## two together, bit-or'ed into a single value.
sub IN_FOREIGN_CONTENT_IM () { 1 << 11 }

## NOTE: There are no constants for the "initial" and "before html"
## insertion modes.

## NOTE: The "after after body" insertion mode.
sub AFTER_HTML_BODY_IM () { BODY_AFTER_IMS | AFTER_HTML_IMS }

## NOTE: The "after after frameset" insertion mode.
sub AFTER_HTML_FRAMESET_IM () { FRAME_IMS | AFTER_HTML_IMS }

## Head
sub IN_HEAD_IM          () { HEAD_IMS | 0 }
sub IN_HEAD_NOSCRIPT_IM () { HEAD_IMS | 1 }
sub AFTER_HEAD_IM       () { HEAD_IMS | 2 }
sub BEFORE_HEAD_IM      () { HEAD_IMS | 3 }

## Body
sub IN_BODY_IM    () { BODY_IMS }
sub IN_CELL_IM    () { BODY_IMS | BODY_TABLE_IMS | 1 }
sub IN_CAPTION_IM () { BODY_IMS | BODY_TABLE_IMS | 2 }

## Table
sub IN_ROW_IM        () { TABLE_IMS | ROW_IMS | 1 }
sub IN_TABLE_BODY_IM () { TABLE_IMS | ROW_IMS | 2 }
sub IN_TABLE_IM      () { TABLE_IMS }

## After body, frameset and select
sub AFTER_BODY_IM         () { BODY_AFTER_IMS }
sub IN_FRAMESET_IM        () { FRAME_IMS | 1 }
sub AFTER_FRAMESET_IM     () { FRAME_IMS | 2 }
sub IN_SELECT_IM          () { SELECT_IMS | 1 }
sub IN_SELECT_IN_TABLE_IM () { SELECT_IMS | 2 }

## This mode belongs to none of the groups above.
sub IN_COLUMN_GROUP_IM () { 2 }
824
825 ## Implementations MUST act as if they used the state machine in the spec.
826
## Put the tokenizer into its initial condition: data state, PCDATA
## content model, no token or attribute under construction, empty
## character and token queues, and the first input character read.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## The tokenizer MUST start in the data state with the PCDATA content
  ## model.
  $self->{state} = DATA_STATE;
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## |current_token| is a start tag, end tag, comment, or DOCTYPE.
  $self->{$_} = undef
      for qw(current_token current_attribute
             last_emitted_start_tag_name last_attribute_value_state);
  delete $self->{self_closing};

  $self->{char} = [];
  ## |$self->{next_char}| is set by reading the first character.
  !!!next-input-character;
  $self->{token} = [];
  ## |$self->{escape}| is not touched here.
} # _initialize_tokenizer
842
843 ## A token has:
844 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
845 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
846 ## ->{name} (DOCTYPE_TOKEN)
847 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
848 ## ->{public_identifier} (DOCTYPE_TOKEN)
849 ## ->{system_identifier} (DOCTYPE_TOKEN)
850 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
851 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
852 ## ->{name}
853 ## ->{value}
854 ## ->{has_reference} == 1 or 0
855 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
856 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
857 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
858 ## while the token is pushed back to the stack.
859
860 ## Emitted token MUST immediately be handled by the tree construction state.
861
862 ## Before each step, UA MAY check to see if either one of the scripts in
863 ## "list of scripts that will execute as soon as possible" or the first
864 ## script in the "list of scripts that will execute asynchronously",
865 ## has completed loading. If one has, then it MUST be executed
866 ## and removed from the list.
867
868 ## NOTE: HTML5 "Writing HTML documents" section, applied to
869 ## documents and not to user agents and conformance checkers,
870 ## contains some requirements that are not detected by the
871 ## parsing algorithm:
872 ## - Some requirements on character encoding declarations. ## TODO
873 ## - "Elements MUST NOT contain content that their content model disallows."
874 ## ... Some are parse error, some are not (will be reported by c.c.).
875 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
876 ## - Text (in elements, attributes, and comments) SHOULD NOT contain
877 ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
878
879 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
880 ## be detected by the HTML5 parsing algorithm:
881 ## - Text,
882
883 sub _get_next_token ($) {
884 my $self = shift;
885
886 if ($self->{self_closing}) {
887 !!!parse-error (type => 'nestc', token => $self->{current_token});
888 ## NOTE: The |self_closing| flag is only set by start tag token.
889 ## In addition, when a start tag token is emitted, it is always set to
890 ## |current_token|.
891 delete $self->{self_closing};
892 }
893
894 if (@{$self->{token}}) {
895 $self->{self_closing} = $self->{token}->[0]->{self_closing};
896 return shift @{$self->{token}};
897 }
898
899 A: {
900 if ($self->{state} == DATA_STATE) {
901 if ($self->{next_char} == 0x0026) { # &
902 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
903 not $self->{escape}) {
904 !!!cp (1);
905 $self->{state} = ENTITY_DATA_STATE;
906 !!!next-input-character;
907 redo A;
908 } else {
909 !!!cp (2);
910 #
911 }
912 } elsif ($self->{next_char} == 0x002D) { # -
913 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
914 unless ($self->{escape}) {
915 if ($self->{prev_char}->[0] == 0x002D and # -
916 $self->{prev_char}->[1] == 0x0021 and # !
917 $self->{prev_char}->[2] == 0x003C) { # <
918 !!!cp (3);
919 $self->{escape} = 1;
920 } else {
921 !!!cp (4);
922 }
923 } else {
924 !!!cp (5);
925 }
926 }
927
928 #
929 } elsif ($self->{next_char} == 0x003C) { # <
930 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
931 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
932 not $self->{escape})) {
933 !!!cp (6);
934 $self->{state} = TAG_OPEN_STATE;
935 !!!next-input-character;
936 redo A;
937 } else {
938 !!!cp (7);
939 #
940 }
941 } elsif ($self->{next_char} == 0x003E) { # >
942 if ($self->{escape} and
943 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
944 if ($self->{prev_char}->[0] == 0x002D and # -
945 $self->{prev_char}->[1] == 0x002D) { # -
946 !!!cp (8);
947 delete $self->{escape};
948 } else {
949 !!!cp (9);
950 }
951 } else {
952 !!!cp (10);
953 }
954
955 #
956 } elsif ($self->{next_char} == -1) {
957 !!!cp (11);
958 !!!emit ({type => END_OF_FILE_TOKEN,
959 line => $self->{line}, column => $self->{column}});
960 last A; ## TODO: ok?
961 } else {
962 !!!cp (12);
963 }
964 # Anything else
965 my $token = {type => CHARACTER_TOKEN,
966 data => chr $self->{next_char},
967 line => $self->{line}, column => $self->{column},
968 };
969 ## Stay in the data state
970 !!!next-input-character;
971
972 !!!emit ($token);
973
974 redo A;
975 } elsif ($self->{state} == ENTITY_DATA_STATE) {
976 ## (cannot happen in CDATA state)
977
978 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
979
980 my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
981
982 $self->{state} = DATA_STATE;
983 # next-input-character is already done
984
985 unless (defined $token) {
986 !!!cp (13);
987 !!!emit ({type => CHARACTER_TOKEN, data => '&',
988 line => $l, column => $c,
989 });
990 } else {
991 !!!cp (14);
992 !!!emit ($token);
993 }
994
995 redo A;
996 } elsif ($self->{state} == TAG_OPEN_STATE) {
997 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
998 if ($self->{next_char} == 0x002F) { # /
999 !!!cp (15);
1000 !!!next-input-character;
1001 $self->{state} = CLOSE_TAG_OPEN_STATE;
1002 redo A;
1003 } else {
1004 !!!cp (16);
1005 ## reconsume
1006 $self->{state} = DATA_STATE;
1007
1008 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1009 line => $self->{line_prev},
1010 column => $self->{column_prev},
1011 });
1012
1013 redo A;
1014 }
1015 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
1016 if ($self->{next_char} == 0x0021) { # !
1017 !!!cp (17);
1018 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
1019 !!!next-input-character;
1020 redo A;
1021 } elsif ($self->{next_char} == 0x002F) { # /
1022 !!!cp (18);
1023 $self->{state} = CLOSE_TAG_OPEN_STATE;
1024 !!!next-input-character;
1025 redo A;
1026 } elsif (0x0041 <= $self->{next_char} and
1027 $self->{next_char} <= 0x005A) { # A..Z
1028 !!!cp (19);
1029 $self->{current_token}
1030 = {type => START_TAG_TOKEN,
1031 tag_name => chr ($self->{next_char} + 0x0020),
1032 line => $self->{line_prev},
1033 column => $self->{column_prev}};
1034 $self->{state} = TAG_NAME_STATE;
1035 !!!next-input-character;
1036 redo A;
1037 } elsif (0x0061 <= $self->{next_char} and
1038 $self->{next_char} <= 0x007A) { # a..z
1039 !!!cp (20);
1040 $self->{current_token} = {type => START_TAG_TOKEN,
1041 tag_name => chr ($self->{next_char}),
1042 line => $self->{line_prev},
1043 column => $self->{column_prev}};
1044 $self->{state} = TAG_NAME_STATE;
1045 !!!next-input-character;
1046 redo A;
1047 } elsif ($self->{next_char} == 0x003E) { # >
1048 !!!cp (21);
1049 !!!parse-error (type => 'empty start tag',
1050 line => $self->{line_prev},
1051 column => $self->{column_prev});
1052 $self->{state} = DATA_STATE;
1053 !!!next-input-character;
1054
1055 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
1056 line => $self->{line_prev},
1057 column => $self->{column_prev},
1058 });
1059
1060 redo A;
1061 } elsif ($self->{next_char} == 0x003F) { # ?
1062 !!!cp (22);
1063 !!!parse-error (type => 'pio',
1064 line => $self->{line_prev},
1065 column => $self->{column_prev});
1066 $self->{state} = BOGUS_COMMENT_STATE;
1067 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1068 line => $self->{line_prev},
1069 column => $self->{column_prev},
1070 };
1071 ## $self->{next_char} is intentionally left as is
1072 redo A;
1073 } else {
1074 !!!cp (23);
1075 !!!parse-error (type => 'bare stago',
1076 line => $self->{line_prev},
1077 column => $self->{column_prev});
1078 $self->{state} = DATA_STATE;
1079 ## reconsume
1080
1081 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1082 line => $self->{line_prev},
1083 column => $self->{column_prev},
1084 });
1085
1086 redo A;
1087 }
1088 } else {
1089 die "$0: $self->{content_model} in tag open";
1090 }
1091 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
1092 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
1093 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1094 if (defined $self->{last_emitted_start_tag_name}) {
1095
1096 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
1097 my @next_char;
1098 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
1099 push @next_char, $self->{next_char};
1100 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
1101 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
1102 if ($self->{next_char} == $c or $self->{next_char} == $C) {
1103 !!!cp (24);
1104 !!!next-input-character;
1105 next TAGNAME;
1106 } else {
1107 !!!cp (25);
1108 $self->{next_char} = shift @next_char; # reconsume
1109 !!!back-next-input-character (@next_char);
1110 $self->{state} = DATA_STATE;
1111
1112 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1113 line => $l, column => $c,
1114 });
1115
1116 redo A;
1117 }
1118 }
1119 push @next_char, $self->{next_char};
1120
1121 unless ($self->{next_char} == 0x0009 or # HT
1122 $self->{next_char} == 0x000A or # LF
1123 $self->{next_char} == 0x000B or # VT
1124 $self->{next_char} == 0x000C or # FF
1125 $self->{next_char} == 0x0020 or # SP
1126 $self->{next_char} == 0x003E or # >
1127 $self->{next_char} == 0x002F or # /
1128 $self->{next_char} == -1) {
1129 !!!cp (26);
1130 $self->{next_char} = shift @next_char; # reconsume
1131 !!!back-next-input-character (@next_char);
1132 $self->{state} = DATA_STATE;
1133 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1134 line => $l, column => $c,
1135 });
1136 redo A;
1137 } else {
1138 !!!cp (27);
1139 $self->{next_char} = shift @next_char;
1140 !!!back-next-input-character (@next_char);
1141 # and consume...
1142 }
1143 } else {
1144 ## No start tag token has ever been emitted
1145 !!!cp (28);
1146 # next-input-character is already done
1147 $self->{state} = DATA_STATE;
1148 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1149 line => $l, column => $c,
1150 });
1151 redo A;
1152 }
1153 }
1154
1155 if (0x0041 <= $self->{next_char} and
1156 $self->{next_char} <= 0x005A) { # A..Z
1157 !!!cp (29);
1158 $self->{current_token}
1159 = {type => END_TAG_TOKEN,
1160 tag_name => chr ($self->{next_char} + 0x0020),
1161 line => $l, column => $c};
1162 $self->{state} = TAG_NAME_STATE;
1163 !!!next-input-character;
1164 redo A;
1165 } elsif (0x0061 <= $self->{next_char} and
1166 $self->{next_char} <= 0x007A) { # a..z
1167 !!!cp (30);
1168 $self->{current_token} = {type => END_TAG_TOKEN,
1169 tag_name => chr ($self->{next_char}),
1170 line => $l, column => $c};
1171 $self->{state} = TAG_NAME_STATE;
1172 !!!next-input-character;
1173 redo A;
1174 } elsif ($self->{next_char} == 0x003E) { # >
1175 !!!cp (31);
1176 !!!parse-error (type => 'empty end tag',
1177 line => $self->{line_prev}, ## "<" in "</>"
1178 column => $self->{column_prev} - 1);
1179 $self->{state} = DATA_STATE;
1180 !!!next-input-character;
1181 redo A;
1182 } elsif ($self->{next_char} == -1) {
1183 !!!cp (32);
1184 !!!parse-error (type => 'bare etago');
1185 $self->{state} = DATA_STATE;
1186 # reconsume
1187
1188 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1189 line => $l, column => $c,
1190 });
1191
1192 redo A;
1193 } else {
1194 !!!cp (33);
1195 !!!parse-error (type => 'bogus end tag');
1196 $self->{state} = BOGUS_COMMENT_STATE;
1197 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1198 line => $self->{line_prev}, # "<" of "</"
1199 column => $self->{column_prev} - 1,
1200 };
1201 ## $self->{next_char} is intentionally left as is
1202 redo A;
1203 }
1204 } elsif ($self->{state} == TAG_NAME_STATE) {
1205 if ($self->{next_char} == 0x0009 or # HT
1206 $self->{next_char} == 0x000A or # LF
1207 $self->{next_char} == 0x000B or # VT
1208 $self->{next_char} == 0x000C or # FF
1209 $self->{next_char} == 0x0020) { # SP
1210 !!!cp (34);
1211 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1212 !!!next-input-character;
1213 redo A;
1214 } elsif ($self->{next_char} == 0x003E) { # >
1215 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1216 !!!cp (35);
1217 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1218 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1219 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1220 #if ($self->{current_token}->{attributes}) {
1221 # ## NOTE: This should never be reached.
1222 # !!! cp (36);
1223 # !!! parse-error (type => 'end tag attribute');
1224 #} else {
1225 !!!cp (37);
1226 #}
1227 } else {
1228 die "$0: $self->{current_token}->{type}: Unknown token type";
1229 }
1230 $self->{state} = DATA_STATE;
1231 !!!next-input-character;
1232
1233 !!!emit ($self->{current_token}); # start tag or end tag
1234
1235 redo A;
1236 } elsif (0x0041 <= $self->{next_char} and
1237 $self->{next_char} <= 0x005A) { # A..Z
1238 !!!cp (38);
1239 $self->{current_token}->{tag_name} .= chr ($self->{next_char} + 0x0020);
1240 # start tag or end tag
1241 ## Stay in this state
1242 !!!next-input-character;
1243 redo A;
1244 } elsif ($self->{next_char} == -1) {
1245 !!!parse-error (type => 'unclosed tag');
1246 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1247 !!!cp (39);
1248 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1249 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1250 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1251 #if ($self->{current_token}->{attributes}) {
1252 # ## NOTE: This state should never be reached.
1253 # !!! cp (40);
1254 # !!! parse-error (type => 'end tag attribute');
1255 #} else {
1256 !!!cp (41);
1257 #}
1258 } else {
1259 die "$0: $self->{current_token}->{type}: Unknown token type";
1260 }
1261 $self->{state} = DATA_STATE;
1262 # reconsume
1263
1264 !!!emit ($self->{current_token}); # start tag or end tag
1265
1266 redo A;
1267 } elsif ($self->{next_char} == 0x002F) { # /
1268 !!!cp (42);
1269 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1270 !!!next-input-character;
1271 redo A;
1272 } else {
1273 !!!cp (44);
1274 $self->{current_token}->{tag_name} .= chr $self->{next_char};
1275 # start tag or end tag
1276 ## Stay in the state
1277 !!!next-input-character;
1278 redo A;
1279 }
1280 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1281 if ($self->{next_char} == 0x0009 or # HT
1282 $self->{next_char} == 0x000A or # LF
1283 $self->{next_char} == 0x000B or # VT
1284 $self->{next_char} == 0x000C or # FF
1285 $self->{next_char} == 0x0020) { # SP
1286 !!!cp (45);
1287 ## Stay in the state
1288 !!!next-input-character;
1289 redo A;
1290 } elsif ($self->{next_char} == 0x003E) { # >
1291 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1292 !!!cp (46);
1293 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1294 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1295 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1296 if ($self->{current_token}->{attributes}) {
1297 !!!cp (47);
1298 !!!parse-error (type => 'end tag attribute');
1299 } else {
1300 !!!cp (48);
1301 }
1302 } else {
1303 die "$0: $self->{current_token}->{type}: Unknown token type";
1304 }
1305 $self->{state} = DATA_STATE;
1306 !!!next-input-character;
1307
1308 !!!emit ($self->{current_token}); # start tag or end tag
1309
1310 redo A;
1311 } elsif (0x0041 <= $self->{next_char} and
1312 $self->{next_char} <= 0x005A) { # A..Z
1313 !!!cp (49);
1314 $self->{current_attribute}
1315 = {name => chr ($self->{next_char} + 0x0020),
1316 value => '',
1317 line => $self->{line}, column => $self->{column}};
1318 $self->{state} = ATTRIBUTE_NAME_STATE;
1319 !!!next-input-character;
1320 redo A;
1321 } elsif ($self->{next_char} == 0x002F) { # /
1322 !!!cp (50);
1323 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1324 !!!next-input-character;
1325 redo A;
1326 } elsif ($self->{next_char} == -1) {
1327 !!!parse-error (type => 'unclosed tag');
1328 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1329 !!!cp (52);
1330 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1331 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1332 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1333 if ($self->{current_token}->{attributes}) {
1334 !!!cp (53);
1335 !!!parse-error (type => 'end tag attribute');
1336 } else {
1337 !!!cp (54);
1338 }
1339 } else {
1340 die "$0: $self->{current_token}->{type}: Unknown token type";
1341 }
1342 $self->{state} = DATA_STATE;
1343 # reconsume
1344
1345 !!!emit ($self->{current_token}); # start tag or end tag
1346
1347 redo A;
1348 } else {
1349 if ({
1350 0x0022 => 1, # "
1351 0x0027 => 1, # '
1352 0x003D => 1, # =
1353 }->{$self->{next_char}}) {
1354 !!!cp (55);
1355 !!!parse-error (type => 'bad attribute name');
1356 } else {
1357 !!!cp (56);
1358 }
1359 $self->{current_attribute}
1360 = {name => chr ($self->{next_char}),
1361 value => '',
1362 line => $self->{line}, column => $self->{column}};
1363 $self->{state} = ATTRIBUTE_NAME_STATE;
1364 !!!next-input-character;
1365 redo A;
1366 }
1367 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1368 my $before_leave = sub {
1369 if (exists $self->{current_token}->{attributes} # start tag or end tag
1370 ->{$self->{current_attribute}->{name}}) { # MUST
1371 !!!cp (57);
1372 !!!parse-error (type => 'duplicate attribute', text => $self->{current_attribute}->{name}, line => $self->{current_attribute}->{line}, column => $self->{current_attribute}->{column});
1373 ## Discard $self->{current_attribute} # MUST
1374 } else {
1375 !!!cp (58);
1376 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
1377 = $self->{current_attribute};
1378 }
1379 }; # $before_leave
1380
1381 if ($self->{next_char} == 0x0009 or # HT
1382 $self->{next_char} == 0x000A or # LF
1383 $self->{next_char} == 0x000B or # VT
1384 $self->{next_char} == 0x000C or # FF
1385 $self->{next_char} == 0x0020) { # SP
1386 !!!cp (59);
1387 $before_leave->();
1388 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1389 !!!next-input-character;
1390 redo A;
1391 } elsif ($self->{next_char} == 0x003D) { # =
1392 !!!cp (60);
1393 $before_leave->();
1394 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1395 !!!next-input-character;
1396 redo A;
1397 } elsif ($self->{next_char} == 0x003E) { # >
1398 $before_leave->();
1399 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1400 !!!cp (61);
1401 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1402 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1403 !!!cp (62);
1404 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1405 if ($self->{current_token}->{attributes}) {
1406 !!!parse-error (type => 'end tag attribute');
1407 }
1408 } else {
1409 die "$0: $self->{current_token}->{type}: Unknown token type";
1410 }
1411 $self->{state} = DATA_STATE;
1412 !!!next-input-character;
1413
1414 !!!emit ($self->{current_token}); # start tag or end tag
1415
1416 redo A;
1417 } elsif (0x0041 <= $self->{next_char} and
1418 $self->{next_char} <= 0x005A) { # A..Z
1419 !!!cp (63);
1420 $self->{current_attribute}->{name} .= chr ($self->{next_char} + 0x0020);
1421 ## Stay in the state
1422 !!!next-input-character;
1423 redo A;
1424 } elsif ($self->{next_char} == 0x002F) { # /
1425 !!!cp (64);
1426 $before_leave->();
1427 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1428 !!!next-input-character;
1429 redo A;
1430 } elsif ($self->{next_char} == -1) {
1431 !!!parse-error (type => 'unclosed tag');
1432 $before_leave->();
1433 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1434 !!!cp (66);
1435 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1436 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1437 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1438 if ($self->{current_token}->{attributes}) {
1439 !!!cp (67);
1440 !!!parse-error (type => 'end tag attribute');
1441 } else {
1442 ## NOTE: This state should never be reached.
1443 !!!cp (68);
1444 }
1445 } else {
1446 die "$0: $self->{current_token}->{type}: Unknown token type";
1447 }
1448 $self->{state} = DATA_STATE;
1449 # reconsume
1450
1451 !!!emit ($self->{current_token}); # start tag or end tag
1452
1453 redo A;
1454 } else {
1455 if ($self->{next_char} == 0x0022 or # "
1456 $self->{next_char} == 0x0027) { # '
1457 !!!cp (69);
1458 !!!parse-error (type => 'bad attribute name');
1459 } else {
1460 !!!cp (70);
1461 }
1462 $self->{current_attribute}->{name} .= chr ($self->{next_char});
1463 ## Stay in the state
1464 !!!next-input-character;
1465 redo A;
1466 }
1467 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1468 if ($self->{next_char} == 0x0009 or # HT
1469 $self->{next_char} == 0x000A or # LF
1470 $self->{next_char} == 0x000B or # VT
1471 $self->{next_char} == 0x000C or # FF
1472 $self->{next_char} == 0x0020) { # SP
1473 !!!cp (71);
1474 ## Stay in the state
1475 !!!next-input-character;
1476 redo A;
1477 } elsif ($self->{next_char} == 0x003D) { # =
1478 !!!cp (72);
1479 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1480 !!!next-input-character;
1481 redo A;
1482 } elsif ($self->{next_char} == 0x003E) { # >
1483 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1484 !!!cp (73);
1485 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1486 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1487 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1488 if ($self->{current_token}->{attributes}) {
1489 !!!cp (74);
1490 !!!parse-error (type => 'end tag attribute');
1491 } else {
1492 ## NOTE: This state should never be reached.
1493 !!!cp (75);
1494 }
1495 } else {
1496 die "$0: $self->{current_token}->{type}: Unknown token type";
1497 }
1498 $self->{state} = DATA_STATE;
1499 !!!next-input-character;
1500
1501 !!!emit ($self->{current_token}); # start tag or end tag
1502
1503 redo A;
1504 } elsif (0x0041 <= $self->{next_char} and
1505 $self->{next_char} <= 0x005A) { # A..Z
1506 !!!cp (76);
1507 $self->{current_attribute}
1508 = {name => chr ($self->{next_char} + 0x0020),
1509 value => '',
1510 line => $self->{line}, column => $self->{column}};
1511 $self->{state} = ATTRIBUTE_NAME_STATE;
1512 !!!next-input-character;
1513 redo A;
1514 } elsif ($self->{next_char} == 0x002F) { # /
1515 !!!cp (77);
1516 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1517 !!!next-input-character;
1518 redo A;
1519 } elsif ($self->{next_char} == -1) {
1520 !!!parse-error (type => 'unclosed tag');
1521 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1522 !!!cp (79);
1523 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1524 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1525 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1526 if ($self->{current_token}->{attributes}) {
1527 !!!cp (80);
1528 !!!parse-error (type => 'end tag attribute');
1529 } else {
1530 ## NOTE: This state should never be reached.
1531 !!!cp (81);
1532 }
1533 } else {
1534 die "$0: $self->{current_token}->{type}: Unknown token type";
1535 }
1536 $self->{state} = DATA_STATE;
1537 # reconsume
1538
1539 !!!emit ($self->{current_token}); # start tag or end tag
1540
1541 redo A;
1542 } else {
1543 !!!cp (82);
1544 $self->{current_attribute}
1545 = {name => chr ($self->{next_char}),
1546 value => '',
1547 line => $self->{line}, column => $self->{column}};
1548 $self->{state} = ATTRIBUTE_NAME_STATE;
1549 !!!next-input-character;
1550 redo A;
1551 }
1552 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1553 if ($self->{next_char} == 0x0009 or # HT
1554 $self->{next_char} == 0x000A or # LF
1555 $self->{next_char} == 0x000B or # VT
1556 $self->{next_char} == 0x000C or # FF
1557 $self->{next_char} == 0x0020) { # SP
1558 !!!cp (83);
1559 ## Stay in the state
1560 !!!next-input-character;
1561 redo A;
1562 } elsif ($self->{next_char} == 0x0022) { # "
1563 !!!cp (84);
1564 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1565 !!!next-input-character;
1566 redo A;
1567 } elsif ($self->{next_char} == 0x0026) { # &
1568 !!!cp (85);
1569 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1570 ## reconsume
1571 redo A;
1572 } elsif ($self->{next_char} == 0x0027) { # '
1573 !!!cp (86);
1574 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1575 !!!next-input-character;
1576 redo A;
1577 } elsif ($self->{next_char} == 0x003E) { # >
1578 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1579 !!!cp (87);
1580 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1581 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1582 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1583 if ($self->{current_token}->{attributes}) {
1584 !!!cp (88);
1585 !!!parse-error (type => 'end tag attribute');
1586 } else {
1587 ## NOTE: This state should never be reached.
1588 !!!cp (89);
1589 }
1590 } else {
1591 die "$0: $self->{current_token}->{type}: Unknown token type";
1592 }
1593 $self->{state} = DATA_STATE;
1594 !!!next-input-character;
1595
1596 !!!emit ($self->{current_token}); # start tag or end tag
1597
1598 redo A;
1599 } elsif ($self->{next_char} == -1) {
1600 !!!parse-error (type => 'unclosed tag');
1601 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1602 !!!cp (90);
1603 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1604 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1605 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1606 if ($self->{current_token}->{attributes}) {
1607 !!!cp (91);
1608 !!!parse-error (type => 'end tag attribute');
1609 } else {
1610 ## NOTE: This state should never be reached.
1611 !!!cp (92);
1612 }
1613 } else {
1614 die "$0: $self->{current_token}->{type}: Unknown token type";
1615 }
1616 $self->{state} = DATA_STATE;
1617 ## reconsume
1618
1619 !!!emit ($self->{current_token}); # start tag or end tag
1620
1621 redo A;
1622 } else {
1623 if ($self->{next_char} == 0x003D) { # =
1624 !!!cp (93);
1625 !!!parse-error (type => 'bad attribute value');
1626 } else {
1627 !!!cp (94);
1628 }
1629 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1630 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1631 !!!next-input-character;
1632 redo A;
1633 }
1634 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1635 if ($self->{next_char} == 0x0022) { # "
1636 !!!cp (95);
1637 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1638 !!!next-input-character;
1639 redo A;
1640 } elsif ($self->{next_char} == 0x0026) { # &
1641 !!!cp (96);
1642 $self->{last_attribute_value_state} = $self->{state};
1643 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1644 !!!next-input-character;
1645 redo A;
1646 } elsif ($self->{next_char} == -1) {
1647 !!!parse-error (type => 'unclosed attribute value');
1648 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1649 !!!cp (97);
1650 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1651 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1652 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1653 if ($self->{current_token}->{attributes}) {
1654 !!!cp (98);
1655 !!!parse-error (type => 'end tag attribute');
1656 } else {
1657 ## NOTE: This state should never be reached.
1658 !!!cp (99);
1659 }
1660 } else {
1661 die "$0: $self->{current_token}->{type}: Unknown token type";
1662 }
1663 $self->{state} = DATA_STATE;
1664 ## reconsume
1665
1666 !!!emit ($self->{current_token}); # start tag or end tag
1667
1668 redo A;
1669 } else {
1670 !!!cp (100);
1671 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1672 ## Stay in the state
1673 !!!next-input-character;
1674 redo A;
1675 }
1676 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1677 if ($self->{next_char} == 0x0027) { # '
1678 !!!cp (101);
1679 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1680 !!!next-input-character;
1681 redo A;
1682 } elsif ($self->{next_char} == 0x0026) { # &
1683 !!!cp (102);
1684 $self->{last_attribute_value_state} = $self->{state};
1685 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1686 !!!next-input-character;
1687 redo A;
1688 } elsif ($self->{next_char} == -1) {
1689 !!!parse-error (type => 'unclosed attribute value');
1690 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1691 !!!cp (103);
1692 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1693 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1694 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1695 if ($self->{current_token}->{attributes}) {
1696 !!!cp (104);
1697 !!!parse-error (type => 'end tag attribute');
1698 } else {
1699 ## NOTE: This state should never be reached.
1700 !!!cp (105);
1701 }
1702 } else {
1703 die "$0: $self->{current_token}->{type}: Unknown token type";
1704 }
1705 $self->{state} = DATA_STATE;
1706 ## reconsume
1707
1708 !!!emit ($self->{current_token}); # start tag or end tag
1709
1710 redo A;
1711 } else {
1712 !!!cp (106);
1713 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1714 ## Stay in the state
1715 !!!next-input-character;
1716 redo A;
1717 }
1718 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1719 if ($self->{next_char} == 0x0009 or # HT
1720 $self->{next_char} == 0x000A or # LF
1721 $self->{next_char} == 0x000B or # VT
1722 $self->{next_char} == 0x000C or # FF
1723 $self->{next_char} == 0x0020) { # SP
1724 !!!cp (107);
1725 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1726 !!!next-input-character;
1727 redo A;
1728 } elsif ($self->{next_char} == 0x0026) { # &
1729 !!!cp (108);
1730 $self->{last_attribute_value_state} = $self->{state};
1731 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1732 !!!next-input-character;
1733 redo A;
1734 } elsif ($self->{next_char} == 0x003E) { # >
1735 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1736 !!!cp (109);
1737 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1738 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1739 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1740 if ($self->{current_token}->{attributes}) {
1741 !!!cp (110);
1742 !!!parse-error (type => 'end tag attribute');
1743 } else {
1744 ## NOTE: This state should never be reached.
1745 !!!cp (111);
1746 }
1747 } else {
1748 die "$0: $self->{current_token}->{type}: Unknown token type";
1749 }
1750 $self->{state} = DATA_STATE;
1751 !!!next-input-character;
1752
1753 !!!emit ($self->{current_token}); # start tag or end tag
1754
1755 redo A;
1756 } elsif ($self->{next_char} == -1) {
1757 !!!parse-error (type => 'unclosed tag');
1758 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1759 !!!cp (112);
1760 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1761 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1762 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1763 if ($self->{current_token}->{attributes}) {
1764 !!!cp (113);
1765 !!!parse-error (type => 'end tag attribute');
1766 } else {
1767 ## NOTE: This state should never be reached.
1768 !!!cp (114);
1769 }
1770 } else {
1771 die "$0: $self->{current_token}->{type}: Unknown token type";
1772 }
1773 $self->{state} = DATA_STATE;
1774 ## reconsume
1775
1776 !!!emit ($self->{current_token}); # start tag or end tag
1777
1778 redo A;
1779 } else {
1780 if ({
1781 0x0022 => 1, # "
1782 0x0027 => 1, # '
1783 0x003D => 1, # =
1784 }->{$self->{next_char}}) {
1785 !!!cp (115);
1786 !!!parse-error (type => 'bad attribute value');
1787 } else {
1788 !!!cp (116);
1789 }
1790 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1791 ## Stay in the state
1792 !!!next-input-character;
1793 redo A;
1794 }
1795 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1796 my $token = $self->_tokenize_attempt_to_consume_an_entity
1797 (1,
1798 $self->{last_attribute_value_state}
1799 == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
1800 $self->{last_attribute_value_state}
1801 == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
1802 -1);
1803
1804 unless (defined $token) {
1805 !!!cp (117);
1806 $self->{current_attribute}->{value} .= '&';
1807 } else {
1808 !!!cp (118);
1809 $self->{current_attribute}->{value} .= $token->{data};
1810 $self->{current_attribute}->{has_reference} = $token->{has_reference};
1811 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1812 }
1813
1814 $self->{state} = $self->{last_attribute_value_state};
1815 # next-input-character is already done
1816 redo A;
1817 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1818 if ($self->{next_char} == 0x0009 or # HT
1819 $self->{next_char} == 0x000A or # LF
1820 $self->{next_char} == 0x000B or # VT
1821 $self->{next_char} == 0x000C or # FF
1822 $self->{next_char} == 0x0020) { # SP
1823 !!!cp (118);
1824 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1825 !!!next-input-character;
1826 redo A;
1827 } elsif ($self->{next_char} == 0x003E) { # >
1828 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1829 !!!cp (119);
1830 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1831 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1832 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1833 if ($self->{current_token}->{attributes}) {
1834 !!!cp (120);
1835 !!!parse-error (type => 'end tag attribute');
1836 } else {
1837 ## NOTE: This state should never be reached.
1838 !!!cp (121);
1839 }
1840 } else {
1841 die "$0: $self->{current_token}->{type}: Unknown token type";
1842 }
1843 $self->{state} = DATA_STATE;
1844 !!!next-input-character;
1845
1846 !!!emit ($self->{current_token}); # start tag or end tag
1847
1848 redo A;
1849 } elsif ($self->{next_char} == 0x002F) { # /
1850 !!!cp (122);
1851 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1852 !!!next-input-character;
1853 redo A;
1854 } elsif ($self->{next_char} == -1) {
1855 !!!parse-error (type => 'unclosed tag');
1856 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1857 !!!cp (122.3);
1858 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1859 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1860 if ($self->{current_token}->{attributes}) {
1861 !!!cp (122.1);
1862 !!!parse-error (type => 'end tag attribute');
1863 } else {
1864 ## NOTE: This state should never be reached.
1865 !!!cp (122.2);
1866 }
1867 } else {
1868 die "$0: $self->{current_token}->{type}: Unknown token type";
1869 }
1870 $self->{state} = DATA_STATE;
1871 ## Reconsume.
1872 !!!emit ($self->{current_token}); # start tag or end tag
1873 redo A;
1874 } else {
1875 !!!cp ('124.1');
1876 !!!parse-error (type => 'no space between attributes');
1877 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1878 ## reconsume
1879 redo A;
1880 }
1881 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1882 if ($self->{next_char} == 0x003E) { # >
1883 if ($self->{current_token}->{type} == END_TAG_TOKEN) {
1884 !!!cp ('124.2');
1885 !!!parse-error (type => 'nestc', token => $self->{current_token});
1886 ## TODO: Different type than slash in start tag
1887 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1888 if ($self->{current_token}->{attributes}) {
1889 !!!cp ('124.4');
1890 !!!parse-error (type => 'end tag attribute');
1891 } else {
1892 !!!cp ('124.5');
1893 }
1894 ## TODO: Test |<title></title/>|
1895 } else {
1896 !!!cp ('124.3');
1897 $self->{self_closing} = 1;
1898 }
1899
1900 $self->{state} = DATA_STATE;
1901 !!!next-input-character;
1902
1903 !!!emit ($self->{current_token}); # start tag or end tag
1904
1905 redo A;
1906 } elsif ($self->{next_char} == -1) {
1907 !!!parse-error (type => 'unclosed tag');
1908 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1909 !!!cp (124.7);
1910 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1911 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1912 if ($self->{current_token}->{attributes}) {
1913 !!!cp (124.5);
1914 !!!parse-error (type => 'end tag attribute');
1915 } else {
1916 ## NOTE: This state should never be reached.
1917 !!!cp (124.6);
1918 }
1919 } else {
1920 die "$0: $self->{current_token}->{type}: Unknown token type";
1921 }
1922 $self->{state} = DATA_STATE;
1923 ## Reconsume.
1924 !!!emit ($self->{current_token}); # start tag or end tag
1925 redo A;
1926 } else {
1927 !!!cp ('124.4');
1928 !!!parse-error (type => 'nestc');
1929 ## TODO: This error type is wrong.
1930 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1931 ## Reconsume.
1932 redo A;
1933 }
1934 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1935 ## (only happen if PCDATA state)
1936
1937 ## NOTE: Set by the previous state
1938 #my $token = {type => COMMENT_TOKEN, data => ''};
1939
1940 BC: {
1941 if ($self->{next_char} == 0x003E) { # >
1942 !!!cp (124);
1943 $self->{state} = DATA_STATE;
1944 !!!next-input-character;
1945
1946 !!!emit ($self->{current_token}); # comment
1947
1948 redo A;
1949 } elsif ($self->{next_char} == -1) {
1950 !!!cp (125);
1951 $self->{state} = DATA_STATE;
1952 ## reconsume
1953
1954 !!!emit ($self->{current_token}); # comment
1955
1956 redo A;
1957 } else {
1958 !!!cp (126);
1959 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
1960 !!!next-input-character;
1961 redo BC;
1962 }
1963 } # BC
1964
1965 die "$0: _get_next_token: unexpected case [BC]";
1966 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1967 ## (only happen if PCDATA state)
1968
1969 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1);
1970
1971 my @next_char;
1972 push @next_char, $self->{next_char};
1973
1974 if ($self->{next_char} == 0x002D) { # -
1975 !!!next-input-character;
1976 push @next_char, $self->{next_char};
1977 if ($self->{next_char} == 0x002D) { # -
1978 !!!cp (127);
1979 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1980 line => $l, column => $c,
1981 };
1982 $self->{state} = COMMENT_START_STATE;
1983 !!!next-input-character;
1984 redo A;
1985 } else {
1986 !!!cp (128);
1987 }
1988 } elsif ($self->{next_char} == 0x0044 or # D
1989 $self->{next_char} == 0x0064) { # d
1990 !!!next-input-character;
1991 push @next_char, $self->{next_char};
1992 if ($self->{next_char} == 0x004F or # O
1993 $self->{next_char} == 0x006F) { # o
1994 !!!next-input-character;
1995 push @next_char, $self->{next_char};
1996 if ($self->{next_char} == 0x0043 or # C
1997 $self->{next_char} == 0x0063) { # c
1998 !!!next-input-character;
1999 push @next_char, $self->{next_char};
2000 if ($self->{next_char} == 0x0054 or # T
2001 $self->{next_char} == 0x0074) { # t
2002 !!!next-input-character;
2003 push @next_char, $self->{next_char};
2004 if ($self->{next_char} == 0x0059 or # Y
2005 $self->{next_char} == 0x0079) { # y
2006 !!!next-input-character;
2007 push @next_char, $self->{next_char};
2008 if ($self->{next_char} == 0x0050 or # P
2009 $self->{next_char} == 0x0070) { # p
2010 !!!next-input-character;
2011 push @next_char, $self->{next_char};
2012 if ($self->{next_char} == 0x0045 or # E
2013 $self->{next_char} == 0x0065) { # e
2014 !!!cp (129);
2015 ## TODO: What a stupid code this is!
2016 $self->{state} = DOCTYPE_STATE;
2017 $self->{current_token} = {type => DOCTYPE_TOKEN,
2018 quirks => 1,
2019 line => $l, column => $c,
2020 };
2021 !!!next-input-character;
2022 redo A;
2023 } else {
2024 !!!cp (130);
2025 }
2026 } else {
2027 !!!cp (131);
2028 }
2029 } else {
2030 !!!cp (132);
2031 }
2032 } else {
2033 !!!cp (133);
2034 }
2035 } else {
2036 !!!cp (134);
2037 }
2038 } else {
2039 !!!cp (135);
2040 }
2041 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2042 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
2043 $self->{next_char} == 0x005B) { # [
2044 !!!next-input-character;
2045 push @next_char, $self->{next_char};
2046 if ($self->{next_char} == 0x0043) { # C
2047 !!!next-input-character;
2048 push @next_char, $self->{next_char};
2049 if ($self->{next_char} == 0x0044) { # D
2050 !!!next-input-character;
2051 push @next_char, $self->{next_char};
2052 if ($self->{next_char} == 0x0041) { # A
2053 !!!next-input-character;
2054 push @next_char, $self->{next_char};
2055 if ($self->{next_char} == 0x0054) { # T
2056 !!!next-input-character;
2057 push @next_char, $self->{next_char};
2058 if ($self->{next_char} == 0x0041) { # A
2059 !!!next-input-character;
2060 push @next_char, $self->{next_char};
2061 if ($self->{next_char} == 0x005B) { # [
2062 !!!cp (135.1);
2063 $self->{state} = CDATA_BLOCK_STATE;
2064 !!!next-input-character;
2065 redo A;
2066 } else {
2067 !!!cp (135.2);
2068 }
2069 } else {
2070 !!!cp (135.3);
2071 }
2072 } else {
2073 !!!cp (135.4);
2074 }
2075 } else {
2076 !!!cp (135.5);
2077 }
2078 } else {
2079 !!!cp (135.6);
2080 }
2081 } else {
2082 !!!cp (135.7);
2083 }
2084 } else {
2085 !!!cp (136);
2086 }
2087
2088 !!!parse-error (type => 'bogus comment');
2089 $self->{next_char} = shift @next_char;
2090 !!!back-next-input-character (@next_char);
2091 $self->{state} = BOGUS_COMMENT_STATE;
2092 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
2093 line => $l, column => $c,
2094 };
2095 redo A;
2096
2097 ## ISSUE: typos in spec: chacacters, is is a parse error
2098 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
2099 } elsif ($self->{state} == COMMENT_START_STATE) {
2100 if ($self->{next_char} == 0x002D) { # -
2101 !!!cp (137);
2102 $self->{state} = COMMENT_START_DASH_STATE;
2103 !!!next-input-character;
2104 redo A;
2105 } elsif ($self->{next_char} == 0x003E) { # >
2106 !!!cp (138);
2107 !!!parse-error (type => 'bogus comment');
2108 $self->{state} = DATA_STATE;
2109 !!!next-input-character;
2110
2111 !!!emit ($self->{current_token}); # comment
2112
2113 redo A;
2114 } elsif ($self->{next_char} == -1) {
2115 !!!cp (139);
2116 !!!parse-error (type => 'unclosed comment');
2117 $self->{state} = DATA_STATE;
2118 ## reconsume
2119
2120 !!!emit ($self->{current_token}); # comment
2121
2122 redo A;
2123 } else {
2124 !!!cp (140);
2125 $self->{current_token}->{data} # comment
2126 .= chr ($self->{next_char});
2127 $self->{state} = COMMENT_STATE;
2128 !!!next-input-character;
2129 redo A;
2130 }
2131 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2132 if ($self->{next_char} == 0x002D) { # -
2133 !!!cp (141);
2134 $self->{state} = COMMENT_END_STATE;
2135 !!!next-input-character;
2136 redo A;
2137 } elsif ($self->{next_char} == 0x003E) { # >
2138 !!!cp (142);
2139 !!!parse-error (type => 'bogus comment');
2140 $self->{state} = DATA_STATE;
2141 !!!next-input-character;
2142
2143 !!!emit ($self->{current_token}); # comment
2144
2145 redo A;
2146 } elsif ($self->{next_char} == -1) {
2147 !!!cp (143);
2148 !!!parse-error (type => 'unclosed comment');
2149 $self->{state} = DATA_STATE;
2150 ## reconsume
2151
2152 !!!emit ($self->{current_token}); # comment
2153
2154 redo A;
2155 } else {
2156 !!!cp (144);
2157 $self->{current_token}->{data} # comment
2158 .= '-' . chr ($self->{next_char});
2159 $self->{state} = COMMENT_STATE;
2160 !!!next-input-character;
2161 redo A;
2162 }
2163 } elsif ($self->{state} == COMMENT_STATE) {
2164 if ($self->{next_char} == 0x002D) { # -
2165 !!!cp (145);
2166 $self->{state} = COMMENT_END_DASH_STATE;
2167 !!!next-input-character;
2168 redo A;
2169 } elsif ($self->{next_char} == -1) {
2170 !!!cp (146);
2171 !!!parse-error (type => 'unclosed comment');
2172 $self->{state} = DATA_STATE;
2173 ## reconsume
2174
2175 !!!emit ($self->{current_token}); # comment
2176
2177 redo A;
2178 } else {
2179 !!!cp (147);
2180 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
2181 ## Stay in the state
2182 !!!next-input-character;
2183 redo A;
2184 }
2185 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2186 if ($self->{next_char} == 0x002D) { # -
2187 !!!cp (148);
2188 $self->{state} = COMMENT_END_STATE;
2189 !!!next-input-character;
2190 redo A;
2191 } elsif ($self->{next_char} == -1) {
2192 !!!cp (149);
2193 !!!parse-error (type => 'unclosed comment');
2194 $self->{state} = DATA_STATE;
2195 ## reconsume
2196
2197 !!!emit ($self->{current_token}); # comment
2198
2199 redo A;
2200 } else {
2201 !!!cp (150);
2202 $self->{current_token}->{data} .= '-' . chr ($self->{next_char}); # comment
2203 $self->{state} = COMMENT_STATE;
2204 !!!next-input-character;
2205 redo A;
2206 }
2207 } elsif ($self->{state} == COMMENT_END_STATE) {
2208 if ($self->{next_char} == 0x003E) { # >
2209 !!!cp (151);
2210 $self->{state} = DATA_STATE;
2211 !!!next-input-character;
2212
2213 !!!emit ($self->{current_token}); # comment
2214
2215 redo A;
2216 } elsif ($self->{next_char} == 0x002D) { # -
2217 !!!cp (152);
2218 !!!parse-error (type => 'dash in comment',
2219 line => $self->{line_prev},
2220 column => $self->{column_prev});
2221 $self->{current_token}->{data} .= '-'; # comment
2222 ## Stay in the state
2223 !!!next-input-character;
2224 redo A;
2225 } elsif ($self->{next_char} == -1) {
2226 !!!cp (153);
2227 !!!parse-error (type => 'unclosed comment');
2228 $self->{state} = DATA_STATE;
2229 ## reconsume
2230
2231 !!!emit ($self->{current_token}); # comment
2232
2233 redo A;
2234 } else {
2235 !!!cp (154);
2236 !!!parse-error (type => 'dash in comment',
2237 line => $self->{line_prev},
2238 column => $self->{column_prev});
2239 $self->{current_token}->{data} .= '--' . chr ($self->{next_char}); # comment
2240 $self->{state} = COMMENT_STATE;
2241 !!!next-input-character;
2242 redo A;
2243 }
2244 } elsif ($self->{state} == DOCTYPE_STATE) {
2245 if ($self->{next_char} == 0x0009 or # HT
2246 $self->{next_char} == 0x000A or # LF
2247 $self->{next_char} == 0x000B or # VT
2248 $self->{next_char} == 0x000C or # FF
2249 $self->{next_char} == 0x0020) { # SP
2250 !!!cp (155);
2251 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2252 !!!next-input-character;
2253 redo A;
2254 } else {
2255 !!!cp (156);
2256 !!!parse-error (type => 'no space before DOCTYPE name');
2257 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2258 ## reconsume
2259 redo A;
2260 }
2261 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2262 if ($self->{next_char} == 0x0009 or # HT
2263 $self->{next_char} == 0x000A or # LF
2264 $self->{next_char} == 0x000B or # VT
2265 $self->{next_char} == 0x000C or # FF
2266 $self->{next_char} == 0x0020) { # SP
2267 !!!cp (157);
2268 ## Stay in the state
2269 !!!next-input-character;
2270 redo A;
2271 } elsif ($self->{next_char} == 0x003E) { # >
2272 !!!cp (158);
2273 !!!parse-error (type => 'no DOCTYPE name');
2274 $self->{state} = DATA_STATE;
2275 !!!next-input-character;
2276
2277 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2278
2279 redo A;
2280 } elsif ($self->{next_char} == -1) {
2281 !!!cp (159);
2282 !!!parse-error (type => 'no DOCTYPE name');
2283 $self->{state} = DATA_STATE;
2284 ## reconsume
2285
2286 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2287
2288 redo A;
2289 } else {
2290 !!!cp (160);
2291 $self->{current_token}->{name} = chr $self->{next_char};
2292 delete $self->{current_token}->{quirks};
2293 ## ISSUE: "Set the token's name name to the" in the spec
2294 $self->{state} = DOCTYPE_NAME_STATE;
2295 !!!next-input-character;
2296 redo A;
2297 }
2298 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2299 ## ISSUE: Redundant "First," in the spec.
2300 if ($self->{next_char} == 0x0009 or # HT
2301 $self->{next_char} == 0x000A or # LF
2302 $self->{next_char} == 0x000B or # VT
2303 $self->{next_char} == 0x000C or # FF
2304 $self->{next_char} == 0x0020) { # SP
2305 !!!cp (161);
2306 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2307 !!!next-input-character;
2308 redo A;
2309 } elsif ($self->{next_char} == 0x003E) { # >
2310 !!!cp (162);
2311 $self->{state} = DATA_STATE;
2312 !!!next-input-character;
2313
2314 !!!emit ($self->{current_token}); # DOCTYPE
2315
2316 redo A;
2317 } elsif ($self->{next_char} == -1) {
2318 !!!cp (163);
2319 !!!parse-error (type => 'unclosed DOCTYPE');
2320 $self->{state} = DATA_STATE;
2321 ## reconsume
2322
2323 $self->{current_token}->{quirks} = 1;
2324 !!!emit ($self->{current_token}); # DOCTYPE
2325
2326 redo A;
2327 } else {
2328 !!!cp (164);
2329 $self->{current_token}->{name}
2330 .= chr ($self->{next_char}); # DOCTYPE
2331 ## Stay in the state
2332 !!!next-input-character;
2333 redo A;
2334 }
2335 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2336 if ($self->{next_char} == 0x0009 or # HT
2337 $self->{next_char} == 0x000A or # LF
2338 $self->{next_char} == 0x000B or # VT
2339 $self->{next_char} == 0x000C or # FF
2340 $self->{next_char} == 0x0020) { # SP
2341 !!!cp (165);
2342 ## Stay in the state
2343 !!!next-input-character;
2344 redo A;
2345 } elsif ($self->{next_char} == 0x003E) { # >
2346 !!!cp (166);
2347 $self->{state} = DATA_STATE;
2348 !!!next-input-character;
2349
2350 !!!emit ($self->{current_token}); # DOCTYPE
2351
2352 redo A;
2353 } elsif ($self->{next_char} == -1) {
2354 !!!cp (167);
2355 !!!parse-error (type => 'unclosed DOCTYPE');
2356 $self->{state} = DATA_STATE;
2357 ## reconsume
2358
2359 $self->{current_token}->{quirks} = 1;
2360 !!!emit ($self->{current_token}); # DOCTYPE
2361
2362 redo A;
2363 } elsif ($self->{next_char} == 0x0050 or # P
2364 $self->{next_char} == 0x0070) { # p
2365 !!!next-input-character;
2366 if ($self->{next_char} == 0x0055 or # U
2367 $self->{next_char} == 0x0075) { # u
2368 !!!next-input-character;
2369 if ($self->{next_char} == 0x0042 or # B
2370 $self->{next_char} == 0x0062) { # b
2371 !!!next-input-character;
2372 if ($self->{next_char} == 0x004C or # L
2373 $self->{next_char} == 0x006C) { # l
2374 !!!next-input-character;
2375 if ($self->{next_char} == 0x0049 or # I
2376 $self->{next_char} == 0x0069) { # i
2377 !!!next-input-character;
2378 if ($self->{next_char} == 0x0043 or # C
2379 $self->{next_char} == 0x0063) { # c
2380 !!!cp (168);
2381 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2382 !!!next-input-character;
2383 redo A;
2384 } else {
2385 !!!cp (169);
2386 }
2387 } else {
2388 !!!cp (170);
2389 }
2390 } else {
2391 !!!cp (171);
2392 }
2393 } else {
2394 !!!cp (172);
2395 }
2396 } else {
2397 !!!cp (173);
2398 }
2399
2400 #
2401 } elsif ($self->{next_char} == 0x0053 or # S
2402 $self->{next_char} == 0x0073) { # s
2403 !!!next-input-character;
2404 if ($self->{next_char} == 0x0059 or # Y
2405 $self->{next_char} == 0x0079) { # y
2406 !!!next-input-character;
2407 if ($self->{next_char} == 0x0053 or # S
2408 $self->{next_char} == 0x0073) { # s
2409 !!!next-input-character;
2410 if ($self->{next_char} == 0x0054 or # T
2411 $self->{next_char} == 0x0074) { # t
2412 !!!next-input-character;
2413 if ($self->{next_char} == 0x0045 or # E
2414 $self->{next_char} == 0x0065) { # e
2415 !!!next-input-character;
2416 if ($self->{next_char} == 0x004D or # M
2417 $self->{next_char} == 0x006D) { # m
2418 !!!cp (174);
2419 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2420 !!!next-input-character;
2421 redo A;
2422 } else {
2423 !!!cp (175);
2424 }
2425 } else {
2426 !!!cp (176);
2427 }
2428 } else {
2429 !!!cp (177);
2430 }
2431 } else {
2432 !!!cp (178);
2433 }
2434 } else {
2435 !!!cp (179);
2436 }
2437
2438 #
2439 } else {
2440 !!!cp (180);
2441 !!!next-input-character;
2442 #
2443 }
2444
2445 !!!parse-error (type => 'string after DOCTYPE name');
2446 $self->{current_token}->{quirks} = 1;
2447
2448 $self->{state} = BOGUS_DOCTYPE_STATE;
2449 # next-input-character is already done
2450 redo A;
2451 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2452 if ({
2453 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2454 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2455 }->{$self->{next_char}}) {
2456 !!!cp (181);
2457 ## Stay in the state
2458 !!!next-input-character;
2459 redo A;
2460 } elsif ($self->{next_char} eq 0x0022) { # "
2461 !!!cp (182);
2462 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2463 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2464 !!!next-input-character;
2465 redo A;
2466 } elsif ($self->{next_char} eq 0x0027) { # '
2467 !!!cp (183);
2468 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2469 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2470 !!!next-input-character;
2471 redo A;
2472 } elsif ($self->{next_char} eq 0x003E) { # >
2473 !!!cp (184);
2474 !!!parse-error (type => 'no PUBLIC literal');
2475
2476 $self->{state} = DATA_STATE;
2477 !!!next-input-character;
2478
2479 $self->{current_token}->{quirks} = 1;
2480 !!!emit ($self->{current_token}); # DOCTYPE
2481
2482 redo A;
2483 } elsif ($self->{next_char} == -1) {
2484 !!!cp (185);
2485 !!!parse-error (type => 'unclosed DOCTYPE');
2486
2487 $self->{state} = DATA_STATE;
2488 ## reconsume
2489
2490 $self->{current_token}->{quirks} = 1;
2491 !!!emit ($self->{current_token}); # DOCTYPE
2492
2493 redo A;
2494 } else {
2495 !!!cp (186);
2496 !!!parse-error (type => 'string after PUBLIC');
2497 $self->{current_token}->{quirks} = 1;
2498
2499 $self->{state} = BOGUS_DOCTYPE_STATE;
2500 !!!next-input-character;
2501 redo A;
2502 }
2503 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2504 if ($self->{next_char} == 0x0022) { # "
2505 !!!cp (187);
2506 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2507 !!!next-input-character;
2508 redo A;
2509 } elsif ($self->{next_char} == 0x003E) { # >
2510 !!!cp (188);
2511 !!!parse-error (type => 'unclosed PUBLIC literal');
2512
2513 $self->{state} = DATA_STATE;
2514 !!!next-input-character;
2515
2516 $self->{current_token}->{quirks} = 1;
2517 !!!emit ($self->{current_token}); # DOCTYPE
2518
2519 redo A;
2520 } elsif ($self->{next_char} == -1) {
2521 !!!cp (189);
2522 !!!parse-error (type => 'unclosed PUBLIC literal');
2523
2524 $self->{state} = DATA_STATE;
2525 ## reconsume
2526
2527 $self->{current_token}->{quirks} = 1;
2528 !!!emit ($self->{current_token}); # DOCTYPE
2529
2530 redo A;
2531 } else {
2532 !!!cp (190);
2533 $self->{current_token}->{public_identifier} # DOCTYPE
2534 .= chr $self->{next_char};
2535 ## Stay in the state
2536 !!!next-input-character;
2537 redo A;
2538 }
2539 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2540 if ($self->{next_char} == 0x0027) { # '
2541 !!!cp (191);
2542 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2543 !!!next-input-character;
2544 redo A;
2545 } elsif ($self->{next_char} == 0x003E) { # >
2546 !!!cp (192);
2547 !!!parse-error (type => 'unclosed PUBLIC literal');
2548
2549 $self->{state} = DATA_STATE;
2550 !!!next-input-character;
2551
2552 $self->{current_token}->{quirks} = 1;
2553 !!!emit ($self->{current_token}); # DOCTYPE
2554
2555 redo A;
2556 } elsif ($self->{next_char} == -1) {
2557 !!!cp (193);
2558 !!!parse-error (type => 'unclosed PUBLIC literal');
2559
2560 $self->{state} = DATA_STATE;
2561 ## reconsume
2562
2563 $self->{current_token}->{quirks} = 1;
2564 !!!emit ($self->{current_token}); # DOCTYPE
2565
2566 redo A;
2567 } else {
2568 !!!cp (194);
2569 $self->{current_token}->{public_identifier} # DOCTYPE
2570 .= chr $self->{next_char};
2571 ## Stay in the state
2572 !!!next-input-character;
2573 redo A;
2574 }
2575 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2576 if ({
2577 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2578 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2579 }->{$self->{next_char}}) {
2580 !!!cp (195);
2581 ## Stay in the state
2582 !!!next-input-character;
2583 redo A;
2584 } elsif ($self->{next_char} == 0x0022) { # "
2585 !!!cp (196);
2586 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2587 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2588 !!!next-input-character;
2589 redo A;
2590 } elsif ($self->{next_char} == 0x0027) { # '
2591 !!!cp (197);
2592 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2593 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2594 !!!next-input-character;
2595 redo A;
2596 } elsif ($self->{next_char} == 0x003E) { # >
2597 !!!cp (198);
2598 $self->{state} = DATA_STATE;
2599 !!!next-input-character;
2600
2601 !!!emit ($self->{current_token}); # DOCTYPE
2602
2603 redo A;
2604 } elsif ($self->{next_char} == -1) {
2605 !!!cp (199);
2606 !!!parse-error (type => 'unclosed DOCTYPE');
2607
2608 $self->{state} = DATA_STATE;
2609 ## reconsume
2610
2611 $self->{current_token}->{quirks} = 1;
2612 !!!emit ($self->{current_token}); # DOCTYPE
2613
2614 redo A;
2615 } else {
2616 !!!cp (200);
2617 !!!parse-error (type => 'string after PUBLIC literal');
2618 $self->{current_token}->{quirks} = 1;
2619
2620 $self->{state} = BOGUS_DOCTYPE_STATE;
2621 !!!next-input-character;
2622 redo A;
2623 }
2624 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2625 if ({
2626 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2627 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2628 }->{$self->{next_char}}) {
2629 !!!cp (201);
2630 ## Stay in the state
2631 !!!next-input-character;
2632 redo A;
2633 } elsif ($self->{next_char} == 0x0022) { # "
2634 !!!cp (202);
2635 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2636 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2637 !!!next-input-character;
2638 redo A;
2639 } elsif ($self->{next_char} == 0x0027) { # '
2640 !!!cp (203);
2641 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2642 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2643 !!!next-input-character;
2644 redo A;
2645 } elsif ($self->{next_char} == 0x003E) { # >
2646 !!!cp (204);
2647 !!!parse-error (type => 'no SYSTEM literal');
2648 $self->{state} = DATA_STATE;
2649 !!!next-input-character;
2650
2651 $self->{current_token}->{quirks} = 1;
2652 !!!emit ($self->{current_token}); # DOCTYPE
2653
2654 redo A;
2655 } elsif ($self->{next_char} == -1) {
2656 !!!cp (205);
2657 !!!parse-error (type => 'unclosed DOCTYPE');
2658
2659 $self->{state} = DATA_STATE;
2660 ## reconsume
2661
2662 $self->{current_token}->{quirks} = 1;
2663 !!!emit ($self->{current_token}); # DOCTYPE
2664
2665 redo A;
2666 } else {
2667 !!!cp (206);
2668 !!!parse-error (type => 'string after SYSTEM');
2669 $self->{current_token}->{quirks} = 1;
2670
2671 $self->{state} = BOGUS_DOCTYPE_STATE;
2672 !!!next-input-character;
2673 redo A;
2674 }
2675 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2676 if ($self->{next_char} == 0x0022) { # "
2677 !!!cp (207);
2678 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2679 !!!next-input-character;
2680 redo A;
2681 } elsif ($self->{next_char} == 0x003E) { # >
2682 !!!cp (208);
2683 !!!parse-error (type => 'unclosed SYSTEM literal');
2684
2685 $self->{state} = DATA_STATE;
2686 !!!next-input-character;
2687
2688 $self->{current_token}->{quirks} = 1;
2689 !!!emit ($self->{current_token}); # DOCTYPE
2690
2691 redo A;
2692 } elsif ($self->{next_char} == -1) {
2693 !!!cp (209);
2694 !!!parse-error (type => 'unclosed SYSTEM literal');
2695
2696 $self->{state} = DATA_STATE;
2697 ## reconsume
2698
2699 $self->{current_token}->{quirks} = 1;
2700 !!!emit ($self->{current_token}); # DOCTYPE
2701
2702 redo A;
2703 } else {
2704 !!!cp (210);
2705 $self->{current_token}->{system_identifier} # DOCTYPE
2706 .= chr $self->{next_char};
2707 ## Stay in the state
2708 !!!next-input-character;
2709 redo A;
2710 }
2711 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2712 if ($self->{next_char} == 0x0027) { # '
2713 !!!cp (211);
2714 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2715 !!!next-input-character;
2716 redo A;
2717 } elsif ($self->{next_char} == 0x003E) { # >
2718 !!!cp (212);
2719 !!!parse-error (type => 'unclosed SYSTEM literal');
2720
2721 $self->{state} = DATA_STATE;
2722 !!!next-input-character;
2723
2724 $self->{current_token}->{quirks} = 1;
2725 !!!emit ($self->{current_token}); # DOCTYPE
2726
2727 redo A;
2728 } elsif ($self->{next_char} == -1) {
2729 !!!cp (213);
2730 !!!parse-error (type => 'unclosed SYSTEM literal');
2731
2732 $self->{state} = DATA_STATE;
2733 ## reconsume
2734
2735 $self->{current_token}->{quirks} = 1;
2736 !!!emit ($self->{current_token}); # DOCTYPE
2737
2738 redo A;
2739 } else {
2740 !!!cp (214);
2741 $self->{current_token}->{system_identifier} # DOCTYPE
2742 .= chr $self->{next_char};
2743 ## Stay in the state
2744 !!!next-input-character;
2745 redo A;
2746 }
2747 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2748 if ({
2749 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2750 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2751 }->{$self->{next_char}}) {
2752 !!!cp (215);
2753 ## Stay in the state
2754 !!!next-input-character;
2755 redo A;
2756 } elsif ($self->{next_char} == 0x003E) { # >
2757 !!!cp (216);
2758 $self->{state} = DATA_STATE;
2759 !!!next-input-character;
2760
2761 !!!emit ($self->{current_token}); # DOCTYPE
2762
2763 redo A;
2764 } elsif ($self->{next_char} == -1) {
2765 !!!cp (217);
2766 !!!parse-error (type => 'unclosed DOCTYPE');
2767 $self->{state} = DATA_STATE;
2768 ## reconsume
2769
2770 $self->{current_token}->{quirks} = 1;
2771 !!!emit ($self->{current_token}); # DOCTYPE
2772
2773 redo A;
2774 } else {
2775 !!!cp (218);
2776 !!!parse-error (type => 'string after SYSTEM literal');
2777 #$self->{current_token}->{quirks} = 1;
2778
2779 $self->{state} = BOGUS_DOCTYPE_STATE;
2780 !!!next-input-character;
2781 redo A;
2782 }
2783 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2784 if ($self->{next_char} == 0x003E) { # >
2785 !!!cp (219);
2786 $self->{state} = DATA_STATE;
2787 !!!next-input-character;
2788
2789 !!!emit ($self->{current_token}); # DOCTYPE
2790
2791 redo A;
2792 } elsif ($self->{next_char} == -1) {
2793 !!!cp (220);
2794 !!!parse-error (type => 'unclosed DOCTYPE');
2795 $self->{state} = DATA_STATE;
2796 ## reconsume
2797
2798 !!!emit ($self->{current_token}); # DOCTYPE
2799
2800 redo A;
2801 } else {
2802 !!!cp (221);
2803 ## Stay in the state
2804 !!!next-input-character;
2805 redo A;
2806 }
2807 } elsif ($self->{state} == CDATA_BLOCK_STATE) {
2808 my $s = '';
2809
2810 my ($l, $c) = ($self->{line}, $self->{column});
2811
2812 CS: while ($self->{next_char} != -1) {
2813 if ($self->{next_char} == 0x005D) { # ]
2814 !!!next-input-character;
2815 if ($self->{next_char} == 0x005D) { # ]
2816 !!!next-input-character;
2817 MDC: {
2818 if ($self->{next_char} == 0x003E) { # >
2819 !!!cp (221.1);
2820 !!!next-input-character;
2821 last CS;
2822 } elsif ($self->{next_char} == 0x005D) { # ]
2823 !!!cp (221.2);
2824 $s .= ']';
2825 !!!next-input-character;
2826 redo MDC;
2827 } else {
2828 !!!cp (221.3);
2829 $s .= ']]';
2830 #
2831 }
2832 } # MDC
2833 } else {
2834 !!!cp (221.4);
2835 $s .= ']';
2836 #
2837 }
2838 } else {
2839 !!!cp (221.5);
2840 #
2841 }
2842 $s .= chr $self->{next_char};
2843 !!!next-input-character;
2844 } # CS
2845
2846 $self->{state} = DATA_STATE;
2847 ## next-input-character done or EOF, which is reconsumed.
2848
2849 if (length $s) {
2850 !!!cp (221.6);
2851 !!!emit ({type => CHARACTER_TOKEN, data => $s,
2852 line => $l, column => $c});
2853 } else {
2854 !!!cp (221.7);
2855 }
2856
2857 redo A;
2858
2859 ## ISSUE: "text tokens" in spec.
2860 ## TODO: Streaming support
2861 } else {
2862 die "$0: $self->{state}: Unknown state";
2863 }
2864 } # A
2865
2866 die "$0: _get_next_token: unexpected case";
2867 } # _get_next_token
2868
## _tokenize_attempt_to_consume_an_entity ($self, $in_attr, $additional)
##
## Tries to read one character reference ("entity") starting at
## |$self->{next_char}|.  The data returned for unrecognized names is
## prefixed with '&', so the caller has presumably already consumed the
## '&' itself (NOTE(review): confirm against the call sites).
##
## Arguments:
##   $in_attr    - If true, a named reference that matched only without
##                 a ';' and is followed by further name characters is
##                 returned as literal text instead of being replaced.
##   $additional - One extra code point that, like white space, '<',
##                 '&' and EOF (-1), means "this is not a reference".
##
## Returns |undef| when no reference was recognized, otherwise a hash
## reference of type CHARACTER_TOKEN with |data|, |line|, |column| and,
## when a reference was actually replaced, |has_reference => 1|.
##
## NOTE: Lines beginning with |!!!| are macros whose expansions are not
## part of this subroutine's visible source; statement order around
## them matters.
2869 sub _tokenize_attempt_to_consume_an_entity ($$$) {
2870 my ($self, $in_attr, $additional) = @_;
2871
## Position recorded for every parse error and token produced below.
2872 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
2873
## Characters that can never start a reference: nothing is consumed and
## no error is reported.
2874 if ({
2875 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
2876 0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
2877 $additional => 1,
2878 }->{$self->{next_char}}) {
2879 !!!cp (1001);
2880 ## Don't consume
2881 ## No error
2882 return undef;
2883 } elsif ($self->{next_char} == 0x0023) { # #
## Numeric character reference: "#x..." / "#X..." (hexadecimal) or
## "#..." (decimal).
2884 !!!next-input-character;
2885 if ($self->{next_char} == 0x0078 or # x
2886 $self->{next_char} == 0x0058) { # X
## |$code| stays undefined until the first hexadecimal digit is seen;
## that is how "no digits at all" is detected below.
2887 my $code;
2888 X: {
## On the first pass |$x_char| holds the 'x' / 'X' so that it can be
## pushed back if no digit follows.  On later passes it holds a digit
## and is not used.
2889 my $x_char = $self->{next_char};
2890 !!!next-input-character;
2891 if (0x0030 <= $self->{next_char} and
2892 $self->{next_char} <= 0x0039) { # 0..9
2893 !!!cp (1002);
2894 $code ||= 0;
2895 $code *= 0x10;
2896 $code += $self->{next_char} - 0x0030;
2897 redo X;
2898 } elsif (0x0061 <= $self->{next_char} and
2899 $self->{next_char} <= 0x0066) { # a..f
2900 !!!cp (1003);
2901 $code ||= 0;
2902 $code *= 0x10;
## 'a' (0x61) - 0x60 + 9 == 10, ..., 'f' (0x66) - 0x60 + 9 == 15.
2903 $code += $self->{next_char} - 0x0060 + 9;
2904 redo X;
2905 } elsif (0x0041 <= $self->{next_char} and
2906 $self->{next_char} <= 0x0046) { # A..F
2907 !!!cp (1004);
2908 $code ||= 0;
2909 $code *= 0x10;
## 'A' (0x41) - 0x40 + 9 == 10, ..., 'F' (0x46) - 0x40 + 9 == 15.
2910 $code += $self->{next_char} - 0x0040 + 9;
2911 redo X;
2912 } elsif (not defined $code) { # no hexadecimal digit
2913 !!!cp (1005);
2914 !!!parse-error (type => 'bare hcro', line => $l, column => $c);
## Undo the lookahead: hand back the 'x' / 'X' and the current
## character, and make '#' the next character again.
2915 !!!back-next-input-character ($x_char, $self->{next_char});
2916 $self->{next_char} = 0x0023; # #
2917 return undef;
2918 } elsif ($self->{next_char} == 0x003B) { # ;
2919 !!!cp (1006);
2920 !!!next-input-character;
2921 } else {
## A missing terminating ';' is an error, but the reference is still
## used.
2922 !!!cp (1007);
2923 !!!parse-error (type => 'no refc', line => $l, column => $c);
2924 }
2925
## Replace code points that must not result from a reference:
## zero and surrogates, and values above U+10FFFF, become U+FFFD;
## CR becomes LF; 0x80..0x9F are looked up in |$c1_entity_char|.
## NOTE(review): |$c1_entity_char| is defined outside this subroutine;
## it is assumed to have an entry for every value in 0x80..0x9F.
2926 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2927 !!!cp (1008);
2928 !!!parse-error (type => 'invalid character reference',
2929 text => (sprintf 'U+%04X', $code),
2930 line => $l, column => $c);
2931 $code = 0xFFFD;
2932 } elsif ($code > 0x10FFFF) {
2933 !!!cp (1009);
2934 !!!parse-error (type => 'invalid character reference',
2935 text => (sprintf 'U-%08X', $code),
2936 line => $l, column => $c);
2937 $code = 0xFFFD;
2938 } elsif ($code == 0x000D) {
2939 !!!cp (1010);
2940 !!!parse-error (type => 'CR character reference', line => $l, column => $c);
2941 $code = 0x000A;
2942 } elsif (0x80 <= $code and $code <= 0x9F) {
2943 !!!cp (1011);
2944 !!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c);
2945 $code = $c1_entity_char->{$code};
2946 }
2947
2948 return {type => CHARACTER_TOKEN, data => chr $code,
2949 has_reference => 1,
2950 line => $l, column => $c,
2951 };
2952 } # X
2953 } elsif (0x0030 <= $self->{next_char} and
2954 $self->{next_char} <= 0x0039) { # 0..9
## Decimal numeric character reference; the first digit is already in
## |$self->{next_char}|.
2955 my $code = $self->{next_char} - 0x0030;
2956 !!!next-input-character;
2957
2958 while (0x0030 <= $self->{next_char} and
2959 $self->{next_char} <= 0x0039) { # 0..9
2960 !!!cp (1012);
2961 $code *= 10;
2962 $code += $self->{next_char} - 0x0030;
2963
2964 !!!next-input-character;
2965 }
2966
2967 if ($self->{next_char} == 0x003B) { # ;
2968 !!!cp (1013);
2969 !!!next-input-character;
2970 } else {
2971 !!!cp (1014);
2972 !!!parse-error (type => 'no refc', line => $l, column => $c);
2973 }
2974
## Same code point replacements as in the hexadecimal branch above.
2975 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2976 !!!cp (1015);
2977 !!!parse-error (type => 'invalid character reference',
2978 text => (sprintf 'U+%04X', $code),
2979 line => $l, column => $c);
2980 $code = 0xFFFD;
2981 } elsif ($code > 0x10FFFF) {
2982 !!!cp (1016);
2983 !!!parse-error (type => 'invalid character reference',
2984 text => (sprintf 'U-%08X', $code),
2985 line => $l, column => $c);
2986 $code = 0xFFFD;
2987 } elsif ($code == 0x000D) {
2988 !!!cp (1017);
2989 !!!parse-error (type => 'CR character reference',
2990 line => $l, column => $c);
2991 $code = 0x000A;
2992 } elsif (0x80 <= $code and $code <= 0x9F) {
2993 !!!cp (1018);
2994 !!!parse-error (type => 'C1 character reference',
2995 text => (sprintf 'U+%04X', $code),
2996 line => $l, column => $c);
2997 $code = $c1_entity_char->{$code};
2998 }
2999
3000 return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1,
3001 line => $l, column => $c,
3002 };
3003 } else {
## '#' not followed by a digit or 'x' / 'X': push the lookahead
## character back and make '#' the next character again.
3004 !!!cp (1019);
3005 !!!parse-error (type => 'bare nero', line => $l, column => $c);
3006 !!!back-next-input-character ($self->{next_char});
3007 $self->{next_char} = 0x0023; # #
3008 return undef;
3009 }
3010 } elsif ((0x0041 <= $self->{next_char} and
3011 $self->{next_char} <= 0x005A) or
3012 (0x0061 <= $self->{next_char} and
3013 $self->{next_char} <= 0x007A)) {
## Named character reference: starts with an ASCII letter.
3014 my $entity_name = chr $self->{next_char};
3015 !!!next-input-character;
3016
## |$value| is the text that will be returned: it starts as the name
## read so far, is replaced by the entity's replacement text whenever
## the name read so far is a known entity, and has every later
## unmatched character appended to it.
##
## |$match| records how the name matched:
##    0  - no known entity name has matched;
##    1  - a name ending in ';' matched (the loop stops there);
##   -1  - the name read so far matched without a ';';
##  < -1 - a shorter name matched and more name characters followed
##         (each unmatched character doubles the value; 0 stays 0).
3017 my $value = $entity_name;
3018 my $match = 0;
3019 require Whatpm::_NamedEntityList;
3020 our $EntityChar;
3021
3022 while (length $entity_name < 30 and
3023 ## NOTE: Some number greater than the maximum length of entity name
3024 ((0x0041 <= $self->{next_char} and # A
3025 $self->{next_char} <= 0x005A) or # Z
3026 (0x0061 <= $self->{next_char} and # a
3027 $self->{next_char} <= 0x007A) or # z
3028 (0x0030 <= $self->{next_char} and # 0
3029 $self->{next_char} <= 0x0039) or # 9
3030 $self->{next_char} == 0x003B)) { # ;
3031 $entity_name .= chr $self->{next_char};
3032 if (defined $EntityChar->{$entity_name}) {
3033 if ($self->{next_char} == 0x003B) { # ;
3034 !!!cp (1020);
3035 $value = $EntityChar->{$entity_name};
3036 $match = 1;
3037 !!!next-input-character;
3038 last;
3039 } else {
3040 !!!cp (1021);
3041 $value = $EntityChar->{$entity_name};
3042 $match = -1;
3043 !!!next-input-character;
3044 }
3045 } else {
3046 !!!cp (1022);
3047 $value .= chr $self->{next_char};
3048 $match *= 2;
3049 !!!next-input-character;
3050 }
3051 }
3052
3053 if ($match > 0) {
## Properly terminated reference.
3054 !!!cp (1023);
3055 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
3056 line => $l, column => $c,
3057 };
3058 } elsif ($match < 0) {
## A known name matched, but without a terminating ';'.
3059 !!!parse-error (type => 'no refc', line => $l, column => $c);
3060 if ($in_attr and $match < -1) {
## In an attribute value, a match followed by further name characters
## is not treated as a reference: the text read is returned unchanged
## and |has_reference| is not set.
3061 !!!cp (1024);
3062 return {type => CHARACTER_TOKEN, data => '&'.$entity_name,
3063 line => $l, column => $c,
3064 };
3065 } else {
## Otherwise the replacement text, followed by any characters read
## after the matched name, is returned.
3066 !!!cp (1025);
3067 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
3068 line => $l, column => $c,
3069 };
3070 }
3071 } else {
## No known name matched.  The characters have already been read, so
## they are returned as literal text after the '&' rather than being
## pushed back.
3072 !!!cp (1026);
3073 !!!parse-error (type => 'bare ero', line => $l, column => $c);
3074 ## NOTE: "No characters are consumed" in the spec.
3075 return {type => CHARACTER_TOKEN, data => '&'.$value,
3076 line => $l, column => $c,
3077 };
3078 }
3079 } else {
3080 !!!cp (1027);
3081 ## no characters are consumed
3082 !!!parse-error (type => 'bare ero', line => $l, column => $c);
3083 return undef;
3084 }
3085 } # _tokenize_attempt_to_consume_an_entity
3086
## Prepare the document object for tree construction.
##
## NOTE: $self->{document} MUST be specified before this method is
## called.  Strict DOM error checking is switched off, the document is
## flagged as an HTML document and source position (1, 1) is recorded
## as user data on it.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  $doc->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $doc->manakai_is_html (1); # MUST
  $doc->set_user_data (manakai_source_line => 1);
  $doc->set_user_data (manakai_source_column => 1);
} # _initialize_tree_constructor
3097
## Undo |_initialize_tree_constructor|'s relaxation: switch strict DOM
## error checking on the document back on once the tree is built.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};
  ## TODO: Turn mutation events on
  return $doc->strict_error_checking (1);
} # _terminate_tree_constructor
3103
3104 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
3105
3106 { # tree construction stage
3107 my $token;
3108
## Entry point of the tree construction stage: fetch the first token,
## reset the per-parse state and run the insertion modes in order.
sub _construct_tree ($) {
  my $self = shift;

  ## NOTE: When an interactive UA makes $self->{document} available to
  ## the user, or when it begins accepting user input, is not defined.

  ## NOTE: Appending a character means collecting it together with all
  ## subsequent consecutive characters and inserting a single Text node
  ## whose data is the concatenation of all those characters. # MUST

  !!!next-token;

  ## Reset the state shared by the insertion mode handlers.
  $self->{form_element} = undef;
  $self->{head_element} = undef;
  $self->{open_elements} = [];
  $self->{inner_html_node} = undef;

  ## NOTE: The "initial" insertion mode.
  $self->_tree_construction_initial; # MUST

  ## NOTE: The "before html" insertion mode; "before head" follows it.
  $self->_tree_construction_root_element;
  $self->{insertion_mode} = BEFORE_HEAD_IM;

  ## NOTE: The "before head" insertion mode and so on.
  $self->_tree_construction_main;
} # _construct_tree
3137
## Process tokens in the "initial" insertion mode.
##
## Leading white space and comments are consumed; a DOCTYPE token (if
## any) is turned into a DocumentType node and used to select the
## document's compatibility mode ("quirks", "limited quirks" or the
## default).  Any other token is a "no DOCTYPE" parse error and puts the
## document in quirks mode.
##
## The method reads and advances the file-scoped |$token|.  It returns
## (no value) when the "before html" insertion mode should take over;
## at that point |$token| is the token that mode has to process.  It
## dies on an unknown token type.
sub _tree_construction_initial ($) {
  my $self = shift;

  ## NOTE: "initial" insertion mode

  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!cp ('t1');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif ($doctype_name ne 'HTML') {
        !!!cp ('t2');
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5', token => $token);
      } else {
        !!!cp ('t3');
      }

      my $doctype = $self->{document}->create_document_type_definition
        ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      ## NOTE: Default value for both |public_id| and |system_id| attributes
      ## are empty strings, so that we don't set any value in missing cases.
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if ($token->{quirks} or $doctype_name ne 'HTML') {
        !!!cp ('t4');
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## The public identifier is compared ASCII case-insensitively, so
        ## it is upper-cased once and every literal below is upper case.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        ## Public identifier prefixes that select quirks mode.
        ## NOTE(review): the spelling "EXPERIMETNAL" below is reproduced
        ## unchanged from the previous revision; confirm against the
        ## specification's list.
        my $quirks_prefixes = [
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//",
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//IETF//DTD HTML 2.0 LEVEL 1//",
          "-//IETF//DTD HTML 2.0 LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT//",
          "-//IETF//DTD HTML 2.0//",
          "-//IETF//DTD HTML 2.1E//",
          "-//IETF//DTD HTML 3.0//",
          "-//IETF//DTD HTML 3.2 FINAL//",
          "-//IETF//DTD HTML 3.2//",
          "-//IETF//DTD HTML 3//",
          "-//IETF//DTD HTML LEVEL 0//",
          "-//IETF//DTD HTML LEVEL 1//",
          "-//IETF//DTD HTML LEVEL 2//",
          "-//IETF//DTD HTML LEVEL 3//",
          "-//IETF//DTD HTML STRICT LEVEL 0//",
          "-//IETF//DTD HTML STRICT LEVEL 1//",
          "-//IETF//DTD HTML STRICT LEVEL 2//",
          "-//IETF//DTD HTML STRICT LEVEL 3//",
          "-//IETF//DTD HTML STRICT//",
          "-//IETF//DTD HTML//",
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//",
          "-//NETSCAPE COMM. CORP.//DTD HTML//",
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//",
          "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//",
          "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//",
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//",
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//",
          "-//W3C//DTD HTML 3 1995-03-24//",
          "-//W3C//DTD HTML 3.2 DRAFT//",
          "-//W3C//DTD HTML 3.2 FINAL//",
          "-//W3C//DTD HTML 3.2//",
          "-//W3C//DTD HTML 3.2S DRAFT//",
          "-//W3C//DTD HTML 4.0 FRAMESET//",
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//",
          "-//W3C//DTD HTML EXPERIMETNAL 19960712//",
          "-//W3C//DTD HTML EXPERIMENTAL 970421//",
          "-//W3C//DTD W3 HTML//",
          "-//W3O//DTD W3 HTML 3.0//",
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//",
          "-//WEBTECHS//DTD MOZILLA HTML//",
        ]; # $quirks_prefixes
        my $match;
        for my $quirks_prefix (@$quirks_prefixes) {
          ## The public identifier itself has to be tested here; testing
          ## the list reference instead can never match any prefix.
          if (substr ($pubid, 0, length $quirks_prefix) eq $quirks_prefix) {
            $match = 1;
            last;
          }
        }
        if ($match or
            $pubid eq "-//W3O//DTD W3 HTML STRICT 3.0//EN//" or
            $pubid eq "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" or
            $pubid eq "HTML") {
          !!!cp ('t5');
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid =~ m[^-//W3C//DTD HTML 4.01 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD HTML 4.01 TRANSITIONAL//]) {
          if (defined $token->{system_identifier}) {
            !!!cp ('t6');
            $self->{document}->manakai_compat_mode ('quirks');
          } else {
            !!!cp ('t7');
            $self->{document}->manakai_compat_mode ('limited quirks');
          }
        } elsif ($pubid =~ m[^-//W3C//DTD XHTML 1.0 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD XHTML 1.0 TRANSITIONAL//]) {
          !!!cp ('t8');
          $self->{document}->manakai_compat_mode ('limited quirks');
        } else {
          !!!cp ('t9');
        }
      } else {
        !!!cp ('t10');
      }
      if (defined $token->{system_identifier}) {
        ## The system identifier is compared in lower case.
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          ## NOTE: Ensure that |PUBLIC "(limited quirks)" "(quirks)"| is
          ## marked as quirks.
          $self->{document}->manakai_compat_mode ('quirks');
          !!!cp ('t11');
        } else {
          !!!cp ('t12');
        }
      } else {
        !!!cp ('t13');
      }

      ## Go to the "before html" insertion mode.
      !!!next-token;
      return;
    } elsif ({
              START_TAG_TOKEN, 1,
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t14');
      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      !!!ack-later;
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      ## Leading white space is dropped from the token; whatever remains
      ## is left in |$token| for reprocessing.
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          !!!cp ('t15');
          ## Stay in the insertion mode.
          !!!next-token;
          redo INITIAL;
        } else {
          !!!cp ('t16');
        }
      } else {
        !!!cp ('t17');
      }

      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t18');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the insertion mode.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL

  die "$0: _tree_construction_initial: This should be never reached";
} # _tree_construction_initial
3337
## Process tokens in the "before html" insertion mode.
##
## DOCTYPE tokens are reported and ignored, comments are appended to the
## document and leading white space is dropped.  An |html| start tag
## creates the root element from the token; any other token creates an
## implied |html| element and is left in the file-scoped |$token| for
## reprocessing.  In both cases the root element is pushed onto
## $self->{open_elements} and the $self->{application_cache_selection}
## callback is invoked exactly once (with the |manifest| attribute value
## or |undef|).
##
## Returns (no value) when the "before head" insertion mode should take
## over.  Dies on an unknown token type.
sub _tree_construction_root_element ($) {
  my $self = shift;

  ## NOTE: "before html" insertion mode.

  B: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      !!!cp ('t19');
      !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
      ## Ignore the token
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t20');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token.

        unless (length $token->{data}) {
          !!!cp ('t21');
          ## Stay in the insertion mode.
          !!!next-token;
          redo B;
        } else {
          !!!cp ('t22');
        }
      } else {
        !!!cp ('t23');
      }

      ## NOTE: The application cache selection callback is not invoked
      ## here; it is invoked once, after the implied |html| element has
      ## been created at the end of this block, like for any other token
      ## that implies the root element.

      #
    } elsif ($token->{type} == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html') {
        my $root_element;
        !!!create-element ($root_element, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}},
            [$root_element, $el_category->{html}];

        if ($token->{attributes}->{manifest}) {
          !!!cp ('t24');
          $self->{application_cache_selection}
              ->($token->{attributes}->{manifest}->{value});
          ## ISSUE: Spec is unclear on relative references.
          ## According to Hixie (#whatwg 2008-03-19), it should be
          ## resolved against the base URI of the document in HTML
          ## or xml:base of the element in XHTML.
        } else {
          !!!cp ('t25');
          $self->{application_cache_selection}->(undef);
        }

        !!!nack ('t25c');

        !!!next-token;
        return; ## Go to the "before head" insertion mode.
      } else {
        !!!cp ('t25.1');
        #
      }
    } elsif ({
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t26');
      #
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## Anything else: imply the root |html| element.
    my $root_element;
    !!!create-element ($root_element, $HTML_NS, 'html',, $token);
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, $el_category->{html}];

    $self->{application_cache_selection}->(undef);

    ## NOTE: Reprocess the token.
    !!!ack-later;
    return; ## Go to the "before head" insertion mode.

    ## ISSUE: There is an issue in the spec
  } # B

  die "$0: _tree_construction_root_element: This should never be reached";
} # _tree_construction_root_element
3432
## Reset the insertion mode appropriately.
##
## Walks $self->{open_elements} from the current node towards the root
## and sets $self->{insertion_mode} according to the first node that
## determines it.  When the walk reaches the first entry of the stack,
## $self->{inner_html_node} is examined in its place; the method dies
## if that is not defined.  Returns no value.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        !!!cp ('t28');
        $node = $self->{inner_html_node};
      } else {
        die "_reset_insertion_mode: t27";
      }
    }

    ## Step 4..14
    my $new_mode;
    if ($node->[1] & FOREIGN_EL) {
      !!!cp ('t28.1');
      ## NOTE: Strictly speaking, the line below only applies to MathML and
      ## SVG elements.  Currently the HTML syntax supports only MathML and
      ## SVG elements as foreigners.
      $new_mode = IN_BODY_IM | IN_FOREIGN_CONTENT_IM;
    } elsif ($node->[1] & TABLE_CELL_EL) {
      if ($last) {
        !!!cp ('t28.2');
        #
      } else {
        !!!cp ('t28.3');
        $new_mode = IN_CELL_IM;
      }
    } else {
      !!!cp ('t28.4');
      $new_mode = {
        select => IN_SELECT_IM,
        ## NOTE: |option| and |optgroup| do not set
        ## insertion mode to "in select" by themselves.
        tr => IN_ROW_IM,
        tbody => IN_TABLE_BODY_IM,
        thead => IN_TABLE_BODY_IM,
        tfoot => IN_TABLE_BODY_IM,
        caption => IN_CAPTION_IM,
        colgroup => IN_COLUMN_GROUP_IM,
        table => IN_TABLE_IM,
        head => IN_BODY_IM, # not in head!
        body => IN_BODY_IM,
        frameset => IN_FRAMESET_IM,
      }->{$node->[0]->manakai_local_name};
    }
    ## NOTE: Assignment and return are separate statements so that the
    ## return does not depend on the numeric value of the mode constant.
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 15
    if ($node->[1] & HTML_EL) {
      unless (defined $self->{head_element}) {
        !!!cp ('t29');
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        ## ISSUE: Can this state be reached?
        !!!cp ('t30');
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    } else {
      !!!cp ('t31');
    }

    ## Step 16
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 17
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 18
    redo S3;
  } # S3

  die "$0: _reset_insertion_mode: This line should never be reached";
} # _reset_insertion_mode
3519
3520 sub _tree_construction_main ($) {
3521 my $self = shift;
3522
3523 my $active_formatting_elements = [];
3524
## Reconstruct the active formatting elements.
##
## Takes one argument: a code reference used to insert each newly
## cloned element into the tree.  Entries of |$active_formatting_elements|
## are either a '#marker' entry or a [node, category] pair, as are the
## entries of $self->{open_elements}.  Every entry after the last marker
## (or after the last entry that is still in the stack of open elements)
## is replaced by a shallow clone, which is inserted through the given
## code reference and pushed onto $self->{open_elements}.
3525 my $reconstruct_active_formatting_elements = sub { # MUST
3526 my $insert = shift;
3527
3528 ## Step 1
3529 return unless @$active_formatting_elements;
3530
3531 ## Step 3
3532 my $i = -1;
3533 my $entry = $active_formatting_elements->[$i];
3534
3535 ## Step 2
## Nothing to do if the last entry is a marker or is still open.
3536 return if $entry->[0] eq '#marker';
3537 for (@{$self->{open_elements}}) {
3538 if ($entry->[0] eq $_->[0]) {
3539 !!!cp ('t32');
3540 return;
3541 }
3542 }
3543
## Rewind |$i| (a negative index counted from the end of the list) to
## the earliest entry that has to be recreated.
3544 S4: {
3545 ## Step 4
3546 last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
3547
3548 ## Step 5
3549 $i--;
3550 $entry = $active_formatting_elements->[$i];
3551
3552 ## Step 6
3553 if ($entry->[0] eq '#marker') {
3554 !!!cp ('t33_1');
3555 #
3556 } else {
3557 my $in_open_elements;
3558 OE: for (@{$self->{open_elements}}) {
3559 if ($entry->[0] eq $_->[0]) {
3560 !!!cp ('t33');
3561 $in_open_elements = 1;
3562 last OE;
3563 }
3564 }
3565 if ($in_open_elements) {
3566 !!!cp ('t34');
3567 #
3568 } else {
3569 ## NOTE: <!DOCTYPE HTML><p><b><i><u></p> <p>X
3570 !!!cp ('t35');
3571 redo S4;
3572 }
3573 }
3574
3575 ## Step 7
## The entry just examined is a marker or is still open, so recreation
## starts with the entry that follows it.
3576 $i++;
3577 $entry = $active_formatting_elements->[$i];
3578 } # S4
3579
## Recreate entries from |$i| up to the end of the list.
3580 S7: {
3581 ## Step 8
## clone_node (0) is a shallow copy; the category value is reused.
3582 my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
3583
3584 ## Step 9
3585 $insert->($clone->[0]);
3586 push @{$self->{open_elements}}, $clone;
3587
3588 ## Step 10
3589 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
3590
3591 ## Step 11
## Stop once the entry just replaced was the last one in the list.
3592 unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
3593 !!!cp ('t36');
3594 ## Step 7'
3595 $i++;
3596 $entry = $active_formatting_elements->[$i];
3597
3598 redo S7;
3599 }
3600
3601 !!!cp ('t37');
3602 } # S7
3603 }; # $reconstruct_active_formatting_elements
3604
## Clear the list of active formatting elements up to the last marker:
## the last '#marker' entry and every entry after it are removed.  The
## list is left untouched when it contains no marker.
my $clear_up_to_marker = sub {
  my $index = $#$active_formatting_elements;
  while ($index >= 0) {
    if ($active_formatting_elements->[$index]->[0] eq '#marker') {
      !!!cp ('t38');
      ## Two-argument splice drops everything from |$index| onwards.
      splice @$active_formatting_elements, $index;
      return;
    }
    $index--;
  }

  !!!cp ('t39');
}; # $clear_up_to_marker
3616
3617 my $insert;
3618
## Generic CDATA/RCDATA element parsing for the current start tag token.
##
## Takes one argument, the content model flag (CDATA_CONTENT_MODEL or
## RCDATA_CONTENT_MODEL; anything else dies when the end tag is
## missing).  An element is created from the file-scoped |$token| and
## inserted through |$insert|; all following character tokens are
## concatenated into a single Text node child.  The tokenizer content
## model is switched to the given flag while the text is read and back
## to PCDATA afterwards.  On return |$token| is the token after the one
## that ended the text.
3619 my $parse_rcdata = sub ($) {
3620 my ($content_model_flag) = @_;
3621
3622 ## Step 1
## The tag name is remembered because |$token| is replaced below.
3623 my $start_tag_name = $token->{tag_name};
3624 my $el;
3625 !!!create-element ($el, $HTML_NS, $start_tag_name, $token->{attributes}, $token);
3626
3627 ## Step 2
3628 $insert->($el);
3629
3630 ## Step 3
3631 $self->{content_model} = $content_model_flag; # CDATA or RCDATA
3632 delete $self->{escape}; # MUST
3633
3634 ## Step 4
3635 my $text = '';
3636 !!!nack ('t40.1');
3637 !!!next-token;
3638 while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
3639 !!!cp ('t40');
3640 $text .= $token->{data};
3641 !!!next-token;
3642 }
3643
3644 ## Step 5
3645 if (length $text) {
3646 !!!cp ('t41');
## NOTE: The inner |$text| (a Text node) shadows the outer string; the
## right-hand side still refers to the outer string.
3647 my $text = $self->{document}->create_text_node ($text);
3648 $el->append_child ($text);
3649 }
3650
3651 ## Step 6
3652 $self->{content_model} = PCDATA_CONTENT_MODEL;
3653
3654 ## Step 7
3655 if ($token->{type} == END_TAG_TOKEN and
3656 $token->{tag_name} eq $start_tag_name) {
3657 !!!cp ('t42');
3658 ## Ignore the token
3659 } else {
3660 ## NOTE: An end-of-file token.
3661 if ($content_model_flag == CDATA_CONTENT_MODEL) {
3662 !!!cp ('t43');
3663 !!!parse-error (type => 'in CDATA:#eof', token => $token);
3664 } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
3665 !!!cp ('t44');
3666 !!!parse-error (type => 'in RCDATA:#eof', token => $token);
3667 } else {
3668 die "$0: $content_model_flag in parse_rcdata";
3669 }
3670 }
## The terminating token is consumed in either case.
3671 !!!next-token;
3672 }; # $parse_rcdata
3673
## Handle a |script| start tag held in the file-scoped |$token|.
##
## Takes no argument.  A |script| element is created from the token,
## the tokenizer is switched to the CDATA content model and all
## following character tokens are appended to the element as text.
## The element is inserted through |$insert| only after its content has
## been read.  On return |$token| is the token after the one that ended
## the script text.
3674 my $script_start_tag = sub () {
3675 my $script_el;
3676 !!!create-element ($script_el, $HTML_NS, 'script', $token->{attributes}, $token);
3677 ## TODO: mark as "parser-inserted"
3678
3679 $self->{content_model} = CDATA_CONTENT_MODEL;
3680 delete $self->{escape}; # MUST
3681
3682 my $text = '';
3683 !!!nack ('t45.1');
3684 !!!next-token;
3685 while ($token->{type} == CHARACTER_TOKEN) {
3686 !!!cp ('t45');
3687 $text .= $token->{data};
3688 !!!next-token;
3689 } # stop if non-character token or tokenizer stops tokenising
3690 if (length $text) {
3691 !!!cp ('t46');
3692 $script_el->manakai_append_text ($text);
3693 }
3694
3695 $self->{content_model} = PCDATA_CONTENT_MODEL;
3696
3697 if ($token->{type} == END_TAG_TOKEN and
3698 $token->{tag_name} eq 'script') {
3699 !!!cp ('t47');
3700 ## Ignore the token
3701 } else {
3702 !!!cp ('t48');
3703 !!!parse-error (type => 'in CDATA:#eof', token => $token);
3704 ## ISSUE: And ignore?
3705 ## TODO: mark as "already executed"
3706 }
3707
## NOTE(review): when $self->{inner_html_node} is defined the element is
## created and filled but never passed to |$insert| in this closure --
## confirm that this is intended.
3708 if (defined $self->{inner_html_node}) {
3709 !!!cp ('t49');
3710 ## TODO: mark as "already executed"
3711 } else {
3712 !!!cp ('t50');
3713 ## TODO: $old_insertion_point = current insertion point
3714 ## TODO: insertion point = just before the next input character
3715
3716 $insert->($script_el);
3717
3718 ## TODO: insertion point = $old_insertion_point (might be "undefined")
3719
3720 ## TODO: if there is a script that will execute as soon as the parser resume, then...
3721 }
3722
## The terminating token is consumed in either case.
3723 !!!next-token;
3724 }; # $script_start_tag
3725
3726 ## NOTE: $open_tables->[-1]->[0] is the "current table" element node.
3727 ## NOTE: $open_tables->[-1]->[1] is the "tainted" flag.
3728 my $open_tables = [[$self->{open_elements}->[0]->[0]]];
3729
## Handle an end tag of a formatting element.
##
## Takes one argument, the end tag token.  The closure implements the
## adoption agency algorithm (AAA): it locates the formatting element
## named by the token in |$active_formatting_elements|, and either
## reports an unmatched end tag, simply pops the stack of open elements,
## or reparents the "furthest block" and clones the formatting elements
## in between.  It updates |$active_formatting_elements|,
## $self->{open_elements} and the "tainted" flag of |$open_tables|, and
## advances the file-scoped |$token| (via |!!!next-token|) before it
## returns.  Returns no value.
my $formatting_end_tag = sub {
  my $end_tag_token = shift;
  my $tag_name = $end_tag_token->{tag_name};

  ## NOTE: The adoption agency algorithm (AAA).

  FET: {
    ## Step 1
    ## Find the last entry with this tag name after the last marker.
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq '#marker') {
        !!!cp ('t52');
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0]->manakai_local_name
                   eq $tag_name) {
        !!!cp ('t51');
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!cp ('t53');
      !!!parse-error (type => 'unmatched end tag', text => $tag_name, token => $end_tag_token);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          !!!cp ('t54');
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!cp ('t55');
          !!!parse-error (type => 'unmatched end tag',
                          text => $tag_name,
                          token => $end_tag_token);
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ($node->[1] & SCOPING_EL) {
        !!!cp ('t56');
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!cp ('t57');
      !!!parse-error (type => 'unmatched end tag',
                      text => $tag_name,
                      token => $end_tag_token);
      ## Remove the formatting element itself from the list.  It is not
      ## necessarily the last entry, so |pop| would drop the wrong one.
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!cp ('t58');
      !!!parse-error (type => 'not closed',
                      text => $self->{open_elements}->[-1]->[0]
                          ->manakai_local_name,
                      token => $end_tag_token);
    }

    ## Step 2
    ## Scanning from the current node upwards, the last assignment wins,
    ## i.e. the qualifying node closest to the formatting element.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not ($node->[1] & FORMATTING_EL) and
          #not $phrasing_category->{$node->[1]} and
          ($node->[1] & SPECIAL_EL or
           $node->[1] & SCOPING_EL)) { ## Scoping is redundant, maybe
        !!!cp ('t59');
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        !!!cp ('t60');
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      !!!cp ('t61');
      ## Pop the formatting element and everything below it.
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      !!!cp ('t62');
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## The bookmark is kept as the node of the entry preceding it.
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
            ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      ## A node that is not an active formatting element is dropped
      ## from the stack of open elements and the next one is examined.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            !!!cp ('t63');
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        !!!cp ('t64');
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      if ($node->[0]->has_child_nodes ()) {
        !!!cp ('t65');
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    if ($common_ancestor_node->[1] & TABLE_ROWS_EL) {
      ## Foster parenting: insert before the innermost open |table|.
      my $foster_parent_element;
      my $next_sibling;
      OE: for (reverse 0..$#{$self->{open_elements}}) {
        if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
          my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
          if (defined $parent and $parent->node_type == 1) {
            !!!cp ('t65.1');
            $foster_parent_element = $parent;
            $next_sibling = $self->{open_elements}->[$_]->[0];
          } else {
            !!!cp ('t65.2');
            $foster_parent_element
                = $self->{open_elements}->[$_ - 1]->[0];
          }
          last OE;
        }
      } # OE
      $foster_parent_element = $self->{open_elements}->[0]->[0]
          unless defined $foster_parent_element;
      $foster_parent_element->insert_before ($last_node->[0], $next_sibling);
      $open_tables->[-1]->[1] = 1; # tainted
    } else {
      !!!cp ('t65.3');
      $common_ancestor_node->[0]->append_child ($last_node->[0]);
    }

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## The child list is copied first because it changes while the
    ## children are being moved.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## |$i| becomes the index of the bookmark's predecessor, adjusted
    ## for the removal of the formatting element when that precedes it.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t66');
        splice @$active_formatting_elements, $_, 1;
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        !!!cp ('t67');
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t68');
        splice @{$self->{open_elements}}, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        !!!cp ('t69');
        $i = $_;
      }
    } # OE
    ## The clone is inserted immediately below the furthest block.  A
    ## replacement length of 0 is used, as in Step 12, so that no other
    ## entry of the stack is overwritten.
    splice @{$self->{open_elements}}, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
3964
## Default insertion routine: append the given node to the current node
## (the last entry of the stack of open elements).  It is also installed
## as the initial value of |$insert|.
$insert = my $insert_to_current = sub {
  my ($child) = @_;
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($child);
}; # $insert_to_current
3968
## Insertion routine with foster parenting.
##
## Takes the node to insert.  Unless the current node is a table-rows
## element (TABLE_ROWS_EL), the node is simply appended to the current
## node.  Otherwise it is inserted before the innermost open |table|
## element in that table's parent element; when the table has no element
## parent it is appended to the element above the table in the stack of
## open elements, and when there is no open table at all, to the first
## element of the stack.  Foster parenting marks the current table as
## tainted.
my $insert_to_foster = sub {
  my ($child) = @_;
  my $open = $self->{open_elements};

  unless ($open->[-1]->[1] & TABLE_ROWS_EL) {
    !!!cp ('t72');
    return $open->[-1]->[0]->append_child ($child);
  }

  # MUST
  my ($foster_parent_element, $next_sibling);
  for my $i (reverse 0 .. $#$open) {
    next unless $open->[$i]->[1] & TABLE_EL;

    my $table = $open->[$i]->[0];
    my $parent = $table->parent_node;
    if (defined $parent and $parent->node_type == 1) {
      !!!cp ('t70');
      ($foster_parent_element, $next_sibling) = ($parent, $table);
    } else {
      !!!cp ('t71');
      $foster_parent_element = $open->[$i - 1]->[0];
    }
    last;
  }
  $foster_parent_element = $open->[0]->[0]
      unless defined $foster_parent_element;

  ## An undefined |$next_sibling| is passed on unchanged.
  $foster_parent_element->insert_before ($child, $next_sibling);
  $open_tables->[-1]->[1] = 1; # tainted
}; # $insert_to_foster
4000
4001 B: while (1) {
4002 if ($token->{type} == DOCTYPE_TOKEN) {
4003 !!!cp ('t73');
4004 !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
4005 ## Ignore the token
4006 ## Stay in the phase
4007 !!!next-token;
4008 next B;
4009 } elsif ($token->{type} == START_TAG_TOKEN and
4010 $token->{tag_name} eq 'html') {
4011 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4012 !!!cp ('t79');
4013 !!!parse-error (type => 'after html', text => 'html', token => $token);
4014 $self->{insertion_mode} = AFTER_BODY_IM;
4015 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4016 !!!cp ('t80');
4017 !!!parse-error (type => 'after html', text => 'html', token => $token);
4018 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4019 } else {
4020 !!!cp ('t81');
4021 }
4022
4023 !!!cp ('t82');
4024 !!!parse-error (type => 'not first start tag', token => $token);
4025 my $top_el = $self->{open_elements}->[0]->[0];
4026 for my $attr_name (keys %{$token->{attributes}}) {
4027 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
4028 !!!cp ('t84');
4029 $top_el->set_attribute_ns
4030 (undef, [undef, $attr_name],
4031 $token->{attributes}->{$attr_name}->{value});
4032 }
4033 }
4034 !!!nack ('t84.1');
4035 !!!next-token;
4036 next B;
4037 } elsif ($token->{type} == COMMENT_TOKEN) {
4038 my $comment = $self->{document}->create_comment ($token->{data});
4039 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
4040 !!!cp ('t85');
4041 $self->{document}->append_child ($comment);
4042 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
4043 !!!cp ('t86');
4044 $self->{open_elements}->[0]->[0]->append_child ($comment);
4045 } else {
4046 !!!cp ('t87');
4047 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4048 }
4049 !!!next-token;
4050 next B;
4051 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
4052 if ($token->{type} == CHARACTER_TOKEN) {
4053 !!!cp ('t87.1');
4054 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4055 !!!next-token;
4056 next B;
4057 } elsif ($token->{type} == START_TAG_TOKEN) {
4058 if ((not {mglyph => 1, malignmark => 1}->{$token->{tag_name}} and
4059 $self->{open_elements}->[-1]->[1] & FOREIGN_FLOW_CONTENT_EL) or
4060 not ($self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
4061 ($token->{tag_name} eq 'svg' and
4062 $self->{open_elements}->[-1]->[1] & MML_AXML_EL)) {
4063 ## NOTE: "using the rules for secondary insertion mode" then "continue"
4064 !!!cp ('t87.2');
4065 #
4066 } elsif ({
4067 b => 1, big => 1, blockquote => 1, body => 1, br => 1,
4068 center => 1, code => 1, dd => 1, div => 1, dl => 1, dt => 1,
4069 em => 1, embed => 1, font => 1, h1 => 1, h2 => 1, h3 => 1,
4070 h4 => 1, h5 => 1, h6 => 1, head => 1, hr => 1, i => 1,
4071 img => 1, li => 1, listing => 1, menu => 1, meta => 1,
4072 nobr => 1, ol => 1, p => 1, pre => 1, ruby => 1, s => 1,
4073 small => 1, span => 1, strong => 1, strike => 1, sub => 1,
4074 sup => 1, table => 1, tt => 1, u => 1, ul => 1, var => 1,
4075 }->{$token->{tag_name}}) {
4076 !!!cp ('t87.2');
4077 !!!parse-error (type => 'not closed',
4078 text => $self->{open_elements}->[-1]->[0]
4079 ->manakai_local_name,
4080 token => $token);
4081
4082 pop @{$self->{open_elements}}
4083 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4084
4085 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4086 ## Reprocess.
4087 next B;
4088 } else {
4089 my $nsuri = $self->{open_elements}->[-1]->[0]->namespace_uri;
4090 my $tag_name = $token->{tag_name};
4091 if ($nsuri eq $SVG_NS) {
4092 $tag_name = {
4093 altglyph => 'altGlyph',
4094 altglyphdef => 'altGlyphDef',
4095 altglyphitem => 'altGlyphItem',
4096 animatecolor => 'animateColor',
4097 animatemotion => 'animateMotion',
4098 animatetransform => 'animateTransform',
4099 clippath => 'clipPath',
4100 feblend => 'feBlend',
4101 fecolormatrix => 'feColorMatrix',
4102 fecomponenttransfer => 'feComponentTransfer',
4103 fecomposite => 'feComposite',
4104 feconvolvematrix => 'feConvolveMatrix',
4105 fediffuselighting => 'feDiffuseLighting',
4106 fedisplacementmap => 'feDisplacementMap',
4107 fedistantlight => 'feDistantLight',
4108 feflood => 'feFlood',
4109 fefunca => 'feFuncA',
4110 fefuncb => 'feFuncB',
4111 fefuncg => 'feFuncG',
4112 fefuncr => 'feFuncR',
4113 fegaussianblur => 'feGaussianBlur',
4114 feimage => 'feImage',
4115 femerge => 'feMerge',
4116 femergenode => 'feMergeNode',
4117 femorphology => 'feMorphology',
4118 feoffset => 'feOffset',
4119 fepointlight => 'fePointLight',
4120 fespecularlighting => 'feSpecularLighting',
4121 fespotlight => 'feSpotLight',
4122 fetile => 'feTile',
4123 feturbulence => 'feTurbulence',
4124 foreignobject => 'foreignObject',
4125 glyphref => 'glyphRef',
4126 lineargradient => 'linearGradient',
4127 radialgradient => 'radialGradient',
4128 #solidcolor => 'solidColor', ## NOTE: Commented in spec (SVG1.2)
4129 textpath => 'textPath',
4130 }->{$tag_name} || $tag_name;
4131 }
4132
4133 ## "adjust SVG attributes" (SVG only) - done in insert-element-f
4134
4135 ## "adjust foreign attributes" - done in insert-element-f
4136
4137 !!!insert-element-f ($nsuri, $tag_name, $token->{attributes}, $token);
4138
4139 if ($self->{self_closing}) {
4140 pop @{$self->{open_elements}};
4141 !!!ack ('t87.3');
4142 } else {
4143 !!!cp ('t87.4');
4144 }
4145
4146 !!!next-token;
4147 next B;
4148 }
4149 } elsif ($token->{type} == END_TAG_TOKEN) {
4150 ## NOTE: "using the rules for secondary insertion mode" then "continue"
4151 !!!cp ('t87.5');
4152 #
4153 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4154 !!!cp ('t87.6');
4155 !!!parse-error (type => 'not closed',
4156 text => $self->{open_elements}->[-1]->[0]
4157 ->manakai_local_name,
4158 token => $token);
4159
4160 pop @{$self->{open_elements}}
4161 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4162
4163 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4164 ## Reprocess.
4165 next B;
4166 } else {
4167 die "$0: $token->{type}: Unknown token type";
4168 }
4169 }
4170
4171 if ($self->{insertion_mode} & HEAD_IMS) {
4172 if ($token->{type} == CHARACTER_TOKEN) {
4173 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4174 unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4175 !!!cp ('t88.2');
4176 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4177 } else {
4178 !!!cp ('t88.1');
4179 ## Ignore the token.
4180 !!!next-token;
4181 next B;
4182 }
4183 unless (length $token->{data}) {
4184 !!!cp ('t88');
4185 !!!next-token;
4186 next B;
4187 }
4188 }
4189
4190 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4191 !!!cp ('t89');
4192 ## As if <head>
4193 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4194 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4195 push @{$self->{open_elements}},
4196 [$self->{head_element}, $el_category->{head}];
4197
4198 ## Reprocess in the "in head" insertion mode...
4199 pop @{$self->{open_elements}};
4200
4201 ## Reprocess in the "after head" insertion mode...
4202 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4203 !!!cp ('t90');
4204 ## As if </noscript>
4205 pop @{$self->{open_elements}};
4206 !!!parse-error (type => 'in noscript:#text', token => $token);
4207
4208 ## Reprocess in the "in head" insertion mode...
4209 ## As if </head>
4210 pop @{$self->{open_elements}};
4211
4212 ## Reprocess in the "after head" insertion mode...
4213 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4214 !!!cp ('t91');
4215 pop @{$self->{open_elements}};
4216
4217 ## Reprocess in the "after head" insertion mode...
4218 } else {
4219 !!!cp ('t92');
4220 }
4221
4222 ## "after head" insertion mode
4223 ## As if <body>
4224 !!!insert-element ('body',, $token);
4225 $self->{insertion_mode} = IN_BODY_IM;
4226 ## reprocess
4227 next B;
4228 } elsif ($token->{type} == START_TAG_TOKEN) {
4229 if ($token->{tag_name} eq 'head') {
4230 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4231 !!!cp ('t93');
4232 !!!create-element ($self->{head_element}, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
4233 $self->{open_elements}->[-1]->[0]->append_child
4234 ($self->{head_element});
4235 push @{$self->{open_elements}},
4236 [$self->{head_element}, $el_category->{head}];
4237 $self->{insertion_mode} = IN_HEAD_IM;
4238 !!!nack ('t93.1');
4239 !!!next-token;
4240 next B;
4241 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4242 !!!cp ('t93.2');
4243 !!!parse-error (type => 'after head', text => 'head',
4244 token => $token);
4245 ## Ignore the token
4246 !!!nack ('t93.3');
4247 !!!next-token;
4248 next B;
4249 } else {
4250 !!!cp ('t95');
4251 !!!parse-error (type => 'in head:head',
4252 token => $token); # or in head noscript
4253 ## Ignore the token
4254 !!!nack ('t95.1');
4255 !!!next-token;
4256 next B;
4257 }
4258 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4259 !!!cp ('t96');
4260 ## As if <head>
4261 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4262 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4263 push @{$self->{open_elements}},
4264 [$self->{head_element}, $el_category->{head}];
4265
4266 $self->{insertion_mode} = IN_HEAD_IM;
4267 ## Reprocess in the "in head" insertion mode...
4268 } else {
4269 !!!cp ('t97');
4270 }
4271
4272 if ($token->{tag_name} eq 'base') {
4273 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4274 !!!cp ('t98');
4275 ## As if </noscript>
4276 pop @{$self->{open_elements}};
4277 !!!parse-error (type => 'in noscript', text => 'base',
4278 token => $token);
4279
4280 $self->{insertion_mode} = IN_HEAD_IM;
4281 ## Reprocess in the "in head" insertion mode...
4282 } else {
4283 !!!cp ('t99');
4284 }
4285
4286 ## NOTE: There is a "as if in head" code clone.
4287 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4288 !!!cp ('t100');
4289 !!!parse-error (type => 'after head',
4290 text => $token->{tag_name}, token => $token);
4291 push @{$self->{open_elements}},
4292 [$self->{head_element}, $el_category->{head}];
4293 } else {
4294 !!!cp ('t101');
4295 }
4296 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4297 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4298 pop @{$self->{open_elements}} # <head>
4299 if $self->{insertion_mode} == AFTER_HEAD_IM;
4300 !!!nack ('t101.1');
4301 !!!next-token;
4302 next B;
4303 } elsif ($token->{tag_name} eq 'link') {
4304 ## NOTE: There is a "as if in head" code clone.
4305 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4306 !!!cp ('t102');
4307 !!!parse-error (type => 'after head',
4308 text => $token->{tag_name}, token => $token);
4309 push @{$self->{open_elements}},
4310 [$self->{head_element}, $el_category->{head}];
4311 } else {
4312 !!!cp ('t103');
4313 }
4314 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4315 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4316 pop @{$self->{open_elements}} # <head>
4317 if $self->{insertion_mode} == AFTER_HEAD_IM;
4318 !!!ack ('t103.1');
4319 !!!next-token;
4320 next B;
4321 } elsif ($token->{tag_name} eq 'meta') {
4322 ## NOTE: There is a "as if in head" code clone.
4323 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4324 !!!cp ('t104');
4325 !!!parse-error (type => 'after head',
4326 text => $token->{tag_name}, token => $token);
4327 push @{$self->{open_elements}},
4328 [$self->{head_element}, $el_category->{head}];
4329 } else {
4330 !!!cp ('t105');
4331 }
4332 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4333 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4334
4335 unless ($self->{confident}) {
4336 if ($token->{attributes}->{charset}) {
4337 !!!cp ('t106');
4338 ## NOTE: Whether the encoding is supported or not is handled
4339 ## in the {change_encoding} callback.
4340 $self->{change_encoding}
4341 ->($self, $token->{attributes}->{charset}->{value},
4342 $token);
4343
4344 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4345 ->set_user_data (manakai_has_reference =>
4346 $token->{attributes}->{charset}
4347 ->{has_reference});
4348 } elsif ($token->{attributes}->{content}) {
4349 if ($token->{attributes}->{content}->{value}
4350 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4351 [\x09-\x0D\x20]*=
4352 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4353 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
4354 !!!cp ('t107');
4355 ## NOTE: Whether the encoding is supported or not is handled
4356 ## in the {change_encoding} callback.
4357 $self->{change_encoding}
4358 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
4359 $token);
4360 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4361 ->set_user_data (manakai_has_reference =>
4362 $token->{attributes}->{content}
4363 ->{has_reference});
4364 } else {
4365 !!!cp ('t108');
4366 }
4367 }
4368 } else {
4369 if ($token->{attributes}->{charset}) {
4370 !!!cp ('t109');
4371 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4372 ->set_user_data (manakai_has_reference =>
4373 $token->{attributes}->{charset}
4374 ->{has_reference});
4375 }
4376 if ($token->{attributes}->{content}) {
4377 !!!cp ('t110');
4378 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4379 ->set_user_data (manakai_has_reference =>
4380 $token->{attributes}->{content}
4381 ->{has_reference});
4382 }
4383 }
4384
4385 pop @{$self->{open_elements}} # <head>
4386 if $self->{insertion_mode} == AFTER_HEAD_IM;
4387 !!!ack ('t110.1');
4388 !!!next-token;
4389 next B;
4390 } elsif ($token->{tag_name} eq 'title') {
4391 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4392 !!!cp ('t111');
4393 ## As if </noscript>
4394 pop @{$self->{open_elements}};
4395 !!!parse-error (type => 'in noscript', text => 'title',
4396 token => $token);
4397
4398 $self->{insertion_mode} = IN_HEAD_IM;
4399 ## Reprocess in the "in head" insertion mode...
4400 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4401 !!!cp ('t112');
4402 !!!parse-error (type => 'after head',
4403 text => $token->{tag_name}, token => $token);
4404 push @{$self->{open_elements}},
4405 [$self->{head_element}, $el_category->{head}];
4406 } else {
4407 !!!cp ('t113');
4408 }
4409
4410 ## NOTE: There is a "as if in head" code clone.
4411 my $parent = defined $self->{head_element} ? $self->{head_element}
4412 : $self->{open_elements}->[-1]->[0];
4413 $parse_rcdata->(RCDATA_CONTENT_MODEL);
4414 pop @{$self->{open_elements}} # <head>
4415 if $self->{insertion_mode} == AFTER_HEAD_IM;
4416 next B;
4417 } elsif ($token->{tag_name} eq 'style' or
4418 $token->{tag_name} eq 'noframes') {
4419 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
4420 ## insertion mode IN_HEAD_IM)
4421 ## NOTE: There is a "as if in head" code clone.
4422 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4423 !!!cp ('t114');
4424 !!!parse-error (type => 'after head',
4425 text => $token->{tag_name}, token => $token);
4426 push @{$self->{open_elements}},
4427 [$self->{head_element}, $el_category->{head}];
4428 } else {
4429 !!!cp ('t115');
4430 }
4431 $parse_rcdata->(CDATA_CONTENT_MODEL);
4432 pop @{$self->{open_elements}} # <head>
4433 if $self->{insertion_mode} == AFTER_HEAD_IM;
4434 next B;
4435 } elsif ($token->{tag_name} eq 'noscript') {
4436 if ($self->{insertion_mode} == IN_HEAD_IM) {
4437 !!!cp ('t116');
4438 ## NOTE: and scripting is disabled
4439 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4440 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
4441 !!!nack ('t116.1');
4442 !!!next-token;
4443 next B;
4444 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4445 !!!cp ('t117');
4446 !!!parse-error (type => 'in noscript', text => 'noscript',
4447 token => $token);
4448 ## Ignore the token
4449 !!!nack ('t117.1');
4450 !!!next-token;
4451 next B;
4452 } else {
4453 !!!cp ('t118');
4454 #
4455 }
4456 } elsif ($token->{tag_name} eq 'script') {
4457 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4458 !!!cp ('t119');
4459 ## As if </noscript>
4460 pop @{$self->{open_elements}};
4461 !!!parse-error (type => 'in noscript', text => 'script',
4462 token => $token);
4463
4464 $self->{insertion_mode} = IN_HEAD_IM;
4465 ## Reprocess in the "in head" insertion mode...
4466 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4467 !!!cp ('t120');
4468 !!!parse-error (type => 'after head',
4469 text => $token->{tag_name}, token => $token);
4470 push @{$self->{open_elements}},
4471 [$self->{head_element}, $el_category->{head}];
4472 } else {
4473 !!!cp ('t121');
4474 }
4475
4476 ## NOTE: There is a "as if in head" code clone.
4477 $script_start_tag->();
4478 pop @{$self->{open_elements}} # <head>
4479 if $self->{insertion_mode} == AFTER_HEAD_IM;
4480 next B;
4481 } elsif ($token->{tag_name} eq 'body' or
4482 $token->{tag_name} eq 'frameset') {
4483 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4484 !!!cp ('t122');
4485 ## As if </noscript>
4486 pop @{$self->{open_elements}};
4487 !!!parse-error (type => 'in noscript',
4488 text => $token->{tag_name}, token => $token);
4489
4490 ## Reprocess in the "in head" insertion mode...
4491 ## As if </head>
4492 pop @{$self->{open_elements}};
4493
4494 ## Reprocess in the "after head" insertion mode...
4495 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4496 !!!cp ('t124');
4497 pop @{$self->{open_elements}};
4498
4499 ## Reprocess in the "after head" insertion mode...
4500 } else {
4501 !!!cp ('t125');
4502 }
4503
4504 ## "after head" insertion mode
4505 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4506 if ($token->{tag_name} eq 'body') {
4507 !!!cp ('t126');
4508 $self->{insertion_mode} = IN_BODY_IM;
4509 } elsif ($token->{tag_name} eq 'frameset') {
4510 !!!cp ('t127');
4511 $self->{insertion_mode} = IN_FRAMESET_IM;
4512 } else {
4513 die "$0: tag name: $self->{tag_name}";
4514 }
4515 !!!nack ('t127.1');
4516 !!!next-token;
4517 next B;
4518 } else {
4519 !!!cp ('t128');
4520 #
4521 }
4522
4523 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4524 !!!cp ('t129');
4525 ## As if </noscript>
4526 pop @{$self->{open_elements}};
4527 !!!parse-error (type => 'in noscript:/',
4528 text => $token->{tag_name}, token => $token);
4529
4530 ## Reprocess in the "in head" insertion mode...
4531 ## As if </head>
4532 pop @{$self->{open_elements}};
4533
4534 ## Reprocess in the "after head" insertion mode...
4535 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4536 !!!cp ('t130');
4537 ## As if </head>
4538 pop @{$self->{open_elements}};
4539
4540 ## Reprocess in the "after head" insertion mode...
4541 } else {
4542 !!!cp ('t131');
4543 }
4544
4545 ## "after head" insertion mode
4546 ## As if <body>
4547 !!!insert-element ('body',, $token);
4548 $self->{insertion_mode} = IN_BODY_IM;
4549 ## reprocess
4550 !!!ack-later;
4551 next B;
4552 } elsif ($token->{type} == END_TAG_TOKEN) {
4553 if ($token->{tag_name} eq 'head') {
4554 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4555 !!!cp ('t132');
4556 ## As if <head>
4557 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4558 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4559 push @{$self->{open_elements}},
4560 [$self->{head_element}, $el_category->{head}];
4561
4562 ## Reprocess in the "in head" insertion mode...
4563 pop @{$self->{open_elements}};
4564 $self->{insertion_mode} = AFTER_HEAD_IM;
4565 !!!next-token;
4566 next B;
4567 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4568 !!!cp ('t133');
4569 ## As if </noscript>
4570 pop @{$self->{open_elements}};
4571 !!!parse-error (type => 'in noscript:/',
4572 text => 'head', token => $token);
4573
4574 ## Reprocess in the "in head" insertion mode...
4575 pop @{$self->{open_elements}};
4576 $self->{insertion_mode} = AFTER_HEAD_IM;
4577 !!!next-token;
4578 next B;
4579 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4580 !!!cp ('t134');
4581 pop @{$self->{open_elements}};
4582 $self->{insertion_mode} = AFTER_HEAD_IM;
4583 !!!next-token;
4584 next B;
4585 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4586 !!!cp ('t134.1');
4587 !!!parse-error (type => 'unmatched end tag', text => 'head',
4588 token => $token);
4589 ## Ignore the token
4590 !!!next-token;
4591 next B;
4592 } else {
4593 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4594 }
4595 } elsif ($token->{tag_name} eq 'noscript') {
4596 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4597 !!!cp ('t136');
4598 pop @{$self->{open_elements}};
4599 $self->{insertion_mode} = IN_HEAD_IM;
4600 !!!next-token;
4601 next B;
4602 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM or
4603 $self->{insertion_mode} == AFTER_HEAD_IM) {
4604 !!!cp ('t137');
4605 !!!parse-error (type => 'unmatched end tag',
4606 text => 'noscript', token => $token);
4607 ## Ignore the token ## ISSUE: An issue in the spec.
4608 !!!next-token;
4609 next B;
4610 } else {
4611 !!!cp ('t138');
4612 #
4613 }
4614 } elsif ({
4615 body => 1, html => 1,
4616 }->{$token->{tag_name}}) {
4617 if ($self->{insertion_mode} == BEFORE_HEAD_IM or
4618 $self->{insertion_mode} == IN_HEAD_IM or
4619 $self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4620 !!!cp ('t140');
4621 !!!parse-error (type => 'unmatched end tag',
4622 text => $token->{tag_name}, token => $token);
4623 ## Ignore the token
4624 !!!next-token;
4625 next B;
4626 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4627 !!!cp ('t140.1');
4628 !!!parse-error (type => 'unmatched end tag',
4629 text => $token->{tag_name}, token => $token);
4630 ## Ignore the token
4631 !!!next-token;
4632 next B;
4633 } else {
4634 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4635 }
4636 } elsif ($token->{tag_name} eq 'p') {
4637 !!!cp ('t142');
4638 !!!parse-error (type => 'unmatched end tag',
4639 text => $token->{tag_name}, token => $token);
4640 ## Ignore the token
4641 !!!next-token;
4642 next B;
4643 } elsif ($token->{tag_name} eq 'br') {
4644 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4645 !!!cp ('t142.2');
4646 ## (before head) as if <head>, (in head) as if </head>
4647 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4648 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4649 $self->{insertion_mode} = AFTER_HEAD_IM;
4650
4651 ## Reprocess in the "after head" insertion mode...
4652 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4653 !!!cp ('t143.2');
4654 ## As if </head>
4655 pop @{$self->{open_elements}};
4656 $self->{insertion_mode} = AFTER_HEAD_IM;
4657
4658 ## Reprocess in the "after head" insertion mode...
4659 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4660 !!!cp ('t143.3');
4661 ## ISSUE: Two parse errors for <head><noscript></br>
4662 !!!parse-error (type => 'unmatched end tag',
4663 text => 'br', token => $token);
4664 ## As if </noscript>
4665 pop @{$self->{open_elements}};
4666 $self->{insertion_mode} = IN_HEAD_IM;
4667
4668 ## Reprocess in the "in head" insertion mode...
4669 ## As if </head>
4670 pop @{$self->{open_elements}};
4671 $self->{insertion_mode} = AFTER_HEAD_IM;
4672
4673 ## Reprocess in the "after head" insertion mode...
4674 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4675 !!!cp ('t143.4');
4676 #
4677 } else {
4678 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4679 }
4680
4681 ## ISSUE: does not agree with IE7 - it doesn't ignore </br>.
4682 !!!parse-error (type => 'unmatched end tag',
4683 text => 'br', token => $token);
4684 ## Ignore the token
4685 !!!next-token;
4686 next B;
4687 } else {
4688 !!!cp ('t145');
4689 !!!parse-error (type => 'unmatched end tag',
4690 text => $token->{tag_name}, token => $token);
4691 ## Ignore the token
4692 !!!next-token;
4693 next B;
4694 }
4695
4696 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4697 !!!cp ('t146');
4698 ## As if </noscript>
4699 pop @{$self->{open_elements}};
4700 !!!parse-error (type => 'in noscript:/',
4701 text => $token->{tag_name}, token => $token);
4702
4703 ## Reprocess in the "in head" insertion mode...
4704 ## As if </head>
4705 pop @{$self->{open_elements}};
4706
4707 ## Reprocess in the "after head" insertion mode...
4708 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4709 !!!cp ('t147');
4710 ## As if </head>
4711 pop @{$self->{open_elements}};
4712
4713 ## Reprocess in the "after head" insertion mode...
4714 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4715 ## ISSUE: This case cannot be reached?
4716 !!!cp ('t148');
4717 !!!parse-error (type => 'unmatched end tag',
4718 text => $token->{tag_name}, token => $token);
4719 ## Ignore the token ## ISSUE: An issue in the spec.
4720 !!!next-token;
4721 next B;
4722 } else {
4723 !!!cp ('t149');
4724 }
4725
4726 ## "after head" insertion mode
4727 ## As if <body>
4728 !!!insert-element ('body',, $token);
4729 $self->{insertion_mode} = IN_BODY_IM;
4730 ## reprocess
4731 next B;
4732 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4733 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4734 !!!cp ('t149.1');
4735
4736 ## NOTE: As if <head>
4737 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4738 $self->{open_elements}->[-1]->[0]->append_child
4739 ($self->{head_element});
4740 #push @{$self->{open_elements}},
4741 # [$self->{head_element}, $el_category->{head}];
4742 #$self->{insertion_mode} = IN_HEAD_IM;
4743 ## NOTE: Reprocess.
4744
4745 ## NOTE: As if </head>
4746 #pop @{$self->{open_elements}};
4747 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4748 ## NOTE: Reprocess.
4749
4750 #
4751 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4752 !!!cp ('t149.2');
4753
4754 ## NOTE: As if </head>
4755 pop @{$self->{open_elements}};
4756 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4757 ## NOTE: Reprocess.
4758
4759 #
4760 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4761 !!!cp ('t149.3');
4762
4763 !!!parse-error (type => 'in noscript:#eof', token => $token);
4764
4765 ## As if </noscript>
4766 pop @{$self->{open_elements}};
4767 #$self->{insertion_mode} = IN_HEAD_IM;
4768 ## NOTE: Reprocess.
4769
4770 ## NOTE: As if </head>
4771 pop @{$self->{open_elements}};
4772 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4773 ## NOTE: Reprocess.
4774
4775 #
4776 } else {
4777 !!!cp ('t149.4');
4778 #
4779 }
4780
4781 ## NOTE: As if <body>
4782 !!!insert-element ('body',, $token);
4783 $self->{insertion_mode} = IN_BODY_IM;
4784 ## NOTE: Reprocess.
4785 next B;
4786 } else {
4787 die "$0: $token->{type}: Unknown token type";
4788 }
4789
4790 ## ISSUE: An issue in the spec.
4791 } elsif ($self->{insertion_mode} & BODY_IMS) {
4792 if ($token->{type} == CHARACTER_TOKEN) {
4793 !!!cp ('t150');
4794 ## NOTE: There is a code clone of "character in body".
4795 $reconstruct_active_formatting_elements->($insert_to_current);
4796
4797 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4798
4799 !!!next-token;
4800 next B;
4801 } elsif ($token->{type} == START_TAG_TOKEN) {
4802 if ({
4803 caption => 1, col => 1, colgroup => 1, tbody => 1,
4804 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
4805 }->{$token->{tag_name}}) {
4806 if ($self->{insertion_mode} == IN_CELL_IM) {
4807 ## have an element in table scope
4808 for (reverse 0..$#{$self->{open_elements}}) {
4809 my $node = $self->{open_elements}->[$_];
4810 if ($node->[1] & TABLE_CELL_EL) {
4811 !!!cp ('t151');
4812
4813 ## Close the cell
4814 !!!back-token; # <x>
4815 $token = {type => END_TAG_TOKEN,
4816 tag_name => $node->[0]->manakai_local_name,
4817 line => $token->{line},
4818 column => $token->{column}};
4819 next B;
4820 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4821 !!!cp ('t152');
4822 ## ISSUE: This case can never be reached, maybe.
4823 last;
4824 }
4825 }
4826
4827 !!!cp ('t153');
4828 !!!parse-error (type => 'start tag not allowed',
4829 text => $token->{tag_name}, token => $token);
4830 ## Ignore the token
4831 !!!nack ('t153.1');
4832 !!!next-token;
4833 next B;
4834 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4835 !!!parse-error (type => 'not closed', text => 'caption',
4836 token => $token);
4837
4838 ## NOTE: As if </caption>.
4839 ## have a caption element in table scope
4840 my $i;
4841 INSCOPE: {
4842 for (reverse 0..$#{$self->{open_elements}}) {
4843 my $node = $self->{open_elements}->[$_];
4844 if ($node->[1] & CAPTION_EL) {
4845 !!!cp ('t155');
4846 $i = $_;
4847 last INSCOPE;
4848 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4849 !!!cp ('t156');
4850 last;
4851 }
4852 }
4853
4854 !!!cp ('t157');
4855 !!!parse-error (type => 'start tag not allowed',
4856 text => $token->{tag_name}, token => $token);
4857 ## Ignore the token
4858 !!!nack ('t157.1');
4859 !!!next-token;
4860 next B;
4861 } # INSCOPE
4862
4863 ## generate implied end tags
4864 while ($self->{open_elements}->[-1]->[1]
4865 & END_TAG_OPTIONAL_EL) {
4866 !!!cp ('t158');
4867 pop @{$self->{open_elements}};
4868 }
4869
4870 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4871 !!!cp ('t159');
4872 !!!parse-error (type => 'not closed',
4873 text => $self->{open_elements}->[-1]->[0]
4874 ->manakai_local_name,
4875 token => $token);
4876 } else {
4877 !!!cp ('t160');
4878 }
4879
4880 splice @{$self->{open_elements}}, $i;
4881
4882 $clear_up_to_marker->();
4883
4884 $self->{insertion_mode} = IN_TABLE_IM;
4885
4886 ## reprocess
4887 !!!ack-later;
4888 next B;
4889 } else {
4890 !!!cp ('t161');
4891 #
4892 }
4893 } else {
4894 !!!cp ('t162');
4895 #
4896 }
4897 } elsif ($token->{type} == END_TAG_TOKEN) {
4898 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
4899 if ($self->{insertion_mode} == IN_CELL_IM) {
4900 ## have an element in table scope
4901 my $i;
4902 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4903 my $node = $self->{open_elements}->[$_];
4904 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
4905 !!!cp ('t163');
4906 $i = $_;
4907 last INSCOPE;
4908 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4909 !!!cp ('t164');
4910 last INSCOPE;
4911 }
4912 } # INSCOPE
4913 unless (defined $i) {
4914 !!!cp ('t165');
4915 !!!parse-error (type => 'unmatched end tag',
4916 text => $token->{tag_name},
4917 token => $token);
4918 ## Ignore the token
4919 !!!next-token;
4920 next B;
4921 }
4922
4923 ## generate implied end tags
4924 while ($self->{open_elements}->[-1]->[1]
4925 & END_TAG_OPTIONAL_EL) {
4926 !!!cp ('t166');
4927 pop @{$self->{open_elements}};
4928 }
4929
4930 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
4931 ne $token->{tag_name}) {
4932 !!!cp ('t167');
4933 !!!parse-error (type => 'not closed',
4934 text => $self->{open_elements}->[-1]->[0]
4935 ->manakai_local_name,
4936 token => $token);
4937 } else {
4938 !!!cp ('t168');
4939 }
4940
4941 splice @{$self->{open_elements}}, $i;
4942
4943 $clear_up_to_marker->();
4944
4945 $self->{insertion_mode} = IN_ROW_IM;
4946
4947 !!!next-token;
4948 next B;
4949 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4950 !!!cp ('t169');
4951 !!!parse-error (type => 'unmatched end tag',
4952 text => $token->{tag_name}, token => $token);
4953 ## Ignore the token
4954 !!!next-token;
4955 next B;
4956 } else {
4957 !!!cp ('t170');
4958 #
4959 }
4960 } elsif ($token->{tag_name} eq 'caption') {
4961 if ($self->{insertion_mode} == IN_CAPTION_IM) {
4962 ## have a caption element in table scope
4963 my $i;
4964 INSCOPE: {
4965 for (reverse 0..$#{$self->{open_elements}}) {
4966 my $node = $self->{open_elements}->[$_];
4967 if ($node->[1] & CAPTION_EL) {
4968 !!!cp ('t171');
4969 $i = $_;
4970 last INSCOPE;
4971 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4972 !!!cp ('t172');
4973 last;
4974 }
4975 }
4976
4977 !!!cp ('t173');
4978 !!!parse-error (type => 'unmatched end tag',
4979 text => $token->{tag_name}, token => $token);
4980 ## Ignore the token
4981 !!!next-token;
4982 next B;
4983 } # INSCOPE
4984
4985 ## generate implied end tags
4986 while ($self->{open_elements}->[-1]->[1]
4987 & END_TAG_OPTIONAL_EL) {
4988 !!!cp ('t174');
4989 pop @{$self->{open_elements}};
4990 }
4991
4992 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4993 !!!cp ('t175');
4994 !!!parse-error (type => 'not closed',
4995 text => $self->{open_elements}->[-1]->[0]
4996 ->manakai_local_name,
4997 token => $token);
4998 } else {
4999 !!!cp ('t176');
5000 }
5001
5002 splice @{$self->{open_elements}}, $i;
5003
5004 $clear_up_to_marker->();
5005
5006 $self->{insertion_mode} = IN_TABLE_IM;
5007
5008 !!!next-token;
5009 next B;
5010 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
5011 !!!cp ('t177');
5012 !!!parse-error (type => 'unmatched end tag',
5013 text => $token->{tag_name}, token => $token);
5014 ## Ignore the token
5015 !!!next-token;
5016 next B;
5017 } else {
5018 !!!cp ('t178');
5019 #
5020 }
5021 } elsif ({
5022 table => 1, tbody => 1, tfoot => 1,
5023 thead => 1, tr => 1,
5024 }->{$token->{tag_name}} and
5025 $self->{insertion_mode} == IN_CELL_IM) {
5026 ## have an element in table scope
5027 my $i;
5028 my $tn;
5029 INSCOPE: {
5030 for (reverse 0..$#{$self->{open_elements}}) {
5031 my $node = $self->{open_elements}->[$_];
5032 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5033 !!!cp ('t179');
5034 $i = $_;
5035
5036 ## Close the cell
5037 !!!back-token; # </x>
5038 $token = {type => END_TAG_TOKEN, tag_name => $tn,
5039 line => $token->{line},
5040 column => $token->{column}};
5041 next B;
5042 } elsif ($node->[1] & TABLE_CELL_EL) {
5043 !!!cp ('t180');
5044 $tn = $node->[0]->manakai_local_name;
5045 ## NOTE: There is exactly one |td| or |th| element
5046 ## in scope in the stack of open elements by definition.
5047 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5048 ## ISSUE: Can this be reached?
5049 !!!cp ('t181');
5050 last;
5051 }
5052 }
5053
5054 !!!cp ('t182');
5055 !!!parse-error (type => 'unmatched end tag',
5056 text => $token->{tag_name}, token => $token);
5057 ## Ignore the token
5058 !!!next-token;
5059 next B;
5060 } # INSCOPE
5061 } elsif ($token->{tag_name} eq 'table' and
5062 $self->{insertion_mode} == IN_CAPTION_IM) {
5063 !!!parse-error (type => 'not closed', text => 'caption',
5064 token => $token);
5065
5066 ## As if </caption>
5067 ## have a caption element in table scope
5068 my $i;
5069 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5070 my $node = $self->{open_elements}->[$_];
5071 if ($node->[1] & CAPTION_EL) {
5072 !!!cp ('t184');
5073 $i = $_;
5074 last INSCOPE;
5075 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5076 !!!cp ('t185');
5077 last INSCOPE;
5078 }
5079 } # INSCOPE
5080 unless (defined $i) {
5081 !!!cp ('t186');
5082 !!!parse-error (type => 'unmatched end tag',
5083 text => 'caption', token => $token);
5084 ## Ignore the token
5085 !!!next-token;
5086 next B;
5087 }
5088
5089 ## generate implied end tags
5090 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5091 !!!cp ('t187');
5092 pop @{$self->{open_elements}};
5093 }
5094
5095 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5096 !!!cp ('t188');
5097 !!!parse-error (type => 'not closed',
5098 text => $self->{open_elements}->[-1]->[0]
5099 ->manakai_local_name,
5100 token => $token);
5101 } else {
5102 !!!cp ('t189');
5103 }
5104
5105 splice @{$self->{open_elements}}, $i;
5106
5107 $clear_up_to_marker->();
5108
5109 $self->{insertion_mode} = IN_TABLE_IM;
5110
5111 ## reprocess
5112 next B;
5113 } elsif ({
5114 body => 1, col => 1, colgroup => 1, html => 1,
5115 }->{$token->{tag_name}}) {
5116 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
5117 !!!cp ('t190');
5118 !!!parse-error (type => 'unmatched end tag',
5119 text => $token->{tag_name}, token => $token);
5120 ## Ignore the token
5121 !!!next-token;
5122 next B;
5123 } else {
5124 !!!cp ('t191');
5125 #
5126 }
5127 } elsif ({
5128 tbody => 1, tfoot => 1,
5129 thead => 1, tr => 1,
5130 }->{$token->{tag_name}} and
5131 $self->{insertion_mode} == IN_CAPTION_IM) {
5132 !!!cp ('t192');
5133 !!!parse-error (type => 'unmatched end tag',
5134 text => $token->{tag_name}, token => $token);
5135 ## Ignore the token
5136 !!!next-token;
5137 next B;
5138 } else {
5139 !!!cp ('t193');
5140 #
5141 }
5142 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5143 for my $entry (@{$self->{open_elements}}) {
5144 unless ($entry->[1] & ALL_END_TAG_OPTIONAL_EL) {
5145 !!!cp ('t75');
5146 !!!parse-error (type => 'in body:#eof', token => $token);
5147 last;
5148 }
5149 }
5150
5151 ## Stop parsing.
5152 last B;
5153 } else {
5154 die "$0: $token->{type}: Unknown token type";
5155 }
5156
5157 $insert = $insert_to_current;
5158 #
5159 } elsif ($self->{insertion_mode} & TABLE_IMS) {
5160 if ($token->{type} == CHARACTER_TOKEN) {
5161 if (not $open_tables->[-1]->[1] and # tainted
5162 $token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5163 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5164
5165 unless (length $token->{data}) {
5166 !!!cp ('t194');
5167 !!!next-token;
5168 next B;
5169 } else {
5170 !!!cp ('t195');
5171 }
5172 }
5173
5174 !!!parse-error (type => 'in table:#text', token => $token);
5175
5176 ## As if in body, but insert into foster parent element
5177 ## ISSUE: Spec says that "whenever a node would be inserted
5178 ## into the current node" while characters might not
5179 ## result in a new Text node.
5180 $reconstruct_active_formatting_elements->($insert_to_foster);
5181
5182 if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
5183 # MUST
5184 my $foster_parent_element;
5185 my $next_sibling;
5186 my $prev_sibling;
5187 OE: for (reverse 0..$#{$self->{open_elements}}) {
5188 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
5189 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
5190 if (defined $parent and $parent->node_type == 1) {
5191 !!!cp ('t196');
5192 $foster_parent_element = $parent;
5193 $next_sibling = $self->{open_elements}->[$_]->[0];
5194 $prev_sibling = $next_sibling->previous_sibling;
5195 } else {
5196 !!!cp ('t197');
5197 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
5198 $prev_sibling = $foster_parent_element->last_child;
5199 }
5200 last OE;
5201 }
5202 } # OE
5203 $foster_parent_element = $self->{open_elements}->[0]->[0] and
5204 $prev_sibling = $foster_parent_element->last_child
5205 unless defined $foster_parent_element;
5206 if (defined $prev_sibling and
5207 $prev_sibling->node_type == 3) {
5208 !!!cp ('t198');
5209 $prev_sibling->manakai_append_text ($token->{data});
5210 } else {
5211 !!!cp ('t199');
5212 $foster_parent_element->insert_before
5213 ($self->{document}->create_text_node ($token->{data}),
5214 $next_sibling);
5215 }
5216 $open_tables->[-1]->[1] = 1; # tainted
5217 } else {
5218 !!!cp ('t200');
5219 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5220 }
5221
5222 !!!next-token;
5223 next B;
5224 } elsif ($token->{type} == START_TAG_TOKEN) {
5225 if ({
5226 tr => ($self->{insertion_mode} != IN_ROW_IM),
5227 th => 1, td => 1,
5228 }->{$token->{tag_name}}) {
5229 if ($self->{insertion_mode} == IN_TABLE_IM) {
5230 ## Clear back to table context
5231 while (not ($self->{open_elements}->[-1]->[1]
5232 & TABLE_SCOPING_EL)) {
5233 !!!cp ('t201');
5234 pop @{$self->{open_elements}};
5235 }
5236
5237 !!!insert-element ('tbody',, $token);
5238 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5239 ## reprocess in the "in table body" insertion mode...
5240 }
5241
5242 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5243 unless ($token->{tag_name} eq 'tr') {
5244 !!!cp ('t202');
5245 !!!parse-error (type => 'missing start tag:tr', token => $token);
5246 }
5247
5248 ## Clear back to table body context
5249 while (not ($self->{open_elements}->[-1]->[1]
5250 & TABLE_ROWS_SCOPING_EL)) {
5251 !!!cp ('t203');
5252 ## ISSUE: Can this case be reached?
5253 pop @{$self->{open_elements}};
5254 }
5255
5256 $self->{insertion_mode} = IN_ROW_IM;
5257 if ($token->{tag_name} eq 'tr') {
5258 !!!cp ('t204');
5259 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5260 !!!nack ('t204');
5261 !!!next-token;
5262 next B;
5263 } else {
5264 !!!cp ('t205');
5265 !!!insert-element ('tr',, $token);
5266 ## reprocess in the "in row" insertion mode
5267 }
5268 } else {
5269 !!!cp ('t206');
5270 }
5271
5272 ## Clear back to table row context
5273 while (not ($self->{open_elements}->[-1]->[1]
5274 & TABLE_ROW_SCOPING_EL)) {
5275 !!!cp ('t207');
5276 pop @{$self->{open_elements}};
5277 }
5278
5279 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5280 $self->{insertion_mode} = IN_CELL_IM;
5281
5282 push @$active_formatting_elements, ['#marker', ''];
5283
5284 !!!nack ('t207.1');
5285 !!!next-token;
5286 next B;
5287 } elsif ({
5288 caption => 1, col => 1, colgroup => 1,
5289 tbody => 1, tfoot => 1, thead => 1,
5290 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5291 }->{$token->{tag_name}}) {
5292 if ($self->{insertion_mode} == IN_ROW_IM) {
5293 ## As if </tr>
5294 ## have an element in table scope
5295 my $i;
5296 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5297 my $node = $self->{open_elements}->[$_];
5298 if ($node->[1] & TABLE_ROW_EL) {
5299 !!!cp ('t208');
5300 $i = $_;
5301 last INSCOPE;
5302 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5303 !!!cp ('t209');
5304 last INSCOPE;
5305 }
5306 } # INSCOPE
5307 unless (defined $i) {
5308 !!!cp ('t210');
5309 ## TODO: This type is wrong.
5310 !!!parse-error (type => 'unmacthed end tag',
5311 text => $token->{tag_name}, token => $token);
5312 ## Ignore the token
5313 !!!nack ('t210.1');
5314 !!!next-token;
5315 next B;
5316 }
5317
5318 ## Clear back to table row context
5319 while (not ($self->{open_elements}->[-1]->[1]
5320 & TABLE_ROW_SCOPING_EL)) {
5321 !!!cp ('t211');
5322 ## ISSUE: Can this case be reached?
5323 pop @{$self->{open_elements}};
5324 }
5325
5326 pop @{$self->{open_elements}}; # tr
5327 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5328 if ($token->{tag_name} eq 'tr') {
5329 !!!cp ('t212');
5330 ## reprocess
5331 !!!ack-later;
5332 next B;
5333 } else {
5334 !!!cp ('t213');
5335 ## reprocess in the "in table body" insertion mode...
5336 }
5337 }
5338
5339 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5340 ## have an element in table scope
5341 my $i;
5342 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5343 my $node = $self->{open_elements}->[$_];
5344 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5345 !!!cp ('t214');
5346 $i = $_;
5347 last INSCOPE;
5348 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5349 !!!cp ('t215');
5350 last INSCOPE;
5351 }
5352 } # INSCOPE
5353 unless (defined $i) {
5354 !!!cp ('t216');
5355 ## TODO: This error type is wrong.
5356 !!!parse-error (type => 'unmatched end tag',
5357 text => $token->{tag_name}, token => $token);
5358 ## Ignore the token
5359 !!!nack ('t216.1');
5360 !!!next-token;
5361 next B;
5362 }
5363
5364 ## Clear back to table body context
5365 while (not ($self->{open_elements}->[-1]->[1]
5366 & TABLE_ROWS_SCOPING_EL)) {
5367 !!!cp ('t217');
5368 ## ISSUE: Can this state be reached?
5369 pop @{$self->{open_elements}};
5370 }
5371
5372 ## As if <{current node}>
5373 ## have an element in table scope
5374 ## true by definition
5375
5376 ## Clear back to table body context
5377 ## nop by definition
5378
5379 pop @{$self->{open_elements}};
5380 $self->{insertion_mode} = IN_TABLE_IM;
5381 ## reprocess in "in table" insertion mode...
5382 } else {
5383 !!!cp ('t218');
5384 }
5385
5386 if ($token->{tag_name} eq 'col') {
5387 ## Clear back to table context
5388 while (not ($self->{open_elements}->[-1]->[1]
5389 & TABLE_SCOPING_EL)) {
5390 !!!cp ('t219');
5391 ## ISSUE: Can this state be reached?
5392 pop @{$self->{open_elements}};
5393 }
5394
5395 !!!insert-element ('colgroup',, $token);
5396 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
5397 ## reprocess
5398 !!!ack-later;
5399 next B;
5400 } elsif ({
5401 caption => 1,
5402 colgroup => 1,
5403 tbody => 1, tfoot => 1, thead => 1,
5404 }->{$token->{tag_name}}) {
5405 ## Clear back to table context
5406 while (not ($self->{open_elements}->[-1]->[1]
5407 & TABLE_SCOPING_EL)) {
5408 !!!cp ('t220');
5409 ## ISSUE: Can this state be reached?
5410 pop @{$self->{open_elements}};
5411 }
5412
5413 push @$active_formatting_elements, ['#marker', '']
5414 if $token->{tag_name} eq 'caption';
5415
5416 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5417 $self->{insertion_mode} = {
5418 caption => IN_CAPTION_IM,
5419 colgroup => IN_COLUMN_GROUP_IM,
5420 tbody => IN_TABLE_BODY_IM,
5421 tfoot => IN_TABLE_BODY_IM,
5422 thead => IN_TABLE_BODY_IM,
5423 }->{$token->{tag_name}};
5424 !!!next-token;
5425 !!!nack ('t220.1');
5426 next B;
5427 } else {
5428 die "$0: in table: <>: $token->{tag_name}";
5429 }
5430 } elsif ($token->{tag_name} eq 'table') {
5431 !!!parse-error (type => 'not closed',
5432 text => $self->{open_elements}->[-1]->[0]
5433 ->manakai_local_name,
5434 token => $token);
5435
5436 ## As if </table>
5437 ## have a table element in table scope
5438 my $i;
5439 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5440 my $node = $self->{open_elements}->[$_];
5441 if ($node->[1] & TABLE_EL) {
5442 !!!cp ('t221');
5443 $i = $_;
5444 last INSCOPE;
5445 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5446 !!!cp ('t222');
5447 last INSCOPE;
5448 }
5449 } # INSCOPE
5450 unless (defined $i) {
5451 !!!cp ('t223');
5452 ## TODO: The following is wrong, maybe.
5453 !!!parse-error (type => 'unmatched end tag', text => 'table',
5454 token => $token);
5455 ## Ignore tokens </table><table>
5456 !!!nack ('t223.1');
5457 !!!next-token;
5458 next B;
5459 }
5460
5461 ## TODO: The following steps have been removed from the latest spec.
5462 ## generate implied end tags
5463 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5464 !!!cp ('t224');
5465 pop @{$self->{open_elements}};
5466 }
5467
5468 unless ($self->{open_elements}->[-1]->[1] & TABLE_EL) {
5469 !!!cp ('t225');
5470 ## NOTE: |<table><tr><table>|
5471 !!!parse-error (type => 'not closed',
5472 text => $self->{open_elements}->[-1]->[0]
5473 ->manakai_local_name,
5474 token => $token);
5475 } else {
5476 !!!cp ('t226');
5477 }
5478
5479 splice @{$self->{open_elements}}, $i;
5480 pop @{$open_tables};
5481
5482 $self->_reset_insertion_mode;
5483
5484 ## reprocess
5485 !!!ack-later;
5486 next B;
5487 } elsif ($token->{tag_name} eq 'style') {
5488 if (not $open_tables->[-1]->[1]) { # tainted
5489 !!!cp ('t227.8');
5490 ## NOTE: This is a "as if in head" code clone.
5491 $parse_rcdata->(CDATA_CONTENT_MODEL);
5492 next B;
5493 } else {
5494 !!!cp ('t227.7');
5495 #
5496 }
5497 } elsif ($token->{tag_name} eq 'script') {
5498 if (not $open_tables->[-1]->[1]) { # tainted
5499 !!!cp ('t227.6');
5500 ## NOTE: This is a "as if in head" code clone.
5501 $script_start_tag->();
5502 next B;
5503 } else {
5504 !!!cp ('t227.5');
5505 #
5506 }
5507 } elsif ($token->{tag_name} eq 'input') {
5508 if (not $open_tables->[-1]->[1]) { # tainted
5509 if ($token->{attributes}->{type}) { ## TODO: case
5510 my $type = lc $token->{attributes}->{type}->{value};
5511 if ($type eq 'hidden') {
5512 !!!cp ('t227.3');
5513 !!!parse-error (type => 'in table',
5514 text => $token->{tag_name}, token => $token);
5515
5516 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5517
5518 ## TODO: form element pointer
5519
5520 pop @{$self->{open_elements}};
5521
5522 !!!next-token;
5523 !!!ack ('t227.2.1');
5524 next B;
5525 } else {
5526 !!!cp ('t227.2');
5527 #
5528 }
5529 } else {
5530 !!!cp ('t227.1');
5531 #
5532 }
5533 } else {
5534 !!!cp ('t227.4');
5535 #
5536 }
5537 } else {
5538 !!!cp ('t227');
5539 #
5540 }
5541
5542 !!!parse-error (type => 'in table', text => $token->{tag_name},
5543 token => $token);
5544
5545 $insert = $insert_to_foster;
5546 #
5547 } elsif ($token->{type} == END_TAG_TOKEN) {
5548 if ($token->{tag_name} eq 'tr' and
5549 $self->{insertion_mode} == IN_ROW_IM) {
5550 ## have an element in table scope
5551 my $i;
5552 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5553 my $node = $self->{open_elements}->[$_];
5554 if ($node->[1] & TABLE_ROW_EL) {
5555 !!!cp ('t228');
5556 $i = $_;
5557 last INSCOPE;
5558 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5559 !!!cp ('t229');
5560 last INSCOPE;
5561 }
5562 } # INSCOPE
5563 unless (defined $i) {
5564 !!!cp ('t230');
5565 !!!parse-error (type => 'unmatched end tag',
5566 text => $token->{tag_name}, token => $token);
5567 ## Ignore the token
5568 !!!nack ('t230.1');
5569 !!!next-token;
5570 next B;
5571 } else {
5572 !!!cp ('t232');
5573 }
5574
5575 ## Clear back to table row context
5576 while (not ($self->{open_elements}->[-1]->[1]
5577 & TABLE_ROW_SCOPING_EL)) {
5578 !!!cp ('t231');
5579 ## ISSUE: Can this state be reached?
5580 pop @{$self->{open_elements}};
5581 }
5582
5583 pop @{$self->{open_elements}}; # tr
5584 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5585 !!!next-token;
5586 !!!nack ('t231.1');
5587 next B;
5588 } elsif ($token->{tag_name} eq 'table') {
5589 if ($self->{insertion_mode} == IN_ROW_IM) {
5590 ## As if </tr>
5591 ## have an element in table scope
5592 my $i;
5593 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5594 my $node = $self->{open_elements}->[$_];
5595 if ($node->[1] & TABLE_ROW_EL) {
5596 !!!cp ('t233');
5597 $i = $_;
5598 last INSCOPE;
5599 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5600 !!!cp ('t234');
5601 last INSCOPE;
5602 }
5603 } # INSCOPE
5604 unless (defined $i) {
5605 !!!cp ('t235');
5606 ## TODO: The following is wrong.
5607 !!!parse-error (type => 'unmatched end tag',
5608 text => $token->{type}, token => $token);
5609 ## Ignore the token
5610 !!!nack ('t236.1');
5611 !!!next-token;
5612 next B;
5613 }
5614
5615 ## Clear back to table row context
5616 while (not ($self->{open_elements}->[-1]->[1]
5617 & TABLE_ROW_SCOPING_EL)) {
5618 !!!cp ('t236');
5619 ## ISSUE: Can this state be reached?
5620 pop @{$self->{open_elements}};
5621 }
5622
5623 pop @{$self->{open_elements}}; # tr
5624 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5625 ## reprocess in the "in table body" insertion mode...
5626 }
5627
5628 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5629 ## have an element in table scope
5630 my $i;
5631 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5632 my $node = $self->{open_elements}->[$_];
5633 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5634 !!!cp ('t237');
5635 $i = $_;
5636 last INSCOPE;
5637 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5638 !!!cp ('t238');
5639 last INSCOPE;
5640 }
5641 } # INSCOPE
5642 unless (defined $i) {
5643 !!!cp ('t239');
5644 !!!parse-error (type => 'unmatched end tag',
5645 text => $token->{tag_name}, token => $token);
5646 ## Ignore the token
5647 !!!nack ('t239.1');
5648 !!!next-token;
5649 next B;
5650 }
5651
5652 ## Clear back to table body context
5653 while (not ($self->{open_elements}->[-1]->[1]
5654 & TABLE_ROWS_SCOPING_EL)) {
5655 !!!cp ('t240');
5656 pop @{$self->{open_elements}};
5657 }
5658
5659 ## As if <{current node}>
5660 ## have an element in table scope
5661 ## true by definition
5662
5663 ## Clear back to table body context
5664 ## nop by definition
5665
5666 pop @{$self->{open_elements}};
5667 $self->{insertion_mode} = IN_TABLE_IM;
5668 ## reprocess in the "in table" insertion mode...
5669 }
5670
5671 ## NOTE: </table> in the "in table" insertion mode.
5672 ## When you edit the code fragment below, please ensure that
5673 ## the code for <table> in the "in table" insertion mode
5674 ## is synced with it.
5675
5676 ## have a table element in table scope
5677 my $i;
5678 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5679 my $node = $self->{open_elements}->[$_];
5680 if ($node->[1] & TABLE_EL) {
5681 !!!cp ('t241');
5682 $i = $_;
5683 last INSCOPE;
5684 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5685 !!!cp ('t242');
5686 last INSCOPE;
5687 }
5688 } # INSCOPE
5689 unless (defined $i) {
5690 !!!cp ('t243');
5691 !!!parse-error (type => 'unmatched end tag',
5692 text => $token->{tag_name}, token => $token);
5693 ## Ignore the token
5694 !!!nack ('t243.1');
5695 !!!next-token;
5696 next B;
5697 }
5698
5699 splice @{$self->{open_elements}}, $i;
5700 pop @{$open_tables};
5701
5702 $self->_reset_insertion_mode;
5703
5704 !!!next-token;
5705 next B;
5706 } elsif ({
5707 tbody => 1, tfoot => 1, thead => 1,
5708 }->{$token->{tag_name}} and
5709 $self->{insertion_mode} & ROW_IMS) {
5710 if ($self->{insertion_mode} == IN_ROW_IM) {
5711 ## have an element in table scope
5712 my $i;
5713 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5714 my $node = $self->{open_elements}->[$_];
5715 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5716 !!!cp ('t247');
5717 $i = $_;
5718 last INSCOPE;
5719 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5720 !!!cp ('t248');
5721 last INSCOPE;
5722 }
5723 } # INSCOPE
5724 unless (defined $i) {
5725 !!!cp ('t249');
5726 !!!parse-error (type => 'unmatched end tag',
5727 text => $token->{tag_name}, token => $token);
5728 ## Ignore the token
5729 !!!nack ('t249.1');
5730 !!!next-token;
5731 next B;
5732 }
5733
5734 ## As if </tr>
5735 ## have an element in table scope
5736 my $i;
5737 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5738 my $node = $self->{open_elements}->[$_];
5739 if ($node->[1] & TABLE_ROW_EL) {
5740 !!!cp ('t250');
5741 $i = $_;
5742 last INSCOPE;
5743 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5744 !!!cp ('t251');
5745 last INSCOPE;
5746 }
5747 } # INSCOPE
5748 unless (defined $i) {
5749 !!!cp ('t252');
5750 !!!parse-error (type => 'unmatched end tag',
5751 text => 'tr', token => $token);
5752 ## Ignore the token
5753 !!!nack ('t252.1');
5754 !!!next-token;
5755 next B;
5756 }
5757
5758 ## Clear back to table row context
5759 while (not ($self->{open_elements}->[-1]->[1]
5760 & TABLE_ROW_SCOPING_EL)) {
5761 !!!cp ('t253');
5762 ## ISSUE: Can this case be reached?
5763 pop @{$self->{open_elements}};
5764 }
5765
5766 pop @{$self->{open_elements}}; # tr
5767 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5768 ## reprocess in the "in table body" insertion mode...
5769 }
5770
5771 ## have an element in table scope
5772 my $i;
5773 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5774 my $node = $self->{open_elements}->[$_];
5775 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5776 !!!cp ('t254');
5777 $i = $_;
5778 last INSCOPE;
5779 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5780 !!!cp ('t255');
5781 last INSCOPE;
5782 }
5783 } # INSCOPE
5784 unless (defined $i) {
5785 !!!cp ('t256');
5786 !!!parse-error (type => 'unmatched end tag',
5787 text => $token->{tag_name}, token => $token);
5788 ## Ignore the token
5789 !!!nack ('t256.1');
5790 !!!next-token;
5791 next B;
5792 }
5793
5794 ## Clear back to table body context
5795 while (not ($self->{open_elements}->[-1]->[1]
5796 & TABLE_ROWS_SCOPING_EL)) {
5797 !!!cp ('t257');
5798 ## ISSUE: Can this case be reached?
5799 pop @{$self->{open_elements}};
5800 }
5801
5802 pop @{$self->{open_elements}};
5803 $self->{insertion_mode} = IN_TABLE_IM;
5804 !!!nack ('t257.1');
5805 !!!next-token;
5806 next B;
5807 } elsif ({
5808 body => 1, caption => 1, col => 1, colgroup => 1,
5809 html => 1, td => 1, th => 1,
5810 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5811 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
5812 }->{$token->{tag_name}}) {
5813 !!!cp ('t258');
5814 !!!parse-error (type => 'unmatched end tag',
5815 text => $token->{tag_name}, token => $token);
5816 ## Ignore the token
5817 !!!nack ('t258.1');
5818 !!!next-token;
5819 next B;
5820 } else {
5821 !!!cp ('t259');
5822 !!!parse-error (type => 'in table:/',
5823 text => $token->{tag_name}, token => $token);
5824
5825 $insert = $insert_to_foster;
5826 #
5827 }
5828 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5829 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
5830 @{$self->{open_elements}} == 1) { # redundant, maybe
5831 !!!parse-error (type => 'in body:#eof', token => $token);
5832 !!!cp ('t259.1');
5833 #
5834 } else {
5835 !!!cp ('t259.2');
5836 #
5837 }
5838
5839 ## Stop parsing
5840 last B;
5841 } else {
5842 die "$0: $token->{type}: Unknown token type";
5843 }
5844 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
5845 if ($token->{type} == CHARACTER_TOKEN) {
5846 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5847 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5848 unless (length $token->{data}) {
5849 !!!cp ('t260');
5850 !!!next-token;
5851 next B;
5852 }
5853 }
5854
5855 !!!cp ('t261');
5856 #
5857 } elsif ($token->{type} == START_TAG_TOKEN) {
5858 if ($token->{tag_name} eq 'col') {
5859 !!!cp ('t262');
5860 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5861 pop @{$self->{open_elements}};
5862 !!!ack ('t262.1');
5863 !!!next-token;
5864 next B;
5865 } else {
5866 !!!cp ('t263');
5867 #
5868 }
5869 } elsif ($token->{type} == END_TAG_TOKEN) {
5870 if ($token->{tag_name} eq 'colgroup') {
5871 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
5872 !!!cp ('t264');
5873 !!!parse-error (type => 'unmatched end tag',
5874 text => 'colgroup', token => $token);
5875 ## Ignore the token
5876 !!!next-token;
5877 next B;
5878 } else {
5879 !!!cp ('t265');
5880 pop @{$self->{open_elements}}; # colgroup
5881 $self->{insertion_mode} = IN_TABLE_IM;
5882 !!!next-token;
5883 next B;
5884 }
5885 } elsif ($token->{tag_name} eq 'col') {
5886 !!!cp ('t266');
5887 !!!parse-error (type => 'unmatched end tag',
5888 text => 'col', token => $token);
5889 ## Ignore the token
5890 !!!next-token;
5891 next B;
5892 } else {
5893 !!!cp ('t267');
5894 #
5895 }
5896 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5897 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
5898 @{$self->{open_elements}} == 1) { # redundant, maybe
5899 !!!cp ('t270.2');
5900 ## Stop parsing.
5901 last B;
5902 } else {
5903 ## NOTE: As if </colgroup>.
5904 !!!cp ('t270.1');
5905 pop @{$self->{open_elements}}; # colgroup
5906 $self->{insertion_mode} = IN_TABLE_IM;
5907 ## Reprocess.
5908 next B;
5909 }
5910 } else {
5911 die "$0: $token->{type}: Unknown token type";
5912 }
5913
5914 ## As if </colgroup>
5915 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
5916 !!!cp ('t269');
5917 ## TODO: Wrong error type?
5918 !!!parse-error (type => 'unmatched end tag',
5919 text => 'colgroup', token => $token);
5920 ## Ignore the token
5921 !!!nack ('t269.1');
5922 !!!next-token;
5923 next B;
5924 } else {
5925 !!!cp ('t270');
5926 pop @{$self->{open_elements}}; # colgroup
5927 $self->{insertion_mode} = IN_TABLE_IM;
5928 !!!ack-later;
5929 ## reprocess
5930 next B;
5931 }
5932 } elsif ($self->{insertion_mode} & SELECT_IMS) {
5933 if ($token->{type} == CHARACTER_TOKEN) {
5934 !!!cp ('t271');
5935 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5936 !!!next-token;
5937 next B;
5938 } elsif ($token->{type} == START_TAG_TOKEN) {
5939 if ($token->{tag_name} eq 'option') {
5940 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5941 !!!cp ('t272');
5942 ## As if </option>
5943 pop @{$self->{open_elements}};
5944 } else {
5945 !!!cp ('t273');
5946 }
5947
5948 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5949 !!!nack ('t273.1');
5950 !!!next-token;
5951 next B;
5952 } elsif ($token->{tag_name} eq 'optgroup') {
5953 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5954 !!!cp ('t274');
5955 ## As if </option>
5956 pop @{$self->{open_elements}};
5957 } else {
5958 !!!cp ('t275');
5959 }
5960
5961 if ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
5962 !!!cp ('t276');
5963 ## As if </optgroup>
5964 pop @{$self->{open_elements}};
5965 } else {
5966 !!!cp ('t277');
5967 }
5968
5969 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5970 !!!nack ('t277.1');
5971 !!!next-token;
5972 next B;
5973 } elsif ({
5974 select => 1, input => 1, textarea => 1,
5975 }->{$token->{tag_name}} or
5976 ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5977 {
5978 caption => 1, table => 1,
5979 tbody => 1, tfoot => 1, thead => 1,
5980 tr => 1, td => 1, th => 1,
5981 }->{$token->{tag_name}})) {
5982 ## TODO: The type below is not good - <select> is replaced by </select>
5983 !!!parse-error (type => 'not closed', text => 'select',
5984 token => $token);
5985 ## NOTE: As if the token were </select> (<select> case) or
5986 ## as if there were </select> (otherwise).
5987 ## have an element in table scope
5988 my $i;
5989 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5990 my $node = $self->{open_elements}->[$_];
5991 if ($node->[1] & SELECT_EL) {
5992 !!!cp ('t278');
5993 $i = $_;
5994 last INSCOPE;
5995 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5996 !!!cp ('t279');
5997 last INSCOPE;
5998 }
5999 } # INSCOPE
6000 unless (defined $i) {
6001 !!!cp ('t280');
6002 !!!parse-error (type => 'unmatched end tag',
6003 text => 'select', token => $token);
6004 ## Ignore the token
6005 !!!nack ('t280.1');
6006 !!!next-token;
6007 next B;
6008 }
6009
6010 !!!cp ('t281');
6011 splice @{$self->{open_elements}}, $i;
6012
6013 $self->_reset_insertion_mode;
6014
6015 if ($token->{tag_name} eq 'select') {
6016 !!!nack ('t281.2');
6017 !!!next-token;
6018 next B;
6019 } else {
6020 !!!cp ('t281.1');
6021 !!!ack-later;
6022 ## Reprocess the token.
6023 next B;
6024 }
6025 } else {
6026 !!!cp ('t282');
6027 !!!parse-error (type => 'in select',
6028 text => $token->{tag_name}, token => $token);
6029 ## Ignore the token
6030 !!!nack ('t282.1');
6031 !!!next-token;
6032 next B;
6033 }
6034 } elsif ($token->{type} == END_TAG_TOKEN) {
6035 if ($token->{tag_name} eq 'optgroup') {
6036 if ($self->{open_elements}->[-1]->[1] & OPTION_EL and
6037 $self->{open_elements}->[-2]->[1] & OPTGROUP_EL) {
6038 !!!cp ('t283');
6039 ## As if </option>
6040 splice @{$self->{open_elements}}, -2;
6041 } elsif ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
6042 !!!cp ('t284');
6043 pop @{$self->{open_elements}};
6044 } else {
6045 !!!cp ('t285');
6046 !!!parse-error (type => 'unmatched end tag',
6047 text => $token->{tag_name}, token => $token);
6048 ## Ignore the token
6049 }
6050 !!!nack ('t285.1');
6051 !!!next-token;
6052 next B;
6053 } elsif ($token->{tag_name} eq 'option') {
6054 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6055 !!!cp ('t286');
6056 pop @{$self->{open_elements}};
6057 } else {
6058 !!!cp ('t287');
6059 !!!parse-error (type => 'unmatched end tag',
6060 text => $token->{tag_name}, token => $token);
6061 ## Ignore the token
6062 }
6063 !!!nack ('t287.1');
6064 !!!next-token;
6065 next B;
6066 } elsif ($token->{tag_name} eq 'select') {
6067 ## have an element in table scope
6068 my $i;
6069 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6070 my $node = $self->{open_elements}->[$_];
6071 if ($node->[1] & SELECT_EL) {
6072 !!!cp ('t288');
6073 $i = $_;
6074 last INSCOPE;
6075 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6076 !!!cp ('t289');
6077 last INSCOPE;
6078 }
6079 } # INSCOPE
6080 unless (defined $i) {
6081 !!!cp ('t290');
6082 !!!parse-error (type => 'unmatched end tag',
6083 text => $token->{tag_name}, token => $token);
6084 ## Ignore the token
6085 !!!nack ('t290.1');
6086 !!!next-token;
6087 next B;
6088 }
6089
6090 !!!cp ('t291');
6091 splice @{$self->{open_elements}}, $i;
6092
6093 $self->_reset_insertion_mode;
6094
6095 !!!nack ('t291.1');
6096 !!!next-token;
6097 next B;
6098 } elsif ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
6099 {
6100 caption => 1, table => 1, tbody => 1,
6101 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
6102 }->{$token->{tag_name}}) {
6103 ## TODO: The following is wrong?
6104 !!!parse-error (type => 'unmatched end tag',
6105 text => $token->{tag_name}, token => $token);
6106
6107 ## have an element in table scope
6108 my $i;
6109 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6110 my $node = $self->{open_elements}->[$_];
6111 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6112 !!!cp ('t292');
6113 $i = $_;
6114 last INSCOPE;
6115 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6116 !!!cp ('t293');
6117 last INSCOPE;
6118 }
6119 } # INSCOPE
6120 unless (defined $i) {
6121 !!!cp ('t294');
6122 ## Ignore the token
6123 !!!nack ('t294.1');
6124 !!!next-token;
6125 next B;
6126 }
6127
6128 ## As if </select>
6129 ## have an element in table scope
6130 undef $i;
6131 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6132 my $node = $self->{open_elements}->[$_];
6133 if ($node->[1] & SELECT_EL) {
6134 !!!cp ('t295');
6135 $i = $_;
6136 last INSCOPE;
6137 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6138 ## ISSUE: Can this state be reached?
6139 !!!cp ('t296');
6140 last INSCOPE;
6141 }
6142 } # INSCOPE
6143 unless (defined $i) {
6144 !!!cp ('t297');
6145 ## TODO: Is the following error type correct?
6146 !!!parse-error (type => 'unmatched end tag',
6147 text => 'select', token => $token);
6148 ## Ignore the </select> token
6149 !!!nack ('t297.1');
6150 !!!next-token; ## TODO: ok?
6151 next B;
6152 }
6153
6154 !!!cp ('t298');
6155 splice @{$self->{open_elements}}, $i;
6156
6157 $self->_reset_insertion_mode;
6158
6159 !!!ack-later;
6160 ## reprocess
6161 next B;
6162 } else {
6163 !!!cp ('t299');
6164 !!!parse-error (type => 'in select:/',
6165 text => $token->{tag_name}, token => $token);
6166 ## Ignore the token
6167 !!!nack ('t299.3');
6168 !!!next-token;
6169 next B;
6170 }
6171 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6172 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6173 @{$self->{open_elements}} == 1) { # redundant, maybe
6174 !!!cp ('t299.1');
6175 !!!parse-error (type => 'in body:#eof', token => $token);
6176 } else {
6177 !!!cp ('t299.2');
6178 }
6179
6180 ## Stop parsing.
6181 last B;
6182 } else {
6183 die "$0: $token->{type}: Unknown token type";
6184 }
6185 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
6186 if ($token->{type} == CHARACTER_TOKEN) {
6187 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6188 my $data = $1;
6189 ## As if in body
6190 $reconstruct_active_formatting_elements->($insert_to_current);
6191
6192 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6193
6194 unless (length $token->{data}) {
6195 !!!cp ('t300');
6196 !!!next-token;
6197 next B;
6198 }
6199 }
6200
6201 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6202 !!!cp ('t301');
6203 !!!parse-error (type => 'after html:#text', token => $token);
6204
6205 ## Reprocess in the "after body" insertion mode.
6206 } else {
6207 !!!cp ('t302');
6208 }
6209
6210 ## "after body" insertion mode
6211 !!!parse-error (type => 'after body:#text', token => $token);
6212
6213 $self->{insertion_mode} = IN_BODY_IM;
6214 ## reprocess
6215 next B;
6216 } elsif ($token->{type} == START_TAG_TOKEN) {
6217 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6218 !!!cp ('t303');
6219 !!!parse-error (type => 'after html',
6220 text => $token->{tag_name}, token => $token);
6221
6222 ## Reprocess in the "after body" insertion mode.
6223 } else {
6224 !!!cp ('t304');
6225 }
6226
6227 ## "after body" insertion mode
6228 !!!parse-error (type => 'after body',
6229 text => $token->{tag_name}, token => $token);
6230
6231 $self->{insertion_mode} = IN_BODY_IM;
6232 !!!ack-later;
6233 ## reprocess
6234 next B;
6235 } elsif ($token->{type} == END_TAG_TOKEN) {
6236 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6237 !!!cp ('t305');
6238 !!!parse-error (type => 'after html:/',
6239 text => $token->{tag_name}, token => $token);
6240
6241 $self->{insertion_mode} = AFTER_BODY_IM;
6242 ## Reprocess in the "after body" insertion mode.
6243 } else {
6244 !!!cp ('t306');
6245 }
6246
6247 ## "after body" insertion mode
6248 if ($token->{tag_name} eq 'html') {
6249 if (defined $self->{inner_html_node}) {
6250 !!!cp ('t307');
6251 !!!parse-error (type => 'unmatched end tag',
6252 text => 'html', token => $token);
6253 ## Ignore the token
6254 !!!next-token;
6255 next B;
6256 } else {
6257 !!!cp ('t308');
6258 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
6259 !!!next-token;
6260 next B;
6261 }
6262 } else {
6263 !!!cp ('t309');
6264 !!!parse-error (type => 'after body:/',
6265 text => $token->{tag_name}, token => $token);
6266
6267 $self->{insertion_mode} = IN_BODY_IM;
6268 ## reprocess
6269 next B;
6270 }
6271 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6272 !!!cp ('t309.2');
6273 ## Stop parsing
6274 last B;
6275 } else {
6276 die "$0: $token->{type}: Unknown token type";
6277 }
6278 } elsif ($self->{insertion_mode} & FRAME_IMS) {
6279 if ($token->{type} == CHARACTER_TOKEN) {
6280 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6281 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6282
6283 unless (length $token->{data}) {
6284 !!!cp ('t310');
6285 !!!next-token;
6286 next B;
6287 }
6288 }
6289
6290 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
6291 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6292 !!!cp ('t311');
6293 !!!parse-error (type => 'in frameset:#text', token => $token);
6294 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6295 !!!cp ('t312');
6296 !!!parse-error (type => 'after frameset:#text', token => $token);
6297 } else { # "after html frameset"
6298 !!!cp ('t313');
6299 !!!parse-error (type => 'after html:#text', token => $token);
6300
6301 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6302 ## Reprocess in the "after frameset" insertion mode.
6303 !!!parse-error (type => 'after frameset:#text', token => $token);
6304 }
6305
6306 ## Ignore the token.
6307 if (length $token->{data}) {
6308 !!!cp ('t314');
6309 ## reprocess the rest of characters
6310 } else {
6311 !!!cp ('t315');
6312 !!!next-token;
6313 }
6314 next B;
6315 }
6316
6317 die qq[$0: Character "$token->{data}"];
6318 } elsif ($token->{type} == START_TAG_TOKEN) {
6319 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
6320 !!!cp ('t316');
6321 !!!parse-error (type => 'after html',
6322 text => $token->{tag_name}, token => $token);
6323
6324 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6325 ## Process in the "after frameset" insertion mode.
6326 } else {
6327 !!!cp ('t317');
6328 }
6329
6330 if ($token->{tag_name} eq 'frameset' and
6331 $self->{insertion_mode} == IN_FRAMESET_IM) {
6332 !!!cp ('t318');
6333 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6334 !!!nack ('t318.1');
6335 !!!next-token;
6336 next B;
6337 } elsif ($token->{tag_name} eq 'frame' and
6338 $self->{insertion_mode} == IN_FRAMESET_IM) {
6339 !!!cp ('t319');
6340 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6341 pop @{$self->{open_elements}};
6342 !!!ack ('t319.1');
6343 !!!next-token;
6344 next B;
6345 } elsif ($token->{tag_name} eq 'noframes') {
6346 !!!cp ('t320');
6347 ## NOTE: As if in head.
6348 $parse_rcdata->(CDATA_CONTENT_MODEL);
6349 next B;
6350 } else {
6351 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6352 !!!cp ('t321');
6353 !!!parse-error (type => 'in frameset',
6354 text => $token->{tag_name}, token => $token);
6355 } else {
6356 !!!cp ('t322');
6357 !!!parse-error (type => 'after frameset',
6358 text => $token->{tag_name}, token => $token);
6359 }
6360 ## Ignore the token
6361 !!!nack ('t322.1');
6362 !!!next-token;
6363 next B;
6364 }
6365 } elsif ($token->{type} == END_TAG_TOKEN) {
6366 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
6367 !!!cp ('t323');
6368 !!!parse-error (type => 'after html:/',
6369 text => $token->{tag_name}, token => $token);
6370
6371 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6372 ## Process in the "after frameset" insertion mode.
6373 } else {
6374 !!!cp ('t324');
6375 }
6376
6377 if ($token->{tag_name} eq 'frameset' and
6378 $self->{insertion_mode} == IN_FRAMESET_IM) {
6379 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
6380 @{$self->{open_elements}} == 1) {
6381 !!!cp ('t325');
6382 !!!parse-error (type => 'unmatched end tag',
6383 text => $token->{tag_name}, token => $token);
6384 ## Ignore the token
6385 !!!next-token;
6386 } else {
6387 !!!cp ('t326');
6388 pop @{$self->{open_elements}};
6389 !!!next-token;
6390 }
6391
6392 if (not defined $self->{inner_html_node} and
6393 not ($self->{open_elements}->[-1]->[1] & FRAMESET_EL)) {
6394 !!!cp ('t327');
6395 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6396 } else {
6397 !!!cp ('t328');
6398 }
6399 next B;
6400 } elsif ($token->{tag_name} eq 'html' and
6401 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
6402 !!!cp ('t329');
6403 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
6404 !!!next-token;
6405 next B;
6406 } else {
6407 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6408 !!!cp ('t330');
6409 !!!parse-error (type => 'in frameset:/',
6410 text => $token->{tag_name}, token => $token);
6411 } else {
6412 !!!cp ('t331');
6413 !!!parse-error (type => 'after frameset:/',
6414 text => $token->{tag_name}, token => $token);
6415 }
6416 ## Ignore the token
6417 !!!next-token;
6418 next B;
6419 }
6420 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6421 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6422 @{$self->{open_elements}} == 1) { # redundant, maybe
6423 !!!cp ('t331.1');
6424 !!!parse-error (type => 'in body:#eof', token => $token);
6425 } else {
6426 !!!cp ('t331.2');
6427 }
6428
6429 ## Stop parsing
6430 last B;
6431 } else {
6432 die "$0: $token->{type}: Unknown token type";
6433 }
6434
6435 ## ISSUE: An issue in spec here
6436 } else {
6437 die "$0: $self->{insertion_mode}: Unknown insertion mode";
6438 }
6439
6440 ## "in body" insertion mode
6441 if ($token->{type} == START_TAG_TOKEN) {
6442 if ($token->{tag_name} eq 'script') {
6443 !!!cp ('t332');
6444 ## NOTE: This is an "as if in head" code clone
6445 $script_start_tag->();
6446 next B;
6447 } elsif ($token->{tag_name} eq 'style') {
6448 !!!cp ('t333');
6449 ## NOTE: This is an "as if in head" code clone
6450 $parse_rcdata->(CDATA_CONTENT_MODEL);
6451 next B;
6452 } elsif ({
6453 base => 1, link => 1,
6454 }->{$token->{tag_name}}) {
6455 !!!cp ('t334');
6456 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6457 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6458 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6459 !!!ack ('t334.1');
6460 !!!next-token;
6461 next B;
6462 } elsif ($token->{tag_name} eq 'meta') {
6463 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6464 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6465 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6466
6467 unless ($self->{confident}) {
6468 if ($token->{attributes}->{charset}) {
6469 !!!cp ('t335');
6470 ## NOTE: Whether the encoding is supported or not is handled
6471 ## in the {change_encoding} callback.
6472 $self->{change_encoding}
6473 ->($self, $token->{attributes}->{charset}->{value}, $token);
6474
6475 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6476 ->set_user_data (manakai_has_reference =>
6477 $token->{attributes}->{charset}
6478 ->{has_reference});
6479 } elsif ($token->{attributes}->{content}) {
6480 if ($token->{attributes}->{content}->{value}
6481 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
6482 [\x09-\x0D\x20]*=
6483 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
6484 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
6485 !!!cp ('t336');
6486 ## NOTE: Whether the encoding is supported or not is handled
6487 ## in the {change_encoding} callback.
6488 $self->{change_encoding}
6489 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
6490 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6491 ->set_user_data (manakai_has_reference =>
6492 $token->{attributes}->{content}
6493 ->{has_reference});
6494 }
6495 }
6496 } else {
6497 if ($token->{attributes}->{charset}) {
6498 !!!cp ('t337');
6499 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6500 ->set_user_data (manakai_has_reference =>
6501 $token->{attributes}->{charset}
6502 ->{has_reference});
6503 }
6504 if ($token->{attributes}->{content}) {
6505 !!!cp ('t338');
6506 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6507 ->set_user_data (manakai_has_reference =>
6508 $token->{attributes}->{content}
6509 ->{has_reference});
6510 }
6511 }
6512
6513 !!!ack ('t338.1');
6514 !!!next-token;
6515 next B;
6516 } elsif ($token->{tag_name} eq 'title') {
6517 !!!cp ('t341');
6518 ## NOTE: This is an "as if in head" code clone
6519 $parse_rcdata->(RCDATA_CONTENT_MODEL);
6520 next B;
6521 } elsif ($token->{tag_name} eq 'body') {
6522 !!!parse-error (type => 'in body', text => 'body', token => $token);
6523
6524 if (@{$self->{open_elements}} == 1 or
6525 not ($self->{open_elements}->[1]->[1] & BODY_EL)) {
6526 !!!cp ('t342');
6527 ## Ignore the token
6528 } else {
6529 my $body_el = $self->{open_elements}->[1]->[0];
6530 for my $attr_name (keys %{$token->{attributes}}) {
6531 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
6532 !!!cp ('t343');
6533 $body_el->set_attribute_ns
6534 (undef, [undef, $attr_name],
6535 $token->{attributes}->{$attr_name}->{value});
6536 }
6537 }
6538 }
6539 !!!nack ('t343.1');
6540 !!!next-token;
6541 next B;
6542 } elsif ({
6543 address => 1, blockquote => 1, center => 1, dir => 1,
6544 div => 1, dl => 1, fieldset => 1,
6545 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6546 menu => 1, ol => 1, p => 1, ul => 1,
6547 pre => 1, listing => 1,
6548 form => 1,
6549 table => 1,
6550 hr => 1,
6551 }->{$token->{tag_name}}) {
6552 if ($token->{tag_name} eq 'form' and defined $self->{form_element}) {
6553 !!!cp ('t350');
6554 !!!parse-error (type => 'in form:form', token => $token);
6555 ## Ignore the token
6556 !!!nack ('t350.1');
6557 !!!next-token;
6558 next B;
6559 }
6560
6561 ## has a p element in scope
6562 INSCOPE: for (reverse @{$self->{open_elements}}) {
6563 if ($_->[1] & P_EL) {
6564 !!!cp ('t344');
6565 !!!back-token; # <form>
6566 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6567 line => $token->{line}, column => $token->{column}};
6568 next B;
6569 } elsif ($_->[1] & SCOPING_EL) {
6570 !!!cp ('t345');
6571 last INSCOPE;
6572 }
6573 } # INSCOPE
6574
6575 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6576 if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') {
6577 !!!nack ('t346.1');
6578 !!!next-token;
6579 if ($token->{type} == CHARACTER_TOKEN) {
6580 $token->{data} =~ s/^\x0A//;
6581 unless (length $token->{data}) {
6582 !!!cp ('t346');
6583 !!!next-token;
6584 } else {
6585 !!!cp ('t349');
6586 }
6587 } else {
6588 !!!cp ('t348');
6589 }
6590 } elsif ($token->{tag_name} eq 'form') {
6591 !!!cp ('t347.1');
6592 $self->{form_element} = $self->{open_elements}->[-1]->[0];
6593
6594 !!!nack ('t347.2');
6595 !!!next-token;
6596 } elsif ($token->{tag_name} eq 'table') {
6597 !!!cp ('t382');
6598 push @{$open_tables}, [$self->{open_elements}->[-1]->[0]];
6599
6600 $self->{insertion_mode} = IN_TABLE_IM;
6601
6602 !!!nack ('t382.1');
6603 !!!next-token;
6604 } elsif ($token->{tag_name} eq 'hr') {
6605 !!!cp ('t386');
6606 pop @{$self->{open_elements}};
6607
6608 !!!nack ('t386.1');
6609 !!!next-token;
6610 } else {
6611 !!!nack ('t347.1');
6612 !!!next-token;
6613 }
6614 next B;
6615 } elsif ({li => 1, dt => 1, dd => 1}->{$token->{tag_name}}) {
6616 ## has a p element in scope
6617 INSCOPE: for (reverse @{$self->{open_elements}}) {
6618 if ($_->[1] & P_EL) {
6619 !!!cp ('t353');
6620 !!!back-token; # <x>
6621 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6622 line => $token->{line}, column => $token->{column}};
6623 next B;
6624 } elsif ($_->[1] & SCOPING_EL) {
6625 !!!cp ('t354');
6626 last INSCOPE;
6627 }
6628 } # INSCOPE
6629
6630 ## Step 1
6631 my $i = -1;
6632 my $node = $self->{open_elements}->[$i];
6633 my $li_or_dtdd = {li => {li => 1},
6634 dt => {dt => 1, dd => 1},
6635 dd => {dt => 1, dd => 1}}->{$token->{tag_name}};
6636 LI: {
6637 ## Step 2
6638 if ($li_or_dtdd->{$node->[0]->manakai_local_name}) {
6639 if ($i != -1) {
6640 !!!cp ('t355');
6641 !!!parse-error (type => 'not closed',
6642 text => $self->{open_elements}->[-1]->[0]
6643 ->manakai_local_name,
6644 token => $token);
6645 } else {
6646 !!!cp ('t356');
6647 }
6648 splice @{$self->{open_elements}}, $i;
6649 last LI;
6650 } else {
6651 !!!cp ('t357');
6652 }
6653
6654 ## Step 3
6655 if (not ($node->[1] & FORMATTING_EL) and
6656 #not $phrasing_category->{$node->[1]} and
6657 ($node->[1] & SPECIAL_EL or
6658 $node->[1] & SCOPING_EL) and
6659 not ($node->[1] & ADDRESS_EL) and
6660 not ($node->[1] & DIV_EL)) {
6661 !!!cp ('t358');
6662 last LI;
6663 }
6664
6665 !!!cp ('t359');
6666 ## Step 4
6667 $i--;
6668 $node = $self->{open_elements}->[$i];
6669 redo LI;
6670 } # LI
6671
6672 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6673 !!!nack ('t359.1');
6674 !!!next-token;
6675 next B;
6676 } elsif ($token->{tag_name} eq 'plaintext') {
6677 ## has a p element in scope
6678 INSCOPE: for (reverse @{$self->{open_elements}}) {
6679 if ($_->[1] & P_EL) {
6680 !!!cp ('t367');
6681 !!!back-token; # <plaintext>
6682 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6683 line => $token->{line}, column => $token->{column}};
6684 next B;
6685 } elsif ($_->[1] & SCOPING_EL) {
6686 !!!cp ('t368');
6687 last INSCOPE;
6688 }
6689 } # INSCOPE
6690
6691 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6692
6693 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
6694
6695 !!!nack ('t368.1');
6696 !!!next-token;
6697 next B;
6698 } elsif ($token->{tag_name} eq 'a') {
6699 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
6700 my $node = $active_formatting_elements->[$i];
6701 if ($node->[1] & A_EL) {
6702 !!!cp ('t371');
6703 !!!parse-error (type => 'in a:a', token => $token);
6704
6705 !!!back-token; # <a>
6706 $token = {type => END_TAG_TOKEN, tag_name => 'a',
6707 line => $token->{line}, column => $token->{column}};
6708 $formatting_end_tag->($token);
6709
6710 AFE2: for (reverse 0..$#$active_formatting_elements) {
6711 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
6712 !!!cp ('t372');
6713 splice @$active_formatting_elements, $_, 1;
6714 last AFE2;
6715 }
6716 } # AFE2
6717 OE: for (reverse 0..$#{$self->{open_elements}}) {
6718 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
6719 !!!cp ('t373');
6720 splice @{$self->{open_elements}}, $_, 1;
6721 last OE;
6722 }
6723 } # OE
6724 last AFE;
6725 } elsif ($node->[0] eq '#marker') {
6726 !!!cp ('t374');
6727 last AFE;
6728 }
6729 } # AFE
6730
6731 $reconstruct_active_formatting_elements->($insert_to_current);
6732
6733 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6734 push @$active_formatting_elements, $self->{open_elements}->[-1];
6735
6736 !!!nack ('t374.1');
6737 !!!next-token;
6738 next B;
6739 } elsif ($token->{tag_name} eq 'nobr') {
6740 $reconstruct_active_formatting_elements->($insert_to_current);
6741
6742 ## has a |nobr| element in scope
6743 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6744 my $node = $self->{open_elements}->[$_];
6745 if ($node->[1] & NOBR_EL) {
6746 !!!cp ('t376');
6747 !!!parse-error (type => 'in nobr:nobr', token => $token);
6748 !!!back-token; # <nobr>
6749 $token = {type => END_TAG_TOKEN, tag_name => 'nobr',
6750 line => $token->{line}, column => $token->{column}};
6751 next B;
6752 } elsif ($node->[1] & SCOPING_EL) {
6753 !!!cp ('t377');
6754 last INSCOPE;
6755 }
6756 } # INSCOPE
6757
6758 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6759 push @$active_formatting_elements, $self->{open_elements}->[-1];
6760
6761 !!!nack ('t377.1');
6762 !!!next-token;
6763 next B;
6764 } elsif ($token->{tag_name} eq 'button') {
6765 ## has a button element in scope
6766 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6767 my $node = $self->{open_elements}->[$_];
6768 if ($node->[1] & BUTTON_EL) {
6769 !!!cp ('t378');
6770 !!!parse-error (type => 'in button:button', token => $token);
6771 !!!back-token; # <button>
6772 $token = {type => END_TAG_TOKEN, tag_name => 'button',
6773 line => $token->{line}, column => $token->{column}};
6774 next B;
6775 } elsif ($node->[1] & SCOPING_EL) {
6776 !!!cp ('t379');
6777 last INSCOPE;
6778 }
6779 } # INSCOPE
6780
6781 $reconstruct_active_formatting_elements->($insert_to_current);
6782
6783 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6784
6785 ## TODO: associate with $self->{form_element} if defined
6786
6787 push @$active_formatting_elements, ['#marker', ''];
6788
6789 !!!nack ('t379.1');
6790 !!!next-token;
6791 next B;
6792 } elsif ({
6793 xmp => 1,
6794 iframe => 1,
6795 noembed => 1,
6796 noframes => 1, ## NOTE: This is an "as if in head" code clone.
6797 noscript => 0, ## TODO: 1 if scripting is enabled
6798 }->{$token->{tag_name}}) {
6799 if ($token->{tag_name} eq 'xmp') {
6800 !!!cp ('t381');
6801 $reconstruct_active_formatting_elements->($insert_to_current);
6802 } else {
6803 !!!cp ('t399');
6804 }
6805 ## NOTE: There is an "as if in body" code clone.
6806 $parse_rcdata->(CDATA_CONTENT_MODEL);
6807 next B;
6808 } elsif ($token->{tag_name} eq 'isindex') {
6809 !!!parse-error (type => 'isindex', token => $token);
6810
6811 if (defined $self->{form_element}) {
6812 !!!cp ('t389');
6813 ## Ignore the token
6814 !!!nack ('t389'); ## NOTE: Not acknowledged.
6815 !!!next-token;
6816 next B;
6817 } else {
6818 !!!ack ('t391.1');
6819
6820 my $at = $token->{attributes};
6821 my $form_attrs;
6822 $form_attrs->{action} = $at->{action} if $at->{action};
6823 my $prompt_attr = $at->{prompt};
6824 $at->{name} = {name => 'name', value => 'isindex'};
6825 delete $at->{action};
6826 delete $at->{prompt};
6827 my @tokens = (
6828 {type => START_TAG_TOKEN, tag_name => 'form',
6829 attributes => $form_attrs,
6830 line => $token->{line}, column => $token->{column}},
6831 {type => START_TAG_TOKEN, tag_name => 'hr',
6832 line => $token->{line}, column => $token->{column}},
6833 {type => START_TAG_TOKEN, tag_name => 'p',
6834 line => $token->{line}, column => $token->{column}},
6835 {type => START_TAG_TOKEN, tag_name => 'label',
6836 line => $token->{line}, column => $token->{column}},
6837 );
6838 if ($prompt_attr) {
6839 !!!cp ('t390');
6840 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value},
6841 #line => $token->{line}, column => $token->{column},
6842 };
6843 } else {
6844 !!!cp ('t391');
6845 push @tokens, {type => CHARACTER_TOKEN,
6846 data => 'This is a searchable index. Insert your search keywords here: ',
6847 #line => $token->{line}, column => $token->{column},
6848 }; # SHOULD
6849 ## TODO: make this configurable
6850 }
6851 push @tokens,
6852 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at,
6853 line => $token->{line}, column => $token->{column}},
6854 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
6855 {type => END_TAG_TOKEN, tag_name => 'label',
6856 line => $token->{line}, column => $token->{column}},
6857 {type => END_TAG_TOKEN, tag_name => 'p',
6858 line => $token->{line}, column => $token->{column}},
6859 {type => START_TAG_TOKEN, tag_name => 'hr',
6860 line => $token->{line}, column => $token->{column}},
6861 {type => END_TAG_TOKEN, tag_name => 'form',
6862 line => $token->{line}, column => $token->{column}};
6863 !!!back-token (@tokens);
6864 !!!next-token;
6865 next B;
6866 }
6867 } elsif ($token->{tag_name} eq 'textarea') {
6868 my $tag_name = $token->{tag_name};
6869 my $el;
6870 !!!create-element ($el, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
6871
6872 ## TODO: $self->{form_element} if defined
6873 $self->{content_model} = RCDATA_CONTENT_MODEL;
6874 delete $self->{escape}; # MUST
6875
6876 $insert->($el);
6877
6878 my $text = '';
6879 !!!nack ('t392.1');
6880 !!!next-token;
6881 if ($token->{type} == CHARACTER_TOKEN) {
6882 $token->{data} =~ s/^\x0A//;
6883 unless (length $token->{data}) {
6884 !!!cp ('t392');
6885 !!!next-token;
6886 } else {
6887 !!!cp ('t393');
6888 }
6889 } else {
6890 !!!cp ('t394');
6891 }
6892 while ($token->{type} == CHARACTER_TOKEN) {
6893 !!!cp ('t395');
6894 $text .= $token->{data};
6895 !!!next-token;
6896 }
6897 if (length $text) {
6898 !!!cp ('t396');
6899 $el->manakai_append_text ($text);
6900 }
6901
6902 $self->{content_model} = PCDATA_CONTENT_MODEL;
6903
6904 if ($token->{type} == END_TAG_TOKEN and
6905 $token->{tag_name} eq $tag_name) {
6906 !!!cp ('t397');
6907 ## Ignore the token
6908 } else {
6909 !!!cp ('t398');
6910 !!!parse-error (type => 'in RCDATA:#eof', token => $token);
6911 }
6912 !!!next-token;
6913 next B;
6914 } elsif ($token->{tag_name} eq 'rt' or
6915 $token->{tag_name} eq 'rp') {
6916 ## has a |ruby| element in scope
6917 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6918 my $node = $self->{open_elements}->[$_];
6919 if ($node->[1] & RUBY_EL) {
6920 !!!cp ('t398.1');
6921 ## generate implied end tags
6922 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
6923 !!!cp ('t398.2');
6924 pop @{$self->{open_elements}};
6925 }
6926 unless ($self->{open_elements}->[-1]->[1] & RUBY_EL) {
6927 !!!cp ('t398.3');
6928 !!!parse-error (type => 'not closed',
6929 text => $self->{open_elements}->[-1]->[0]
6930 ->manakai_local_name,
6931 token => $token);
6932 pop @{$self->{open_elements}}
6933 while not $self->{open_elements}->[-1]->[1] & RUBY_EL;
6934 }
6935 last INSCOPE;
6936 } elsif ($node->[1] & SCOPING_EL) {
6937 !!!cp ('t398.4');
6938 last INSCOPE;
6939 }
6940 } # INSCOPE
6941
6942 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6943
6944 !!!nack ('t398.5');
6945 !!!next-token;
6946 redo B;
6947 } elsif ($token->{tag_name} eq 'math' or
6948 $token->{tag_name} eq 'svg') {
6949 $reconstruct_active_formatting_elements->($insert_to_current);
6950
6951 ## "Adjust MathML attributes" ('math' only) - done in insert-element-f
6952
6953 ## "adjust SVG attributes" ('svg' only) - done in insert-element-f
6954
6955 ## "adjust foreign attributes" - done in insert-element-f
6956
6957 !!!insert-element-f ($token->{tag_name} eq 'math' ? $MML_NS : $SVG_NS, $token->{tag_name}, $token->{attributes}, $token);
6958
6959 if ($self->{self_closing}) {
6960 pop @{$self->{open_elements}};
6961 !!!ack ('t398.1');
6962 } else {
6963 !!!cp ('t398.2');
6964 $self->{insertion_mode} |= IN_FOREIGN_CONTENT_IM;
6965 ## NOTE: |<body><math><mi><svg>| -> "in foreign content" insertion
6966 ## mode, "in body" (not "in foreign content") secondary insertion
6967 ## mode, maybe.
6968 }
6969
6970 !!!next-token;
6971 next B;
6972 } elsif ({
6973 caption => 1, col => 1, colgroup => 1, frame => 1,
6974 frameset => 1, head => 1, option => 1, optgroup => 1,
6975 tbody => 1, td => 1, tfoot => 1, th => 1,
6976 thead => 1, tr => 1,
6977 }->{$token->{tag_name}}) {
6978 !!!cp ('t401');
6979 !!!parse-error (type => 'in body',
6980 text => $token->{tag_name}, token => $token);
6981 ## Ignore the token
6982 !!!nack ('t401.1'); ## NOTE: |<col/>| or |<frame/>| here is an error.
6983 !!!next-token;
6984 next B;
6985
6986 ## ISSUE: An issue on HTML5 new elements in the spec.
6987 } else {
6988 if ($token->{tag_name} eq 'image') {
6989 !!!cp ('t384');
6990 !!!parse-error (type => 'image', token => $token);
6991 $token->{tag_name} = 'img';
6992 } else {
6993 !!!cp ('t385');
6994 }
6995
6996 ## NOTE: There is an "as if <br>" code clone.
6997 $reconstruct_active_formatting_elements->($insert_to_current);
6998
6999 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7000
7001 if ({
7002 applet => 1, marquee => 1, object => 1,
7003 }->{$token->{tag_name}}) {
7004 !!!cp ('t380');
7005 push @$active_formatting_elements, ['#marker', ''];
7006 !!!nack ('t380.1');
7007 } elsif ({ ## start tags of HTML5 formatting elements (|a| and |nobr| are handled above)
7008 b => 1, big => 1, em => 1, font => 1, i => 1,
7009 s => 1, small => 1, strike => 1, ## was misspelled |strile|, so <strike> was never tracked
7010 strong => 1, tt => 1, u => 1,
7011 }->{$token->{tag_name}}) {
7012 !!!cp ('t375');
7013 push @$active_formatting_elements, $self->{open_elements}->[-1]; ## the element just inserted
7014 !!!nack ('t375.1');
7015 } elsif ($token->{tag_name} eq 'input') {
7016 !!!cp ('t388');
7017 ## TODO: associate with $self->{form_element} if defined
7018 pop @{$self->{open_elements}};
7019 !!!ack ('t388.2');
7020 } elsif ({
7021 area => 1, basefont => 1, bgsound => 1, br => 1,
7022 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
7023 #image => 1,
7024 }->{$token->{tag_name}}) {
7025 !!!cp ('t388.1');
7026 pop @{$self->{open_elements}};
7027 !!!ack ('t388.3');
7028 } elsif ($token->{tag_name} eq 'select') {
7029 ## TODO: associate with $self->{form_element} if defined
7030
7031 if ($self->{insertion_mode} & TABLE_IMS or
7032 $self->{insertion_mode} & BODY_TABLE_IMS or
7033 $self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
7034 !!!cp ('t400.1');
7035 $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM;
7036 } else {
7037 !!!cp ('t400.2');
7038 $self->{insertion_mode} = IN_SELECT_IM;
7039 }
7040 !!!nack ('t400.3');
7041 } else {
7042 !!!nack ('t402');
7043 }
7044
7045 !!!next-token;
7046 next B;
7047 }
7048 } elsif ($token->{type} == END_TAG_TOKEN) {
7049 if ($token->{tag_name} eq 'body') {
7050 ## has a |body| element in scope
7051 my $i;
7052 INSCOPE: { ## leaves this block only if a |body| element is in scope
7053 for (reverse @{$self->{open_elements}}) {
7054 if ($_->[1] & BODY_EL) {
7055 !!!cp ('t405');
7056 $i = $_;
7057 last INSCOPE;
7058 } elsif ($_->[1] & SCOPING_EL) {
7059 !!!cp ('t405.1');
7060 last; ## scope boundary reached; fall through to the error below
7061 }
7062 }
7063
7064 !!!parse-error (type => 'unmatched end tag', ## the token is an end tag; same type as other unmatched end tags
7065 text => $token->{tag_name}, token => $token);
7066 ## NOTE: Ignore the token.
7067 !!!next-token;
7068 next B;
7069 } # INSCOPE
7070
7071 for (@{$self->{open_elements}}) {
7072 unless ($_->[1] & ALL_END_TAG_OPTIONAL_EL) {
7073 !!!cp ('t403');
7074 !!!parse-error (type => 'not closed',
7075 text => $_->[0]->manakai_local_name,
7076 token => $token);
7077 last;
7078 } else {
7079 !!!cp ('t404');
7080 }
7081 }
7082
7083 $self->{insertion_mode} = AFTER_BODY_IM;
7084 !!!next-token;
7085 next B;
7086 } elsif ($token->{tag_name} eq 'html') {
7087 ## TODO: Update this code. It seems that the code below is not
7088 ## up-to-date, though it has same effect as speced.
7089 if (@{$self->{open_elements}} > 1 and
7090 $self->{open_elements}->[1]->[1] & BODY_EL) {
7091 ## ISSUE: There is an issue in the spec.
7092 unless ($self->{open_elements}->[-1]->[1] & BODY_EL) {
7093 !!!cp ('t406');
7094 !!!parse-error (type => 'not closed',
7095 text => $self->{open_elements}->[1]->[0]
7096 ->manakai_local_name,
7097 token => $token);
7098 } else {
7099 !!!cp ('t407');
7100 }
7101 $self->{insertion_mode} = AFTER_BODY_IM;
7102 ## reprocess
7103 next B;
7104 } else {
7105 !!!cp ('t408');
7106 !!!parse-error (type => 'unmatched end tag',
7107 text => $token->{tag_name}, token => $token);
7108 ## Ignore the token
7109 !!!next-token;
7110 next B;
7111 }
7112 } elsif ({
7113 address => 1, blockquote => 1, center => 1, dir => 1,
7114 div => 1, dl => 1, fieldset => 1, listing => 1,
7115 menu => 1, ol => 1, pre => 1, ul => 1,
7116 dd => 1, dt => 1, li => 1,
7117 applet => 1, button => 1, marquee => 1, object => 1,
7118 }->{$token->{tag_name}}) {
7119 ## has an element in scope
7120 my $i;
7121 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7122 my $node = $self->{open_elements}->[$_];
7123 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7124 !!!cp ('t410');
7125 $i = $_;
7126 last INSCOPE;
7127 } elsif ($node->[1] & SCOPING_EL) {
7128 !!!cp ('t411');
7129 last INSCOPE;
7130 }
7131 } # INSCOPE
7132
7133 unless (defined $i) { # has an element in scope
7134 !!!cp ('t413');
7135 !!!parse-error (type => 'unmatched end tag',
7136 text => $token->{tag_name}, token => $token);
7137 } else {
7138 ## Step 1. generate implied end tags
7139 while ({
7140 ## END_TAG_OPTIONAL_EL
7141 dd => ($token->{tag_name} ne 'dd'),
7142 dt => ($token->{tag_name} ne 'dt'),
7143 li => ($token->{tag_name} ne 'li'),
7144 p => 1,
7145 rt => 1,
7146 rp => 1,
7147 }->{$self->{open_elements}->[-1]->[0]->manakai_local_name}) {
7148 !!!cp ('t409');
7149 pop @{$self->{open_elements}};
7150 }
7151
7152 ## Step 2.
7153 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7154 ne $token->{tag_name}) {
7155 !!!cp ('t412');
7156 !!!parse-error (type => 'not closed',
7157 text => $self->{open_elements}->[-1]->[0]
7158 ->manakai_local_name,
7159 token => $token);
7160 } else {
7161 !!!cp ('t414');
7162 }
7163
7164 ## Step 3.
7165 splice @{$self->{open_elements}}, $i;
7166
7167 ## Step 4.
7168 $clear_up_to_marker->()
7169 if {
7170 applet => 1, button => 1, marquee => 1, object => 1,
7171 }->{$token->{tag_name}};
7172 }
7173 !!!next-token;
7174 next B;
7175 } elsif ($token->{tag_name} eq 'form') {
7176 undef $self->{form_element};
7177
7178 ## has an element in scope
7179 my $i;
7180 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7181 my $node = $self->{open_elements}->[$_];
7182 if ($node->[1] & FORM_EL) {
7183 !!!cp ('t418');
7184 $i = $_;
7185 last INSCOPE;
7186 } elsif ($node->[1] & SCOPING_EL) {
7187 !!!cp ('t419');
7188 last INSCOPE;
7189 }
7190 } # INSCOPE
7191
7192 unless (defined $i) { # has an element in scope
7193 !!!cp ('t421');
7194 !!!parse-error (type => 'unmatched end tag',
7195 text => $token->{tag_name}, token => $token);
7196 } else {
7197 ## Step 1. generate implied end tags
7198 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7199 !!!cp ('t417');
7200 pop @{$self->{open_elements}};
7201 }
7202
7203 ## Step 2.
7204 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7205 ne $token->{tag_name}) {
7206 !!!cp ('t417.1');
7207 !!!parse-error (type => 'not closed',
7208 text => $self->{open_elements}->[-1]->[0]
7209 ->manakai_local_name,
7210 token => $token);
7211 } else {
7212 !!!cp ('t420');
7213 }
7214
7215 ## Step 3.
7216 splice @{$self->{open_elements}}, $i;
7217 }
7218
7219 !!!next-token;
7220 next B;
7221 } elsif ({
7222 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
7223 }->{$token->{tag_name}}) {
7224 ## has an element in scope
7225 my $i;
7226 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7227 my $node = $self->{open_elements}->[$_];
7228 if ($node->[1] & HEADING_EL) {
7229 !!!cp ('t423');
7230 $i = $_;
7231 last INSCOPE;
7232 } elsif ($node->[1] & SCOPING_EL) {
7233 !!!cp ('t424');
7234 last INSCOPE;
7235 }
7236 } # INSCOPE
7237
7238 unless (defined $i) { # has an element in scope
7239 !!!cp ('t425.1');
7240 !!!parse-error (type => 'unmatched end tag',
7241 text => $token->{tag_name}, token => $token);
7242 } else {
7243 ## Step 1. generate implied end tags
7244 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7245 !!!cp ('t422');
7246 pop @{$self->{open_elements}};
7247 }
7248
7249 ## Step 2.
7250 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7251 ne $token->{tag_name}) {
7252 !!!cp ('t425');
7253 !!!parse-error (type => 'unmatched end tag',
7254 text => $token->{tag_name}, token => $token);
7255 } else {
7256 !!!cp ('t426');
7257 }
7258
7259 ## Step 3.
7260 splice @{$self->{open_elements}}, $i;
7261 }
7262
7263 !!!next-token;
7264 next B;
7265 } elsif ($token->{tag_name} eq 'p') {
7266 ## has an element in scope
7267 my $i;
7268 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7269 my $node = $self->{open_elements}->[$_];
7270 if ($node->[1] & P_EL) {
7271 !!!cp ('t410.1');
7272 $i = $_;
7273 last INSCOPE;
7274 } elsif ($node->[1] & SCOPING_EL) {
7275 !!!cp ('t411.1');
7276 last INSCOPE;
7277 }
7278 } # INSCOPE
7279
7280 if (defined $i) {
7281 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7282 ne $token->{tag_name}) {
7283 !!!cp ('t412.1');
7284 !!!parse-error (type => 'not closed',
7285 text => $self->{open_elements}->[-1]->[0]
7286 ->manakai_local_name,
7287 token => $token);
7288 } else {
7289 !!!cp ('t414.1');
7290 }
7291
7292 splice @{$self->{open_elements}}, $i;
7293 } else {
7294 !!!cp ('t413.1');
7295 !!!parse-error (type => 'unmatched end tag',
7296 text => $token->{tag_name}, token => $token);
7297
7298 !!!cp ('t415.1');
7299 ## As if <p>, then reprocess the current token
7300 my $el;
7301 !!!create-element ($el, $HTML_NS, 'p',, $token);
7302 $insert->($el);
7303 ## NOTE: Not inserted into |$self->{open_elements}|.
7304 }
7305
7306 !!!next-token;
7307 next B;
7308 } elsif ({
7309 a => 1,
7310 b => 1, big => 1, em => 1, font => 1, i => 1,
7311 nobr => 1, s => 1, small => 1, strile => 1,
7312 strong => 1, tt => 1, u => 1,
7313 }->{$token->{tag_name}}) {
7314 !!!cp ('t427');
7315 $formatting_end_tag->($token);
7316 next B;
7317 } elsif ($token->{tag_name} eq 'br') {
7318 !!!cp ('t428');
7319 !!!parse-error (type => 'unmatched end tag',
7320 text => 'br', token => $token);
7321
7322 ## As if <br>
7323 $reconstruct_active_formatting_elements->($insert_to_current);
7324
7325 my $el;
7326 !!!create-element ($el, $HTML_NS, 'br',, $token);
7327 $insert->($el);
7328
7329 ## Ignore the token.
7330 !!!next-token;
7331 next B;
7332 } elsif ({
7333 caption => 1, col => 1, colgroup => 1, frame => 1,
7334 frameset => 1, head => 1, option => 1, optgroup => 1,
7335 tbody => 1, td => 1, tfoot => 1, th => 1,
7336 thead => 1, tr => 1,
7337 area => 1, basefont => 1, bgsound => 1,
7338 embed => 1, hr => 1, iframe => 1, image => 1,
7339 img => 1, input => 1, isindex => 1, noembed => 1,
7340 noframes => 1, param => 1, select => 1, spacer => 1,
7341 table => 1, textarea => 1, wbr => 1,
7342 noscript => 0, ## TODO: if scripting is enabled
7343 }->{$token->{tag_name}}) {
7344 !!!cp ('t429');
7345 !!!parse-error (type => 'unmatched end tag',
7346 text => $token->{tag_name}, token => $token);
7347 ## Ignore the token
7348 !!!next-token;
7349 next B;
7350
7351 ## ISSUE: Issue on HTML5 new elements in spec
7352
7353 } else {
7354 ## Step 1
7355 my $node_i = -1;
7356 my $node = $self->{open_elements}->[$node_i];
7357
7358 ## Step 2
7359 S2: {
7360 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7361 ## Step 1
7362 ## generate implied end tags
7363 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7364 !!!cp ('t430');
7365 ## NOTE: |<ruby><rt></ruby>|.
7366 ## ISSUE: <ruby><rt></rt> will also take this code path,
7367 ## which seems wrong.
7368 pop @{$self->{open_elements}};
7369 $node_i++;
7370 }
7371
7372 ## Step 2
7373 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7374 ne $token->{tag_name}) {
7375 !!!cp ('t431');
7376 ## NOTE: <x><y></x>
7377 !!!parse-error (type => 'not closed',
7378 text => $self->{open_elements}->[-1]->[0]
7379 ->manakai_local_name,
7380 token => $token);
7381 } else {
7382 !!!cp ('t432');
7383 }
7384
7385 ## Step 3
7386 splice @{$self->{open_elements}}, $node_i if $node_i < 0;
7387
7388 !!!next-token;
7389 last S2;
7390 } else {
7391 ## Step 3
7392 if (not ($node->[1] & FORMATTING_EL) and
7393 #not $phrasing_category->{$node->[1]} and
7394 ($node->[1] & SPECIAL_EL or
7395 $node->[1] & SCOPING_EL)) {
7396 !!!cp ('t433');
7397 !!!parse-error (type => 'unmatched end tag',
7398 text => $token->{tag_name}, token => $token);
7399 ## Ignore the token
7400 !!!next-token;
7401 last S2;
7402 }
7403
7404 !!!cp ('t434');
7405 }
7406
7407 ## Step 4
7408 $node_i--;
7409 $node = $self->{open_elements}->[$node_i];
7410
7411 ## Step 5;
7412 redo S2;
7413 } # S2
7414 next B;
7415 }
7416 }
7417 next B;
7418 } continue { # B
7419 if ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
7420 ## NOTE: The code below is executed in cases where it does not have
7421 ## to be, but it is harmless even in those cases.
7422 ## has an element in scope
7423 INSCOPE: {
7424 for (reverse 0..$#{$self->{open_elements}}) {
7425 my $node = $self->{open_elements}->[$_];
7426 if ($node->[1] & FOREIGN_EL) {
7427 last INSCOPE;
7428 } elsif ($node->[1] & SCOPING_EL) {
7429 last;
7430 }
7431 }
7432
7433 ## NOTE: No foreign element in scope.
7434 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
7435 } # INSCOPE
7436 }
7437 } # B
7438
7439 ## Stop parsing # MUST
7440
7441 ## TODO: script stuffs
7442 } # _tree_construct_main
7443
## set_inner_html ($class, $node, $s, $onerror)
##
## Replaces the children of |$node| with the result of parsing the
## string |$s| as HTML (the |innerHTML| setter algorithm).
##
##   $node    - The DOM node whose content is replaced.  Only document
##              nodes (|node_type| 9) and element nodes (|node_type| 1)
##              are supported.
##   $s       - The markup to parse.  It is accessed through a reference
##              to the caller's argument and is not copied.
##   $onerror - Optional code reference invoked for each parse error
##              with named arguments (|line|, |column| and the
##              arguments given by the parser).  If it is false, a
##              default handler that |warn|s is used for element nodes.
##
## For a document node, the existing children are removed and the
## string is parsed by |parse_string| into the document itself.  For an
## element node, the string is parsed as a fragment into a temporary
## document, and then the children of the temporary root element are
## moved into |$node|.
##
## Dies if |$node| is neither a document nor an element.
sub set_inner_html ($$$) {
  my $class = shift;
  my $node = shift;
  my $s = \$_[0];
  my $onerror = $_[1];

  ## ISSUE: Should {confident} be true?

  my $nt = $node->node_type;
  if ($nt == 9) {
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## NOTE: The child list is copied first, since it is modified
    ## while being iterated.
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }

    ## Step 3, 4, 5 # MUST
    $class->parse_string ($$s => $node, $onerror);
  } elsif ($nt == 1) {
    ## TODO: If non-html element

    ## NOTE: Most of this code is copied from |parse_string|

    ## Step 1 # MUST
    ## Create a temporary HTML document and a parser bound to it.
    my $this_doc = $node->owner_document;
    my $doc = $this_doc->implementation->create_document;
    $doc->manakai_is_html (1);
    my $p = $class->new;
    $p->{document} = $doc;

    ## Step 8 # MUST
    ## Set up the input stream: |$i| is the index of the next character
    ## of |$$s| to be read.
    my $i = 0;
    $p->{line_prev} = $p->{line} = 1;
    $p->{column_prev} = $p->{column} = 0;
    $p->{set_next_char} = sub {
      my $self = shift;

      ## Shift the history of the previously read characters.
      pop @{$self->{prev_char}};
      unshift @{$self->{prev_char}}, $self->{next_char};

      ## The end of the input is represented by -1.
      if ($i >= length $$s) {
        $self->{next_char} = -1;
        return;
      }
      $self->{next_char} = ord substr $$s, $i++, 1;

      ($p->{line_prev}, $p->{column_prev}) = ($p->{line}, $p->{column});
      $p->{column}++;

      if ($self->{next_char} == 0x000A) { # LF
        $p->{line}++;
        $p->{column} = 0;
        !!!cp ('i1');
      } elsif ($self->{next_char} == 0x000D) { # CR
        ## CR and CRLF are both normalized into a single LF.
        $i++ if substr ($$s, $i, 1) eq "\x0A";
        $self->{next_char} = 0x000A; # LF # MUST
        $p->{line}++;
        $p->{column} = 0;
        !!!cp ('i2');
      } elsif ($self->{next_char} > 0x10FFFF) {
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
        !!!cp ('i3');
      } elsif ($self->{next_char} == 0x0000) { # NULL
        !!!cp ('i4');
        !!!parse-error (type => 'NULL');
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
      } elsif ($self->{next_char} <= 0x0008 or
               (0x000E <= $self->{next_char} and
                $self->{next_char} <= 0x001F) or
               (0x007F <= $self->{next_char} and
                $self->{next_char} <= 0x009F) or
               (0xD800 <= $self->{next_char} and
                $self->{next_char} <= 0xDFFF) or
               (0xFDD0 <= $self->{next_char} and
                $self->{next_char} <= 0xFDDF) or
               {
                0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
                0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
                0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
                0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
                0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
                0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
                0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
                0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
                0x10FFFE => 1, 0x10FFFF => 1,
               }->{$self->{next_char}}) {
        ## These characters are reported but are passed through as is.
        !!!cp ('i4.1');
        if ($self->{next_char} < 0x10000) {
          !!!parse-error (type => 'control char',
                          text => (sprintf 'U+%04X', $self->{next_char}));
        } else {
          !!!parse-error (type => 'control char',
                          text => (sprintf 'U-%08X', $self->{next_char}));
        }
      }
    };
    $p->{prev_char} = [-1, -1, -1];
    $p->{next_char} = -1;

    ## The error handler: the caller's handler if any, or a default
    ## handler that |warn|s.  The position of the token, if available,
    ## takes precedence over the position given in |line| and |column|.
    my $ponerror = $onerror || sub {
      my (%opt) = @_;
      my $line = $opt{line};
      my $column = $opt{column};
      if (defined $opt{token} and defined $opt{token}->{line}) {
        $line = $opt{token}->{line};
        $column = $opt{token}->{column};
      }
      warn "Parse error ($opt{type}) at line $line column $column\n";
    };
    $p->{parse_error} = sub {
      $ponerror->(line => $p->{line}, column => $p->{column}, @_);
    };

    $p->_initialize_tokenizer;
    $p->_initialize_tree_constructor;

    ## Step 2
    ## The initial content model of the tokenizer depends on the local
    ## name of the context element; the default is PCDATA.
    my $node_ln = $node->manakai_local_name;
    $p->{content_model} = {
      title => RCDATA_CONTENT_MODEL,
      textarea => RCDATA_CONTENT_MODEL,
      style => CDATA_CONTENT_MODEL,
      script => CDATA_CONTENT_MODEL,
      xmp => CDATA_CONTENT_MODEL,
      iframe => CDATA_CONTENT_MODEL,
      noembed => CDATA_CONTENT_MODEL,
      noframes => CDATA_CONTENT_MODEL,
      noscript => CDATA_CONTENT_MODEL,
      plaintext => PLAINTEXT_CONTENT_MODEL,
    }->{$node_ln};
    $p->{content_model} = PCDATA_CONTENT_MODEL
        unless defined $p->{content_model};
    ## ISSUE: What is "the name of the element"? local name?

    $p->{inner_html_node} = [$node, $el_category->{$node_ln}];
    ## TODO: Foreign element OK?

    ## Step 3
    my $root = $doc->create_element_ns
        ($HTML_NS, [undef, 'html']);

    ## Step 4 # MUST
    $doc->append_child ($root);

    ## Step 5 # MUST
    push @{$p->{open_elements}}, [$root, $el_category->{html}];

    undef $p->{head_element};

    ## Step 6 # MUST
    $p->_reset_insertion_mode;

    ## Step 7 # MUST
    ## Set the form element pointer to the nearest HTML |form| element
    ## among the context element and its ancestors, if any.
    my $anode = $node;
    AN: while (defined $anode) {
      if ($anode->node_type == 1) {
        my $nsuri = $anode->namespace_uri;
        if (defined $nsuri and $nsuri eq $HTML_NS) {
          if ($anode->manakai_local_name eq 'form') {
            !!!cp ('i5');
            $p->{form_element} = $anode;
            last AN;
          }
        }
      }
      $anode = $anode->parent_node;
    } # AN

    ## Step 9 # MUST
    {
      ## NOTE: Presumably the |!!!next-token| macro refers to |$self|.
      my $self = $p;
      !!!next-token;
    }
    $p->_tree_construction_main;

    ## Step 10 # MUST
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }
    ## ISSUE: mutation events? read-only?

    ## Step 11 # MUST
    @cn = @{$root->child_nodes};
    for (@cn) {
      $this_doc->adopt_node ($_);
      $node->append_child ($_);
    }
    ## ISSUE: mutation events?

    $p->_terminate_tree_constructor;

    ## Break the reference cycles: both closures stored in |$p| refer
    ## to |$p| itself, so the parser object would never be freed
    ## unless they are removed.
    delete $p->{parse_error}; # delete loop
    delete $p->{set_next_char}; # delete loop
  } else {
    die "$0: |set_inner_html| is not defined for node of type $nt";
  }
} # set_inner_html
7644
7645 } # tree construction stage
7646
## |Whatpm::HTML::RestartParser|: a subclass of |Error|.
## NOTE(review): Presumably thrown to make the parser restart; confirm
## against its users elsewhere in this file.
package Whatpm::HTML::RestartParser;
our @ISA;
push @ISA, 'Error';

## A module file has to return a true value.
1;
7651 # $Date: 2008/08/16 07:35:23 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24