/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.164 - (show annotations) (download) (as text)
Sat Sep 13 06:33:39 2008 UTC (16 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.163: +78 -57 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/ChangeLog	13 Sep 2008 06:33:32 -0000
	* HTML.pm.src: |CLOSE_TAG_OPEN_STATE| is broken into
	itself and new |CDATA_PCDATA_CLOSE_TAG_STATE| so that
	no longer does the tokenizer have to push back next input
	characters in those states.

2008-09-13  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.163 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 use Error qw(:try);
5
6 ## ISSUE:
7 ## var doc = implementation.createDocument (null, null, null);
8 ## doc.write ('');
9 ## alert (doc.compatMode);
10
11 require IO::Handle;
12
## Namespace URIs referenced by the element-category and
## foreign-attribute tables in this file.
my $HTML_NS  = 'http://www.w3.org/1999/xhtml';
my $MML_NS   = 'http://www.w3.org/1998/Math/MathML';
my $SVG_NS   = 'http://www.w3.org/2000/svg';
my $XLINK_NS = 'http://www.w3.org/1999/xlink';
my $XML_NS   = 'http://www.w3.org/XML/1998/namespace';
my $XMLNS_NS = 'http://www.w3.org/2000/xmlns/';
19
## Element category flags.  Every constant occupies its own bit, so
## categories can be combined with bitwise-or and tested with
## bitwise-and.
sub A_EL                    () { 1 << 0 }
sub ADDRESS_EL              () { 1 << 1 }
sub BODY_EL                 () { 1 << 2 }
sub BUTTON_EL               () { 1 << 3 }
sub CAPTION_EL              () { 1 << 4 }
sub DD_EL                   () { 1 << 5 }
sub DIV_EL                  () { 1 << 6 }
sub DT_EL                   () { 1 << 7 }
sub FORM_EL                 () { 1 << 8 }
sub FORMATTING_EL           () { 1 << 9 }
sub FRAMESET_EL             () { 1 << 10 }
sub HEADING_EL              () { 1 << 11 }
sub HTML_EL                 () { 1 << 12 }
sub LI_EL                   () { 1 << 13 }
sub NOBR_EL                 () { 1 << 14 }
sub OPTION_EL               () { 1 << 15 }
sub OPTGROUP_EL             () { 1 << 16 }
sub P_EL                    () { 1 << 17 }
sub SELECT_EL               () { 1 << 18 }
sub TABLE_EL                () { 1 << 19 }
sub TABLE_CELL_EL           () { 1 << 20 }
sub TABLE_ROW_EL            () { 1 << 21 }
sub TABLE_ROW_GROUP_EL      () { 1 << 22 }
sub MISC_SCOPING_EL         () { 1 << 23 }
sub MISC_SPECIAL_EL         () { 1 << 24 }
sub FOREIGN_EL              () { 1 << 25 }
sub FOREIGN_FLOW_CONTENT_EL () { 1 << 26 }
sub MML_AXML_EL             () { 1 << 27 }
sub RUBY_EL                 () { 1 << 28 }
sub RUBY_COMPONENT_EL       () { 1 << 29 }
50
## Union of the table, table-row and table-row-group categories.
sub TABLE_ROWS_EL () { TABLE_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL }

## NOTE: Consulted by the "generate implied end tags" algorithm.
## NOTE: The implementation of that algorithm also uses a modified
## version of this mask in one place (search for the function mae).
sub END_TAG_OPTIONAL_EL () {
  DD_EL | DT_EL | LI_EL | P_EL | RUBY_COMPONENT_EL
}

## NOTE: Consulted by the </body> and EOF algorithms.
sub ALL_END_TAG_OPTIONAL_EL () {
  DD_EL | DT_EL | LI_EL | P_EL
    | BODY_EL | HTML_EL
    | TABLE_CELL_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL
}

## Categories treated as scoping elements.
sub SCOPING_EL () {
  BUTTON_EL | CAPTION_EL | HTML_EL
    | TABLE_EL | TABLE_CELL_EL
    | MISC_SCOPING_EL
}

## Scope boundaries for the table-related scope checks.
sub TABLE_SCOPING_EL      () { HTML_EL | TABLE_EL }
sub TABLE_ROWS_SCOPING_EL () { HTML_EL | TABLE_ROW_GROUP_EL }
sub TABLE_ROW_SCOPING_EL  () { HTML_EL | TABLE_ROW_EL }

## Categories treated as special elements.
sub SPECIAL_EL () {
  ADDRESS_EL | BODY_EL | DIV_EL
    | DD_EL | DT_EL | LI_EL | P_EL
    | FORM_EL | FRAMESET_EL | HEADING_EL
    | OPTION_EL | OPTGROUP_EL | SELECT_EL
    | TABLE_ROW_EL | TABLE_ROW_GROUP_EL
    | MISC_SPECIAL_EL
}
127
## Maps an HTML element local name to its category flag(s).  Entries
## are grouped by category; elements sharing one category are listed
## together.
my $el_category = {
  ## Elements whose category is not shared with another element name.
  address  => ADDRESS_EL,
  body     => BODY_EL,
  button   => BUTTON_EL,
  caption  => CAPTION_EL,
  dd       => DD_EL,
  div      => DIV_EL,
  dt       => DT_EL,
  form     => FORM_EL,
  frameset => FRAMESET_EL,
  html     => HTML_EL,
  li       => LI_EL,
  optgroup => OPTGROUP_EL,
  option   => OPTION_EL,
  p        => P_EL,
  ruby     => RUBY_EL,
  select   => SELECT_EL,
  table    => TABLE_EL,
  'tr'     => TABLE_ROW_EL,

  ## Formatting elements; |a| and |nobr| carry an extra flag.
  a    => A_EL | FORMATTING_EL,
  nobr => NOBR_EL | FORMATTING_EL,
  (map { ($_ => FORMATTING_EL) }
       qw(b big em font i s small strike strong tt u)),

  ## Headings, table parts and ruby components.
  (map { ($_ => HEADING_EL) } qw(h1 h2 h3 h4 h5 h6)),
  (map { ($_ => TABLE_ROW_GROUP_EL) } qw(tbody tfoot thead)),
  (map { ($_ => TABLE_CELL_EL) } qw(td th)),
  (map { ($_ => RUBY_COMPONENT_EL) } qw(rp rt)),

  ## Miscellaneous scoping elements.
  (map { ($_ => MISC_SCOPING_EL) } qw(applet marquee object)),

  ## Miscellaneous special elements.
  (map { ($_ => MISC_SPECIAL_EL) } qw(
    area base basefont bgsound blockquote br center col colgroup
    dir dl embed fieldset frame head hr iframe img input isindex
    link listing menu meta noembed noframes noscript ol param
    plaintext pre script spacer style textarea title ul wbr
  )),
};
215
## Category flags for foreign (MathML and SVG) elements, keyed first by
## namespace URI and then by element local name.
my $el_category_f = {
  $MML_NS => {
    'annotation-xml' => MML_AXML_EL,
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(mi mo mn ms mtext)),
  },
  $SVG_NS => {
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(foreignObject desc title)),
  },
  ## NOTE: In addition, FOREIGN_EL is set to non-HTML elements.
};
232
## Maps a lowercased SVG attribute name to its mixed-case spelling.
## Every key is exactly the lowercase form of its value, so the table
## is generated from the list of mixed-case names.
my $svg_attr_name = +{
  map { (lc ($_) => $_) } qw(
    attributeName attributeType
    baseFrequency baseProfile
    calcMode clipPathUnits contentScriptType contentStyleType
    diffuseConstant
    edgeMode externalResourcesRequired
    filterRes filterUnits
    glyphRef gradientTransform gradientUnits
    kernelMatrix kernelUnitLength keyPoints keySplines keyTimes
    lengthAdjust limitingConeAngle
    markerHeight markerUnits markerWidth maskContentUnits maskUnits
    numOctaves
    pathLength patternContentUnits patternTransform patternUnits
    pointsAtX pointsAtY pointsAtZ
    preserveAlpha preserveAspectRatio primitiveUnits
    refX refY repeatCount repeatDur
    requiredExtensions requiredFeatures
    specularConstant specularExponent spreadMethod startOffset
    stdDeviation stitchTiles surfaceScale systemLanguage
    tableValues targetX targetY textLength
    viewBox viewTarget
    xChannelSelector
    yChannelSelector
    zoomAndPan
  )
};
297
## Maps a qualified attribute name as written in the source to
## [namespace URI, [prefix, local name]].
my $foreign_attr_xname = {
  (map { ("xlink:$_" => [$XLINK_NS, ['xlink', $_]]) }
       qw(actuate arcrole href role show title type)),
  (map { ("xml:$_" => [$XML_NS, ['xml', $_]]) } qw(base lang space)),
  'xmlns'       => [$XMLNS_NS, [undef, 'xmlns']],
  'xmlns:xlink' => [$XMLNS_NS, ['xmlns', 'xlink']],
};
312
313 ## ISSUE: xmlns:xlink="non-xlink-ns" is not an error.
314
## Maps each code point in the range 0x80..0x9F to the code point that
## replaces it.  0xFFFD (REPLACEMENT CHARACTER) fills the positions
## that have no other replacement.
## NOTE(review): presumably consulted when resolving numeric character
## references; the use site is not in this part of the file.
my $c1_entity_char = do {
  ## Replacements listed in order, starting at 0x80.
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 80-87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 88-8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 90-97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 98-9F
  );
  +{ map { (0x80 + $_ => $replacement[$_]) } 0 .. $#replacement };
}; # $c1_entity_char
349
## Parses a string of bytes as an HTML document.
##
##   $self->parse_byte_string
##       ($charset_name, $byte_string, $doc, $onerror, $get_wrapper);
##
## |$byte_string| is either the string itself or a reference to it.
## The string is exposed through an in-memory read handle and all
## further work is delegated to |parse_byte_stream|, whose return
## value is returned unchanged.
##
## Dies if the in-memory handle cannot be opened (for example because
## the scalar holds characters above U+00FF and is therefore not a
## byte string).  Previously such a failure went unnoticed and an
## unopened handle was passed on to |parse_byte_stream|.
sub parse_byte_string ($$$$;$) {
  my $self = shift;
  my $charset_name = shift;
  my $source = ref $_[0] ? $_[0] : \($_[0]);
  open my $input, '<', $source
      or die "parse_byte_string: cannot open in-memory handle: $!";
  return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]);
} # parse_byte_string
356
## Parses a stream of bytes as an HTML document.
##
## Arguments (see the commented-out list on the first line of the
## body): the parser object or a class name (an instance is created
## with |new| when a class name is given), an optional charset name,
## a byte stream handle, the document object, an optional error
## handler, and an optional wrapper factory that is applied to the
## decoded character handle.
##
## The character encoding is chosen in the SNIFFING block: first the
## supplied charset name, then a BOM found in the first 1024 bytes,
## then |Whatpm::Charset::UniversalCharDet|, and finally
## |windows-1252|.  The decoded stream is handed to
## |parse_char_stream|.  If that call throws
## |Whatpm::HTML::RestartParser| (thrown by the |change_encoding|
## callback installed here), the input is parsed once more with the
## decoder selected by that callback.  Returns what
## |parse_char_stream| returns.
357 sub parse_byte_stream ($$$$;$$) {
358 # my ($self, $charset_name, $byte_stream, $doc, $onerror, $get_wrapper) = @_;
359 my $self = ref $_[0] ? shift : shift->new;
360 my $charset_name = shift;
## From here on |@_| is ($byte_stream, $doc, $onerror, $get_wrapper).
361 my $byte_stream = $_[0];
362
## Default error handler: report the error type as a warning.
363 my $onerror = $_[2] || sub {
364 my (%opt) = @_;
365 warn "Parse error ($opt{type})\n";
366 };
367 $self->{parse_error} = $onerror; # updated later by parse_char_string
368
## Default wrapper: use the decoded handle as is.
369 my $get_wrapper = $_[3] || sub ($) {
370 return $_[0]; # $_[0] = byte stream handle, returned = arg to char handle
371 };
372
373 ## HTML5 encoding sniffing algorithm
374 require Message::Charset::Info;
375 my $charset;
376 my $buffer;
377 my ($char_stream, $e_status);
378
379 SNIFFING: {
380 ## NOTE: By setting |allow_fallback| option true when the
381 ## |get_decode_handle| method is invoked, we ignore what the HTML5
382 ## spec requires, i.e. unsupported encoding should be ignored.
383 ## TODO: We should not do this unless the parser is invoked
384 ## in the conformance checking mode, in which this behavior
385 ## would be useful.
386
387 ## Step 1
388 if (defined $charset_name) {
389 $charset = Message::Charset::Info->get_by_html_name ($charset_name);
390 ## TODO: Is this ok? Transfer protocol's parameter should be
391 ## interpreted in its semantics?
392
393 ## ISSUE: Unsupported encoding is not ignored according to the spec.
394 ($char_stream, $e_status) = $charset->get_decode_handle
395 ($byte_stream, allow_error_reporting => 1,
396 allow_fallback => 1);
397 if ($char_stream) {
398 $self->{confident} = 1;
399 last SNIFFING;
400 } else {
401 ## TODO: unsupported error
402 }
403 }
404
405 ## Step 2
## Read at most 1024 bytes ahead; they are handed to the decoder
## later through the |byte_buffer| option.
406 my $byte_buffer = '';
407 for (1..1024) {
408 my $char = $byte_stream->getc;
409 last unless defined $char;
410 $byte_buffer .= $char;
411 } ## TODO: timeout
412
413 ## Step 3
## A byte order mark selects the encoding with confidence.
414 if ($byte_buffer =~ /^\xFE\xFF/) {
415 $charset = Message::Charset::Info->get_by_html_name ('utf-16be');
416 ($char_stream, $e_status) = $charset->get_decode_handle
417 ($byte_stream, allow_error_reporting => 1,
418 allow_fallback => 1, byte_buffer => \$byte_buffer);
419 $self->{confident} = 1;
420 last SNIFFING;
421 } elsif ($byte_buffer =~ /^\xFF\xFE/) {
422 $charset = Message::Charset::Info->get_by_html_name ('utf-16le');
423 ($char_stream, $e_status) = $charset->get_decode_handle
424 ($byte_stream, allow_error_reporting => 1,
425 allow_fallback => 1, byte_buffer => \$byte_buffer);
426 $self->{confident} = 1;
427 last SNIFFING;
428 } elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) {
429 $charset = Message::Charset::Info->get_by_html_name ('utf-8');
430 ($char_stream, $e_status) = $charset->get_decode_handle
431 ($byte_stream, allow_error_reporting => 1,
432 allow_fallback => 1, byte_buffer => \$byte_buffer);
433 $self->{confident} = 1;
434 last SNIFFING;
435 }
436
437 ## Step 4
438 ## TODO: <meta charset>
439
440 ## Step 5
441 ## TODO: from history
442
443 ## Step 6
444 require Whatpm::Charset::UniversalCharDet;
445 $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string
446 ($byte_buffer);
447 if (defined $charset_name) {
448 $charset = Message::Charset::Info->get_by_html_name ($charset_name);
449
450 ## ISSUE: Unsupported encoding is not ignored according to the spec.
451 require Whatpm::Charset::DecodeHandle;
452 $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
453 ($byte_stream);
454 ($char_stream, $e_status) = $charset->get_decode_handle
455 ($buffer, allow_error_reporting => 1,
456 allow_fallback => 1, byte_buffer => \$byte_buffer);
457 if ($char_stream) {
458 $buffer->{buffer} = $byte_buffer;
459 !!!parse-error (type => 'sniffing:chardet',
460 text => $charset_name,
461 level => $self->{level}->{info},
462 layer => 'encode',
463 line => 1, column => 1);
464 $self->{confident} = 0;
465 last SNIFFING;
466 }
467 }
468
469 ## Step 7: default
470 ## TODO: Make this configurable.
471 $charset = Message::Charset::Info->get_by_html_name ('windows-1252');
472 ## NOTE: We choose |windows-1252| here, since |utf-8| should be
473 ## detectable in the step 6.
474 require Whatpm::Charset::DecodeHandle;
475 $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
476 ($byte_stream);
477 ($char_stream, $e_status)
478 = $charset->get_decode_handle ($buffer,
479 allow_error_reporting => 1,
480 allow_fallback => 1,
481 byte_buffer => \$byte_buffer);
482 $buffer->{buffer} = $byte_buffer;
483 !!!parse-error (type => 'sniffing:default',
484 text => 'windows-1252',
485 level => $self->{level}->{info},
486 line => 1, column => 1,
487 layer => 'encode');
488 $self->{confident} = 0;
489 } # SNIFFING
490
## Record the chosen encoding and report when the decoder is a
## fallback implementation or cannot report decoding errors.
491 if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
492 $self->{input_encoding} = $charset->get_iana_name; ## TODO: Should we set actual charset decoder's encoding name?
493 !!!parse-error (type => 'chardecode:fallback',
494 #text => $self->{input_encoding},
495 level => $self->{level}->{uncertain},
496 line => 1, column => 1,
497 layer => 'encode');
498 } elsif (not ($e_status &
499 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
500 $self->{input_encoding} = $charset->get_iana_name;
501 !!!parse-error (type => 'chardecode:no error',
502 text => $self->{input_encoding},
503 level => $self->{level}->{uncertain},
504 line => 1, column => 1,
505 layer => 'encode');
506 } else {
507 $self->{input_encoding} = $charset->get_iana_name;
508 }
509
## Callback taking (parser, charset name, token).  It updates the
## enclosing |$charset|, |$char_stream| and |$e_status|, and throws
## |Whatpm::HTML::RestartParser| when the requested encoding is
## supported and differs from the current one.
510 $self->{change_encoding} = sub {
511 my $self = shift;
512 $charset_name = shift;
513 my $token = shift;
514
515 $charset = Message::Charset::Info->get_by_html_name ($charset_name);
516 ($char_stream, $e_status) = $charset->get_decode_handle
517 ($byte_stream, allow_error_reporting => 1, allow_fallback => 1,
518 byte_buffer => \ $buffer->{buffer});
519
520 if ($char_stream) { # if supported
521 ## "Change the encoding" algorithm:
522
523 ## Step 1
524 if ($charset->{category} &
525 Message::Charset::Info::CHARSET_CATEGORY_UTF16 ()) {
526 $charset = Message::Charset::Info->get_by_html_name ('utf-8');
527 ($char_stream, $e_status) = $charset->get_decode_handle
528 ($byte_stream,
529 byte_buffer => \ $buffer->{buffer});
530 }
531 $charset_name = $charset->get_iana_name;
532
533 ## Step 2
534 if (defined $self->{input_encoding} and
535 $self->{input_encoding} eq $charset_name) {
536 !!!parse-error (type => 'charset label:matching',
537 text => $charset_name,
538 level => $self->{level}->{info});
539 $self->{confident} = 1;
540 return;
541 }
542
543 !!!parse-error (type => 'charset label detected',
544 text => $self->{input_encoding},
545 value => $charset_name,
546 level => $self->{level}->{warn},
547 token => $token);
548
549 ## Step 3
550 # if (can) {
551 ## change the encoding on the fly.
552 #$self->{confident} = 1;
553 #return;
554 # }
555
556 ## Step 4
557 throw Whatpm::HTML::RestartParser ();
558 }
559 }; # $self->{change_encoding}
560
## Error handler given to the character stream: reports the error at
## the current position and, when an |octets| reference is supplied,
## overwrites the referenced value with U+FFFD.
561 my $char_onerror = sub {
562 my (undef, $type, %opt) = @_;
563 !!!parse-error (layer => 'encode',
564 %opt, type => $type,
565 line => $self->{line}, column => $self->{column} + 1);
566 if ($opt{octets}) {
567 ${$opt{octets}} = "\x{FFFD}"; # replacement character
568 }
569 };
570
571 my $wrapped_char_stream = $get_wrapper->($char_stream);
572 $wrapped_char_stream->onerror ($char_onerror);
573
## Drop the byte stream; the remaining arguments are passed on to
## |parse_char_stream| after the character stream.
574 my @args = @_; shift @args; # $s
575 my $return;
576 try {
577 $return = $self->parse_char_stream ($wrapped_char_stream, @args);
578 } catch Whatpm::HTML::RestartParser with {
579 ## NOTE: Invoked after {change_encoding}.
580
581 if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
582 $self->{input_encoding} = $charset->get_iana_name; ## TODO: Should we set actual charset decoder's encoding name?
583 !!!parse-error (type => 'chardecode:fallback',
584 level => $self->{level}->{uncertain},
585 #text => $self->{input_encoding},
586 line => 1, column => 1,
587 layer => 'encode');
588 } elsif (not ($e_status &
589 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
590 $self->{input_encoding} = $charset->get_iana_name;
591 !!!parse-error (type => 'chardecode:no error',
592 text => $self->{input_encoding},
593 level => $self->{level}->{uncertain},
594 line => 1, column => 1,
595 layer => 'encode');
596 } else {
597 $self->{input_encoding} = $charset->get_iana_name;
598 }
599 $self->{confident} = 1;
600
## Parse again from the decoder chosen by |change_encoding|.
601 $wrapped_char_stream = $get_wrapper->($char_stream);
602 $wrapped_char_stream->onerror ($char_onerror);
603
604 $return = $self->parse_char_stream ($wrapped_char_stream, @args);
605 };
606 return $return;
607 } # parse_byte_stream
608
609 ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
610 ## and the HTML layer MUST ignore it. However, we do strip the BOM in
611 ## the encoding layer and the HTML layer does not ignore any U+FEFF,
612 ## because the core part of our HTML parser expects a string of character,
613 ## not a string of bytes or code units or anything which might contain a BOM.
614 ## Therefore, any parser interface that accepts a string of bytes,
615 ## such as |parse_byte_string| in this module, must ensure that it does
616 ## strip the BOM and never strip any ZWNBSP.
617
## Parses a string of characters as an HTML document.
##
##   $self->parse_char_string ($s, $doc, $onerror, $get_wrapper);
##
## |$s| is either the character string itself or a reference to it.
## When |$get_wrapper| is given, it is called with the input handle and
## its return value is used as the input instead.  The work is then
## delegated to |parse_char_stream|, whose return value is returned
## unchanged.
##
## In-memory handles are byte-oriented: opening one directly on a
## string that holds code points above U+00FF fails on current
## versions of Perl, and a string holding only U+0080..U+00FF would be
## read back through the |:utf8| layer as malformed bytes.  Therefore
## a UTF-8 encoded copy of the string is opened and decoded again by
## the |:utf8| layer.  The lax |:utf8| layer (rather than a validating
## one) is used on purpose so that every code point reaches
## |parse_char_stream| unchanged.  Dies if the handle cannot be opened.
sub parse_char_string ($$$;$$) {
  #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
  my $self = shift;
  require utf8;
  my $s = ref $_[0] ? $_[0] : \($_[0]);
  my $octets = defined $$s ? $$s : '';
  utf8::encode ($octets);
  open my $input, '<:utf8', \$octets
      or die "parse_char_string: cannot open in-memory handle: $!";
  if ($_[3]) {
    $input = $_[3]->($input);
  }
  return $self->parse_char_stream ($input, @_[1..$#_]);
} # parse_char_string
*parse_string = \&parse_char_string; ## NOTE: Alias for backward compatibility.
630
## Parses a stream of characters as an HTML document.
##
## Arguments: the parser object or a class name (an instance is
## created with |new| when a class name is given), a character input
## handle that is read with |getc|, the document object to fill (its
## child nodes are removed first), and an optional error handler that
## is called with key/value pairs.  Returns the document object.
631 sub parse_char_stream ($$$;$) {
632 my $self = ref $_[0] ? shift : shift->new;
633 my $input = $_[0];
634 $self->{document} = $_[1];
635 @{$self->{document}->child_nodes} = ();
636
637 ## NOTE: |set_inner_html| copies most of this method's code
638
639 $self->{confident} = 1 unless exists $self->{confident};
640 $self->{document}->input_encoding ($self->{input_encoding})
641 if defined $self->{input_encoding};
642
## NOTE(review): |$i| is not used anywhere else in this method.
643 my $i = 0;
644 $self->{line_prev} = $self->{line} = 1;
645 $self->{column_prev} = $self->{column} = 0;
## Callback that advances the input by one character.  It receives
## the parser as its argument, which avoids capturing the outer
## |$self| in the closure.
646 $self->{set_next_char} = sub {
647 my $self = shift;
648
## Keep the three most recently consumed characters, newest first.
649 pop @{$self->{prev_char}};
650 unshift @{$self->{prev_char}}, $self->{next_char};
651
## A character read ahead while handling CR is consumed before the
## input handle is read again.
652 my $char;
653 if (defined $self->{next_next_char}) {
654 $char = $self->{next_next_char};
655 delete $self->{next_next_char};
656 } else {
657 $char = $input->getc;
658 }
## -1 marks the end of the input.
659 $self->{next_char} = -1 and return unless defined $char;
660 $self->{next_char} = ord $char;
661
662 ($self->{line_prev}, $self->{column_prev})
663 = ($self->{line}, $self->{column});
664 $self->{column}++;
665
666 if ($self->{next_char} == 0x000A) { # LF
667 !!!cp ('j1');
668 $self->{line}++;
669 $self->{column} = 0;
670 } elsif ($self->{next_char} == 0x000D) { # CR
671 !!!cp ('j2');
## CR and CR LF are both reported as a single LF: an LF that follows
## is dropped, any other character is kept for the next call.
672 my $next = $input->getc;
673 if (defined $next and $next ne "\x0A") {
674 $self->{next_next_char} = $next;
675 }
676 $self->{next_char} = 0x000A; # LF # MUST
677 $self->{line}++;
678 $self->{column} = 0;
679 } elsif ($self->{next_char} > 0x10FFFF) {
680 !!!cp ('j3');
681 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
682 } elsif ($self->{next_char} == 0x0000) { # NULL
683 !!!cp ('j4');
684 !!!parse-error (type => 'NULL');
685 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
## The code points tested below are reported as errors but are passed
## on unchanged.
686 } elsif ($self->{next_char} <= 0x0008 or
687 (0x000E <= $self->{next_char} and $self->{next_char} <= 0x001F) or
688 (0x007F <= $self->{next_char} and $self->{next_char} <= 0x009F) or
689 (0xD800 <= $self->{next_char} and $self->{next_char} <= 0xDFFF) or
690 (0xFDD0 <= $self->{next_char} and $self->{next_char} <= 0xFDDF) or
691 {
692 0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
693 0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
694 0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
695 0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
696 0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
697 0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
698 0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
699 0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
700 0x10FFFE => 1, 0x10FFFF => 1,
701 }->{$self->{next_char}}) {
702 !!!cp ('j5');
703 if ($self->{next_char} < 0x10000) {
704 !!!parse-error (type => 'control char',
705 text => (sprintf 'U+%04X', $self->{next_char}));
706 } else {
707 !!!parse-error (type => 'control char',
708 text => (sprintf 'U-%08X', $self->{next_char}));
709 }
710 }
711 };
712 $self->{prev_char} = [-1, -1, -1];
713 $self->{next_char} = -1;
714
## Default error handler: warn with the position taken from the token
## when one is supplied, otherwise from the |line| and |column|
## options.
715 my $onerror = $_[2] || sub {
716 my (%opt) = @_;
717 my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
718 my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
719 warn "Parse error ($opt{type}) at line $line column $column\n";
720 };
## The current position is passed first, so |line| and |column| given
## by the caller take precedence.
721 $self->{parse_error} = sub {
722 $onerror->(line => $self->{line}, column => $self->{column}, @_);
723 };
724
725 $self->_initialize_tokenizer;
726 $self->_initialize_tree_constructor;
727 $self->_construct_tree;
728 $self->_terminate_tree_constructor;
729
730 delete $self->{parse_error}; # remove loop
731
732 return $self->{document};
733 } # parse_char_stream
734
## Creates a new parser object blessed into |$class|.
##
## The object starts with the table of error level codes (|must|,
## |should|, |warn|, |info|, |uncertain|) and with default callbacks:
##   |set_next_char|               - sets |next_char| to -1
##   |parse_error|                 - does nothing
##   |change_encoding|             - does nothing
##   |application_cache_selection| - does nothing
##
## The |set_next_char| default refers to the object through a weak
## reference.  A strong reference would form a cycle (the object holds
## the closure, the closure holds the object) and the object would
## never be freed unless the callback were replaced.
sub new ($) {
  my $class = shift;
  my $self = bless {
    level => {must => 'm',
              should => 's',
              warn => 'w',
              info => 'i',
              uncertain => 'u'},
  }, $class;

  require Scalar::Util;
  my $weak_self = $self;
  Scalar::Util::weaken ($weak_self);
  $self->{set_next_char} = sub {
    my $parser = $weak_self or return;
    $parser->{next_char} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new
761
## Content model flag bits.
sub CM_ENTITY         () { 1 << 0 } # & markup in data
sub CM_LIMITED_MARKUP () { 1 << 1 } # < markup in data (limited)
sub CM_FULL_MARKUP    () { 1 << 2 } # < markup in data (any)

## Content models, each a combination of the flag bits above.
sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL     () { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL    () { CM_LIMITED_MARKUP | CM_ENTITY }
sub PCDATA_CONTENT_MODEL    () { CM_FULL_MARKUP | CM_ENTITY }
770
## Tokenizer states.  The values are consecutive integers from 0.
use constant {
  DATA_STATE                                    => 0,
  ENTITY_DATA_STATE                             => 1,
  TAG_OPEN_STATE                                => 2,
  CLOSE_TAG_OPEN_STATE                          => 3,
  TAG_NAME_STATE                                => 4,
  BEFORE_ATTRIBUTE_NAME_STATE                   => 5,
  ATTRIBUTE_NAME_STATE                          => 6,
  AFTER_ATTRIBUTE_NAME_STATE                    => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE                  => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE                => 11,
  ENTITY_IN_ATTRIBUTE_VALUE_STATE               => 12,
  MARKUP_DECLARATION_OPEN_STATE                 => 13,
  COMMENT_START_STATE                           => 14,
  COMMENT_START_DASH_STATE                      => 15,
  COMMENT_STATE                                 => 16,
  COMMENT_END_STATE                             => 17,
  COMMENT_END_DASH_STATE                        => 18,
  BOGUS_COMMENT_STATE                           => 19,
  DOCTYPE_STATE                                 => 20,
  BEFORE_DOCTYPE_NAME_STATE                     => 21,
  DOCTYPE_NAME_STATE                            => 22,
  AFTER_DOCTYPE_NAME_STATE                      => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         => 31,
  BOGUS_DOCTYPE_STATE                           => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE            => 33,
  SELF_CLOSING_START_TAG_STATE                  => 34,
  CDATA_BLOCK_STATE                             => 35,
  ## The next three are parts of the "markup declaration open state"
  ## in the spec.
  MD_HYPHEN_STATE                               => 36,
  MD_DOCTYPE_STATE                              => 37,
  MD_CDATA_STATE                                => 38,
  ## Part of the "close tag open state" in the spec.
  CDATA_PCDATA_CLOSE_TAG_STATE                  => 39,
};
811
## Token types, stored in the |type| member of a token.
use constant {
  DOCTYPE_TOKEN     => 1,
  COMMENT_TOKEN     => 2,
  START_TAG_TOKEN   => 3,
  END_TAG_TOKEN     => 4,
  END_OF_FILE_TOKEN => 5,
  CHARACTER_TOKEN   => 6,
};
818
## Insertion mode group bits.
sub AFTER_HTML_IMS        () { 1 << 2 }
sub HEAD_IMS              () { 1 << 3 }
sub BODY_IMS              () { 1 << 4 }
sub BODY_TABLE_IMS        () { 1 << 5 }
sub TABLE_IMS             () { 1 << 6 }
sub ROW_IMS               () { 1 << 7 }
sub BODY_AFTER_IMS        () { 1 << 8 }
sub FRAME_IMS             () { 1 << 9 }
sub SELECT_IMS            () { 1 << 10 }
sub IN_FOREIGN_CONTENT_IM () { 1 << 11 }
## NOTE: The "in foreign content" insertion mode is special: it is
## combined with the secondary insertion mode, and this parser stores
## the two together in bit-or'ed form.

## NOTE: There are no constants for the "initial" and "before html"
## insertion modes.

## NOTE: "after after body" insertion mode.
sub AFTER_HTML_BODY_IM () { AFTER_HTML_IMS | BODY_AFTER_IMS }

## NOTE: "after after frameset" insertion mode.
sub AFTER_HTML_FRAMESET_IM () { AFTER_HTML_IMS | FRAME_IMS }

## Insertion modes: group bits combined with a number in the two
## lowest bits that tells the members of a group apart.
sub IN_HEAD_IM            () { HEAD_IMS | 0 }
sub IN_HEAD_NOSCRIPT_IM   () { HEAD_IMS | 1 }
sub AFTER_HEAD_IM         () { HEAD_IMS | 2 }
sub BEFORE_HEAD_IM        () { HEAD_IMS | 3 }
sub IN_BODY_IM            () { BODY_IMS }
sub IN_CELL_IM            () { BODY_IMS | BODY_TABLE_IMS | 1 }
sub IN_CAPTION_IM         () { BODY_IMS | BODY_TABLE_IMS | 2 }
sub IN_ROW_IM             () { TABLE_IMS | ROW_IMS | 1 }
sub IN_TABLE_BODY_IM      () { TABLE_IMS | ROW_IMS | 2 }
sub IN_TABLE_IM           () { TABLE_IMS }
sub AFTER_BODY_IM         () { BODY_AFTER_IMS }
sub IN_FRAMESET_IM        () { FRAME_IMS | 1 }
sub AFTER_FRAMESET_IM     () { FRAME_IMS | 2 }
sub IN_SELECT_IM          () { SELECT_IMS | 1 }
sub IN_SELECT_IN_TABLE_IM () { SELECT_IMS | 2 }
sub IN_COLUMN_GROUP_IM    () { 2 }
857
858 ## Implementations MUST act as if they used the state machine described in the spec.
859
## Resets the tokenizer part of the parser object: the state becomes
## the data state, the content model becomes PCDATA, the current
## token, current attribute and related members are cleared, and the
## first input character is requested.
860 sub _initialize_tokenizer ($) {
861 my $self = shift;
862 $self->{state} = DATA_STATE; # MUST
863 #$self->{state_keyword}; # initialized when used
864 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
865 undef $self->{current_token}; # start tag, end tag, comment, or DOCTYPE
866 undef $self->{current_attribute};
867 undef $self->{last_emitted_start_tag_name};
868 undef $self->{last_attribute_value_state};
869 delete $self->{self_closing};
870 $self->{char} = [];
871 # $self->{next_char}
## Preprocessor macro that requests the next input character.
872 !!!next-input-character;
## Queue of tokens that |_get_next_token| returns before it reads
## further input.
873 $self->{token} = [];
874 # $self->{escape}
875 } # _initialize_tokenizer
876
877 ## A token has:
878 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
879 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
880 ## ->{name} (DOCTYPE_TOKEN)
881 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
882 ## ->{public_identifier} (DOCTYPE_TOKEN)
883 ## ->{system_identifier} (DOCTYPE_TOKEN)
884 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
885 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
886 ## ->{name}
887 ## ->{value}
888 ## ->{has_reference} == 1 or 0
889 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
890 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
891 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
892 ## while the token is pushed back to the stack.
893
894 ## Emitted token MUST immediately be handled by the tree construction state.
895
896 ## Before each step, UA MAY check to see if either one of the scripts in
897 ## "list of scripts that will execute as soon as possible" or the first
898 ## script in the "list of scripts that will execute asynchronously",
899 ## has completed loading. If one has, then it MUST be executed
900 ## and removed from the list.
901
902 ## NOTE: HTML5 "Writing HTML documents" section, applied to
903 ## documents and not to user agents and conformance checkers,
904 ## contains some requirements that are not detected by the
905 ## parsing algorithm:
906 ## - Some requirements on character encoding declarations. ## TODO
907 ## - "Elements MUST NOT contain content that their content model disallows."
908 ## ... Some are parse error, some are not (will be reported by c.c.).
909 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
910 ## - Text (in elements, attributes, and comments) SHOULD NOT contain
911 ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
912
913 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
914 ## be detected by the HTML5 parsing algorithm:
915 ## - Text,
916
917 sub _get_next_token ($) {
918 my $self = shift;
919
920 if ($self->{self_closing}) {
921 !!!parse-error (type => 'nestc', token => $self->{current_token});
922 ## NOTE: The |self_closing| flag is only set by start tag token.
923 ## In addition, when a start tag token is emitted, it is always set to
924 ## |current_token|.
925 delete $self->{self_closing};
926 }
927
928 if (@{$self->{token}}) {
929 $self->{self_closing} = $self->{token}->[0]->{self_closing};
930 return shift @{$self->{token}};
931 }
932
933 A: {
934 if ($self->{state} == DATA_STATE) {
935 if ($self->{next_char} == 0x0026) { # &
936 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
937 not $self->{escape}) {
938 !!!cp (1);
939 $self->{state} = ENTITY_DATA_STATE;
940 !!!next-input-character;
941 redo A;
942 } else {
943 !!!cp (2);
944 #
945 }
946 } elsif ($self->{next_char} == 0x002D) { # -
947 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
948 unless ($self->{escape}) {
949 if ($self->{prev_char}->[0] == 0x002D and # -
950 $self->{prev_char}->[1] == 0x0021 and # !
951 $self->{prev_char}->[2] == 0x003C) { # <
952 !!!cp (3);
953 $self->{escape} = 1;
954 } else {
955 !!!cp (4);
956 }
957 } else {
958 !!!cp (5);
959 }
960 }
961
962 #
963 } elsif ($self->{next_char} == 0x003C) { # <
964 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
965 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
966 not $self->{escape})) {
967 !!!cp (6);
968 $self->{state} = TAG_OPEN_STATE;
969 !!!next-input-character;
970 redo A;
971 } else {
972 !!!cp (7);
973 #
974 }
975 } elsif ($self->{next_char} == 0x003E) { # >
976 if ($self->{escape} and
977 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
978 if ($self->{prev_char}->[0] == 0x002D and # -
979 $self->{prev_char}->[1] == 0x002D) { # -
980 !!!cp (8);
981 delete $self->{escape};
982 } else {
983 !!!cp (9);
984 }
985 } else {
986 !!!cp (10);
987 }
988
989 #
990 } elsif ($self->{next_char} == -1) {
991 !!!cp (11);
992 !!!emit ({type => END_OF_FILE_TOKEN,
993 line => $self->{line}, column => $self->{column}});
994 last A; ## TODO: ok?
995 } else {
996 !!!cp (12);
997 }
998 # Anything else
999 my $token = {type => CHARACTER_TOKEN,
1000 data => chr $self->{next_char},
1001 line => $self->{line}, column => $self->{column},
1002 };
1003 ## Stay in the data state
1004 !!!next-input-character;
1005
1006 !!!emit ($token);
1007
1008 redo A;
1009 } elsif ($self->{state} == ENTITY_DATA_STATE) {
1010 ## (cannot happen in CDATA state)
1011
1012 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
1013
1014 my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
1015
1016 $self->{state} = DATA_STATE;
1017 # next-input-character is already done
1018
1019 unless (defined $token) {
1020 !!!cp (13);
1021 !!!emit ({type => CHARACTER_TOKEN, data => '&',
1022 line => $l, column => $c,
1023 });
1024 } else {
1025 !!!cp (14);
1026 !!!emit ($token);
1027 }
1028
1029 redo A;
1030 } elsif ($self->{state} == TAG_OPEN_STATE) {
1031 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1032 if ($self->{next_char} == 0x002F) { # /
1033 !!!cp (15);
1034 !!!next-input-character;
1035 $self->{state} = CLOSE_TAG_OPEN_STATE;
1036 redo A;
1037 } else {
1038 !!!cp (16);
1039 ## reconsume
1040 $self->{state} = DATA_STATE;
1041
1042 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1043 line => $self->{line_prev},
1044 column => $self->{column_prev},
1045 });
1046
1047 redo A;
1048 }
1049 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
1050 if ($self->{next_char} == 0x0021) { # !
1051 !!!cp (17);
1052 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
1053 !!!next-input-character;
1054 redo A;
1055 } elsif ($self->{next_char} == 0x002F) { # /
1056 !!!cp (18);
1057 $self->{state} = CLOSE_TAG_OPEN_STATE;
1058 !!!next-input-character;
1059 redo A;
1060 } elsif (0x0041 <= $self->{next_char} and
1061 $self->{next_char} <= 0x005A) { # A..Z
1062 !!!cp (19);
1063 $self->{current_token}
1064 = {type => START_TAG_TOKEN,
1065 tag_name => chr ($self->{next_char} + 0x0020),
1066 line => $self->{line_prev},
1067 column => $self->{column_prev}};
1068 $self->{state} = TAG_NAME_STATE;
1069 !!!next-input-character;
1070 redo A;
1071 } elsif (0x0061 <= $self->{next_char} and
1072 $self->{next_char} <= 0x007A) { # a..z
1073 !!!cp (20);
1074 $self->{current_token} = {type => START_TAG_TOKEN,
1075 tag_name => chr ($self->{next_char}),
1076 line => $self->{line_prev},
1077 column => $self->{column_prev}};
1078 $self->{state} = TAG_NAME_STATE;
1079 !!!next-input-character;
1080 redo A;
1081 } elsif ($self->{next_char} == 0x003E) { # >
1082 !!!cp (21);
1083 !!!parse-error (type => 'empty start tag',
1084 line => $self->{line_prev},
1085 column => $self->{column_prev});
1086 $self->{state} = DATA_STATE;
1087 !!!next-input-character;
1088
1089 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
1090 line => $self->{line_prev},
1091 column => $self->{column_prev},
1092 });
1093
1094 redo A;
1095 } elsif ($self->{next_char} == 0x003F) { # ?
1096 !!!cp (22);
1097 !!!parse-error (type => 'pio',
1098 line => $self->{line_prev},
1099 column => $self->{column_prev});
1100 $self->{state} = BOGUS_COMMENT_STATE;
1101 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1102 line => $self->{line_prev},
1103 column => $self->{column_prev},
1104 };
1105 ## $self->{next_char} is intentionally left as is
1106 redo A;
1107 } else {
1108 !!!cp (23);
1109 !!!parse-error (type => 'bare stago',
1110 line => $self->{line_prev},
1111 column => $self->{column_prev});
1112 $self->{state} = DATA_STATE;
1113 ## reconsume
1114
1115 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1116 line => $self->{line_prev},
1117 column => $self->{column_prev},
1118 });
1119
1120 redo A;
1121 }
1122 } else {
1123 die "$0: $self->{content_model} in tag open";
1124 }
1125 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
1126 ## NOTE: The "close tag open state" in the spec is implemented as
1127 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_PCDATA_CLOSE_TAG_STATE|.
1128
1129 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
1130 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1131 if (defined $self->{last_emitted_start_tag_name}) {
1132 $self->{state} = CDATA_PCDATA_CLOSE_TAG_STATE;
1133 $self->{state_keyword} = '';
1134 ## Reconsume.
1135 redo A;
1136 } else {
1137 ## No start tag token has ever been emitted
1138 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
1139 !!!cp (28);
1140 $self->{state} = DATA_STATE;
1141 ## Reconsume.
1142 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1143 line => $l, column => $c,
1144 });
1145 redo A;
1146 }
1147 }
1148
1149 if (0x0041 <= $self->{next_char} and
1150 $self->{next_char} <= 0x005A) { # A..Z
1151 !!!cp (29);
1152 $self->{current_token}
1153 = {type => END_TAG_TOKEN,
1154 tag_name => chr ($self->{next_char} + 0x0020),
1155 line => $l, column => $c};
1156 $self->{state} = TAG_NAME_STATE;
1157 !!!next-input-character;
1158 redo A;
1159 } elsif (0x0061 <= $self->{next_char} and
1160 $self->{next_char} <= 0x007A) { # a..z
1161 !!!cp (30);
1162 $self->{current_token} = {type => END_TAG_TOKEN,
1163 tag_name => chr ($self->{next_char}),
1164 line => $l, column => $c};
1165 $self->{state} = TAG_NAME_STATE;
1166 !!!next-input-character;
1167 redo A;
1168 } elsif ($self->{next_char} == 0x003E) { # >
1169 !!!cp (31);
1170 !!!parse-error (type => 'empty end tag',
1171 line => $self->{line_prev}, ## "<" in "</>"
1172 column => $self->{column_prev} - 1);
1173 $self->{state} = DATA_STATE;
1174 !!!next-input-character;
1175 redo A;
1176 } elsif ($self->{next_char} == -1) {
1177 !!!cp (32);
1178 !!!parse-error (type => 'bare etago');
1179 $self->{state} = DATA_STATE;
1180 # reconsume
1181
1182 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1183 line => $l, column => $c,
1184 });
1185
1186 redo A;
1187 } else {
1188 !!!cp (33);
1189 !!!parse-error (type => 'bogus end tag');
1190 $self->{state} = BOGUS_COMMENT_STATE;
1191 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1192 line => $self->{line_prev}, # "<" of "</"
1193 column => $self->{column_prev} - 1,
1194 };
1195 ## NOTE: $self->{next_char} is intentionally left as is.
1196 ## Although the "anything else" case of the spec not explicitly
1197 ## states that the next input character is to be reconsumed,
1198 ## it will be included to the |data| of the comment token
1199 ## generated from the bogus end tag, as defined in the
1200 ## "bogus comment state" entry.
1201 redo A;
1202 }
1203 } elsif ($self->{state} == CDATA_PCDATA_CLOSE_TAG_STATE) {
1204 my $ch = substr $self->{last_emitted_start_tag_name}, length $self->{state_keyword}, 1;
1205 if (length $ch) {
1206 my $CH = $ch;
1207 $ch =~ tr/a-z/A-Z/;
1208 my $nch = chr $self->{next_char};
1209 if ($nch eq $ch or $nch eq $CH) {
1210 !!!cp (24);
1211 ## Stay in the state.
1212 $self->{state_keyword} .= $nch;
1213 !!!next-input-character;
1214 redo A;
1215 } else {
1216 !!!cp (25);
1217 $self->{state} = DATA_STATE;
1218 ## Reconsume.
1219 !!!emit ({type => CHARACTER_TOKEN,
1220 data => '</' . $self->{state_keyword},
1221 line => $self->{line_prev},
1222 column => $self->{column_prev} - 1 - length $self->{state_keyword},
1223 });
1224 redo A;
1225 }
1226 } else { # after "<{tag-name}"
1227 unless ({
1228 0x0009 => 1, # HT
1229 0x000A => 1, # LF
1230 0x000B => 1, # VT
1231 0x000C => 1, # FF
1232 0x0020 => 1, # SP
1233 0x003E => 1, # >
1234 0x002F => 1, # /
1235 -1 => 1, # EOF
1236 }->{$self->{next_char}}) {
1237 !!!cp (26);
1238 ## Reconsume.
1239 $self->{state} = DATA_STATE;
1240 !!!emit ({type => CHARACTER_TOKEN,
1241 data => '</' . $self->{state_keyword},
1242 line => $self->{line_prev},
1243 column => $self->{column_prev} - 1 - length $self->{state_keyword},
1244 });
1245 redo A;
1246 } else {
1247 !!!cp (27);
1248 $self->{current_token}
1249 = {type => END_TAG_TOKEN,
1250 tag_name => $self->{last_emitted_start_tag_name},
1251 line => $self->{line_prev},
1252 column => $self->{column_prev} - 1 - length $self->{state_keyword}};
1253 $self->{state} = TAG_NAME_STATE;
1254 ## Reconsume.
1255 redo A;
1256 }
1257 }
1258 } elsif ($self->{state} == TAG_NAME_STATE) {
1259 if ($self->{next_char} == 0x0009 or # HT
1260 $self->{next_char} == 0x000A or # LF
1261 $self->{next_char} == 0x000B or # VT
1262 $self->{next_char} == 0x000C or # FF
1263 $self->{next_char} == 0x0020) { # SP
1264 !!!cp (34);
1265 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1266 !!!next-input-character;
1267 redo A;
1268 } elsif ($self->{next_char} == 0x003E) { # >
1269 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1270 !!!cp (35);
1271 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1272 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1273 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1274 #if ($self->{current_token}->{attributes}) {
1275 # ## NOTE: This should never be reached.
1276 # !!! cp (36);
1277 # !!! parse-error (type => 'end tag attribute');
1278 #} else {
1279 !!!cp (37);
1280 #}
1281 } else {
1282 die "$0: $self->{current_token}->{type}: Unknown token type";
1283 }
1284 $self->{state} = DATA_STATE;
1285 !!!next-input-character;
1286
1287 !!!emit ($self->{current_token}); # start tag or end tag
1288
1289 redo A;
1290 } elsif (0x0041 <= $self->{next_char} and
1291 $self->{next_char} <= 0x005A) { # A..Z
1292 !!!cp (38);
1293 $self->{current_token}->{tag_name} .= chr ($self->{next_char} + 0x0020);
1294 # start tag or end tag
1295 ## Stay in this state
1296 !!!next-input-character;
1297 redo A;
1298 } elsif ($self->{next_char} == -1) {
1299 !!!parse-error (type => 'unclosed tag');
1300 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1301 !!!cp (39);
1302 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1303 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1304 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1305 #if ($self->{current_token}->{attributes}) {
1306 # ## NOTE: This state should never be reached.
1307 # !!! cp (40);
1308 # !!! parse-error (type => 'end tag attribute');
1309 #} else {
1310 !!!cp (41);
1311 #}
1312 } else {
1313 die "$0: $self->{current_token}->{type}: Unknown token type";
1314 }
1315 $self->{state} = DATA_STATE;
1316 # reconsume
1317
1318 !!!emit ($self->{current_token}); # start tag or end tag
1319
1320 redo A;
1321 } elsif ($self->{next_char} == 0x002F) { # /
1322 !!!cp (42);
1323 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1324 !!!next-input-character;
1325 redo A;
1326 } else {
1327 !!!cp (44);
1328 $self->{current_token}->{tag_name} .= chr $self->{next_char};
1329 # start tag or end tag
1330 ## Stay in the state
1331 !!!next-input-character;
1332 redo A;
1333 }
1334 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1335 if ($self->{next_char} == 0x0009 or # HT
1336 $self->{next_char} == 0x000A or # LF
1337 $self->{next_char} == 0x000B or # VT
1338 $self->{next_char} == 0x000C or # FF
1339 $self->{next_char} == 0x0020) { # SP
1340 !!!cp (45);
1341 ## Stay in the state
1342 !!!next-input-character;
1343 redo A;
1344 } elsif ($self->{next_char} == 0x003E) { # >
1345 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1346 !!!cp (46);
1347 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1348 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1349 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1350 if ($self->{current_token}->{attributes}) {
1351 !!!cp (47);
1352 !!!parse-error (type => 'end tag attribute');
1353 } else {
1354 !!!cp (48);
1355 }
1356 } else {
1357 die "$0: $self->{current_token}->{type}: Unknown token type";
1358 }
1359 $self->{state} = DATA_STATE;
1360 !!!next-input-character;
1361
1362 !!!emit ($self->{current_token}); # start tag or end tag
1363
1364 redo A;
1365 } elsif (0x0041 <= $self->{next_char} and
1366 $self->{next_char} <= 0x005A) { # A..Z
1367 !!!cp (49);
1368 $self->{current_attribute}
1369 = {name => chr ($self->{next_char} + 0x0020),
1370 value => '',
1371 line => $self->{line}, column => $self->{column}};
1372 $self->{state} = ATTRIBUTE_NAME_STATE;
1373 !!!next-input-character;
1374 redo A;
1375 } elsif ($self->{next_char} == 0x002F) { # /
1376 !!!cp (50);
1377 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1378 !!!next-input-character;
1379 redo A;
1380 } elsif ($self->{next_char} == -1) {
1381 !!!parse-error (type => 'unclosed tag');
1382 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1383 !!!cp (52);
1384 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1385 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1386 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1387 if ($self->{current_token}->{attributes}) {
1388 !!!cp (53);
1389 !!!parse-error (type => 'end tag attribute');
1390 } else {
1391 !!!cp (54);
1392 }
1393 } else {
1394 die "$0: $self->{current_token}->{type}: Unknown token type";
1395 }
1396 $self->{state} = DATA_STATE;
1397 # reconsume
1398
1399 !!!emit ($self->{current_token}); # start tag or end tag
1400
1401 redo A;
1402 } else {
1403 if ({
1404 0x0022 => 1, # "
1405 0x0027 => 1, # '
1406 0x003D => 1, # =
1407 }->{$self->{next_char}}) {
1408 !!!cp (55);
1409 !!!parse-error (type => 'bad attribute name');
1410 } else {
1411 !!!cp (56);
1412 }
1413 $self->{current_attribute}
1414 = {name => chr ($self->{next_char}),
1415 value => '',
1416 line => $self->{line}, column => $self->{column}};
1417 $self->{state} = ATTRIBUTE_NAME_STATE;
1418 !!!next-input-character;
1419 redo A;
1420 }
1421 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1422 my $before_leave = sub {
1423 if (exists $self->{current_token}->{attributes} # start tag or end tag
1424 ->{$self->{current_attribute}->{name}}) { # MUST
1425 !!!cp (57);
1426 !!!parse-error (type => 'duplicate attribute', text => $self->{current_attribute}->{name}, line => $self->{current_attribute}->{line}, column => $self->{current_attribute}->{column});
1427 ## Discard $self->{current_attribute} # MUST
1428 } else {
1429 !!!cp (58);
1430 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
1431 = $self->{current_attribute};
1432 }
1433 }; # $before_leave
1434
1435 if ($self->{next_char} == 0x0009 or # HT
1436 $self->{next_char} == 0x000A or # LF
1437 $self->{next_char} == 0x000B or # VT
1438 $self->{next_char} == 0x000C or # FF
1439 $self->{next_char} == 0x0020) { # SP
1440 !!!cp (59);
1441 $before_leave->();
1442 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1443 !!!next-input-character;
1444 redo A;
1445 } elsif ($self->{next_char} == 0x003D) { # =
1446 !!!cp (60);
1447 $before_leave->();
1448 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1449 !!!next-input-character;
1450 redo A;
1451 } elsif ($self->{next_char} == 0x003E) { # >
1452 $before_leave->();
1453 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1454 !!!cp (61);
1455 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1456 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1457 !!!cp (62);
1458 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1459 if ($self->{current_token}->{attributes}) {
1460 !!!parse-error (type => 'end tag attribute');
1461 }
1462 } else {
1463 die "$0: $self->{current_token}->{type}: Unknown token type";
1464 }
1465 $self->{state} = DATA_STATE;
1466 !!!next-input-character;
1467
1468 !!!emit ($self->{current_token}); # start tag or end tag
1469
1470 redo A;
1471 } elsif (0x0041 <= $self->{next_char} and
1472 $self->{next_char} <= 0x005A) { # A..Z
1473 !!!cp (63);
1474 $self->{current_attribute}->{name} .= chr ($self->{next_char} + 0x0020);
1475 ## Stay in the state
1476 !!!next-input-character;
1477 redo A;
1478 } elsif ($self->{next_char} == 0x002F) { # /
1479 !!!cp (64);
1480 $before_leave->();
1481 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1482 !!!next-input-character;
1483 redo A;
1484 } elsif ($self->{next_char} == -1) {
1485 !!!parse-error (type => 'unclosed tag');
1486 $before_leave->();
1487 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1488 !!!cp (66);
1489 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1490 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1491 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1492 if ($self->{current_token}->{attributes}) {
1493 !!!cp (67);
1494 !!!parse-error (type => 'end tag attribute');
1495 } else {
1496 ## NOTE: This state should never be reached.
1497 !!!cp (68);
1498 }
1499 } else {
1500 die "$0: $self->{current_token}->{type}: Unknown token type";
1501 }
1502 $self->{state} = DATA_STATE;
1503 # reconsume
1504
1505 !!!emit ($self->{current_token}); # start tag or end tag
1506
1507 redo A;
1508 } else {
1509 if ($self->{next_char} == 0x0022 or # "
1510 $self->{next_char} == 0x0027) { # '
1511 !!!cp (69);
1512 !!!parse-error (type => 'bad attribute name');
1513 } else {
1514 !!!cp (70);
1515 }
1516 $self->{current_attribute}->{name} .= chr ($self->{next_char});
1517 ## Stay in the state
1518 !!!next-input-character;
1519 redo A;
1520 }
1521 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1522 if ($self->{next_char} == 0x0009 or # HT
1523 $self->{next_char} == 0x000A or # LF
1524 $self->{next_char} == 0x000B or # VT
1525 $self->{next_char} == 0x000C or # FF
1526 $self->{next_char} == 0x0020) { # SP
1527 !!!cp (71);
1528 ## Stay in the state
1529 !!!next-input-character;
1530 redo A;
1531 } elsif ($self->{next_char} == 0x003D) { # =
1532 !!!cp (72);
1533 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1534 !!!next-input-character;
1535 redo A;
1536 } elsif ($self->{next_char} == 0x003E) { # >
1537 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1538 !!!cp (73);
1539 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1540 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1541 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1542 if ($self->{current_token}->{attributes}) {
1543 !!!cp (74);
1544 !!!parse-error (type => 'end tag attribute');
1545 } else {
1546 ## NOTE: This state should never be reached.
1547 !!!cp (75);
1548 }
1549 } else {
1550 die "$0: $self->{current_token}->{type}: Unknown token type";
1551 }
1552 $self->{state} = DATA_STATE;
1553 !!!next-input-character;
1554
1555 !!!emit ($self->{current_token}); # start tag or end tag
1556
1557 redo A;
1558 } elsif (0x0041 <= $self->{next_char} and
1559 $self->{next_char} <= 0x005A) { # A..Z
1560 !!!cp (76);
1561 $self->{current_attribute}
1562 = {name => chr ($self->{next_char} + 0x0020),
1563 value => '',
1564 line => $self->{line}, column => $self->{column}};
1565 $self->{state} = ATTRIBUTE_NAME_STATE;
1566 !!!next-input-character;
1567 redo A;
1568 } elsif ($self->{next_char} == 0x002F) { # /
1569 !!!cp (77);
1570 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1571 !!!next-input-character;
1572 redo A;
1573 } elsif ($self->{next_char} == -1) {
1574 !!!parse-error (type => 'unclosed tag');
1575 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1576 !!!cp (79);
1577 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1578 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1579 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1580 if ($self->{current_token}->{attributes}) {
1581 !!!cp (80);
1582 !!!parse-error (type => 'end tag attribute');
1583 } else {
1584 ## NOTE: This state should never be reached.
1585 !!!cp (81);
1586 }
1587 } else {
1588 die "$0: $self->{current_token}->{type}: Unknown token type";
1589 }
1590 $self->{state} = DATA_STATE;
1591 # reconsume
1592
1593 !!!emit ($self->{current_token}); # start tag or end tag
1594
1595 redo A;
1596 } else {
1597 if ($self->{next_char} == 0x0022 or # "
1598 $self->{next_char} == 0x0027) { # '
1599 !!!cp (78);
1600 !!!parse-error (type => 'bad attribute name');
1601 } else {
1602 !!!cp (82);
1603 }
1604 $self->{current_attribute}
1605 = {name => chr ($self->{next_char}),
1606 value => '',
1607 line => $self->{line}, column => $self->{column}};
1608 $self->{state} = ATTRIBUTE_NAME_STATE;
1609 !!!next-input-character;
1610 redo A;
1611 }
1612 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1613 if ($self->{next_char} == 0x0009 or # HT
1614 $self->{next_char} == 0x000A or # LF
1615 $self->{next_char} == 0x000B or # VT
1616 $self->{next_char} == 0x000C or # FF
1617 $self->{next_char} == 0x0020) { # SP
1618 !!!cp (83);
1619 ## Stay in the state
1620 !!!next-input-character;
1621 redo A;
1622 } elsif ($self->{next_char} == 0x0022) { # "
1623 !!!cp (84);
1624 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1625 !!!next-input-character;
1626 redo A;
1627 } elsif ($self->{next_char} == 0x0026) { # &
1628 !!!cp (85);
1629 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1630 ## reconsume
1631 redo A;
1632 } elsif ($self->{next_char} == 0x0027) { # '
1633 !!!cp (86);
1634 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1635 !!!next-input-character;
1636 redo A;
1637 } elsif ($self->{next_char} == 0x003E) { # >
1638 !!!parse-error (type => 'empty unquoted attribute value');
1639 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1640 !!!cp (87);
1641 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1642 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1643 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1644 if ($self->{current_token}->{attributes}) {
1645 !!!cp (88);
1646 !!!parse-error (type => 'end tag attribute');
1647 } else {
1648 ## NOTE: This state should never be reached.
1649 !!!cp (89);
1650 }
1651 } else {
1652 die "$0: $self->{current_token}->{type}: Unknown token type";
1653 }
1654 $self->{state} = DATA_STATE;
1655 !!!next-input-character;
1656
1657 !!!emit ($self->{current_token}); # start tag or end tag
1658
1659 redo A;
1660 } elsif ($self->{next_char} == -1) {
1661 !!!parse-error (type => 'unclosed tag');
1662 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1663 !!!cp (90);
1664 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1665 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1666 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1667 if ($self->{current_token}->{attributes}) {
1668 !!!cp (91);
1669 !!!parse-error (type => 'end tag attribute');
1670 } else {
1671 ## NOTE: This state should never be reached.
1672 !!!cp (92);
1673 }
1674 } else {
1675 die "$0: $self->{current_token}->{type}: Unknown token type";
1676 }
1677 $self->{state} = DATA_STATE;
1678 ## reconsume
1679
1680 !!!emit ($self->{current_token}); # start tag or end tag
1681
1682 redo A;
1683 } else {
1684 if ($self->{next_char} == 0x003D) { # =
1685 !!!cp (93);
1686 !!!parse-error (type => 'bad attribute value');
1687 } else {
1688 !!!cp (94);
1689 }
1690 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1691 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1692 !!!next-input-character;
1693 redo A;
1694 }
1695 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1696 if ($self->{next_char} == 0x0022) { # "
1697 !!!cp (95);
1698 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1699 !!!next-input-character;
1700 redo A;
1701 } elsif ($self->{next_char} == 0x0026) { # &
1702 !!!cp (96);
1703 $self->{last_attribute_value_state} = $self->{state};
1704 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1705 !!!next-input-character;
1706 redo A;
1707 } elsif ($self->{next_char} == -1) {
1708 !!!parse-error (type => 'unclosed attribute value');
1709 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1710 !!!cp (97);
1711 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1712 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1713 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1714 if ($self->{current_token}->{attributes}) {
1715 !!!cp (98);
1716 !!!parse-error (type => 'end tag attribute');
1717 } else {
1718 ## NOTE: This state should never be reached.
1719 !!!cp (99);
1720 }
1721 } else {
1722 die "$0: $self->{current_token}->{type}: Unknown token type";
1723 }
1724 $self->{state} = DATA_STATE;
1725 ## reconsume
1726
1727 !!!emit ($self->{current_token}); # start tag or end tag
1728
1729 redo A;
1730 } else {
1731 !!!cp (100);
1732 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1733 ## Stay in the state
1734 !!!next-input-character;
1735 redo A;
1736 }
1737 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1738 if ($self->{next_char} == 0x0027) { # '
1739 !!!cp (101);
1740 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1741 !!!next-input-character;
1742 redo A;
1743 } elsif ($self->{next_char} == 0x0026) { # &
1744 !!!cp (102);
1745 $self->{last_attribute_value_state} = $self->{state};
1746 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1747 !!!next-input-character;
1748 redo A;
1749 } elsif ($self->{next_char} == -1) {
1750 !!!parse-error (type => 'unclosed attribute value');
1751 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1752 !!!cp (103);
1753 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1754 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1755 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1756 if ($self->{current_token}->{attributes}) {
1757 !!!cp (104);
1758 !!!parse-error (type => 'end tag attribute');
1759 } else {
1760 ## NOTE: This state should never be reached.
1761 !!!cp (105);
1762 }
1763 } else {
1764 die "$0: $self->{current_token}->{type}: Unknown token type";
1765 }
1766 $self->{state} = DATA_STATE;
1767 ## reconsume
1768
1769 !!!emit ($self->{current_token}); # start tag or end tag
1770
1771 redo A;
1772 } else {
1773 !!!cp (106);
1774 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1775 ## Stay in the state
1776 !!!next-input-character;
1777 redo A;
1778 }
1779 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1780 if ($self->{next_char} == 0x0009 or # HT
1781 $self->{next_char} == 0x000A or # LF
1782 $self->{next_char} == 0x000B or # HT
1783 $self->{next_char} == 0x000C or # FF
1784 $self->{next_char} == 0x0020) { # SP
1785 !!!cp (107);
1786 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1787 !!!next-input-character;
1788 redo A;
1789 } elsif ($self->{next_char} == 0x0026) { # &
1790 !!!cp (108);
1791 $self->{last_attribute_value_state} = $self->{state};
1792 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1793 !!!next-input-character;
1794 redo A;
1795 } elsif ($self->{next_char} == 0x003E) { # >
1796 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1797 !!!cp (109);
1798 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1799 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1800 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1801 if ($self->{current_token}->{attributes}) {
1802 !!!cp (110);
1803 !!!parse-error (type => 'end tag attribute');
1804 } else {
1805 ## NOTE: This state should never be reached.
1806 !!!cp (111);
1807 }
1808 } else {
1809 die "$0: $self->{current_token}->{type}: Unknown token type";
1810 }
1811 $self->{state} = DATA_STATE;
1812 !!!next-input-character;
1813
1814 !!!emit ($self->{current_token}); # start tag or end tag
1815
1816 redo A;
1817 } elsif ($self->{next_char} == -1) {
1818 !!!parse-error (type => 'unclosed tag');
1819 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1820 !!!cp (112);
1821 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1822 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1823 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1824 if ($self->{current_token}->{attributes}) {
1825 !!!cp (113);
1826 !!!parse-error (type => 'end tag attribute');
1827 } else {
1828 ## NOTE: This state should never be reached.
1829 !!!cp (114);
1830 }
1831 } else {
1832 die "$0: $self->{current_token}->{type}: Unknown token type";
1833 }
1834 $self->{state} = DATA_STATE;
1835 ## reconsume
1836
1837 !!!emit ($self->{current_token}); # start tag or end tag
1838
1839 redo A;
1840 } else {
1841 if ({
1842 0x0022 => 1, # "
1843 0x0027 => 1, # '
1844 0x003D => 1, # =
1845 }->{$self->{next_char}}) {
1846 !!!cp (115);
1847 !!!parse-error (type => 'bad attribute value');
1848 } else {
1849 !!!cp (116);
1850 }
1851 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1852 ## Stay in the state
1853 !!!next-input-character;
1854 redo A;
1855 }
1856 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1857 my $token = $self->_tokenize_attempt_to_consume_an_entity
1858 (1,
1859 $self->{last_attribute_value_state}
1860 == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
1861 $self->{last_attribute_value_state}
1862 == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
1863 -1);
1864
1865 unless (defined $token) {
1866 !!!cp (117);
1867 $self->{current_attribute}->{value} .= '&';
1868 } else {
1869 !!!cp (118);
1870 $self->{current_attribute}->{value} .= $token->{data};
1871 $self->{current_attribute}->{has_reference} = $token->{has_reference};
1872 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1873 }
1874
1875 $self->{state} = $self->{last_attribute_value_state};
1876 # next-input-character is already done
1877 redo A;
1878 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1879 if ($self->{next_char} == 0x0009 or # HT
1880 $self->{next_char} == 0x000A or # LF
1881 $self->{next_char} == 0x000B or # VT
1882 $self->{next_char} == 0x000C or # FF
1883 $self->{next_char} == 0x0020) { # SP
1884 !!!cp (118);
1885 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1886 !!!next-input-character;
1887 redo A;
1888 } elsif ($self->{next_char} == 0x003E) { # >
1889 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1890 !!!cp (119);
1891 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1892 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1893 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1894 if ($self->{current_token}->{attributes}) {
1895 !!!cp (120);
1896 !!!parse-error (type => 'end tag attribute');
1897 } else {
1898 ## NOTE: This state should never be reached.
1899 !!!cp (121);
1900 }
1901 } else {
1902 die "$0: $self->{current_token}->{type}: Unknown token type";
1903 }
1904 $self->{state} = DATA_STATE;
1905 !!!next-input-character;
1906
1907 !!!emit ($self->{current_token}); # start tag or end tag
1908
1909 redo A;
1910 } elsif ($self->{next_char} == 0x002F) { # /
1911 !!!cp (122);
1912 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1913 !!!next-input-character;
1914 redo A;
1915 } elsif ($self->{next_char} == -1) {
1916 !!!parse-error (type => 'unclosed tag');
1917 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1918 !!!cp (122.3);
1919 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1920 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1921 if ($self->{current_token}->{attributes}) {
1922 !!!cp (122.1);
1923 !!!parse-error (type => 'end tag attribute');
1924 } else {
1925 ## NOTE: This state should never be reached.
1926 !!!cp (122.2);
1927 }
1928 } else {
1929 die "$0: $self->{current_token}->{type}: Unknown token type";
1930 }
1931 $self->{state} = DATA_STATE;
1932 ## Reconsume.
1933 !!!emit ($self->{current_token}); # start tag or end tag
1934 redo A;
1935 } else {
1936 !!!cp ('124.1');
1937 !!!parse-error (type => 'no space between attributes');
1938 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1939 ## reconsume
1940 redo A;
1941 }
1942 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1943 if ($self->{next_char} == 0x003E) { # >
1944 if ($self->{current_token}->{type} == END_TAG_TOKEN) {
1945 !!!cp ('124.2');
1946 !!!parse-error (type => 'nestc', token => $self->{current_token});
1947 ## TODO: Different type than slash in start tag
1948 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1949 if ($self->{current_token}->{attributes}) {
1950 !!!cp ('124.4');
1951 !!!parse-error (type => 'end tag attribute');
1952 } else {
1953 !!!cp ('124.5');
1954 }
1955 ## TODO: Test |<title></title/>|
1956 } else {
1957 !!!cp ('124.3');
1958 $self->{self_closing} = 1;
1959 }
1960
1961 $self->{state} = DATA_STATE;
1962 !!!next-input-character;
1963
1964 !!!emit ($self->{current_token}); # start tag or end tag
1965
1966 redo A;
1967 } elsif ($self->{next_char} == -1) {
1968 !!!parse-error (type => 'unclosed tag');
1969 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1970 !!!cp (124.7);
1971 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1972 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1973 if ($self->{current_token}->{attributes}) {
1974 !!!cp (124.5);
1975 !!!parse-error (type => 'end tag attribute');
1976 } else {
1977 ## NOTE: This state should never be reached.
1978 !!!cp (124.6);
1979 }
1980 } else {
1981 die "$0: $self->{current_token}->{type}: Unknown token type";
1982 }
1983 $self->{state} = DATA_STATE;
1984 ## Reconsume.
1985 !!!emit ($self->{current_token}); # start tag or end tag
1986 redo A;
1987 } else {
1988 !!!cp ('124.4');
1989 !!!parse-error (type => 'nestc');
1990 ## TODO: This error type is wrong.
1991 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1992 ## Reconsume.
1993 redo A;
1994 }
1995 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1996 ## (only happen if PCDATA state)
1997
1998 ## NOTE: Set by the previous state
1999 #my $token = {type => COMMENT_TOKEN, data => ''};
2000
2001 BC: {
2002 if ($self->{next_char} == 0x003E) { # >
2003 !!!cp (124);
2004 $self->{state} = DATA_STATE;
2005 !!!next-input-character;
2006
2007 !!!emit ($self->{current_token}); # comment
2008
2009 redo A;
2010 } elsif ($self->{next_char} == -1) {
2011 !!!cp (125);
2012 $self->{state} = DATA_STATE;
2013 ## reconsume
2014
2015 !!!emit ($self->{current_token}); # comment
2016
2017 redo A;
2018 } else {
2019 !!!cp (126);
2020 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
2021 !!!next-input-character;
2022 redo BC;
2023 }
2024 } # BC
2025
2026 die "$0: _get_next_token: unexpected case [BC]";
2027 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2028 ## (only happen if PCDATA state)
2029
2030 if ($self->{next_char} == 0x002D) { # -
2031 !!!cp (133);
2032 $self->{state} = MD_HYPHEN_STATE;
2033 !!!next-input-character;
2034 redo A;
2035 } elsif ($self->{next_char} == 0x0044 or # D
2036 $self->{next_char} == 0x0064) { # d
2037 ## ASCII case-insensitive.
2038 !!!cp (130);
2039 $self->{state} = MD_DOCTYPE_STATE;
2040 $self->{state_keyword} = chr $self->{next_char};
2041 !!!next-input-character;
2042 redo A;
2043 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2044 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
2045 $self->{next_char} == 0x005B) { # [
2046 !!!cp (135.4);
2047 $self->{state} = MD_CDATA_STATE;
2048 $self->{state_keyword} = '[';
2049 !!!next-input-character;
2050 redo A;
2051 } else {
2052 !!!cp (136);
2053 }
2054
2055 !!!parse-error (type => 'bogus comment',
2056 line => $self->{line_prev},
2057 column => $self->{column_prev} - 1);
2058 ## Reconsume.
2059 $self->{state} = BOGUS_COMMENT_STATE;
2060 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
2061 line => $self->{line_prev},
2062 column => $self->{column_prev} - 1,
2063 };
2064 redo A;
2065 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2066 if ($self->{next_char} == 0x002D) { # -
2067 !!!cp (127);
2068 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
2069 line => $self->{line_prev},
2070 column => $self->{column_prev} - 2,
2071 };
2072 $self->{state} = COMMENT_START_STATE;
2073 !!!next-input-character;
2074 redo A;
2075 } else {
2076 !!!cp (128);
2077 !!!parse-error (type => 'bogus comment',
2078 line => $self->{line_prev},
2079 column => $self->{column_prev} - 2);
2080 $self->{state} = BOGUS_COMMENT_STATE;
2081 ## Reconsume.
2082 $self->{current_token} = {type => COMMENT_TOKEN,
2083 data => '-',
2084 line => $self->{line_prev},
2085 column => $self->{column_prev} - 2,
2086 };
2087 redo A;
2088 }
2089 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2090 ## ASCII case-insensitive.
2091 if ($self->{next_char} == [
2092 undef,
2093 0x004F, # O
2094 0x0043, # C
2095 0x0054, # T
2096 0x0059, # Y
2097 0x0050, # P
2098 ]->[length $self->{state_keyword}] or
2099 $self->{next_char} == [
2100 undef,
2101 0x006F, # o
2102 0x0063, # c
2103 0x0074, # t
2104 0x0079, # y
2105 0x0070, # p
2106 ]->[length $self->{state_keyword}]) {
2107 !!!cp (131);
2108 ## Stay in the state.
2109 $self->{state_keyword} .= chr $self->{next_char};
2110 !!!next-input-character;
2111 redo A;
2112 } elsif ((length $self->{state_keyword}) == 6 and
2113 ($self->{next_char} == 0x0045 or # E
2114 $self->{next_char} == 0x0065)) { # e
2115 !!!cp (129);
2116 $self->{state} = DOCTYPE_STATE;
2117 $self->{current_token} = {type => DOCTYPE_TOKEN,
2118 quirks => 1,
2119 line => $self->{line_prev},
2120 column => $self->{column_prev} - 7,
2121 };
2122 !!!next-input-character;
2123 redo A;
2124 } else {
2125 !!!cp (132);
2126 !!!parse-error (type => 'bogus comment',
2127 line => $self->{line_prev},
2128 column => $self->{column_prev} - 1 - length $self->{state_keyword});
2129 $self->{state} = BOGUS_COMMENT_STATE;
2130 ## Reconsume.
2131 $self->{current_token} = {type => COMMENT_TOKEN,
2132 data => $self->{state_keyword},
2133 line => $self->{line_prev},
2134 column => $self->{column_prev} - 1 - length $self->{state_keyword},
2135 };
2136 redo A;
2137 }
2138 } elsif ($self->{state} == MD_CDATA_STATE) {
2139 if ($self->{next_char} == {
2140 '[' => 0x0043, # C
2141 '[C' => 0x0044, # D
2142 '[CD' => 0x0041, # A
2143 '[CDA' => 0x0054, # T
2144 '[CDAT' => 0x0041, # A
2145 }->{$self->{state_keyword}}) {
2146 !!!cp (135.1);
2147 ## Stay in the state.
2148 $self->{state_keyword} .= chr $self->{next_char};
2149 !!!next-input-character;
2150 redo A;
2151 } elsif ($self->{state_keyword} eq '[CDATA' and
2152 $self->{next_char} == 0x005B) { # [
2153 !!!cp (135.2);
2154 $self->{state} = CDATA_BLOCK_STATE;
2155 !!!next-input-character;
2156 redo A;
2157 } else {
2158 !!!cp (135.3);
2159 !!!parse-error (type => 'bogus comment',
2160 line => $self->{line_prev},
2161 column => $self->{column_prev} - 1 - length $self->{state_keyword});
2162 $self->{state} = BOGUS_COMMENT_STATE;
2163 ## Reconsume.
2164 $self->{current_token} = {type => COMMENT_TOKEN,
2165 data => $self->{state_keyword},
2166 line => $self->{line_prev},
2167 column => $self->{column_prev} - 1 - length $self->{state_keyword},
2168 };
2169 redo A;
2170 }
2171 } elsif ($self->{state} == COMMENT_START_STATE) {
2172 if ($self->{next_char} == 0x002D) { # -
2173 !!!cp (137);
2174 $self->{state} = COMMENT_START_DASH_STATE;
2175 !!!next-input-character;
2176 redo A;
2177 } elsif ($self->{next_char} == 0x003E) { # >
2178 !!!cp (138);
2179 !!!parse-error (type => 'bogus comment');
2180 $self->{state} = DATA_STATE;
2181 !!!next-input-character;
2182
2183 !!!emit ($self->{current_token}); # comment
2184
2185 redo A;
2186 } elsif ($self->{next_char} == -1) {
2187 !!!cp (139);
2188 !!!parse-error (type => 'unclosed comment');
2189 $self->{state} = DATA_STATE;
2190 ## reconsume
2191
2192 !!!emit ($self->{current_token}); # comment
2193
2194 redo A;
2195 } else {
2196 !!!cp (140);
2197 $self->{current_token}->{data} # comment
2198 .= chr ($self->{next_char});
2199 $self->{state} = COMMENT_STATE;
2200 !!!next-input-character;
2201 redo A;
2202 }
2203 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2204 if ($self->{next_char} == 0x002D) { # -
2205 !!!cp (141);
2206 $self->{state} = COMMENT_END_STATE;
2207 !!!next-input-character;
2208 redo A;
2209 } elsif ($self->{next_char} == 0x003E) { # >
2210 !!!cp (142);
2211 !!!parse-error (type => 'bogus comment');
2212 $self->{state} = DATA_STATE;
2213 !!!next-input-character;
2214
2215 !!!emit ($self->{current_token}); # comment
2216
2217 redo A;
2218 } elsif ($self->{next_char} == -1) {
2219 !!!cp (143);
2220 !!!parse-error (type => 'unclosed comment');
2221 $self->{state} = DATA_STATE;
2222 ## reconsume
2223
2224 !!!emit ($self->{current_token}); # comment
2225
2226 redo A;
2227 } else {
2228 !!!cp (144);
2229 $self->{current_token}->{data} # comment
2230 .= '-' . chr ($self->{next_char});
2231 $self->{state} = COMMENT_STATE;
2232 !!!next-input-character;
2233 redo A;
2234 }
2235 } elsif ($self->{state} == COMMENT_STATE) {
2236 if ($self->{next_char} == 0x002D) { # -
2237 !!!cp (145);
2238 $self->{state} = COMMENT_END_DASH_STATE;
2239 !!!next-input-character;
2240 redo A;
2241 } elsif ($self->{next_char} == -1) {
2242 !!!cp (146);
2243 !!!parse-error (type => 'unclosed comment');
2244 $self->{state} = DATA_STATE;
2245 ## reconsume
2246
2247 !!!emit ($self->{current_token}); # comment
2248
2249 redo A;
2250 } else {
2251 !!!cp (147);
2252 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
2253 ## Stay in the state
2254 !!!next-input-character;
2255 redo A;
2256 }
2257 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2258 if ($self->{next_char} == 0x002D) { # -
2259 !!!cp (148);
2260 $self->{state} = COMMENT_END_STATE;
2261 !!!next-input-character;
2262 redo A;
2263 } elsif ($self->{next_char} == -1) {
2264 !!!cp (149);
2265 !!!parse-error (type => 'unclosed comment');
2266 $self->{state} = DATA_STATE;
2267 ## reconsume
2268
2269 !!!emit ($self->{current_token}); # comment
2270
2271 redo A;
2272 } else {
2273 !!!cp (150);
2274 $self->{current_token}->{data} .= '-' . chr ($self->{next_char}); # comment
2275 $self->{state} = COMMENT_STATE;
2276 !!!next-input-character;
2277 redo A;
2278 }
2279 } elsif ($self->{state} == COMMENT_END_STATE) {
2280 if ($self->{next_char} == 0x003E) { # >
2281 !!!cp (151);
2282 $self->{state} = DATA_STATE;
2283 !!!next-input-character;
2284
2285 !!!emit ($self->{current_token}); # comment
2286
2287 redo A;
2288 } elsif ($self->{next_char} == 0x002D) { # -
2289 !!!cp (152);
2290 !!!parse-error (type => 'dash in comment',
2291 line => $self->{line_prev},
2292 column => $self->{column_prev});
2293 $self->{current_token}->{data} .= '-'; # comment
2294 ## Stay in the state
2295 !!!next-input-character;
2296 redo A;
2297 } elsif ($self->{next_char} == -1) {
2298 !!!cp (153);
2299 !!!parse-error (type => 'unclosed comment');
2300 $self->{state} = DATA_STATE;
2301 ## reconsume
2302
2303 !!!emit ($self->{current_token}); # comment
2304
2305 redo A;
2306 } else {
2307 !!!cp (154);
2308 !!!parse-error (type => 'dash in comment',
2309 line => $self->{line_prev},
2310 column => $self->{column_prev});
2311 $self->{current_token}->{data} .= '--' . chr ($self->{next_char}); # comment
2312 $self->{state} = COMMENT_STATE;
2313 !!!next-input-character;
2314 redo A;
2315 }
2316 } elsif ($self->{state} == DOCTYPE_STATE) {
2317 if ($self->{next_char} == 0x0009 or # HT
2318 $self->{next_char} == 0x000A or # LF
2319 $self->{next_char} == 0x000B or # VT
2320 $self->{next_char} == 0x000C or # FF
2321 $self->{next_char} == 0x0020) { # SP
2322 !!!cp (155);
2323 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2324 !!!next-input-character;
2325 redo A;
2326 } else {
2327 !!!cp (156);
2328 !!!parse-error (type => 'no space before DOCTYPE name');
2329 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2330 ## reconsume
2331 redo A;
2332 }
2333 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2334 if ($self->{next_char} == 0x0009 or # HT
2335 $self->{next_char} == 0x000A or # LF
2336 $self->{next_char} == 0x000B or # VT
2337 $self->{next_char} == 0x000C or # FF
2338 $self->{next_char} == 0x0020) { # SP
2339 !!!cp (157);
2340 ## Stay in the state
2341 !!!next-input-character;
2342 redo A;
2343 } elsif ($self->{next_char} == 0x003E) { # >
2344 !!!cp (158);
2345 !!!parse-error (type => 'no DOCTYPE name');
2346 $self->{state} = DATA_STATE;
2347 !!!next-input-character;
2348
2349 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2350
2351 redo A;
2352 } elsif ($self->{next_char} == -1) {
2353 !!!cp (159);
2354 !!!parse-error (type => 'no DOCTYPE name');
2355 $self->{state} = DATA_STATE;
2356 ## reconsume
2357
2358 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2359
2360 redo A;
2361 } else {
2362 !!!cp (160);
2363 $self->{current_token}->{name} = chr $self->{next_char};
2364 delete $self->{current_token}->{quirks};
2365 ## ISSUE: "Set the token's name name to the" in the spec
2366 $self->{state} = DOCTYPE_NAME_STATE;
2367 !!!next-input-character;
2368 redo A;
2369 }
2370 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2371 ## ISSUE: Redundant "First," in the spec.
2372 if ($self->{next_char} == 0x0009 or # HT
2373 $self->{next_char} == 0x000A or # LF
2374 $self->{next_char} == 0x000B or # VT
2375 $self->{next_char} == 0x000C or # FF
2376 $self->{next_char} == 0x0020) { # SP
2377 !!!cp (161);
2378 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2379 !!!next-input-character;
2380 redo A;
2381 } elsif ($self->{next_char} == 0x003E) { # >
2382 !!!cp (162);
2383 $self->{state} = DATA_STATE;
2384 !!!next-input-character;
2385
2386 !!!emit ($self->{current_token}); # DOCTYPE
2387
2388 redo A;
2389 } elsif ($self->{next_char} == -1) {
2390 !!!cp (163);
2391 !!!parse-error (type => 'unclosed DOCTYPE');
2392 $self->{state} = DATA_STATE;
2393 ## reconsume
2394
2395 $self->{current_token}->{quirks} = 1;
2396 !!!emit ($self->{current_token}); # DOCTYPE
2397
2398 redo A;
2399 } else {
2400 !!!cp (164);
2401 $self->{current_token}->{name}
2402 .= chr ($self->{next_char}); # DOCTYPE
2403 ## Stay in the state
2404 !!!next-input-character;
2405 redo A;
2406 }
2407 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2408 if ($self->{next_char} == 0x0009 or # HT
2409 $self->{next_char} == 0x000A or # LF
2410 $self->{next_char} == 0x000B or # VT
2411 $self->{next_char} == 0x000C or # FF
2412 $self->{next_char} == 0x0020) { # SP
2413 !!!cp (165);
2414 ## Stay in the state
2415 !!!next-input-character;
2416 redo A;
2417 } elsif ($self->{next_char} == 0x003E) { # >
2418 !!!cp (166);
2419 $self->{state} = DATA_STATE;
2420 !!!next-input-character;
2421
2422 !!!emit ($self->{current_token}); # DOCTYPE
2423
2424 redo A;
2425 } elsif ($self->{next_char} == -1) {
2426 !!!cp (167);
2427 !!!parse-error (type => 'unclosed DOCTYPE');
2428 $self->{state} = DATA_STATE;
2429 ## reconsume
2430
2431 $self->{current_token}->{quirks} = 1;
2432 !!!emit ($self->{current_token}); # DOCTYPE
2433
2434 redo A;
2435 } elsif ($self->{next_char} == 0x0050 or # P
2436 $self->{next_char} == 0x0070) { # p
2437 !!!next-input-character;
2438 if ($self->{next_char} == 0x0055 or # U
2439 $self->{next_char} == 0x0075) { # u
2440 !!!next-input-character;
2441 if ($self->{next_char} == 0x0042 or # B
2442 $self->{next_char} == 0x0062) { # b
2443 !!!next-input-character;
2444 if ($self->{next_char} == 0x004C or # L
2445 $self->{next_char} == 0x006C) { # l
2446 !!!next-input-character;
2447 if ($self->{next_char} == 0x0049 or # I
2448 $self->{next_char} == 0x0069) { # i
2449 !!!next-input-character;
2450 if ($self->{next_char} == 0x0043 or # C
2451 $self->{next_char} == 0x0063) { # c
2452 !!!cp (168);
2453 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2454 !!!next-input-character;
2455 redo A;
2456 } else {
2457 !!!cp (169);
2458 }
2459 } else {
2460 !!!cp (170);
2461 }
2462 } else {
2463 !!!cp (171);
2464 }
2465 } else {
2466 !!!cp (172);
2467 }
2468 } else {
2469 !!!cp (173);
2470 }
2471
2472 #
2473 } elsif ($self->{next_char} == 0x0053 or # S
2474 $self->{next_char} == 0x0073) { # s
2475 !!!next-input-character;
2476 if ($self->{next_char} == 0x0059 or # Y
2477 $self->{next_char} == 0x0079) { # y
2478 !!!next-input-character;
2479 if ($self->{next_char} == 0x0053 or # S
2480 $self->{next_char} == 0x0073) { # s
2481 !!!next-input-character;
2482 if ($self->{next_char} == 0x0054 or # T
2483 $self->{next_char} == 0x0074) { # t
2484 !!!next-input-character;
2485 if ($self->{next_char} == 0x0045 or # E
2486 $self->{next_char} == 0x0065) { # e
2487 !!!next-input-character;
2488 if ($self->{next_char} == 0x004D or # M
2489 $self->{next_char} == 0x006D) { # m
2490 !!!cp (174);
2491 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2492 !!!next-input-character;
2493 redo A;
2494 } else {
2495 !!!cp (175);
2496 }
2497 } else {
2498 !!!cp (176);
2499 }
2500 } else {
2501 !!!cp (177);
2502 }
2503 } else {
2504 !!!cp (178);
2505 }
2506 } else {
2507 !!!cp (179);
2508 }
2509
2510 #
2511 } else {
2512 !!!cp (180);
2513 !!!next-input-character;
2514 #
2515 }
2516
2517 !!!parse-error (type => 'string after DOCTYPE name');
2518 $self->{current_token}->{quirks} = 1;
2519
2520 $self->{state} = BOGUS_DOCTYPE_STATE;
2521 # next-input-character is already done
2522 redo A;
2523 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2524 if ({
2525 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2526 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2527 }->{$self->{next_char}}) {
2528 !!!cp (181);
2529 ## Stay in the state
2530 !!!next-input-character;
2531 redo A;
2532 } elsif ($self->{next_char} eq 0x0022) { # "
2533 !!!cp (182);
2534 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2535 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2536 !!!next-input-character;
2537 redo A;
2538 } elsif ($self->{next_char} eq 0x0027) { # '
2539 !!!cp (183);
2540 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2541 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2542 !!!next-input-character;
2543 redo A;
2544 } elsif ($self->{next_char} eq 0x003E) { # >
2545 !!!cp (184);
2546 !!!parse-error (type => 'no PUBLIC literal');
2547
2548 $self->{state} = DATA_STATE;
2549 !!!next-input-character;
2550
2551 $self->{current_token}->{quirks} = 1;
2552 !!!emit ($self->{current_token}); # DOCTYPE
2553
2554 redo A;
2555 } elsif ($self->{next_char} == -1) {
2556 !!!cp (185);
2557 !!!parse-error (type => 'unclosed DOCTYPE');
2558
2559 $self->{state} = DATA_STATE;
2560 ## reconsume
2561
2562 $self->{current_token}->{quirks} = 1;
2563 !!!emit ($self->{current_token}); # DOCTYPE
2564
2565 redo A;
2566 } else {
2567 !!!cp (186);
2568 !!!parse-error (type => 'string after PUBLIC');
2569 $self->{current_token}->{quirks} = 1;
2570
2571 $self->{state} = BOGUS_DOCTYPE_STATE;
2572 !!!next-input-character;
2573 redo A;
2574 }
2575 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2576 if ($self->{next_char} == 0x0022) { # "
2577 !!!cp (187);
2578 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2579 !!!next-input-character;
2580 redo A;
2581 } elsif ($self->{next_char} == 0x003E) { # >
2582 !!!cp (188);
2583 !!!parse-error (type => 'unclosed PUBLIC literal');
2584
2585 $self->{state} = DATA_STATE;
2586 !!!next-input-character;
2587
2588 $self->{current_token}->{quirks} = 1;
2589 !!!emit ($self->{current_token}); # DOCTYPE
2590
2591 redo A;
2592 } elsif ($self->{next_char} == -1) {
2593 !!!cp (189);
2594 !!!parse-error (type => 'unclosed PUBLIC literal');
2595
2596 $self->{state} = DATA_STATE;
2597 ## reconsume
2598
2599 $self->{current_token}->{quirks} = 1;
2600 !!!emit ($self->{current_token}); # DOCTYPE
2601
2602 redo A;
2603 } else {
2604 !!!cp (190);
2605 $self->{current_token}->{public_identifier} # DOCTYPE
2606 .= chr $self->{next_char};
2607 ## Stay in the state
2608 !!!next-input-character;
2609 redo A;
2610 }
2611 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2612 if ($self->{next_char} == 0x0027) { # '
2613 !!!cp (191);
2614 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2615 !!!next-input-character;
2616 redo A;
2617 } elsif ($self->{next_char} == 0x003E) { # >
2618 !!!cp (192);
2619 !!!parse-error (type => 'unclosed PUBLIC literal');
2620
2621 $self->{state} = DATA_STATE;
2622 !!!next-input-character;
2623
2624 $self->{current_token}->{quirks} = 1;
2625 !!!emit ($self->{current_token}); # DOCTYPE
2626
2627 redo A;
2628 } elsif ($self->{next_char} == -1) {
2629 !!!cp (193);
2630 !!!parse-error (type => 'unclosed PUBLIC literal');
2631
2632 $self->{state} = DATA_STATE;
2633 ## reconsume
2634
2635 $self->{current_token}->{quirks} = 1;
2636 !!!emit ($self->{current_token}); # DOCTYPE
2637
2638 redo A;
2639 } else {
2640 !!!cp (194);
2641 $self->{current_token}->{public_identifier} # DOCTYPE
2642 .= chr $self->{next_char};
2643 ## Stay in the state
2644 !!!next-input-character;
2645 redo A;
2646 }
2647 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2648 if ({
2649 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2650 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2651 }->{$self->{next_char}}) {
2652 !!!cp (195);
2653 ## Stay in the state
2654 !!!next-input-character;
2655 redo A;
2656 } elsif ($self->{next_char} == 0x0022) { # "
2657 !!!cp (196);
2658 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2659 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2660 !!!next-input-character;
2661 redo A;
2662 } elsif ($self->{next_char} == 0x0027) { # '
2663 !!!cp (197);
2664 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2665 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2666 !!!next-input-character;
2667 redo A;
2668 } elsif ($self->{next_char} == 0x003E) { # >
2669 !!!cp (198);
2670 $self->{state} = DATA_STATE;
2671 !!!next-input-character;
2672
2673 !!!emit ($self->{current_token}); # DOCTYPE
2674
2675 redo A;
2676 } elsif ($self->{next_char} == -1) {
2677 !!!cp (199);
2678 !!!parse-error (type => 'unclosed DOCTYPE');
2679
2680 $self->{state} = DATA_STATE;
2681 ## reconsume
2682
2683 $self->{current_token}->{quirks} = 1;
2684 !!!emit ($self->{current_token}); # DOCTYPE
2685
2686 redo A;
2687 } else {
2688 !!!cp (200);
2689 !!!parse-error (type => 'string after PUBLIC literal');
2690 $self->{current_token}->{quirks} = 1;
2691
2692 $self->{state} = BOGUS_DOCTYPE_STATE;
2693 !!!next-input-character;
2694 redo A;
2695 }
2696 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2697 if ({
2698 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2699 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2700 }->{$self->{next_char}}) {
2701 !!!cp (201);
2702 ## Stay in the state
2703 !!!next-input-character;
2704 redo A;
2705 } elsif ($self->{next_char} == 0x0022) { # "
2706 !!!cp (202);
2707 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2708 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2709 !!!next-input-character;
2710 redo A;
2711 } elsif ($self->{next_char} == 0x0027) { # '
2712 !!!cp (203);
2713 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2714 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2715 !!!next-input-character;
2716 redo A;
2717 } elsif ($self->{next_char} == 0x003E) { # >
2718 !!!cp (204);
2719 !!!parse-error (type => 'no SYSTEM literal');
2720 $self->{state} = DATA_STATE;
2721 !!!next-input-character;
2722
2723 $self->{current_token}->{quirks} = 1;
2724 !!!emit ($self->{current_token}); # DOCTYPE
2725
2726 redo A;
2727 } elsif ($self->{next_char} == -1) {
2728 !!!cp (205);
2729 !!!parse-error (type => 'unclosed DOCTYPE');
2730
2731 $self->{state} = DATA_STATE;
2732 ## reconsume
2733
2734 $self->{current_token}->{quirks} = 1;
2735 !!!emit ($self->{current_token}); # DOCTYPE
2736
2737 redo A;
2738 } else {
2739 !!!cp (206);
2740 !!!parse-error (type => 'string after SYSTEM');
2741 $self->{current_token}->{quirks} = 1;
2742
2743 $self->{state} = BOGUS_DOCTYPE_STATE;
2744 !!!next-input-character;
2745 redo A;
2746 }
2747 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2748 if ($self->{next_char} == 0x0022) { # "
2749 !!!cp (207);
2750 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2751 !!!next-input-character;
2752 redo A;
2753 } elsif ($self->{next_char} == 0x003E) { # >
2754 !!!cp (208);
2755 !!!parse-error (type => 'unclosed SYSTEM literal');
2756
2757 $self->{state} = DATA_STATE;
2758 !!!next-input-character;
2759
2760 $self->{current_token}->{quirks} = 1;
2761 !!!emit ($self->{current_token}); # DOCTYPE
2762
2763 redo A;
2764 } elsif ($self->{next_char} == -1) {
2765 !!!cp (209);
2766 !!!parse-error (type => 'unclosed SYSTEM literal');
2767
2768 $self->{state} = DATA_STATE;
2769 ## reconsume
2770
2771 $self->{current_token}->{quirks} = 1;
2772 !!!emit ($self->{current_token}); # DOCTYPE
2773
2774 redo A;
2775 } else {
2776 !!!cp (210);
2777 $self->{current_token}->{system_identifier} # DOCTYPE
2778 .= chr $self->{next_char};
2779 ## Stay in the state
2780 !!!next-input-character;
2781 redo A;
2782 }
2783 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2784 if ($self->{next_char} == 0x0027) { # '
2785 !!!cp (211);
2786 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2787 !!!next-input-character;
2788 redo A;
2789 } elsif ($self->{next_char} == 0x003E) { # >
2790 !!!cp (212);
2791 !!!parse-error (type => 'unclosed SYSTEM literal');
2792
2793 $self->{state} = DATA_STATE;
2794 !!!next-input-character;
2795
2796 $self->{current_token}->{quirks} = 1;
2797 !!!emit ($self->{current_token}); # DOCTYPE
2798
2799 redo A;
2800 } elsif ($self->{next_char} == -1) {
2801 !!!cp (213);
2802 !!!parse-error (type => 'unclosed SYSTEM literal');
2803
2804 $self->{state} = DATA_STATE;
2805 ## reconsume
2806
2807 $self->{current_token}->{quirks} = 1;
2808 !!!emit ($self->{current_token}); # DOCTYPE
2809
2810 redo A;
2811 } else {
2812 !!!cp (214);
2813 $self->{current_token}->{system_identifier} # DOCTYPE
2814 .= chr $self->{next_char};
2815 ## Stay in the state
2816 !!!next-input-character;
2817 redo A;
2818 }
2819 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2820 if ({
2821 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2822 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2823 }->{$self->{next_char}}) {
2824 !!!cp (215);
2825 ## Stay in the state
2826 !!!next-input-character;
2827 redo A;
2828 } elsif ($self->{next_char} == 0x003E) { # >
2829 !!!cp (216);
2830 $self->{state} = DATA_STATE;
2831 !!!next-input-character;
2832
2833 !!!emit ($self->{current_token}); # DOCTYPE
2834
2835 redo A;
2836 } elsif ($self->{next_char} == -1) {
2837 !!!cp (217);
2838 !!!parse-error (type => 'unclosed DOCTYPE');
2839 $self->{state} = DATA_STATE;
2840 ## reconsume
2841
2842 $self->{current_token}->{quirks} = 1;
2843 !!!emit ($self->{current_token}); # DOCTYPE
2844
2845 redo A;
2846 } else {
2847 !!!cp (218);
2848 !!!parse-error (type => 'string after SYSTEM literal');
2849 #$self->{current_token}->{quirks} = 1;
2850
2851 $self->{state} = BOGUS_DOCTYPE_STATE;
2852 !!!next-input-character;
2853 redo A;
2854 }
2855 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2856 if ($self->{next_char} == 0x003E) { # >
2857 !!!cp (219);
2858 $self->{state} = DATA_STATE;
2859 !!!next-input-character;
2860
2861 !!!emit ($self->{current_token}); # DOCTYPE
2862
2863 redo A;
2864 } elsif ($self->{next_char} == -1) {
2865 !!!cp (220);
2866 !!!parse-error (type => 'unclosed DOCTYPE');
2867 $self->{state} = DATA_STATE;
2868 ## reconsume
2869
2870 !!!emit ($self->{current_token}); # DOCTYPE
2871
2872 redo A;
2873 } else {
2874 !!!cp (221);
2875 ## Stay in the state
2876 !!!next-input-character;
2877 redo A;
2878 }
2879 } elsif ($self->{state} == CDATA_BLOCK_STATE) {
2880 my $s = '';
2881
2882 my ($l, $c) = ($self->{line}, $self->{column});
2883
2884 CS: while ($self->{next_char} != -1) {
2885 if ($self->{next_char} == 0x005D) { # ]
2886 !!!next-input-character;
2887 if ($self->{next_char} == 0x005D) { # ]
2888 !!!next-input-character;
2889 MDC: {
2890 if ($self->{next_char} == 0x003E) { # >
2891 !!!cp (221.1);
2892 !!!next-input-character;
2893 last CS;
2894 } elsif ($self->{next_char} == 0x005D) { # ]
2895 !!!cp (221.2);
2896 $s .= ']';
2897 !!!next-input-character;
2898 redo MDC;
2899 } else {
2900 !!!cp (221.3);
2901 $s .= ']]';
2902 #
2903 }
2904 } # MDC
2905 } else {
2906 !!!cp (221.4);
2907 $s .= ']';
2908 #
2909 }
2910 } else {
2911 !!!cp (221.5);
2912 #
2913 }
2914 $s .= chr $self->{next_char};
2915 !!!next-input-character;
2916 } # CS
2917
2918 $self->{state} = DATA_STATE;
2919 ## next-input-character done or EOF, which is reconsumed.
2920
2921 if (length $s) {
2922 !!!cp (221.6);
2923 !!!emit ({type => CHARACTER_TOKEN, data => $s,
2924 line => $l, column => $c});
2925 } else {
2926 !!!cp (221.7);
2927 }
2928
2929 redo A;
2930
2931 ## ISSUE: "text tokens" in spec.
2932 ## TODO: Streaming support
2933 } else {
2934 die "$0: $self->{state}: Unknown state";
2935 }
2936 } # A
2937
2938 die "$0: _get_next_token: unexpected case";
2939 } # _get_next_token
2940
## Try to consume a character reference starting at the current input
## character (|$self->{next_char}|).
##
## Arguments:
##   $in_attr    - When true, a named reference that matched without a
##                 trailing ";" and was then followed by more name
##                 characters is returned as literal text ("&" + name)
##                 rather than being expanded.
##   $additional - An extra code point that, like white space, "<" and
##                 "&", means "this is not a reference".  May be |undef|.
##
## Returns a hash reference describing a |CHARACTER_TOKEN| (with
## |has_reference| set when a reference was actually resolved), or
## |undef| when no token is produced.
sub _tokenize_attempt_to_consume_an_entity ($$$) {
  my ($self, $in_attr, $additional) = @_;

  ## Position attached to every token and parse error reported below.
  my ($ref_line, $ref_column) = ($self->{line_prev}, $self->{column_prev});

  ## Characters after which nothing is consumed and no error is
  ## reported.  NOTE: CR (0x000D) is not in this list.
  my %not_a_reference = (
    0x0009 => 1, # HT
    0x000A => 1, # LF
    0x000B => 1, # VT
    0x000C => 1, # FF
    0x0020 => 1, # SP
    0x003C => 1, # <
    0x0026 => 1, # &
    -1 => 1,     # presumably the end-of-input marker
    $additional => 1,
  );

  if ($not_a_reference{$self->{next_char}}) {
    !!!cp (1001);
    ## Don't consume
    ## No error
    return undef;
  } elsif ($self->{next_char} == 0x0023) { # #
    !!!next-input-character;
    if ($self->{next_char} == 0x0058 or   # X
        $self->{next_char} == 0x0078) {   # x
      ## Hexadecimal numeric character reference.
      my $code_point; # stays undefined until the first digit is seen
      HEX: {
        ## On the first pass this is the "x"/"X" itself; it is only
        ## needed for pushing back when no digit follows.
        my $previous_char = $self->{next_char};
        !!!next-input-character;
        if (0x0030 <= $self->{next_char} and
            $self->{next_char} <= 0x0039) { # 0..9
          !!!cp (1002);
          $code_point = ($code_point || 0) * 0x10
              + ($self->{next_char} - 0x0030);
          redo HEX;
        } elsif (0x0061 <= $self->{next_char} and
                 $self->{next_char} <= 0x0066) { # a..f
          !!!cp (1003);
          $code_point = ($code_point || 0) * 0x10
              + ($self->{next_char} - 0x0061 + 10);
          redo HEX;
        } elsif (0x0041 <= $self->{next_char} and
                 $self->{next_char} <= 0x0046) { # A..F
          !!!cp (1004);
          $code_point = ($code_point || 0) * 0x10
              + ($self->{next_char} - 0x0041 + 10);
          redo HEX;
        } elsif (not defined $code_point) { # no hexadecimal digit
          !!!cp (1005);
          !!!parse-error (type => 'bare hcro', line => $ref_line, column => $ref_column);
          !!!back-next-input-character ($previous_char, $self->{next_char});
          $self->{next_char} = 0x0023; # #
          return undef;
        } elsif ($self->{next_char} == 0x003B) { # ;
          !!!cp (1006);
          !!!next-input-character;
        } else {
          !!!cp (1007);
          !!!parse-error (type => 'no refc', line => $ref_line, column => $ref_column);
        }
      } # HEX

      ## Replace code points that must not be produced as-is.
      if ($code_point == 0 or
          (0xD800 <= $code_point and $code_point <= 0xDFFF)) {
        !!!cp (1008);
        !!!parse-error (type => 'invalid character reference',
                        text => (sprintf 'U+%04X', $code_point),
                        line => $ref_line, column => $ref_column);
        $code_point = 0xFFFD;
      } elsif ($code_point > 0x10FFFF) {
        !!!cp (1009);
        !!!parse-error (type => 'invalid character reference',
                        text => (sprintf 'U-%08X', $code_point),
                        line => $ref_line, column => $ref_column);
        $code_point = 0xFFFD;
      } elsif ($code_point == 0x000D) {
        !!!cp (1010);
        !!!parse-error (type => 'CR character reference', line => $ref_line, column => $ref_column);
        $code_point = 0x000A;
      } elsif (0x80 <= $code_point and $code_point <= 0x9F) {
        !!!cp (1011);
        !!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code_point), line => $ref_line, column => $ref_column);
        $code_point = $c1_entity_char->{$code_point};
      }

      return {
        type => CHARACTER_TOKEN,
        data => chr $code_point,
        has_reference => 1,
        line => $ref_line,
        column => $ref_column,
      };
    } elsif (0x0030 <= $self->{next_char} and
             $self->{next_char} <= 0x0039) { # 0..9
      ## Decimal numeric character reference.
      my $code_point = $self->{next_char} - 0x0030;
      !!!next-input-character;

      while (0x0030 <= $self->{next_char} and
             $self->{next_char} <= 0x0039) { # 0..9
        !!!cp (1012);
        $code_point = $code_point * 10 + ($self->{next_char} - 0x0030);

        !!!next-input-character;
      }

      if ($self->{next_char} == 0x003B) { # ;
        !!!cp (1013);
        !!!next-input-character;
      } else {
        !!!cp (1014);
        !!!parse-error (type => 'no refc', line => $ref_line, column => $ref_column);
      }

      ## Replace code points that must not be produced as-is.
      if ($code_point == 0 or
          (0xD800 <= $code_point and $code_point <= 0xDFFF)) {
        !!!cp (1015);
        !!!parse-error (type => 'invalid character reference',
                        text => (sprintf 'U+%04X', $code_point),
                        line => $ref_line, column => $ref_column);
        $code_point = 0xFFFD;
      } elsif ($code_point > 0x10FFFF) {
        !!!cp (1016);
        !!!parse-error (type => 'invalid character reference',
                        text => (sprintf 'U-%08X', $code_point),
                        line => $ref_line, column => $ref_column);
        $code_point = 0xFFFD;
      } elsif ($code_point == 0x000D) {
        !!!cp (1017);
        !!!parse-error (type => 'CR character reference',
                        line => $ref_line, column => $ref_column);
        $code_point = 0x000A;
      } elsif (0x80 <= $code_point and $code_point <= 0x9F) {
        !!!cp (1018);
        !!!parse-error (type => 'C1 character reference',
                        text => (sprintf 'U+%04X', $code_point),
                        line => $ref_line, column => $ref_column);
        $code_point = $c1_entity_char->{$code_point};
      }

      return {
        type => CHARACTER_TOKEN,
        data => chr $code_point,
        has_reference => 1,
        line => $ref_line,
        column => $ref_column,
      };
    } else {
      ## "&#" followed by neither "x"/"X" nor a decimal digit.
      !!!cp (1019);
      !!!parse-error (type => 'bare nero', line => $ref_line, column => $ref_column);
      !!!back-next-input-character ($self->{next_char});
      $self->{next_char} = 0x0023; # #
      return undef;
    }
  } elsif ((0x0041 <= $self->{next_char} and
            $self->{next_char} <= 0x005A) or   # A..Z
           (0x0061 <= $self->{next_char} and
            $self->{next_char} <= 0x007A)) {   # a..z
    ## Named character reference.
    my $name = chr $self->{next_char};
    !!!next-input-character;

    ## |$text| is what gets emitted: the replacement of the longest
    ## name matched so far followed by any characters read after it.
    my $text = $name;
    ## |$match_state|: 0 = nothing matched; 1 = matched including ";";
    ## -1 = matched without ";" at the most recent character; doubled
    ## for every character read after such a match (so |< -1| means
    ## "matched, then more name characters followed").
    my $match_state = 0;
    require Whatpm::_NamedEntityList;
    our $EntityChar;

    ## NOTE: 30 is some number greater than the maximum length of
    ## entity name.
    while (length ($name) < 30 and
           ((0x0041 <= $self->{next_char} and
             $self->{next_char} <= 0x005A) or   # A..Z
            (0x0061 <= $self->{next_char} and
             $self->{next_char} <= 0x007A) or   # a..z
            (0x0030 <= $self->{next_char} and
             $self->{next_char} <= 0x0039) or   # 0..9
            $self->{next_char} == 0x003B)) {    # ;
      $name .= chr $self->{next_char};
      if (defined $EntityChar->{$name}) {
        if ($self->{next_char} == 0x003B) { # ;
          !!!cp (1020);
          $text = $EntityChar->{$name};
          $match_state = 1;
          !!!next-input-character;
          last;
        } else {
          !!!cp (1021);
          $text = $EntityChar->{$name};
          $match_state = -1;
          !!!next-input-character;
        }
      } else {
        !!!cp (1022);
        $text .= chr $self->{next_char};
        $match_state *= 2;
        !!!next-input-character;
      }
    }

    if ($match_state > 0) {
      !!!cp (1023);
      return {
        type => CHARACTER_TOKEN,
        data => $text,
        has_reference => 1,
        line => $ref_line,
        column => $ref_column,
      };
    } elsif ($match_state < 0) {
      !!!parse-error (type => 'no refc', line => $ref_line, column => $ref_column);
      if ($in_attr and $match_state < -1) {
        !!!cp (1024);
        return {
          type => CHARACTER_TOKEN,
          data => '&'.$name,
          line => $ref_line,
          column => $ref_column,
        };
      } else {
        !!!cp (1025);
        return {
          type => CHARACTER_TOKEN,
          data => $text,
          has_reference => 1,
          line => $ref_line,
          column => $ref_column,
        };
      }
    } else {
      !!!cp (1026);
      !!!parse-error (type => 'bare ero', line => $ref_line, column => $ref_column);
      ## NOTE: "No characters are consumed" in the spec.
      return {
        type => CHARACTER_TOKEN,
        data => '&'.$text,
        line => $ref_line,
        column => $ref_column,
      };
    }
  } else {
    !!!cp (1027);
    ## no characters are consumed
    !!!parse-error (type => 'bare ero', line => $ref_line, column => $ref_column);
    return undef;
  }
} # _tokenize_attempt_to_consume_an_entity
3158
## Prepare |$self->{document}| for tree construction: strict error
## checking is switched off, the document is flagged as HTML, and the
## source position user data is initialised to line 1, column 1.
##
## NOTE: $self->{document} MUST be specified before this method is called
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  $document->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $document->manakai_is_html (1); # MUST
  $document->set_user_data (manakai_source_line => 1);
  $document->set_user_data (manakai_source_column => 1);
} # _initialize_tree_constructor
3169
## Undo the relaxation made for tree construction by turning strict
## error checking on |$self->{document}| back on.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  $document->strict_error_checking (1);
  ## TODO: Turn mutation events on
} # _terminate_tree_constructor
3175
3176 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
3177
3178 { # tree construction stage
3179 my $token;
3180
## Entry point of the tree construction stage.  Fetches the first
## token, resets the per-parse tree construction state, and then runs
## the insertion-mode handlers in order.
sub _construct_tree ($) {
  my $self = shift;

  ## When an interactive UA renders the $self->{document} available
  ## to the user, or when it begins accepting user input, is not
  ## defined.

  ## Append a character: collect it and all subsequent consecutive
  ## characters and insert one Text node whose data is the
  ## concatenation of all those characters. # MUST

  !!!next-token;

  ## Start from a clean state: no form/head element pointers, an empty
  ## stack of open elements, and no inner HTML context node.
  $self->{form_element} = undef;
  $self->{head_element} = undef;
  $self->{open_elements} = [];
  $self->{inner_html_node} = undef;

  ## NOTE: The "initial" insertion mode.
  $self->_tree_construction_initial; # MUST

  ## NOTE: The "before html" insertion mode.
  $self->_tree_construction_root_element;
  $self->{insertion_mode} = BEFORE_HEAD_IM;

  ## NOTE: The "before head" insertion mode and so on.
  $self->_tree_construction_main;
} # _construct_tree
3209
## Handle tokens in the "initial" insertion mode.
##
## Works on the file-scoped |$token|.  Leading comments are appended to
## the document and leading white space is dropped.  A DOCTYPE token
## creates the document type node and selects the document's compat
## mode ("quirks" / "limited quirks" / unchanged); any other token
## reports 'no DOCTYPE' and forces quirks mode.  The sub returns as soon
## as the "before html" insertion mode should take over; in the
## non-DOCTYPE cases |$token| is left in place to be reprocessed there.
sub _tree_construction_initial ($) {
  my $self = shift;

  ## NOTE: "initial" insertion mode

  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/; # ASCII case-insensitive
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{system_identifier}) {
        !!!cp ('t1');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif ($doctype_name ne 'HTML') {
        !!!cp ('t2');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif (defined $token->{public_identifier}) {
        if ($token->{public_identifier} eq 'XSLT-compat') {
          !!!cp ('t1.2');
          !!!parse-error (type => 'XSLT-compat', token => $token,
                          level => $self->{level}->{should});
        } else {
          !!!parse-error (type => 'not HTML5', token => $token);
        }
      } else {
        !!!cp ('t3');
        #
      }

      my $doctype = $self->{document}->create_document_type_definition
        ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      ## NOTE: Default value for both |public_id| and |system_id| attributes
      ## are empty strings, so that we don't set any value in missing cases.
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if ($token->{quirks} or $doctype_name ne 'HTML') {
        !!!cp ('t4');
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## Public identifiers are compared ASCII case-insensitively, so
        ## the identifier is upper-cased and the lists below are written
        ## in upper case.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        ## Public identifier prefixes that select quirks mode.
        my $prefix = [
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//",
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//IETF//DTD HTML 2.0 LEVEL 1//",
          "-//IETF//DTD HTML 2.0 LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT//",
          "-//IETF//DTD HTML 2.0//",
          "-//IETF//DTD HTML 2.1E//",
          "-//IETF//DTD HTML 3.0//",
          "-//IETF//DTD HTML 3.2 FINAL//",
          "-//IETF//DTD HTML 3.2//",
          "-//IETF//DTD HTML 3//",
          "-//IETF//DTD HTML LEVEL 0//",
          "-//IETF//DTD HTML LEVEL 1//",
          "-//IETF//DTD HTML LEVEL 2//",
          "-//IETF//DTD HTML LEVEL 3//",
          "-//IETF//DTD HTML STRICT LEVEL 0//",
          "-//IETF//DTD HTML STRICT LEVEL 1//",
          "-//IETF//DTD HTML STRICT LEVEL 2//",
          "-//IETF//DTD HTML STRICT LEVEL 3//",
          "-//IETF//DTD HTML STRICT//",
          "-//IETF//DTD HTML//",
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//",
          "-//NETSCAPE COMM. CORP.//DTD HTML//",
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//",
          "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//",
          "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//",
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//",
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//",
          "-//W3C//DTD HTML 3 1995-03-24//",
          "-//W3C//DTD HTML 3.2 DRAFT//",
          "-//W3C//DTD HTML 3.2 FINAL//",
          "-//W3C//DTD HTML 3.2//",
          "-//W3C//DTD HTML 3.2S DRAFT//",
          "-//W3C//DTD HTML 4.0 FRAMESET//",
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//",
          "-//W3C//DTD HTML EXPERIMETNAL 19960712//",
          "-//W3C//DTD HTML EXPERIMENTAL 970421//",
          "-//W3C//DTD W3 HTML//",
          "-//W3O//DTD W3 HTML 3.0//",
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//",
          "-//WEBTECHS//DTD MOZILLA HTML//",
        ]; # $prefix
        my $match;
        for my $quirks_prefix (@$prefix) {
          ## Compare against the public identifier itself (not against
          ## the list reference), otherwise no prefix can ever match.
          if (substr ($pubid, 0, length $quirks_prefix) eq $quirks_prefix) {
            $match = 1;
            last;
          }
        }
        if ($match or
            $pubid eq "-//W3O//DTD W3 HTML STRICT 3.0//EN//" or
            $pubid eq "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" or
            $pubid eq "HTML") {
          !!!cp ('t5');
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid =~ m[^-//W3C//DTD HTML 4\.01 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD HTML 4\.01 TRANSITIONAL//]) {
          ## HTML 4.01 Frameset/Transitional: quirks when the system
          ## identifier is missing, limited quirks when it is present.
          unless (defined $token->{system_identifier}) {
            !!!cp ('t6');
            $self->{document}->manakai_compat_mode ('quirks');
          } else {
            !!!cp ('t7');
            $self->{document}->manakai_compat_mode ('limited quirks');
          }
        } elsif ($pubid =~ m[^-//W3C//DTD XHTML 1\.0 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD XHTML 1\.0 TRANSITIONAL//]) {
          !!!cp ('t8');
          $self->{document}->manakai_compat_mode ('limited quirks');
        } else {
          !!!cp ('t9');
        }
      } else {
        !!!cp ('t10');
      }
      if (defined $token->{system_identifier}) {
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          ## NOTE: Ensure that |PUBLIC "(limited quirks)" "(quirks)"| is
          ## marked as quirks.
          $self->{document}->manakai_compat_mode ('quirks');
          !!!cp ('t11');
        } else {
          !!!cp ('t12');
        }
      } else {
        !!!cp ('t13');
      }

      ## Go to the "before html" insertion mode.
      !!!next-token;
      return;
    } elsif ({
              START_TAG_TOKEN, 1,
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t14');
      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      !!!ack-later;
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          !!!cp ('t15');
          ## Stay in the insertion mode.
          !!!next-token;
          redo INITIAL;
        } else {
          !!!cp ('t16');
        }
      } else {
        !!!cp ('t17');
      }

      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t18');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the insertion mode.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL

  die "$0: _tree_construction_initial: This should be never reached";
} # _tree_construction_initial
3416
## Handle tokens in the "before html" insertion mode.
##
## Works on the file-scoped |$token|.  Stray DOCTYPE tokens are
## reported and dropped, comments are appended to the document, and
## leading white space is discarded.  An |html| start tag creates the
## root element from that token; any other token causes an implied
## |html| root element to be created and the token to be left for
## reprocessing.  In both cases the root element is pushed onto
## |$self->{open_elements}| and the
## |$self->{application_cache_selection}| callback is invoked exactly
## once (with the |manifest| attribute value, or |undef|).
sub _tree_construction_root_element ($) {
  my $self = shift;

  ## NOTE: "before html" insertion mode.

  B: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      !!!cp ('t19');
      !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
      ## Ignore the token
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t20');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token.

        unless (length $token->{data}) {
          !!!cp ('t21');
          ## Stay in the insertion mode.
          !!!next-token;
          redo B;
        } else {
          !!!cp ('t22');
        }
      } else {
        !!!cp ('t23');
      }

      ## NOTE: The application cache selection callback is not invoked
      ## here; the shared "anything else" code at the end of this block
      ## invokes it once, after the implied root element is created.

      #
    } elsif ($token->{type} == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html') {
        my $root_element;
        !!!create-element ($root_element, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}},
            [$root_element, $el_category->{html}];

        if ($token->{attributes}->{manifest}) {
          !!!cp ('t24');
          $self->{application_cache_selection}
              ->($token->{attributes}->{manifest}->{value});
          ## ISSUE: Spec is unclear on relative references.
          ## According to Hixie (#whatwg 2008-03-19), it should be
          ## resolved against the base URI of the document in HTML
          ## or xml:base of the element in XHTML.
        } else {
          !!!cp ('t25');
          $self->{application_cache_selection}->(undef);
        }

        !!!nack ('t25c');

        !!!next-token;
        return; ## Go to the "before head" insertion mode.
      } else {
        !!!cp ('t25.1');
        #
      }
    } elsif ({
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t26');
      #
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## "Anything else": create the implied |html| root element.
    my $root_element;
    !!!create-element ($root_element, $HTML_NS, 'html',, $token);
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, $el_category->{html}];

    $self->{application_cache_selection}->(undef);

    ## NOTE: Reprocess the token.
    !!!ack-later;
    return; ## Go to the "before head" insertion mode.

    ## ISSUE: There is an issue in the spec
  } # B

  die "$0: _tree_construction_root_element: This should never be reached";
} # _tree_construction_root_element
3511
## Reset |$self->{insertion_mode}| from the stack of open elements.
##
## Walks |$self->{open_elements}| from the last entry towards the
## first.  When the first entry is reached, |$self->{inner_html_node}|
## is examined in its place; the sub dies if that is not defined.  The
## insertion mode is chosen from the first examined node that
## determines one (foreign element, table cell, one of the local names
## in the table below, or the |html| element), falling back to
## |IN_BODY_IM| once the first stack entry has been examined.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        !!!cp ('t28');
        $node = $self->{inner_html_node};
      } else {
        die "_reset_insertion_mode: t27";
      }
    }

    ## Step 4..14
    my $new_mode;
    if ($node->[1] & FOREIGN_EL) {
      !!!cp ('t28.1');
      ## NOTE: Strictly speaking, the line below only applies to MathML and
      ## SVG elements.  Currently the HTML syntax supports only MathML and
      ## SVG elements as foreigners.
      $new_mode = IN_BODY_IM | IN_FOREIGN_CONTENT_IM;
    } elsif ($node->[1] & TABLE_CELL_EL) {
      if ($last) {
        !!!cp ('t28.2');
        #
      } else {
        !!!cp ('t28.3');
        $new_mode = IN_CELL_IM;
      }
    } else {
      !!!cp ('t28.4');
      $new_mode = {
        select => IN_SELECT_IM,
        ## NOTE: |option| and |optgroup| do not set
        ## insertion mode to "in select" by themselves.
        tr => IN_ROW_IM,
        tbody => IN_TABLE_BODY_IM,
        thead => IN_TABLE_BODY_IM,
        tfoot => IN_TABLE_BODY_IM,
        caption => IN_CAPTION_IM,
        colgroup => IN_COLUMN_GROUP_IM,
        table => IN_TABLE_IM,
        head => IN_BODY_IM, # not in head!
        body => IN_BODY_IM,
        frameset => IN_FRAMESET_IM,
      }->{$node->[0]->manakai_local_name};
    }
    ## Return whenever a mode was selected, independent of whether the
    ## selected mode constant happens to be a true value.
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 15
    if ($node->[1] & HTML_EL) {
      unless (defined $self->{head_element}) {
        !!!cp ('t29');
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        ## ISSUE: Can this state be reached?
        !!!cp ('t30');
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    } else {
      !!!cp ('t31');
    }

    ## Step 16
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 17
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 18
    redo S3;
  } # S3

  die "$0: _reset_insertion_mode: This line should never be reached";
} # _reset_insertion_mode
3598
3599 sub _tree_construction_main ($) {
3600 my $self = shift;
3601
3602 my $active_formatting_elements = [];
3603
3604 my $reconstruct_active_formatting_elements = sub { # MUST
3605 my $insert = shift;
3606
3607 ## Step 1
3608 return unless @$active_formatting_elements;
3609
3610 ## Step 3
3611 my $i = -1;
3612 my $entry = $active_formatting_elements->[$i];
3613
3614 ## Step 2
3615 return if $entry->[0] eq '#marker';
3616 for (@{$self->{open_elements}}) {
3617 if ($entry->[0] eq $_->[0]) {
3618 !!!cp ('t32');
3619 return;
3620 }
3621 }
3622
3623 S4: {
3624 ## Step 4
3625 last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
3626
3627 ## Step 5
3628 $i--;
3629 $entry = $active_formatting_elements->[$i];
3630
3631 ## Step 6
3632 if ($entry->[0] eq '#marker') {
3633 !!!cp ('t33_1');
3634 #
3635 } else {
3636 my $in_open_elements;
3637 OE: for (@{$self->{open_elements}}) {
3638 if ($entry->[0] eq $_->[0]) {
3639 !!!cp ('t33');
3640 $in_open_elements = 1;
3641 last OE;
3642 }
3643 }
3644 if ($in_open_elements) {
3645 !!!cp ('t34');
3646 #
3647 } else {
3648 ## NOTE: <!DOCTYPE HTML><p><b><i><u></p> <p>X
3649 !!!cp ('t35');
3650 redo S4;
3651 }
3652 }
3653
3654 ## Step 7
3655 $i++;
3656 $entry = $active_formatting_elements->[$i];
3657 } # S4
3658
3659 S7: {
3660 ## Step 8
3661 my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
3662
3663 ## Step 9
3664 $insert->($clone->[0]);
3665 push @{$self->{open_elements}}, $clone;
3666
3667 ## Step 10
3668 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
3669
3670 ## Step 11
3671 unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
3672 !!!cp ('t36');
3673 ## Step 7'
3674 $i++;
3675 $entry = $active_formatting_elements->[$i];
3676
3677 redo S7;
3678 }
3679
3680 !!!cp ('t37');
3681 } # S7
3682 }; # $reconstruct_active_formatting_elements
3683
3684 my $clear_up_to_marker = sub {
3685 for (reverse 0..$#$active_formatting_elements) {
3686 if ($active_formatting_elements->[$_]->[0] eq '#marker') {
3687 !!!cp ('t38');
3688 splice @$active_formatting_elements, $_;
3689 return;
3690 }
3691 }
3692
3693 !!!cp ('t39');
3694 }; # $clear_up_to_marker
3695
3696 my $insert;
3697
3698 my $parse_rcdata = sub ($) {
3699 my ($content_model_flag) = @_;
3700
3701 ## Step 1
3702 my $start_tag_name = $token->{tag_name};
3703 my $el;
3704 !!!create-element ($el, $HTML_NS, $start_tag_name, $token->{attributes}, $token);
3705
3706 ## Step 2
3707 $insert->($el);
3708
3709 ## Step 3
3710 $self->{content_model} = $content_model_flag; # CDATA or RCDATA
3711 delete $self->{escape}; # MUST
3712
3713 ## Step 4
3714 my $text = '';
3715 !!!nack ('t40.1');
3716 !!!next-token;
3717 while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
3718 !!!cp ('t40');
3719 $text .= $token->{data};
3720 !!!next-token;
3721 }
3722
3723 ## Step 5
3724 if (length $text) {
3725 !!!cp ('t41');
3726 my $text = $self->{document}->create_text_node ($text);
3727 $el->append_child ($text);
3728 }
3729
3730 ## Step 6
3731 $self->{content_model} = PCDATA_CONTENT_MODEL;
3732
3733 ## Step 7
3734 if ($token->{type} == END_TAG_TOKEN and
3735 $token->{tag_name} eq $start_tag_name) {
3736 !!!cp ('t42');
3737 ## Ignore the token
3738 } else {
3739 ## NOTE: An end-of-file token.
3740 if ($content_model_flag == CDATA_CONTENT_MODEL) {
3741 !!!cp ('t43');
3742 !!!parse-error (type => 'in CDATA:#eof', token => $token);
3743 } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
3744 !!!cp ('t44');
3745 !!!parse-error (type => 'in RCDATA:#eof', token => $token);
3746 } else {
3747 die "$0: $content_model_flag in parse_rcdata";
3748 }
3749 }
3750 !!!next-token;
3751 }; # $parse_rcdata
3752
3753 my $script_start_tag = sub () {
3754 my $script_el;
3755 !!!create-element ($script_el, $HTML_NS, 'script', $token->{attributes}, $token);
3756 ## TODO: mark as "parser-inserted"
3757
3758 $self->{content_model} = CDATA_CONTENT_MODEL;
3759 delete $self->{escape}; # MUST
3760
3761 my $text = '';
3762 !!!nack ('t45.1');
3763 !!!next-token;
3764 while ($token->{type} == CHARACTER_TOKEN) {
3765 !!!cp ('t45');
3766 $text .= $token->{data};
3767 !!!next-token;
3768 } # stop if non-character token or tokenizer stops tokenising
3769 if (length $text) {
3770 !!!cp ('t46');
3771 $script_el->manakai_append_text ($text);
3772 }
3773
3774 $self->{content_model} = PCDATA_CONTENT_MODEL;
3775
3776 if ($token->{type} == END_TAG_TOKEN and
3777 $token->{tag_name} eq 'script') {
3778 !!!cp ('t47');
3779 ## Ignore the token
3780 } else {
3781 !!!cp ('t48');
3782 !!!parse-error (type => 'in CDATA:#eof', token => $token);
3783 ## ISSUE: And ignore?
3784 ## TODO: mark as "already executed"
3785 }
3786
3787 if (defined $self->{inner_html_node}) {
3788 !!!cp ('t49');
3789 ## TODO: mark as "already executed"
3790 } else {
3791 !!!cp ('t50');
3792 ## TODO: $old_insertion_point = current insertion point
3793 ## TODO: insertion point = just before the next input character
3794
3795 $insert->($script_el);
3796
3797 ## TODO: insertion point = $old_insertion_point (might be "undefined")
3798
3799 ## TODO: if there is a script that will execute as soon as the parser resume, then...
3800 }
3801
3802 !!!next-token;
3803 }; # $script_start_tag
3804
3805 ## NOTE: $open_tables->[-1]->[0] is the "current table" element node.
3806 ## NOTE: $open_tables->[-1]->[1] is the "tainted" flag.
3807 my $open_tables = [[$self->{open_elements}->[0]->[0]]];
3808
3809 my $formatting_end_tag = sub {
3810 my $end_tag_token = shift;
3811 my $tag_name = $end_tag_token->{tag_name};
3812
3813 ## NOTE: The adoption agency algorithm (AAA).
3814
3815 FET: {
3816 ## Step 1
3817 my $formatting_element;
3818 my $formatting_element_i_in_active;
3819 AFE: for (reverse 0..$#$active_formatting_elements) {
3820 if ($active_formatting_elements->[$_]->[0] eq '#marker') {
3821 !!!cp ('t52');
3822 last AFE;
3823 } elsif ($active_formatting_elements->[$_]->[0]->manakai_local_name
3824 eq $tag_name) {
3825 !!!cp ('t51');
3826 $formatting_element = $active_formatting_elements->[$_];
3827 $formatting_element_i_in_active = $_;
3828 last AFE;
3829 }
3830 } # AFE
3831 unless (defined $formatting_element) {
3832 !!!cp ('t53');
3833 !!!parse-error (type => 'unmatched end tag', text => $tag_name, token => $end_tag_token);
3834 ## Ignore the token
3835 !!!next-token;
3836 return;
3837 }
3838 ## has an element in scope
3839 my $in_scope = 1;
3840 my $formatting_element_i_in_open;
3841 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3842 my $node = $self->{open_elements}->[$_];
3843 if ($node->[0] eq $formatting_element->[0]) {
3844 if ($in_scope) {
3845 !!!cp ('t54');
3846 $formatting_element_i_in_open = $_;
3847 last INSCOPE;
3848 } else { # in open elements but not in scope
3849 !!!cp ('t55');
3850 !!!parse-error (type => 'unmatched end tag',
3851 text => $token->{tag_name},
3852 token => $end_tag_token);
3853 ## Ignore the token
3854 !!!next-token;
3855 return;
3856 }
3857 } elsif ($node->[1] & SCOPING_EL) {
3858 !!!cp ('t56');
3859 $in_scope = 0;
3860 }
3861 } # INSCOPE
3862 unless (defined $formatting_element_i_in_open) {
3863 !!!cp ('t57');
3864 !!!parse-error (type => 'unmatched end tag',
3865 text => $token->{tag_name},
3866 token => $end_tag_token);
3867 pop @$active_formatting_elements; # $formatting_element
3868 !!!next-token; ## TODO: ok?
3869 return;
3870 }
3871 if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
3872 !!!cp ('t58');
3873 !!!parse-error (type => 'not closed',
3874 text => $self->{open_elements}->[-1]->[0]
3875 ->manakai_local_name,
3876 token => $end_tag_token);
3877 }
3878
3879 ## Step 2
3880 my $furthest_block;
3881 my $furthest_block_i_in_open;
3882 OE: for (reverse 0..$#{$self->{open_elements}}) {
3883 my $node = $self->{open_elements}->[$_];
3884 if (not ($node->[1] & FORMATTING_EL) and
3885 #not $phrasing_category->{$node->[1]} and
3886 ($node->[1] & SPECIAL_EL or
3887 $node->[1] & SCOPING_EL)) { ## Scoping is redundant, maybe
3888 !!!cp ('t59');
3889 $furthest_block = $node;
3890 $furthest_block_i_in_open = $_;
3891 } elsif ($node->[0] eq $formatting_element->[0]) {
3892 !!!cp ('t60');
3893 last OE;
3894 }
3895 } # OE
3896
3897 ## Step 3
3898 unless (defined $furthest_block) { # MUST
3899 !!!cp ('t61');
3900 splice @{$self->{open_elements}}, $formatting_element_i_in_open;
3901 splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
3902 !!!next-token;
3903 return;
3904 }
3905
3906 ## Step 4
3907 my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];
3908
3909 ## Step 5
3910 my $furthest_block_parent = $furthest_block->[0]->parent_node;
3911 if (defined $furthest_block_parent) {
3912 !!!cp ('t62');
3913 $furthest_block_parent->remove_child ($furthest_block->[0]);
3914 }
3915
3916 ## Step 6
3917 my $bookmark_prev_el
3918 = $active_formatting_elements->[$formatting_element_i_in_active - 1]
3919 ->[0];
3920
3921 ## Step 7
3922 my $node = $furthest_block;
3923 my $node_i_in_open = $furthest_block_i_in_open;
3924 my $last_node = $furthest_block;
3925 S7: {
3926 ## Step 1
3927 $node_i_in_open--;
3928 $node = $self->{open_elements}->[$node_i_in_open];
3929
3930 ## Step 2
3931 my $node_i_in_active;
3932 S7S2: {
3933 for (reverse 0..$#$active_formatting_elements) {
3934 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
3935 !!!cp ('t63');
3936 $node_i_in_active = $_;
3937 last S7S2;
3938 }
3939 }
3940 splice @{$self->{open_elements}}, $node_i_in_open, 1;
3941 redo S7;
3942 } # S7S2
3943
3944 ## Step 3
3945 last S7 if $node->[0] eq $formatting_element->[0];
3946
3947 ## Step 4
3948 if ($last_node->[0] eq $furthest_block->[0]) {
3949 !!!cp ('t64');
3950 $bookmark_prev_el = $node->[0];
3951 }
3952
3953 ## Step 5
3954 if ($node->[0]->has_child_nodes ()) {
3955 !!!cp ('t65');
3956 my $clone = [$node->[0]->clone_node (0), $node->[1]];
3957 $active_formatting_elements->[$node_i_in_active] = $clone;
3958 $self->{open_elements}->[$node_i_in_open] = $clone;
3959 $node = $clone;
3960 }
3961
3962 ## Step 6
3963 $node->[0]->append_child ($last_node->[0]);
3964
3965 ## Step 7
3966 $last_node = $node;
3967
3968 ## Step 8
3969 redo S7;
3970 } # S7
3971
3972 ## Step 8
3973 if ($common_ancestor_node->[1] & TABLE_ROWS_EL) {
3974 my $foster_parent_element;
3975 my $next_sibling;
3976 OE: for (reverse 0..$#{$self->{open_elements}}) {
3977 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
3978 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3979 if (defined $parent and $parent->node_type == 1) {
3980 !!!cp ('t65.1');
3981 $foster_parent_element = $parent;
3982 $next_sibling = $self->{open_elements}->[$_]->[0];
3983 } else {
3984 !!!cp ('t65.2');
3985 $foster_parent_element
3986 = $self->{open_elements}->[$_ - 1]->[0];
3987 }
3988 last OE;
3989 }
3990 } # OE
3991 $foster_parent_element = $self->{open_elements}->[0]->[0]
3992 unless defined $foster_parent_element;
3993 $foster_parent_element->insert_before ($last_node->[0], $next_sibling);
3994 $open_tables->[-1]->[1] = 1; # tainted
3995 } else {
3996 !!!cp ('t65.3');
3997 $common_ancestor_node->[0]->append_child ($last_node->[0]);
3998 }
3999
4000 ## Step 9
4001 my $clone = [$formatting_element->[0]->clone_node (0),
4002 $formatting_element->[1]];
4003
4004 ## Step 10
4005 my @cn = @{$furthest_block->[0]->child_nodes};
4006 $clone->[0]->append_child ($_) for @cn;
4007
4008 ## Step 11
4009 $furthest_block->[0]->append_child ($clone->[0]);
4010
4011 ## Step 12
4012 my $i;
4013 AFE: for (reverse 0..$#$active_formatting_elements) {
4014 if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
4015 !!!cp ('t66');
4016 splice @$active_formatting_elements, $_, 1;
4017 $i-- and last AFE if defined $i;
4018 } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
4019 !!!cp ('t67');
4020 $i = $_;
4021 }
4022 } # AFE
4023 splice @$active_formatting_elements, $i + 1, 0, $clone;
4024
4025 ## Step 13
4026 undef $i;
4027 OE: for (reverse 0..$#{$self->{open_elements}}) {
4028 if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
4029 !!!cp ('t68');
4030 splice @{$self->{open_elements}}, $_, 1;
4031 $i-- and last OE if defined $i;
4032 } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
4033 !!!cp ('t69');
4034 $i = $_;
4035 }
4036 } # OE
4037 splice @{$self->{open_elements}}, $i + 1, 1, $clone;
4038
4039 ## Step 14
4040 redo FET;
4041 } # FET
4042 }; # $formatting_end_tag
4043
## Plain inserter: appends the node given as the first argument as a
## child of the element at the top of the stack of open elements.
## The same closure is also stored in |$insert|.
$insert = my $insert_to_current = sub {
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($_[0]);
}; # $insert_to_current
4047
## Inserter that foster-parents the node when necessary.
##
## Argument: the node to insert.
##
## If the current node (top of the stack of open elements) is not
## flagged |TABLE_ROWS_EL|, the node is simply appended to it.
## Otherwise the stack is searched from the top for the nearest
## |TABLE_EL| entry and the node is inserted relative to that table,
## after which the last entry of |$open_tables| is marked as tainted.
my $insert_to_foster = sub {
  my ($child) = @_;
  my $stack = $self->{open_elements};

  ## Ordinary case: append to the current node.
  unless ($stack->[-1]->[1] & TABLE_ROWS_EL) {
    !!!cp ('t72');
    return $stack->[-1]->[0]->append_child ($child);
  }

  # MUST
  my ($foster_parent_element, $next_sibling);
  OE: for my $idx (reverse 0 .. $#$stack) {
    my $entry = $stack->[$idx];
    next OE unless $entry->[1] & TABLE_EL;

    my $table = $entry->[0];
    my $parent = $table->parent_node;
    if (defined $parent and $parent->node_type == 1) {
      ## The table has a parent with node type 1 (presumably an
      ## element node): insert into that parent, just before the table.
      !!!cp ('t70');
      ($foster_parent_element, $next_sibling) = ($parent, $table);
    } else {
      ## Otherwise use the stack entry immediately below the table;
      ## |$next_sibling| stays undefined.
      !!!cp ('t71');
      $foster_parent_element = $stack->[$idx - 1]->[0];
    }
    last OE;
  } # OE

  ## No table on the stack: fall back to the bottommost stack entry.
  if (not defined $foster_parent_element) {
    $foster_parent_element = $stack->[0]->[0];
  }

  $foster_parent_element->insert_before ($child, $next_sibling);
  $open_tables->[-1]->[1] = 1; # tainted
}; # $insert_to_foster
4079
4080 B: while (1) {
4081 if ($token->{type} == DOCTYPE_TOKEN) {
4082 !!!cp ('t73');
4083 !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
4084 ## Ignore the token
4085 ## Stay in the phase
4086 !!!next-token;
4087 next B;
4088 } elsif ($token->{type} == START_TAG_TOKEN and
4089 $token->{tag_name} eq 'html') {
4090 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4091 !!!cp ('t79');
4092 !!!parse-error (type => 'after html', text => 'html', token => $token);
4093 $self->{insertion_mode} = AFTER_BODY_IM;
4094 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4095 !!!cp ('t80');
4096 !!!parse-error (type => 'after html', text => 'html', token => $token);
4097 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4098 } else {
4099 !!!cp ('t81');
4100 }
4101
4102 !!!cp ('t82');
4103 !!!parse-error (type => 'not first start tag', token => $token);
4104 my $top_el = $self->{open_elements}->[0]->[0];
4105 for my $attr_name (keys %{$token->{attributes}}) {
4106 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
4107 !!!cp ('t84');
4108 $top_el->set_attribute_ns
4109 (undef, [undef, $attr_name],
4110 $token->{attributes}->{$attr_name}->{value});
4111 }
4112 }
4113 !!!nack ('t84.1');
4114 !!!next-token;
4115 next B;
4116 } elsif ($token->{type} == COMMENT_TOKEN) {
4117 my $comment = $self->{document}->create_comment ($token->{data});
4118 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
4119 !!!cp ('t85');
4120 $self->{document}->append_child ($comment);
4121 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
4122 !!!cp ('t86');
4123 $self->{open_elements}->[0]->[0]->append_child ($comment);
4124 } else {
4125 !!!cp ('t87');
4126 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4127 }
4128 !!!next-token;
4129 next B;
4130 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
4131 if ($token->{type} == CHARACTER_TOKEN) {
4132 !!!cp ('t87.1');
4133 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4134 !!!next-token;
4135 next B;
4136 } elsif ($token->{type} == START_TAG_TOKEN) {
4137 if ((not {mglyph => 1, malignmark => 1}->{$token->{tag_name}} and
4138 $self->{open_elements}->[-1]->[1] & FOREIGN_FLOW_CONTENT_EL) or
4139 not ($self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
4140 ($token->{tag_name} eq 'svg' and
4141 $self->{open_elements}->[-1]->[1] & MML_AXML_EL)) {
4142 ## NOTE: "using the rules for secondary insertion mode" then "continue"
4143 !!!cp ('t87.2');
4144 #
4145 } elsif ({
4146 b => 1, big => 1, blockquote => 1, body => 1, br => 1,
4147 center => 1, code => 1, dd => 1, div => 1, dl => 1, dt => 1,
4148 em => 1, embed => 1, font => 1, h1 => 1, h2 => 1, h3 => 1,
4149 h4 => 1, h5 => 1, h6 => 1, head => 1, hr => 1, i => 1,
4150 img => 1, li => 1, listing => 1, menu => 1, meta => 1,
4151 nobr => 1, ol => 1, p => 1, pre => 1, ruby => 1, s => 1,
4152 small => 1, span => 1, strong => 1, strike => 1, sub => 1,
4153 sup => 1, table => 1, tt => 1, u => 1, ul => 1, var => 1,
4154 }->{$token->{tag_name}}) {
4155 !!!cp ('t87.2');
4156 !!!parse-error (type => 'not closed',
4157 text => $self->{open_elements}->[-1]->[0]
4158 ->manakai_local_name,
4159 token => $token);
4160
4161 pop @{$self->{open_elements}}
4162 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4163
4164 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4165 ## Reprocess.
4166 next B;
4167 } else {
4168 my $nsuri = $self->{open_elements}->[-1]->[0]->namespace_uri;
4169 my $tag_name = $token->{tag_name};
4170 if ($nsuri eq $SVG_NS) {
4171 $tag_name = {
4172 altglyph => 'altGlyph',
4173 altglyphdef => 'altGlyphDef',
4174 altglyphitem => 'altGlyphItem',
4175 animatecolor => 'animateColor',
4176 animatemotion => 'animateMotion',
4177 animatetransform => 'animateTransform',
4178 clippath => 'clipPath',
4179 feblend => 'feBlend',
4180 fecolormatrix => 'feColorMatrix',
4181 fecomponenttransfer => 'feComponentTransfer',
4182 fecomposite => 'feComposite',
4183 feconvolvematrix => 'feConvolveMatrix',
4184 fediffuselighting => 'feDiffuseLighting',
4185 fedisplacementmap => 'feDisplacementMap',
4186 fedistantlight => 'feDistantLight',
4187 feflood => 'feFlood',
4188 fefunca => 'feFuncA',
4189 fefuncb => 'feFuncB',
4190 fefuncg => 'feFuncG',
4191 fefuncr => 'feFuncR',
4192 fegaussianblur => 'feGaussianBlur',
4193 feimage => 'feImage',
4194 femerge => 'feMerge',
4195 femergenode => 'feMergeNode',
4196 femorphology => 'feMorphology',
4197 feoffset => 'feOffset',
4198 fepointlight => 'fePointLight',
4199 fespecularlighting => 'feSpecularLighting',
4200 fespotlight => 'feSpotLight',
4201 fetile => 'feTile',
4202 feturbulence => 'feTurbulence',
4203 foreignobject => 'foreignObject',
4204 glyphref => 'glyphRef',
4205 lineargradient => 'linearGradient',
4206 radialgradient => 'radialGradient',
4207 #solidcolor => 'solidColor', ## NOTE: Commented in spec (SVG1.2)
4208 textpath => 'textPath',
4209 }->{$tag_name} || $tag_name;
4210 }
4211
4212 ## "adjust SVG attributes" (SVG only) - done in insert-element-f
4213
4214 ## "adjust foreign attributes" - done in insert-element-f
4215
4216 !!!insert-element-f ($nsuri, $tag_name, $token->{attributes}, $token);
4217
4218 if ($self->{self_closing}) {
4219 pop @{$self->{open_elements}};
4220 !!!ack ('t87.3');
4221 } else {
4222 !!!cp ('t87.4');
4223 }
4224
4225 !!!next-token;
4226 next B;
4227 }
4228 } elsif ($token->{type} == END_TAG_TOKEN) {
4229 ## NOTE: "using the rules for secondary insertion mode" then "continue"
4230 !!!cp ('t87.5');
4231 #
4232 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4233 !!!cp ('t87.6');
4234 !!!parse-error (type => 'not closed',
4235 text => $self->{open_elements}->[-1]->[0]
4236 ->manakai_local_name,
4237 token => $token);
4238
4239 pop @{$self->{open_elements}}
4240 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4241
4242 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4243 ## Reprocess.
4244 next B;
4245 } else {
4246 die "$0: $token->{type}: Unknown token type";
4247 }
4248 }
4249
4250 if ($self->{insertion_mode} & HEAD_IMS) {
4251 if ($token->{type} == CHARACTER_TOKEN) {
4252 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4253 unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4254 !!!cp ('t88.2');
4255 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4256 } else {
4257 !!!cp ('t88.1');
4258 ## Ignore the token.
4259 !!!next-token;
4260 next B;
4261 }
4262 unless (length $token->{data}) {
4263 !!!cp ('t88');
4264 !!!next-token;
4265 next B;
4266 }
4267 }
4268
4269 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4270 !!!cp ('t89');
4271 ## As if <head>
4272 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4273 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4274 push @{$self->{open_elements}},
4275 [$self->{head_element}, $el_category->{head}];
4276
4277 ## Reprocess in the "in head" insertion mode...
4278 pop @{$self->{open_elements}};
4279
4280 ## Reprocess in the "after head" insertion mode...
4281 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4282 !!!cp ('t90');
4283 ## As if </noscript>
4284 pop @{$self->{open_elements}};
4285 !!!parse-error (type => 'in noscript:#text', token => $token);
4286
4287 ## Reprocess in the "in head" insertion mode...
4288 ## As if </head>
4289 pop @{$self->{open_elements}};
4290
4291 ## Reprocess in the "after head" insertion mode...
4292 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4293 !!!cp ('t91');
4294 pop @{$self->{open_elements}};
4295
4296 ## Reprocess in the "after head" insertion mode...
4297 } else {
4298 !!!cp ('t92');
4299 }
4300
4301 ## "after head" insertion mode
4302 ## As if <body>
4303 !!!insert-element ('body',, $token);
4304 $self->{insertion_mode} = IN_BODY_IM;
4305 ## reprocess
4306 next B;
4307 } elsif ($token->{type} == START_TAG_TOKEN) {
4308 if ($token->{tag_name} eq 'head') {
4309 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4310 !!!cp ('t93');
4311 !!!create-element ($self->{head_element}, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
4312 $self->{open_elements}->[-1]->[0]->append_child
4313 ($self->{head_element});
4314 push @{$self->{open_elements}},
4315 [$self->{head_element}, $el_category->{head}];
4316 $self->{insertion_mode} = IN_HEAD_IM;
4317 !!!nack ('t93.1');
4318 !!!next-token;
4319 next B;
4320 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4321 !!!cp ('t93.2');
4322 !!!parse-error (type => 'after head', text => 'head',
4323 token => $token);
4324 ## Ignore the token
4325 !!!nack ('t93.3');
4326 !!!next-token;
4327 next B;
4328 } else {
4329 !!!cp ('t95');
4330 !!!parse-error (type => 'in head:head',
4331 token => $token); # or in head noscript
4332 ## Ignore the token
4333 !!!nack ('t95.1');
4334 !!!next-token;
4335 next B;
4336 }
4337 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4338 !!!cp ('t96');
4339 ## As if <head>
4340 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4341 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4342 push @{$self->{open_elements}},
4343 [$self->{head_element}, $el_category->{head}];
4344
4345 $self->{insertion_mode} = IN_HEAD_IM;
4346 ## Reprocess in the "in head" insertion mode...
4347 } else {
4348 !!!cp ('t97');
4349 }
4350
4351 if ($token->{tag_name} eq 'base') {
4352 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4353 !!!cp ('t98');
4354 ## As if </noscript>
4355 pop @{$self->{open_elements}};
4356 !!!parse-error (type => 'in noscript', text => 'base',
4357 token => $token);
4358
4359 $self->{insertion_mode} = IN_HEAD_IM;
4360 ## Reprocess in the "in head" insertion mode...
4361 } else {
4362 !!!cp ('t99');
4363 }
4364
4365 ## NOTE: There is a "as if in head" code clone.
4366 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4367 !!!cp ('t100');
4368 !!!parse-error (type => 'after head',
4369 text => $token->{tag_name}, token => $token);
4370 push @{$self->{open_elements}},
4371 [$self->{head_element}, $el_category->{head}];
4372 } else {
4373 !!!cp ('t101');
4374 }
4375 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4376 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4377 pop @{$self->{open_elements}} # <head>
4378 if $self->{insertion_mode} == AFTER_HEAD_IM;
4379 !!!nack ('t101.1');
4380 !!!next-token;
4381 next B;
4382 } elsif ($token->{tag_name} eq 'link') {
4383 ## NOTE: There is a "as if in head" code clone.
4384 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4385 !!!cp ('t102');
4386 !!!parse-error (type => 'after head',
4387 text => $token->{tag_name}, token => $token);
4388 push @{$self->{open_elements}},
4389 [$self->{head_element}, $el_category->{head}];
4390 } else {
4391 !!!cp ('t103');
4392 }
4393 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4394 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4395 pop @{$self->{open_elements}} # <head>
4396 if $self->{insertion_mode} == AFTER_HEAD_IM;
4397 !!!ack ('t103.1');
4398 !!!next-token;
4399 next B;
4400 } elsif ($token->{tag_name} eq 'meta') {
4401 ## NOTE: There is a "as if in head" code clone.
4402 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4403 !!!cp ('t104');
4404 !!!parse-error (type => 'after head',
4405 text => $token->{tag_name}, token => $token);
4406 push @{$self->{open_elements}},
4407 [$self->{head_element}, $el_category->{head}];
4408 } else {
4409 !!!cp ('t105');
4410 }
4411 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4412 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4413
4414 unless ($self->{confident}) {
4415 if ($token->{attributes}->{charset}) {
4416 !!!cp ('t106');
4417 ## NOTE: Whether the encoding is supported or not is handled
4418 ## in the {change_encoding} callback.
4419 $self->{change_encoding}
4420 ->($self, $token->{attributes}->{charset}->{value},
4421 $token);
4422
4423 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4424 ->set_user_data (manakai_has_reference =>
4425 $token->{attributes}->{charset}
4426 ->{has_reference});
4427 } elsif ($token->{attributes}->{content}) {
4428 if ($token->{attributes}->{content}->{value}
4429 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4430 [\x09-\x0D\x20]*=
4431 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4432 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
4433 !!!cp ('t107');
4434 ## NOTE: Whether the encoding is supported or not is handled
4435 ## in the {change_encoding} callback.
4436 $self->{change_encoding}
4437 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
4438 $token);
4439 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4440 ->set_user_data (manakai_has_reference =>
4441 $token->{attributes}->{content}
4442 ->{has_reference});
4443 } else {
4444 !!!cp ('t108');
4445 }
4446 }
4447 } else {
4448 if ($token->{attributes}->{charset}) {
4449 !!!cp ('t109');
4450 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4451 ->set_user_data (manakai_has_reference =>
4452 $token->{attributes}->{charset}
4453 ->{has_reference});
4454 }
4455 if ($token->{attributes}->{content}) {
4456 !!!cp ('t110');
4457 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4458 ->set_user_data (manakai_has_reference =>
4459 $token->{attributes}->{content}
4460 ->{has_reference});
4461 }
4462 }
4463
4464 pop @{$self->{open_elements}} # <head>
4465 if $self->{insertion_mode} == AFTER_HEAD_IM;
4466 !!!ack ('t110.1');
4467 !!!next-token;
4468 next B;
4469 } elsif ($token->{tag_name} eq 'title') {
4470 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4471 !!!cp ('t111');
4472 ## As if </noscript>
4473 pop @{$self->{open_elements}};
4474 !!!parse-error (type => 'in noscript', text => 'title',
4475 token => $token);
4476
4477 $self->{insertion_mode} = IN_HEAD_IM;
4478 ## Reprocess in the "in head" insertion mode...
4479 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4480 !!!cp ('t112');
4481 !!!parse-error (type => 'after head',
4482 text => $token->{tag_name}, token => $token);
4483 push @{$self->{open_elements}},
4484 [$self->{head_element}, $el_category->{head}];
4485 } else {
4486 !!!cp ('t113');
4487 }
4488
4489 ## NOTE: There is a "as if in head" code clone.
4490 my $parent = defined $self->{head_element} ? $self->{head_element}
4491 : $self->{open_elements}->[-1]->[0];
4492 $parse_rcdata->(RCDATA_CONTENT_MODEL);
4493 pop @{$self->{open_elements}} # <head>
4494 if $self->{insertion_mode} == AFTER_HEAD_IM;
4495 next B;
4496 } elsif ($token->{tag_name} eq 'style' or
4497 $token->{tag_name} eq 'noframes') {
4498 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
4499 ## insertion mode IN_HEAD_IM)
4500 ## NOTE: There is a "as if in head" code clone.
4501 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4502 !!!cp ('t114');
4503 !!!parse-error (type => 'after head',
4504 text => $token->{tag_name}, token => $token);
4505 push @{$self->{open_elements}},
4506 [$self->{head_element}, $el_category->{head}];
4507 } else {
4508 !!!cp ('t115');
4509 }
4510 $parse_rcdata->(CDATA_CONTENT_MODEL);
4511 pop @{$self->{open_elements}} # <head>
4512 if $self->{insertion_mode} == AFTER_HEAD_IM;
4513 next B;
4514 } elsif ($token->{tag_name} eq 'noscript') {
4515 if ($self->{insertion_mode} == IN_HEAD_IM) {
4516 !!!cp ('t116');
4517 ## NOTE: and scripting is disabled
4518 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4519 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
4520 !!!nack ('t116.1');
4521 !!!next-token;
4522 next B;
4523 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4524 !!!cp ('t117');
4525 !!!parse-error (type => 'in noscript', text => 'noscript',
4526 token => $token);
4527 ## Ignore the token
4528 !!!nack ('t117.1');
4529 !!!next-token;
4530 next B;
4531 } else {
4532 !!!cp ('t118');
4533 #
4534 }
4535 } elsif ($token->{tag_name} eq 'script') {
4536 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4537 !!!cp ('t119');
4538 ## As if </noscript>
4539 pop @{$self->{open_elements}};
4540 !!!parse-error (type => 'in noscript', text => 'script',
4541 token => $token);
4542
4543 $self->{insertion_mode} = IN_HEAD_IM;
4544 ## Reprocess in the "in head" insertion mode...
4545 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4546 !!!cp ('t120');
4547 !!!parse-error (type => 'after head',
4548 text => $token->{tag_name}, token => $token);
4549 push @{$self->{open_elements}},
4550 [$self->{head_element}, $el_category->{head}];
4551 } else {
4552 !!!cp ('t121');
4553 }
4554
4555 ## NOTE: There is a "as if in head" code clone.
4556 $script_start_tag->();
4557 pop @{$self->{open_elements}} # <head>
4558 if $self->{insertion_mode} == AFTER_HEAD_IM;
4559 next B;
4560 } elsif ($token->{tag_name} eq 'body' or
4561 $token->{tag_name} eq 'frameset') {
4562 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4563 !!!cp ('t122');
4564 ## As if </noscript>
4565 pop @{$self->{open_elements}};
4566 !!!parse-error (type => 'in noscript',
4567 text => $token->{tag_name}, token => $token);
4568
4569 ## Reprocess in the "in head" insertion mode...
4570 ## As if </head>
4571 pop @{$self->{open_elements}};
4572
4573 ## Reprocess in the "after head" insertion mode...
4574 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4575 !!!cp ('t124');
4576 pop @{$self->{open_elements}};
4577
4578 ## Reprocess in the "after head" insertion mode...
4579 } else {
4580 !!!cp ('t125');
4581 }
4582
4583 ## "after head" insertion mode
4584 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4585 if ($token->{tag_name} eq 'body') {
4586 !!!cp ('t126');
4587 $self->{insertion_mode} = IN_BODY_IM;
4588 } elsif ($token->{tag_name} eq 'frameset') {
4589 !!!cp ('t127');
4590 $self->{insertion_mode} = IN_FRAMESET_IM;
4591 } else {
4592 die "$0: tag name: $self->{tag_name}";
4593 }
4594 !!!nack ('t127.1');
4595 !!!next-token;
4596 next B;
4597 } else {
4598 !!!cp ('t128');
4599 #
4600 }
4601
4602 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4603 !!!cp ('t129');
4604 ## As if </noscript>
4605 pop @{$self->{open_elements}};
4606 !!!parse-error (type => 'in noscript:/',
4607 text => $token->{tag_name}, token => $token);
4608
4609 ## Reprocess in the "in head" insertion mode...
4610 ## As if </head>
4611 pop @{$self->{open_elements}};
4612
4613 ## Reprocess in the "after head" insertion mode...
4614 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4615 !!!cp ('t130');
4616 ## As if </head>
4617 pop @{$self->{open_elements}};
4618
4619 ## Reprocess in the "after head" insertion mode...
4620 } else {
4621 !!!cp ('t131');
4622 }
4623
4624 ## "after head" insertion mode
4625 ## As if <body>
4626 !!!insert-element ('body',, $token);
4627 $self->{insertion_mode} = IN_BODY_IM;
4628 ## reprocess
4629 !!!ack-later;
4630 next B;
4631 } elsif ($token->{type} == END_TAG_TOKEN) {
4632 if ($token->{tag_name} eq 'head') {
4633 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4634 !!!cp ('t132');
4635 ## As if <head>
4636 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4637 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4638 push @{$self->{open_elements}},
4639 [$self->{head_element}, $el_category->{head}];
4640
4641 ## Reprocess in the "in head" insertion mode...
4642 pop @{$self->{open_elements}};
4643 $self->{insertion_mode} = AFTER_HEAD_IM;
4644 !!!next-token;
4645 next B;
4646 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4647 !!!cp ('t133');
4648 ## As if </noscript>
4649 pop @{$self->{open_elements}};
4650 !!!parse-error (type => 'in noscript:/',
4651 text => 'head', token => $token);
4652
4653 ## Reprocess in the "in head" insertion mode...
4654 pop @{$self->{open_elements}};
4655 $self->{insertion_mode} = AFTER_HEAD_IM;
4656 !!!next-token;
4657 next B;
4658 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4659 !!!cp ('t134');
4660 pop @{$self->{open_elements}};
4661 $self->{insertion_mode} = AFTER_HEAD_IM;
4662 !!!next-token;
4663 next B;
4664 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4665 !!!cp ('t134.1');
4666 !!!parse-error (type => 'unmatched end tag', text => 'head',
4667 token => $token);
4668 ## Ignore the token
4669 !!!next-token;
4670 next B;
4671 } else {
4672 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4673 }
4674 } elsif ($token->{tag_name} eq 'noscript') {
4675 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4676 !!!cp ('t136');
4677 pop @{$self->{open_elements}};
4678 $self->{insertion_mode} = IN_HEAD_IM;
4679 !!!next-token;
4680 next B;
4681 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM or
4682 $self->{insertion_mode} == AFTER_HEAD_IM) {
4683 !!!cp ('t137');
4684 !!!parse-error (type => 'unmatched end tag',
4685 text => 'noscript', token => $token);
4686 ## Ignore the token ## ISSUE: An issue in the spec.
4687 !!!next-token;
4688 next B;
4689 } else {
4690 !!!cp ('t138');
4691 #
4692 }
4693 } elsif ({
4694 body => 1, html => 1,
4695 }->{$token->{tag_name}}) {
4696 if ($self->{insertion_mode} == BEFORE_HEAD_IM or
4697 $self->{insertion_mode} == IN_HEAD_IM or
4698 $self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4699 !!!cp ('t140');
4700 !!!parse-error (type => 'unmatched end tag',
4701 text => $token->{tag_name}, token => $token);
4702 ## Ignore the token
4703 !!!next-token;
4704 next B;
4705 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4706 !!!cp ('t140.1');
4707 !!!parse-error (type => 'unmatched end tag',
4708 text => $token->{tag_name}, token => $token);
4709 ## Ignore the token
4710 !!!next-token;
4711 next B;
4712 } else {
4713 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4714 }
4715 } elsif ($token->{tag_name} eq 'p') {
4716 !!!cp ('t142');
4717 !!!parse-error (type => 'unmatched end tag',
4718 text => $token->{tag_name}, token => $token);
4719 ## Ignore the token
4720 !!!next-token;
4721 next B;
4722 } elsif ($token->{tag_name} eq 'br') {
4723 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4724 !!!cp ('t142.2');
4725 ## (before head) as if <head>, (in head) as if </head>
4726 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4727 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4728 $self->{insertion_mode} = AFTER_HEAD_IM;
4729
4730 ## Reprocess in the "after head" insertion mode...
4731 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4732 !!!cp ('t143.2');
4733 ## As if </head>
4734 pop @{$self->{open_elements}};
4735 $self->{insertion_mode} = AFTER_HEAD_IM;
4736
4737 ## Reprocess in the "after head" insertion mode...
4738 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4739 !!!cp ('t143.3');
4740 ## ISSUE: Two parse errors for <head><noscript></br>
4741 !!!parse-error (type => 'unmatched end tag',
4742 text => 'br', token => $token);
4743 ## As if </noscript>
4744 pop @{$self->{open_elements}};
4745 $self->{insertion_mode} = IN_HEAD_IM;
4746
4747 ## Reprocess in the "in head" insertion mode...
4748 ## As if </head>
4749 pop @{$self->{open_elements}};
4750 $self->{insertion_mode} = AFTER_HEAD_IM;
4751
4752 ## Reprocess in the "after head" insertion mode...
4753 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4754 !!!cp ('t143.4');
4755 #
4756 } else {
4757 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4758 }
4759
4760 ## ISSUE: does not agree with IE7 - it doesn't ignore </br>.
4761 !!!parse-error (type => 'unmatched end tag',
4762 text => 'br', token => $token);
4763 ## Ignore the token
4764 !!!next-token;
4765 next B;
4766 } else {
4767 !!!cp ('t145');
4768 !!!parse-error (type => 'unmatched end tag',
4769 text => $token->{tag_name}, token => $token);
4770 ## Ignore the token
4771 !!!next-token;
4772 next B;
4773 }
4774
4775 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4776 !!!cp ('t146');
4777 ## As if </noscript>
4778 pop @{$self->{open_elements}};
4779 !!!parse-error (type => 'in noscript:/',
4780 text => $token->{tag_name}, token => $token);
4781
4782 ## Reprocess in the "in head" insertion mode...
4783 ## As if </head>
4784 pop @{$self->{open_elements}};
4785
4786 ## Reprocess in the "after head" insertion mode...
4787 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4788 !!!cp ('t147');
4789 ## As if </head>
4790 pop @{$self->{open_elements}};
4791
4792 ## Reprocess in the "after head" insertion mode...
4793 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4794 ## ISSUE: This case cannot be reached?
4795 !!!cp ('t148');
4796 !!!parse-error (type => 'unmatched end tag',
4797 text => $token->{tag_name}, token => $token);
4798 ## Ignore the token ## ISSUE: An issue in the spec.
4799 !!!next-token;
4800 next B;
4801 } else {
4802 !!!cp ('t149');
4803 }
4804
4805 ## "after head" insertion mode
4806 ## As if <body>
4807 !!!insert-element ('body',, $token);
4808 $self->{insertion_mode} = IN_BODY_IM;
4809 ## reprocess
4810 next B;
4811 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4812 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4813 !!!cp ('t149.1');
4814
4815 ## NOTE: As if <head>
4816 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4817 $self->{open_elements}->[-1]->[0]->append_child
4818 ($self->{head_element});
4819 #push @{$self->{open_elements}},
4820 # [$self->{head_element}, $el_category->{head}];
4821 #$self->{insertion_mode} = IN_HEAD_IM;
4822 ## NOTE: Reprocess.
4823
4824 ## NOTE: As if </head>
4825 #pop @{$self->{open_elements}};
4826 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4827 ## NOTE: Reprocess.
4828
4829 #
4830 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4831 !!!cp ('t149.2');
4832
4833 ## NOTE: As if </head>
4834 pop @{$self->{open_elements}};
4835 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4836 ## NOTE: Reprocess.
4837
4838 #
4839 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4840 !!!cp ('t149.3');
4841
4842 !!!parse-error (type => 'in noscript:#eof', token => $token);
4843
4844 ## As if </noscript>
4845 pop @{$self->{open_elements}};
4846 #$self->{insertion_mode} = IN_HEAD_IM;
4847 ## NOTE: Reprocess.
4848
4849 ## NOTE: As if </head>
4850 pop @{$self->{open_elements}};
4851 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4852 ## NOTE: Reprocess.
4853
4854 #
4855 } else {
4856 !!!cp ('t149.4');
4857 #
4858 }
4859
4860 ## NOTE: As if <body>
4861 !!!insert-element ('body',, $token);
4862 $self->{insertion_mode} = IN_BODY_IM;
4863 ## NOTE: Reprocess.
4864 next B;
4865 } else {
4866 die "$0: $token->{type}: Unknown token type";
4867 }
4868
4869 ## ISSUE: An issue in the spec.
4870 } elsif ($self->{insertion_mode} & BODY_IMS) {
4871 if ($token->{type} == CHARACTER_TOKEN) {
4872 !!!cp ('t150');
4873 ## NOTE: There is a code clone of "character in body".
4874 $reconstruct_active_formatting_elements->($insert_to_current);
4875
4876 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4877
4878 !!!next-token;
4879 next B;
4880 } elsif ($token->{type} == START_TAG_TOKEN) {
4881 if ({
4882 caption => 1, col => 1, colgroup => 1, tbody => 1,
4883 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
4884 }->{$token->{tag_name}}) {
4885 if ($self->{insertion_mode} == IN_CELL_IM) {
4886 ## have an element in table scope
4887 for (reverse 0..$#{$self->{open_elements}}) {
4888 my $node = $self->{open_elements}->[$_];
4889 if ($node->[1] & TABLE_CELL_EL) {
4890 !!!cp ('t151');
4891
4892 ## Close the cell
4893 !!!back-token; # <x>
4894 $token = {type => END_TAG_TOKEN,
4895 tag_name => $node->[0]->manakai_local_name,
4896 line => $token->{line},
4897 column => $token->{column}};
4898 next B;
4899 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4900 !!!cp ('t152');
4901 ## ISSUE: This case can never be reached, maybe.
4902 last;
4903 }
4904 }
4905
4906 !!!cp ('t153');
4907 !!!parse-error (type => 'start tag not allowed',
4908 text => $token->{tag_name}, token => $token);
4909 ## Ignore the token
4910 !!!nack ('t153.1');
4911 !!!next-token;
4912 next B;
4913 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4914 !!!parse-error (type => 'not closed', text => 'caption',
4915 token => $token);
4916
4917 ## NOTE: As if </caption>.
4918 ## have a table element in table scope
4919 my $i;
4920 INSCOPE: {
4921 for (reverse 0..$#{$self->{open_elements}}) {
4922 my $node = $self->{open_elements}->[$_];
4923 if ($node->[1] & CAPTION_EL) {
4924 !!!cp ('t155');
4925 $i = $_;
4926 last INSCOPE;
4927 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4928 !!!cp ('t156');
4929 last;
4930 }
4931 }
4932
4933 !!!cp ('t157');
4934 !!!parse-error (type => 'start tag not allowed',
4935 text => $token->{tag_name}, token => $token);
4936 ## Ignore the token
4937 !!!nack ('t157.1');
4938 !!!next-token;
4939 next B;
4940 } # INSCOPE
4941
4942 ## generate implied end tags
4943 while ($self->{open_elements}->[-1]->[1]
4944 & END_TAG_OPTIONAL_EL) {
4945 !!!cp ('t158');
4946 pop @{$self->{open_elements}};
4947 }
4948
4949 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4950 !!!cp ('t159');
4951 !!!parse-error (type => 'not closed',
4952 text => $self->{open_elements}->[-1]->[0]
4953 ->manakai_local_name,
4954 token => $token);
4955 } else {
4956 !!!cp ('t160');
4957 }
4958
4959 splice @{$self->{open_elements}}, $i;
4960
4961 $clear_up_to_marker->();
4962
4963 $self->{insertion_mode} = IN_TABLE_IM;
4964
4965 ## reprocess
4966 !!!ack-later;
4967 next B;
4968 } else {
4969 !!!cp ('t161');
4970 #
4971 }
4972 } else {
4973 !!!cp ('t162');
4974 #
4975 }
4976 } elsif ($token->{type} == END_TAG_TOKEN) {
4977 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
4978 if ($self->{insertion_mode} == IN_CELL_IM) {
4979 ## have an element in table scope
4980 my $i;
4981 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4982 my $node = $self->{open_elements}->[$_];
4983 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
4984 !!!cp ('t163');
4985 $i = $_;
4986 last INSCOPE;
4987 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4988 !!!cp ('t164');
4989 last INSCOPE;
4990 }
4991 } # INSCOPE
4992 unless (defined $i) {
4993 !!!cp ('t165');
4994 !!!parse-error (type => 'unmatched end tag',
4995 text => $token->{tag_name},
4996 token => $token);
4997 ## Ignore the token
4998 !!!next-token;
4999 next B;
5000 }
5001
5002 ## generate implied end tags
5003 while ($self->{open_elements}->[-1]->[1]
5004 & END_TAG_OPTIONAL_EL) {
5005 !!!cp ('t166');
5006 pop @{$self->{open_elements}};
5007 }
5008
5009 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
5010 ne $token->{tag_name}) {
5011 !!!cp ('t167');
5012 !!!parse-error (type => 'not closed',
5013 text => $self->{open_elements}->[-1]->[0]
5014 ->manakai_local_name,
5015 token => $token);
5016 } else {
5017 !!!cp ('t168');
5018 }
5019
5020 splice @{$self->{open_elements}}, $i;
5021
5022 $clear_up_to_marker->();
5023
5024 $self->{insertion_mode} = IN_ROW_IM;
5025
5026 !!!next-token;
5027 next B;
5028 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
5029 !!!cp ('t169');
5030 !!!parse-error (type => 'unmatched end tag',
5031 text => $token->{tag_name}, token => $token);
5032 ## Ignore the token
5033 !!!next-token;
5034 next B;
5035 } else {
5036 !!!cp ('t170');
5037 #
5038 }
5039 } elsif ($token->{tag_name} eq 'caption') {
5040 if ($self->{insertion_mode} == IN_CAPTION_IM) {
5041 ## have a table element in table scope
5042 my $i;
5043 INSCOPE: {
5044 for (reverse 0..$#{$self->{open_elements}}) {
5045 my $node = $self->{open_elements}->[$_];
5046 if ($node->[1] & CAPTION_EL) {
5047 !!!cp ('t171');
5048 $i = $_;
5049 last INSCOPE;
5050 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5051 !!!cp ('t172');
5052 last;
5053 }
5054 }
5055
5056 !!!cp ('t173');
5057 !!!parse-error (type => 'unmatched end tag',
5058 text => $token->{tag_name}, token => $token);
5059 ## Ignore the token
5060 !!!next-token;
5061 next B;
5062 } # INSCOPE
5063
5064 ## generate implied end tags
5065 while ($self->{open_elements}->[-1]->[1]
5066 & END_TAG_OPTIONAL_EL) {
5067 !!!cp ('t174');
5068 pop @{$self->{open_elements}};
5069 }
5070
5071 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5072 !!!cp ('t175');
5073 !!!parse-error (type => 'not closed',
5074 text => $self->{open_elements}->[-1]->[0]
5075 ->manakai_local_name,
5076 token => $token);
5077 } else {
5078 !!!cp ('t176');
5079 }
5080
5081 splice @{$self->{open_elements}}, $i;
5082
5083 $clear_up_to_marker->();
5084
5085 $self->{insertion_mode} = IN_TABLE_IM;
5086
5087 !!!next-token;
5088 next B;
5089 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
5090 !!!cp ('t177');
5091 !!!parse-error (type => 'unmatched end tag',
5092 text => $token->{tag_name}, token => $token);
5093 ## Ignore the token
5094 !!!next-token;
5095 next B;
5096 } else {
5097 !!!cp ('t178');
5098 #
5099 }
5100 } elsif ({
5101 table => 1, tbody => 1, tfoot => 1,
5102 thead => 1, tr => 1,
5103 }->{$token->{tag_name}} and
5104 $self->{insertion_mode} == IN_CELL_IM) {
5105 ## have an element in table scope
5106 my $i;
5107 my $tn;
5108 INSCOPE: {
5109 for (reverse 0..$#{$self->{open_elements}}) {
5110 my $node = $self->{open_elements}->[$_];
5111 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5112 !!!cp ('t179');
5113 $i = $_;
5114
5115 ## Close the cell
5116 !!!back-token; # </x>
5117 $token = {type => END_TAG_TOKEN, tag_name => $tn,
5118 line => $token->{line},
5119 column => $token->{column}};
5120 next B;
5121 } elsif ($node->[1] & TABLE_CELL_EL) {
5122 !!!cp ('t180');
5123 $tn = $node->[0]->manakai_local_name;
5124 ## NOTE: There is exactly one |td| or |th| element
5125 ## in scope in the stack of open elements by definition.
5126 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5127 ## ISSUE: Can this be reached?
5128 !!!cp ('t181');
5129 last;
5130 }
5131 }
5132
5133 !!!cp ('t182');
5134 !!!parse-error (type => 'unmatched end tag',
5135 text => $token->{tag_name}, token => $token);
5136 ## Ignore the token
5137 !!!next-token;
5138 next B;
5139 } # INSCOPE
5140 } elsif ($token->{tag_name} eq 'table' and
5141 $self->{insertion_mode} == IN_CAPTION_IM) {
5142 !!!parse-error (type => 'not closed', text => 'caption',
5143 token => $token);
5144
5145 ## As if </caption>
5146 ## have a table element in table scope
5147 my $i;
5148 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5149 my $node = $self->{open_elements}->[$_];
5150 if ($node->[1] & CAPTION_EL) {
5151 !!!cp ('t184');
5152 $i = $_;
5153 last INSCOPE;
5154 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5155 !!!cp ('t185');
5156 last INSCOPE;
5157 }
5158 } # INSCOPE
5159 unless (defined $i) {
5160 !!!cp ('t186');
5161 !!!parse-error (type => 'unmatched end tag',
5162 text => 'caption', token => $token);
5163 ## Ignore the token
5164 !!!next-token;
5165 next B;
5166 }
5167
5168 ## generate implied end tags
5169 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5170 !!!cp ('t187');
5171 pop @{$self->{open_elements}};
5172 }
5173
5174 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5175 !!!cp ('t188');
5176 !!!parse-error (type => 'not closed',
5177 text => $self->{open_elements}->[-1]->[0]
5178 ->manakai_local_name,
5179 token => $token);
5180 } else {
5181 !!!cp ('t189');
5182 }
5183
5184 splice @{$self->{open_elements}}, $i;
5185
5186 $clear_up_to_marker->();
5187
5188 $self->{insertion_mode} = IN_TABLE_IM;
5189
5190 ## reprocess
5191 next B;
5192 } elsif ({
5193 body => 1, col => 1, colgroup => 1, html => 1,
5194 }->{$token->{tag_name}}) {
5195 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
5196 !!!cp ('t190');
5197 !!!parse-error (type => 'unmatched end tag',
5198 text => $token->{tag_name}, token => $token);
5199 ## Ignore the token
5200 !!!next-token;
5201 next B;
5202 } else {
5203 !!!cp ('t191');
5204 #
5205 }
5206 } elsif ({
5207 tbody => 1, tfoot => 1,
5208 thead => 1, tr => 1,
5209 }->{$token->{tag_name}} and
5210 $self->{insertion_mode} == IN_CAPTION_IM) {
5211 !!!cp ('t192');
5212 !!!parse-error (type => 'unmatched end tag',
5213 text => $token->{tag_name}, token => $token);
5214 ## Ignore the token
5215 !!!next-token;
5216 next B;
5217 } else {
5218 !!!cp ('t193');
5219 #
5220 }
5221 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5222 for my $entry (@{$self->{open_elements}}) {
5223 unless ($entry->[1] & ALL_END_TAG_OPTIONAL_EL) {
5224 !!!cp ('t75');
5225 !!!parse-error (type => 'in body:#eof', token => $token);
5226 last;
5227 }
5228 }
5229
5230 ## Stop parsing.
5231 last B;
5232 } else {
5233 die "$0: $token->{type}: Unknown token type";
5234 }
5235
5236 $insert = $insert_to_current;
5237 #
5238 } elsif ($self->{insertion_mode} & TABLE_IMS) {
5239 if ($token->{type} == CHARACTER_TOKEN) {
5240 if (not $open_tables->[-1]->[1] and # tainted
5241 $token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5242 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5243
5244 unless (length $token->{data}) {
5245 !!!cp ('t194');
5246 !!!next-token;
5247 next B;
5248 } else {
5249 !!!cp ('t195');
5250 }
5251 }
5252
5253 !!!parse-error (type => 'in table:#text', token => $token);
5254
5255 ## As if in body, but insert into foster parent element
5256 ## ISSUE: Spec says that "whenever a node would be inserted
5257 ## into the current node" while characters might not
5258 ## result in a new Text node.
5259 $reconstruct_active_formatting_elements->($insert_to_foster);
5260
5261 if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
5262 # MUST
5263 my $foster_parent_element;
5264 my $next_sibling;
5265 my $prev_sibling;
5266 OE: for (reverse 0..$#{$self->{open_elements}}) {
5267 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
5268 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
5269 if (defined $parent and $parent->node_type == 1) {
5270 !!!cp ('t196');
5271 $foster_parent_element = $parent;
5272 $next_sibling = $self->{open_elements}->[$_]->[0];
5273 $prev_sibling = $next_sibling->previous_sibling;
5274 } else {
5275 !!!cp ('t197');
5276 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
5277 $prev_sibling = $foster_parent_element->last_child;
5278 }
5279 last OE;
5280 }
5281 } # OE
5282 $foster_parent_element = $self->{open_elements}->[0]->[0] and
5283 $prev_sibling = $foster_parent_element->last_child
5284 unless defined $foster_parent_element;
5285 if (defined $prev_sibling and
5286 $prev_sibling->node_type == 3) {
5287 !!!cp ('t198');
5288 $prev_sibling->manakai_append_text ($token->{data});
5289 } else {
5290 !!!cp ('t199');
5291 $foster_parent_element->insert_before
5292 ($self->{document}->create_text_node ($token->{data}),
5293 $next_sibling);
5294 }
5295 $open_tables->[-1]->[1] = 1; # tainted
5296 } else {
5297 !!!cp ('t200');
5298 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5299 }
5300
5301 !!!next-token;
5302 next B;
5303 } elsif ($token->{type} == START_TAG_TOKEN) {
5304 if ({
5305 tr => ($self->{insertion_mode} != IN_ROW_IM),
5306 th => 1, td => 1,
5307 }->{$token->{tag_name}}) {
5308 if ($self->{insertion_mode} == IN_TABLE_IM) {
5309 ## Clear back to table context
5310 while (not ($self->{open_elements}->[-1]->[1]
5311 & TABLE_SCOPING_EL)) {
5312 !!!cp ('t201');
5313 pop @{$self->{open_elements}};
5314 }
5315
5316 !!!insert-element ('tbody',, $token);
5317 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5318 ## reprocess in the "in table body" insertion mode...
5319 }
5320
5321 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5322 unless ($token->{tag_name} eq 'tr') {
5323 !!!cp ('t202');
5324 !!!parse-error (type => 'missing start tag:tr', token => $token);
5325 }
5326
5327 ## Clear back to table body context
5328 while (not ($self->{open_elements}->[-1]->[1]
5329 & TABLE_ROWS_SCOPING_EL)) {
5330 !!!cp ('t203');
5331 ## ISSUE: Can this case be reached?
5332 pop @{$self->{open_elements}};
5333 }
5334
5335 $self->{insertion_mode} = IN_ROW_IM;
5336 if ($token->{tag_name} eq 'tr') {
5337 !!!cp ('t204');
5338 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5339 !!!nack ('t204');
5340 !!!next-token;
5341 next B;
5342 } else {
5343 !!!cp ('t205');
5344 !!!insert-element ('tr',, $token);
5345 ## reprocess in the "in row" insertion mode
5346 }
5347 } else {
5348 !!!cp ('t206');
5349 }
5350
5351 ## Clear back to table row context
5352 while (not ($self->{open_elements}->[-1]->[1]
5353 & TABLE_ROW_SCOPING_EL)) {
5354 !!!cp ('t207');
5355 pop @{$self->{open_elements}};
5356 }
5357
5358 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5359 $self->{insertion_mode} = IN_CELL_IM;
5360
5361 push @$active_formatting_elements, ['#marker', ''];
5362
5363 !!!nack ('t207.1');
5364 !!!next-token;
5365 next B;
5366 } elsif ({
5367 caption => 1, col => 1, colgroup => 1,
5368 tbody => 1, tfoot => 1, thead => 1,
5369 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5370 }->{$token->{tag_name}}) {
5371 if ($self->{insertion_mode} == IN_ROW_IM) {
5372 ## As if </tr>
5373 ## have an element in table scope
5374 my $i;
5375 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5376 my $node = $self->{open_elements}->[$_];
5377 if ($node->[1] & TABLE_ROW_EL) {
5378 !!!cp ('t208');
5379 $i = $_;
5380 last INSCOPE;
5381 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5382 !!!cp ('t209');
5383 last INSCOPE;
5384 }
5385 } # INSCOPE
5386 unless (defined $i) {
5387 !!!cp ('t210');
5388 ## TODO: This type is wrong.
5389 !!!parse-error (type => 'unmacthed end tag',
5390 text => $token->{tag_name}, token => $token);
5391 ## Ignore the token
5392 !!!nack ('t210.1');
5393 !!!next-token;
5394 next B;
5395 }
5396
5397 ## Clear back to table row context
5398 while (not ($self->{open_elements}->[-1]->[1]
5399 & TABLE_ROW_SCOPING_EL)) {
5400 !!!cp ('t211');
5401 ## ISSUE: Can this case be reached?
5402 pop @{$self->{open_elements}};
5403 }
5404
5405 pop @{$self->{open_elements}}; # tr
5406 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5407 if ($token->{tag_name} eq 'tr') {
5408 !!!cp ('t212');
5409 ## reprocess
5410 !!!ack-later;
5411 next B;
5412 } else {
5413 !!!cp ('t213');
5414 ## reprocess in the "in table body" insertion mode...
5415 }
5416 }
5417
5418 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5419 ## have an element in table scope
5420 my $i;
5421 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5422 my $node = $self->{open_elements}->[$_];
5423 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5424 !!!cp ('t214');
5425 $i = $_;
5426 last INSCOPE;
5427 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5428 !!!cp ('t215');
5429 last INSCOPE;
5430 }
5431 } # INSCOPE
5432 unless (defined $i) {
5433 !!!cp ('t216');
5434 ## TODO: This error type is wrong.
5435 !!!parse-error (type => 'unmatched end tag',
5436 text => $token->{tag_name}, token => $token);
5437 ## Ignore the token
5438 !!!nack ('t216.1');
5439 !!!next-token;
5440 next B;
5441 }
5442
5443 ## Clear back to table body context
5444 while (not ($self->{open_elements}->[-1]->[1]
5445 & TABLE_ROWS_SCOPING_EL)) {
5446 !!!cp ('t217');
5447 ## ISSUE: Can this state be reached?
5448 pop @{$self->{open_elements}};
5449 }
5450
5451 ## As if <{current node}>
5452 ## have an element in table scope
5453 ## true by definition
5454
5455 ## Clear back to table body context
5456 ## nop by definition
5457
5458 pop @{$self->{open_elements}};
5459 $self->{insertion_mode} = IN_TABLE_IM;
5460 ## reprocess in "in table" insertion mode...
5461 } else {
5462 !!!cp ('t218');
5463 }
5464
5465 if ($token->{tag_name} eq 'col') {
5466 ## Clear back to table context
5467 while (not ($self->{open_elements}->[-1]->[1]
5468 & TABLE_SCOPING_EL)) {
5469 !!!cp ('t219');
5470 ## ISSUE: Can this state be reached?
5471 pop @{$self->{open_elements}};
5472 }
5473
5474 !!!insert-element ('colgroup',, $token);
5475 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
5476 ## reprocess
5477 !!!ack-later;
5478 next B;
5479 } elsif ({
5480 caption => 1,
5481 colgroup => 1,
5482 tbody => 1, tfoot => 1, thead => 1,
5483 }->{$token->{tag_name}}) {
5484 ## Clear back to table context
5485 while (not ($self->{open_elements}->[-1]->[1]
5486 & TABLE_SCOPING_EL)) {
5487 !!!cp ('t220');
5488 ## ISSUE: Can this state be reached?
5489 pop @{$self->{open_elements}};
5490 }
5491
5492 push @$active_formatting_elements, ['#marker', '']
5493 if $token->{tag_name} eq 'caption';
5494
5495 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5496 $self->{insertion_mode} = {
5497 caption => IN_CAPTION_IM,
5498 colgroup => IN_COLUMN_GROUP_IM,
5499 tbody => IN_TABLE_BODY_IM,
5500 tfoot => IN_TABLE_BODY_IM,
5501 thead => IN_TABLE_BODY_IM,
5502 }->{$token->{tag_name}};
5503 !!!next-token;
5504 !!!nack ('t220.1');
5505 next B;
5506 } else {
5507 die "$0: in table: <>: $token->{tag_name}";
5508 }
5509 } elsif ($token->{tag_name} eq 'table') {
5510 !!!parse-error (type => 'not closed',
5511 text => $self->{open_elements}->[-1]->[0]
5512 ->manakai_local_name,
5513 token => $token);
5514
5515 ## As if </table>
5516 ## have a table element in table scope
5517 my $i;
5518 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5519 my $node = $self->{open_elements}->[$_];
5520 if ($node->[1] & TABLE_EL) {
5521 !!!cp ('t221');
5522 $i = $_;
5523 last INSCOPE;
5524 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5525 !!!cp ('t222');
5526 last INSCOPE;
5527 }
5528 } # INSCOPE
5529 unless (defined $i) {
5530 !!!cp ('t223');
5531 ## TODO: The following is wrong, maybe.
5532 !!!parse-error (type => 'unmatched end tag', text => 'table',
5533 token => $token);
5534 ## Ignore tokens </table><table>
5535 !!!nack ('t223.1');
5536 !!!next-token;
5537 next B;
5538 }
5539
5540 ## TODO: The following steps have been removed from the latest spec.
5541 ## generate implied end tags
5542 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5543 !!!cp ('t224');
5544 pop @{$self->{open_elements}};
5545 }
5546
5547 unless ($self->{open_elements}->[-1]->[1] & TABLE_EL) {
5548 !!!cp ('t225');
5549 ## NOTE: |<table><tr><table>|
5550 !!!parse-error (type => 'not closed',
5551 text => $self->{open_elements}->[-1]->[0]
5552 ->manakai_local_name,
5553 token => $token);
5554 } else {
5555 !!!cp ('t226');
5556 }
5557
5558 splice @{$self->{open_elements}}, $i;
5559 pop @{$open_tables};
5560
5561 $self->_reset_insertion_mode;
5562
5563 ## reprocess
5564 !!!ack-later;
5565 next B;
5566 } elsif ($token->{tag_name} eq 'style') {
5567 if (not $open_tables->[-1]->[1]) { # tainted
5568 !!!cp ('t227.8');
5569 ## NOTE: This is a "as if in head" code clone.
5570 $parse_rcdata->(CDATA_CONTENT_MODEL);
5571 next B;
5572 } else {
5573 !!!cp ('t227.7');
5574 #
5575 }
5576 } elsif ($token->{tag_name} eq 'script') {
5577 if (not $open_tables->[-1]->[1]) { # tainted
5578 !!!cp ('t227.6');
5579 ## NOTE: This is a "as if in head" code clone.
5580 $script_start_tag->();
5581 next B;
5582 } else {
5583 !!!cp ('t227.5');
5584 #
5585 }
5586 } elsif ($token->{tag_name} eq 'input') {
5587 if (not $open_tables->[-1]->[1]) { # tainted
5588 if ($token->{attributes}->{type}) { ## TODO: case
5589 my $type = lc $token->{attributes}->{type}->{value};
5590 if ($type eq 'hidden') {
5591 !!!cp ('t227.3');
5592 !!!parse-error (type => 'in table',
5593 text => $token->{tag_name}, token => $token);
5594
5595 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5596
5597 ## TODO: form element pointer
5598
5599 pop @{$self->{open_elements}};
5600
5601 !!!next-token;
5602 !!!ack ('t227.2.1');
5603 next B;
5604 } else {
5605 !!!cp ('t227.2');
5606 #
5607 }
5608 } else {
5609 !!!cp ('t227.1');
5610 #
5611 }
5612 } else {
5613 !!!cp ('t227.4');
5614 #
5615 }
5616 } else {
5617 !!!cp ('t227');
5618 #
5619 }
5620
5621 !!!parse-error (type => 'in table', text => $token->{tag_name},
5622 token => $token);
5623
5624 $insert = $insert_to_foster;
5625 #
5626 } elsif ($token->{type} == END_TAG_TOKEN) {
5627 if ($token->{tag_name} eq 'tr' and
5628 $self->{insertion_mode} == IN_ROW_IM) {
5629 ## have an element in table scope
5630 my $i;
5631 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5632 my $node = $self->{open_elements}->[$_];
5633 if ($node->[1] & TABLE_ROW_EL) {
5634 !!!cp ('t228');
5635 $i = $_;
5636 last INSCOPE;
5637 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5638 !!!cp ('t229');
5639 last INSCOPE;
5640 }
5641 } # INSCOPE
5642 unless (defined $i) {
5643 !!!cp ('t230');
5644 !!!parse-error (type => 'unmatched end tag',
5645 text => $token->{tag_name}, token => $token);
5646 ## Ignore the token
5647 !!!nack ('t230.1');
5648 !!!next-token;
5649 next B;
5650 } else {
5651 !!!cp ('t232');
5652 }
5653
5654 ## Clear back to table row context
5655 while (not ($self->{open_elements}->[-1]->[1]
5656 & TABLE_ROW_SCOPING_EL)) {
5657 !!!cp ('t231');
5658 ## ISSUE: Can this state be reached?
5659 pop @{$self->{open_elements}};
5660 }
5661
5662 pop @{$self->{open_elements}}; # tr
5663 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5664 !!!next-token;
5665 !!!nack ('t231.1');
5666 next B;
5667 } elsif ($token->{tag_name} eq 'table') {
5668 if ($self->{insertion_mode} == IN_ROW_IM) {
5669 ## As if </tr>
5670 ## have an element in table scope
5671 my $i;
5672 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5673 my $node = $self->{open_elements}->[$_];
5674 if ($node->[1] & TABLE_ROW_EL) {
5675 !!!cp ('t233');
5676 $i = $_;
5677 last INSCOPE;
5678 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5679 !!!cp ('t234');
5680 last INSCOPE;
5681 }
5682 } # INSCOPE
5683 unless (defined $i) {
5684 !!!cp ('t235');
5685 ## TODO: The following is wrong.
5686 !!!parse-error (type => 'unmatched end tag',
5687 text => $token->{type}, token => $token);
5688 ## Ignore the token
5689 !!!nack ('t236.1');
5690 !!!next-token;
5691 next B;
5692 }
5693
5694 ## Clear back to table row context
5695 while (not ($self->{open_elements}->[-1]->[1]
5696 & TABLE_ROW_SCOPING_EL)) {
5697 !!!cp ('t236');
5698 ## ISSUE: Can this state be reached?
5699 pop @{$self->{open_elements}};
5700 }
5701
5702 pop @{$self->{open_elements}}; # tr
5703 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5704 ## reprocess in the "in table body" insertion mode...
5705 }
5706
5707 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5708 ## have an element in table scope
5709 my $i;
5710 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5711 my $node = $self->{open_elements}->[$_];
5712 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5713 !!!cp ('t237');
5714 $i = $_;
5715 last INSCOPE;
5716 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5717 !!!cp ('t238');
5718 last INSCOPE;
5719 }
5720 } # INSCOPE
5721 unless (defined $i) {
5722 !!!cp ('t239');
5723 !!!parse-error (type => 'unmatched end tag',
5724 text => $token->{tag_name}, token => $token);
5725 ## Ignore the token
5726 !!!nack ('t239.1');
5727 !!!next-token;
5728 next B;
5729 }
5730
5731 ## Clear back to table body context
5732 while (not ($self->{open_elements}->[-1]->[1]
5733 & TABLE_ROWS_SCOPING_EL)) {
5734 !!!cp ('t240');
5735 pop @{$self->{open_elements}};
5736 }
5737
5738 ## As if <{current node}>
5739 ## have an element in table scope
5740 ## true by definition
5741
5742 ## Clear back to table body context
5743 ## nop by definition
5744
5745 pop @{$self->{open_elements}};
5746 $self->{insertion_mode} = IN_TABLE_IM;
5747 ## reprocess in the "in table" insertion mode...
5748 }
5749
5750 ## NOTE: </table> in the "in table" insertion mode.
5751 ## When you edit the code fragment below, please ensure that
5752 ## the code for <table> in the "in table" insertion mode
5753 ## is synced with it.
5754
5755 ## have a table element in table scope
5756 my $i;
5757 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5758 my $node = $self->{open_elements}->[$_];
5759 if ($node->[1] & TABLE_EL) {
5760 !!!cp ('t241');
5761 $i = $_;
5762 last INSCOPE;
5763 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5764 !!!cp ('t242');
5765 last INSCOPE;
5766 }
5767 } # INSCOPE
5768 unless (defined $i) {
5769 !!!cp ('t243');
5770 !!!parse-error (type => 'unmatched end tag',
5771 text => $token->{tag_name}, token => $token);
5772 ## Ignore the token
5773 !!!nack ('t243.1');
5774 !!!next-token;
5775 next B;
5776 }
5777
5778 splice @{$self->{open_elements}}, $i;
5779 pop @{$open_tables};
5780
5781 $self->_reset_insertion_mode;
5782
5783 !!!next-token;
5784 next B;
5785 } elsif ({
5786 tbody => 1, tfoot => 1, thead => 1,
5787 }->{$token->{tag_name}} and
5788 $self->{insertion_mode} & ROW_IMS) {
5789 if ($self->{insertion_mode} == IN_ROW_IM) {
5790 ## have an element in table scope
5791 my $i;
5792 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5793 my $node = $self->{open_elements}->[$_];
5794 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5795 !!!cp ('t247');
5796 $i = $_;
5797 last INSCOPE;
5798 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5799 !!!cp ('t248');
5800 last INSCOPE;
5801 }
5802 } # INSCOPE
5803 unless (defined $i) {
5804 !!!cp ('t249');
5805 !!!parse-error (type => 'unmatched end tag',
5806 text => $token->{tag_name}, token => $token);
5807 ## Ignore the token
5808 !!!nack ('t249.1');
5809 !!!next-token;
5810 next B;
5811 }
5812
5813 ## As if </tr>
5814 ## have an element in table scope
5815 my $i;
5816 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5817 my $node = $self->{open_elements}->[$_];
5818 if ($node->[1] & TABLE_ROW_EL) {
5819 !!!cp ('t250');
5820 $i = $_;
5821 last INSCOPE;
5822 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5823 !!!cp ('t251');
5824 last INSCOPE;
5825 }
5826 } # INSCOPE
5827 unless (defined $i) {
5828 !!!cp ('t252');
5829 !!!parse-error (type => 'unmatched end tag',
5830 text => 'tr', token => $token);
5831 ## Ignore the token
5832 !!!nack ('t252.1');
5833 !!!next-token;
5834 next B;
5835 }
5836
5837 ## Clear back to table row context
5838 while (not ($self->{open_elements}->[-1]->[1]
5839 & TABLE_ROW_SCOPING_EL)) {
5840 !!!cp ('t253');
5841 ## ISSUE: Can this case be reached?
5842 pop @{$self->{open_elements}};
5843 }
5844
5845 pop @{$self->{open_elements}}; # tr
5846 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5847 ## reprocess in the "in table body" insertion mode...
5848 }
5849
5850 ## have an element in table scope
5851 my $i;
5852 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5853 my $node = $self->{open_elements}->[$_];
5854 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5855 !!!cp ('t254');
5856 $i = $_;
5857 last INSCOPE;
5858 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5859 !!!cp ('t255');
5860 last INSCOPE;
5861 }
5862 } # INSCOPE
5863 unless (defined $i) {
5864 !!!cp ('t256');
5865 !!!parse-error (type => 'unmatched end tag',
5866 text => $token->{tag_name}, token => $token);
5867 ## Ignore the token
5868 !!!nack ('t256.1');
5869 !!!next-token;
5870 next B;
5871 }
5872
5873 ## Clear back to table body context
5874 while (not ($self->{open_elements}->[-1]->[1]
5875 & TABLE_ROWS_SCOPING_EL)) {
5876 !!!cp ('t257');
5877 ## ISSUE: Can this case be reached?
5878 pop @{$self->{open_elements}};
5879 }
5880
5881 pop @{$self->{open_elements}};
5882 $self->{insertion_mode} = IN_TABLE_IM;
5883 !!!nack ('t257.1');
5884 !!!next-token;
5885 next B;
5886 } elsif ({
5887 body => 1, caption => 1, col => 1, colgroup => 1,
5888 html => 1, td => 1, th => 1,
5889 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5890 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
5891 }->{$token->{tag_name}}) {
5892 !!!cp ('t258');
5893 !!!parse-error (type => 'unmatched end tag',
5894 text => $token->{tag_name}, token => $token);
5895 ## Ignore the token
5896 !!!nack ('t258.1');
5897 !!!next-token;
5898 next B;
5899 } else {
5900 !!!cp ('t259');
5901 !!!parse-error (type => 'in table:/',
5902 text => $token->{tag_name}, token => $token);
5903
5904 $insert = $insert_to_foster;
5905 #
5906 }
5907 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5908 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
5909 @{$self->{open_elements}} == 1) { # redundant, maybe
5910 !!!parse-error (type => 'in body:#eof', token => $token);
5911 !!!cp ('t259.1');
5912 #
5913 } else {
5914 !!!cp ('t259.2');
5915 #
5916 }
5917
5918 ## Stop parsing
5919 last B;
5920 } else {
5921 die "$0: $token->{type}: Unknown token type";
5922 }
5923 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
5924 if ($token->{type} == CHARACTER_TOKEN) {
5925 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5926 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5927 unless (length $token->{data}) {
5928 !!!cp ('t260');
5929 !!!next-token;
5930 next B;
5931 }
5932 }
5933
5934 !!!cp ('t261');
5935 #
5936 } elsif ($token->{type} == START_TAG_TOKEN) {
5937 if ($token->{tag_name} eq 'col') {
5938 !!!cp ('t262');
5939 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5940 pop @{$self->{open_elements}};
5941 !!!ack ('t262.1');
5942 !!!next-token;
5943 next B;
5944 } else {
5945 !!!cp ('t263');
5946 #
5947 }
5948 } elsif ($token->{type} == END_TAG_TOKEN) {
5949 if ($token->{tag_name} eq 'colgroup') {
5950 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
5951 !!!cp ('t264');
5952 !!!parse-error (type => 'unmatched end tag',
5953 text => 'colgroup', token => $token);
5954 ## Ignore the token
5955 !!!next-token;
5956 next B;
5957 } else {
5958 !!!cp ('t265');
5959 pop @{$self->{open_elements}}; # colgroup
5960 $self->{insertion_mode} = IN_TABLE_IM;
5961 !!!next-token;
5962 next B;
5963 }
5964 } elsif ($token->{tag_name} eq 'col') {
5965 !!!cp ('t266');
5966 !!!parse-error (type => 'unmatched end tag',
5967 text => 'col', token => $token);
5968 ## Ignore the token
5969 !!!next-token;
5970 next B;
5971 } else {
5972 !!!cp ('t267');
5973 #
5974 }
5975 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5976 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
5977 @{$self->{open_elements}} == 1) { # redundant, maybe
5978 !!!cp ('t270.2');
5979 ## Stop parsing.
5980 last B;
5981 } else {
5982 ## NOTE: As if </colgroup>.
5983 !!!cp ('t270.1');
5984 pop @{$self->{open_elements}}; # colgroup
5985 $self->{insertion_mode} = IN_TABLE_IM;
5986 ## Reprocess.
5987 next B;
5988 }
5989 } else {
5990 die "$0: $token->{type}: Unknown token type";
5991 }
5992
5993 ## As if </colgroup>
5994 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
5995 !!!cp ('t269');
5996 ## TODO: Wrong error type?
5997 !!!parse-error (type => 'unmatched end tag',
5998 text => 'colgroup', token => $token);
5999 ## Ignore the token
6000 !!!nack ('t269.1');
6001 !!!next-token;
6002 next B;
6003 } else {
6004 !!!cp ('t270');
6005 pop @{$self->{open_elements}}; # colgroup
6006 $self->{insertion_mode} = IN_TABLE_IM;
6007 !!!ack-later;
6008 ## reprocess
6009 next B;
6010 }
6011 } elsif ($self->{insertion_mode} & SELECT_IMS) {
6012 if ($token->{type} == CHARACTER_TOKEN) {
6013 !!!cp ('t271');
6014 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
6015 !!!next-token;
6016 next B;
6017 } elsif ($token->{type} == START_TAG_TOKEN) {
6018 if ($token->{tag_name} eq 'option') {
6019 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6020 !!!cp ('t272');
6021 ## As if </option>
6022 pop @{$self->{open_elements}};
6023 } else {
6024 !!!cp ('t273');
6025 }
6026
6027 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6028 !!!nack ('t273.1');
6029 !!!next-token;
6030 next B;
6031 } elsif ($token->{tag_name} eq 'optgroup') {
6032 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6033 !!!cp ('t274');
6034 ## As if </option>
6035 pop @{$self->{open_elements}};
6036 } else {
6037 !!!cp ('t275');
6038 }
6039
6040 if ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
6041 !!!cp ('t276');
6042 ## As if </optgroup>
6043 pop @{$self->{open_elements}};
6044 } else {
6045 !!!cp ('t277');
6046 }
6047
6048 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6049 !!!nack ('t277.1');
6050 !!!next-token;
6051 next B;
6052 } elsif ({
6053 select => 1, input => 1, textarea => 1,
6054 }->{$token->{tag_name}} or
6055 ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
6056 {
6057 caption => 1, table => 1,
6058 tbody => 1, tfoot => 1, thead => 1,
6059 tr => 1, td => 1, th => 1,
6060 }->{$token->{tag_name}})) {
6061 ## TODO: The type below is not good - <select> is replaced by </select>
6062 !!!parse-error (type => 'not closed', text => 'select',
6063 token => $token);
6064 ## NOTE: As if the token were </select> (<select> case) or
6065 ## as if there were </select> (otherwise).
6066 ## have an element in table scope
6067 my $i;
6068 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6069 my $node = $self->{open_elements}->[$_];
6070 if ($node->[1] & SELECT_EL) {
6071 !!!cp ('t278');
6072 $i = $_;
6073 last INSCOPE;
6074 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6075 !!!cp ('t279');
6076 last INSCOPE;
6077 }
6078 } # INSCOPE
6079 unless (defined $i) {
6080 !!!cp ('t280');
6081 !!!parse-error (type => 'unmatched end tag',
6082 text => 'select', token => $token);
6083 ## Ignore the token
6084 !!!nack ('t280.1');
6085 !!!next-token;
6086 next B;
6087 }
6088
6089 !!!cp ('t281');
6090 splice @{$self->{open_elements}}, $i;
6091
6092 $self->_reset_insertion_mode;
6093
6094 if ($token->{tag_name} eq 'select') {
6095 !!!nack ('t281.2');
6096 !!!next-token;
6097 next B;
6098 } else {
6099 !!!cp ('t281.1');
6100 !!!ack-later;
6101 ## Reprocess the token.
6102 next B;
6103 }
6104 } else {
6105 !!!cp ('t282');
6106 !!!parse-error (type => 'in select',
6107 text => $token->{tag_name}, token => $token);
6108 ## Ignore the token
6109 !!!nack ('t282.1');
6110 !!!next-token;
6111 next B;
6112 }
6113 } elsif ($token->{type} == END_TAG_TOKEN) {
6114 if ($token->{tag_name} eq 'optgroup') {
6115 if ($self->{open_elements}->[-1]->[1] & OPTION_EL and
6116 $self->{open_elements}->[-2]->[1] & OPTGROUP_EL) {
6117 !!!cp ('t283');
6118 ## As if </option>
6119 splice @{$self->{open_elements}}, -2;
6120 } elsif ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
6121 !!!cp ('t284');
6122 pop @{$self->{open_elements}};
6123 } else {
6124 !!!cp ('t285');
6125 !!!parse-error (type => 'unmatched end tag',
6126 text => $token->{tag_name}, token => $token);
6127 ## Ignore the token
6128 }
6129 !!!nack ('t285.1');
6130 !!!next-token;
6131 next B;
6132 } elsif ($token->{tag_name} eq 'option') {
6133 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6134 !!!cp ('t286');
6135 pop @{$self->{open_elements}};
6136 } else {
6137 !!!cp ('t287');
6138 !!!parse-error (type => 'unmatched end tag',
6139 text => $token->{tag_name}, token => $token);
6140 ## Ignore the token
6141 }
6142 !!!nack ('t287.1');
6143 !!!next-token;
6144 next B;
6145 } elsif ($token->{tag_name} eq 'select') {
6146 ## have an element in table scope
6147 my $i;
6148 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6149 my $node = $self->{open_elements}->[$_];
6150 if ($node->[1] & SELECT_EL) {
6151 !!!cp ('t288');
6152 $i = $_;
6153 last INSCOPE;
6154 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6155 !!!cp ('t289');
6156 last INSCOPE;
6157 }
6158 } # INSCOPE
6159 unless (defined $i) {
6160 !!!cp ('t290');
6161 !!!parse-error (type => 'unmatched end tag',
6162 text => $token->{tag_name}, token => $token);
6163 ## Ignore the token
6164 !!!nack ('t290.1');
6165 !!!next-token;
6166 next B;
6167 }
6168
6169 !!!cp ('t291');
6170 splice @{$self->{open_elements}}, $i;
6171
6172 $self->_reset_insertion_mode;
6173
6174 !!!nack ('t291.1');
6175 !!!next-token;
6176 next B;
6177 } elsif ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
6178 {
6179 caption => 1, table => 1, tbody => 1,
6180 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
6181 }->{$token->{tag_name}}) {
6182 ## TODO: The following is wrong?
6183 !!!parse-error (type => 'unmatched end tag',
6184 text => $token->{tag_name}, token => $token);
6185
6186 ## have an element in table scope
6187 my $i;
6188 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6189 my $node = $self->{open_elements}->[$_];
6190 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6191 !!!cp ('t292');
6192 $i = $_;
6193 last INSCOPE;
6194 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6195 !!!cp ('t293');
6196 last INSCOPE;
6197 }
6198 } # INSCOPE
6199 unless (defined $i) {
6200 !!!cp ('t294');
6201 ## Ignore the token
6202 !!!nack ('t294.1');
6203 !!!next-token;
6204 next B;
6205 }
6206
6207 ## As if </select>
6208 ## have an element in table scope
6209 undef $i;
6210 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6211 my $node = $self->{open_elements}->[$_];
6212 if ($node->[1] & SELECT_EL) {
6213 !!!cp ('t295');
6214 $i = $_;
6215 last INSCOPE;
6216 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6217 ## ISSUE: Can this state be reached?
6218 !!!cp ('t296');
6219 last INSCOPE;
6220 }
6221 } # INSCOPE
6222 unless (defined $i) {
6223 !!!cp ('t297');
6224 ## TODO: The following error type is correct?
6225 !!!parse-error (type => 'unmatched end tag',
6226 text => 'select', token => $token);
6227 ## Ignore the </select> token
6228 !!!nack ('t297.1');
6229 !!!next-token; ## TODO: ok?
6230 next B;
6231 }
6232
6233 !!!cp ('t298');
6234 splice @{$self->{open_elements}}, $i;
6235
6236 $self->_reset_insertion_mode;
6237
6238 !!!ack-later;
6239 ## reprocess
6240 next B;
6241 } else {
6242 !!!cp ('t299');
6243 !!!parse-error (type => 'in select:/',
6244 text => $token->{tag_name}, token => $token);
6245 ## Ignore the token
6246 !!!nack ('t299.3');
6247 !!!next-token;
6248 next B;
6249 }
6250 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6251 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6252 @{$self->{open_elements}} == 1) { # redundant, maybe
6253 !!!cp ('t299.1');
6254 !!!parse-error (type => 'in body:#eof', token => $token);
6255 } else {
6256 !!!cp ('t299.2');
6257 }
6258
6259 ## Stop parsing.
6260 last B;
6261 } else {
6262 die "$0: $token->{type}: Unknown token type";
6263 }
6264 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
6265 if ($token->{type} == CHARACTER_TOKEN) {
6266 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6267 my $data = $1;
6268 ## As if in body
6269 $reconstruct_active_formatting_elements->($insert_to_current);
6270
6271 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6272
6273 unless (length $token->{data}) {
6274 !!!cp ('t300');
6275 !!!next-token;
6276 next B;
6277 }
6278 }
6279
6280 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6281 !!!cp ('t301');
6282 !!!parse-error (type => 'after html:#text', token => $token);
6283
6284 ## Reprocess in the "after body" insertion mode.
6285 } else {
6286 !!!cp ('t302');
6287 }
6288
6289 ## "after body" insertion mode
6290 !!!parse-error (type => 'after body:#text', token => $token);
6291
6292 $self->{insertion_mode} = IN_BODY_IM;
6293 ## reprocess
6294 next B;
6295 } elsif ($token->{type} == START_TAG_TOKEN) {
6296 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6297 !!!cp ('t303');
6298 !!!parse-error (type => 'after html',
6299 text => $token->{tag_name}, token => $token);
6300
6301 ## Reprocess in the "after body" insertion mode.
6302 } else {
6303 !!!cp ('t304');
6304 }
6305
6306 ## "after body" insertion mode
6307 !!!parse-error (type => 'after body',
6308 text => $token->{tag_name}, token => $token);
6309
6310 $self->{insertion_mode} = IN_BODY_IM;
6311 !!!ack-later;
6312 ## reprocess
6313 next B;
6314 } elsif ($token->{type} == END_TAG_TOKEN) {
6315 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6316 !!!cp ('t305');
6317 !!!parse-error (type => 'after html:/',
6318 text => $token->{tag_name}, token => $token);
6319
6320 $self->{insertion_mode} = AFTER_BODY_IM;
6321 ## Reprocess in the "after body" insertion mode.
6322 } else {
6323 !!!cp ('t306');
6324 }
6325
6326 ## "after body" insertion mode
6327 if ($token->{tag_name} eq 'html') {
6328 if (defined $self->{inner_html_node}) {
6329 !!!cp ('t307');
6330 !!!parse-error (type => 'unmatched end tag',
6331 text => 'html', token => $token);
6332 ## Ignore the token
6333 !!!next-token;
6334 next B;
6335 } else {
6336 !!!cp ('t308');
6337 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
6338 !!!next-token;
6339 next B;
6340 }
6341 } else {
6342 !!!cp ('t309');
6343 !!!parse-error (type => 'after body:/',
6344 text => $token->{tag_name}, token => $token);
6345
6346 $self->{insertion_mode} = IN_BODY_IM;
6347 ## reprocess
6348 next B;
6349 }
6350 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6351 !!!cp ('t309.2');
6352 ## Stop parsing
6353 last B;
6354 } else {
6355 die "$0: $token->{type}: Unknown token type";
6356 }
6357 } elsif ($self->{insertion_mode} & FRAME_IMS) {
6358 if ($token->{type} == CHARACTER_TOKEN) {
6359 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6360 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6361
6362 unless (length $token->{data}) {
6363 !!!cp ('t310');
6364 !!!next-token;
6365 next B;
6366 }
6367 }
6368
6369 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
6370 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6371 !!!cp ('t311');
6372 !!!parse-error (type => 'in frameset:#text', token => $token);
6373 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6374 !!!cp ('t312');
6375 !!!parse-error (type => 'after frameset:#text', token => $token);
6376 } else { # "after after frameset"
6377 !!!cp ('t313');
6378 !!!parse-error (type => 'after html:#text', token => $token);
6379 }
6380
6381 ## Ignore the token.
6382 if (length $token->{data}) {
6383 !!!cp ('t314');
6384 ## reprocess the rest of characters
6385 } else {
6386 !!!cp ('t315');
6387 !!!next-token;
6388 }
6389 next B;
6390 }
6391
6392 die qq[$0: Character "$token->{data}"];
6393 } elsif ($token->{type} == START_TAG_TOKEN) {
6394 if ($token->{tag_name} eq 'frameset' and
6395 $self->{insertion_mode} == IN_FRAMESET_IM) {
6396 !!!cp ('t318');
6397 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6398 !!!nack ('t318.1');
6399 !!!next-token;
6400 next B;
6401 } elsif ($token->{tag_name} eq 'frame' and
6402 $self->{insertion_mode} == IN_FRAMESET_IM) {
6403 !!!cp ('t319');
6404 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6405 pop @{$self->{open_elements}};
6406 !!!ack ('t319.1');
6407 !!!next-token;
6408 next B;
6409 } elsif ($token->{tag_name} eq 'noframes') {
6410 !!!cp ('t320');
6411 ## NOTE: As if in head.
6412 $parse_rcdata->(CDATA_CONTENT_MODEL);
6413 next B;
6414
6415 ## NOTE: |<!DOCTYPE HTML><frameset></frameset></html><noframes></noframes>|
6416 ## has no parse error.
6417 } else {
6418 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6419 !!!cp ('t321');
6420 !!!parse-error (type => 'in frameset',
6421 text => $token->{tag_name}, token => $token);
6422 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6423 !!!cp ('t322');
6424 !!!parse-error (type => 'after frameset',
6425 text => $token->{tag_name}, token => $token);
6426 } else { # "after after frameset"
6427 !!!cp ('t322.2');
6428 !!!parse-error (type => 'after after frameset',
6429 text => $token->{tag_name}, token => $token);
6430 }
6431 ## Ignore the token
6432 !!!nack ('t322.1');
6433 !!!next-token;
6434 next B;
6435 }
6436 } elsif ($token->{type} == END_TAG_TOKEN) {
6437 if ($token->{tag_name} eq 'frameset' and
6438 $self->{insertion_mode} == IN_FRAMESET_IM) {
6439 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
6440 @{$self->{open_elements}} == 1) {
6441 !!!cp ('t325');
6442 !!!parse-error (type => 'unmatched end tag',
6443 text => $token->{tag_name}, token => $token);
6444 ## Ignore the token
6445 !!!next-token;
6446 } else {
6447 !!!cp ('t326');
6448 pop @{$self->{open_elements}};
6449 !!!next-token;
6450 }
6451
6452 if (not defined $self->{inner_html_node} and
6453 not ($self->{open_elements}->[-1]->[1] & FRAMESET_EL)) {
6454 !!!cp ('t327');
6455 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6456 } else {
6457 !!!cp ('t328');
6458 }
6459 next B;
6460 } elsif ($token->{tag_name} eq 'html' and
6461 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
6462 !!!cp ('t329');
6463 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
6464 !!!next-token;
6465 next B;
6466 } else {
6467 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6468 !!!cp ('t330');
6469 !!!parse-error (type => 'in frameset:/',
6470 text => $token->{tag_name}, token => $token);
6471 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6472 !!!cp ('t330.1');
6473 !!!parse-error (type => 'after frameset:/',
6474 text => $token->{tag_name}, token => $token);
6475 } else { # "after after frameset"
6476 !!!cp ('t331');
6477 !!!parse-error (type => 'after after frameset:/',
6478 text => $token->{tag_name}, token => $token);
6479 }
6480 ## Ignore the token
6481 !!!next-token;
6482 next B;
6483 }
6484 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6485 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6486 @{$self->{open_elements}} == 1) { # redundant, maybe
6487 !!!cp ('t331.1');
6488 !!!parse-error (type => 'in body:#eof', token => $token);
6489 } else {
6490 !!!cp ('t331.2');
6491 }
6492
6493 ## Stop parsing
6494 last B;
6495 } else {
6496 die "$0: $token->{type}: Unknown token type";
6497 }
6498
6499 ## ISSUE: An issue in spec here
6500 } else {
6501 die "$0: $self->{insertion_mode}: Unknown insertion mode";
6502 }
6503
6504 ## "in body" insertion mode
6505 if ($token->{type} == START_TAG_TOKEN) {
6506 if ($token->{tag_name} eq 'script') {
6507 !!!cp ('t332');
6508 ## NOTE: This is an "as if in head" code clone
6509 $script_start_tag->();
6510 next B;
6511 } elsif ($token->{tag_name} eq 'style') {
6512 !!!cp ('t333');
6513 ## NOTE: This is an "as if in head" code clone
6514 $parse_rcdata->(CDATA_CONTENT_MODEL);
6515 next B;
6516 } elsif ({
6517 base => 1, link => 1,
6518 }->{$token->{tag_name}}) {
6519 !!!cp ('t334');
6520 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6521 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6522 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6523 !!!ack ('t334.1');
6524 !!!next-token;
6525 next B;
6526 } elsif ($token->{tag_name} eq 'meta') {
6527 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6528 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6529 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6530
6531 unless ($self->{confident}) {
6532 if ($token->{attributes}->{charset}) {
6533 !!!cp ('t335');
6534 ## NOTE: Whether the encoding is supported or not is handled
6535 ## in the {change_encoding} callback.
6536 $self->{change_encoding}
6537 ->($self, $token->{attributes}->{charset}->{value}, $token);
6538
6539 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6540 ->set_user_data (manakai_has_reference =>
6541 $token->{attributes}->{charset}
6542 ->{has_reference});
6543 } elsif ($token->{attributes}->{content}) {
6544 if ($token->{attributes}->{content}->{value}
6545 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
6546 [\x09-\x0D\x20]*=
6547 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
6548 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
6549 !!!cp ('t336');
6550 ## NOTE: Whether the encoding is supported or not is handled
6551 ## in the {change_encoding} callback.
6552 $self->{change_encoding}
6553 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
6554 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6555 ->set_user_data (manakai_has_reference =>
6556 $token->{attributes}->{content}
6557 ->{has_reference});
6558 }
6559 }
6560 } else {
6561 if ($token->{attributes}->{charset}) {
6562 !!!cp ('t337');
6563 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6564 ->set_user_data (manakai_has_reference =>
6565 $token->{attributes}->{charset}
6566 ->{has_reference});
6567 }
6568 if ($token->{attributes}->{content}) {
6569 !!!cp ('t338');
6570 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6571 ->set_user_data (manakai_has_reference =>
6572 $token->{attributes}->{content}
6573 ->{has_reference});
6574 }
6575 }
6576
6577 !!!ack ('t338.1');
6578 !!!next-token;
6579 next B;
6580 } elsif ($token->{tag_name} eq 'title') {
6581 !!!cp ('t341');
6582 ## NOTE: This is an "as if in head" code clone
6583 $parse_rcdata->(RCDATA_CONTENT_MODEL);
6584 next B;
6585 } elsif ($token->{tag_name} eq 'body') {
6586 !!!parse-error (type => 'in body', text => 'body', token => $token);
6587
6588 if (@{$self->{open_elements}} == 1 or
6589 not ($self->{open_elements}->[1]->[1] & BODY_EL)) {
6590 !!!cp ('t342');
6591 ## Ignore the token
6592 } else {
6593 my $body_el = $self->{open_elements}->[1]->[0];
6594 for my $attr_name (keys %{$token->{attributes}}) {
6595 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
6596 !!!cp ('t343');
6597 $body_el->set_attribute_ns
6598 (undef, [undef, $attr_name],
6599 $token->{attributes}->{$attr_name}->{value});
6600 }
6601 }
6602 }
6603 !!!nack ('t343.1');
6604 !!!next-token;
6605 next B;
6606 } elsif ({
6607 address => 1, blockquote => 1, center => 1, dir => 1,
6608 div => 1, dl => 1, fieldset => 1,
6609 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6610 menu => 1, ol => 1, p => 1, ul => 1,
6611 pre => 1, listing => 1,
6612 form => 1,
6613 table => 1,
6614 hr => 1,
6615 }->{$token->{tag_name}}) {
6616 if ($token->{tag_name} eq 'form' and defined $self->{form_element}) {
6617 !!!cp ('t350');
6618 !!!parse-error (type => 'in form:form', token => $token);
6619 ## Ignore the token
6620 !!!nack ('t350.1');
6621 !!!next-token;
6622 next B;
6623 }
6624
6625 ## has a p element in scope
6626 INSCOPE: for (reverse @{$self->{open_elements}}) {
6627 if ($_->[1] & P_EL) {
6628 !!!cp ('t344');
6629 !!!back-token; # <x> (the current start tag; not necessarily <form>)
6630 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6631 line => $token->{line}, column => $token->{column}};
6632 next B;
6633 } elsif ($_->[1] & SCOPING_EL) {
6634 !!!cp ('t345');
6635 last INSCOPE;
6636 }
6637 } # INSCOPE
6638
6639 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6640 if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') {
6641 !!!nack ('t346.1');
6642 !!!next-token;
6643 if ($token->{type} == CHARACTER_TOKEN) {
6644 $token->{data} =~ s/^\x0A//;
6645 unless (length $token->{data}) {
6646 !!!cp ('t346');
6647 !!!next-token;
6648 } else {
6649 !!!cp ('t349');
6650 }
6651 } else {
6652 !!!cp ('t348');
6653 }
6654 } elsif ($token->{tag_name} eq 'form') {
6655 !!!cp ('t347.1');
6656 $self->{form_element} = $self->{open_elements}->[-1]->[0];
6657
6658 !!!nack ('t347.2');
6659 !!!next-token;
6660 } elsif ($token->{tag_name} eq 'table') {
6661 !!!cp ('t382');
6662 push @{$open_tables}, [$self->{open_elements}->[-1]->[0]];
6663
6664 $self->{insertion_mode} = IN_TABLE_IM;
6665
6666 !!!nack ('t382.1');
6667 !!!next-token;
6668 } elsif ($token->{tag_name} eq 'hr') {
6669 !!!cp ('t386');
6670 pop @{$self->{open_elements}};
6671
6672 !!!nack ('t386.1');
6673 !!!next-token;
6674 } else {
6675 !!!nack ('t347.1');
6676 !!!next-token;
6677 }
6678 next B;
6679 } elsif ({li => 1, dt => 1, dd => 1}->{$token->{tag_name}}) {
6680 ## has a p element in scope
6681 INSCOPE: for (reverse @{$self->{open_elements}}) {
6682 if ($_->[1] & P_EL) {
6683 !!!cp ('t353');
6684 !!!back-token; # <x>
6685 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6686 line => $token->{line}, column => $token->{column}};
6687 next B;
6688 } elsif ($_->[1] & SCOPING_EL) {
6689 !!!cp ('t354');
6690 last INSCOPE;
6691 }
6692 } # INSCOPE
6693
6694 ## Step 1
6695 my $i = -1;
6696 my $node = $self->{open_elements}->[$i];
6697 my $li_or_dtdd = {li => {li => 1},
6698 dt => {dt => 1, dd => 1},
6699 dd => {dt => 1, dd => 1}}->{$token->{tag_name}};
6700 LI: {
6701 ## Step 2
6702 if ($li_or_dtdd->{$node->[0]->manakai_local_name}) {
6703 if ($i != -1) {
6704 !!!cp ('t355');
6705 !!!parse-error (type => 'not closed',
6706 text => $self->{open_elements}->[-1]->[0]
6707 ->manakai_local_name,
6708 token => $token);
6709 } else {
6710 !!!cp ('t356');
6711 }
6712 splice @{$self->{open_elements}}, $i;
6713 last LI;
6714 } else {
6715 !!!cp ('t357');
6716 }
6717
6718 ## Step 3
6719 if (not ($node->[1] & FORMATTING_EL) and
6720 #not $phrasing_category->{$node->[1]} and
6721 ($node->[1] & SPECIAL_EL or
6722 $node->[1] & SCOPING_EL) and
6723 not ($node->[1] & ADDRESS_EL) and
6724 not ($node->[1] & DIV_EL)) {
6725 !!!cp ('t358');
6726 last LI;
6727 }
6728
6729 !!!cp ('t359');
6730 ## Step 4
6731 $i--;
6732 $node = $self->{open_elements}->[$i];
6733 redo LI;
6734 } # LI
6735
6736 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6737 !!!nack ('t359.1');
6738 !!!next-token;
6739 next B;
6740 } elsif ($token->{tag_name} eq 'plaintext') {
6741 ## has a p element in scope
6742 INSCOPE: for (reverse @{$self->{open_elements}}) {
6743 if ($_->[1] & P_EL) {
6744 !!!cp ('t367');
6745 !!!back-token; # <plaintext>
6746 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6747 line => $token->{line}, column => $token->{column}};
6748 next B;
6749 } elsif ($_->[1] & SCOPING_EL) {
6750 !!!cp ('t368');
6751 last INSCOPE;
6752 }
6753 } # INSCOPE
6754
6755 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6756
6757 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
6758
6759 !!!nack ('t368.1');
6760 !!!next-token;
6761 next B;
6762 } elsif ($token->{tag_name} eq 'a') {
6763 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
6764 my $node = $active_formatting_elements->[$i];
6765 if ($node->[1] & A_EL) {
6766 !!!cp ('t371');
6767 !!!parse-error (type => 'in a:a', token => $token);
6768
6769 !!!back-token; # <a>
6770 $token = {type => END_TAG_TOKEN, tag_name => 'a',
6771 line => $token->{line}, column => $token->{column}};
6772 $formatting_end_tag->($token);
6773
6774 AFE2: for (reverse 0..$#$active_formatting_elements) {
6775 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
6776 !!!cp ('t372');
6777 splice @$active_formatting_elements, $_, 1;
6778 last AFE2;
6779 }
6780 } # AFE2
6781 OE: for (reverse 0..$#{$self->{open_elements}}) {
6782 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
6783 !!!cp ('t373');
6784 splice @{$self->{open_elements}}, $_, 1;
6785 last OE;
6786 }
6787 } # OE
6788 last AFE;
6789 } elsif ($node->[0] eq '#marker') {
6790 !!!cp ('t374');
6791 last AFE;
6792 }
6793 } # AFE
6794
6795 $reconstruct_active_formatting_elements->($insert_to_current);
6796
6797 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6798 push @$active_formatting_elements, $self->{open_elements}->[-1];
6799
6800 !!!nack ('t374.1');
6801 !!!next-token;
6802 next B;
6803 } elsif ($token->{tag_name} eq 'nobr') {
6804 $reconstruct_active_formatting_elements->($insert_to_current);
6805
6806 ## has a |nobr| element in scope
6807 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6808 my $node = $self->{open_elements}->[$_];
6809 if ($node->[1] & NOBR_EL) {
6810 !!!cp ('t376');
6811 !!!parse-error (type => 'in nobr:nobr', token => $token);
6812 !!!back-token; # <nobr>
6813 $token = {type => END_TAG_TOKEN, tag_name => 'nobr',
6814 line => $token->{line}, column => $token->{column}};
6815 next B;
6816 } elsif ($node->[1] & SCOPING_EL) {
6817 !!!cp ('t377');
6818 last INSCOPE;
6819 }
6820 } # INSCOPE
6821
6822 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6823 push @$active_formatting_elements, $self->{open_elements}->[-1];
6824
6825 !!!nack ('t377.1');
6826 !!!next-token;
6827 next B;
6828 } elsif ($token->{tag_name} eq 'button') {
6829 ## has a button element in scope
6830 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6831 my $node = $self->{open_elements}->[$_];
6832 if ($node->[1] & BUTTON_EL) {
6833 !!!cp ('t378');
6834 !!!parse-error (type => 'in button:button', token => $token);
6835 !!!back-token; # <button>
6836 $token = {type => END_TAG_TOKEN, tag_name => 'button',
6837 line => $token->{line}, column => $token->{column}};
6838 next B;
6839 } elsif ($node->[1] & SCOPING_EL) {
6840 !!!cp ('t379');
6841 last INSCOPE;
6842 }
6843 } # INSCOPE
6844
6845 $reconstruct_active_formatting_elements->($insert_to_current);
6846
6847 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6848
6849 ## TODO: associate with $self->{form_element} if defined
6850
6851 push @$active_formatting_elements, ['#marker', ''];
6852
6853 !!!nack ('t379.1');
6854 !!!next-token;
6855 next B;
6856 } elsif ({
6857 xmp => 1,
6858 iframe => 1,
6859 noembed => 1,
6860 noframes => 1, ## NOTE: This is an "as if in head" code clone.
6861 noscript => 0, ## TODO: 1 if scripting is enabled
6862 }->{$token->{tag_name}}) {
6863 if ($token->{tag_name} eq 'xmp') {
6864 !!!cp ('t381');
6865 $reconstruct_active_formatting_elements->($insert_to_current);
6866 } else {
6867 !!!cp ('t399');
6868 }
6869 ## NOTE: There is an "as if in body" code clone.
6870 $parse_rcdata->(CDATA_CONTENT_MODEL);
6871 next B;
6872 } elsif ($token->{tag_name} eq 'isindex') {
6873 !!!parse-error (type => 'isindex', token => $token);
6874
6875 if (defined $self->{form_element}) {
6876 !!!cp ('t389');
6877 ## Ignore the token
6878 !!!nack ('t389'); ## NOTE: Not acknowledged.
6879 !!!next-token;
6880 next B;
6881 } else {
6882 !!!ack ('t391.1');
6883
6884 my $at = $token->{attributes};
6885 my $form_attrs;
6886 $form_attrs->{action} = $at->{action} if $at->{action};
6887 my $prompt_attr = $at->{prompt};
6888 $at->{name} = {name => 'name', value => 'isindex'};
6889 delete $at->{action};
6890 delete $at->{prompt};
6891 my @tokens = (
6892 {type => START_TAG_TOKEN, tag_name => 'form',
6893 attributes => $form_attrs,
6894 line => $token->{line}, column => $token->{column}},
6895 {type => START_TAG_TOKEN, tag_name => 'hr',
6896 line => $token->{line}, column => $token->{column}},
6897 {type => START_TAG_TOKEN, tag_name => 'p',
6898 line => $token->{line}, column => $token->{column}},
6899 {type => START_TAG_TOKEN, tag_name => 'label',
6900 line => $token->{line}, column => $token->{column}},
6901 );
6902 if ($prompt_attr) {
6903 !!!cp ('t390');
6904 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value},
6905 #line => $token->{line}, column => $token->{column},
6906 };
6907 } else {
6908 !!!cp ('t391');
6909 push @tokens, {type => CHARACTER_TOKEN,
6910 data => 'This is a searchable index. Insert your search keywords here: ',
6911 #line => $token->{line}, column => $token->{column},
6912 }; # SHOULD
6913 ## TODO: make this configurable
6914 }
6915 push @tokens,
6916 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at,
6917 line => $token->{line}, column => $token->{column}},
6918 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
6919 {type => END_TAG_TOKEN, tag_name => 'label',
6920 line => $token->{line}, column => $token->{column}},
6921 {type => END_TAG_TOKEN, tag_name => 'p',
6922 line => $token->{line}, column => $token->{column}},
6923 {type => START_TAG_TOKEN, tag_name => 'hr',
6924 line => $token->{line}, column => $token->{column}},
6925 {type => END_TAG_TOKEN, tag_name => 'form',
6926 line => $token->{line}, column => $token->{column}};
6927 !!!back-token (@tokens);
6928 !!!next-token;
6929 next B;
6930 }
6931 } elsif ($token->{tag_name} eq 'textarea') {
6932 my $tag_name = $token->{tag_name};
6933 my $el;
6934 !!!create-element ($el, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
6935
6936 ## TODO: $self->{form_element} if defined
6937 $self->{content_model} = RCDATA_CONTENT_MODEL;
6938 delete $self->{escape}; # MUST
6939
6940 $insert->($el);
6941
6942 my $text = '';
6943 !!!nack ('t392.1');
6944 !!!next-token;
6945 if ($token->{type} == CHARACTER_TOKEN) {
6946 $token->{data} =~ s/^\x0A//;
6947 unless (length $token->{data}) {
6948 !!!cp ('t392');
6949 !!!next-token;
6950 } else {
6951 !!!cp ('t393');
6952 }
6953 } else {
6954 !!!cp ('t394');
6955 }
6956 while ($token->{type} == CHARACTER_TOKEN) {
6957 !!!cp ('t395');
6958 $text .= $token->{data};
6959 !!!next-token;
6960 }
6961 if (length $text) {
6962 !!!cp ('t396');
6963 $el->manakai_append_text ($text);
6964 }
6965
6966 $self->{content_model} = PCDATA_CONTENT_MODEL;
6967
6968 if ($token->{type} == END_TAG_TOKEN and
6969 $token->{tag_name} eq $tag_name) {
6970 !!!cp ('t397');
6971 ## Ignore the token
6972 } else {
6973 !!!cp ('t398');
6974 !!!parse-error (type => 'in RCDATA:#eof', token => $token);
6975 }
6976 !!!next-token;
6977 next B;
6978 } elsif ($token->{tag_name} eq 'rt' or
6979 $token->{tag_name} eq 'rp') {
6980 ## has a |ruby| element in scope
6981 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6982 my $node = $self->{open_elements}->[$_];
6983 if ($node->[1] & RUBY_EL) {
6984 !!!cp ('t398.1');
6985 ## generate implied end tags
6986 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
6987 !!!cp ('t398.2');
6988 pop @{$self->{open_elements}};
6989 }
6990 unless ($self->{open_elements}->[-1]->[1] & RUBY_EL) {
6991 !!!cp ('t398.3');
6992 !!!parse-error (type => 'not closed',
6993 text => $self->{open_elements}->[-1]->[0]
6994 ->manakai_local_name,
6995 token => $token);
6996 pop @{$self->{open_elements}}
6997 while not $self->{open_elements}->[-1]->[1] & RUBY_EL;
6998 }
6999 last INSCOPE;
7000 } elsif ($node->[1] & SCOPING_EL) {
7001 !!!cp ('t398.4');
7002 last INSCOPE;
7003 }
7004 } # INSCOPE
7005
7006 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7007
7008 !!!nack ('t398.5');
7009 !!!next-token;
7010 redo B;
7011 } elsif ($token->{tag_name} eq 'math' or
7012 $token->{tag_name} eq 'svg') {
7013 $reconstruct_active_formatting_elements->($insert_to_current);
7014
7015 ## "Adjust MathML attributes" ('math' only) - done in insert-element-f
7016
7017 ## "adjust SVG attributes" ('svg' only) - done in insert-element-f
7018
7019 ## "adjust foreign attributes" - done in insert-element-f
7020
7021 !!!insert-element-f ($token->{tag_name} eq 'math' ? $MML_NS : $SVG_NS, $token->{tag_name}, $token->{attributes}, $token);
7022
7023 if ($self->{self_closing}) {
7024 pop @{$self->{open_elements}};
7025 !!!ack ('t398.1');
7026 } else {
7027 !!!cp ('t398.2');
7028 $self->{insertion_mode} |= IN_FOREIGN_CONTENT_IM;
7029 ## NOTE: |<body><math><mi><svg>| -> "in foreign content" insertion
7030 ## mode, "in body" (not "in foreign content") secondary insertion
7031 ## mode, maybe.
7032 }
7033
7034 !!!next-token;
7035 next B;
7036 } elsif ({
7037 caption => 1, col => 1, colgroup => 1, frame => 1,
7038 frameset => 1, head => 1, option => 1, optgroup => 1,
7039 tbody => 1, td => 1, tfoot => 1, th => 1,
7040 thead => 1, tr => 1,
7041 }->{$token->{tag_name}}) {
7042 !!!cp ('t401');
7043 !!!parse-error (type => 'in body',
7044 text => $token->{tag_name}, token => $token);
7045 ## Ignore the token
7046 !!!nack ('t401.1'); ## NOTE: |<col/>| or |<frame/>| here is an error.
7047 !!!next-token;
7048 next B;
7049
7050 ## ISSUE: An issue on HTML5 new elements in the spec.
7051 } else {
7052 if ($token->{tag_name} eq 'image') {
7053 !!!cp ('t384');
7054 !!!parse-error (type => 'image', token => $token);
7055 $token->{tag_name} = 'img';
7056 } else {
7057 !!!cp ('t385');
7058 }
7059
7060 ## NOTE: There is an "as if <br>" code clone.
7061 $reconstruct_active_formatting_elements->($insert_to_current);
7062
7063 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7064
7065 if ({ ## Post-insertion handling, selected by the tag name of the element just inserted.
7066 applet => 1, marquee => 1, object => 1,
7067 }->{$token->{tag_name}}) {
7068 !!!cp ('t380');
7069 push @$active_formatting_elements, ['#marker', '']; ## These elements push a marker entry.
7070 !!!nack ('t380.1');
7071 } elsif ({ ## Formatting elements: recorded in the list of active formatting elements.
7072 b => 1, big => 1, em => 1, font => 1, i => 1,
7073 s => 1, small => 1, strike => 1, ## NOTE: |strike| must be listed so that <strike> is tracked as a formatting element.
7074 strong => 1, tt => 1, u => 1,
7075 }->{$token->{tag_name}}) {
7076 !!!cp ('t375');
7077 push @$active_formatting_elements, $self->{open_elements}->[-1];
7078 !!!nack ('t375.1');
7079 } elsif ($token->{tag_name} eq 'input') {
7080 !!!cp ('t388');
7081 ## TODO: associate with $self->{form_element} if defined
7082 pop @{$self->{open_elements}}; ## The element is not kept on the stack of open elements.
7083 !!!ack ('t388.2');
7084 } elsif ({ ## Other elements that are popped immediately after insertion.
7085 area => 1, basefont => 1, bgsound => 1, br => 1,
7086 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
7087 #image => 1,
7088 }->{$token->{tag_name}}) {
7089 !!!cp ('t388.1');
7090 pop @{$self->{open_elements}};
7091 !!!ack ('t388.3');
7092 } elsif ($token->{tag_name} eq 'select') {
7093 ## TODO: associate with $self->{form_element} if defined
7094
7095 if ($self->{insertion_mode} & TABLE_IMS or ## Choose the |select| insertion mode from the current mode.
7096 $self->{insertion_mode} & BODY_TABLE_IMS or
7097 $self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
7098 !!!cp ('t400.1');
7099 $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM;
7100 } else {
7101 !!!cp ('t400.2');
7102 $self->{insertion_mode} = IN_SELECT_IM;
7103 }
7104 !!!nack ('t400.3');
7105 } else {
7106 !!!nack ('t402'); ## Any other element: nothing more to do after insertion.
7107 }
7108
7109 !!!next-token;
7110 next B;
7111 }
7112 } elsif ($token->{type} == END_TAG_TOKEN) {
7113 if ($token->{tag_name} eq 'body') {
7114 ## has a |body| element in scope
7115 my $i;
7116 INSCOPE: {
7117 for (reverse @{$self->{open_elements}}) {
7118 if ($_->[1] & BODY_EL) {
7119 !!!cp ('t405');
7120 $i = $_;
7121 last INSCOPE;
7122 } elsif ($_->[1] & SCOPING_EL) {
7123 !!!cp ('t405.1');
7124 last;
7125 }
7126 }
7127
7128 !!!parse-error (type => 'start tag not allowed',
7129 text => $token->{tag_name}, token => $token);
7130 ## NOTE: Ignore the token.
7131 !!!next-token;
7132 next B;
7133 } # INSCOPE
7134
7135 for (@{$self->{open_elements}}) {
7136 unless ($_->[1] & ALL_END_TAG_OPTIONAL_EL) {
7137 !!!cp ('t403');
7138 !!!parse-error (type => 'not closed',
7139 text => $_->[0]->manakai_local_name,
7140 token => $token);
7141 last;
7142 } else {
7143 !!!cp ('t404');
7144 }
7145 }
7146
7147 $self->{insertion_mode} = AFTER_BODY_IM;
7148 !!!next-token;
7149 next B;
7150 } elsif ($token->{tag_name} eq 'html') {
7151 ## TODO: Update this code. It seems that the code below is not
7152 ## up-to-date, though it has same effect as speced.
7153 if (@{$self->{open_elements}} > 1 and
7154 $self->{open_elements}->[1]->[1] & BODY_EL) {
7155 ## ISSUE: There is an issue in the spec.
7156 unless ($self->{open_elements}->[-1]->[1] & BODY_EL) {
7157 !!!cp ('t406');
7158 !!!parse-error (type => 'not closed',
7159 text => $self->{open_elements}->[1]->[0]
7160 ->manakai_local_name,
7161 token => $token);
7162 } else {
7163 !!!cp ('t407');
7164 }
7165 $self->{insertion_mode} = AFTER_BODY_IM;
7166 ## reprocess
7167 next B;
7168 } else {
7169 !!!cp ('t408');
7170 !!!parse-error (type => 'unmatched end tag',
7171 text => $token->{tag_name}, token => $token);
7172 ## Ignore the token
7173 !!!next-token;
7174 next B;
7175 }
7176 } elsif ({
7177 address => 1, blockquote => 1, center => 1, dir => 1,
7178 div => 1, dl => 1, fieldset => 1, listing => 1,
7179 menu => 1, ol => 1, pre => 1, ul => 1,
7180 dd => 1, dt => 1, li => 1,
7181 applet => 1, button => 1, marquee => 1, object => 1,
7182 }->{$token->{tag_name}}) {
7183 ## has an element in scope
7184 my $i;
7185 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7186 my $node = $self->{open_elements}->[$_];
7187 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7188 !!!cp ('t410');
7189 $i = $_;
7190 last INSCOPE;
7191 } elsif ($node->[1] & SCOPING_EL) {
7192 !!!cp ('t411');
7193 last INSCOPE;
7194 }
7195 } # INSCOPE
7196
7197 unless (defined $i) { # has an element in scope
7198 !!!cp ('t413');
7199 !!!parse-error (type => 'unmatched end tag',
7200 text => $token->{tag_name}, token => $token);
7201 ## NOTE: Ignore the token.
7202 } else {
7203 ## Step 1. generate implied end tags
7204 while ({
7205 ## END_TAG_OPTIONAL_EL
7206 dd => ($token->{tag_name} ne 'dd'),
7207 dt => ($token->{tag_name} ne 'dt'),
7208 li => ($token->{tag_name} ne 'li'),
7209 p => 1,
7210 rt => 1,
7211 rp => 1,
7212 }->{$self->{open_elements}->[-1]->[0]->manakai_local_name}) {
7213 !!!cp ('t409');
7214 pop @{$self->{open_elements}};
7215 }
7216
7217 ## Step 2.
7218 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7219 ne $token->{tag_name}) {
7220 !!!cp ('t412');
7221 !!!parse-error (type => 'not closed',
7222 text => $self->{open_elements}->[-1]->[0]
7223 ->manakai_local_name,
7224 token => $token);
7225 } else {
7226 !!!cp ('t414');
7227 }
7228
7229 ## Step 3.
7230 splice @{$self->{open_elements}}, $i;
7231
7232 ## Step 4.
7233 $clear_up_to_marker->()
7234 if {
7235 applet => 1, button => 1, marquee => 1, object => 1,
7236 }->{$token->{tag_name}};
7237 }
7238 !!!next-token;
7239 next B;
7240 } elsif ($token->{tag_name} eq 'form') {
7241 undef $self->{form_element};
7242
7243 ## has an element in scope
7244 my $i;
7245 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7246 my $node = $self->{open_elements}->[$_];
7247 if ($node->[1] & FORM_EL) {
7248 !!!cp ('t418');
7249 $i = $_;
7250 last INSCOPE;
7251 } elsif ($node->[1] & SCOPING_EL) {
7252 !!!cp ('t419');
7253 last INSCOPE;
7254 }
7255 } # INSCOPE
7256
7257 unless (defined $i) { # has an element in scope
7258 !!!cp ('t421');
7259 !!!parse-error (type => 'unmatched end tag',
7260 text => $token->{tag_name}, token => $token);
7261 ## NOTE: Ignore the token.
7262 } else {
7263 ## Step 1. generate implied end tags
7264 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7265 !!!cp ('t417');
7266 pop @{$self->{open_elements}};
7267 }
7268
7269 ## Step 2.
7270 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7271 ne $token->{tag_name}) {
7272 !!!cp ('t417.1');
7273 !!!parse-error (type => 'not closed',
7274 text => $self->{open_elements}->[-1]->[0]
7275 ->manakai_local_name,
7276 token => $token);
7277 } else {
7278 !!!cp ('t420');
7279 }
7280
7281 ## Step 3.
7282 splice @{$self->{open_elements}}, $i;
7283 }
7284
7285 !!!next-token;
7286 next B;
7287 } elsif ({
7288 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
7289 }->{$token->{tag_name}}) {
7290 ## has an element in scope
7291 my $i;
7292 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7293 my $node = $self->{open_elements}->[$_];
7294 if ($node->[1] & HEADING_EL) {
7295 !!!cp ('t423');
7296 $i = $_;
7297 last INSCOPE;
7298 } elsif ($node->[1] & SCOPING_EL) {
7299 !!!cp ('t424');
7300 last INSCOPE;
7301 }
7302 } # INSCOPE
7303
7304 unless (defined $i) { # has an element in scope
7305 !!!cp ('t425.1');
7306 !!!parse-error (type => 'unmatched end tag',
7307 text => $token->{tag_name}, token => $token);
7308 ## NOTE: Ignore the token.
7309 } else {
7310 ## Step 1. generate implied end tags
7311 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7312 !!!cp ('t422');
7313 pop @{$self->{open_elements}};
7314 }
7315
7316 ## Step 2.
7317 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7318 ne $token->{tag_name}) {
7319 !!!cp ('t425');
7320 !!!parse-error (type => 'unmatched end tag',
7321 text => $token->{tag_name}, token => $token);
7322 } else {
7323 !!!cp ('t426');
7324 }
7325
7326 ## Step 3.
7327 splice @{$self->{open_elements}}, $i;
7328 }
7329
7330 !!!next-token;
7331 next B;
7332 } elsif ($token->{tag_name} eq 'p') {
7333 ## has an element in scope
7334 my $i;
7335 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7336 my $node = $self->{open_elements}->[$_];
7337 if ($node->[1] & P_EL) {
7338 !!!cp ('t410.1');
7339 $i = $_;
7340 last INSCOPE;
7341 } elsif ($node->[1] & SCOPING_EL) {
7342 !!!cp ('t411.1');
7343 last INSCOPE;
7344 }
7345 } # INSCOPE
7346
7347 if (defined $i) {
7348 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7349 ne $token->{tag_name}) {
7350 !!!cp ('t412.1');
7351 !!!parse-error (type => 'not closed',
7352 text => $self->{open_elements}->[-1]->[0]
7353 ->manakai_local_name,
7354 token => $token);
7355 } else {
7356 !!!cp ('t414.1');
7357 }
7358
7359 splice @{$self->{open_elements}}, $i;
7360 } else {
7361 !!!cp ('t413.1');
7362 !!!parse-error (type => 'unmatched end tag',
7363 text => $token->{tag_name}, token => $token);
7364
7365 !!!cp ('t415.1');
7366 ## As if <p>, then reprocess the current token
7367 my $el;
7368 !!!create-element ($el, $HTML_NS, 'p',, $token);
7369 $insert->($el);
7370 ## NOTE: Not inserted into |$self->{open_elements}|.
7371 }
7372
7373 !!!next-token;
7374 next B;
7375 } elsif ({
7376 a => 1,
7377 b => 1, big => 1, em => 1, font => 1, i => 1,
7378 nobr => 1, s => 1, small => 1, strile => 1,
7379 strong => 1, tt => 1, u => 1,
7380 }->{$token->{tag_name}}) {
7381 !!!cp ('t427');
7382 $formatting_end_tag->($token);
7383 next B;
7384 } elsif ($token->{tag_name} eq 'br') {
7385 !!!cp ('t428');
7386 !!!parse-error (type => 'unmatched end tag',
7387 text => 'br', token => $token);
7388
7389 ## As if <br>
7390 $reconstruct_active_formatting_elements->($insert_to_current);
7391
7392 my $el;
7393 !!!create-element ($el, $HTML_NS, 'br',, $token);
7394 $insert->($el);
7395
7396 ## Ignore the token.
7397 !!!next-token;
7398 next B;
7399 } elsif ({
7400 caption => 1, col => 1, colgroup => 1, frame => 1,
7401 frameset => 1, head => 1, option => 1, optgroup => 1,
7402 tbody => 1, td => 1, tfoot => 1, th => 1,
7403 thead => 1, tr => 1,
7404 area => 1, basefont => 1, bgsound => 1,
7405 embed => 1, hr => 1, iframe => 1, image => 1,
7406 img => 1, input => 1, isindex => 1, noembed => 1,
7407 noframes => 1, param => 1, select => 1, spacer => 1,
7408 table => 1, textarea => 1, wbr => 1,
7409 noscript => 0, ## TODO: if scripting is enabled
7410 }->{$token->{tag_name}}) {
7411 !!!cp ('t429');
7412 !!!parse-error (type => 'unmatched end tag',
7413 text => $token->{tag_name}, token => $token);
7414 ## Ignore the token
7415 !!!next-token;
7416 next B;
7417
7418 ## ISSUE: Issue on HTML5 new elements in spec
7419
7420 } else {
7421 ## Step 1
7422 my $node_i = -1;
7423 my $node = $self->{open_elements}->[$node_i];
7424
7425 ## Step 2
7426 S2: {
7427 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7428 ## Step 1
7429 ## generate implied end tags
7430 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7431 !!!cp ('t430');
7432 ## NOTE: |<ruby><rt></ruby>|.
7433 ## ISSUE: <ruby><rt></rt> will also take this code path,
7434 ## which seems wrong.
7435 pop @{$self->{open_elements}};
7436 $node_i++;
7437 }
7438
7439 ## Step 2
7440 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7441 ne $token->{tag_name}) {
7442 !!!cp ('t431');
7443 ## NOTE: <x><y></x>
7444 !!!parse-error (type => 'not closed',
7445 text => $self->{open_elements}->[-1]->[0]
7446 ->manakai_local_name,
7447 token => $token);
7448 } else {
7449 !!!cp ('t432');
7450 }
7451
7452 ## Step 3
7453 splice @{$self->{open_elements}}, $node_i if $node_i < 0;
7454
7455 !!!next-token;
7456 last S2;
7457 } else {
7458 ## Step 3
7459 if (not ($node->[1] & FORMATTING_EL) and
7460 #not $phrasing_category->{$node->[1]} and
7461 ($node->[1] & SPECIAL_EL or
7462 $node->[1] & SCOPING_EL)) {
7463 !!!cp ('t433');
7464 !!!parse-error (type => 'unmatched end tag',
7465 text => $token->{tag_name}, token => $token);
7466 ## Ignore the token
7467 !!!next-token;
7468 last S2;
7469 }
7470
7471 !!!cp ('t434');
7472 }
7473
7474 ## Step 4
7475 $node_i--;
7476 $node = $self->{open_elements}->[$node_i];
7477
7478 ## Step 5;
7479 redo S2;
7480 } # S2
7481 next B;
7482 }
7483 }
7484 next B;
7485 } continue { # B
7486 if ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
7487 ## NOTE: The code below is executed in cases where it does not have
7488 ## to be, but it is harmless even in those cases.
7489 ## has an element in scope
7490 INSCOPE: {
7491 for (reverse 0..$#{$self->{open_elements}}) {
7492 my $node = $self->{open_elements}->[$_];
7493 if ($node->[1] & FOREIGN_EL) {
7494 last INSCOPE;
7495 } elsif ($node->[1] & SCOPING_EL) {
7496 last;
7497 }
7498 }
7499
7500 ## NOTE: No foreign element in scope.
7501 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
7502 } # INSCOPE
7503 }
7504 } # B
7505
7506 ## Stop parsing # MUST
7507
7508 ## TODO: script stuffs
7509 } # _tree_construct_main
7510
## Replaces the children of |$node| with the result of parsing a
## character string as HTML (the |innerHTML| setter).
##
## Called as a class method:
##
##   $class->set_inner_html ($node, $string, $onerror, $get_wrapper);
##
##   $node        - The node whose children are replaced.  Only
##                  |node_type| 9 (document) and 1 (element) are
##                  supported.
##   $string      - The character string to parse.  It is accessed
##                  through a reference to the caller's argument.
##   $onerror     - Optional code reference invoked for parse errors with
##                  named arguments (|line|, |column|, |type|, ...).  In
##                  the element case, a handler that |warn|s is used when
##                  it is omitted.  In the document case it is passed to
##                  |parse_char_string| as is.
##   $get_wrapper - Optional code reference.  It is passed to
##                  |parse_char_string| in the document case and is not
##                  used yet in the element case (see the TODO below).
##
## Dies if |$node| is neither a document nor an element.
##
## NOTE: Prototypes are not applied to method calls, so the fifth
## argument (|$get_wrapper|) is accepted although the prototype only
## lists four.
sub set_inner_html ($$$;$) {
  my $class = shift;
  my $node = shift;
  my $s = \$_[0]; # reference to the caller's string
  my $onerror = $_[1];
  my $get_wrapper = $_[2] || sub ($) { return $_[0] };

  ## ISSUE: Should {confident} be true?

  my $nt = $node->node_type;
  if ($nt == 9) {
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## NOTE: A snapshot of the child list is taken before removing.
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }

    ## Step 3, 4, 5 # MUST
    $class->parse_char_string ($$s => $node, $onerror, $get_wrapper);
  } elsif ($nt == 1) {
    ## TODO: If non-html element

    ## NOTE: Most of this code is copied from |parse_string|

    ## TODO: Support for $get_wrapper

    ## Step 1 # MUST
    ## NOTE: The string is parsed into a temporary document by a
    ## temporary parser |$p|; the result is moved to |$node| later.
    my $this_doc = $node->owner_document;
    my $doc = $this_doc->implementation->create_document;
    $doc->manakai_is_html (1);
    my $p = $class->new;
    $p->{document} = $doc;

    ## Step 8 # MUST
    my $i = 0; # offset of the next character in |$$s|
    $p->{line_prev} = $p->{line} = 1;
    $p->{column_prev} = $p->{column} = 0;
    ## NOTE: This closure refers to |$p| and is stored in |$p|, i.e. it
    ## forms a reference cycle, which is broken at the end of this
    ## branch.
    $p->{set_next_char} = sub {
      my $self = shift;

      pop @{$self->{prev_char}};
      unshift @{$self->{prev_char}}, $self->{next_char};

      if ($i >= length $$s) {
        ## End of the input
        $self->{next_char} = -1;
        return;
      }
      $self->{next_char} = ord substr $$s, $i++, 1;

      ($p->{line_prev}, $p->{column_prev}) = ($p->{line}, $p->{column});
      $p->{column}++;

      if ($self->{next_char} == 0x000A) { # LF
        $p->{line}++;
        $p->{column} = 0;
        !!!cp ('i1');
      } elsif ($self->{next_char} == 0x000D) { # CR
        ## CR and CR LF are both reported as a single LF.
        $i++ if substr ($$s, $i, 1) eq "\x0A";
        $self->{next_char} = 0x000A; # LF # MUST
        $p->{line}++;
        $p->{column} = 0;
        !!!cp ('i2');
      } elsif ($self->{next_char} > 0x10FFFF) {
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
        !!!cp ('i3');
      } elsif ($self->{next_char} == 0x0000) { # NULL
        !!!cp ('i4');
        !!!parse-error (type => 'NULL');
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
      } elsif ($self->{next_char} <= 0x0008 or
               (0x000E <= $self->{next_char} and
                $self->{next_char} <= 0x001F) or
               (0x007F <= $self->{next_char} and
                $self->{next_char} <= 0x009F) or
               (0xD800 <= $self->{next_char} and
                $self->{next_char} <= 0xDFFF) or
               (0xFDD0 <= $self->{next_char} and
                $self->{next_char} <= 0xFDDF) or
               {
                0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
                0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
                0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
                0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
                0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
                0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
                0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
                0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
                0x10FFFE => 1, 0x10FFFF => 1,
               }->{$self->{next_char}}) {
        ## The character is reported but left in the input as is.
        !!!cp ('i4.1');
        if ($self->{next_char} < 0x10000) {
          !!!parse-error (type => 'control char',
                          text => (sprintf 'U+%04X', $self->{next_char}));
        } else {
          !!!parse-error (type => 'control char',
                          text => (sprintf 'U-%08X', $self->{next_char}));
        }
      }
    };
    $p->{prev_char} = [-1, -1, -1];
    $p->{next_char} = -1;

    ## Default error handler: the position of the token, if any, is
    ## preferred to the current position of the input stream.
    my $ponerror = $onerror || sub {
      my (%opt) = @_;
      my $line = $opt{line};
      my $column = $opt{column};
      if (defined $opt{token} and defined $opt{token}->{line}) {
        $line = $opt{token}->{line};
        $column = $opt{token}->{column};
      }
      warn "Parse error ($opt{type}) at line $line column $column\n";
    };
    ## NOTE: This closure also forms a reference cycle through |$p|.
    $p->{parse_error} = sub {
      $ponerror->(line => $p->{line}, column => $p->{column}, @_);
    };

    $p->_initialize_tokenizer;
    $p->_initialize_tree_constructor;

    ## Step 2
    my $node_ln = $node->manakai_local_name;
    $p->{content_model} = {
      title => RCDATA_CONTENT_MODEL,
      textarea => RCDATA_CONTENT_MODEL,
      style => CDATA_CONTENT_MODEL,
      script => CDATA_CONTENT_MODEL,
      xmp => CDATA_CONTENT_MODEL,
      iframe => CDATA_CONTENT_MODEL,
      noembed => CDATA_CONTENT_MODEL,
      noframes => CDATA_CONTENT_MODEL,
      noscript => CDATA_CONTENT_MODEL,
      plaintext => PLAINTEXT_CONTENT_MODEL,
    }->{$node_ln};
    $p->{content_model} = PCDATA_CONTENT_MODEL
        unless defined $p->{content_model};
    ## ISSUE: What is "the name of the element"? local name?

    $p->{inner_html_node} = [$node, $el_category->{$node_ln}];
    ## TODO: Foreign element OK?

    ## Step 3
    my $root = $doc->create_element_ns ($HTML_NS, [undef, 'html']);

    ## Step 4 # MUST
    $doc->append_child ($root);

    ## Step 5 # MUST
    push @{$p->{open_elements}}, [$root, $el_category->{html}];

    undef $p->{head_element};

    ## Step 6 # MUST
    $p->_reset_insertion_mode;

    ## Step 7 # MUST
    ## Find the nearest HTML |form| element, starting from |$node|
    ## itself and going up through its ancestors.
    my $anode = $node;
    AN: while (defined $anode) {
      if ($anode->node_type == 1) {
        my $nsuri = $anode->namespace_uri;
        if (defined $nsuri and $nsuri eq $HTML_NS) {
          if ($anode->manakai_local_name eq 'form') {
            !!!cp ('i5');
            $p->{form_element} = $anode;
            last AN;
          }
        }
      }
      $anode = $anode->parent_node;
    } # AN

    ## Step 9 # MUST
    {
      ## NOTE: The macro below refers to |$self|.
      my $self = $p;
      !!!next-token;
    }
    $p->_tree_construction_main;

    ## Step 10 # MUST
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }
    ## ISSUE: mutation events? read-only?

    ## Step 11 # MUST
    @cn = @{$root->child_nodes};
    for (@cn) {
      $this_doc->adopt_node ($_);
      $node->append_child ($_);
    }
    ## ISSUE: mutation events?

    $p->_terminate_tree_constructor;

    ## Break the reference cycles between |$p| and the closures stored
    ## in it, such that the temporary parser can be freed.
    delete $p->{parse_error}; # delete loop
    delete $p->{set_next_char}; # delete loop
  } else {
    die "$0: |set_inner_html| is not defined for node of type $nt";
  }
} # set_inner_html
7714
7715 } # tree construction stage
7716
## |Whatpm::HTML::RestartParser| is declared as a subclass of |Error|.
## NOTE(review): Presumably thrown to make the parser restart; confirm
## against the code that throws it.
package Whatpm::HTML::RestartParser;
our @ISA;
push @ISA, 'Error';

1;
7721 # $Date: 2008/09/13 04:19:56 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24