/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.142 - (show annotations) (download) (as text)
Sat May 24 10:32:29 2008 UTC (17 years, 8 months ago) by wakaba
Branch: MAIN
Changes since 1.141: +2 -3 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	24 May 2008 10:29:47 -0000
	* tokenizer-test-2.dat: New test data on truncated doctypes (c.f.
	HTML5 revision 1685).

2008-05-24  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	24 May 2008 10:30:11 -0000
	* HTML.pm.src: Reduce the number of errors in truncated doctypes (HTML5
	revision 1685).

2008-05-24  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.141 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 use Error qw(:try);
5
6 ## ISSUE:
7 ## var doc = implementation.createDocument (null, null, null);
8 ## doc.write ('');
9 ## alert (doc.compatMode);
10
11 ## TODO: 1252 parse error (revision 1264)
12 ## TODO: 8859-11 = 874 (revision 1271)
13
14 require IO::Handle;
15
## Namespace URIs of the vocabularies referenced by this module:
## HTML, MathML, SVG, XLink, XML, and XMLNS, in that order.
my ($HTML_NS, $MML_NS, $SVG_NS, $XLINK_NS, $XML_NS, $XMLNS_NS) = (
  'http://www.w3.org/1999/xhtml',
  'http://www.w3.org/1998/Math/MathML',
  'http://www.w3.org/2000/svg',
  'http://www.w3.org/1999/xlink',
  'http://www.w3.org/XML/1998/namespace',
  'http://www.w3.org/2000/xmlns/',
);
22
## Primitive element category flags.  Each constant occupies exactly one
## bit so that categories can be combined with |\|| and tested with |&|.
sub A_EL ()                    { 1 <<  0 }
sub ADDRESS_EL ()              { 1 <<  1 }
sub BODY_EL ()                 { 1 <<  2 }
sub BUTTON_EL ()               { 1 <<  3 }
sub CAPTION_EL ()              { 1 <<  4 }
sub DD_EL ()                   { 1 <<  5 }
sub DIV_EL ()                  { 1 <<  6 }
sub DT_EL ()                   { 1 <<  7 }
sub FORM_EL ()                 { 1 <<  8 }
sub FORMATTING_EL ()           { 1 <<  9 }
sub FRAMESET_EL ()             { 1 << 10 }
sub HEADING_EL ()              { 1 << 11 }
sub HTML_EL ()                 { 1 << 12 }
sub LI_EL ()                   { 1 << 13 }
sub NOBR_EL ()                 { 1 << 14 }
sub OPTION_EL ()               { 1 << 15 }
sub OPTGROUP_EL ()             { 1 << 16 }
sub P_EL ()                    { 1 << 17 }
sub SELECT_EL ()               { 1 << 18 }
sub TABLE_EL ()                { 1 << 19 }
sub TABLE_CELL_EL ()           { 1 << 20 }
sub TABLE_ROW_EL ()            { 1 << 21 }
sub TABLE_ROW_GROUP_EL ()      { 1 << 22 }
sub MISC_SCOPING_EL ()         { 1 << 23 }
sub MISC_SPECIAL_EL ()         { 1 << 24 }
sub FOREIGN_EL ()              { 1 << 25 }
sub FOREIGN_FLOW_CONTENT_EL () { 1 << 26 }
sub MML_AXML_EL ()             { 1 << 27 }
51
## Composite element category masks, each the bitwise OR of primitive
## category flags (and, in some cases, of another composite mask).

## |table|, |tr|, and the row group elements.
sub TABLE_ROWS_EL () { TABLE_ROW_GROUP_EL | TABLE_ROW_EL | TABLE_EL }

## |dd|, |dt|, |li|, and |p|.
sub END_TAG_OPTIONAL_EL () { P_EL | LI_EL | DT_EL | DD_EL }

## END_TAG_OPTIONAL_EL plus |body|, |html|, and the table cell, row, and
## row group elements.
sub ALL_END_TAG_OPTIONAL_EL () {
  TABLE_ROW_GROUP_EL | TABLE_ROW_EL | TABLE_CELL_EL |
  HTML_EL | BODY_EL | END_TAG_OPTIONAL_EL
}

sub SCOPING_EL () {
  MISC_SCOPING_EL | TABLE_CELL_EL | TABLE_EL |
  HTML_EL | CAPTION_EL | BUTTON_EL
}

sub TABLE_SCOPING_EL () { TABLE_EL | HTML_EL }

sub TABLE_ROWS_SCOPING_EL () { TABLE_ROW_GROUP_EL | HTML_EL }

sub TABLE_ROW_SCOPING_EL () { TABLE_ROW_EL | HTML_EL }

sub SPECIAL_EL () {
  MISC_SPECIAL_EL | TABLE_ROW_GROUP_EL | TABLE_ROW_EL |
  SELECT_EL | OPTGROUP_EL | OPTION_EL |
  HEADING_EL | FRAMESET_EL | FORM_EL |
  END_TAG_OPTIONAL_EL | DIV_EL | BODY_EL | ADDRESS_EL
}
113
## Maps an HTML element local name to its element category flags.
## Elements sharing one category are listed together; elements with a
## category of their own are listed individually.
my $el_category = {
  (map { ($_ => FORMATTING_EL) } qw(
    b big em font i s small strike strong tt u
  )),
  (map { ($_ => HEADING_EL) } qw(h1 h2 h3 h4 h5 h6)),
  (map { ($_ => MISC_SCOPING_EL) } qw(applet marquee object)),
  (map { ($_ => MISC_SPECIAL_EL) } qw(
    area base basefont bgsound blockquote br center col colgroup dir dl
    embed fieldset frame head hr iframe img input isindex link listing
    menu meta noembed noframes noscript ol param plaintext pre script
    spacer style textarea title ul wbr
  )),
  (map { ($_ => TABLE_ROW_GROUP_EL) } qw(tbody tfoot thead)),
  (map { ($_ => TABLE_CELL_EL) } qw(td th)),

  ## Formatting elements that are additionally tracked on their own.
  a => A_EL | FORMATTING_EL,
  nobr => NOBR_EL | FORMATTING_EL,

  address => ADDRESS_EL,
  body => BODY_EL,
  button => BUTTON_EL,
  caption => CAPTION_EL,
  dd => DD_EL,
  div => DIV_EL,
  dt => DT_EL,
  form => FORM_EL,
  frameset => FRAMESET_EL,
  html => HTML_EL,
  li => LI_EL,
  optgroup => OPTGROUP_EL,
  option => OPTION_EL,
  p => P_EL,
  select => SELECT_EL,
  table => TABLE_EL,
  tr => TABLE_ROW_EL,
};
198
## Maps a foreign (MathML or SVG) namespace URI and element local name to
## element category flags.
## NOTE: In addition, FOREIGN_EL is set to non-HTML elements.
my $el_category_f = {
  $MML_NS => {
    'annotation-xml' => MML_AXML_EL,
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(mi mo mn ms mtext)),
  },
  $SVG_NS => {
    map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(foreignObject desc title)
  },
};
215
## Maps an all-lowercase SVG attribute name to its mixed-case form.
## Only the mixed-case names are listed; each key is the lowercased name.
my $svg_attr_name = {
  map { (lc ($_) => $_) } qw(
    attributeType baseFrequency baseProfile calcMode clipPathUnits
    contentScriptType contentStyleType diffuseConstant edgeMode
    externalResourcesRequired feColorMatrix feComposite feGaussianBlur
    feMorphology feTile filterRes filterUnits glyphRef gradientTransform
    gradientUnits kernelMatrix kernelUnitLength keyPoints keySplines
    keyTimes lengthAdjust limitingConeAngle markerHeight markerUnits
    markerWidth maskContentUnits maskUnits numOctaves pathLength
    patternContentUnits patternTransform patternUnits pointsAtX pointsAtY
    pointsAtZ preserveAlpha preserveAspectRatio primitiveUnits refX refY
    repeatCount repeatDur requiredExtensions specularConstant
    specularExponent spreadMethod startOffset stdDeviation stitchTiles
    surfaceScale systemLanguage tableValues targetX targetY textLength
    viewBox viewTarget xChannelSelector yChannelSelector zoomAndPan
  )
};
283
## Maps a qualified attribute name found on a foreign element to
## |[namespace URI, [prefix, local name]]|.
my $foreign_attr_xname = {
  'xmlns' => [$XMLNS_NS, [undef, 'xmlns']],
  'xmlns:xlink' => [$XMLNS_NS, ['xmlns', 'xlink']],
};
for my $local_name (qw(actuate arcrole href role show title type)) {
  $foreign_attr_xname->{"xlink:$local_name"}
      = [$XLINK_NS, ['xlink', $local_name]];
}
for my $local_name (qw(base lang space)) {
  $foreign_attr_xname->{"xml:$local_name"}
      = [$XML_NS, ['xml', $local_name]];
}
298
299 ## ISSUE: xmlns:xlink="non-xlink-ns" is not an error.
300
## Maps a code point in the range 0x80..0x9F to the code point that
## replaces it.  The values are listed in key order, eight per row, so
## the first row is for 0x80..0x87, the second for 0x88..0x8F, and so on.
my $c1_entity_char = do {
  my %replacement;
  @replacement{0x80 .. 0x9F} = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178,
  );
  \%replacement;
}; # $c1_entity_char
335
## Parses a string of bytes.
##
## Arguments (after the invocant): the character encoding name (or
## |undef|), the byte string or a reference to it, and then the arguments
## that are handed through to |parse_byte_stream| unchanged (the document
## and an optional error handler).
##
## The string is wrapped in an in-memory read handle, which is passed to
## |parse_byte_stream|; the value returned by that method is returned.
##
## Dies (via |Carp::croak|) if the in-memory handle cannot be opened.
## Previously the failure was ignored and an unopened handle was parsed
## as if the input were empty.
sub parse_byte_string ($$$$;$) {
  my $self = shift;
  my $charset_name = shift;
  my $bytes_ref = ref $_[0] ? $_[0] : \($_[0]);
  open my $input, '<', $bytes_ref or do {
    require Carp;
    Carp::croak
        ("parse_byte_string: cannot read the input as a byte string: $!");
  };
  return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]);
} # parse_byte_string
342
## Parses a stream of bytes, determining its character encoding first.
##
## May be invoked on an instance or on the class; in the latter case a
## new parser is created with |new|.
##
## Arguments (after the invocant):
##   - the character encoding name known from the context, or |undef|;
##   - the byte stream, read with |getc|;
##   - the document, handed on to |parse_char_stream|;
##   - optionally, an error handler invoked with a key/value list
##     (defaults to a handler that |warn|s with the error type).
##
## Returns whatever |parse_char_stream| returns.
##
## The |!!!parse-error| lines are preprocessor macros; presumably they
## expand to a call of |$self->{parse_error}| - confirm against the
## macro definitions.
343 sub parse_byte_stream ($$$$;$) {
344 my $self = ref $_[0] ? shift : shift->new;
345 my $charset_name = shift;
346 my $byte_stream = $_[0];
347
348 my $onerror = $_[2] || sub {
349 my (%opt) = @_;
350 warn "Parse error ($opt{type})\n";
351 };
352 $self->{parse_error} = $onerror; # updated later by parse_char_stream
353
354 ## HTML5 encoding sniffing algorithm
355 require Message::Charset::Info;
356 my $charset;
357 my $buffer;
358 my ($char_stream, $e_status);
359
## Each successful step leaves the block with |last SNIFFING|, having set
## |$charset|, |$char_stream|, |$e_status|, and |$self->{confident}|.
360 SNIFFING: {
361
362 ## Step 1
## An encoding name supplied by the caller is used if a decode handle
## can be obtained for it.
363 if (defined $charset_name) {
364 $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
365
366 ## ISSUE: Unsupported encoding is not ignored according to the spec.
367 ($char_stream, $e_status) = $charset->get_decode_handle
368 ($byte_stream, allow_error_reporting => 1,
369 allow_fallback => 1);
370 if ($char_stream) {
371 $self->{confident} = 1;
372 last SNIFFING;
373 } else {
374 ## TODO: unsupported error
375 }
376 }
377
378 ## Step 2
## Read up to 1024 bytes, stopping early at the end of the stream.
379 my $byte_buffer = '';
380 for (1..1024) {
381 my $char = $byte_stream->getc;
382 last unless defined $char;
383 $byte_buffer .= $char;
384 } ## TODO: timeout
385
386 ## Step 3
## A byte order mark at the start of the buffered bytes selects UTF-16BE,
## UTF-16LE, or UTF-8 respectively.
387 if ($byte_buffer =~ /^\xFE\xFF/) {
388 $charset = Message::Charset::Info->get_by_iana_name ('utf-16be');
389 ($char_stream, $e_status) = $charset->get_decode_handle
390 ($byte_stream, allow_error_reporting => 1,
391 allow_fallback => 1, byte_buffer => \$byte_buffer);
392 $self->{confident} = 1;
393 last SNIFFING;
394 } elsif ($byte_buffer =~ /^\xFF\xFE/) {
395 $charset = Message::Charset::Info->get_by_iana_name ('utf-16le');
396 ($char_stream, $e_status) = $charset->get_decode_handle
397 ($byte_stream, allow_error_reporting => 1,
398 allow_fallback => 1, byte_buffer => \$byte_buffer);
399 $self->{confident} = 1;
400 last SNIFFING;
401 } elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) {
402 $charset = Message::Charset::Info->get_by_iana_name ('utf-8');
403 ($char_stream, $e_status) = $charset->get_decode_handle
404 ($byte_stream, allow_error_reporting => 1,
405 allow_fallback => 1, byte_buffer => \$byte_buffer);
406 $self->{confident} = 1;
407 last SNIFFING;
408 }
409
410 ## Step 4
411 ## TODO: <meta charset>
412
413 ## Step 5
414 ## TODO: from history
415
416 ## Step 6
## Let the detector guess from the buffered bytes.  A guess is
## tentative: |confident| is set to 0 and the guess is reported.
417 require Whatpm::Charset::UniversalCharDet;
418 $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string
419 ($byte_buffer);
420 if (defined $charset_name) {
421 $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
422
423 ## ISSUE: Unsupported encoding is not ignored according to the spec.
424 require Whatpm::Charset::DecodeHandle;
425 $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
426 ($byte_stream);
427 ($char_stream, $e_status) = $charset->get_decode_handle
428 ($buffer, allow_error_reporting => 1,
429 allow_fallback => 1, byte_buffer => \$byte_buffer);
430 if ($char_stream) {
431 $buffer->{buffer} = $byte_buffer;
432 !!!parse-error (type => 'sniffing:chardet', ## TODO: type name
433 value => $charset_name,
434 level => $self->{info_level},
435 line => 1, column => 1);
436 $self->{confident} = 0;
437 last SNIFFING;
438 }
439 }
440
441 ## Step 7: default
442 ## TODO: Make this configurable.
443 $charset = Message::Charset::Info->get_by_iana_name ('windows-1252');
444 ## NOTE: We choose |windows-1252| here, since |utf-8| should be
445 ## detectable in the step 6.
446 require Whatpm::Charset::DecodeHandle;
447 $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
448 ($byte_stream);
449 ($char_stream, $e_status)
450 = $charset->get_decode_handle ($buffer,
451 allow_error_reporting => 1,
452 allow_fallback => 1,
453 byte_buffer => \$byte_buffer);
454 $buffer->{buffer} = $byte_buffer;
455 !!!parse-error (type => 'sniffing:default', ## TODO: type name
456 value => 'windows-1252',
457 level => $self->{info_level},
458 line => 1, column => 1);
459 $self->{confident} = 0;
460 } # SNIFFING
461
## Record the chosen encoding and report how the decoder was obtained,
## based on the status flags returned with the decode handle.
462 $self->{input_encoding} = $charset->get_iana_name;
463 if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
464 !!!parse-error (type => 'chardecode:fallback', ## TODO: type name
465 value => $self->{input_encoding},
466 level => $self->{unsupported_level},
467 line => 1, column => 1);
468 } elsif (not ($e_status &
469 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
470 !!!parse-error (type => 'chardecode:no error', ## TODO: type name
471 value => $self->{input_encoding},
472 level => $self->{unsupported_level},
473 line => 1, column => 1);
474 }
475
## Callback taking the parser, an encoding name, and a token.  It
## replaces |$charset|, |$char_stream|, and |$e_status| of the enclosing
## scope; if the new encoding is supported and differs from the current
## one, it throws |Whatpm::HTML::RestartParser|, which is caught below.
476 $self->{change_encoding} = sub {
477 my $self = shift;
478 $charset_name = shift;
479 my $token = shift;
480
481 $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
482 ($char_stream, $e_status) = $charset->get_decode_handle
483 ($byte_stream, allow_error_reporting => 1, allow_fallback => 1,
484 byte_buffer => \ $buffer->{buffer});
485
486 if ($char_stream) { # if supported
487 ## "Change the encoding" algorithm:
488
489 ## Step 1
490 if ($charset->{iana_names}->{'utf-16'}) { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
491 $charset = Message::Charset::Info->get_by_iana_name ('utf-8');
492 ($char_stream, $e_status) = $charset->get_decode_handle
493 ($byte_stream,
494 byte_buffer => \ $buffer->{buffer});
495 }
496 $charset_name = $charset->get_iana_name;
497
498 ## Step 2
499 if (defined $self->{input_encoding} and
500 $self->{input_encoding} eq $charset_name) {
501 !!!parse-error (type => 'charset label:matching', ## TODO: type
502 value => $charset_name,
503 level => $self->{info_level});
504 $self->{confident} = 1;
505 return;
506 }
507
508 !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
509 ':'.$charset_name, level => 'w', token => $token);
510
511 ## Step 3
512 # if (can) {
513 ## change the encoding on the fly.
514 #$self->{confident} = 1;
515 #return;
516 # }
517
518 ## Step 4
519 throw Whatpm::HTML::RestartParser ();
520 }
521 }; # $self->{change_encoding}
522
## Handler installed on the decode handle: reports the error at the
## current position and, if a reference to the offending octets is
## given, substitutes U+FFFD for them.
523 my $char_onerror = sub {
524 my (undef, $type, %opt) = @_;
525 !!!parse-error (%opt, type => $type,
526 line => $self->{line}, column => $self->{column} + 1);
527 if ($opt{octets}) {
528 ${$opt{octets}} = "\x{FFFD}"; # replacement character
529 }
530 };
531 $char_stream->onerror ($char_onerror);
532
## |@args| is what remains of |@_| after the byte stream is dropped.
533 my @args = @_; shift @args; # $s
534 my $return;
535 try {
536 $return = $self->parse_char_stream ($char_stream, @args);
537 } catch Whatpm::HTML::RestartParser with {
538 ## NOTE: Invoked after {change_encoding}.
539
## |$charset|, |$char_stream|, and |$e_status| now hold the values
## assigned by the |change_encoding| callback; parse again with them.
540 $self->{input_encoding} = $charset->get_iana_name;
541 if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
542 !!!parse-error (type => 'chardecode:fallback', ## TODO: type name
543 value => $self->{input_encoding},
544 level => $self->{unsupported_level},
545 line => 1, column => 1);
546 } elsif (not ($e_status &
547 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
548 !!!parse-error (type => 'chardecode:no error', ## TODO: type name
549 value => $self->{input_encoding},
550 level => $self->{unsupported_level},
551 line => 1, column => 1);
552 }
553 $self->{confident} = 1;
554 $char_stream->onerror ($char_onerror);
555 $return = $self->parse_char_stream ($char_stream, @args);
556 };
557 return $return;
558 } # parse_byte_stream
559
560 ## NOTE: The HTML5 spec says that the encoding layer MUST NOT strip the BOM
561 ## and that the HTML layer MUST ignore it. However, we do strip the BOM in
562 ## the encoding layer, and the HTML layer does not ignore any U+FEFF,
563 ## because the core part of our HTML parser expects a string of characters,
564 ## not a string of bytes or code units or anything which might contain a BOM.
565 ## Therefore, any parser interface that accepts a string of bytes,
566 ## such as |parse_byte_string| in this module, must ensure that it does
567 ## strip the BOM and never strips any ZWNBSP.
568
## Parses a string of characters.
##
## Arguments (after the invocant): the character string or a reference
## to it, and then the arguments that are handed through to
## |parse_char_stream| unchanged (the document and an optional error
## handler).  Returns whatever |parse_char_stream| returns.
##
## The string is read through an in-memory handle.  Such a handle works
## on bytes, and perl refuses to open one on a string that contains a
## code point above 0xFF, so a copy of the string is encoded in UTF-8
## and decoded again by the |:utf8| layer.  The characters read are
## therefore those of the original string whether or not it is
## internally UTF-8 flagged.  The lax |:utf8| layer (rather than a strict
## decoder) is kept so that surrogates and noncharacters still reach
## the parser.
##
## Dies (via |Carp::croak|) if the in-memory handle cannot be opened.
sub parse_char_string ($$$;$) {
  my $self = shift;
  require utf8;
  my $s = ref $_[0] ? $_[0] : \($_[0]);
  my $octets = defined $$s ? $$s : '';
  utf8::encode ($octets);
  open my $input, '<:utf8', \$octets or do {
    require Carp;
    Carp::croak
        ("parse_char_string: cannot read the input as a character string: $!");
  };
  return $self->parse_char_stream ($input, @_[1..$#_]);
} # parse_char_string
*parse_string = \&parse_char_string;
577
## Parses a stream of characters into a document.
##
## May be invoked on an instance or on the class; in the latter case a
## new parser is created with |new|.
##
## Arguments (after the invocant):
##   - the character stream, read one character at a time with |getc|;
##   - the document, whose child node list is emptied before parsing;
##   - optionally, an error handler invoked with a key/value list
##     (defaults to a handler that |warn|s with type, line, and column).
##
## Returns the document.
##
## The |!!!cp| and |!!!parse-error| lines are preprocessor macros;
## presumably |!!!parse-error| expands to a call of
## |$self->{parse_error}| - confirm against the macro definitions.
578 sub parse_char_stream ($$$;$) {
579 my $self = ref $_[0] ? shift : shift->new;
580 my $input = $_[0];
581 $self->{document} = $_[1];
582 @{$self->{document}->child_nodes} = ();
583
584 ## NOTE: |set_inner_html| copies most of this method's code
585
586 $self->{confident} = 1 unless exists $self->{confident};
587 $self->{document}->input_encoding ($self->{input_encoding})
588 if defined $self->{input_encoding};
589
## NOTE(review): |$i| is not referenced again within this method.
590 my $i = 0;
591 $self->{line_prev} = $self->{line} = 1;
592 $self->{column_prev} = $self->{column} = 0;
## Callback that advances the input by one character.  It takes the
## parser as its argument and stores the code point of the next character
## in |{next_char}| (-1 at the end of the input), maintaining
## |{prev_char}| and the line and column counters.
593 $self->{set_next_char} = sub {
594 my $self = shift;
595
## |{prev_char}| holds the three previous code points, latest first.
596 pop @{$self->{prev_char}};
597 unshift @{$self->{prev_char}}, $self->{next_char};
598
## Use the character saved while handling a CR, if there is one.
599 my $char;
600 if (defined $self->{next_next_char}) {
601 $char = $self->{next_next_char};
602 delete $self->{next_next_char};
603 } else {
604 $char = $input->getc;
605 }
## End of input: -1 is true, so the |return| is always executed here.
606 $self->{next_char} = -1 and return unless defined $char;
607 $self->{next_char} = ord $char;
608
609 ($self->{line_prev}, $self->{column_prev})
610 = ($self->{line}, $self->{column});
611 $self->{column}++;
612
613 if ($self->{next_char} == 0x000A) { # LF
614 !!!cp ('j1');
615 $self->{line}++;
616 $self->{column} = 0;
617 } elsif ($self->{next_char} == 0x000D) { # CR
618 !!!cp ('j2');
## CR and CR LF are both reported as a single LF.  A character other
## than LF following the CR is saved for the next call.
619 my $next = $input->getc;
620 if (defined $next and $next ne "\x0A") {
621 $self->{next_next_char} = $next;
622 }
623 $self->{next_char} = 0x000A; # LF # MUST
624 $self->{line}++;
625 $self->{column} = 0;
626 } elsif ($self->{next_char} > 0x10FFFF) {
627 !!!cp ('j3');
628 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
629 } elsif ($self->{next_char} == 0x0000) { # NULL
630 !!!cp ('j4');
631 !!!parse-error (type => 'NULL');
632 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
## The remaining characters listed here are reported as errors but are
## passed on unchanged.
633 } elsif ($self->{next_char} <= 0x0008 or
634 (0x000E <= $self->{next_char} and $self->{next_char} <= 0x001F) or
635 (0x007F <= $self->{next_char} and $self->{next_char} <= 0x009F) or
636 (0xD800 <= $self->{next_char} and $self->{next_char} <= 0xDFFF) or
637 (0xFDD0 <= $self->{next_char} and $self->{next_char} <= 0xFDDF) or
638 {
639 0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
640 0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
641 0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
642 0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
643 0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
644 0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
645 0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
646 0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
647 0x10FFFE => 1, 0x10FFFF => 1,
648 }->{$self->{next_char}}) {
649 !!!cp ('j5');
650 !!!parse-error (type => 'control char', level => $self->{must_level});
651 ## TODO: error type documentation
652 }
653 };
654 $self->{prev_char} = [-1, -1, -1];
655 $self->{next_char} = -1;
656
657 my $onerror = $_[2] || sub {
658 my (%opt) = @_;
659 my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
660 my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
661 warn "Parse error ($opt{type}) at line $line column $column\n";
662 };
## The current line and column come first in the list, so values for the
## same keys supplied by the reporter come later in the list.
663 $self->{parse_error} = sub {
664 $onerror->(line => $self->{line}, column => $self->{column}, @_);
665 };
666
667 $self->_initialize_tokenizer;
668 $self->_initialize_tree_constructor;
669 $self->_construct_tree;
670 $self->_terminate_tree_constructor;
671
## The handler above refers to |$self|, which in turn holds the handler.
672 delete $self->{parse_error}; # remove loop
673
674 return $self->{document};
675 } # parse_char_stream
676
## Creates a new parser object.
##
## The object is a hash blessed into the invocant class.  It holds the
## error level codes (|must_level|, |should_level|, |good_level|,
## |warn_level|, |info_level|, |unsupported_level|) and four default
## callbacks:
##   - |set_next_char| sets |{next_char}| of this object to -1;
##   - |parse_error|, |change_encoding|, and
##     |application_cache_selection| do nothing.
##
## The |set_next_char| callback refers to the object through a weakened
## reference.  A strong reference would make the object refer to itself
## via the callback, so that it would never be freed.
sub new ($) {
  my $class = shift;
  my $self = bless {
    must_level => 'm',
    should_level => 's',
    good_level => 'w',
    warn_level => 'w',
    info_level => 'i',
    unsupported_level => 'u',
  }, $class;

  require Scalar::Util;
  my $weak_self = $self;
  Scalar::Util::weaken ($weak_self);
  $self->{set_next_char} = sub {
    ## The guard prevents a fresh hash from being autovivified if the
    ## callback outlives the parser.
    $weak_self->{next_char} = -1 if $weak_self;
  };
  $self->{parse_error} = sub {
    #
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new
704
## Content model feature flags.
sub CM_ENTITY ()         { 1 << 0 } # & markup in data
sub CM_LIMITED_MARKUP () { 1 << 1 } # < markup in data (limited)
sub CM_FULL_MARKUP ()    { 1 << 2 } # < markup in data (any)

## Content models, expressed as combinations of the flags above.
sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL ()     { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL ()    { CM_LIMITED_MARKUP | CM_ENTITY }
sub PCDATA_CONTENT_MODEL ()    { CM_FULL_MARKUP | CM_ENTITY }
713
## Tokenizer states, the values taken by |$self->{state}|.  The numbers
## are arbitrary but distinct.

## Data and tags
sub DATA_STATE ()                          {  0 }
sub ENTITY_DATA_STATE ()                   {  1 }
sub TAG_OPEN_STATE ()                      {  2 }
sub CLOSE_TAG_OPEN_STATE ()                {  3 }
sub TAG_NAME_STATE ()                      {  4 }

## Attributes
sub BEFORE_ATTRIBUTE_NAME_STATE ()         {  5 }
sub ATTRIBUTE_NAME_STATE ()                {  6 }
sub AFTER_ATTRIBUTE_NAME_STATE ()          {  7 }
sub BEFORE_ATTRIBUTE_VALUE_STATE ()        {  8 }
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () {  9 }
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
sub ATTRIBUTE_VALUE_UNQUOTED_STATE ()      { 11 }
sub ENTITY_IN_ATTRIBUTE_VALUE_STATE ()     { 12 }

## Markup declarations and comments
sub MARKUP_DECLARATION_OPEN_STATE ()       { 13 }
sub COMMENT_START_STATE ()                 { 14 }
sub COMMENT_START_DASH_STATE ()            { 15 }
sub COMMENT_STATE ()                       { 16 }
sub COMMENT_END_STATE ()                   { 17 }
sub COMMENT_END_DASH_STATE ()              { 18 }
sub BOGUS_COMMENT_STATE ()                 { 19 }

## DOCTYPE
sub DOCTYPE_STATE ()                                 { 20 }
sub BEFORE_DOCTYPE_NAME_STATE ()                     { 21 }
sub DOCTYPE_NAME_STATE ()                            { 22 }
sub AFTER_DOCTYPE_NAME_STATE ()                      { 23 }
sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE ()        { 24 }
sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE ()         { 27 }
sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE ()        { 28 }
sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE ()         { 31 }
sub BOGUS_DOCTYPE_STATE ()                           { 32 }

## Others
sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE ()  { 33 }
sub SELF_CLOSING_START_TAG_STATE ()        { 34 }
sub CDATA_BLOCK_STATE ()                   { 35 }
750
## Token types, the values taken by |->{type}| of a token.
sub DOCTYPE_TOKEN ()     { 1 }
sub COMMENT_TOKEN ()     { 2 }
sub START_TAG_TOKEN ()   { 3 }
sub END_TAG_TOKEN ()     { 4 }
sub END_OF_FILE_TOKEN () { 5 }
sub CHARACTER_TOKEN ()   { 6 }
757
## Insertion mode group flags.  Each group occupies one bit; the two
## lowest bits distinguish the insertion modes within a group.
sub AFTER_HTML_IMS ()        { 1 <<  2 }
sub HEAD_IMS ()              { 1 <<  3 }
sub BODY_IMS ()              { 1 <<  4 }
sub BODY_TABLE_IMS ()        { 1 <<  5 }
sub TABLE_IMS ()             { 1 <<  6 }
sub ROW_IMS ()               { 1 <<  7 }
sub BODY_AFTER_IMS ()        { 1 <<  8 }
sub FRAME_IMS ()             { 1 <<  9 }
sub SELECT_IMS ()            { 1 << 10 }
sub IN_FOREIGN_CONTENT_IM () { 1 << 11 }
  ## NOTE: "in foreign content" insertion mode is special; it is combined
  ## with the secondary insertion mode.  In this parser, they are stored
  ## together in the bit-or'ed form.

## NOTE: "initial" and "before html" insertion modes have no constants.

## NOTE: "after after body" insertion mode.
sub AFTER_HTML_BODY_IM ()     { BODY_AFTER_IMS | AFTER_HTML_IMS }

## NOTE: "after after frameset" insertion mode.
sub AFTER_HTML_FRAMESET_IM () { FRAME_IMS | AFTER_HTML_IMS }

## Insertion modes.
sub IN_HEAD_IM ()             { HEAD_IMS | 0 }
sub IN_HEAD_NOSCRIPT_IM ()    { HEAD_IMS | 1 }
sub AFTER_HEAD_IM ()          { HEAD_IMS | 2 }
sub BEFORE_HEAD_IM ()         { HEAD_IMS | 3 }
sub IN_BODY_IM ()             { BODY_IMS }
sub IN_CELL_IM ()             { BODY_IMS | BODY_TABLE_IMS | 1 }
sub IN_CAPTION_IM ()          { BODY_IMS | BODY_TABLE_IMS | 2 }
sub IN_ROW_IM ()              { TABLE_IMS | ROW_IMS | 1 }
sub IN_TABLE_BODY_IM ()       { TABLE_IMS | ROW_IMS | 2 }
sub IN_TABLE_IM ()            { TABLE_IMS }
sub AFTER_BODY_IM ()          { BODY_AFTER_IMS }
sub IN_FRAMESET_IM ()         { FRAME_IMS | 1 }
sub AFTER_FRAMESET_IM ()      { FRAME_IMS | 2 }
sub IN_SELECT_IM ()           { SELECT_IMS | 1 }
sub IN_SELECT_IN_TABLE_IM ()  { SELECT_IMS | 2 }
sub IN_COLUMN_GROUP_IM ()     { 2 }
796
797 ## Implementations MUST act as if state machine in the spec
798
## Resets the tokenizer: the state becomes the data state, the content
## model becomes PCDATA, the token under construction and the related
## bookkeeping values are cleared, and the first input character is read.
sub _initialize_tokenizer ($) {
  my $self = shift;
  $self->{state} = DATA_STATE; # MUST
  $self->{content_model} = PCDATA_CONTENT_MODEL; # be
  ## |current_token| is a start tag, end tag, comment, or DOCTYPE token.
  for my $member (qw(current_token current_attribute
                     last_emitted_start_tag_name
                     last_attribute_value_state)) {
    $self->{$member} = undef;
  }
  delete $self->{self_closing};
  $self->{char} = [];
  # $self->{next_char}
  !!!next-input-character;
  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
814
815 ## A token has:
816 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
817 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
818 ## ->{name} (DOCTYPE_TOKEN)
819 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
820 ## ->{public_identifier} (DOCTYPE_TOKEN)
821 ## ->{system_identifier} (DOCTYPE_TOKEN)
822 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
823 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
824 ## ->{name}
825 ## ->{value}
826 ## ->{has_reference} == 1 or 0
827 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
828 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
829 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
830 ## while the token is pushed back to the stack.
831
832 ## ISSUE: "When a DOCTYPE token is created, its
833 ## <i>self-closing flag</i> must be unset (its other state is that it
834 ## be set), and its attributes list must be empty.": Wrong subject?
835
836 ## Emitted token MUST immediately be handled by the tree construction state.
837
838 ## Before each step, UA MAY check to see if either one of the scripts in
839 ## "list of scripts that will execute as soon as possible" or the first
840 ## script in the "list of scripts that will execute asynchronously",
841 ## has completed loading. If one has, then it MUST be executed
842 ## and removed from the list.
843
844 ## NOTE: HTML5 "Writing HTML documents" section, applied to
845 ## documents and not to user agents and conformance checkers,
846 ## contains some requirements that are not detected by the
847 ## parsing algorithm:
848 ## - Some requirements on character encoding declarations. ## TODO
849 ## - "Elements MUST NOT contain content that their content model disallows."
850 ## ... Some are parse error, some are not (will be reported by c.c.).
851 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
852 ## - Text (in elements, attributes, and comments) SHOULD NOT contain
853 ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
854
855 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
856 ## be detected by the HTML5 parsing algorithm:
857 ## - Text,
858
859 sub _get_next_token ($) {
860 my $self = shift;
861
862 if ($self->{self_closing}) {
863 !!!parse-error (type => 'nestc', token => $self->{current_token});
864 ## NOTE: The |self_closing| flag is only set by start tag token.
865 ## In addition, when a start tag token is emitted, it is always set to
866 ## |current_token|.
867 delete $self->{self_closing};
868 }
869
870 if (@{$self->{token}}) {
871 $self->{self_closing} = $self->{token}->[0]->{self_closing};
872 return shift @{$self->{token}};
873 }
874
875 A: {
876 if ($self->{state} == DATA_STATE) {
877 if ($self->{next_char} == 0x0026) { # &
878 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
879 not $self->{escape}) {
880 !!!cp (1);
881 $self->{state} = ENTITY_DATA_STATE;
882 !!!next-input-character;
883 redo A;
884 } else {
885 !!!cp (2);
886 #
887 }
888 } elsif ($self->{next_char} == 0x002D) { # -
889 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
890 unless ($self->{escape}) {
891 if ($self->{prev_char}->[0] == 0x002D and # -
892 $self->{prev_char}->[1] == 0x0021 and # !
893 $self->{prev_char}->[2] == 0x003C) { # <
894 !!!cp (3);
895 $self->{escape} = 1;
896 } else {
897 !!!cp (4);
898 }
899 } else {
900 !!!cp (5);
901 }
902 }
903
904 #
905 } elsif ($self->{next_char} == 0x003C) { # <
906 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
907 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
908 not $self->{escape})) {
909 !!!cp (6);
910 $self->{state} = TAG_OPEN_STATE;
911 !!!next-input-character;
912 redo A;
913 } else {
914 !!!cp (7);
915 #
916 }
917 } elsif ($self->{next_char} == 0x003E) { # >
918 if ($self->{escape} and
919 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
920 if ($self->{prev_char}->[0] == 0x002D and # -
921 $self->{prev_char}->[1] == 0x002D) { # -
922 !!!cp (8);
923 delete $self->{escape};
924 } else {
925 !!!cp (9);
926 }
927 } else {
928 !!!cp (10);
929 }
930
931 #
932 } elsif ($self->{next_char} == -1) {
933 !!!cp (11);
934 !!!emit ({type => END_OF_FILE_TOKEN,
935 line => $self->{line}, column => $self->{column}});
936 last A; ## TODO: ok?
937 } else {
938 !!!cp (12);
939 }
940 # Anything else
941 my $token = {type => CHARACTER_TOKEN,
942 data => chr $self->{next_char},
943 line => $self->{line}, column => $self->{column},
944 };
945 ## Stay in the data state
946 !!!next-input-character;
947
948 !!!emit ($token);
949
950 redo A;
951 } elsif ($self->{state} == ENTITY_DATA_STATE) {
952 ## (cannot happen in CDATA state)
953
954 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
955
956 my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
957
958 $self->{state} = DATA_STATE;
959 # next-input-character is already done
960
961 unless (defined $token) {
962 !!!cp (13);
963 !!!emit ({type => CHARACTER_TOKEN, data => '&',
964 line => $l, column => $c,
965 });
966 } else {
967 !!!cp (14);
968 !!!emit ($token);
969 }
970
971 redo A;
972 } elsif ($self->{state} == TAG_OPEN_STATE) {
973 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
974 if ($self->{next_char} == 0x002F) { # /
975 !!!cp (15);
976 !!!next-input-character;
977 $self->{state} = CLOSE_TAG_OPEN_STATE;
978 redo A;
979 } else {
980 !!!cp (16);
981 ## reconsume
982 $self->{state} = DATA_STATE;
983
984 !!!emit ({type => CHARACTER_TOKEN, data => '<',
985 line => $self->{line_prev},
986 column => $self->{column_prev},
987 });
988
989 redo A;
990 }
991 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
992 if ($self->{next_char} == 0x0021) { # !
993 !!!cp (17);
994 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
995 !!!next-input-character;
996 redo A;
997 } elsif ($self->{next_char} == 0x002F) { # /
998 !!!cp (18);
999 $self->{state} = CLOSE_TAG_OPEN_STATE;
1000 !!!next-input-character;
1001 redo A;
1002 } elsif (0x0041 <= $self->{next_char} and
1003 $self->{next_char} <= 0x005A) { # A..Z
1004 !!!cp (19);
1005 $self->{current_token}
1006 = {type => START_TAG_TOKEN,
1007 tag_name => chr ($self->{next_char} + 0x0020),
1008 line => $self->{line_prev},
1009 column => $self->{column_prev}};
1010 $self->{state} = TAG_NAME_STATE;
1011 !!!next-input-character;
1012 redo A;
1013 } elsif (0x0061 <= $self->{next_char} and
1014 $self->{next_char} <= 0x007A) { # a..z
1015 !!!cp (20);
1016 $self->{current_token} = {type => START_TAG_TOKEN,
1017 tag_name => chr ($self->{next_char}),
1018 line => $self->{line_prev},
1019 column => $self->{column_prev}};
1020 $self->{state} = TAG_NAME_STATE;
1021 !!!next-input-character;
1022 redo A;
1023 } elsif ($self->{next_char} == 0x003E) { # >
1024 !!!cp (21);
1025 !!!parse-error (type => 'empty start tag',
1026 line => $self->{line_prev},
1027 column => $self->{column_prev});
1028 $self->{state} = DATA_STATE;
1029 !!!next-input-character;
1030
1031 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
1032 line => $self->{line_prev},
1033 column => $self->{column_prev},
1034 });
1035
1036 redo A;
1037 } elsif ($self->{next_char} == 0x003F) { # ?
1038 !!!cp (22);
1039 !!!parse-error (type => 'pio',
1040 line => $self->{line_prev},
1041 column => $self->{column_prev});
1042 $self->{state} = BOGUS_COMMENT_STATE;
1043 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1044 line => $self->{line_prev},
1045 column => $self->{column_prev},
1046 };
1047 ## $self->{next_char} is intentionally left as is
1048 redo A;
1049 } else {
1050 !!!cp (23);
1051 !!!parse-error (type => 'bare stago',
1052 line => $self->{line_prev},
1053 column => $self->{column_prev});
1054 $self->{state} = DATA_STATE;
1055 ## reconsume
1056
1057 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1058 line => $self->{line_prev},
1059 column => $self->{column_prev},
1060 });
1061
1062 redo A;
1063 }
1064 } else {
1065 die "$0: $self->{content_model} in tag open";
1066 }
1067 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
1068 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
1069 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1070 if (defined $self->{last_emitted_start_tag_name}) {
1071
1072 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
1073 my @next_char;
## Compare the input following "</" with the name of the last emitted
## start tag, one character per iteration.  A character matches if it
## equals the stored name character or, when that character is a-z, its
## uppercase form.  Every character examined is recorded in
## |@next_char| so that, on a mismatch, the first one can be restored as
## the next input character and the rest handed to
## |!!!back-next-input-character|.
TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
  push @next_char, $self->{next_char};
  ## NOTE: These variables must not be named |$c|.  A lexical |$c|
  ## declared here would shadow the outer |$c| (the column of the "<"
  ## of "</"), and the character token emitted below would then carry
  ## a character code as its column number.
  my $expected = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
  my $expected_uc = 0x0061 <= $expected && $expected <= 0x007A
      ? $expected - 0x0020 : $expected;
  if ($self->{next_char} == $expected or
      $self->{next_char} == $expected_uc) {
    !!!cp (24);
    !!!next-input-character;
    next TAGNAME;
  } else {
    ## Not the expected end tag: "</" is emitted as character data.
    !!!cp (25);
    $self->{next_char} = shift @next_char; # reconsume
    !!!back-next-input-character (@next_char);
    $self->{state} = DATA_STATE;

    !!!emit ({type => CHARACTER_TOKEN, data => '</',
              line => $l, column => $c,
             });

    redo A;
  }
}
1095 push @next_char, $self->{next_char};
1096
1097 unless ($self->{next_char} == 0x0009 or # HT
1098 $self->{next_char} == 0x000A or # LF
1099 $self->{next_char} == 0x000B or # VT
1100 $self->{next_char} == 0x000C or # FF
1101 $self->{next_char} == 0x0020 or # SP
1102 $self->{next_char} == 0x003E or # >
1103 $self->{next_char} == 0x002F or # /
1104 $self->{next_char} == -1) {
1105 !!!cp (26);
1106 $self->{next_char} = shift @next_char; # reconsume
1107 !!!back-next-input-character (@next_char);
1108 $self->{state} = DATA_STATE;
1109 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1110 line => $l, column => $c,
1111 });
1112 redo A;
1113 } else {
1114 !!!cp (27);
1115 $self->{next_char} = shift @next_char;
1116 !!!back-next-input-character (@next_char);
1117 # and consume...
1118 }
1119 } else {
1120 ## No start tag token has ever been emitted
1121 !!!cp (28);
1122 # next-input-character is already done
1123 $self->{state} = DATA_STATE;
1124 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1125 line => $l, column => $c,
1126 });
1127 redo A;
1128 }
1129 }
1130
1131 if (0x0041 <= $self->{next_char} and
1132 $self->{next_char} <= 0x005A) { # A..Z
1133 !!!cp (29);
1134 $self->{current_token}
1135 = {type => END_TAG_TOKEN,
1136 tag_name => chr ($self->{next_char} + 0x0020),
1137 line => $l, column => $c};
1138 $self->{state} = TAG_NAME_STATE;
1139 !!!next-input-character;
1140 redo A;
1141 } elsif (0x0061 <= $self->{next_char} and
1142 $self->{next_char} <= 0x007A) { # a..z
1143 !!!cp (30);
1144 $self->{current_token} = {type => END_TAG_TOKEN,
1145 tag_name => chr ($self->{next_char}),
1146 line => $l, column => $c};
1147 $self->{state} = TAG_NAME_STATE;
1148 !!!next-input-character;
1149 redo A;
1150 } elsif ($self->{next_char} == 0x003E) { # >
1151 !!!cp (31);
1152 !!!parse-error (type => 'empty end tag',
1153 line => $self->{line_prev}, ## "<" in "</>"
1154 column => $self->{column_prev} - 1);
1155 $self->{state} = DATA_STATE;
1156 !!!next-input-character;
1157 redo A;
1158 } elsif ($self->{next_char} == -1) {
1159 !!!cp (32);
1160 !!!parse-error (type => 'bare etago');
1161 $self->{state} = DATA_STATE;
1162 # reconsume
1163
1164 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1165 line => $l, column => $c,
1166 });
1167
1168 redo A;
1169 } else {
1170 !!!cp (33);
1171 !!!parse-error (type => 'bogus end tag');
1172 $self->{state} = BOGUS_COMMENT_STATE;
1173 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1174 line => $self->{line_prev}, # "<" of "</"
1175 column => $self->{column_prev} - 1,
1176 };
1177 ## $self->{next_char} is intentionally left as is
1178 redo A;
1179 }
1180 } elsif ($self->{state} == TAG_NAME_STATE) {
1181 if ($self->{next_char} == 0x0009 or # HT
1182 $self->{next_char} == 0x000A or # LF
1183 $self->{next_char} == 0x000B or # VT
1184 $self->{next_char} == 0x000C or # FF
1185 $self->{next_char} == 0x0020) { # SP
1186 !!!cp (34);
1187 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1188 !!!next-input-character;
1189 redo A;
1190 } elsif ($self->{next_char} == 0x003E) { # >
1191 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1192 !!!cp (35);
1193 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1194 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1195 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1196 #if ($self->{current_token}->{attributes}) {
1197 # ## NOTE: This should never be reached.
1198 # !!! cp (36);
1199 # !!! parse-error (type => 'end tag attribute');
1200 #} else {
1201 !!!cp (37);
1202 #}
1203 } else {
1204 die "$0: $self->{current_token}->{type}: Unknown token type";
1205 }
1206 $self->{state} = DATA_STATE;
1207 !!!next-input-character;
1208
1209 !!!emit ($self->{current_token}); # start tag or end tag
1210
1211 redo A;
1212 } elsif (0x0041 <= $self->{next_char} and
1213 $self->{next_char} <= 0x005A) { # A..Z
1214 !!!cp (38);
1215 $self->{current_token}->{tag_name} .= chr ($self->{next_char} + 0x0020);
1216 # start tag or end tag
1217 ## Stay in this state
1218 !!!next-input-character;
1219 redo A;
1220 } elsif ($self->{next_char} == -1) {
1221 !!!parse-error (type => 'unclosed tag');
1222 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1223 !!!cp (39);
1224 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1225 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1226 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1227 #if ($self->{current_token}->{attributes}) {
1228 # ## NOTE: This state should never be reached.
1229 # !!! cp (40);
1230 # !!! parse-error (type => 'end tag attribute');
1231 #} else {
1232 !!!cp (41);
1233 #}
1234 } else {
1235 die "$0: $self->{current_token}->{type}: Unknown token type";
1236 }
1237 $self->{state} = DATA_STATE;
1238 # reconsume
1239
1240 !!!emit ($self->{current_token}); # start tag or end tag
1241
1242 redo A;
1243 } elsif ($self->{next_char} == 0x002F) { # /
1244 !!!cp (42);
1245 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1246 !!!next-input-character;
1247 redo A;
1248 } else {
1249 !!!cp (44);
1250 $self->{current_token}->{tag_name} .= chr $self->{next_char};
1251 # start tag or end tag
1252 ## Stay in the state
1253 !!!next-input-character;
1254 redo A;
1255 }
1256 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1257 if ($self->{next_char} == 0x0009 or # HT
1258 $self->{next_char} == 0x000A or # LF
1259 $self->{next_char} == 0x000B or # VT
1260 $self->{next_char} == 0x000C or # FF
1261 $self->{next_char} == 0x0020) { # SP
1262 !!!cp (45);
1263 ## Stay in the state
1264 !!!next-input-character;
1265 redo A;
1266 } elsif ($self->{next_char} == 0x003E) { # >
1267 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1268 !!!cp (46);
1269 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1270 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1271 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1272 if ($self->{current_token}->{attributes}) {
1273 !!!cp (47);
1274 !!!parse-error (type => 'end tag attribute');
1275 } else {
1276 !!!cp (48);
1277 }
1278 } else {
1279 die "$0: $self->{current_token}->{type}: Unknown token type";
1280 }
1281 $self->{state} = DATA_STATE;
1282 !!!next-input-character;
1283
1284 !!!emit ($self->{current_token}); # start tag or end tag
1285
1286 redo A;
1287 } elsif (0x0041 <= $self->{next_char} and
1288 $self->{next_char} <= 0x005A) { # A..Z
1289 !!!cp (49);
1290 $self->{current_attribute}
1291 = {name => chr ($self->{next_char} + 0x0020),
1292 value => '',
1293 line => $self->{line}, column => $self->{column}};
1294 $self->{state} = ATTRIBUTE_NAME_STATE;
1295 !!!next-input-character;
1296 redo A;
1297 } elsif ($self->{next_char} == 0x002F) { # /
1298 !!!cp (50);
1299 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1300 !!!next-input-character;
1301 redo A;
1302 } elsif ($self->{next_char} == -1) {
1303 !!!parse-error (type => 'unclosed tag');
1304 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1305 !!!cp (52);
1306 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1307 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1308 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1309 if ($self->{current_token}->{attributes}) {
1310 !!!cp (53);
1311 !!!parse-error (type => 'end tag attribute');
1312 } else {
1313 !!!cp (54);
1314 }
1315 } else {
1316 die "$0: $self->{current_token}->{type}: Unknown token type";
1317 }
1318 $self->{state} = DATA_STATE;
1319 # reconsume
1320
1321 !!!emit ($self->{current_token}); # start tag or end tag
1322
1323 redo A;
1324 } else {
1325 if ({
1326 0x0022 => 1, # "
1327 0x0027 => 1, # '
1328 0x003D => 1, # =
1329 }->{$self->{next_char}}) {
1330 !!!cp (55);
1331 !!!parse-error (type => 'bad attribute name');
1332 } else {
1333 !!!cp (56);
1334 }
1335 $self->{current_attribute}
1336 = {name => chr ($self->{next_char}),
1337 value => '',
1338 line => $self->{line}, column => $self->{column}};
1339 $self->{state} = ATTRIBUTE_NAME_STATE;
1340 !!!next-input-character;
1341 redo A;
1342 }
1343 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
## Invoked by the branches below that leave the attribute name state.
## Stores the current attribute on the current tag token under its
## name, unless the token already has an attribute of that name, in
## which case a parse error is reported and the attribute is not stored.
my $before_leave = sub {
  my $attr = $self->{current_attribute};
  my $attr_name = $attr->{name};
  my $tag = $self->{current_token}; # start tag or end tag
  if (exists $tag->{attributes}->{$attr_name}) { # MUST
    !!!cp (57);
    !!!parse-error (type => 'duplicate attribute:'.$attr_name, line => $attr->{line}, column => $attr->{column});
    ## Discard $self->{current_attribute} # MUST
  } else {
    !!!cp (58);
    $tag->{attributes}->{$attr_name} = $attr;
  }
}; # $before_leave
1356
1357 if ($self->{next_char} == 0x0009 or # HT
1358 $self->{next_char} == 0x000A or # LF
1359 $self->{next_char} == 0x000B or # VT
1360 $self->{next_char} == 0x000C or # FF
1361 $self->{next_char} == 0x0020) { # SP
1362 !!!cp (59);
1363 $before_leave->();
1364 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1365 !!!next-input-character;
1366 redo A;
1367 } elsif ($self->{next_char} == 0x003D) { # =
1368 !!!cp (60);
1369 $before_leave->();
1370 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1371 !!!next-input-character;
1372 redo A;
1373 } elsif ($self->{next_char} == 0x003E) { # >
1374 $before_leave->();
1375 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1376 !!!cp (61);
1377 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1378 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1379 !!!cp (62);
1380 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1381 if ($self->{current_token}->{attributes}) {
1382 !!!parse-error (type => 'end tag attribute');
1383 }
1384 } else {
1385 die "$0: $self->{current_token}->{type}: Unknown token type";
1386 }
1387 $self->{state} = DATA_STATE;
1388 !!!next-input-character;
1389
1390 !!!emit ($self->{current_token}); # start tag or end tag
1391
1392 redo A;
1393 } elsif (0x0041 <= $self->{next_char} and
1394 $self->{next_char} <= 0x005A) { # A..Z
1395 !!!cp (63);
1396 $self->{current_attribute}->{name} .= chr ($self->{next_char} + 0x0020);
1397 ## Stay in the state
1398 !!!next-input-character;
1399 redo A;
1400 } elsif ($self->{next_char} == 0x002F) { # /
1401 !!!cp (64);
1402 $before_leave->();
1403 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1404 !!!next-input-character;
1405 redo A;
1406 } elsif ($self->{next_char} == -1) {
1407 !!!parse-error (type => 'unclosed tag');
1408 $before_leave->();
1409 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1410 !!!cp (66);
1411 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1412 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1413 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1414 if ($self->{current_token}->{attributes}) {
1415 !!!cp (67);
1416 !!!parse-error (type => 'end tag attribute');
1417 } else {
1418 ## NOTE: This state should never be reached.
1419 !!!cp (68);
1420 }
1421 } else {
1422 die "$0: $self->{current_token}->{type}: Unknown token type";
1423 }
1424 $self->{state} = DATA_STATE;
1425 # reconsume
1426
1427 !!!emit ($self->{current_token}); # start tag or end tag
1428
1429 redo A;
1430 } else {
1431 if ($self->{next_char} == 0x0022 or # "
1432 $self->{next_char} == 0x0027) { # '
1433 !!!cp (69);
1434 !!!parse-error (type => 'bad attribute name');
1435 } else {
1436 !!!cp (70);
1437 }
1438 $self->{current_attribute}->{name} .= chr ($self->{next_char});
1439 ## Stay in the state
1440 !!!next-input-character;
1441 redo A;
1442 }
1443 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1444 if ($self->{next_char} == 0x0009 or # HT
1445 $self->{next_char} == 0x000A or # LF
1446 $self->{next_char} == 0x000B or # VT
1447 $self->{next_char} == 0x000C or # FF
1448 $self->{next_char} == 0x0020) { # SP
1449 !!!cp (71);
1450 ## Stay in the state
1451 !!!next-input-character;
1452 redo A;
1453 } elsif ($self->{next_char} == 0x003D) { # =
1454 !!!cp (72);
1455 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1456 !!!next-input-character;
1457 redo A;
1458 } elsif ($self->{next_char} == 0x003E) { # >
1459 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1460 !!!cp (73);
1461 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1462 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1463 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1464 if ($self->{current_token}->{attributes}) {
1465 !!!cp (74);
1466 !!!parse-error (type => 'end tag attribute');
1467 } else {
1468 ## NOTE: This state should never be reached.
1469 !!!cp (75);
1470 }
1471 } else {
1472 die "$0: $self->{current_token}->{type}: Unknown token type";
1473 }
1474 $self->{state} = DATA_STATE;
1475 !!!next-input-character;
1476
1477 !!!emit ($self->{current_token}); # start tag or end tag
1478
1479 redo A;
1480 } elsif (0x0041 <= $self->{next_char} and
1481 $self->{next_char} <= 0x005A) { # A..Z
1482 !!!cp (76);
1483 $self->{current_attribute}
1484 = {name => chr ($self->{next_char} + 0x0020),
1485 value => '',
1486 line => $self->{line}, column => $self->{column}};
1487 $self->{state} = ATTRIBUTE_NAME_STATE;
1488 !!!next-input-character;
1489 redo A;
1490 } elsif ($self->{next_char} == 0x002F) { # /
1491 !!!cp (77);
1492 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1493 !!!next-input-character;
1494 redo A;
1495 } elsif ($self->{next_char} == -1) {
1496 !!!parse-error (type => 'unclosed tag');
1497 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1498 !!!cp (79);
1499 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1500 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1501 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1502 if ($self->{current_token}->{attributes}) {
1503 !!!cp (80);
1504 !!!parse-error (type => 'end tag attribute');
1505 } else {
1506 ## NOTE: This state should never be reached.
1507 !!!cp (81);
1508 }
1509 } else {
1510 die "$0: $self->{current_token}->{type}: Unknown token type";
1511 }
1512 $self->{state} = DATA_STATE;
1513 # reconsume
1514
1515 !!!emit ($self->{current_token}); # start tag or end tag
1516
1517 redo A;
1518 } else {
1519 !!!cp (82);
1520 $self->{current_attribute}
1521 = {name => chr ($self->{next_char}),
1522 value => '',
1523 line => $self->{line}, column => $self->{column}};
1524 $self->{state} = ATTRIBUTE_NAME_STATE;
1525 !!!next-input-character;
1526 redo A;
1527 }
1528 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1529 if ($self->{next_char} == 0x0009 or # HT
1530 $self->{next_char} == 0x000A or # LF
1531 $self->{next_char} == 0x000B or # VT
1532 $self->{next_char} == 0x000C or # FF
1533 $self->{next_char} == 0x0020) { # SP
1534 !!!cp (83);
1535 ## Stay in the state
1536 !!!next-input-character;
1537 redo A;
1538 } elsif ($self->{next_char} == 0x0022) { # "
1539 !!!cp (84);
1540 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1541 !!!next-input-character;
1542 redo A;
1543 } elsif ($self->{next_char} == 0x0026) { # &
1544 !!!cp (85);
1545 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1546 ## reconsume
1547 redo A;
1548 } elsif ($self->{next_char} == 0x0027) { # '
1549 !!!cp (86);
1550 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1551 !!!next-input-character;
1552 redo A;
1553 } elsif ($self->{next_char} == 0x003E) { # >
1554 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1555 !!!cp (87);
1556 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1557 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1558 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1559 if ($self->{current_token}->{attributes}) {
1560 !!!cp (88);
1561 !!!parse-error (type => 'end tag attribute');
1562 } else {
1563 ## NOTE: This state should never be reached.
1564 !!!cp (89);
1565 }
1566 } else {
1567 die "$0: $self->{current_token}->{type}: Unknown token type";
1568 }
1569 $self->{state} = DATA_STATE;
1570 !!!next-input-character;
1571
1572 !!!emit ($self->{current_token}); # start tag or end tag
1573
1574 redo A;
1575 } elsif ($self->{next_char} == -1) {
1576 !!!parse-error (type => 'unclosed tag');
1577 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1578 !!!cp (90);
1579 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1580 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1581 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1582 if ($self->{current_token}->{attributes}) {
1583 !!!cp (91);
1584 !!!parse-error (type => 'end tag attribute');
1585 } else {
1586 ## NOTE: This state should never be reached.
1587 !!!cp (92);
1588 }
1589 } else {
1590 die "$0: $self->{current_token}->{type}: Unknown token type";
1591 }
1592 $self->{state} = DATA_STATE;
1593 ## reconsume
1594
1595 !!!emit ($self->{current_token}); # start tag or end tag
1596
1597 redo A;
1598 } else {
1599 if ($self->{next_char} == 0x003D) { # =
1600 !!!cp (93);
1601 !!!parse-error (type => 'bad attribute value');
1602 } else {
1603 !!!cp (94);
1604 }
1605 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1606 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1607 !!!next-input-character;
1608 redo A;
1609 }
1610 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1611 if ($self->{next_char} == 0x0022) { # "
1612 !!!cp (95);
1613 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1614 !!!next-input-character;
1615 redo A;
1616 } elsif ($self->{next_char} == 0x0026) { # &
1617 !!!cp (96);
1618 $self->{last_attribute_value_state} = $self->{state};
1619 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1620 !!!next-input-character;
1621 redo A;
1622 } elsif ($self->{next_char} == -1) {
1623 !!!parse-error (type => 'unclosed attribute value');
1624 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1625 !!!cp (97);
1626 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1627 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1628 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1629 if ($self->{current_token}->{attributes}) {
1630 !!!cp (98);
1631 !!!parse-error (type => 'end tag attribute');
1632 } else {
1633 ## NOTE: This state should never be reached.
1634 !!!cp (99);
1635 }
1636 } else {
1637 die "$0: $self->{current_token}->{type}: Unknown token type";
1638 }
1639 $self->{state} = DATA_STATE;
1640 ## reconsume
1641
1642 !!!emit ($self->{current_token}); # start tag or end tag
1643
1644 redo A;
1645 } else {
1646 !!!cp (100);
1647 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1648 ## Stay in the state
1649 !!!next-input-character;
1650 redo A;
1651 }
1652 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1653 if ($self->{next_char} == 0x0027) { # '
1654 !!!cp (101);
1655 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1656 !!!next-input-character;
1657 redo A;
1658 } elsif ($self->{next_char} == 0x0026) { # &
1659 !!!cp (102);
1660 $self->{last_attribute_value_state} = $self->{state};
1661 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1662 !!!next-input-character;
1663 redo A;
1664 } elsif ($self->{next_char} == -1) {
1665 !!!parse-error (type => 'unclosed attribute value');
1666 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1667 !!!cp (103);
1668 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1669 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1670 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1671 if ($self->{current_token}->{attributes}) {
1672 !!!cp (104);
1673 !!!parse-error (type => 'end tag attribute');
1674 } else {
1675 ## NOTE: This state should never be reached.
1676 !!!cp (105);
1677 }
1678 } else {
1679 die "$0: $self->{current_token}->{type}: Unknown token type";
1680 }
1681 $self->{state} = DATA_STATE;
1682 ## reconsume
1683
1684 !!!emit ($self->{current_token}); # start tag or end tag
1685
1686 redo A;
1687 } else {
1688 !!!cp (106);
1689 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1690 ## Stay in the state
1691 !!!next-input-character;
1692 redo A;
1693 }
1694 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1695 if ($self->{next_char} == 0x0009 or # HT
1696 $self->{next_char} == 0x000A or # LF
1697 $self->{next_char} == 0x000B or # VT
1698 $self->{next_char} == 0x000C or # FF
1699 $self->{next_char} == 0x0020) { # SP
1700 !!!cp (107);
1701 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1702 !!!next-input-character;
1703 redo A;
1704 } elsif ($self->{next_char} == 0x0026) { # &
1705 !!!cp (108);
1706 $self->{last_attribute_value_state} = $self->{state};
1707 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1708 !!!next-input-character;
1709 redo A;
1710 } elsif ($self->{next_char} == 0x003E) { # >
1711 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1712 !!!cp (109);
1713 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1714 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1715 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1716 if ($self->{current_token}->{attributes}) {
1717 !!!cp (110);
1718 !!!parse-error (type => 'end tag attribute');
1719 } else {
1720 ## NOTE: This state should never be reached.
1721 !!!cp (111);
1722 }
1723 } else {
1724 die "$0: $self->{current_token}->{type}: Unknown token type";
1725 }
1726 $self->{state} = DATA_STATE;
1727 !!!next-input-character;
1728
1729 !!!emit ($self->{current_token}); # start tag or end tag
1730
1731 redo A;
1732 } elsif ($self->{next_char} == -1) {
1733 !!!parse-error (type => 'unclosed tag');
1734 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1735 !!!cp (112);
1736 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1737 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1738 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1739 if ($self->{current_token}->{attributes}) {
1740 !!!cp (113);
1741 !!!parse-error (type => 'end tag attribute');
1742 } else {
1743 ## NOTE: This state should never be reached.
1744 !!!cp (114);
1745 }
1746 } else {
1747 die "$0: $self->{current_token}->{type}: Unknown token type";
1748 }
1749 $self->{state} = DATA_STATE;
1750 ## reconsume
1751
1752 !!!emit ($self->{current_token}); # start tag or end tag
1753
1754 redo A;
1755 } else {
1756 if ({
1757 0x0022 => 1, # "
1758 0x0027 => 1, # '
1759 0x003D => 1, # =
1760 }->{$self->{next_char}}) {
1761 !!!cp (115);
1762 !!!parse-error (type => 'bad attribute value');
1763 } else {
1764 !!!cp (116);
1765 }
1766 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1767 ## Stay in the state
1768 !!!next-input-character;
1769 redo A;
1770 }
1771 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1772 my $token = $self->_tokenize_attempt_to_consume_an_entity
1773 (1,
1774 $self->{last_attribute_value_state}
1775 == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
1776 $self->{last_attribute_value_state}
1777 == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
1778 -1);
1779
1780 unless (defined $token) {
1781 !!!cp (117);
1782 $self->{current_attribute}->{value} .= '&';
1783 } else {
1784 !!!cp (118);
1785 $self->{current_attribute}->{value} .= $token->{data};
1786 $self->{current_attribute}->{has_reference} = $token->{has_reference};
1787 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1788 }
1789
1790 $self->{state} = $self->{last_attribute_value_state};
1791 # next-input-character is already done
1792 redo A;
1793 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1794 if ($self->{next_char} == 0x0009 or # HT
1795 $self->{next_char} == 0x000A or # LF
1796 $self->{next_char} == 0x000B or # VT
1797 $self->{next_char} == 0x000C or # FF
1798 $self->{next_char} == 0x0020) { # SP
1799 !!!cp (118);
1800 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1801 !!!next-input-character;
1802 redo A;
1803 } elsif ($self->{next_char} == 0x003E) { # >
1804 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1805 !!!cp (119);
1806 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1807 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1808 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1809 if ($self->{current_token}->{attributes}) {
1810 !!!cp (120);
1811 !!!parse-error (type => 'end tag attribute');
1812 } else {
1813 ## NOTE: This state should never be reached.
1814 !!!cp (121);
1815 }
1816 } else {
1817 die "$0: $self->{current_token}->{type}: Unknown token type";
1818 }
1819 $self->{state} = DATA_STATE;
1820 !!!next-input-character;
1821
1822 !!!emit ($self->{current_token}); # start tag or end tag
1823
1824 redo A;
1825 } elsif ($self->{next_char} == 0x002F) { # /
1826 !!!cp (122);
1827 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1828 !!!next-input-character;
1829 redo A;
1830 } elsif ($self->{next_char} == -1) {
1831 !!!parse-error (type => 'unclosed tag');
1832 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1833 !!!cp (122.3);
1834 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1835 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1836 if ($self->{current_token}->{attributes}) {
1837 !!!cp (122.1);
1838 !!!parse-error (type => 'end tag attribute');
1839 } else {
1840 ## NOTE: This state should never be reached.
1841 !!!cp (122.2);
1842 }
1843 } else {
1844 die "$0: $self->{current_token}->{type}: Unknown token type";
1845 }
1846 $self->{state} = DATA_STATE;
1847 ## Reconsume.
1848 !!!emit ($self->{current_token}); # start tag or end tag
1849 redo A;
1850 } else {
1851 !!!cp ('124.1');
1852 !!!parse-error (type => 'no space between attributes');
1853 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1854 ## reconsume
1855 redo A;
1856 }
1857 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1858 if ($self->{next_char} == 0x003E) { # >
1859 if ($self->{current_token}->{type} == END_TAG_TOKEN) {
1860 !!!cp ('124.2');
1861 !!!parse-error (type => 'nestc', token => $self->{current_token});
1862 ## TODO: Different type than slash in start tag
1863 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1864 if ($self->{current_token}->{attributes}) {
1865 !!!cp ('124.4');
1866 !!!parse-error (type => 'end tag attribute');
1867 } else {
1868 !!!cp ('124.5');
1869 }
1870 ## TODO: Test |<title></title/>|
1871 } else {
1872 !!!cp ('124.3');
1873 $self->{self_closing} = 1;
1874 }
1875
1876 $self->{state} = DATA_STATE;
1877 !!!next-input-character;
1878
1879 !!!emit ($self->{current_token}); # start tag or end tag
1880
1881 redo A;
1882 } elsif ($self->{next_char} == -1) {
1883 !!!parse-error (type => 'unclosed tag');
1884 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1885 !!!cp (124.7);
1886 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1887 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1888 if ($self->{current_token}->{attributes}) {
1889 !!!cp (124.5);
1890 !!!parse-error (type => 'end tag attribute');
1891 } else {
1892 ## NOTE: This state should never be reached.
1893 !!!cp (124.6);
1894 }
1895 } else {
1896 die "$0: $self->{current_token}->{type}: Unknown token type";
1897 }
1898 $self->{state} = DATA_STATE;
1899 ## Reconsume.
1900 !!!emit ($self->{current_token}); # start tag or end tag
1901 redo A;
1902 } else {
1903 !!!cp ('124.4');
1904 !!!parse-error (type => 'nestc');
1905 ## TODO: This error type is wrong.
1906 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1907 ## Reconsume.
1908 redo A;
1909 }
1910 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1911 ## (only happen if PCDATA state)
1912
1913 ## NOTE: Set by the previous state
1914 #my $token = {type => COMMENT_TOKEN, data => ''};
1915
1916 BC: {
1917 if ($self->{next_char} == 0x003E) { # >
1918 !!!cp (124);
1919 $self->{state} = DATA_STATE;
1920 !!!next-input-character;
1921
1922 !!!emit ($self->{current_token}); # comment
1923
1924 redo A;
1925 } elsif ($self->{next_char} == -1) {
1926 !!!cp (125);
1927 $self->{state} = DATA_STATE;
1928 ## reconsume
1929
1930 !!!emit ($self->{current_token}); # comment
1931
1932 redo A;
1933 } else {
1934 !!!cp (126);
1935 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
1936 !!!next-input-character;
1937 redo BC;
1938 }
1939 } # BC
1940
1941 die "$0: _get_next_token: unexpected case [BC]";
1942 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1943 ## (only happen if PCDATA state)
1944
1945 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1);
1946
1947 my @next_char;
1948 push @next_char, $self->{next_char};
1949
1950 if ($self->{next_char} == 0x002D) { # -
1951 !!!next-input-character;
1952 push @next_char, $self->{next_char};
1953 if ($self->{next_char} == 0x002D) { # -
1954 !!!cp (127);
1955 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1956 line => $l, column => $c,
1957 };
1958 $self->{state} = COMMENT_START_STATE;
1959 !!!next-input-character;
1960 redo A;
1961 } else {
1962 !!!cp (128);
1963 }
1964 } elsif ($self->{next_char} == 0x0044 or # D
1965 $self->{next_char} == 0x0064) { # d
1966 !!!next-input-character;
1967 push @next_char, $self->{next_char};
1968 if ($self->{next_char} == 0x004F or # O
1969 $self->{next_char} == 0x006F) { # o
1970 !!!next-input-character;
1971 push @next_char, $self->{next_char};
1972 if ($self->{next_char} == 0x0043 or # C
1973 $self->{next_char} == 0x0063) { # c
1974 !!!next-input-character;
1975 push @next_char, $self->{next_char};
1976 if ($self->{next_char} == 0x0054 or # T
1977 $self->{next_char} == 0x0074) { # t
1978 !!!next-input-character;
1979 push @next_char, $self->{next_char};
1980 if ($self->{next_char} == 0x0059 or # Y
1981 $self->{next_char} == 0x0079) { # y
1982 !!!next-input-character;
1983 push @next_char, $self->{next_char};
1984 if ($self->{next_char} == 0x0050 or # P
1985 $self->{next_char} == 0x0070) { # p
1986 !!!next-input-character;
1987 push @next_char, $self->{next_char};
1988 if ($self->{next_char} == 0x0045 or # E
1989 $self->{next_char} == 0x0065) { # e
1990 !!!cp (129);
1991 ## TODO: What a stupid code this is!
1992 $self->{state} = DOCTYPE_STATE;
1993 $self->{current_token} = {type => DOCTYPE_TOKEN,
1994 quirks => 1,
1995 line => $l, column => $c,
1996 };
1997 !!!next-input-character;
1998 redo A;
1999 } else {
2000 !!!cp (130);
2001 }
2002 } else {
2003 !!!cp (131);
2004 }
2005 } else {
2006 !!!cp (132);
2007 }
2008 } else {
2009 !!!cp (133);
2010 }
2011 } else {
2012 !!!cp (134);
2013 }
2014 } else {
2015 !!!cp (135);
2016 }
2017 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2018 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
2019 $self->{next_char} == 0x005B) { # [
2020 !!!next-input-character;
2021 push @next_char, $self->{next_char};
2022 if ($self->{next_char} == 0x0043) { # C
2023 !!!next-input-character;
2024 push @next_char, $self->{next_char};
2025 if ($self->{next_char} == 0x0044) { # D
2026 !!!next-input-character;
2027 push @next_char, $self->{next_char};
2028 if ($self->{next_char} == 0x0041) { # A
2029 !!!next-input-character;
2030 push @next_char, $self->{next_char};
2031 if ($self->{next_char} == 0x0054) { # T
2032 !!!next-input-character;
2033 push @next_char, $self->{next_char};
2034 if ($self->{next_char} == 0x0041) { # A
2035 !!!next-input-character;
2036 push @next_char, $self->{next_char};
2037 if ($self->{next_char} == 0x005B) { # [
2038 !!!cp (135.1);
2039 $self->{state} = CDATA_BLOCK_STATE;
2040 !!!next-input-character;
2041 redo A;
2042 } else {
2043 !!!cp (135.2);
2044 }
2045 } else {
2046 !!!cp (135.3);
2047 }
2048 } else {
2049 !!!cp (135.4);
2050 }
2051 } else {
2052 !!!cp (135.5);
2053 }
2054 } else {
2055 !!!cp (135.6);
2056 }
2057 } else {
2058 !!!cp (135.7);
2059 }
2060 } else {
2061 !!!cp (136);
2062 }
2063
2064 !!!parse-error (type => 'bogus comment');
2065 $self->{next_char} = shift @next_char;
2066 !!!back-next-input-character (@next_char);
2067 $self->{state} = BOGUS_COMMENT_STATE;
2068 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
2069 line => $l, column => $c,
2070 };
2071 redo A;
2072
2073 ## ISSUE: typos in spec: chacacters, is is a parse error
2074 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
2075 } elsif ($self->{state} == COMMENT_START_STATE) {
2076 if ($self->{next_char} == 0x002D) { # -
2077 !!!cp (137);
2078 $self->{state} = COMMENT_START_DASH_STATE;
2079 !!!next-input-character;
2080 redo A;
2081 } elsif ($self->{next_char} == 0x003E) { # >
2082 !!!cp (138);
2083 !!!parse-error (type => 'bogus comment');
2084 $self->{state} = DATA_STATE;
2085 !!!next-input-character;
2086
2087 !!!emit ($self->{current_token}); # comment
2088
2089 redo A;
2090 } elsif ($self->{next_char} == -1) {
2091 !!!cp (139);
2092 !!!parse-error (type => 'unclosed comment');
2093 $self->{state} = DATA_STATE;
2094 ## reconsume
2095
2096 !!!emit ($self->{current_token}); # comment
2097
2098 redo A;
2099 } else {
2100 !!!cp (140);
2101 $self->{current_token}->{data} # comment
2102 .= chr ($self->{next_char});
2103 $self->{state} = COMMENT_STATE;
2104 !!!next-input-character;
2105 redo A;
2106 }
2107 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2108 if ($self->{next_char} == 0x002D) { # -
2109 !!!cp (141);
2110 $self->{state} = COMMENT_END_STATE;
2111 !!!next-input-character;
2112 redo A;
2113 } elsif ($self->{next_char} == 0x003E) { # >
2114 !!!cp (142);
2115 !!!parse-error (type => 'bogus comment');
2116 $self->{state} = DATA_STATE;
2117 !!!next-input-character;
2118
2119 !!!emit ($self->{current_token}); # comment
2120
2121 redo A;
2122 } elsif ($self->{next_char} == -1) {
2123 !!!cp (143);
2124 !!!parse-error (type => 'unclosed comment');
2125 $self->{state} = DATA_STATE;
2126 ## reconsume
2127
2128 !!!emit ($self->{current_token}); # comment
2129
2130 redo A;
2131 } else {
2132 !!!cp (144);
2133 $self->{current_token}->{data} # comment
2134 .= '-' . chr ($self->{next_char});
2135 $self->{state} = COMMENT_STATE;
2136 !!!next-input-character;
2137 redo A;
2138 }
2139 } elsif ($self->{state} == COMMENT_STATE) {
2140 if ($self->{next_char} == 0x002D) { # -
2141 !!!cp (145);
2142 $self->{state} = COMMENT_END_DASH_STATE;
2143 !!!next-input-character;
2144 redo A;
2145 } elsif ($self->{next_char} == -1) {
2146 !!!cp (146);
2147 !!!parse-error (type => 'unclosed comment');
2148 $self->{state} = DATA_STATE;
2149 ## reconsume
2150
2151 !!!emit ($self->{current_token}); # comment
2152
2153 redo A;
2154 } else {
2155 !!!cp (147);
2156 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
2157 ## Stay in the state
2158 !!!next-input-character;
2159 redo A;
2160 }
2161 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2162 if ($self->{next_char} == 0x002D) { # -
2163 !!!cp (148);
2164 $self->{state} = COMMENT_END_STATE;
2165 !!!next-input-character;
2166 redo A;
2167 } elsif ($self->{next_char} == -1) {
2168 !!!cp (149);
2169 !!!parse-error (type => 'unclosed comment');
2170 $self->{state} = DATA_STATE;
2171 ## reconsume
2172
2173 !!!emit ($self->{current_token}); # comment
2174
2175 redo A;
2176 } else {
2177 !!!cp (150);
2178 $self->{current_token}->{data} .= '-' . chr ($self->{next_char}); # comment
2179 $self->{state} = COMMENT_STATE;
2180 !!!next-input-character;
2181 redo A;
2182 }
2183 } elsif ($self->{state} == COMMENT_END_STATE) {
2184 if ($self->{next_char} == 0x003E) { # >
2185 !!!cp (151);
2186 $self->{state} = DATA_STATE;
2187 !!!next-input-character;
2188
2189 !!!emit ($self->{current_token}); # comment
2190
2191 redo A;
2192 } elsif ($self->{next_char} == 0x002D) { # -
2193 !!!cp (152);
2194 !!!parse-error (type => 'dash in comment',
2195 line => $self->{line_prev},
2196 column => $self->{column_prev});
2197 $self->{current_token}->{data} .= '-'; # comment
2198 ## Stay in the state
2199 !!!next-input-character;
2200 redo A;
2201 } elsif ($self->{next_char} == -1) {
2202 !!!cp (153);
2203 !!!parse-error (type => 'unclosed comment');
2204 $self->{state} = DATA_STATE;
2205 ## reconsume
2206
2207 !!!emit ($self->{current_token}); # comment
2208
2209 redo A;
2210 } else {
2211 !!!cp (154);
2212 !!!parse-error (type => 'dash in comment',
2213 line => $self->{line_prev},
2214 column => $self->{column_prev});
2215 $self->{current_token}->{data} .= '--' . chr ($self->{next_char}); # comment
2216 $self->{state} = COMMENT_STATE;
2217 !!!next-input-character;
2218 redo A;
2219 }
2220 } elsif ($self->{state} == DOCTYPE_STATE) {
2221 if ($self->{next_char} == 0x0009 or # HT
2222 $self->{next_char} == 0x000A or # LF
2223 $self->{next_char} == 0x000B or # VT
2224 $self->{next_char} == 0x000C or # FF
2225 $self->{next_char} == 0x0020) { # SP
2226 !!!cp (155);
2227 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2228 !!!next-input-character;
2229 redo A;
2230 } else {
2231 !!!cp (156);
2232 !!!parse-error (type => 'no space before DOCTYPE name');
2233 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2234 ## reconsume
2235 redo A;
2236 }
2237 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2238 if ($self->{next_char} == 0x0009 or # HT
2239 $self->{next_char} == 0x000A or # LF
2240 $self->{next_char} == 0x000B or # VT
2241 $self->{next_char} == 0x000C or # FF
2242 $self->{next_char} == 0x0020) { # SP
2243 !!!cp (157);
2244 ## Stay in the state
2245 !!!next-input-character;
2246 redo A;
2247 } elsif ($self->{next_char} == 0x003E) { # >
2248 !!!cp (158);
2249 !!!parse-error (type => 'no DOCTYPE name');
2250 $self->{state} = DATA_STATE;
2251 !!!next-input-character;
2252
2253 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2254
2255 redo A;
2256 } elsif ($self->{next_char} == -1) {
2257 !!!cp (159);
2258 !!!parse-error (type => 'no DOCTYPE name');
2259 $self->{state} = DATA_STATE;
2260 ## reconsume
2261
2262 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2263
2264 redo A;
2265 } else {
2266 !!!cp (160);
2267 $self->{current_token}->{name} = chr $self->{next_char};
2268 delete $self->{current_token}->{quirks};
2269 ## ISSUE: "Set the token's name name to the" in the spec
2270 $self->{state} = DOCTYPE_NAME_STATE;
2271 !!!next-input-character;
2272 redo A;
2273 }
2274 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2275 ## ISSUE: Redundant "First," in the spec.
2276 if ($self->{next_char} == 0x0009 or # HT
2277 $self->{next_char} == 0x000A or # LF
2278 $self->{next_char} == 0x000B or # VT
2279 $self->{next_char} == 0x000C or # FF
2280 $self->{next_char} == 0x0020) { # SP
2281 !!!cp (161);
2282 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2283 !!!next-input-character;
2284 redo A;
2285 } elsif ($self->{next_char} == 0x003E) { # >
2286 !!!cp (162);
2287 $self->{state} = DATA_STATE;
2288 !!!next-input-character;
2289
2290 !!!emit ($self->{current_token}); # DOCTYPE
2291
2292 redo A;
2293 } elsif ($self->{next_char} == -1) {
2294 !!!cp (163);
2295 !!!parse-error (type => 'unclosed DOCTYPE');
2296 $self->{state} = DATA_STATE;
2297 ## reconsume
2298
2299 $self->{current_token}->{quirks} = 1;
2300 !!!emit ($self->{current_token}); # DOCTYPE
2301
2302 redo A;
2303 } else {
2304 !!!cp (164);
2305 $self->{current_token}->{name}
2306 .= chr ($self->{next_char}); # DOCTYPE
2307 ## Stay in the state
2308 !!!next-input-character;
2309 redo A;
2310 }
2311 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2312 if ($self->{next_char} == 0x0009 or # HT
2313 $self->{next_char} == 0x000A or # LF
2314 $self->{next_char} == 0x000B or # VT
2315 $self->{next_char} == 0x000C or # FF
2316 $self->{next_char} == 0x0020) { # SP
2317 !!!cp (165);
2318 ## Stay in the state
2319 !!!next-input-character;
2320 redo A;
2321 } elsif ($self->{next_char} == 0x003E) { # >
2322 !!!cp (166);
2323 $self->{state} = DATA_STATE;
2324 !!!next-input-character;
2325
2326 !!!emit ($self->{current_token}); # DOCTYPE
2327
2328 redo A;
2329 } elsif ($self->{next_char} == -1) {
2330 !!!cp (167);
2331 !!!parse-error (type => 'unclosed DOCTYPE');
2332 $self->{state} = DATA_STATE;
2333 ## reconsume
2334
2335 $self->{current_token}->{quirks} = 1;
2336 !!!emit ($self->{current_token}); # DOCTYPE
2337
2338 redo A;
2339 } elsif ($self->{next_char} == 0x0050 or # P
2340 $self->{next_char} == 0x0070) { # p
2341 !!!next-input-character;
2342 if ($self->{next_char} == 0x0055 or # U
2343 $self->{next_char} == 0x0075) { # u
2344 !!!next-input-character;
2345 if ($self->{next_char} == 0x0042 or # B
2346 $self->{next_char} == 0x0062) { # b
2347 !!!next-input-character;
2348 if ($self->{next_char} == 0x004C or # L
2349 $self->{next_char} == 0x006C) { # l
2350 !!!next-input-character;
2351 if ($self->{next_char} == 0x0049 or # I
2352 $self->{next_char} == 0x0069) { # i
2353 !!!next-input-character;
2354 if ($self->{next_char} == 0x0043 or # C
2355 $self->{next_char} == 0x0063) { # c
2356 !!!cp (168);
2357 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2358 !!!next-input-character;
2359 redo A;
2360 } else {
2361 !!!cp (169);
2362 }
2363 } else {
2364 !!!cp (170);
2365 }
2366 } else {
2367 !!!cp (171);
2368 }
2369 } else {
2370 !!!cp (172);
2371 }
2372 } else {
2373 !!!cp (173);
2374 }
2375
2376 #
2377 } elsif ($self->{next_char} == 0x0053 or # S
2378 $self->{next_char} == 0x0073) { # s
2379 !!!next-input-character;
2380 if ($self->{next_char} == 0x0059 or # Y
2381 $self->{next_char} == 0x0079) { # y
2382 !!!next-input-character;
2383 if ($self->{next_char} == 0x0053 or # S
2384 $self->{next_char} == 0x0073) { # s
2385 !!!next-input-character;
2386 if ($self->{next_char} == 0x0054 or # T
2387 $self->{next_char} == 0x0074) { # t
2388 !!!next-input-character;
2389 if ($self->{next_char} == 0x0045 or # E
2390 $self->{next_char} == 0x0065) { # e
2391 !!!next-input-character;
2392 if ($self->{next_char} == 0x004D or # M
2393 $self->{next_char} == 0x006D) { # m
2394 !!!cp (174);
2395 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2396 !!!next-input-character;
2397 redo A;
2398 } else {
2399 !!!cp (175);
2400 }
2401 } else {
2402 !!!cp (176);
2403 }
2404 } else {
2405 !!!cp (177);
2406 }
2407 } else {
2408 !!!cp (178);
2409 }
2410 } else {
2411 !!!cp (179);
2412 }
2413
2414 #
2415 } else {
2416 !!!cp (180);
2417 !!!next-input-character;
2418 #
2419 }
2420
2421 !!!parse-error (type => 'string after DOCTYPE name');
2422 $self->{current_token}->{quirks} = 1;
2423
2424 $self->{state} = BOGUS_DOCTYPE_STATE;
2425 # next-input-character is already done
2426 redo A;
2427 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2428 if ({
2429 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2430 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2431 }->{$self->{next_char}}) {
2432 !!!cp (181);
2433 ## Stay in the state
2434 !!!next-input-character;
2435 redo A;
2436 } elsif ($self->{next_char} eq 0x0022) { # "
2437 !!!cp (182);
2438 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2439 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2440 !!!next-input-character;
2441 redo A;
2442 } elsif ($self->{next_char} eq 0x0027) { # '
2443 !!!cp (183);
2444 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2445 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2446 !!!next-input-character;
2447 redo A;
2448 } elsif ($self->{next_char} eq 0x003E) { # >
2449 !!!cp (184);
2450 !!!parse-error (type => 'no PUBLIC literal');
2451
2452 $self->{state} = DATA_STATE;
2453 !!!next-input-character;
2454
2455 $self->{current_token}->{quirks} = 1;
2456 !!!emit ($self->{current_token}); # DOCTYPE
2457
2458 redo A;
2459 } elsif ($self->{next_char} == -1) {
2460 !!!cp (185);
2461 !!!parse-error (type => 'unclosed DOCTYPE');
2462
2463 $self->{state} = DATA_STATE;
2464 ## reconsume
2465
2466 $self->{current_token}->{quirks} = 1;
2467 !!!emit ($self->{current_token}); # DOCTYPE
2468
2469 redo A;
2470 } else {
2471 !!!cp (186);
2472 !!!parse-error (type => 'string after PUBLIC');
2473 $self->{current_token}->{quirks} = 1;
2474
2475 $self->{state} = BOGUS_DOCTYPE_STATE;
2476 !!!next-input-character;
2477 redo A;
2478 }
2479 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2480 if ($self->{next_char} == 0x0022) { # "
2481 !!!cp (187);
2482 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2483 !!!next-input-character;
2484 redo A;
2485 } elsif ($self->{next_char} == 0x003E) { # >
2486 !!!cp (188);
2487 !!!parse-error (type => 'unclosed PUBLIC literal');
2488
2489 $self->{state} = DATA_STATE;
2490 !!!next-input-character;
2491
2492 $self->{current_token}->{quirks} = 1;
2493 !!!emit ($self->{current_token}); # DOCTYPE
2494
2495 redo A;
2496 } elsif ($self->{next_char} == -1) {
2497 !!!cp (189);
2498 !!!parse-error (type => 'unclosed PUBLIC literal');
2499
2500 $self->{state} = DATA_STATE;
2501 ## reconsume
2502
2503 $self->{current_token}->{quirks} = 1;
2504 !!!emit ($self->{current_token}); # DOCTYPE
2505
2506 redo A;
2507 } else {
2508 !!!cp (190);
2509 $self->{current_token}->{public_identifier} # DOCTYPE
2510 .= chr $self->{next_char};
2511 ## Stay in the state
2512 !!!next-input-character;
2513 redo A;
2514 }
2515 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2516 if ($self->{next_char} == 0x0027) { # '
2517 !!!cp (191);
2518 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2519 !!!next-input-character;
2520 redo A;
2521 } elsif ($self->{next_char} == 0x003E) { # >
2522 !!!cp (192);
2523 !!!parse-error (type => 'unclosed PUBLIC literal');
2524
2525 $self->{state} = DATA_STATE;
2526 !!!next-input-character;
2527
2528 $self->{current_token}->{quirks} = 1;
2529 !!!emit ($self->{current_token}); # DOCTYPE
2530
2531 redo A;
2532 } elsif ($self->{next_char} == -1) {
2533 !!!cp (193);
2534 !!!parse-error (type => 'unclosed PUBLIC literal');
2535
2536 $self->{state} = DATA_STATE;
2537 ## reconsume
2538
2539 $self->{current_token}->{quirks} = 1;
2540 !!!emit ($self->{current_token}); # DOCTYPE
2541
2542 redo A;
2543 } else {
2544 !!!cp (194);
2545 $self->{current_token}->{public_identifier} # DOCTYPE
2546 .= chr $self->{next_char};
2547 ## Stay in the state
2548 !!!next-input-character;
2549 redo A;
2550 }
2551 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2552 if ({
2553 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2554 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2555 }->{$self->{next_char}}) {
2556 !!!cp (195);
2557 ## Stay in the state
2558 !!!next-input-character;
2559 redo A;
2560 } elsif ($self->{next_char} == 0x0022) { # "
2561 !!!cp (196);
2562 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2563 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2564 !!!next-input-character;
2565 redo A;
2566 } elsif ($self->{next_char} == 0x0027) { # '
2567 !!!cp (197);
2568 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2569 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2570 !!!next-input-character;
2571 redo A;
2572 } elsif ($self->{next_char} == 0x003E) { # >
2573 !!!cp (198);
2574 $self->{state} = DATA_STATE;
2575 !!!next-input-character;
2576
2577 !!!emit ($self->{current_token}); # DOCTYPE
2578
2579 redo A;
2580 } elsif ($self->{next_char} == -1) {
2581 !!!cp (199);
2582 !!!parse-error (type => 'unclosed DOCTYPE');
2583
2584 $self->{state} = DATA_STATE;
2585 ## reconsume
2586
2587 $self->{current_token}->{quirks} = 1;
2588 !!!emit ($self->{current_token}); # DOCTYPE
2589
2590 redo A;
2591 } else {
2592 !!!cp (200);
2593 !!!parse-error (type => 'string after PUBLIC literal');
2594 $self->{current_token}->{quirks} = 1;
2595
2596 $self->{state} = BOGUS_DOCTYPE_STATE;
2597 !!!next-input-character;
2598 redo A;
2599 }
2600 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2601 if ({
2602 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2603 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2604 }->{$self->{next_char}}) {
2605 !!!cp (201);
2606 ## Stay in the state
2607 !!!next-input-character;
2608 redo A;
2609 } elsif ($self->{next_char} == 0x0022) { # "
2610 !!!cp (202);
2611 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2612 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2613 !!!next-input-character;
2614 redo A;
2615 } elsif ($self->{next_char} == 0x0027) { # '
2616 !!!cp (203);
2617 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2618 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2619 !!!next-input-character;
2620 redo A;
2621 } elsif ($self->{next_char} == 0x003E) { # >
2622 !!!cp (204);
2623 !!!parse-error (type => 'no SYSTEM literal');
2624 $self->{state} = DATA_STATE;
2625 !!!next-input-character;
2626
2627 $self->{current_token}->{quirks} = 1;
2628 !!!emit ($self->{current_token}); # DOCTYPE
2629
2630 redo A;
2631 } elsif ($self->{next_char} == -1) {
2632 !!!cp (205);
2633 !!!parse-error (type => 'unclosed DOCTYPE');
2634
2635 $self->{state} = DATA_STATE;
2636 ## reconsume
2637
2638 $self->{current_token}->{quirks} = 1;
2639 !!!emit ($self->{current_token}); # DOCTYPE
2640
2641 redo A;
2642 } else {
2643 !!!cp (206);
2644 !!!parse-error (type => 'string after SYSTEM');
2645 $self->{current_token}->{quirks} = 1;
2646
2647 $self->{state} = BOGUS_DOCTYPE_STATE;
2648 !!!next-input-character;
2649 redo A;
2650 }
2651 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2652 if ($self->{next_char} == 0x0022) { # "
2653 !!!cp (207);
2654 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2655 !!!next-input-character;
2656 redo A;
2657 } elsif ($self->{next_char} == 0x003E) { # >
2658 !!!cp (208);
2659 !!!parse-error (type => 'unclosed PUBLIC literal');
2660
2661 $self->{state} = DATA_STATE;
2662 !!!next-input-character;
2663
2664 $self->{current_token}->{quirks} = 1;
2665 !!!emit ($self->{current_token}); # DOCTYPE
2666
2667 redo A;
2668 } elsif ($self->{next_char} == -1) {
2669 !!!cp (209);
2670 !!!parse-error (type => 'unclosed SYSTEM literal');
2671
2672 $self->{state} = DATA_STATE;
2673 ## reconsume
2674
2675 $self->{current_token}->{quirks} = 1;
2676 !!!emit ($self->{current_token}); # DOCTYPE
2677
2678 redo A;
2679 } else {
2680 !!!cp (210);
2681 $self->{current_token}->{system_identifier} # DOCTYPE
2682 .= chr $self->{next_char};
2683 ## Stay in the state
2684 !!!next-input-character;
2685 redo A;
2686 }
2687 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2688 if ($self->{next_char} == 0x0027) { # '
2689 !!!cp (211);
2690 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2691 !!!next-input-character;
2692 redo A;
2693 } elsif ($self->{next_char} == 0x003E) { # >
2694 !!!cp (212);
2695 !!!parse-error (type => 'unclosed PUBLIC literal');
2696
2697 $self->{state} = DATA_STATE;
2698 !!!next-input-character;
2699
2700 $self->{current_token}->{quirks} = 1;
2701 !!!emit ($self->{current_token}); # DOCTYPE
2702
2703 redo A;
2704 } elsif ($self->{next_char} == -1) {
2705 !!!cp (213);
2706 !!!parse-error (type => 'unclosed SYSTEM literal');
2707
2708 $self->{state} = DATA_STATE;
2709 ## reconsume
2710
2711 $self->{current_token}->{quirks} = 1;
2712 !!!emit ($self->{current_token}); # DOCTYPE
2713
2714 redo A;
2715 } else {
2716 !!!cp (214);
2717 $self->{current_token}->{system_identifier} # DOCTYPE
2718 .= chr $self->{next_char};
2719 ## Stay in the state
2720 !!!next-input-character;
2721 redo A;
2722 }
2723 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2724 if ({
2725 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2726 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2727 }->{$self->{next_char}}) {
2728 !!!cp (215);
2729 ## Stay in the state
2730 !!!next-input-character;
2731 redo A;
2732 } elsif ($self->{next_char} == 0x003E) { # >
2733 !!!cp (216);
2734 $self->{state} = DATA_STATE;
2735 !!!next-input-character;
2736
2737 !!!emit ($self->{current_token}); # DOCTYPE
2738
2739 redo A;
2740 } elsif ($self->{next_char} == -1) {
2741 !!!cp (217);
2742
2743 $self->{state} = DATA_STATE;
2744 ## reconsume
2745
2746 $self->{current_token}->{quirks} = 1;
2747 !!!emit ($self->{current_token}); # DOCTYPE
2748
2749 redo A;
2750 } else {
2751 !!!cp (218);
2752 !!!parse-error (type => 'string after SYSTEM literal');
2753 #$self->{current_token}->{quirks} = 1;
2754
2755 $self->{state} = BOGUS_DOCTYPE_STATE;
2756 !!!next-input-character;
2757 redo A;
2758 }
2759 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2760 if ($self->{next_char} == 0x003E) { # >
2761 !!!cp (219);
2762 $self->{state} = DATA_STATE;
2763 !!!next-input-character;
2764
2765 !!!emit ($self->{current_token}); # DOCTYPE
2766
2767 redo A;
2768 } elsif ($self->{next_char} == -1) {
2769 !!!cp (220);
2770 !!!parse-error (type => 'unclosed DOCTYPE');
2771 $self->{state} = DATA_STATE;
2772 ## reconsume
2773
2774 !!!emit ($self->{current_token}); # DOCTYPE
2775
2776 redo A;
2777 } else {
2778 !!!cp (221);
2779 ## Stay in the state
2780 !!!next-input-character;
2781 redo A;
2782 }
2783 } elsif ($self->{state} == CDATA_BLOCK_STATE) {
2784 my $s = '';
2785
2786 my ($l, $c) = ($self->{line}, $self->{column});
2787
2788 CS: while ($self->{next_char} != -1) {
2789 if ($self->{next_char} == 0x005D) { # ]
2790 !!!next-input-character;
2791 if ($self->{next_char} == 0x005D) { # ]
2792 !!!next-input-character;
2793 MDC: {
2794 if ($self->{next_char} == 0x003E) { # >
2795 !!!cp (221.1);
2796 !!!next-input-character;
2797 last CS;
2798 } elsif ($self->{next_char} == 0x005D) { # ]
2799 !!!cp (221.2);
2800 $s .= ']';
2801 !!!next-input-character;
2802 redo MDC;
2803 } else {
2804 !!!cp (221.3);
2805 $s .= ']]';
2806 #
2807 }
2808 } # MDC
2809 } else {
2810 !!!cp (221.4);
2811 $s .= ']';
2812 #
2813 }
2814 } else {
2815 !!!cp (221.5);
2816 #
2817 }
2818 $s .= chr $self->{next_char};
2819 !!!next-input-character;
2820 } # CS
2821
2822 $self->{state} = DATA_STATE;
2823 ## next-input-character done or EOF, which is reconsumed.
2824
2825 if (length $s) {
2826 !!!cp (221.6);
2827 !!!emit ({type => CHARACTER_TOKEN, data => $s,
2828 line => $l, column => $c});
2829 } else {
2830 !!!cp (221.7);
2831 }
2832
2833 redo A;
2834
2835 ## ISSUE: "text tokens" in spec.
2836 ## TODO: Streaming support
2837 } else {
2838 die "$0: $self->{state}: Unknown state";
2839 }
2840 } # A
2841
2842 die "$0: _get_next_token: unexpected case";
2843 } # _get_next_token
2844
## Try to consume a character reference starting at |$self->{next_char}|.
## NOTE(review): The caller presumably has already consumed the "&"; the
## caller is not part of this block -- confirm there.
##
## Arguments:
##   $self       - The tokenizer object.  |next_char|, |line_prev| and
##                 |column_prev| are read; input is advanced or pushed back
##                 through the |!!!| macros (defined elsewhere).
##   $in_attr    - If true, a named reference that matched without ";" and
##                 was followed by further name characters is returned as
##                 literal text instead of being expanded.
##   $additional - One more character code at which nothing is consumed
##                 and |undef| is returned without any error.
##
## Returns a CHARACTER_TOKEN hash reference (|data|, |line|, |column| and,
## when a reference was actually expanded, |has_reference|), or |undef|
## if no token is produced.
2845 sub _tokenize_attempt_to_consume_an_entity ($$$) {
2846 my ($self, $in_attr, $additional) = @_;
2847
## Position captured at entry; every parse error and token below uses it.
2848 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
2849
2850 if ({
2851 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
2852 0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, &, EOF (-1) # 0x000D # CR
2853 $additional => 1,
2854 }->{$self->{next_char}}) {
2855 !!!cp (1001);
2856 ## Don't consume
2857 ## No error
2858 return undef;
2859 } elsif ($self->{next_char} == 0x0023) { # #
## Numeric character reference: "#x"/"#X" hexadecimal or "#" decimal.
2860 !!!next-input-character;
2861 if ($self->{next_char} == 0x0078 or # x
2862 $self->{next_char} == 0x0058) { # X
## |$code| stays undefined until the first hexadecimal digit is seen.
2863 my $code;
2864 X: {
## Remember the character about to be consumed.  It is only used by the
## "no hexadecimal digit" branch, which can only be reached on the first
## pass, where this is the "x" or "X" itself.
2865 my $x_char = $self->{next_char};
2866 !!!next-input-character;
2867 if (0x0030 <= $self->{next_char} and
2868 $self->{next_char} <= 0x0039) { # 0..9
2869 !!!cp (1002);
2870 $code ||= 0;
2871 $code *= 0x10;
2872 $code += $self->{next_char} - 0x0030;
2873 redo X;
2874 } elsif (0x0061 <= $self->{next_char} and
2875 $self->{next_char} <= 0x0066) { # a..f
2876 !!!cp (1003);
2877 $code ||= 0;
2878 $code *= 0x10;
2879 $code += $self->{next_char} - 0x0060 + 9; # "a" (0x61) => 10
2880 redo X;
2881 } elsif (0x0041 <= $self->{next_char} and
2882 $self->{next_char} <= 0x0046) { # A..F
2883 !!!cp (1004);
2884 $code ||= 0;
2885 $code *= 0x10;
2886 $code += $self->{next_char} - 0x0040 + 9; # "A" (0x41) => 10
2887 redo X;
2888 } elsif (not defined $code) { # no hexadecimal digit
2889 !!!cp (1005);
2890 !!!parse-error (type => 'bare hcro', line => $l, column => $c);
## Hand back the "x"/"X" and the character after it, then make "#" the
## next character again.
2891 !!!back-next-input-character ($x_char, $self->{next_char});
2892 $self->{next_char} = 0x0023; # #
2893 return undef;
2894 } elsif ($self->{next_char} == 0x003B) { # ;
2895 !!!cp (1006);
2896 !!!next-input-character;
2897 } else {
## Digits were read but the terminating ";" is missing; the current
## character is left unconsumed.
2898 !!!cp (1007);
2899 !!!parse-error (type => 'no refc', line => $l, column => $c);
2900 }
2901
## Replace code points that are reported as errors: zero, surrogates and
## values above U+10FFFF become U+FFFD; CR becomes LF; 0x80..0x9F are
## looked up in |$c1_entity_char| (defined outside this block).
2902 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2903 !!!cp (1008);
2904 !!!parse-error (type => (sprintf 'invalid character reference:U+%04X', $code), line => $l, column => $c);
2905 $code = 0xFFFD;
2906 } elsif ($code > 0x10FFFF) {
2907 !!!cp (1009);
2908 !!!parse-error (type => (sprintf 'invalid character reference:U-%08X', $code), line => $l, column => $c);
2909 $code = 0xFFFD;
2910 } elsif ($code == 0x000D) {
2911 !!!cp (1010);
2912 !!!parse-error (type => 'CR character reference', line => $l, column => $c);
2913 $code = 0x000A;
2914 } elsif (0x80 <= $code and $code <= 0x9F) {
2915 !!!cp (1011);
2916 !!!parse-error (type => (sprintf 'C1 character reference:U+%04X', $code), line => $l, column => $c);
2917 $code = $c1_entity_char->{$code};
2918 }
2919
2920 return {type => CHARACTER_TOKEN, data => chr $code,
2921 has_reference => 1,
2922 line => $l, column => $c,
2923 };
2924 } # X
2925 } elsif (0x0030 <= $self->{next_char} and
2926 $self->{next_char} <= 0x0039) { # 0..9
## Decimal reference: accumulate digits until a non-digit is seen.
2927 my $code = $self->{next_char} - 0x0030;
2928 !!!next-input-character;
2929
2930 while (0x0030 <= $self->{next_char} and
2931 $self->{next_char} <= 0x0039) { # 0..9
2932 !!!cp (1012);
2933 $code *= 10;
2934 $code += $self->{next_char} - 0x0030;
2935
2936 !!!next-input-character;
2937 }
2938
2939 if ($self->{next_char} == 0x003B) { # ;
2940 !!!cp (1013);
2941 !!!next-input-character;
2942 } else {
2943 !!!cp (1014);
2944 !!!parse-error (type => 'no refc', line => $l, column => $c);
2945 }
2946
## Same code point replacements as in the hexadecimal branch.
2947 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2948 !!!cp (1015);
2949 !!!parse-error (type => (sprintf 'invalid character reference:U+%04X', $code), line => $l, column => $c);
2950 $code = 0xFFFD;
2951 } elsif ($code > 0x10FFFF) {
2952 !!!cp (1016);
2953 !!!parse-error (type => (sprintf 'invalid character reference:U-%08X', $code), line => $l, column => $c);
2954 $code = 0xFFFD;
2955 } elsif ($code == 0x000D) {
2956 !!!cp (1017);
2957 !!!parse-error (type => 'CR character reference', line => $l, column => $c);
2958 $code = 0x000A;
2959 } elsif (0x80 <= $code and $code <= 0x9F) {
2960 !!!cp (1018);
2961 !!!parse-error (type => (sprintf 'C1 character reference:U+%04X', $code), line => $l, column => $c);
2962 $code = $c1_entity_char->{$code};
2963 }
2964
2965 return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1,
2966 line => $l, column => $c,
2967 };
2968 } else {
## "#" followed by neither "x"/"X" nor a digit: hand back the character
## that was read and make "#" the next character again.
2969 !!!cp (1019);
2970 !!!parse-error (type => 'bare nero', line => $l, column => $c);
2971 !!!back-next-input-character ($self->{next_char});
2972 $self->{next_char} = 0x0023; # #
2973 return undef;
2974 }
2975 } elsif ((0x0041 <= $self->{next_char} and
2976 $self->{next_char} <= 0x005A) or
2977 (0x0061 <= $self->{next_char} and
2978 $self->{next_char} <= 0x007A)) {
## Named character reference (first character is A..Z or a..z).
2979 my $entity_name = chr $self->{next_char};
2980 !!!next-input-character;
2981
## |$value| is the text to emit: the raw name so far, or the value of the
## latest matching entity plus any characters read after it.
## |$match|: 0 = no entity matched; 1 = matched including ";";
## -1 = latest character completed a match without ";"; doubled for each
## later character that does not complete a match, so |$match < -1|
## means characters were read after the last match.
2982 my $value = $entity_name;
2983 my $match = 0;
2984 require Whatpm::_NamedEntityList;
2985 our $EntityChar;
2986
2987 while (length $entity_name < 30 and
2988 ## NOTE: Some number greater than the maximum length of entity name
2989 ((0x0041 <= $self->{next_char} and # A
2990 $self->{next_char} <= 0x005A) or # Z
2991 (0x0061 <= $self->{next_char} and # a
2992 $self->{next_char} <= 0x007A) or # z
2993 (0x0030 <= $self->{next_char} and # 0
2994 $self->{next_char} <= 0x0039) or # 9
2995 $self->{next_char} == 0x003B)) { # ;
2996 $entity_name .= chr $self->{next_char};
2997 if (defined $EntityChar->{$entity_name}) {
2998 if ($self->{next_char} == 0x003B) { # ;
2999 !!!cp (1020);
3000 $value = $EntityChar->{$entity_name};
3001 $match = 1;
3002 !!!next-input-character;
3003 last;
3004 } else {
## A name without ";" matched; keep reading in case a longer name matches.
3005 !!!cp (1021);
3006 $value = $EntityChar->{$entity_name};
3007 $match = -1;
3008 !!!next-input-character;
3009 }
3010 } else {
3011 !!!cp (1022);
3012 $value .= chr $self->{next_char};
3013 $match *= 2;
3014 !!!next-input-character;
3015 }
3016 }
3017
3018 if ($match > 0) {
3019 !!!cp (1023);
3020 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
3021 line => $l, column => $c,
3022 };
3023 } elsif ($match < 0) {
3024 !!!parse-error (type => 'no refc', line => $l, column => $c);
3025 if ($in_attr and $match < -1) {
## In an attribute value, with characters following the unterminated
## match: return everything that was read as literal text, unexpanded.
3026 !!!cp (1024);
3027 return {type => CHARACTER_TOKEN, data => '&'.$entity_name,
3028 line => $l, column => $c,
3029 };
3030 } else {
## Otherwise expand; |$value| also carries any characters read after the
## matched name.
3031 !!!cp (1025);
3032 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
3033 line => $l, column => $c,
3034 };
3035 }
3036 } else {
## Nothing matched: the characters read are returned as literal text
## after "&" (they are not pushed back).
3037 !!!cp (1026);
3038 !!!parse-error (type => 'bare ero', line => $l, column => $c);
3039 ## NOTE: "No characters are consumed" in the spec.
3040 return {type => CHARACTER_TOKEN, data => '&'.$value,
3041 line => $l, column => $c,
3042 };
3043 }
3044 } else {
3045 !!!cp (1027);
3046 ## no characters are consumed
3047 !!!parse-error (type => 'bare ero', line => $l, column => $c);
3048 return undef;
3049 }
3050 } # _tokenize_attempt_to_consume_an_entity
3051
## Prepare the target document for the tree construction stage.
##
## The |document| member of the parser object has to be set by the
## caller before this method runs.  Strict error checking is switched
## off on that document and the document is flagged as an HTML document.
## The sub evaluates to whatever the final method call returns.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  $doc->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $doc->manakai_is_html (1); # MUST
} # _initialize_tree_constructor
3060
## Finish the tree construction stage by switching strict error
## checking back on for the parser's |document|.  The sub evaluates to
## whatever that method call returns.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  ## TODO: Turn mutation events on
  $doc->strict_error_checking (1);
} # _terminate_tree_constructor
3066
3067 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
3068
3069 { # tree construction stage
3070 my $token;
3071
## Entry point of the tree construction stage.
##
## Fetches the first token, resets the per-document parser state
## (form element pointer, head element pointer, stack of open elements,
## innerHTML context node) and then runs the insertion modes in order:
## "initial", "before html", and finally the main loop starting in the
## "before head" insertion mode.
sub _construct_tree ($) {
  my $self = shift;

  ## NOTE: It is not defined when an interactive UA makes
  ## $self->{document} available to the user, or when it begins
  ## accepting user input.

  ## NOTE: "Appending a character" means collecting that character
  ## together with all consecutive characters following it and
  ## inserting a single Text node whose data is the concatenation of
  ## all those characters. # MUST

  !!!next-token;

  $self->{form_element} = undef;
  $self->{head_element} = undef;
  $self->{inner_html_node} = undef;
  $self->{open_elements} = [];

  ## NOTE: The "initial" insertion mode.
  $self->_tree_construction_initial; # MUST

  ## NOTE: The "before html" insertion mode.
  $self->_tree_construction_root_element;

  ## NOTE: The "before head" insertion mode and so on.
  $self->{insertion_mode} = BEFORE_HEAD_IM;
  $self->_tree_construction_main;
} # _construct_tree
3100
## Process tokens in the "initial" insertion mode.
##
## Works on the file-scoped current |$token|:
##
##   - DOCTYPE token: reports "not HTML5" unless it is a plain
##     <!DOCTYPE HTML>, appends a DocumentType node to the document,
##     chooses the compatibility mode ("quirks" / "limited quirks") from
##     the name, public identifier and system identifier, then fetches
##     the next token and returns.
##   - Start tag, end tag or end-of-file token: reports "no DOCTYPE",
##     selects quirks mode and returns with the token left for
##     reprocessing.
##   - Character token: leading white space is dropped; if nothing else
##     is left the next token is fetched and the mode is re-entered,
##     otherwise it is handled like a missing DOCTYPE.
##   - Comment token: a Comment node is appended to the document and the
##     mode is re-entered with the next token.
##
## Dies on an unknown token type.
sub _tree_construction_initial ($) {
  my $self = shift;

  ## NOTE: "initial" insertion mode

  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!cp ('t1');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif ($doctype_name ne 'HTML') {
        !!!cp ('t2');
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5', token => $token);
      } else {
        !!!cp ('t3');
      }

      my $doctype = $self->{document}->create_document_type_definition
        ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      ## NOTE: Default value for both |public_id| and |system_id| attributes
      ## are empty strings, so that we don't set any value in missing cases.
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if ($token->{quirks} or $doctype_name ne 'HTML') {
        !!!cp ('t4');
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## The table below is keyed by upper-cased public identifiers,
        ## so the identifier is upper-cased (ASCII only) before lookup.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        if ({
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
          "-//IETF//DTD HTML 2.0//EN" => 1,
          "-//IETF//DTD HTML 2.1E//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN" => 1,
          "-//IETF//DTD HTML 3.0//EN//" => 1,
          "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
          "-//IETF//DTD HTML 3.2//EN" => 1,
          "-//IETF//DTD HTML 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
          "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
          "-//IETF//DTD HTML STRICT//EN" => 1,
          "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
          "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
          "-//IETF//DTD HTML//EN" => 1,
          "-//IETF//DTD HTML//EN//2.0" => 1,
          "-//IETF//DTD HTML//EN//3.0" => 1,
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//EN" => 1,
          "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//EN" => 1,
          "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//EN" => 1,
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
          "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
          "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
          "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
          "-//W3C//DTD HTML 3.2//EN" => 1,
          "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
          "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 19960712//EN" => 1,
          "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
          "-//W3C//DTD W3 HTML//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN" => 1,
          "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
          "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
          "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
          "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
          "HTML" => 1,
        }->{$pubid}) {
          !!!cp ('t5');
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
          if (defined $token->{system_identifier}) {
            !!!cp ('t6');
            $self->{document}->manakai_compat_mode ('quirks');
          } else {
            !!!cp ('t7');
            $self->{document}->manakai_compat_mode ('limited quirks');
          }
        } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
                 $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
          !!!cp ('t8');
          $self->{document}->manakai_compat_mode ('limited quirks');
        } else {
          !!!cp ('t9');
        }
      } else {
        !!!cp ('t10');
      }
      if (defined $token->{system_identifier}) {
        ## The system identifier is compared in lower case.
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          ## TODO: Check the spec: PUBLIC "(limited quirks)" "(quirks)"
          $self->{document}->manakai_compat_mode ('quirks');
          !!!cp ('t11');
        } else {
          !!!cp ('t12');
        }
      } else {
        !!!cp ('t13');
      }

      ## Go to the "before html" insertion mode.
      !!!next-token;
      return;
    } elsif ({
              START_TAG_TOKEN, 1,
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t14');
      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      !!!ack-later;
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          !!!cp ('t15');
          ## Stay in the insertion mode.
          !!!next-token;
          redo INITIAL;
        } else {
          !!!cp ('t16');
        }
      } else {
        !!!cp ('t17');
      }

      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t18');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the insertion mode.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL

  die "$0: _tree_construction_initial: This should be never reached";
} # _tree_construction_initial
3305
## Process tokens in the "before html" insertion mode.
##
## Works on the file-scoped current |$token|:
##
##   - DOCTYPE token: parse error; the token is ignored.
##   - Comment token: a Comment node is appended to the document.
##   - Character token: leading white space is dropped; a token that
##     becomes empty is ignored.
##   - <html> start tag: the root element is created from the token,
##     appended to the document and pushed onto the stack of open
##     elements; the application cache selection callback receives the
##     value of the |manifest| attribute, or |undef| when there is none.
##   - Anything else (remaining character data, other start tags, end
##     tags, end of file): an implied |html| root element is created,
##     the application cache selection callback is invoked once with
##     |undef|, and the token is left for reprocessing.
##
## Dies on an unknown token type.
sub _tree_construction_root_element ($) {
  my $self = shift;

  ## NOTE: "before html" insertion mode.

  B: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      !!!cp ('t19');
      !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
      ## Ignore the token
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t20');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token.

        unless (length $token->{data}) {
          !!!cp ('t21');
          ## Stay in the insertion mode.
          !!!next-token;
          redo B;
        } else {
          !!!cp ('t22');
        }
      } else {
        !!!cp ('t23');
      }

      ## NOTE: The application cache selection callback is not invoked
      ## here.  The shared code below invokes it after the implied root
      ## element has been created; calling it here as well would run it
      ## twice for a single document.

      #
    } elsif ($token->{type} == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html') {
        my $root_element;
        !!!create-element ($root_element, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}},
            [$root_element, $el_category->{html}];

        if ($token->{attributes}->{manifest}) {
          !!!cp ('t24');
          $self->{application_cache_selection}
              ->($token->{attributes}->{manifest}->{value});
          ## ISSUE: Spec is unclear on relative references.
          ## According to Hixie (#whatwg 2008-03-19), it should be
          ## resolved against the base URI of the document in HTML
          ## or xml:base of the element in XHTML.
        } else {
          !!!cp ('t25');
          $self->{application_cache_selection}->(undef);
        }

        !!!nack ('t25c');

        !!!next-token;
        return; ## Go to the "before head" insertion mode.
      } else {
        !!!cp ('t25.1');
        #
      }
    } elsif ({
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t26');
      #
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## Implied root element: reached by every branch above that falls
    ## through instead of returning or restarting the block.
    my $root_element;
    !!!create-element ($root_element, $HTML_NS, 'html',, $token);
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, $el_category->{html}];

    $self->{application_cache_selection}->(undef);

    ## NOTE: Reprocess the token.
    !!!ack-later;
    return; ## Go to the "before head" insertion mode.

    ## ISSUE: There is an issue in the spec
  } # B

  die "$0: _tree_construction_root_element: This should never be reached";
} # _tree_construction_root_element
3400
## Reset the insertion mode appropriately.
##
## Walks the stack of open elements from the current node towards the
## root and sets |$self->{insertion_mode}| according to the first node
## that determines a mode.  When the walk reaches the bottom of the
## stack, the innerHTML context node (|$self->{inner_html_node}|) is
## examined instead; the method dies if no such node is set at that
## point.
##
## The mode is assigned and the method returns in two separate
## statements, so that the return never depends on the truth value of
## the insertion mode constant being assigned.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        !!!cp ('t28');
        $node = $self->{inner_html_node};
      } else {
        die "_reset_insertion_mode: t27";
      }
    }

    ## Step 4..14
    my $new_mode;
    if ($node->[1] & FOREIGN_EL) {
      !!!cp ('t28.1');
      ## NOTE: Strictly speaking, the line below only applies to MathML and
      ## SVG elements.  Currently the HTML syntax supports only MathML and
      ## SVG elements as foreigners.
      $new_mode = $self->{insertion_mode} | IN_FOREIGN_CONTENT_IM;
      ## ISSUE: What is set as the secondary insertion mode?
    } elsif ($node->[1] & TABLE_CELL_EL) {
      if ($last) {
        !!!cp ('t28.2');
        #
      } else {
        !!!cp ('t28.3');
        $new_mode = IN_CELL_IM;
      }
    } else {
      !!!cp ('t28.4');
      $new_mode = {
        select => IN_SELECT_IM,
        ## NOTE: |option| and |optgroup| do not set
        ## insertion mode to "in select" by themselves.
        tr => IN_ROW_IM,
        tbody => IN_TABLE_BODY_IM,
        thead => IN_TABLE_BODY_IM,
        tfoot => IN_TABLE_BODY_IM,
        caption => IN_CAPTION_IM,
        colgroup => IN_COLUMN_GROUP_IM,
        table => IN_TABLE_IM,
        head => IN_BODY_IM, # not in head!
        body => IN_BODY_IM,
        frameset => IN_FRAMESET_IM,
      }->{$node->[0]->manakai_local_name};
    }
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 15
    if ($node->[1] & HTML_EL) {
      unless (defined $self->{head_element}) {
        !!!cp ('t29');
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        ## ISSUE: Can this state be reached?
        !!!cp ('t30');
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    } else {
      !!!cp ('t31');
    }

    ## Step 16
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 17
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 18
    redo S3;
  } # S3

  die "$0: _reset_insertion_mode: This line should never be reached";
} # _reset_insertion_mode
3488
3489 sub _tree_construction_main ($) {
3490 my $self = shift;
3491
3492 my $active_formatting_elements = [];
3493
## Reconstruct the active formatting elements.
##
## Takes one argument, a code reference used to insert a newly cloned
## element node into the tree.  Starting from the last entry of the
## list of active formatting elements, the closure rewinds to the
## earliest entry that follows a marker or an entry still on the stack
## of open elements, then walks forward again: each entry is cloned,
## the clone is inserted and pushed onto the stack of open elements,
## and the list entry is replaced by the clone's stack entry.
my $reconstruct_active_formatting_elements = sub { # MUST
  my $insert = shift;

  ## Step 1
  return unless @$active_formatting_elements;

  ## Step 3
  my $i = -1;
  my $entry = $active_formatting_elements->[$i];

  ## Step 2
  return if $entry->[0] eq '#marker';
  for my $open (@{$self->{open_elements}}) {
    if ($entry->[0] eq $open->[0]) {
      !!!cp ('t32');
      return;
    }
  }

  REWIND: while (1) {
    ## Step 4
    last REWIND if $active_formatting_elements->[0]->[0] eq $entry->[0];

    ## Step 5
    $i--;
    $entry = $active_formatting_elements->[$i];

    ## Step 6
    if ($entry->[0] eq '#marker') {
      !!!cp ('t33_1');
    } else {
      my $is_open = 0;
      for my $open (@{$self->{open_elements}}) {
        if ($entry->[0] eq $open->[0]) {
          !!!cp ('t33');
          $is_open = 1;
          last;
        }
      }
      unless ($is_open) {
        ## NOTE: <!DOCTYPE HTML><p><b><i><u></p> <p>X
        !!!cp ('t35');
        next REWIND;
      }
      !!!cp ('t34');
    }

    ## Step 7
    $i++;
    $entry = $active_formatting_elements->[$i];
    last REWIND;
  } # REWIND

  RECREATE: while (1) {
    ## Step 8
    my $clone = [$entry->[0]->clone_node (0), $entry->[1]];

    ## Step 9
    $insert->($clone->[0]);
    push @{$self->{open_elements}}, $clone;

    ## Step 10
    $active_formatting_elements->[$i] = $self->{open_elements}->[-1];

    ## Step 11
    if ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
      !!!cp ('t37');
      last RECREATE;
    }

    !!!cp ('t36');
    ## Step 7'
    $i++;
    $entry = $active_formatting_elements->[$i];
  } # RECREATE
}; # $reconstruct_active_formatting_elements
3573
## Clear the list of active formatting elements up to the last marker:
## the most recent '#marker' entry and every entry after it are
## removed.  The list is left untouched when it contains no marker.
my $clear_up_to_marker = sub {
  for (my $idx = $#$active_formatting_elements; $idx >= 0; $idx--) {
    next unless $active_formatting_elements->[$idx]->[0] eq '#marker';

    !!!cp ('t38');
    splice @$active_formatting_elements, $idx;
    return;
  }

  !!!cp ('t39');
}; # $clear_up_to_marker
3585
3586 my $insert;
3587
## Generic CDATA / RCDATA element parsing.
##
## Takes the content model flag (CDATA_CONTENT_MODEL or
## RCDATA_CONTENT_MODEL) to apply while the element's content is
## tokenized.  An element is created from the current start tag token
## and inserted through |$insert|; all character tokens that follow are
## gathered into one Text node child.  A token other than the matching
## end tag is reported as a parse error.  Dies if the flag is neither
## of the two content models.
my $parse_rcdata = sub ($) {
  my ($model) = @_;

  ## Step 1
  my $start_tag_name = $token->{tag_name};
  my $el;
  !!!create-element ($el, $HTML_NS, $start_tag_name, $token->{attributes}, $token);

  ## Step 2
  $insert->($el);

  ## Step 3
  $self->{content_model} = $model; # CDATA or RCDATA
  delete $self->{escape}; # MUST

  ## Step 4
  my $buffer = '';
  !!!nack ('t40.1');
  !!!next-token;
  while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
    !!!cp ('t40');
    $buffer .= $token->{data};
    !!!next-token;
  }

  ## Step 5
  if (length $buffer) {
    !!!cp ('t41');
    my $text_node = $self->{document}->create_text_node ($buffer);
    $el->append_child ($text_node);
  }

  ## Step 6
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## Step 7
  my $is_matching_end_tag = ($token->{type} == END_TAG_TOKEN and
                             $token->{tag_name} eq $start_tag_name);
  if ($is_matching_end_tag) {
    !!!cp ('t42');
    ## Ignore the token
  } elsif ($model == CDATA_CONTENT_MODEL) {
    ## NOTE: An end-of-file token.
    !!!cp ('t43');
    !!!parse-error (type => 'in CDATA:#'.$token->{type}, token => $token);
  } elsif ($model == RCDATA_CONTENT_MODEL) {
    ## NOTE: An end-of-file token.
    !!!cp ('t44');
    !!!parse-error (type => 'in RCDATA:#'.$token->{type}, token => $token);
  } else {
    die "$0: $model in parse_rcdata";
  }
  !!!next-token;
}; # $parse_rcdata
3642
## Handle a |script| start tag.
##
## A |script| element is created from the current token, the tokenizer
## is switched to the CDATA content model, and the character tokens
## that follow are appended to the element as text.  A token other
## than the |script| end tag is reported as a parse error.  Unless an
## innerHTML context node is set, the element is then inserted through
## |$insert|.
my $script_start_tag = sub () {
  my $script_el;
  !!!create-element ($script_el, $HTML_NS, 'script', $token->{attributes}, $token);
  ## TODO: mark as "parser-inserted"

  $self->{content_model} = CDATA_CONTENT_MODEL;
  delete $self->{escape}; # MUST

  ## Gather character data until a non-character token arrives or the
  ## tokenizer stops tokenising.
  my @chunks;
  !!!nack ('t45.1');
  !!!next-token;
  while ($token->{type} == CHARACTER_TOKEN) {
    !!!cp ('t45');
    push @chunks, $token->{data};
    !!!next-token;
  }
  my $script_text = join '', @chunks;
  if (length $script_text) {
    !!!cp ('t46');
    $script_el->manakai_append_text ($script_text);
  }

  $self->{content_model} = PCDATA_CONTENT_MODEL;

  my $is_script_end_tag = ($token->{type} == END_TAG_TOKEN and
                           $token->{tag_name} eq 'script');
  if ($is_script_end_tag) {
    !!!cp ('t47');
    ## Ignore the token
  } else {
    !!!cp ('t48');
    !!!parse-error (type => 'in CDATA:#'.$token->{type}, token => $token);
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  }

  if (defined $self->{inner_html_node}) {
    !!!cp ('t49');
    ## TODO: mark as "already executed"
  } else {
    !!!cp ('t50');
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    $insert->($script_el);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  }

  !!!next-token;
}; # $script_start_tag
3694
3695 ## NOTE: $open_tables->[-1]->[0] is the "current table" element node.
3696 ## NOTE: $open_tables->[-1]->[1] is the "tainted" flag.
3697 my $open_tables = [[$self->{open_elements}->[0]->[0]]];
3698
## Handle the end tag of a formatting element: the adoption agency
## algorithm (AAA).
##
## Takes the end tag token being processed.  The closure locates the
## matching entry in the list of active formatting elements (searching
## back to the last marker) and then either
##
##   - reports an "unmatched end tag" parse error and ignores the token
##     when there is no such entry, or the element is on the stack of
##     open elements but not in scope;
##   - reports the same error and removes the entry from the list when
##     the element is not on the stack of open elements at all;
##   - pops the stack down to the formatting element when there is no
##     furthest block; or
##   - re-parents the furthest block and the nodes between it and the
##     formatting element, clones the formatting element, and repeats.
##
## The next token is fetched before the closure returns.
my $formatting_end_tag = sub {
  my $end_tag_token = shift;
  my $tag_name = $end_tag_token->{tag_name};

  ## NOTE: The adoption agency algorithm (AAA).

  FET: {
    ## Step 1
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq '#marker') {
        !!!cp ('t52');
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0]->manakai_local_name
                   eq $tag_name) {
        !!!cp ('t51');
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!cp ('t53');
      !!!parse-error (type => 'unmatched end tag:'.$tag_name, token => $end_tag_token);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          !!!cp ('t54');
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!cp ('t55');
          !!!parse-error (type => 'unmatched end tag:'.$tag_name,
                          token => $end_tag_token);
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ($node->[1] & SCOPING_EL) {
        !!!cp ('t56');
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!cp ('t57');
      !!!parse-error (type => 'unmatched end tag:'.$tag_name,
                      token => $end_tag_token);
      ## Remove the formatting element itself from the list.  It is not
      ## necessarily the last entry, so the entry is removed by the
      ## index found in step 1 rather than popped.
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!cp ('t58');
      !!!parse-error (type => 'not closed',
                      value => $self->{open_elements}->[-1]->[0]
                          ->manakai_local_name,
                      token => $end_tag_token);
    }

    ## Step 2
    ## The stack is scanned from the current node upwards; the last
    ## candidate seen before the formatting element is the furthest
    ## block.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not ($node->[1] & FORMATTING_EL) and
          #not $phrasing_category->{$node->[1]} and
          ($node->[1] & SPECIAL_EL or
           $node->[1] & SCOPING_EL)) { ## Scoping is redundant, maybe
        !!!cp ('t59');
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        !!!cp ('t60');
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      !!!cp ('t61');
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      !!!cp ('t62');
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    my $bookmark_prev_el
      = $active_formatting_elements->[$formatting_element_i_in_active - 1]
        ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      ## A node that is not in the list of active formatting elements
      ## is dropped from the stack and the previous node is examined.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            !!!cp ('t63');
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        !!!cp ('t64');
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      if ($node->[0]->has_child_nodes ()) {
        !!!cp ('t65');
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    if ($common_ancestor_node->[1] & TABLE_ROWS_EL) {
      my $foster_parent_element;
      my $next_sibling;
      OE: for (reverse 0..$#{$self->{open_elements}}) {
        if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
          my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
          if (defined $parent and $parent->node_type == 1) {
            !!!cp ('t65.1');
            $foster_parent_element = $parent;
            $next_sibling = $self->{open_elements}->[$_]->[0];
          } else {
            !!!cp ('t65.2');
            $foster_parent_element
              = $self->{open_elements}->[$_ - 1]->[0];
          }
          last OE;
        }
      } # OE
      $foster_parent_element = $self->{open_elements}->[0]->[0]
        unless defined $foster_parent_element;
      $foster_parent_element->insert_before ($last_node->[0], $next_sibling);
      $open_tables->[-1]->[1] = 1; # tainted
    } else {
      !!!cp ('t65.3');
      $common_ancestor_node->[0]->append_child ($last_node->[0]);
    }

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## |$i| tracks the index of the bookmark's predecessor; it is
    ## decremented when the formatting element is removed from a lower
    ## index after the predecessor has already been located.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t66');
        splice @$active_formatting_elements, $_, 1;
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        !!!cp ('t67');
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t68');
        splice @{$self->{open_elements}}, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        !!!cp ('t69');
        $i = $_;
      }
    } # OE
    ## Insert the clone immediately below the furthest block.  The
    ## replacement length is 0 so that an element already sitting below
    ## the furthest block stays on the stack.
    splice @{$self->{open_elements}}, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
3931
## Default insertion routine: append the given node as the last child
## of the current node (the last entry of the stack of open elements).
## It is also installed as the initial value of |$insert|.
$insert = my $insert_to_current = sub {
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($_[0]);
}; # $insert_to_current
3935
## Insertion routine with foster parenting.
##
## When the current node is not a table-structure element (TABLE_ROWS_EL)
## the child is simply appended to it.  Otherwise the nearest |table|
## entry on the stack of open elements is looked up: the child is
## inserted before that table in its parent if the parent is an element
## node, or else appended to the stack entry preceding the table.  When
## no table entry is found the bottom entry of the stack is used.  The
## current table is flagged as tainted after a foster-parented insert.
my $insert_to_foster = sub {
  my $child = shift;
  my $stack = $self->{open_elements};

  unless ($stack->[-1]->[1] & TABLE_ROWS_EL) {
    !!!cp ('t72');
    return $stack->[-1]->[0]->append_child ($child);
  }

  # MUST
  my $foster_parent_element;
  my $next_sibling;
  for my $idx (reverse 0..$#{$stack}) {
    next unless $stack->[$idx]->[1] & TABLE_EL;

    my $table = $stack->[$idx]->[0];
    my $parent = $table->parent_node;
    if (defined $parent and $parent->node_type == 1) {
      !!!cp ('t70');
      $foster_parent_element = $parent;
      $next_sibling = $table;
    } else {
      !!!cp ('t71');
      $foster_parent_element = $stack->[$idx - 1]->[0];
    }
    last;
  }
  unless (defined $foster_parent_element) {
    $foster_parent_element = $stack->[0]->[0];
  }
  $foster_parent_element->insert_before ($child, $next_sibling);
  $open_tables->[-1]->[1] = 1; # tainted
}; # $insert_to_foster
3967
3968 B: while (1) {
3969 if ($token->{type} == DOCTYPE_TOKEN) {
3970 !!!cp ('t73');
3971 !!!parse-error (type => 'DOCTYPE in the middle', token => $token);
3972 ## Ignore the token
3973 ## Stay in the phase
3974 !!!next-token;
3975 next B;
3976 } elsif ($token->{type} == START_TAG_TOKEN and
3977 $token->{tag_name} eq 'html') {
3978 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
3979 !!!cp ('t79');
3980 !!!parse-error (type => 'after html:html', token => $token);
3981 $self->{insertion_mode} = AFTER_BODY_IM;
3982 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
3983 !!!cp ('t80');
3984 !!!parse-error (type => 'after html:html', token => $token);
3985 $self->{insertion_mode} = AFTER_FRAMESET_IM;
3986 } else {
3987 !!!cp ('t81');
3988 }
3989
3990 !!!cp ('t82');
3991 !!!parse-error (type => 'not first start tag', token => $token);
3992 my $top_el = $self->{open_elements}->[0]->[0];
3993 for my $attr_name (keys %{$token->{attributes}}) {
3994 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
3995 !!!cp ('t84');
3996 $top_el->set_attribute_ns
3997 (undef, [undef, $attr_name],
3998 $token->{attributes}->{$attr_name}->{value});
3999 }
4000 }
4001 !!!nack ('t84.1');
4002 !!!next-token;
4003 next B;
4004 } elsif ($token->{type} == COMMENT_TOKEN) {
4005 my $comment = $self->{document}->create_comment ($token->{data});
4006 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
4007 !!!cp ('t85');
4008 $self->{document}->append_child ($comment);
4009 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
4010 !!!cp ('t86');
4011 $self->{open_elements}->[0]->[0]->append_child ($comment);
4012 } else {
4013 !!!cp ('t87');
4014 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4015 }
4016 !!!next-token;
4017 next B;
4018 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
4019 if ($token->{type} == CHARACTER_TOKEN) {
4020 !!!cp ('t87.1');
4021 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4022 !!!next-token;
4023 next B;
4024 } elsif ($token->{type} == START_TAG_TOKEN) {
4025 if ((not {mglyph => 1, malignmark => 1}->{$token->{tag_name}} and
4026 $self->{open_elements}->[-1]->[1] & FOREIGN_FLOW_CONTENT_EL) or
4027 not ($self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
4028 ($token->{tag_name} eq 'svg' and
4029 $self->{open_elements}->[-1]->[1] & MML_AXML_EL)) {
4030 ## NOTE: "using the rules for secondary insertion mode" then "continue"
4031 !!!cp ('t87.2');
4032 #
4033 } elsif ({
4034 b => 1, big => 1, blockquote => 1, body => 1, br => 1,
4035 center => 1, code => 1, dd => 1, div => 1, dl => 1, em => 1,
4036 embed => 1, font => 1, h1 => 1, h2 => 1, h3 => 1, ## No h4!
4037 h5 => 1, h6 => 1, head => 1, hr => 1, i => 1, img => 1,
4038 li => 1, menu => 1, meta => 1, nobr => 1, p => 1, pre => 1,
4039 ruby => 1, s => 1, small => 1, span => 1, strong => 1,
4040 sub => 1, sup => 1, table => 1, tt => 1, u => 1, ul => 1,
4041 var => 1,
4042 }->{$token->{tag_name}}) {
4043 !!!cp ('t87.2');
4044 !!!parse-error (type => 'not closed',
4045 value => $self->{open_elements}->[-1]->[0]
4046 ->manakai_local_name,
4047 token => $token);
4048
4049 pop @{$self->{open_elements}}
4050 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4051
4052 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4053 ## Reprocess.
4054 next B;
4055 } else {
4056 my $nsuri = $self->{open_elements}->[-1]->[0]->namespace_uri;
4057 my $tag_name = $token->{tag_name};
4058 if ($nsuri eq $SVG_NS) {
4059 $tag_name = {
4060 altglyph => 'altGlyph',
4061 altglyphdef => 'altGlyphDef',
4062 altglyphitem => 'altGlyphItem',
4063 animatecolor => 'animateColor',
4064 animatemotion => 'animateMotion',
4065 animatetransform => 'animateTransform',
4066 clippath => 'clipPath',
4067 feblend => 'feBlend',
4068 fecolormatrix => 'feColorMatrix',
4069 fecomponenttransfer => 'feComponentTransfer',
4070 fecomposite => 'feComposite',
4071 feconvolvematrix => 'feConvolveMatrix',
4072 fediffuselighting => 'feDiffuseLighting',
4073 fedisplacementmap => 'feDisplacementMap',
4074 fedistantlight => 'feDistantLight',
4075 feflood => 'feFlood',
4076 fefunca => 'feFuncA',
4077 fefuncb => 'feFuncB',
4078 fefuncg => 'feFuncG',
4079 fefuncr => 'feFuncR',
4080 fegaussianblur => 'feGaussianBlur',
4081 feimage => 'feImage',
4082 femerge => 'feMerge',
4083 femergenode => 'feMergeNode',
4084 femorphology => 'feMorphology',
4085 feoffset => 'feOffset',
4086 fepointlight => 'fePointLight',
4087 fespecularlighting => 'feSpecularLighting',
4088 fespotlight => 'feSpotLight',
4089 fetile => 'feTile',
4090 feturbulence => 'feTurbulence',
4091 foreignobject => 'foreignObject',
4092 glyphref => 'glyphRef',
4093 lineargradient => 'linearGradient',
4094 radialgradient => 'radialGradient',
4095 #solidcolor => 'solidColor', ## NOTE: Commented in spec (SVG1.2)
4096 textpath => 'textPath',
4097 }->{$tag_name} || $tag_name;
4098 }
4099
4100 ## "adjust SVG attributes" (SVG only) - done in insert-element-f
4101
4102 ## "adjust foreign attributes" - done in insert-element-f
4103
4104 !!!insert-element-f ($nsuri, $tag_name, $token->{attributes}, $token);
4105
4106 if ($self->{self_closing}) {
4107 pop @{$self->{open_elements}};
4108 !!!ack ('t87.3');
4109 } else {
4110 !!!cp ('t87.4');
4111 }
4112
4113 !!!next-token;
4114 next B;
4115 }
4116 } elsif ($token->{type} == END_TAG_TOKEN) {
4117 ## NOTE: "using the rules for secondary insertion mode" then "continue"
4118 !!!cp ('t87.5');
4119 #
4120 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4121 ## NOTE: "using the rules for secondary insertion mode" then "continue"
4122 !!!cp ('t87.6');
4123 #
4124 ## TODO: ...
4125 } else {
4126 die "$0: $token->{type}: Unknown token type";
4127 }
4128 }
4129
4130 if ($self->{insertion_mode} & HEAD_IMS) {
4131 if ($token->{type} == CHARACTER_TOKEN) {
4132 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4133 unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4134 !!!cp ('t88.2');
4135 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4136 } else {
4137 !!!cp ('t88.1');
4138 ## Ignore the token.
4139 !!!next-token;
4140 next B;
4141 }
4142 unless (length $token->{data}) {
4143 !!!cp ('t88');
4144 !!!next-token;
4145 next B;
4146 }
4147 }
4148
4149 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4150 !!!cp ('t89');
4151 ## As if <head>
4152 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4153 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4154 push @{$self->{open_elements}},
4155 [$self->{head_element}, $el_category->{head}];
4156
4157 ## Reprocess in the "in head" insertion mode...
4158 pop @{$self->{open_elements}};
4159
4160 ## Reprocess in the "after head" insertion mode...
4161 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4162 !!!cp ('t90');
4163 ## As if </noscript>
4164 pop @{$self->{open_elements}};
4165 !!!parse-error (type => 'in noscript:#character', token => $token);
4166
4167 ## Reprocess in the "in head" insertion mode...
4168 ## As if </head>
4169 pop @{$self->{open_elements}};
4170
4171 ## Reprocess in the "after head" insertion mode...
4172 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4173 !!!cp ('t91');
4174 pop @{$self->{open_elements}};
4175
4176 ## Reprocess in the "after head" insertion mode...
4177 } else {
4178 !!!cp ('t92');
4179 }
4180
4181 ## "after head" insertion mode
4182 ## As if <body>
4183 !!!insert-element ('body',, $token);
4184 $self->{insertion_mode} = IN_BODY_IM;
4185 ## reprocess
4186 next B;
4187 } elsif ($token->{type} == START_TAG_TOKEN) {
4188 if ($token->{tag_name} eq 'head') {
4189 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4190 !!!cp ('t93');
4191 !!!create-element ($self->{head_element}, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
4192 $self->{open_elements}->[-1]->[0]->append_child
4193 ($self->{head_element});
4194 push @{$self->{open_elements}},
4195 [$self->{head_element}, $el_category->{head}];
4196 $self->{insertion_mode} = IN_HEAD_IM;
4197 !!!nack ('t93.1');
4198 !!!next-token;
4199 next B;
4200 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4201 !!!cp ('t93.2');
4202 !!!parse-error (type => 'after head:head', token => $token); ## TODO: error type
4203 ## Ignore the token
4204 !!!nack ('t93.3');
4205 !!!next-token;
4206 next B;
4207 } else {
4208 !!!cp ('t95');
4209 !!!parse-error (type => 'in head:head', token => $token); # or in head noscript
4210 ## Ignore the token
4211 !!!nack ('t95.1');
4212 !!!next-token;
4213 next B;
4214 }
4215 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4216 !!!cp ('t96');
4217 ## As if <head>
4218 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4219 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4220 push @{$self->{open_elements}},
4221 [$self->{head_element}, $el_category->{head}];
4222
4223 $self->{insertion_mode} = IN_HEAD_IM;
4224 ## Reprocess in the "in head" insertion mode...
4225 } else {
4226 !!!cp ('t97');
4227 }
4228
4229 if ($token->{tag_name} eq 'base') {
4230 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4231 !!!cp ('t98');
4232 ## As if </noscript>
4233 pop @{$self->{open_elements}};
4234 !!!parse-error (type => 'in noscript:base', token => $token);
4235
4236 $self->{insertion_mode} = IN_HEAD_IM;
4237 ## Reprocess in the "in head" insertion mode...
4238 } else {
4239 !!!cp ('t99');
4240 }
4241
4242 ## NOTE: There is a "as if in head" code clone.
4243 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4244 !!!cp ('t100');
4245 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4246 push @{$self->{open_elements}},
4247 [$self->{head_element}, $el_category->{head}];
4248 } else {
4249 !!!cp ('t101');
4250 }
4251 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4252 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4253 pop @{$self->{open_elements}} # <head>
4254 if $self->{insertion_mode} == AFTER_HEAD_IM;
4255 !!!nack ('t101.1');
4256 !!!next-token;
4257 next B;
4258 } elsif ($token->{tag_name} eq 'link') {
4259 ## NOTE: There is a "as if in head" code clone.
4260 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4261 !!!cp ('t102');
4262 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4263 push @{$self->{open_elements}},
4264 [$self->{head_element}, $el_category->{head}];
4265 } else {
4266 !!!cp ('t103');
4267 }
4268 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4269 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4270 pop @{$self->{open_elements}} # <head>
4271 if $self->{insertion_mode} == AFTER_HEAD_IM;
4272 !!!ack ('t103.1');
4273 !!!next-token;
4274 next B;
4275 } elsif ($token->{tag_name} eq 'meta') {
4276 ## NOTE: There is a "as if in head" code clone.
4277 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4278 !!!cp ('t104');
4279 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4280 push @{$self->{open_elements}},
4281 [$self->{head_element}, $el_category->{head}];
4282 } else {
4283 !!!cp ('t105');
4284 }
4285 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4286 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4287
4288 unless ($self->{confident}) {
4289 if ($token->{attributes}->{charset}) {
4290 !!!cp ('t106');
4291 ## NOTE: Whether the encoding is supported or not is handled
4292 ## in the {change_encoding} callback.
4293 $self->{change_encoding}
4294 ->($self, $token->{attributes}->{charset}->{value},
4295 $token);
4296
4297 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4298 ->set_user_data (manakai_has_reference =>
4299 $token->{attributes}->{charset}
4300 ->{has_reference});
4301 } elsif ($token->{attributes}->{content}) {
4302 if ($token->{attributes}->{content}->{value}
4303 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4304 [\x09-\x0D\x20]*=
4305 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4306 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
4307 !!!cp ('t107');
4308 ## NOTE: Whether the encoding is supported or not is handled
4309 ## in the {change_encoding} callback.
4310 $self->{change_encoding}
4311 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
4312 $token);
4313 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4314 ->set_user_data (manakai_has_reference =>
4315 $token->{attributes}->{content}
4316 ->{has_reference});
4317 } else {
4318 !!!cp ('t108');
4319 }
4320 }
4321 } else {
4322 if ($token->{attributes}->{charset}) {
4323 !!!cp ('t109');
4324 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4325 ->set_user_data (manakai_has_reference =>
4326 $token->{attributes}->{charset}
4327 ->{has_reference});
4328 }
4329 if ($token->{attributes}->{content}) {
4330 !!!cp ('t110');
4331 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4332 ->set_user_data (manakai_has_reference =>
4333 $token->{attributes}->{content}
4334 ->{has_reference});
4335 }
4336 }
4337
4338 pop @{$self->{open_elements}} # <head>
4339 if $self->{insertion_mode} == AFTER_HEAD_IM;
4340 !!!ack ('t110.1');
4341 !!!next-token;
4342 next B;
4343 } elsif ($token->{tag_name} eq 'title') {
4344 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4345 !!!cp ('t111');
4346 ## As if </noscript>
4347 pop @{$self->{open_elements}};
4348 !!!parse-error (type => 'in noscript:title', token => $token);
4349
4350 $self->{insertion_mode} = IN_HEAD_IM;
4351 ## Reprocess in the "in head" insertion mode...
4352 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4353 !!!cp ('t112');
4354 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4355 push @{$self->{open_elements}},
4356 [$self->{head_element}, $el_category->{head}];
4357 } else {
4358 !!!cp ('t113');
4359 }
4360
4361 ## NOTE: There is a "as if in head" code clone.
4362 my $parent = defined $self->{head_element} ? $self->{head_element}
4363 : $self->{open_elements}->[-1]->[0];
4364 $parse_rcdata->(RCDATA_CONTENT_MODEL);
4365 pop @{$self->{open_elements}} # <head>
4366 if $self->{insertion_mode} == AFTER_HEAD_IM;
4367 next B;
4368 } elsif ($token->{tag_name} eq 'style') {
4369 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
4370 ## insertion mode IN_HEAD_IM)
4371 ## NOTE: There is a "as if in head" code clone.
4372 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4373 !!!cp ('t114');
4374 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4375 push @{$self->{open_elements}},
4376 [$self->{head_element}, $el_category->{head}];
4377 } else {
4378 !!!cp ('t115');
4379 }
4380 $parse_rcdata->(CDATA_CONTENT_MODEL);
4381 pop @{$self->{open_elements}} # <head>
4382 if $self->{insertion_mode} == AFTER_HEAD_IM;
4383 next B;
4384 } elsif ($token->{tag_name} eq 'noscript') {
4385 if ($self->{insertion_mode} == IN_HEAD_IM) {
4386 !!!cp ('t116');
4387 ## NOTE: and scripting is disabled
4388 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4389 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
4390 !!!nack ('t116.1');
4391 !!!next-token;
4392 next B;
4393 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4394 !!!cp ('t117');
4395 !!!parse-error (type => 'in noscript:noscript', token => $token);
4396 ## Ignore the token
4397 !!!nack ('t117.1');
4398 !!!next-token;
4399 next B;
4400 } else {
4401 !!!cp ('t118');
4402 #
4403 }
4404 } elsif ($token->{tag_name} eq 'script') {
4405 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4406 !!!cp ('t119');
4407 ## As if </noscript>
4408 pop @{$self->{open_elements}};
4409 !!!parse-error (type => 'in noscript:script', token => $token);
4410
4411 $self->{insertion_mode} = IN_HEAD_IM;
4412 ## Reprocess in the "in head" insertion mode...
4413 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4414 !!!cp ('t120');
4415 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4416 push @{$self->{open_elements}},
4417 [$self->{head_element}, $el_category->{head}];
4418 } else {
4419 !!!cp ('t121');
4420 }
4421
4422 ## NOTE: There is a "as if in head" code clone.
4423 $script_start_tag->();
4424 pop @{$self->{open_elements}} # <head>
4425 if $self->{insertion_mode} == AFTER_HEAD_IM;
4426 next B;
4427 } elsif ($token->{tag_name} eq 'body' or
4428 $token->{tag_name} eq 'frameset') {
4429 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4430 !!!cp ('t122');
4431 ## As if </noscript>
4432 pop @{$self->{open_elements}};
4433 !!!parse-error (type => 'in noscript:'.$token->{tag_name}, token => $token);
4434
4435 ## Reprocess in the "in head" insertion mode...
4436 ## As if </head>
4437 pop @{$self->{open_elements}};
4438
4439 ## Reprocess in the "after head" insertion mode...
4440 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4441 !!!cp ('t124');
4442 pop @{$self->{open_elements}};
4443
4444 ## Reprocess in the "after head" insertion mode...
4445 } else {
4446 !!!cp ('t125');
4447 }
4448
4449 ## "after head" insertion mode
4450 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4451 if ($token->{tag_name} eq 'body') {
4452 !!!cp ('t126');
4453 $self->{insertion_mode} = IN_BODY_IM;
4454 } elsif ($token->{tag_name} eq 'frameset') {
4455 !!!cp ('t127');
4456 $self->{insertion_mode} = IN_FRAMESET_IM;
4457 } else {
4458 die "$0: tag name: $self->{tag_name}";
4459 }
4460 !!!nack ('t127.1');
4461 !!!next-token;
4462 next B;
4463 } else {
4464 !!!cp ('t128');
4465 #
4466 }
4467
4468 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4469 !!!cp ('t129');
4470 ## As if </noscript>
4471 pop @{$self->{open_elements}};
4472 !!!parse-error (type => 'in noscript:/'.$token->{tag_name}, token => $token);
4473
4474 ## Reprocess in the "in head" insertion mode...
4475 ## As if </head>
4476 pop @{$self->{open_elements}};
4477
4478 ## Reprocess in the "after head" insertion mode...
4479 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4480 !!!cp ('t130');
4481 ## As if </head>
4482 pop @{$self->{open_elements}};
4483
4484 ## Reprocess in the "after head" insertion mode...
4485 } else {
4486 !!!cp ('t131');
4487 }
4488
4489 ## "after head" insertion mode
4490 ## As if <body>
4491 !!!insert-element ('body',, $token);
4492 $self->{insertion_mode} = IN_BODY_IM;
4493 ## reprocess
4494 !!!ack-later;
4495 next B;
4496 } elsif ($token->{type} == END_TAG_TOKEN) {
4497 if ($token->{tag_name} eq 'head') {
4498 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4499 !!!cp ('t132');
4500 ## As if <head>
4501 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4502 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4503 push @{$self->{open_elements}},
4504 [$self->{head_element}, $el_category->{head}];
4505
4506 ## Reprocess in the "in head" insertion mode...
4507 pop @{$self->{open_elements}};
4508 $self->{insertion_mode} = AFTER_HEAD_IM;
4509 !!!next-token;
4510 next B;
4511 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4512 !!!cp ('t133');
4513 ## As if </noscript>
4514 pop @{$self->{open_elements}};
4515 !!!parse-error (type => 'in noscript:/head', token => $token);
4516
4517 ## Reprocess in the "in head" insertion mode...
4518 pop @{$self->{open_elements}};
4519 $self->{insertion_mode} = AFTER_HEAD_IM;
4520 !!!next-token;
4521 next B;
4522 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4523 !!!cp ('t134');
4524 pop @{$self->{open_elements}};
4525 $self->{insertion_mode} = AFTER_HEAD_IM;
4526 !!!next-token;
4527 next B;
4528 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4529 !!!cp ('t134.1');
4530 !!!parse-error (type => 'unmatched end tag:head', token => $token);
4531 ## Ignore the token
4532 !!!next-token;
4533 next B;
4534 } else {
4535 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4536 }
4537 } elsif ($token->{tag_name} eq 'noscript') {
4538 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4539 !!!cp ('t136');
4540 pop @{$self->{open_elements}};
4541 $self->{insertion_mode} = IN_HEAD_IM;
4542 !!!next-token;
4543 next B;
4544 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM or
4545 $self->{insertion_mode} == AFTER_HEAD_IM) {
4546 !!!cp ('t137');
4547 !!!parse-error (type => 'unmatched end tag:noscript', token => $token);
4548 ## Ignore the token ## ISSUE: An issue in the spec.
4549 !!!next-token;
4550 next B;
4551 } else {
4552 !!!cp ('t138');
4553 #
4554 }
4555 } elsif ({
4556 body => 1, html => 1,
4557 }->{$token->{tag_name}}) {
4558 if ($self->{insertion_mode} == BEFORE_HEAD_IM or
4559 $self->{insertion_mode} == IN_HEAD_IM or
4560 $self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4561 !!!cp ('t140');
4562 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4563 ## Ignore the token
4564 !!!next-token;
4565 next B;
4566 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4567 !!!cp ('t140.1');
4568 !!!parse-error (type => 'unmatched end tag:' . $token->{tag_name}, token => $token);
4569 ## Ignore the token
4570 !!!next-token;
4571 next B;
4572 } else {
4573 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4574 }
4575 } elsif ($token->{tag_name} eq 'p') {
4576 !!!cp ('t142');
4577 !!!parse-error (type => 'unmatched end tag:p', token => $token);
4578 ## Ignore the token
4579 !!!next-token;
4580 next B;
4581 } elsif ($token->{tag_name} eq 'br') {
4582 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4583 !!!cp ('t142.2');
4584 ## (before head) as if <head>, (in head) as if </head>
4585 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4586 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4587 $self->{insertion_mode} = AFTER_HEAD_IM;
4588
4589 ## Reprocess in the "after head" insertion mode...
4590 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4591 !!!cp ('t143.2');
4592 ## As if </head>
4593 pop @{$self->{open_elements}};
4594 $self->{insertion_mode} = AFTER_HEAD_IM;
4595
4596 ## Reprocess in the "after head" insertion mode...
4597 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4598 !!!cp ('t143.3');
4599 ## ISSUE: Two parse errors for <head><noscript></br>
4600 !!!parse-error (type => 'unmatched end tag:br', token => $token);
4601 ## As if </noscript>
4602 pop @{$self->{open_elements}};
4603 $self->{insertion_mode} = IN_HEAD_IM;
4604
4605 ## Reprocess in the "in head" insertion mode...
4606 ## As if </head>
4607 pop @{$self->{open_elements}};
4608 $self->{insertion_mode} = AFTER_HEAD_IM;
4609
4610 ## Reprocess in the "after head" insertion mode...
4611 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4612 !!!cp ('t143.4');
4613 #
4614 } else {
4615 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4616 }
4617
4618 ## ISSUE: does not agree with IE7 - it doesn't ignore </br>.
4619 !!!parse-error (type => 'unmatched end tag:br', token => $token);
4620 ## Ignore the token
4621 !!!next-token;
4622 next B;
4623 } else {
4624 !!!cp ('t145');
4625 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4626 ## Ignore the token
4627 !!!next-token;
4628 next B;
4629 }
4630
4631 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4632 !!!cp ('t146');
4633 ## As if </noscript>
4634 pop @{$self->{open_elements}};
4635 !!!parse-error (type => 'in noscript:/'.$token->{tag_name}, token => $token);
4636
4637 ## Reprocess in the "in head" insertion mode...
4638 ## As if </head>
4639 pop @{$self->{open_elements}};
4640
4641 ## Reprocess in the "after head" insertion mode...
4642 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4643 !!!cp ('t147');
4644 ## As if </head>
4645 pop @{$self->{open_elements}};
4646
4647 ## Reprocess in the "after head" insertion mode...
4648 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4649 ## ISSUE: This case cannot be reached?
4650 !!!cp ('t148');
4651 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4652 ## Ignore the token ## ISSUE: An issue in the spec.
4653 !!!next-token;
4654 next B;
4655 } else {
4656 !!!cp ('t149');
4657 }
4658
4659 ## "after head" insertion mode
4660 ## As if <body>
4661 !!!insert-element ('body',, $token);
4662 $self->{insertion_mode} = IN_BODY_IM;
4663 ## reprocess
4664 next B;
4665 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4666 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4667 !!!cp ('t149.1');
4668
4669 ## NOTE: As if <head>
4670 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4671 $self->{open_elements}->[-1]->[0]->append_child
4672 ($self->{head_element});
4673 #push @{$self->{open_elements}},
4674 # [$self->{head_element}, $el_category->{head}];
4675 #$self->{insertion_mode} = IN_HEAD_IM;
4676 ## NOTE: Reprocess.
4677
4678 ## NOTE: As if </head>
4679 #pop @{$self->{open_elements}};
4680 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4681 ## NOTE: Reprocess.
4682
4683 #
4684 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4685 !!!cp ('t149.2');
4686
4687 ## NOTE: As if </head>
4688 pop @{$self->{open_elements}};
4689 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4690 ## NOTE: Reprocess.
4691
4692 #
4693 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4694 !!!cp ('t149.3');
4695
4696 !!!parse-error (type => 'in noscript:#eof', token => $token);
4697
4698 ## As if </noscript>
4699 pop @{$self->{open_elements}};
4700 #$self->{insertion_mode} = IN_HEAD_IM;
4701 ## NOTE: Reprocess.
4702
4703 ## NOTE: As if </head>
4704 pop @{$self->{open_elements}};
4705 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4706 ## NOTE: Reprocess.
4707
4708 #
4709 } else {
4710 !!!cp ('t149.4');
4711 #
4712 }
4713
4714 ## NOTE: As if <body>
4715 !!!insert-element ('body',, $token);
4716 $self->{insertion_mode} = IN_BODY_IM;
4717 ## NOTE: Reprocess.
4718 next B;
4719 } else {
4720 die "$0: $token->{type}: Unknown token type";
4721 }
4722
4723 ## ISSUE: An issue in the spec.
4724 } elsif ($self->{insertion_mode} & BODY_IMS) {
4725 if ($token->{type} == CHARACTER_TOKEN) {
4726 !!!cp ('t150');
4727 ## NOTE: There is a code clone of "character in body".
4728 $reconstruct_active_formatting_elements->($insert_to_current);
4729
4730 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4731
4732 !!!next-token;
4733 next B;
4734 } elsif ($token->{type} == START_TAG_TOKEN) {
4735 if ({
4736 caption => 1, col => 1, colgroup => 1, tbody => 1,
4737 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
4738 }->{$token->{tag_name}}) {
4739 if ($self->{insertion_mode} == IN_CELL_IM) {
4740 ## have an element in table scope
4741 for (reverse 0..$#{$self->{open_elements}}) {
4742 my $node = $self->{open_elements}->[$_];
4743 if ($node->[1] & TABLE_CELL_EL) {
4744 !!!cp ('t151');
4745
4746 ## Close the cell
4747 !!!back-token; # <x>
4748 $token = {type => END_TAG_TOKEN,
4749 tag_name => $node->[0]->manakai_local_name,
4750 line => $token->{line},
4751 column => $token->{column}};
4752 next B;
4753 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4754 !!!cp ('t152');
4755 ## ISSUE: This case can never be reached, maybe.
4756 last;
4757 }
4758 }
4759
4760 !!!cp ('t153');
4761 !!!parse-error (type => 'start tag not allowed',
4762 value => $token->{tag_name}, token => $token);
4763 ## Ignore the token
4764 !!!nack ('t153.1');
4765 !!!next-token;
4766 next B;
4767 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4768 !!!parse-error (type => 'not closed:caption', token => $token);
4769
4770 ## NOTE: As if </caption>.
4771 ## have a table element in table scope
4772 my $i;
4773 INSCOPE: {
4774 for (reverse 0..$#{$self->{open_elements}}) {
4775 my $node = $self->{open_elements}->[$_];
4776 if ($node->[1] & CAPTION_EL) {
4777 !!!cp ('t155');
4778 $i = $_;
4779 last INSCOPE;
4780 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4781 !!!cp ('t156');
4782 last;
4783 }
4784 }
4785
4786 !!!cp ('t157');
4787 !!!parse-error (type => 'start tag not allowed',
4788 value => $token->{tag_name}, token => $token);
4789 ## Ignore the token
4790 !!!nack ('t157.1');
4791 !!!next-token;
4792 next B;
4793 } # INSCOPE
4794
4795 ## generate implied end tags
4796 while ($self->{open_elements}->[-1]->[1]
4797 & END_TAG_OPTIONAL_EL) {
4798 !!!cp ('t158');
4799 pop @{$self->{open_elements}};
4800 }
4801
4802 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4803 !!!cp ('t159');
4804 !!!parse-error (type => 'not closed',
4805 value => $self->{open_elements}->[-1]->[0]
4806 ->manakai_local_name,
4807 token => $token);
4808 } else {
4809 !!!cp ('t160');
4810 }
4811
4812 splice @{$self->{open_elements}}, $i;
4813
4814 $clear_up_to_marker->();
4815
4816 $self->{insertion_mode} = IN_TABLE_IM;
4817
4818 ## reprocess
4819 !!!ack-later;
4820 next B;
4821 } else {
4822 !!!cp ('t161');
4823 #
4824 }
4825 } else {
4826 !!!cp ('t162');
4827 #
4828 }
4829 } elsif ($token->{type} == END_TAG_TOKEN) {
4830 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
4831 if ($self->{insertion_mode} == IN_CELL_IM) {
4832 ## have an element in table scope
4833 my $i;
4834 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4835 my $node = $self->{open_elements}->[$_];
4836 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
4837 !!!cp ('t163');
4838 $i = $_;
4839 last INSCOPE;
4840 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4841 !!!cp ('t164');
4842 last INSCOPE;
4843 }
4844 } # INSCOPE
4845 unless (defined $i) {
4846 !!!cp ('t165');
4847 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4848 ## Ignore the token
4849 !!!next-token;
4850 next B;
4851 }
4852
4853 ## generate implied end tags
4854 while ($self->{open_elements}->[-1]->[1]
4855 & END_TAG_OPTIONAL_EL) {
4856 !!!cp ('t166');
4857 pop @{$self->{open_elements}};
4858 }
4859
4860 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
4861 ne $token->{tag_name}) {
4862 !!!cp ('t167');
4863 !!!parse-error (type => 'not closed',
4864 value => $self->{open_elements}->[-1]->[0]
4865 ->manakai_local_name,
4866 token => $token);
4867 } else {
4868 !!!cp ('t168');
4869 }
4870
4871 splice @{$self->{open_elements}}, $i;
4872
4873 $clear_up_to_marker->();
4874
4875 $self->{insertion_mode} = IN_ROW_IM;
4876
4877 !!!next-token;
4878 next B;
4879 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4880 !!!cp ('t169');
4881 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4882 ## Ignore the token
4883 !!!next-token;
4884 next B;
4885 } else {
4886 !!!cp ('t170');
4887 #
4888 }
4889 } elsif ($token->{tag_name} eq 'caption') {
4890 if ($self->{insertion_mode} == IN_CAPTION_IM) {
4891 ## have a table element in table scope
4892 my $i;
4893 INSCOPE: {
4894 for (reverse 0..$#{$self->{open_elements}}) {
4895 my $node = $self->{open_elements}->[$_];
4896 if ($node->[1] & CAPTION_EL) {
4897 !!!cp ('t171');
4898 $i = $_;
4899 last INSCOPE;
4900 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4901 !!!cp ('t172');
4902 last;
4903 }
4904 }
4905
4906 !!!cp ('t173');
4907 !!!parse-error (type => 'unmatched end tag',
4908 value => $token->{tag_name}, token => $token);
4909 ## Ignore the token
4910 !!!next-token;
4911 next B;
4912 } # INSCOPE
4913
4914 ## generate implied end tags
4915 while ($self->{open_elements}->[-1]->[1]
4916 & END_TAG_OPTIONAL_EL) {
4917 !!!cp ('t174');
4918 pop @{$self->{open_elements}};
4919 }
4920
4921 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4922 !!!cp ('t175');
4923 !!!parse-error (type => 'not closed',
4924 value => $self->{open_elements}->[-1]->[0]
4925 ->manakai_local_name,
4926 token => $token);
4927 } else {
4928 !!!cp ('t176');
4929 }
4930
4931 splice @{$self->{open_elements}}, $i;
4932
4933 $clear_up_to_marker->();
4934
4935 $self->{insertion_mode} = IN_TABLE_IM;
4936
4937 !!!next-token;
4938 next B;
4939 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
4940 !!!cp ('t177');
4941 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4942 ## Ignore the token
4943 !!!next-token;
4944 next B;
4945 } else {
4946 !!!cp ('t178');
4947 #
4948 }
4949 } elsif ({
4950 table => 1, tbody => 1, tfoot => 1,
4951 thead => 1, tr => 1,
4952 }->{$token->{tag_name}} and
4953 $self->{insertion_mode} == IN_CELL_IM) {
4954 ## have an element in table scope
4955 my $i;
4956 my $tn;
4957 INSCOPE: {
4958 for (reverse 0..$#{$self->{open_elements}}) {
4959 my $node = $self->{open_elements}->[$_];
4960 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
4961 !!!cp ('t179');
4962 $i = $_;
4963
4964 ## Close the cell
4965 !!!back-token; # </x>
4966 $token = {type => END_TAG_TOKEN, tag_name => $tn,
4967 line => $token->{line},
4968 column => $token->{column}};
4969 next B;
4970 } elsif ($node->[1] & TABLE_CELL_EL) {
4971 !!!cp ('t180');
4972 $tn = $node->[0]->manakai_local_name;
4973 ## NOTE: There is exactly one |td| or |th| element
4974 ## in scope in the stack of open elements by definition.
4975 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4976 ## ISSUE: Can this be reached?
4977 !!!cp ('t181');
4978 last;
4979 }
4980 }
4981
4982 !!!cp ('t182');
4983 !!!parse-error (type => 'unmatched end tag',
4984 value => $token->{tag_name}, token => $token);
4985 ## Ignore the token
4986 !!!next-token;
4987 next B;
4988 } # INSCOPE
4989 } elsif ($token->{tag_name} eq 'table' and
4990 $self->{insertion_mode} == IN_CAPTION_IM) {
4991 !!!parse-error (type => 'not closed:caption', token => $token);
4992
4993 ## As if </caption>
4994 ## have a table element in table scope
4995 my $i;
4996 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4997 my $node = $self->{open_elements}->[$_];
4998 if ($node->[1] & CAPTION_EL) {
4999 !!!cp ('t184');
5000 $i = $_;
5001 last INSCOPE;
5002 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5003 !!!cp ('t185');
5004 last INSCOPE;
5005 }
5006 } # INSCOPE
5007 unless (defined $i) {
5008 !!!cp ('t186');
5009 !!!parse-error (type => 'unmatched end tag:caption', token => $token);
5010 ## Ignore the token
5011 !!!next-token;
5012 next B;
5013 }
5014
5015 ## generate implied end tags
5016 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5017 !!!cp ('t187');
5018 pop @{$self->{open_elements}};
5019 }
5020
5021 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5022 !!!cp ('t188');
5023 !!!parse-error (type => 'not closed',
5024 value => $self->{open_elements}->[-1]->[0]
5025 ->manakai_local_name,
5026 token => $token);
5027 } else {
5028 !!!cp ('t189');
5029 }
5030
5031 splice @{$self->{open_elements}}, $i;
5032
5033 $clear_up_to_marker->();
5034
5035 $self->{insertion_mode} = IN_TABLE_IM;
5036
5037 ## reprocess
5038 next B;
5039 } elsif ({
5040 body => 1, col => 1, colgroup => 1, html => 1,
5041 }->{$token->{tag_name}}) {
5042 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
5043 !!!cp ('t190');
5044 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5045 ## Ignore the token
5046 !!!next-token;
5047 next B;
5048 } else {
5049 !!!cp ('t191');
5050 #
5051 }
5052 } elsif ({
5053 tbody => 1, tfoot => 1,
5054 thead => 1, tr => 1,
5055 }->{$token->{tag_name}} and
5056 $self->{insertion_mode} == IN_CAPTION_IM) {
5057 !!!cp ('t192');
5058 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5059 ## Ignore the token
5060 !!!next-token;
5061 next B;
5062 } else {
5063 !!!cp ('t193');
5064 #
5065 }
5066 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5067 for my $entry (@{$self->{open_elements}}) {
5068 unless ($entry->[1] & ALL_END_TAG_OPTIONAL_EL) {
5069 !!!cp ('t75');
5070 !!!parse-error (type => 'in body:#eof', token => $token);
5071 last;
5072 }
5073 }
5074
5075 ## Stop parsing.
5076 last B;
5077 } else {
5078 die "$0: $token->{type}: Unknown token type";
5079 }
5080
5081 $insert = $insert_to_current;
5082 #
5083 } elsif ($self->{insertion_mode} & TABLE_IMS) {
5084 if ($token->{type} == CHARACTER_TOKEN) {
5085 if (not $open_tables->[-1]->[1] and # tainted
5086 $token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5087 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5088
5089 unless (length $token->{data}) {
5090 !!!cp ('t194');
5091 !!!next-token;
5092 next B;
5093 } else {
5094 !!!cp ('t195');
5095 }
5096 }
5097
5098 !!!parse-error (type => 'in table:#character', token => $token);
5099
5100 ## As if in body, but insert into foster parent element
5101 ## ISSUE: Spec says that "whenever a node would be inserted
5102 ## into the current node" while characters might not
5103 ## result in a new Text node.
5104 $reconstruct_active_formatting_elements->($insert_to_foster);
5105
5106 if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
5107 # MUST
5108 my $foster_parent_element;
5109 my $next_sibling;
5110 my $prev_sibling;
5111 OE: for (reverse 0..$#{$self->{open_elements}}) {
5112 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
5113 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
5114 if (defined $parent and $parent->node_type == 1) {
5115 !!!cp ('t196');
5116 $foster_parent_element = $parent;
5117 $next_sibling = $self->{open_elements}->[$_]->[0];
5118 $prev_sibling = $next_sibling->previous_sibling;
5119 } else {
5120 !!!cp ('t197');
5121 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
5122 $prev_sibling = $foster_parent_element->last_child;
5123 }
5124 last OE;
5125 }
5126 } # OE
5127 $foster_parent_element = $self->{open_elements}->[0]->[0] and
5128 $prev_sibling = $foster_parent_element->last_child
5129 unless defined $foster_parent_element;
5130 if (defined $prev_sibling and
5131 $prev_sibling->node_type == 3) {
5132 !!!cp ('t198');
5133 $prev_sibling->manakai_append_text ($token->{data});
5134 } else {
5135 !!!cp ('t199');
5136 $foster_parent_element->insert_before
5137 ($self->{document}->create_text_node ($token->{data}),
5138 $next_sibling);
5139 }
5140 $open_tables->[-1]->[1] = 1; # tainted
5141 } else {
5142 !!!cp ('t200');
5143 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5144 }
5145
5146 !!!next-token;
5147 next B;
5148 } elsif ($token->{type} == START_TAG_TOKEN) {
5149 if ({
5150 tr => ($self->{insertion_mode} != IN_ROW_IM),
5151 th => 1, td => 1,
5152 }->{$token->{tag_name}}) {
5153 if ($self->{insertion_mode} == IN_TABLE_IM) {
5154 ## Clear back to table context
5155 while (not ($self->{open_elements}->[-1]->[1]
5156 & TABLE_SCOPING_EL)) {
5157 !!!cp ('t201');
5158 pop @{$self->{open_elements}};
5159 }
5160
5161 !!!insert-element ('tbody',, $token);
5162 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5163 ## reprocess in the "in table body" insertion mode...
5164 }
5165
5166 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5167 unless ($token->{tag_name} eq 'tr') {
5168 !!!cp ('t202');
5169 !!!parse-error (type => 'missing start tag:tr', token => $token);
5170 }
5171
5172 ## Clear back to table body context
5173 while (not ($self->{open_elements}->[-1]->[1]
5174 & TABLE_ROWS_SCOPING_EL)) {
5175 !!!cp ('t203');
5176 ## ISSUE: Can this case be reached?
5177 pop @{$self->{open_elements}};
5178 }
5179
5180 $self->{insertion_mode} = IN_ROW_IM;
5181 if ($token->{tag_name} eq 'tr') {
5182 !!!cp ('t204');
5183 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5184 !!!nack ('t204');
5185 !!!next-token;
5186 next B;
5187 } else {
5188 !!!cp ('t205');
5189 !!!insert-element ('tr',, $token);
5190 ## reprocess in the "in row" insertion mode
5191 }
5192 } else {
5193 !!!cp ('t206');
5194 }
5195
5196 ## Clear back to table row context
5197 while (not ($self->{open_elements}->[-1]->[1]
5198 & TABLE_ROW_SCOPING_EL)) {
5199 !!!cp ('t207');
5200 pop @{$self->{open_elements}};
5201 }
5202
5203 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5204 $self->{insertion_mode} = IN_CELL_IM;
5205
5206 push @$active_formatting_elements, ['#marker', ''];
5207
5208 !!!nack ('t207.1');
5209 !!!next-token;
5210 next B;
5211 } elsif ({
5212 caption => 1, col => 1, colgroup => 1,
5213 tbody => 1, tfoot => 1, thead => 1,
5214 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5215 }->{$token->{tag_name}}) {
5216 if ($self->{insertion_mode} == IN_ROW_IM) {
5217 ## As if </tr>
5218 ## have an element in table scope
5219 my $i;
5220 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5221 my $node = $self->{open_elements}->[$_];
5222 if ($node->[1] & TABLE_ROW_EL) {
5223 !!!cp ('t208');
5224 $i = $_;
5225 last INSCOPE;
5226 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5227 !!!cp ('t209');
5228 last INSCOPE;
5229 }
5230 } # INSCOPE
5231 unless (defined $i) {
5232 !!!cp ('t210');
5233 ## TODO: This type is wrong.
5234 !!!parse-error (type => 'unmacthed end tag:'.$token->{tag_name}, token => $token);
5235 ## Ignore the token
5236 !!!nack ('t210.1');
5237 !!!next-token;
5238 next B;
5239 }
5240
5241 ## Clear back to table row context
5242 while (not ($self->{open_elements}->[-1]->[1]
5243 & TABLE_ROW_SCOPING_EL)) {
5244 !!!cp ('t211');
5245 ## ISSUE: Can this case be reached?
5246 pop @{$self->{open_elements}};
5247 }
5248
5249 pop @{$self->{open_elements}}; # tr
5250 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5251 if ($token->{tag_name} eq 'tr') {
5252 !!!cp ('t212');
5253 ## reprocess
5254 !!!ack-later;
5255 next B;
5256 } else {
5257 !!!cp ('t213');
5258 ## reprocess in the "in table body" insertion mode...
5259 }
5260 }
5261
5262 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5263 ## have an element in table scope
5264 my $i;
5265 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5266 my $node = $self->{open_elements}->[$_];
5267 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5268 !!!cp ('t214');
5269 $i = $_;
5270 last INSCOPE;
5271 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5272 !!!cp ('t215');
5273 last INSCOPE;
5274 }
5275 } # INSCOPE
5276 unless (defined $i) {
5277 !!!cp ('t216');
5278 ## TODO: This error type is wrong.
5279 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5280 ## Ignore the token
5281 !!!nack ('t216.1');
5282 !!!next-token;
5283 next B;
5284 }
5285
5286 ## Clear back to table body context
5287 while (not ($self->{open_elements}->[-1]->[1]
5288 & TABLE_ROWS_SCOPING_EL)) {
5289 !!!cp ('t217');
5290 ## ISSUE: Can this state be reached?
5291 pop @{$self->{open_elements}};
5292 }
5293
5294 ## As if <{current node}>
5295 ## have an element in table scope
5296 ## true by definition
5297
5298 ## Clear back to table body context
5299 ## nop by definition
5300
5301 pop @{$self->{open_elements}};
5302 $self->{insertion_mode} = IN_TABLE_IM;
5303 ## reprocess in "in table" insertion mode...
5304 } else {
5305 !!!cp ('t218');
5306 }
5307
5308 if ($token->{tag_name} eq 'col') {
5309 ## Clear back to table context
5310 while (not ($self->{open_elements}->[-1]->[1]
5311 & TABLE_SCOPING_EL)) {
5312 !!!cp ('t219');
5313 ## ISSUE: Can this state be reached?
5314 pop @{$self->{open_elements}};
5315 }
5316
5317 !!!insert-element ('colgroup',, $token);
5318 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
5319 ## reprocess
5320 !!!ack-later;
5321 next B;
5322 } elsif ({
5323 caption => 1,
5324 colgroup => 1,
5325 tbody => 1, tfoot => 1, thead => 1,
5326 }->{$token->{tag_name}}) {
5327 ## Clear back to table context
5328 while (not ($self->{open_elements}->[-1]->[1]
5329 & TABLE_SCOPING_EL)) {
5330 !!!cp ('t220');
5331 ## ISSUE: Can this state be reached?
5332 pop @{$self->{open_elements}};
5333 }
5334
5335 push @$active_formatting_elements, ['#marker', '']
5336 if $token->{tag_name} eq 'caption';
5337
5338 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5339 $self->{insertion_mode} = {
5340 caption => IN_CAPTION_IM,
5341 colgroup => IN_COLUMN_GROUP_IM,
5342 tbody => IN_TABLE_BODY_IM,
5343 tfoot => IN_TABLE_BODY_IM,
5344 thead => IN_TABLE_BODY_IM,
5345 }->{$token->{tag_name}};
5346 !!!next-token;
5347 !!!nack ('t220.1');
5348 next B;
5349 } else {
5350 die "$0: in table: <>: $token->{tag_name}";
5351 }
5352 } elsif ($token->{tag_name} eq 'table') {
5353 !!!parse-error (type => 'not closed',
5354 value => $self->{open_elements}->[-1]->[0]
5355 ->manakai_local_name,
5356 token => $token);
5357
5358 ## As if </table>
5359 ## have a table element in table scope
5360 my $i;
5361 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5362 my $node = $self->{open_elements}->[$_];
5363 if ($node->[1] & TABLE_EL) {
5364 !!!cp ('t221');
5365 $i = $_;
5366 last INSCOPE;
5367 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5368 !!!cp ('t222');
5369 last INSCOPE;
5370 }
5371 } # INSCOPE
5372 unless (defined $i) {
5373 !!!cp ('t223');
5374 ## TODO: The following is wrong, maybe.
5375 !!!parse-error (type => 'unmatched end tag:table', token => $token);
5376 ## Ignore tokens </table><table>
5377 !!!nack ('t223.1');
5378 !!!next-token;
5379 next B;
5380 }
5381
5382 ## TODO: The following steps have been removed from the latest spec.
5383 ## generate implied end tags
5384 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5385 !!!cp ('t224');
5386 pop @{$self->{open_elements}};
5387 }
5388
5389 unless ($self->{open_elements}->[-1]->[1] & TABLE_EL) {
5390 !!!cp ('t225');
5391 ## NOTE: |<table><tr><table>|
5392 !!!parse-error (type => 'not closed',
5393 value => $self->{open_elements}->[-1]->[0]
5394 ->manakai_local_name,
5395 token => $token);
5396 } else {
5397 !!!cp ('t226');
5398 }
5399
5400 splice @{$self->{open_elements}}, $i;
5401 pop @{$open_tables};
5402
5403 $self->_reset_insertion_mode;
5404
5405 ## reprocess
5406 !!!ack-later;
5407 next B;
5408 } elsif ($token->{tag_name} eq 'style') {
5409 if (not $open_tables->[-1]->[1]) { # tainted
5410 !!!cp ('t227.8');
5411 ## NOTE: This is a "as if in head" code clone.
5412 $parse_rcdata->(CDATA_CONTENT_MODEL);
5413 next B;
5414 } else {
5415 !!!cp ('t227.7');
5416 #
5417 }
5418 } elsif ($token->{tag_name} eq 'script') {
5419 if (not $open_tables->[-1]->[1]) { # tainted
5420 !!!cp ('t227.6');
5421 ## NOTE: This is a "as if in head" code clone.
5422 $script_start_tag->();
5423 next B;
5424 } else {
5425 !!!cp ('t227.5');
5426 #
5427 }
5428 } elsif ($token->{tag_name} eq 'input') {
5429 if (not $open_tables->[-1]->[1]) { # tainted
5430 if ($token->{attributes}->{type}) { ## TODO: case
5431 my $type = lc $token->{attributes}->{type}->{value};
5432 if ($type eq 'hidden') {
5433 !!!cp ('t227.3');
5434 !!!parse-error (type => 'in table:'.$token->{tag_name}, token => $token);
5435
5436 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5437
5438 ## TODO: form element pointer
5439
5440 pop @{$self->{open_elements}};
5441
5442 !!!next-token;
5443 !!!ack ('t227.2.1');
5444 next B;
5445 } else {
5446 !!!cp ('t227.2');
5447 #
5448 }
5449 } else {
5450 !!!cp ('t227.1');
5451 #
5452 }
5453 } else {
5454 !!!cp ('t227.4');
5455 #
5456 }
5457 } else {
5458 !!!cp ('t227');
5459 #
5460 }
5461
5462 !!!parse-error (type => 'in table:'.$token->{tag_name}, token => $token);
5463
5464 $insert = $insert_to_foster;
5465 #
5466 } elsif ($token->{type} == END_TAG_TOKEN) {
5467 if ($token->{tag_name} eq 'tr' and
5468 $self->{insertion_mode} == IN_ROW_IM) {
5469 ## have an element in table scope
5470 my $i;
5471 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5472 my $node = $self->{open_elements}->[$_];
5473 if ($node->[1] & TABLE_ROW_EL) {
5474 !!!cp ('t228');
5475 $i = $_;
5476 last INSCOPE;
5477 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5478 !!!cp ('t229');
5479 last INSCOPE;
5480 }
5481 } # INSCOPE
5482 unless (defined $i) {
5483 !!!cp ('t230');
5484 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5485 ## Ignore the token
5486 !!!nack ('t230.1');
5487 !!!next-token;
5488 next B;
5489 } else {
5490 !!!cp ('t232');
5491 }
5492
5493 ## Clear back to table row context
5494 while (not ($self->{open_elements}->[-1]->[1]
5495 & TABLE_ROW_SCOPING_EL)) {
5496 !!!cp ('t231');
5497 ## ISSUE: Can this state be reached?
5498 pop @{$self->{open_elements}};
5499 }
5500
5501 pop @{$self->{open_elements}}; # tr
5502 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5503 !!!next-token;
5504 !!!nack ('t231.1');
5505 next B;
5506 } elsif ($token->{tag_name} eq 'table') {
5507 if ($self->{insertion_mode} == IN_ROW_IM) {
5508 ## As if </tr>
5509 ## have an element in table scope
5510 my $i;
5511 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5512 my $node = $self->{open_elements}->[$_];
5513 if ($node->[1] & TABLE_ROW_EL) {
5514 !!!cp ('t233');
5515 $i = $_;
5516 last INSCOPE;
5517 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5518 !!!cp ('t234');
5519 last INSCOPE;
5520 }
5521 } # INSCOPE
5522 unless (defined $i) {
5523 !!!cp ('t235');
5524 ## TODO: The following is wrong.
5525 !!!parse-error (type => 'unmatched end tag:'.$token->{type}, token => $token);
5526 ## Ignore the token
5527 !!!nack ('t236.1');
5528 !!!next-token;
5529 next B;
5530 }
5531
5532 ## Clear back to table row context
5533 while (not ($self->{open_elements}->[-1]->[1]
5534 & TABLE_ROW_SCOPING_EL)) {
5535 !!!cp ('t236');
5536 ## ISSUE: Can this state be reached?
5537 pop @{$self->{open_elements}};
5538 }
5539
5540 pop @{$self->{open_elements}}; # tr
5541 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5542 ## reprocess in the "in table body" insertion mode...
5543 }
5544
5545 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5546 ## have an element in table scope
5547 my $i;
5548 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5549 my $node = $self->{open_elements}->[$_];
5550 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5551 !!!cp ('t237');
5552 $i = $_;
5553 last INSCOPE;
5554 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5555 !!!cp ('t238');
5556 last INSCOPE;
5557 }
5558 } # INSCOPE
5559 unless (defined $i) {
5560 !!!cp ('t239');
5561 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5562 ## Ignore the token
5563 !!!nack ('t239.1');
5564 !!!next-token;
5565 next B;
5566 }
5567
5568 ## Clear back to table body context
5569 while (not ($self->{open_elements}->[-1]->[1]
5570 & TABLE_ROWS_SCOPING_EL)) {
5571 !!!cp ('t240');
5572 pop @{$self->{open_elements}};
5573 }
5574
5575 ## As if <{current node}>
5576 ## have an element in table scope
5577 ## true by definition
5578
5579 ## Clear back to table body context
5580 ## nop by definition
5581
5582 pop @{$self->{open_elements}};
5583 $self->{insertion_mode} = IN_TABLE_IM;
5584 ## reprocess in the "in table" insertion mode...
5585 }
5586
5587 ## NOTE: </table> in the "in table" insertion mode.
5588 ## When you edit the code fragment below, please ensure that
5589 ## the code for <table> in the "in table" insertion mode
5590 ## is synced with it.
5591
5592 ## have a table element in table scope
5593 my $i;
5594 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5595 my $node = $self->{open_elements}->[$_];
5596 if ($node->[1] & TABLE_EL) {
5597 !!!cp ('t241');
5598 $i = $_;
5599 last INSCOPE;
5600 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5601 !!!cp ('t242');
5602 last INSCOPE;
5603 }
5604 } # INSCOPE
5605 unless (defined $i) {
5606 !!!cp ('t243');
5607 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5608 ## Ignore the token
5609 !!!nack ('t243.1');
5610 !!!next-token;
5611 next B;
5612 }
5613
5614 splice @{$self->{open_elements}}, $i;
5615 pop @{$open_tables};
5616
5617 $self->_reset_insertion_mode;
5618
5619 !!!next-token;
5620 next B;
5621 } elsif ({
5622 tbody => 1, tfoot => 1, thead => 1,
5623 }->{$token->{tag_name}} and
5624 $self->{insertion_mode} & ROW_IMS) {
5625 if ($self->{insertion_mode} == IN_ROW_IM) {
5626 ## have an element in table scope
5627 my $i;
5628 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5629 my $node = $self->{open_elements}->[$_];
5630 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5631 !!!cp ('t247');
5632 $i = $_;
5633 last INSCOPE;
5634 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5635 !!!cp ('t248');
5636 last INSCOPE;
5637 }
5638 } # INSCOPE
5639 unless (defined $i) {
5640 !!!cp ('t249');
5641 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5642 ## Ignore the token
5643 !!!nack ('t249.1');
5644 !!!next-token;
5645 next B;
5646 }
5647
5648 ## As if </tr>
5649 ## have an element in table scope
5650 my $i;
5651 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5652 my $node = $self->{open_elements}->[$_];
5653 if ($node->[1] & TABLE_ROW_EL) {
5654 !!!cp ('t250');
5655 $i = $_;
5656 last INSCOPE;
5657 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5658 !!!cp ('t251');
5659 last INSCOPE;
5660 }
5661 } # INSCOPE
5662 unless (defined $i) {
5663 !!!cp ('t252');
5664 !!!parse-error (type => 'unmatched end tag:tr', token => $token);
5665 ## Ignore the token
5666 !!!nack ('t252.1');
5667 !!!next-token;
5668 next B;
5669 }
5670
5671 ## Clear back to table row context
5672 while (not ($self->{open_elements}->[-1]->[1]
5673 & TABLE_ROW_SCOPING_EL)) {
5674 !!!cp ('t253');
5675 ## ISSUE: Can this case be reached?
5676 pop @{$self->{open_elements}};
5677 }
5678
5679 pop @{$self->{open_elements}}; # tr
5680 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5681 ## reprocess in the "in table body" insertion mode...
5682 }
5683
5684 ## have an element in table scope
5685 my $i;
5686 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5687 my $node = $self->{open_elements}->[$_];
5688 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5689 !!!cp ('t254');
5690 $i = $_;
5691 last INSCOPE;
5692 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5693 !!!cp ('t255');
5694 last INSCOPE;
5695 }
5696 } # INSCOPE
5697 unless (defined $i) {
5698 !!!cp ('t256');
5699 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5700 ## Ignore the token
5701 !!!nack ('t256.1');
5702 !!!next-token;
5703 next B;
5704 }
5705
5706 ## Clear back to table body context
5707 while (not ($self->{open_elements}->[-1]->[1]
5708 & TABLE_ROWS_SCOPING_EL)) {
5709 !!!cp ('t257');
5710 ## ISSUE: Can this case be reached?
5711 pop @{$self->{open_elements}};
5712 }
5713
5714 pop @{$self->{open_elements}};
5715 $self->{insertion_mode} = IN_TABLE_IM;
5716 !!!nack ('t257.1');
5717 !!!next-token;
5718 next B;
5719 } elsif ({
5720 body => 1, caption => 1, col => 1, colgroup => 1,
5721 html => 1, td => 1, th => 1,
5722 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5723 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
5724 }->{$token->{tag_name}}) {
5725 !!!cp ('t258');
5726 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5727 ## Ignore the token
5728 !!!nack ('t258.1');
5729 !!!next-token;
5730 next B;
5731 } else {
5732 !!!cp ('t259');
5733 !!!parse-error (type => 'in table:/'.$token->{tag_name}, token => $token);
5734
5735 $insert = $insert_to_foster;
5736 #
5737 }
5738 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5739 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
5740 @{$self->{open_elements}} == 1) { # redundant, maybe
5741 !!!parse-error (type => 'in body:#eof', token => $token);
5742 !!!cp ('t259.1');
5743 #
5744 } else {
5745 !!!cp ('t259.2');
5746 #
5747 }
5748
5749 ## Stop parsing
5750 last B;
5751 } else {
5752 die "$0: $token->{type}: Unknown token type";
5753 }
5754 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
5755 if ($token->{type} == CHARACTER_TOKEN) {
5756 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5757 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5758 unless (length $token->{data}) {
5759 !!!cp ('t260');
5760 !!!next-token;
5761 next B;
5762 }
5763 }
5764
5765 !!!cp ('t261');
5766 #
5767 } elsif ($token->{type} == START_TAG_TOKEN) {
5768 if ($token->{tag_name} eq 'col') {
5769 !!!cp ('t262');
5770 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5771 pop @{$self->{open_elements}};
5772 !!!ack ('t262.1');
5773 !!!next-token;
5774 next B;
5775 } else {
5776 !!!cp ('t263');
5777 #
5778 }
5779 } elsif ($token->{type} == END_TAG_TOKEN) {
5780 if ($token->{tag_name} eq 'colgroup') {
5781 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
5782 !!!cp ('t264');
5783 !!!parse-error (type => 'unmatched end tag:colgroup', token => $token);
5784 ## Ignore the token
5785 !!!next-token;
5786 next B;
5787 } else {
5788 !!!cp ('t265');
5789 pop @{$self->{open_elements}}; # colgroup
5790 $self->{insertion_mode} = IN_TABLE_IM;
5791 !!!next-token;
5792 next B;
5793 }
5794 } elsif ($token->{tag_name} eq 'col') {
5795 !!!cp ('t266');
5796 !!!parse-error (type => 'unmatched end tag:col', token => $token);
5797 ## Ignore the token
5798 !!!next-token;
5799 next B;
5800 } else {
5801 !!!cp ('t267');
5802 #
5803 }
5804 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5805 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
5806 @{$self->{open_elements}} == 1) { # redundant, maybe
5807 !!!cp ('t270.2');
5808 ## Stop parsing.
5809 last B;
5810 } else {
5811 ## NOTE: As if </colgroup>.
5812 !!!cp ('t270.1');
5813 pop @{$self->{open_elements}}; # colgroup
5814 $self->{insertion_mode} = IN_TABLE_IM;
5815 ## Reprocess.
5816 next B;
5817 }
5818 } else {
5819 die "$0: $token->{type}: Unknown token type";
5820 }
5821
5822 ## As if </colgroup>
5823 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
5824 !!!cp ('t269');
5825 ## TODO: Wrong error type?
5826 !!!parse-error (type => 'unmatched end tag:colgroup', token => $token);
5827 ## Ignore the token
5828 !!!nack ('t269.1');
5829 !!!next-token;
5830 next B;
5831 } else {
5832 !!!cp ('t270');
5833 pop @{$self->{open_elements}}; # colgroup
5834 $self->{insertion_mode} = IN_TABLE_IM;
5835 !!!ack-later;
5836 ## reprocess
5837 next B;
5838 }
5839 } elsif ($self->{insertion_mode} & SELECT_IMS) {
5840 if ($token->{type} == CHARACTER_TOKEN) {
5841 !!!cp ('t271');
5842 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5843 !!!next-token;
5844 next B;
5845 } elsif ($token->{type} == START_TAG_TOKEN) {
5846 if ($token->{tag_name} eq 'option') {
5847 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5848 !!!cp ('t272');
5849 ## As if </option>
5850 pop @{$self->{open_elements}};
5851 } else {
5852 !!!cp ('t273');
5853 }
5854
5855 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5856 !!!nack ('t273.1');
5857 !!!next-token;
5858 next B;
5859 } elsif ($token->{tag_name} eq 'optgroup') {
5860 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5861 !!!cp ('t274');
5862 ## As if </option>
5863 pop @{$self->{open_elements}};
5864 } else {
5865 !!!cp ('t275');
5866 }
5867
5868 if ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
5869 !!!cp ('t276');
5870 ## As if </optgroup>
5871 pop @{$self->{open_elements}};
5872 } else {
5873 !!!cp ('t277');
5874 }
5875
5876 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5877 !!!nack ('t277.1');
5878 !!!next-token;
5879 next B;
5880 } elsif ($token->{tag_name} eq 'select' or
5881 $token->{tag_name} eq 'input' or
5882 ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5883 {
5884 caption => 1, table => 1,
5885 tbody => 1, tfoot => 1, thead => 1,
5886 tr => 1, td => 1, th => 1,
5887 }->{$token->{tag_name}})) {
5888 ## TODO: The type below is not good - <select> is replaced by </select>
5889 !!!parse-error (type => 'not closed:select', token => $token);
5890 ## NOTE: As if the token were </select> (<select> case) or
5891 ## as if there were </select> (otherwise).
5892 ## have an element in table scope
5893 my $i;
5894 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5895 my $node = $self->{open_elements}->[$_];
5896 if ($node->[1] & SELECT_EL) {
5897 !!!cp ('t278');
5898 $i = $_;
5899 last INSCOPE;
5900 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5901 !!!cp ('t279');
5902 last INSCOPE;
5903 }
5904 } # INSCOPE
5905 unless (defined $i) {
5906 !!!cp ('t280');
5907 !!!parse-error (type => 'unmatched end tag:select', token => $token);
5908 ## Ignore the token
5909 !!!nack ('t280.1');
5910 !!!next-token;
5911 next B;
5912 }
5913
5914 !!!cp ('t281');
5915 splice @{$self->{open_elements}}, $i;
5916
5917 $self->_reset_insertion_mode;
5918
5919 if ($token->{tag_name} eq 'select') {
5920 !!!nack ('t281.2');
5921 !!!next-token;
5922 next B;
5923 } else {
5924 !!!cp ('t281.1');
5925 !!!ack-later;
5926 ## Reprocess the token.
5927 next B;
5928 }
5929 } else {
5930 !!!cp ('t282');
5931 !!!parse-error (type => 'in select:'.$token->{tag_name}, token => $token);
5932 ## Ignore the token
5933 !!!nack ('t282.1');
5934 !!!next-token;
5935 next B;
5936 }
5937 } elsif ($token->{type} == END_TAG_TOKEN) {
5938 if ($token->{tag_name} eq 'optgroup') {
5939 if ($self->{open_elements}->[-1]->[1] & OPTION_EL and
5940 $self->{open_elements}->[-2]->[1] & OPTGROUP_EL) {
5941 !!!cp ('t283');
5942 ## As if </option>
5943 splice @{$self->{open_elements}}, -2;
5944 } elsif ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
5945 !!!cp ('t284');
5946 pop @{$self->{open_elements}};
5947 } else {
5948 !!!cp ('t285');
5949 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5950 ## Ignore the token
5951 }
5952 !!!nack ('t285.1');
5953 !!!next-token;
5954 next B;
5955 } elsif ($token->{tag_name} eq 'option') {
5956 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5957 !!!cp ('t286');
5958 pop @{$self->{open_elements}};
5959 } else {
5960 !!!cp ('t287');
5961 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5962 ## Ignore the token
5963 }
5964 !!!nack ('t287.1');
5965 !!!next-token;
5966 next B;
5967 } elsif ($token->{tag_name} eq 'select') {
5968 ## have an element in table scope
5969 my $i;
5970 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5971 my $node = $self->{open_elements}->[$_];
5972 if ($node->[1] & SELECT_EL) {
5973 !!!cp ('t288');
5974 $i = $_;
5975 last INSCOPE;
5976 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5977 !!!cp ('t289');
5978 last INSCOPE;
5979 }
5980 } # INSCOPE
5981 unless (defined $i) {
5982 !!!cp ('t290');
5983 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5984 ## Ignore the token
5985 !!!nack ('t290.1');
5986 !!!next-token;
5987 next B;
5988 }
5989
5990 !!!cp ('t291');
5991 splice @{$self->{open_elements}}, $i;
5992
5993 $self->_reset_insertion_mode;
5994
5995 !!!nack ('t291.1');
5996 !!!next-token;
5997 next B;
5998 } elsif ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5999 {
6000 caption => 1, table => 1, tbody => 1,
6001 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
6002 }->{$token->{tag_name}}) {
6003 ## TODO: The following is wrong?
6004 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6005
6006 ## have an element in table scope
6007 my $i;
6008 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6009 my $node = $self->{open_elements}->[$_];
6010 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6011 !!!cp ('t292');
6012 $i = $_;
6013 last INSCOPE;
6014 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6015 !!!cp ('t293');
6016 last INSCOPE;
6017 }
6018 } # INSCOPE
6019 unless (defined $i) {
6020 !!!cp ('t294');
6021 ## Ignore the token
6022 !!!nack ('t294.1');
6023 !!!next-token;
6024 next B;
6025 }
6026
6027 ## As if </select>
6028 ## have an element in table scope
6029 undef $i;
6030 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6031 my $node = $self->{open_elements}->[$_];
6032 if ($node->[1] & SELECT_EL) {
6033 !!!cp ('t295');
6034 $i = $_;
6035 last INSCOPE;
6036 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6037 ## ISSUE: Can this state be reached?
6038 !!!cp ('t296');
6039 last INSCOPE;
6040 }
6041 } # INSCOPE
6042 unless (defined $i) {
6043 !!!cp ('t297');
6044 ## TODO: The following error type is correct?
6045 !!!parse-error (type => 'unmatched end tag:select', token => $token);
6046 ## Ignore the </select> token
6047 !!!nack ('t297.1');
6048 !!!next-token; ## TODO: ok?
6049 next B;
6050 }
6051
6052 !!!cp ('t298');
6053 splice @{$self->{open_elements}}, $i;
6054
6055 $self->_reset_insertion_mode;
6056
6057 !!!ack-later;
6058 ## reprocess
6059 next B;
6060 } else {
6061 !!!cp ('t299');
6062 !!!parse-error (type => 'in select:/'.$token->{tag_name}, token => $token);
6063 ## Ignore the token
6064 !!!nack ('t299.3');
6065 !!!next-token;
6066 next B;
6067 }
6068 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6069 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6070 @{$self->{open_elements}} == 1) { # redundant, maybe
6071 !!!cp ('t299.1');
6072 !!!parse-error (type => 'in body:#eof', token => $token);
6073 } else {
6074 !!!cp ('t299.2');
6075 }
6076
6077 ## Stop parsing.
6078 last B;
6079 } else {
6080 die "$0: $token->{type}: Unknown token type";
6081 }
6082 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
6083 if ($token->{type} == CHARACTER_TOKEN) {
6084 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6085 my $data = $1;
6086 ## As if in body
6087 $reconstruct_active_formatting_elements->($insert_to_current);
6088
6089 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6090
6091 unless (length $token->{data}) {
6092 !!!cp ('t300');
6093 !!!next-token;
6094 next B;
6095 }
6096 }
6097
6098 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6099 !!!cp ('t301');
6100 !!!parse-error (type => 'after html:#character', token => $token);
6101
6102 ## Reprocess in the "after body" insertion mode.
6103 } else {
6104 !!!cp ('t302');
6105 }
6106
6107 ## "after body" insertion mode
6108 !!!parse-error (type => 'after body:#character', token => $token);
6109
6110 $self->{insertion_mode} = IN_BODY_IM;
6111 ## reprocess
6112 next B;
6113 } elsif ($token->{type} == START_TAG_TOKEN) {
6114 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6115 !!!cp ('t303');
6116 !!!parse-error (type => 'after html:'.$token->{tag_name}, token => $token);
6117
6118 ## Reprocess in the "after body" insertion mode.
6119 } else {
6120 !!!cp ('t304');
6121 }
6122
6123 ## "after body" insertion mode
6124 !!!parse-error (type => 'after body:'.$token->{tag_name}, token => $token);
6125
6126 $self->{insertion_mode} = IN_BODY_IM;
6127 !!!ack-later;
6128 ## reprocess
6129 next B;
6130 } elsif ($token->{type} == END_TAG_TOKEN) {
6131 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6132 !!!cp ('t305');
6133 !!!parse-error (type => 'after html:/'.$token->{tag_name}, token => $token);
6134
6135 $self->{insertion_mode} = AFTER_BODY_IM;
6136 ## Reprocess in the "after body" insertion mode.
6137 } else {
6138 !!!cp ('t306');
6139 }
6140
6141 ## "after body" insertion mode
6142 if ($token->{tag_name} eq 'html') {
6143 if (defined $self->{inner_html_node}) {
6144 !!!cp ('t307');
6145 !!!parse-error (type => 'unmatched end tag:html', token => $token);
6146 ## Ignore the token
6147 !!!next-token;
6148 next B;
6149 } else {
6150 !!!cp ('t308');
6151 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
6152 !!!next-token;
6153 next B;
6154 }
6155 } else {
6156 !!!cp ('t309');
6157 !!!parse-error (type => 'after body:/'.$token->{tag_name}, token => $token);
6158
6159 $self->{insertion_mode} = IN_BODY_IM;
6160 ## reprocess
6161 next B;
6162 }
6163 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6164 !!!cp ('t309.2');
6165 ## Stop parsing
6166 last B;
6167 } else {
6168 die "$0: $token->{type}: Unknown token type";
6169 }
6170 } elsif ($self->{insertion_mode} & FRAME_IMS) {
6171 if ($token->{type} == CHARACTER_TOKEN) {
6172 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6173 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6174
6175 unless (length $token->{data}) {
6176 !!!cp ('t310');
6177 !!!next-token;
6178 next B;
6179 }
6180 }
6181
6182 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
6183 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6184 !!!cp ('t311');
6185 !!!parse-error (type => 'in frameset:#character', token => $token);
6186 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6187 !!!cp ('t312');
6188 !!!parse-error (type => 'after frameset:#character', token => $token);
6189 } else { # "after html frameset"
6190 !!!cp ('t313');
6191 !!!parse-error (type => 'after html:#character', token => $token);
6192
6193 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6194 ## Reprocess in the "after frameset" insertion mode.
6195 !!!parse-error (type => 'after frameset:#character', token => $token);
6196 }
6197
6198 ## Ignore the token.
6199 if (length $token->{data}) {
6200 !!!cp ('t314');
6201 ## reprocess the rest of characters
6202 } else {
6203 !!!cp ('t315');
6204 !!!next-token;
6205 }
6206 next B;
6207 }
6208
6209 die qq[$0: Character "$token->{data}"];
6210 } elsif ($token->{type} == START_TAG_TOKEN) {
6211 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
6212 !!!cp ('t316');
6213 !!!parse-error (type => 'after html:'.$token->{tag_name}, token => $token);
6214
6215 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6216 ## Process in the "after frameset" insertion mode.
6217 } else {
6218 !!!cp ('t317');
6219 }
6220
6221 if ($token->{tag_name} eq 'frameset' and
6222 $self->{insertion_mode} == IN_FRAMESET_IM) {
6223 !!!cp ('t318');
6224 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6225 !!!nack ('t318.1');
6226 !!!next-token;
6227 next B;
6228 } elsif ($token->{tag_name} eq 'frame' and
6229 $self->{insertion_mode} == IN_FRAMESET_IM) {
6230 !!!cp ('t319');
6231 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6232 pop @{$self->{open_elements}};
6233 !!!ack ('t319.1');
6234 !!!next-token;
6235 next B;
6236 } elsif ($token->{tag_name} eq 'noframes') {
6237 !!!cp ('t320');
6238 ## NOTE: As if in body.
6239 $parse_rcdata->(CDATA_CONTENT_MODEL);
6240 next B;
6241 } else {
6242 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6243 !!!cp ('t321');
6244 !!!parse-error (type => 'in frameset:'.$token->{tag_name}, token => $token);
6245 } else {
6246 !!!cp ('t322');
6247 !!!parse-error (type => 'after frameset:'.$token->{tag_name}, token => $token);
6248 }
6249 ## Ignore the token
6250 !!!nack ('t322.1');
6251 !!!next-token;
6252 next B;
6253 }
6254 } elsif ($token->{type} == END_TAG_TOKEN) {
6255 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
6256 !!!cp ('t323');
6257 !!!parse-error (type => 'after html:/'.$token->{tag_name}, token => $token);
6258
6259 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6260 ## Process in the "after frameset" insertion mode.
6261 } else {
6262 !!!cp ('t324');
6263 }
6264
6265 if ($token->{tag_name} eq 'frameset' and
6266 $self->{insertion_mode} == IN_FRAMESET_IM) {
6267 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
6268 @{$self->{open_elements}} == 1) {
6269 !!!cp ('t325');
6270 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6271 ## Ignore the token
6272 !!!next-token;
6273 } else {
6274 !!!cp ('t326');
6275 pop @{$self->{open_elements}};
6276 !!!next-token;
6277 }
6278
6279 if (not defined $self->{inner_html_node} and
6280 not ($self->{open_elements}->[-1]->[1] & FRAMESET_EL)) {
6281 !!!cp ('t327');
6282 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6283 } else {
6284 !!!cp ('t328');
6285 }
6286 next B;
6287 } elsif ($token->{tag_name} eq 'html' and
6288 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
6289 !!!cp ('t329');
6290 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
6291 !!!next-token;
6292 next B;
6293 } else {
6294 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6295 !!!cp ('t330');
6296 !!!parse-error (type => 'in frameset:/'.$token->{tag_name}, token => $token);
6297 } else {
6298 !!!cp ('t331');
6299 !!!parse-error (type => 'after frameset:/'.$token->{tag_name}, token => $token);
6300 }
6301 ## Ignore the token
6302 !!!next-token;
6303 next B;
6304 }
6305 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6306 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6307 @{$self->{open_elements}} == 1) { # redundant, maybe
6308 !!!cp ('t331.1');
6309 !!!parse-error (type => 'in body:#eof', token => $token);
6310 } else {
6311 !!!cp ('t331.2');
6312 }
6313
6314 ## Stop parsing
6315 last B;
6316 } else {
6317 die "$0: $token->{type}: Unknown token type";
6318 }
6319
6320 ## ISSUE: An issue in spec here
6321 } else {
6322 die "$0: $self->{insertion_mode}: Unknown insertion mode";
6323 }
6324
6325 ## "in body" insertion mode
6326 if ($token->{type} == START_TAG_TOKEN) {
6327 if ($token->{tag_name} eq 'script') {
6328 !!!cp ('t332');
6329 ## NOTE: This is an "as if in head" code clone
6330 $script_start_tag->();
6331 next B;
6332 } elsif ($token->{tag_name} eq 'style') {
6333 !!!cp ('t333');
6334 ## NOTE: This is an "as if in head" code clone
6335 $parse_rcdata->(CDATA_CONTENT_MODEL);
6336 next B;
6337 } elsif ({
6338 base => 1, link => 1,
6339 }->{$token->{tag_name}}) {
6340 !!!cp ('t334');
6341 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6342 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6343 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6344 !!!ack ('t334.1');
6345 !!!next-token;
6346 next B;
6347 } elsif ($token->{tag_name} eq 'meta') {
6348 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6349 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6350 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6351
6352 unless ($self->{confident}) {
6353 if ($token->{attributes}->{charset}) {
6354 !!!cp ('t335');
6355 ## NOTE: Whether the encoding is supported or not is handled
6356 ## in the {change_encoding} callback.
6357 $self->{change_encoding}
6358 ->($self, $token->{attributes}->{charset}->{value}, $token);
6359
6360 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6361 ->set_user_data (manakai_has_reference =>
6362 $token->{attributes}->{charset}
6363 ->{has_reference});
6364 } elsif ($token->{attributes}->{content}) {
6365 if ($token->{attributes}->{content}->{value}
6366 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
6367 [\x09-\x0D\x20]*=
6368 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
6369 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
6370 !!!cp ('t336');
6371 ## NOTE: Whether the encoding is supported or not is handled
6372 ## in the {change_encoding} callback.
6373 $self->{change_encoding}
6374 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
6375 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6376 ->set_user_data (manakai_has_reference =>
6377 $token->{attributes}->{content}
6378 ->{has_reference});
6379 }
6380 }
6381 } else {
6382 if ($token->{attributes}->{charset}) {
6383 !!!cp ('t337');
6384 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6385 ->set_user_data (manakai_has_reference =>
6386 $token->{attributes}->{charset}
6387 ->{has_reference});
6388 }
6389 if ($token->{attributes}->{content}) {
6390 !!!cp ('t338');
6391 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6392 ->set_user_data (manakai_has_reference =>
6393 $token->{attributes}->{content}
6394 ->{has_reference});
6395 }
6396 }
6397
6398 !!!ack ('t338.1');
6399 !!!next-token;
6400 next B;
6401 } elsif ($token->{tag_name} eq 'title') {
6402 !!!cp ('t341');
6403 ## NOTE: This is an "as if in head" code clone
6404 $parse_rcdata->(RCDATA_CONTENT_MODEL);
6405 next B;
6406 } elsif ($token->{tag_name} eq 'body') {
6407 !!!parse-error (type => 'in body:body', token => $token);
6408
6409 if (@{$self->{open_elements}} == 1 or
6410 not ($self->{open_elements}->[1]->[1] & BODY_EL)) {
6411 !!!cp ('t342');
6412 ## Ignore the token
6413 } else {
6414 my $body_el = $self->{open_elements}->[1]->[0];
6415 for my $attr_name (keys %{$token->{attributes}}) {
6416 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
6417 !!!cp ('t343');
6418 $body_el->set_attribute_ns
6419 (undef, [undef, $attr_name],
6420 $token->{attributes}->{$attr_name}->{value});
6421 }
6422 }
6423 }
6424 !!!nack ('t343.1');
6425 !!!next-token;
6426 next B;
6427 } elsif ({
6428 address => 1, blockquote => 1, center => 1, dir => 1,
6429 div => 1, dl => 1, fieldset => 1,
6430 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6431 menu => 1, ol => 1, p => 1, ul => 1,
6432 pre => 1, listing => 1,
6433 form => 1,
6434 table => 1,
6435 hr => 1,
6436 }->{$token->{tag_name}}) {
6437 if ($token->{tag_name} eq 'form' and defined $self->{form_element}) {
6438 !!!cp ('t350');
6439 !!!parse-error (type => 'in form:form', token => $token);
6440 ## Ignore the token
6441 !!!nack ('t350.1');
6442 !!!next-token;
6443 next B;
6444 }
6445
6446 ## has a p element in scope
6447 INSCOPE: for (reverse @{$self->{open_elements}}) {
6448 if ($_->[1] & P_EL) {
6449 !!!cp ('t344');
6450 !!!back-token; # <form>
6451 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6452 line => $token->{line}, column => $token->{column}};
6453 next B;
6454 } elsif ($_->[1] & SCOPING_EL) {
6455 !!!cp ('t345');
6456 last INSCOPE;
6457 }
6458 } # INSCOPE
6459
6460 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6461 if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') {
6462 !!!nack ('t346.1');
6463 !!!next-token;
6464 if ($token->{type} == CHARACTER_TOKEN) {
6465 $token->{data} =~ s/^\x0A//;
6466 unless (length $token->{data}) {
6467 !!!cp ('t346');
6468 !!!next-token;
6469 } else {
6470 !!!cp ('t349');
6471 }
6472 } else {
6473 !!!cp ('t348');
6474 }
6475 } elsif ($token->{tag_name} eq 'form') {
6476 !!!cp ('t347.1');
6477 $self->{form_element} = $self->{open_elements}->[-1]->[0];
6478
6479 !!!nack ('t347.2');
6480 !!!next-token;
6481 } elsif ($token->{tag_name} eq 'table') {
6482 !!!cp ('t382');
6483 push @{$open_tables}, [$self->{open_elements}->[-1]->[0]];
6484
6485 $self->{insertion_mode} = IN_TABLE_IM;
6486
6487 !!!nack ('t382.1');
6488 !!!next-token;
6489 } elsif ($token->{tag_name} eq 'hr') {
6490 !!!cp ('t386');
6491 pop @{$self->{open_elements}};
6492
6493 !!!nack ('t386.1');
6494 !!!next-token;
6495 } else {
6496 !!!nack ('t347.1');
6497 !!!next-token;
6498 }
6499 next B;
6500 } elsif ({li => 1, dt => 1, dd => 1}->{$token->{tag_name}}) {
6501 ## has a p element in scope
6502 INSCOPE: for (reverse @{$self->{open_elements}}) {
6503 if ($_->[1] & P_EL) {
6504 !!!cp ('t353');
6505 !!!back-token; # <x>
6506 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6507 line => $token->{line}, column => $token->{column}};
6508 next B;
6509 } elsif ($_->[1] & SCOPING_EL) {
6510 !!!cp ('t354');
6511 last INSCOPE;
6512 }
6513 } # INSCOPE
6514
6515 ## Step 1
6516 my $i = -1;
6517 my $node = $self->{open_elements}->[$i];
6518 my $li_or_dtdd = {li => {li => 1},
6519 dt => {dt => 1, dd => 1},
6520 dd => {dt => 1, dd => 1}}->{$token->{tag_name}};
6521 LI: {
6522 ## Step 2
6523 if ($li_or_dtdd->{$node->[0]->manakai_local_name}) {
6524 if ($i != -1) {
6525 !!!cp ('t355');
6526 !!!parse-error (type => 'not closed',
6527 value => $self->{open_elements}->[-1]->[0]
6528 ->manakai_local_name,
6529 token => $token);
6530 } else {
6531 !!!cp ('t356');
6532 }
6533 splice @{$self->{open_elements}}, $i;
6534 last LI;
6535 } else {
6536 !!!cp ('t357');
6537 }
6538
6539 ## Step 3
6540 if (not ($node->[1] & FORMATTING_EL) and
6541 #not $phrasing_category->{$node->[1]} and
6542 ($node->[1] & SPECIAL_EL or
6543 $node->[1] & SCOPING_EL) and
6544 not ($node->[1] & ADDRESS_EL) and
6545 not ($node->[1] & DIV_EL)) {
6546 !!!cp ('t358');
6547 last LI;
6548 }
6549
6550 !!!cp ('t359');
6551 ## Step 4
6552 $i--;
6553 $node = $self->{open_elements}->[$i];
6554 redo LI;
6555 } # LI
6556
6557 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6558 !!!nack ('t359.1');
6559 !!!next-token;
6560 next B;
6561 } elsif ($token->{tag_name} eq 'plaintext') {
6562 ## has a p element in scope
6563 INSCOPE: for (reverse @{$self->{open_elements}}) {
6564 if ($_->[1] & P_EL) {
6565 !!!cp ('t367');
6566 !!!back-token; # <plaintext>
6567 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6568 line => $token->{line}, column => $token->{column}};
6569 next B;
6570 } elsif ($_->[1] & SCOPING_EL) {
6571 !!!cp ('t368');
6572 last INSCOPE;
6573 }
6574 } # INSCOPE
6575
6576 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6577
6578 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
6579
6580 !!!nack ('t368.1');
6581 !!!next-token;
6582 next B;
6583 } elsif ($token->{tag_name} eq 'a') {
6584 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
6585 my $node = $active_formatting_elements->[$i];
6586 if ($node->[1] & A_EL) {
6587 !!!cp ('t371');
6588 !!!parse-error (type => 'in a:a', token => $token);
6589
6590 !!!back-token; # <a>
6591 $token = {type => END_TAG_TOKEN, tag_name => 'a',
6592 line => $token->{line}, column => $token->{column}};
6593 $formatting_end_tag->($token);
6594
6595 AFE2: for (reverse 0..$#$active_formatting_elements) {
6596 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
6597 !!!cp ('t372');
6598 splice @$active_formatting_elements, $_, 1;
6599 last AFE2;
6600 }
6601 } # AFE2
6602 OE: for (reverse 0..$#{$self->{open_elements}}) {
6603 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
6604 !!!cp ('t373');
6605 splice @{$self->{open_elements}}, $_, 1;
6606 last OE;
6607 }
6608 } # OE
6609 last AFE;
6610 } elsif ($node->[0] eq '#marker') {
6611 !!!cp ('t374');
6612 last AFE;
6613 }
6614 } # AFE
6615
6616 $reconstruct_active_formatting_elements->($insert_to_current);
6617
6618 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6619 push @$active_formatting_elements, $self->{open_elements}->[-1];
6620
6621 !!!nack ('t374.1');
6622 !!!next-token;
6623 next B;
6624 } elsif ($token->{tag_name} eq 'nobr') {
6625 $reconstruct_active_formatting_elements->($insert_to_current);
6626
6627 ## has a |nobr| element in scope
6628 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6629 my $node = $self->{open_elements}->[$_];
6630 if ($node->[1] & NOBR_EL) {
6631 !!!cp ('t376');
6632 !!!parse-error (type => 'in nobr:nobr', token => $token);
6633 !!!back-token; # <nobr>
6634 $token = {type => END_TAG_TOKEN, tag_name => 'nobr',
6635 line => $token->{line}, column => $token->{column}};
6636 next B;
6637 } elsif ($node->[1] & SCOPING_EL) {
6638 !!!cp ('t377');
6639 last INSCOPE;
6640 }
6641 } # INSCOPE
6642
6643 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6644 push @$active_formatting_elements, $self->{open_elements}->[-1];
6645
6646 !!!nack ('t377.1');
6647 !!!next-token;
6648 next B;
6649 } elsif ($token->{tag_name} eq 'button') {
6650 ## has a button element in scope
6651 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6652 my $node = $self->{open_elements}->[$_];
6653 if ($node->[1] & BUTTON_EL) {
6654 !!!cp ('t378');
6655 !!!parse-error (type => 'in button:button', token => $token);
6656 !!!back-token; # <button>
6657 $token = {type => END_TAG_TOKEN, tag_name => 'button',
6658 line => $token->{line}, column => $token->{column}};
6659 next B;
6660 } elsif ($node->[1] & SCOPING_EL) {
6661 !!!cp ('t379');
6662 last INSCOPE;
6663 }
6664 } # INSCOPE
6665
6666 $reconstruct_active_formatting_elements->($insert_to_current);
6667
6668 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6669
6670 ## TODO: associate with $self->{form_element} if defined
6671
6672 push @$active_formatting_elements, ['#marker', ''];
6673
6674 !!!nack ('t379.1');
6675 !!!next-token;
6676 next B;
6677 } elsif ({
6678 xmp => 1,
6679 iframe => 1,
6680 noembed => 1,
6681 noframes => 1,
6682 noscript => 0, ## TODO: 1 if scripting is enabled
6683 }->{$token->{tag_name}}) {
6684 if ($token->{tag_name} eq 'xmp') {
6685 !!!cp ('t381');
6686 $reconstruct_active_formatting_elements->($insert_to_current);
6687 } else {
6688 !!!cp ('t399');
6689 }
6690 ## NOTE: There is an "as if in body" code clone.
6691 $parse_rcdata->(CDATA_CONTENT_MODEL);
6692 next B;
6693 } elsif ($token->{tag_name} eq 'isindex') {
6694 !!!parse-error (type => 'isindex', token => $token);
6695
6696 if (defined $self->{form_element}) {
6697 !!!cp ('t389');
6698 ## Ignore the token
6699 !!!nack ('t389'); ## NOTE: Not acknowledged.
6700 !!!next-token;
6701 next B;
6702 } else {
6703 my $at = $token->{attributes};
6704 my $form_attrs;
6705 $form_attrs->{action} = $at->{action} if $at->{action};
6706 my $prompt_attr = $at->{prompt};
6707 $at->{name} = {name => 'name', value => 'isindex'};
6708 delete $at->{action};
6709 delete $at->{prompt};
6710 my @tokens = (
6711 {type => START_TAG_TOKEN, tag_name => 'form',
6712 attributes => $form_attrs,
6713 line => $token->{line}, column => $token->{column}},
6714 {type => START_TAG_TOKEN, tag_name => 'hr',
6715 line => $token->{line}, column => $token->{column}},
6716 {type => START_TAG_TOKEN, tag_name => 'p',
6717 line => $token->{line}, column => $token->{column}},
6718 {type => START_TAG_TOKEN, tag_name => 'label',
6719 line => $token->{line}, column => $token->{column}},
6720 );
6721 if ($prompt_attr) {
6722 !!!cp ('t390');
6723 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value},
6724 #line => $token->{line}, column => $token->{column},
6725 };
6726 } else {
6727 !!!cp ('t391');
6728 push @tokens, {type => CHARACTER_TOKEN,
6729 data => 'This is a searchable index. Insert your search keywords here: ',
6730 #line => $token->{line}, column => $token->{column},
6731 }; # SHOULD
6732 ## TODO: make this configurable
6733 }
6734 push @tokens,
6735 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at,
6736 line => $token->{line}, column => $token->{column}},
6737 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
6738 {type => END_TAG_TOKEN, tag_name => 'label',
6739 line => $token->{line}, column => $token->{column}},
6740 {type => END_TAG_TOKEN, tag_name => 'p',
6741 line => $token->{line}, column => $token->{column}},
6742 {type => START_TAG_TOKEN, tag_name => 'hr',
6743 line => $token->{line}, column => $token->{column}},
6744 {type => END_TAG_TOKEN, tag_name => 'form',
6745 line => $token->{line}, column => $token->{column}};
6746 !!!nack ('t391.1'); ## NOTE: Not acknowledged.
6747 !!!back-token (@tokens);
6748 !!!next-token;
6749 next B;
6750 }
6751 } elsif ($token->{tag_name} eq 'textarea') {
6752 my $tag_name = $token->{tag_name};
6753 my $el;
6754 !!!create-element ($el, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
6755
6756 ## TODO: $self->{form_element} if defined
6757 $self->{content_model} = RCDATA_CONTENT_MODEL;
6758 delete $self->{escape}; # MUST
6759
6760 $insert->($el);
6761
6762 my $text = '';
6763 !!!nack ('t392.1');
6764 !!!next-token;
6765 if ($token->{type} == CHARACTER_TOKEN) {
6766 $token->{data} =~ s/^\x0A//;
6767 unless (length $token->{data}) {
6768 !!!cp ('t392');
6769 !!!next-token;
6770 } else {
6771 !!!cp ('t393');
6772 }
6773 } else {
6774 !!!cp ('t394');
6775 }
6776 while ($token->{type} == CHARACTER_TOKEN) {
6777 !!!cp ('t395');
6778 $text .= $token->{data};
6779 !!!next-token;
6780 }
6781 if (length $text) {
6782 !!!cp ('t396');
6783 $el->manakai_append_text ($text);
6784 }
6785
6786 $self->{content_model} = PCDATA_CONTENT_MODEL;
6787
6788 if ($token->{type} == END_TAG_TOKEN and
6789 $token->{tag_name} eq $tag_name) {
6790 !!!cp ('t397');
6791 ## Ignore the token
6792 } else {
6793 !!!cp ('t398');
6794 !!!parse-error (type => 'in RCDATA:#'.$token->{type}, token => $token);
6795 }
6796 !!!next-token;
6797 next B;
6798 } elsif ($token->{tag_name} eq 'math' or
6799 $token->{tag_name} eq 'svg') {
6800 $reconstruct_active_formatting_elements->($insert_to_current);
6801
6802 ## "adjust SVG attributes" ('svg' only) - done in insert-element-f
6803
6804 ## "adjust foreign attributes" - done in insert-element-f
6805
6806 !!!insert-element-f ($token->{tag_name} eq 'math' ? $MML_NS : $SVG_NS, $token->{tag_name}, $token->{attributes}, $token);
6807
6808 if ($self->{self_closing}) {
6809 pop @{$self->{open_elements}};
6810 !!!ack ('t398.1');
6811 } else {
6812 !!!cp ('t398.2');
6813 $self->{insertion_mode} |= IN_FOREIGN_CONTENT_IM;
6814 ## NOTE: |<body><math><mi><svg>| -> "in foreign content" insertion
6815 ## mode, "in body" (not "in foreign content") secondary insertion
6816 ## mode, maybe.
6817 }
6818
6819 !!!next-token;
6820 next B;
6821 } elsif ({
6822 caption => 1, col => 1, colgroup => 1, frame => 1,
6823 frameset => 1, head => 1, option => 1, optgroup => 1,
6824 tbody => 1, td => 1, tfoot => 1, th => 1,
6825 thead => 1, tr => 1,
6826 }->{$token->{tag_name}}) {
6827 !!!cp ('t401');
6828 !!!parse-error (type => 'in body:'.$token->{tag_name}, token => $token);
6829 ## Ignore the token
6830 !!!nack ('t401.1'); ## NOTE: |<col/>| or |<frame/>| here is an error.
6831 !!!next-token;
6832 next B;
6833
6834 ## ISSUE: An issue on HTML5 new elements in the spec.
6835 } else {
6836 if ($token->{tag_name} eq 'image') {
6837 !!!cp ('t384');
6838 !!!parse-error (type => 'image', token => $token);
6839 $token->{tag_name} = 'img';
6840 } else {
6841 !!!cp ('t385');
6842 }
6843
6844 ## NOTE: There is an "as if <br>" code clone.
6845 $reconstruct_active_formatting_elements->($insert_to_current);
6846
6847 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6848
6849 if ({
6850 applet => 1, marquee => 1, object => 1,
6851 }->{$token->{tag_name}}) {
6852 !!!cp ('t380');
6853 push @$active_formatting_elements, ['#marker', ''];
6854 !!!nack ('t380.1');
6855 } elsif ({
6856 b => 1, big => 1, em => 1, font => 1, i => 1,
6857 s => 1, small => 1, strike => 1, ## NOTE: |strike| (was misspelled "strile").
6858 strong => 1, tt => 1, u => 1,
6859 }->{$token->{tag_name}}) {
6860 !!!cp ('t375');
6861 push @$active_formatting_elements, $self->{open_elements}->[-1]; ## Formatting element: remember it for reconstruction.
6862 !!!nack ('t375.1');
6863 } elsif ($token->{tag_name} eq 'input') {
6864 !!!cp ('t388');
6865 ## TODO: associate with $self->{form_element} if defined
6866 pop @{$self->{open_elements}};
6867 !!!ack ('t388.2');
6868 } elsif ({
6869 area => 1, basefont => 1, bgsound => 1, br => 1,
6870 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
6871 #image => 1,
6872 }->{$token->{tag_name}}) {
6873 !!!cp ('t388.1');
6874 pop @{$self->{open_elements}};
6875 !!!ack ('t388.3');
6876 } elsif ($token->{tag_name} eq 'select') {
6877 ## TODO: associate with $self->{form_element} if defined
6878
6879 if ($self->{insertion_mode} & TABLE_IMS or
6880 $self->{insertion_mode} & BODY_TABLE_IMS or
6881 $self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
6882 !!!cp ('t400.1');
6883 $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM;
6884 } else {
6885 !!!cp ('t400.2');
6886 $self->{insertion_mode} = IN_SELECT_IM;
6887 }
6888 !!!nack ('t400.3');
6889 } else {
6890 !!!nack ('t402');
6891 }
6892
6893 !!!next-token;
6894 next B;
6895 }
6896 } elsif ($token->{type} == END_TAG_TOKEN) {
6897 if ($token->{tag_name} eq 'body') {
6898 ## has a |body| element in scope
6899 my $i;
6900 INSCOPE: {
6901 for (reverse @{$self->{open_elements}}) {
6902 if ($_->[1] & BODY_EL) {
6903 !!!cp ('t405');
6904 $i = $_;
6905 last INSCOPE;
6906 } elsif ($_->[1] & SCOPING_EL) {
6907 !!!cp ('t405.1');
6908 last;
6909 }
6910 }
6911
6912 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name},
6913 token => $token);
6914 ## NOTE: Ignore the token (|</body>| with no |body| element in scope).
6915 !!!next-token;
6916 next B;
6917 } # INSCOPE
6918
6919 for (@{$self->{open_elements}}) {
6920 unless ($_->[1] & ALL_END_TAG_OPTIONAL_EL) {
6921 !!!cp ('t403');
6922 !!!parse-error (type => 'not closed',
6923 value => $_->[0]->manakai_local_name,
6924 token => $token);
6925 last;
6926 } else {
6927 !!!cp ('t404');
6928 }
6929 }
6930
6931 $self->{insertion_mode} = AFTER_BODY_IM;
6932 !!!next-token;
6933 next B;
6934 } elsif ($token->{tag_name} eq 'html') {
6935 ## TODO: Update this code. It seems that the code below is not
6936 ## up-to-date, though it has the same effect as specified.
6937 if (@{$self->{open_elements}} > 1 and
6938 $self->{open_elements}->[1]->[1] & BODY_EL) {
6939 ## ISSUE: There is an issue in the spec.
6940 unless ($self->{open_elements}->[-1]->[1] & BODY_EL) {
6941 !!!cp ('t406');
6942 !!!parse-error (type => 'not closed',
6943 value => $self->{open_elements}->[1]->[0]
6944 ->manakai_local_name,
6945 token => $token);
6946 } else {
6947 !!!cp ('t407');
6948 }
6949 $self->{insertion_mode} = AFTER_BODY_IM;
6950 ## reprocess
6951 next B;
6952 } else {
6953 !!!cp ('t408');
6954 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6955 ## Ignore the token
6956 !!!next-token;
6957 next B;
6958 }
6959 } elsif ({
6960 address => 1, blockquote => 1, center => 1, dir => 1,
6961 div => 1, dl => 1, fieldset => 1, listing => 1,
6962 menu => 1, ol => 1, pre => 1, ul => 1,
6963 dd => 1, dt => 1, li => 1,
6964 applet => 1, button => 1, marquee => 1, object => 1,
6965 }->{$token->{tag_name}}) {
6966 ## has an element in scope
6967 my $i;
6968 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6969 my $node = $self->{open_elements}->[$_];
6970 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6971 !!!cp ('t410');
6972 $i = $_;
6973 last INSCOPE;
6974 } elsif ($node->[1] & SCOPING_EL) {
6975 !!!cp ('t411');
6976 last INSCOPE;
6977 }
6978 } # INSCOPE
6979
6980 unless (defined $i) { # has an element in scope
6981 !!!cp ('t413');
6982 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6983 } else {
6984 ## Step 1. generate implied end tags
6985 while ({
6986 dd => ($token->{tag_name} ne 'dd'),
6987 dt => ($token->{tag_name} ne 'dt'),
6988 li => ($token->{tag_name} ne 'li'),
6989 p => 1,
6990 }->{$self->{open_elements}->[-1]->[0]->manakai_local_name}) {
6991 !!!cp ('t409');
6992 pop @{$self->{open_elements}};
6993 }
6994
6995 ## Step 2.
6996 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6997 ne $token->{tag_name}) {
6998 !!!cp ('t412');
6999 !!!parse-error (type => 'not closed',
7000 value => $self->{open_elements}->[-1]->[0]
7001 ->manakai_local_name,
7002 token => $token);
7003 } else {
7004 !!!cp ('t414');
7005 }
7006
7007 ## Step 3.
7008 splice @{$self->{open_elements}}, $i;
7009
7010 ## Step 4.
7011 $clear_up_to_marker->()
7012 if {
7013 applet => 1, button => 1, marquee => 1, object => 1,
7014 }->{$token->{tag_name}};
7015 }
7016 !!!next-token;
7017 next B;
7018 } elsif ($token->{tag_name} eq 'form') {
7019 undef $self->{form_element};
7020
7021 ## has an element in scope
7022 my $i;
7023 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7024 my $node = $self->{open_elements}->[$_];
7025 if ($node->[1] & FORM_EL) {
7026 !!!cp ('t418');
7027 $i = $_;
7028 last INSCOPE;
7029 } elsif ($node->[1] & SCOPING_EL) {
7030 !!!cp ('t419');
7031 last INSCOPE;
7032 }
7033 } # INSCOPE
7034
7035 unless (defined $i) { # has an element in scope
7036 !!!cp ('t421');
7037 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
7038 } else {
7039 ## Step 1. generate implied end tags
7040 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7041 !!!cp ('t417');
7042 pop @{$self->{open_elements}};
7043 }
7044
7045 ## Step 2.
7046 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7047 ne $token->{tag_name}) {
7048 !!!cp ('t417.1');
7049 !!!parse-error (type => 'not closed',
7050 value => $self->{open_elements}->[-1]->[0]
7051 ->manakai_local_name,
7052 token => $token);
7053 } else {
7054 !!!cp ('t420');
7055 }
7056
7057 ## Step 3.
7058 splice @{$self->{open_elements}}, $i;
7059 }
7060
7061 !!!next-token;
7062 next B;
7063 } elsif ({
7064 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
7065 }->{$token->{tag_name}}) {
7066 ## has an element in scope
7067 my $i;
7068 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7069 my $node = $self->{open_elements}->[$_];
7070 if ($node->[1] & HEADING_EL) {
7071 !!!cp ('t423');
7072 $i = $_;
7073 last INSCOPE;
7074 } elsif ($node->[1] & SCOPING_EL) {
7075 !!!cp ('t424');
7076 last INSCOPE;
7077 }
7078 } # INSCOPE
7079
7080 unless (defined $i) { # has an element in scope
7081 !!!cp ('t425.1');
7082 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
7083 } else {
7084 ## Step 1. generate implied end tags
7085 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7086 !!!cp ('t422');
7087 pop @{$self->{open_elements}};
7088 }
7089
7090 ## Step 2.
7091 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7092 ne $token->{tag_name}) {
7093 !!!cp ('t425');
7094 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
7095 } else {
7096 !!!cp ('t426');
7097 }
7098
7099 ## Step 3.
7100 splice @{$self->{open_elements}}, $i;
7101 }
7102
7103 !!!next-token;
7104 next B;
7105 } elsif ($token->{tag_name} eq 'p') {
7106 ## has an element in scope
7107 my $i;
7108 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7109 my $node = $self->{open_elements}->[$_];
7110 if ($node->[1] & P_EL) {
7111 !!!cp ('t410.1');
7112 $i = $_;
7113 last INSCOPE;
7114 } elsif ($node->[1] & SCOPING_EL) {
7115 !!!cp ('t411.1');
7116 last INSCOPE;
7117 }
7118 } # INSCOPE
7119
7120 if (defined $i) {
7121 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7122 ne $token->{tag_name}) {
7123 !!!cp ('t412.1');
7124 !!!parse-error (type => 'not closed',
7125 value => $self->{open_elements}->[-1]->[0]
7126 ->manakai_local_name,
7127 token => $token);
7128 } else {
7129 !!!cp ('t414.1');
7130 }
7131
7132 splice @{$self->{open_elements}}, $i;
7133 } else {
7134 !!!cp ('t413.1');
7135 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
7136
7137 !!!cp ('t415.1');
7138 ## As if <p>, then reprocess the current token
7139 my $el;
7140 !!!create-element ($el, $HTML_NS, 'p',, $token);
7141 $insert->($el);
7142 ## NOTE: Not inserted into |$self->{open_elements}|.
7143 }
7144
7145 !!!next-token;
7146 next B;
7147 } elsif ({
7148 a => 1,
7149 b => 1, big => 1, em => 1, font => 1, i => 1,
7150 nobr => 1, s => 1, small => 1, strike => 1, ## NOTE: |strike| (was misspelled "strile").
7151 strong => 1, tt => 1, u => 1,
7152 }->{$token->{tag_name}}) {
7153 !!!cp ('t427');
7154 $formatting_end_tag->($token); ## End tag of a formatting element: handled by the shared closure.
7155 next B;
7156 } elsif ($token->{tag_name} eq 'br') {
7157 !!!cp ('t428');
7158 !!!parse-error (type => 'unmatched end tag:br', token => $token);
7159
7160 ## As if <br>
7161 $reconstruct_active_formatting_elements->($insert_to_current);
7162
7163 my $el;
7164 !!!create-element ($el, $HTML_NS, 'br',, $token);
7165 $insert->($el);
7166
7167 ## Ignore the token.
7168 !!!next-token;
7169 next B;
7170 } elsif ({
7171 caption => 1, col => 1, colgroup => 1, frame => 1,
7172 frameset => 1, head => 1, option => 1, optgroup => 1,
7173 tbody => 1, td => 1, tfoot => 1, th => 1,
7174 thead => 1, tr => 1,
7175 area => 1, basefont => 1, bgsound => 1,
7176 embed => 1, hr => 1, iframe => 1, image => 1,
7177 img => 1, input => 1, isindex => 1, noembed => 1,
7178 noframes => 1, param => 1, select => 1, spacer => 1,
7179 table => 1, textarea => 1, wbr => 1,
7180 noscript => 0, ## TODO: if scripting is enabled
7181 }->{$token->{tag_name}}) {
7182 !!!cp ('t429');
7183 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
7184 ## Ignore the token
7185 !!!next-token;
7186 next B;
7187
7188 ## ISSUE: Issue on HTML5 new elements in spec
7189
7190 } else {
7191 ## Step 1
7192 my $node_i = -1;
7193 my $node = $self->{open_elements}->[$node_i];
7194
7195 ## Step 2
7196 S2: {
7197 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7198 ## Step 1
7199 ## generate implied end tags
7200 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7201 !!!cp ('t430');
7202 ## ISSUE: Can this case be reached?
7203 pop @{$self->{open_elements}};
7204 }
7205
7206 ## Step 2
7207 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7208 ne $token->{tag_name}) {
7209 !!!cp ('t431');
7210 ## NOTE: <x><y></x>
7211 !!!parse-error (type => 'not closed',
7212 value => $self->{open_elements}->[-1]->[0]
7213 ->manakai_local_name,
7214 token => $token);
7215 } else {
7216 !!!cp ('t432');
7217 }
7218
7219 ## Step 3
7220 splice @{$self->{open_elements}}, $node_i;
7221
7222 !!!next-token;
7223 last S2;
7224 } else {
7225 ## Step 3
7226 if (not ($node->[1] & FORMATTING_EL) and
7227 #not $phrasing_category->{$node->[1]} and
7228 ($node->[1] & SPECIAL_EL or
7229 $node->[1] & SCOPING_EL)) {
7230 !!!cp ('t433');
7231 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
7232 ## Ignore the token
7233 !!!next-token;
7234 last S2;
7235 }
7236
7237 !!!cp ('t434');
7238 }
7239
7240 ## Step 4
7241 $node_i--;
7242 $node = $self->{open_elements}->[$node_i];
7243
7244 ## Step 5;
7245 redo S2;
7246 } # S2
7247 next B;
7248 }
7249 }
7250 next B;
7251 } continue { # B
7252 if ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
7253 ## NOTE: The code below is executed in cases where it does not have
7254 ## to be, but it is harmless even in those cases.
7255 ## has an element in scope
7256 INSCOPE: {
7257 for (reverse 0..$#{$self->{open_elements}}) {
7258 my $node = $self->{open_elements}->[$_];
7259 if ($node->[1] & FOREIGN_EL) {
7260 last INSCOPE;
7261 } elsif ($node->[1] & SCOPING_EL) {
7262 last;
7263 }
7264 }
7265
7266 ## NOTE: No foreign element in scope.
7267 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
7268 } # INSCOPE
7269 }
7270 } # B
7271
7272 ## Stop parsing # MUST
7273
7274 ## TODO: script stuffs
7275 } # _tree_construct_main
7276
## Replaces the children of |$node| with the result of parsing a string
## as HTML (the |innerHTML| setter).
##
## Called as a class method:
##
##   $class->set_inner_html ($node, $string, $onerror);
##
##   $node     A node whose |node_type| is 9 (document) or 1 (element).
##   $string   The markup to parse.  It is accessed through a reference
##             to the caller's argument and is not copied.
##   $onerror  Optional code reference invoked for each parse error with
##             named arguments (|line|, |column|, followed by whatever
##             the reporting site supplies, e.g. |type| and |token|).
##             When omitted, errors are reported with |warn|.
##
## For a document node, all children are removed and the string is handed
## to |parse_string|.  For an element node, the string is parsed in a
## freshly created document using |$node| as the context element; the
## children of |$node| are then replaced with the parsed nodes.
##
## Dies if |$node| is of any other node type.
sub set_inner_html ($$$) {
  my $class = shift;
  my $node = shift;
  ## After the two shifts, |$_[0]| is the string and |$_[1]| the handler.
  my $s = \$_[0];
  my $onerror = $_[1];

  ## ISSUE: Should {confident} be true?

  my $nt = $node->node_type;
  if ($nt == 9) {
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## Take a snapshot of the child list before removing anything from it.
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }

    ## Step 3, 4, 5 # MUST
    $class->parse_string ($$s => $node, $onerror);
  } elsif ($nt == 1) {
    ## TODO: If non-html element

    ## NOTE: Most of this code is copied from |parse_string|

    ## Step 1 # MUST
    ## Parse into a scratch document so that |$node| is untouched until
    ## the parse has completed.
    my $this_doc = $node->owner_document;
    my $doc = $this_doc->implementation->create_document;
    $doc->manakai_is_html (1);
    my $p = $class->new;
    $p->{document} = $doc;

    ## Step 8 # MUST
    my $i = 0; # offset of the next unread character in |$$s|
    $p->{line_prev} = $p->{line} = 1;
    $p->{column_prev} = $p->{column} = 0;

    ## Input stream callback: stores the next character's code point in
    ## |$self->{next_char}| (-1 at end of input), keeps the previous
    ## characters in |$self->{prev_char}| and updates the line/column
    ## counters.
    ## NOTE: This closure captures |$p| and is stored in |$p|; the
    ## resulting reference cycle is broken at the end of this branch.
    $p->{set_next_char} = sub {
      my $self = shift;

      ## Shift the history of previous characters by one.
      pop @{$self->{prev_char}};
      unshift @{$self->{prev_char}}, $self->{next_char};

      $self->{next_char} = -1 and return if $i >= length $$s;
      $self->{next_char} = ord substr $$s, $i++, 1;

      ($p->{line_prev}, $p->{column_prev}) = ($p->{line}, $p->{column});
      $p->{column}++;

      if ($self->{next_char} == 0x000A) { # LF
        $p->{line}++;
        $p->{column} = 0;
        !!!cp ('i1');
      } elsif ($self->{next_char} == 0x000D) { # CR
        ## CR LF and a lone CR are both reported as a single LF.
        $i++ if substr ($$s, $i, 1) eq "\x0A";
        $self->{next_char} = 0x000A; # LF # MUST
        $p->{line}++;
        $p->{column} = 0;
        !!!cp ('i2');
      } elsif ($self->{next_char} > 0x10FFFF) {
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
        !!!cp ('i3');
      } elsif ($self->{next_char} == 0x0000) { # NULL
        !!!cp ('i4');
        !!!parse-error (type => 'NULL');
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
      } elsif ($self->{next_char} <= 0x0008 or
               (0x000E <= $self->{next_char} and
                $self->{next_char} <= 0x001F) or
               (0x007F <= $self->{next_char} and
                $self->{next_char} <= 0x009F) or
               (0xD800 <= $self->{next_char} and
                $self->{next_char} <= 0xDFFF) or
               (0xFDD0 <= $self->{next_char} and
                $self->{next_char} <= 0xFDDF) or
               {
                0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
                0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
                0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
                0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
                0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
                0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
                0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
                0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
                0x10FFFE => 1, 0x10FFFF => 1,
               }->{$self->{next_char}}) {
        ## The character is reported but left in the stream unchanged.
        !!!cp ('i4.1');
        !!!parse-error (type => 'control char', level => $self->{must_level});
        ## TODO: error type documentation
      }
    };
    $p->{prev_char} = [-1, -1, -1];
    $p->{next_char} = -1;

    ## Default error handler: prefer the position recorded in the token,
    ## if any, over the current position of the input stream.
    my $ponerror = $onerror || sub {
      my (%opt) = @_;
      my $line = $opt{line};
      my $column = $opt{column};
      if (defined $opt{token} and defined $opt{token}->{line}) {
        $line = $opt{token}->{line};
        $column = $opt{token}->{column};
      }
      warn "Parse error ($opt{type}) at line $line column $column\n";
    };
    ## NOTE: This closure also captures |$p| (see the cleanup below).
    $p->{parse_error} = sub {
      $ponerror->(line => $p->{line}, column => $p->{column}, @_);
    };

    $p->_initialize_tokenizer;
    $p->_initialize_tree_constructor;

    ## Step 2
    ## The initial content model of the tokenizer depends on the local
    ## name of the context element; PCDATA is the fallback.
    my $node_ln = $node->manakai_local_name;
    $p->{content_model} = {
      title => RCDATA_CONTENT_MODEL,
      textarea => RCDATA_CONTENT_MODEL,
      style => CDATA_CONTENT_MODEL,
      script => CDATA_CONTENT_MODEL,
      xmp => CDATA_CONTENT_MODEL,
      iframe => CDATA_CONTENT_MODEL,
      noembed => CDATA_CONTENT_MODEL,
      noframes => CDATA_CONTENT_MODEL,
      noscript => CDATA_CONTENT_MODEL,
      plaintext => PLAINTEXT_CONTENT_MODEL,
    }->{$node_ln};
    $p->{content_model} = PCDATA_CONTENT_MODEL
        unless defined $p->{content_model};
    ## ISSUE: What is "the name of the element"? local name?

    $p->{inner_html_node} = [$node, $el_category->{$node_ln}];
    ## TODO: Foreign element OK?

    ## Step 3
    my $root = $doc->create_element_ns
        ('http://www.w3.org/1999/xhtml', [undef, 'html']);

    ## Step 4 # MUST
    $doc->append_child ($root);

    ## Step 5 # MUST
    push @{$p->{open_elements}}, [$root, $el_category->{html}];

    undef $p->{head_element};

    ## Step 6 # MUST
    $p->_reset_insertion_mode;

    ## Step 7 # MUST
    ## Set the form element pointer to the nearest HTML |form| element
    ## found among |$node| and its ancestors, if any.
    my $anode = $node;
    AN: while (defined $anode) {
      if ($anode->node_type == 1) {
        my $nsuri = $anode->namespace_uri;
        if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
          if ($anode->manakai_local_name eq 'form') {
            !!!cp ('i5');
            $p->{form_element} = $anode;
            last AN;
          }
        }
      }
      $anode = $anode->parent_node;
    } # AN

    ## Step 9 # MUST
    {
      ## The |!!!next-token| macro operates on a variable named |$self|.
      my $self = $p;
      !!!next-token;
    }
    $p->_tree_construction_main;

    ## Step 10 # MUST
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }
    ## ISSUE: mutation events? read-only?

    ## Step 11 # MUST
    ## Move the parsed nodes from the scratch document into |$node|.
    @cn = @{$root->child_nodes};
    for (@cn) {
      $this_doc->adopt_node ($_);
      $node->append_child ($_);
    }
    ## ISSUE: mutation events?

    $p->_terminate_tree_constructor;

    ## Both callbacks stored in |$p| are closures over |$p| itself.
    ## Remove them so that the parser object can be freed once it goes
    ## out of scope.
    delete $p->{parse_error}; # delete loop
    delete $p->{set_next_char}; # delete loop
  } else {
    die "$0: |set_inner_html| is not defined for node of type $nt";
  }
} # set_inner_html
7472
7473 } # tree construction stage
7474
## Whatpm::HTML::RestartParser - A class that adds |Error| to its
## inheritance list.
## NOTE(review): Presumably thrown to make the parser start over (e.g.
## with a different character encoding) - confirm against the callers.
package Whatpm::HTML::RestartParser;
our @ISA;
push @ISA, 'Error';

## A module file has to end with a true value.
1;
7479 # $Date: 2008/05/24 10:18:26 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24