/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.197 - (show annotations) (download) (as text)
Sat Oct 4 08:29:19 2008 UTC (17 years, 3 months ago) by wakaba
Branch: MAIN
Changes since 1.196: +22 -8 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	4 Oct 2008 08:25:52 -0000
	* tree-test-flow.dat: Test on implied end tag before <p> start tag
	are added (HTML5 revision 1731).

2008-10-04  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	4 Oct 2008 08:26:13 -0000
	* HTML.pm.src: <p> steps reimplemented (HTML5 revision 1731).

2008-10-04  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.196 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 use Error qw(:try);
5
6 ## NOTE: This module doesn't check all HTML5 parse errors; character
7 ## encoding related parse errors are expected to be handled by relevant
8 ## modules.
9 ## Parse errors for control characters that are not allowed in HTML5
10 ## documents, for surrogate code points, and for noncharacter code
11 ## points, as well as U+FFFD substitutions for characters whose code points
12 ## are higher than U+10FFFF may be detected by combining the parser with
13 ## the checker implemented by Whatpm::Charset::UnicodeChecker (for its
14 ## usage example, see |t/HTML-tree.t| in the Whatpm package or the
15 ## WebHACC::Language::HTML module in the WebHACC package).
16
17 ## ISSUE:
18 ## var doc = implementation.createDocument (null, null, null);
19 ## doc.write ('');
20 ## alert (doc.compatMode);
21
22 require IO::Handle;
23
## Namespace URLs, declared together as file-scoped lexicals.
my ($HTML_NS, $MML_NS, $SVG_NS, $XLINK_NS, $XML_NS, $XMLNS_NS) = (
  'http://www.w3.org/1999/xhtml',
  'http://www.w3.org/1998/Math/MathML',
  'http://www.w3.org/2000/svg',
  'http://www.w3.org/1999/xlink',
  'http://www.w3.org/XML/1998/namespace',
  'http://www.w3.org/2000/xmlns/',
);
30
## Element category bit flags.  Every flag occupies its own bit, so the
## flags can be combined and tested with the bitwise operators.
use constant {
  A_EL                    => 1 << 0,
  ADDRESS_EL              => 1 << 1,
  BODY_EL                 => 1 << 2,
  BUTTON_EL               => 1 << 3,
  CAPTION_EL              => 1 << 4,
  DD_EL                   => 1 << 5,
  DIV_EL                  => 1 << 6,
  DT_EL                   => 1 << 7,
  FORM_EL                 => 1 << 8,
  FORMATTING_EL           => 1 << 9,
  FRAMESET_EL             => 1 << 10,
  HEADING_EL              => 1 << 11,
  HTML_EL                 => 1 << 12,
  LI_EL                   => 1 << 13,
  NOBR_EL                 => 1 << 14,
  OPTION_EL               => 1 << 15,
  OPTGROUP_EL             => 1 << 16,
  P_EL                    => 1 << 17,
  SELECT_EL               => 1 << 18,
  TABLE_EL                => 1 << 19,
  TABLE_CELL_EL           => 1 << 20,
  TABLE_ROW_EL            => 1 << 21,
  TABLE_ROW_GROUP_EL      => 1 << 22,
  MISC_SCOPING_EL         => 1 << 23,
  MISC_SPECIAL_EL         => 1 << 24,
  FOREIGN_EL              => 1 << 25,
  FOREIGN_FLOW_CONTENT_EL => 1 << 26,
  MML_AXML_EL             => 1 << 27,
  RUBY_EL                 => 1 << 28,
  RUBY_COMPONENT_EL       => 1 << 29,
};
61
## Mask matching the table, table row and table row group categories.
sub TABLE_ROWS_EL () { TABLE_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL }
67
## Mask of the categories used by the "generate implied end tags"
## algorithm.
## NOTE: One place in the "generate implied end tags" implementation
## uses a modified version of this mask (search for the algorithm name).
sub END_TAG_OPTIONAL_EL () {
  DD_EL | DT_EL | LI_EL | P_EL |
  OPTION_EL | OPTGROUP_EL |
  RUBY_COMPONENT_EL
}
81
## Mask of the categories used by the </body> and EOF algorithms.
sub ALL_END_TAG_OPTIONAL_EL () {
  DD_EL | DT_EL | LI_EL | P_EL |
  BODY_EL | HTML_EL |
  TABLE_CELL_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL
}
95
## Mask combining the button, caption, html, table, table cell and
## miscellaneous scoping categories.
sub SCOPING_EL () {
  BUTTON_EL | CAPTION_EL | HTML_EL |
  TABLE_EL | TABLE_CELL_EL |
  MISC_SCOPING_EL
}
104
## Mask combining the html and table categories.
sub TABLE_SCOPING_EL () { HTML_EL | TABLE_EL }
109
## Mask combining the html and table row group categories.
sub TABLE_ROWS_SCOPING_EL () { HTML_EL | TABLE_ROW_GROUP_EL }
114
## Mask combining the html and table row categories.
sub TABLE_ROW_SCOPING_EL () { HTML_EL | TABLE_ROW_EL }
119
## Mask combining every category listed below, including the
## miscellaneous special category.
sub SPECIAL_EL () {
  ADDRESS_EL | BODY_EL | DIV_EL |
  DD_EL | DT_EL | LI_EL | P_EL |
  FORM_EL | FRAMESET_EL | HEADING_EL |
  OPTION_EL | OPTGROUP_EL | SELECT_EL |
  TABLE_ROW_EL | TABLE_ROW_GROUP_EL |
  MISC_SPECIAL_EL
}
140
## Maps an HTML element name to its category flag(s).  Elements with a
## category of their own are listed first; the shared categories are
## filled in per group below.
my $el_category = {
  a        => A_EL | FORMATTING_EL,
  nobr     => NOBR_EL | FORMATTING_EL,
  address  => ADDRESS_EL,
  body     => BODY_EL,
  button   => BUTTON_EL,
  caption  => CAPTION_EL,
  dd       => DD_EL,
  div      => DIV_EL,
  dt       => DT_EL,
  form     => FORM_EL,
  frameset => FRAMESET_EL,
  html     => HTML_EL,
  li       => LI_EL,
  optgroup => OPTGROUP_EL,
  option   => OPTION_EL,
  p        => P_EL,
  ruby     => RUBY_EL,
  select   => SELECT_EL,
  table    => TABLE_EL,
  tr       => TABLE_ROW_EL,
};

$el_category->{$_} = FORMATTING_EL
    for qw(b big em font i s small strike strong tt u);

$el_category->{$_} = HEADING_EL for qw(h1 h2 h3 h4 h5 h6);

$el_category->{$_} = MISC_SCOPING_EL for qw(applet marquee object);

$el_category->{$_} = RUBY_COMPONENT_EL for qw(rp rt);

$el_category->{$_} = TABLE_CELL_EL for qw(td th);

$el_category->{$_} = TABLE_ROW_GROUP_EL for qw(tbody tfoot thead);

## NOTE: |image| is deliberately absent (commented out in the spec).
$el_category->{$_} = MISC_SPECIAL_EL
    for qw(
      area article aside base basefont bgsound blockquote br center col
      colgroup command datagrid details dialog dir dl embed eventsource
      fieldset figure footer frame head header hr iframe img input
      isindex link listing menu meta nav noembed noframes noscript ol
      param plaintext pre script section spacer style textarea title ul
      wbr
    );
241
## Category flags for foreign (MathML and SVG) elements, keyed by
## namespace URL and then by element name.
## NOTE: In addition, FOREIGN_EL is set to non-HTML elements.
my $el_category_f = {
  $MML_NS => {
    'annotation-xml' => MML_AXML_EL,
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(mi mo mn ms mtext)),
  },
  $SVG_NS => {
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(foreignObject desc title)),
  },
};
258
## Maps a lowercased SVG attribute name to its mixed-case spelling.
## Every key is simply the lowercase form of its value, so the table is
## derived from the list of mixed-case names.
my $svg_attr_name = {
  map { (lc ($_), $_) } qw(
    attributeName attributeType baseFrequency baseProfile calcMode
    clipPathUnits contentScriptType contentStyleType diffuseConstant
    edgeMode externalResourcesRequired filterRes filterUnits glyphRef
    gradientTransform gradientUnits kernelMatrix kernelUnitLength
    keyPoints keySplines keyTimes lengthAdjust limitingConeAngle
    markerHeight markerUnits markerWidth maskContentUnits maskUnits
    numOctaves pathLength patternContentUnits patternTransform
    patternUnits pointsAtX pointsAtY pointsAtZ preserveAlpha
    preserveAspectRatio primitiveUnits refX refY repeatCount repeatDur
    requiredExtensions requiredFeatures specularConstant
    specularExponent spreadMethod startOffset stdDeviation stitchTiles
    surfaceScale systemLanguage tableValues targetX targetY textLength
    viewBox viewTarget xChannelSelector yChannelSelector zoomAndPan
  )
};
323
## Maps a qualified attribute name to [namespace URL, [prefix, local name]].
my $foreign_attr_xname = {
  (map { ("xlink:$_" => [$XLINK_NS, ['xlink', $_]]) }
       qw(actuate arcrole href role show title type)),
  (map { ("xml:$_" => [$XML_NS, ['xml', $_]]) } qw(base lang space)),
  'xmlns' => [$XMLNS_NS, [undef, 'xmlns']],
  'xmlns:xlink' => [$XMLNS_NS, ['xmlns', 'xlink']],
};
338
339 ## ISSUE: xmlns:xlink="non-xlink-ns" is not an error.
340
## Replacement table for numeric character references: maps the code
## point written in the reference to the code point actually used.
my $charref_map = {
  ## CR is treated as LF.
  0x0D => 0x000A,
  ## 0x80..0x9F: explicit replacements, U+FFFD where the table has none.
  0x80 => 0x20AC,
  0x81 => 0xFFFD,
  0x82 => 0x201A,
  0x83 => 0x0192,
  0x84 => 0x201E,
  0x85 => 0x2026,
  0x86 => 0x2020,
  0x87 => 0x2021,
  0x88 => 0x02C6,
  0x89 => 0x2030,
  0x8A => 0x0160,
  0x8B => 0x2039,
  0x8C => 0x0152,
  0x8D => 0xFFFD,
  0x8E => 0x017D,
  0x8F => 0xFFFD,
  0x90 => 0xFFFD,
  0x91 => 0x2018,
  0x92 => 0x2019,
  0x93 => 0x201C,
  0x94 => 0x201D,
  0x95 => 0x2022,
  0x96 => 0x2013,
  0x97 => 0x2014,
  0x98 => 0x02DC,
  0x99 => 0x2122,
  0x9A => 0x0161,
  0x9B => 0x203A,
  0x9C => 0x0153,
  0x9D => 0xFFFD,
  0x9E => 0x017E,
  0x9F => 0x0178,
}; # $charref_map

## Control characters, surrogates and noncharacters all become U+FFFD.
$charref_map->{$_} = 0xFFFD
    for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
        0xD800..0xDFFF,
        ## The Unicode noncharacter block runs to U+FDEF; the range used
        ## to stop at U+FDDF, leaving U+FDE0..U+FDEF unmapped.
        0xFDD0..0xFDEF,
        ## U+nFFFE and U+nFFFF for each of the planes 0..16.
        (map { my $plane = $_ << 16; ($plane | 0xFFFE, $plane | 0xFFFF) }
             0x00..0x10);
384
385 ## TODO: Invoke the reset algorithm when a resettable element is
386 ## created (cf. HTML5 revision 2259).
387
## Parses an HTML document held in a byte string.
##
## Arguments: the parser (or class name), the charset name (may be
## undef), the byte string or a reference to it, followed by the
## remaining arguments, which are handed to |parse_byte_stream|
## unchanged.  Returns whatever |parse_byte_stream| returns.
##
## Dies if the string cannot be opened as an in-memory file; previously
## the failure was ignored and an undefined handle was passed on.
sub parse_byte_string ($$$$;$) {
  my $self = shift;
  my $charset_name = shift;
  ## Refer to the caller's scalar rather than copying it.
  my $bytes_ref = ref $_[0] ? $_[0] : \($_[0]);
  open my $input, '<', $bytes_ref
      or die "parse_byte_string: can't open the byte string as a file: $!";
  return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]);
} # parse_byte_string
394
## Parses an HTML document read from a byte stream handle.
##
## Arguments (see the commented-out list below): the parser, or a class
## name on which |new| is invoked; the charset name, if known; the byte
## stream handle; the document object; an optional error handler; and
## an optional wrapper applied to the character stream handle.
##
## The decoder is selected in the SNIFFING block: the given charset
## name, then a byte order mark, then Whatpm::Charset::UniversalCharDet,
## then |windows-1252| as the default.  If parsing throws
## Whatpm::HTML::RestartParser, the input is parsed again with the
## decoder selected by |$self->{change_encoding}|.
## Returns the value returned by |parse_char_stream|.
395 sub parse_byte_stream ($$$$;$$) {
396 # my ($self, $charset_name, $byte_stream, $doc, $onerror, $get_wrapper) = @_;
397 my $self = ref $_[0] ? shift : shift->new;
398 my $charset_name = shift;
399 my $byte_stream = $_[0];
400
## Default error handler: warn with the error type.
401 my $onerror = $_[2] || sub {
402 my (%opt) = @_;
403 warn "Parse error ($opt{type})\n";
404 };
405 $self->{parse_error} = $onerror; # updated later by parse_char_string
406
## Default wrapper: hand the stream back unchanged.
407 my $get_wrapper = $_[3] || sub ($) {
408 return $_[0]; # $_[0] = byte stream handle, returned = arg to char handle
409 };
410
411 ## HTML5 encoding sniffing algorithm
412 require Message::Charset::Info;
413 my $charset;
414 my $buffer;
415 my ($char_stream, $e_status);
416
417 SNIFFING: {
418 ## NOTE: By setting |allow_fallback| option true when the
419 ## |get_decode_handle| method is invoked, we ignore what the HTML5
420 ## spec requires, i.e. unsupported encoding should be ignored.
421 ## TODO: We should not do this unless the parser is invoked
422 ## in the conformance checking mode, in which this behavior
423 ## would be useful.
424
425 ## Step 1
426 if (defined $charset_name) {
427 $charset = Message::Charset::Info->get_by_html_name ($charset_name);
428 ## TODO: Is this ok? Transfer protocol's parameter should be
429 ## interpreted in its semantics?
430
431 ($char_stream, $e_status) = $charset->get_decode_handle
432 ($byte_stream, allow_error_reporting => 1,
433 allow_fallback => 1);
434 if ($char_stream) {
435 $self->{confident} = 1;
436 last SNIFFING;
437 } else {
438 !!!parse-error (type => 'charset:not supported',
439 layer => 'encode',
440 line => 1, column => 1,
441 value => $charset_name,
442 level => $self->{level}->{uncertain});
443 }
444 }
445
446 ## Step 2
## Read up to 1024 bytes for the checks in the following steps.
447 my $byte_buffer = '';
448 for (1..1024) {
449 my $char = $byte_stream->getc;
450 last unless defined $char;
451 $byte_buffer .= $char;
452 } ## TODO: timeout
453
454 ## Step 3
## A byte order mark selects UTF-16BE, UTF-16LE or UTF-8.
455 if ($byte_buffer =~ /^\xFE\xFF/) {
456 $charset = Message::Charset::Info->get_by_html_name ('utf-16be');
457 ($char_stream, $e_status) = $charset->get_decode_handle
458 ($byte_stream, allow_error_reporting => 1,
459 allow_fallback => 1, byte_buffer => \$byte_buffer);
460 $self->{confident} = 1;
461 last SNIFFING;
462 } elsif ($byte_buffer =~ /^\xFF\xFE/) {
463 $charset = Message::Charset::Info->get_by_html_name ('utf-16le');
464 ($char_stream, $e_status) = $charset->get_decode_handle
465 ($byte_stream, allow_error_reporting => 1,
466 allow_fallback => 1, byte_buffer => \$byte_buffer);
467 $self->{confident} = 1;
468 last SNIFFING;
469 } elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) {
470 $charset = Message::Charset::Info->get_by_html_name ('utf-8');
471 ($char_stream, $e_status) = $charset->get_decode_handle
472 ($byte_stream, allow_error_reporting => 1,
473 allow_fallback => 1, byte_buffer => \$byte_buffer);
474 $self->{confident} = 1;
475 last SNIFFING;
476 }
477
478 ## Step 4
479 ## TODO: <meta charset>
480
481 ## Step 5
482 ## TODO: from history
483
484 ## Step 6
485 require Whatpm::Charset::UniversalCharDet;
486 $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string
487 ($byte_buffer);
488 if (defined $charset_name) {
489 $charset = Message::Charset::Info->get_by_html_name ($charset_name);
490
491 ## ISSUE: Unsupported encoding is not ignored according to the spec.
492 require Whatpm::Charset::DecodeHandle;
493 $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
494 ($byte_stream);
495 ($char_stream, $e_status) = $charset->get_decode_handle
496 ($buffer, allow_error_reporting => 1,
497 allow_fallback => 1, byte_buffer => \$byte_buffer);
498 if ($char_stream) {
499 $buffer->{buffer} = $byte_buffer;
500 !!!parse-error (type => 'sniffing:chardet',
501 text => $charset_name,
502 level => $self->{level}->{info},
503 layer => 'encode',
504 line => 1, column => 1);
505 $self->{confident} = 0;
506 last SNIFFING;
507 }
508 }
509
510 ## Step 7: default
511 ## TODO: Make this configurable.
512 $charset = Message::Charset::Info->get_by_html_name ('windows-1252');
513 ## NOTE: We choose |windows-1252| here, since |utf-8| should be
514 ## detectable in the step 6.
515 require Whatpm::Charset::DecodeHandle;
516 $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
517 ($byte_stream);
518 ($char_stream, $e_status)
519 = $charset->get_decode_handle ($buffer,
520 allow_error_reporting => 1,
521 allow_fallback => 1,
522 byte_buffer => \$byte_buffer);
523 $buffer->{buffer} = $byte_buffer;
524 !!!parse-error (type => 'sniffing:default',
525 text => 'windows-1252',
526 level => $self->{level}->{info},
527 line => 1, column => 1,
528 layer => 'encode');
529 $self->{confident} = 0;
530 } # SNIFFING
531
## Record the encoding name and report when the decoder is a fallback
## implementation or one that does not report errors.
532 if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
533 $self->{input_encoding} = $charset->get_iana_name; ## TODO: Should we set actual charset decoder's encoding name?
534 !!!parse-error (type => 'chardecode:fallback',
535 #text => $self->{input_encoding},
536 level => $self->{level}->{uncertain},
537 line => 1, column => 1,
538 layer => 'encode');
539 } elsif (not ($e_status &
540 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL ())) {
541 $self->{input_encoding} = $charset->get_iana_name;
542 !!!parse-error (type => 'chardecode:no error',
543 text => $self->{input_encoding},
544 level => $self->{level}->{uncertain},
545 line => 1, column => 1,
546 layer => 'encode');
547 } else {
548 $self->{input_encoding} = $charset->get_iana_name;
549 }
550
## Callback taking the parser, a charset name and a token.  It selects a
## new decoder into the variables shared with this sub, then either
## returns (the name equals the current input encoding) or throws
## Whatpm::HTML::RestartParser.  It does nothing more when no decode
## handle could be obtained.
551 $self->{change_encoding} = sub {
552 my $self = shift;
553 $charset_name = shift;
554 my $token = shift;
555
556 $charset = Message::Charset::Info->get_by_html_name ($charset_name);
557 ($char_stream, $e_status) = $charset->get_decode_handle
558 ($byte_stream, allow_error_reporting => 1, allow_fallback => 1,
559 byte_buffer => \ $buffer->{buffer});
560
561 if ($char_stream) { # if supported
562 ## "Change the encoding" algorithm:
563
564 ## Step 1
565 if ($charset->{category} &
566 Message::Charset::Info::CHARSET_CATEGORY_UTF16 ()) {
567 $charset = Message::Charset::Info->get_by_html_name ('utf-8');
568 ($char_stream, $e_status) = $charset->get_decode_handle
569 ($byte_stream,
570 byte_buffer => \ $buffer->{buffer});
571 }
572 $charset_name = $charset->get_iana_name;
573
574 ## Step 2
575 if (defined $self->{input_encoding} and
576 $self->{input_encoding} eq $charset_name) {
577 !!!parse-error (type => 'charset label:matching',
578 text => $charset_name,
579 level => $self->{level}->{info});
580 $self->{confident} = 1;
581 return;
582 }
583
584 !!!parse-error (type => 'charset label detected',
585 text => $self->{input_encoding},
586 value => $charset_name,
587 level => $self->{level}->{warn},
588 token => $token);
589
590 ## Step 3
591 # if (can) {
592 ## change the encoding on the fly.
593 #$self->{confident} = 1;
594 #return;
595 # }
596
597 ## Step 4
598 throw Whatpm::HTML::RestartParser ();
599 }
600 }; # $self->{change_encoding}
601
## Error callback installed on the character stream: reports the error
## and, when an |octets| reference is supplied, overwrites the referenced
## value with U+FFFD.
602 my $char_onerror = sub {
603 my (undef, $type, %opt) = @_;
604 !!!parse-error (layer => 'encode',
605 line => $self->{line}, column => $self->{column} + 1,
606 %opt, type => $type);
607 if ($opt{octets}) {
608 ${$opt{octets}} = "\x{FFFD}"; # replacement character
609 }
610 };
611
612 my $wrapped_char_stream = $get_wrapper->($char_stream);
613 $wrapped_char_stream->onerror ($char_onerror);
614
615 my @args = ($_[1], $_[2]); # $doc, $onerror - $get_wrapper = undef;
616 my $return;
617 try {
618 $return = $self->parse_char_stream ($wrapped_char_stream, @args);
619 } catch Whatpm::HTML::RestartParser with {
620 ## NOTE: Invoked after {change_encoding}.
621
622 if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
623 $self->{input_encoding} = $charset->get_iana_name; ## TODO: Should we set actual charset decoder's encoding name?
624 !!!parse-error (type => 'chardecode:fallback',
625 level => $self->{level}->{uncertain},
626 #text => $self->{input_encoding},
627 line => 1, column => 1,
628 layer => 'encode');
629 } elsif (not ($e_status &
630 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL ())) {
631 $self->{input_encoding} = $charset->get_iana_name;
632 !!!parse-error (type => 'chardecode:no error',
633 text => $self->{input_encoding},
634 level => $self->{level}->{uncertain},
635 line => 1, column => 1,
636 layer => 'encode');
637 } else {
638 $self->{input_encoding} = $charset->get_iana_name;
639 }
640 $self->{confident} = 1;
641
## Parse again using the decoder chosen by {change_encoding}.
642 $wrapped_char_stream = $get_wrapper->($char_stream);
643 $wrapped_char_stream->onerror ($char_onerror);
644
645 $return = $self->parse_char_stream ($wrapped_char_stream, @args);
646 };
647 return $return;
648 } # parse_byte_stream
649
650 ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
651 ## and the HTML layer MUST ignore it. However, we do strip the BOM in
652 ## the encoding layer and the HTML layer does not ignore any U+FEFF,
653 ## because the core part of our HTML parser expects a string of characters,
654 ## not a string of bytes or code units or anything which might contain a BOM.
655 ## Therefore, any parser interface that accepts a string of bytes,
656 ## such as |parse_byte_string| in this module, must ensure that it does
657 ## strip the BOM and never strips any ZWNBSP.
658
## Parses an HTML document held in a character string.
##
## Arguments: the parser (or class name), the string or a reference to
## it, followed by the remaining arguments, which are handed to
## |parse_char_stream| unchanged.  Returns whatever |parse_char_stream|
## returns.
sub parse_char_string ($$$;$$) {
  #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
  my $self = $_[0];
  ## Refer to the caller's scalar rather than copying it.
  my $string_ref = ref $_[1] ? $_[1] : \($_[1]);

  require Whatpm::Charset::DecodeHandle;
  my $char_stream
      = Whatpm::Charset::DecodeHandle::CharString->new ($string_ref);

  return $self->parse_char_stream ($char_stream, @_[2..$#_]);
} # parse_char_string
667 *parse_string = \&parse_char_string; ## NOTE: Alias for backward compatibility.
668
## Parses an HTML document read from a character stream handle.
##
## Arguments: the parser, or a class name on which |new| is invoked; the
## character stream handle; the document object, whose child nodes are
## removed first; an optional error handler; and an optional wrapper
## applied to the stream handle.
## Returns the document object.
669 sub parse_char_stream ($$$;$$) {
670 my $self = ref $_[0] ? shift : shift->new;
671 my $input = $_[0];
672 $self->{document} = $_[1];
673 @{$self->{document}->child_nodes} = ();
674
675 ## NOTE: |set_inner_html| copies most of this method's code
676
677 $self->{confident} = 1 unless exists $self->{confident};
678 $self->{document}->input_encoding ($self->{input_encoding})
679 if defined $self->{input_encoding};
680 ## TODO: |{input_encoding}| is needless?
681
682 $self->{line_prev} = $self->{line} = 1;
683 $self->{column_prev} = -1;
684 $self->{column} = 0;
## Callback that stores the next input character in |{nc}| as a code
## point, or -1 when nothing more can be read, and keeps the line and
## column counters up to date.  CR and CR LF are turned into a single
## LF; NULL is reported and replaced by U+FFFD.
685 $self->{set_nc} = sub {
686 my $self = shift;
687
688 my $char = '';
689 if (defined $self->{next_nc}) {
## A character read ahead while handling CR is consumed first.
690 $char = $self->{next_nc};
691 delete $self->{next_nc};
692 $self->{nc} = ord $char;
693 } else {
694 $self->{char_buffer} = '';
695 $self->{char_buffer_pos} = 0;
696
## Fast path: read characters other than NULL, LF and CR into the
## buffer; they need none of the special handling below.
697 my $count = $input->manakai_read_until
698 ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
699 if ($count) {
700 $self->{line_prev} = $self->{line};
701 $self->{column_prev} = $self->{column};
702 $self->{column}++;
703 $self->{nc}
704 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
705 return;
706 }
707
708 if ($input->read ($char, 1)) {
709 $self->{nc} = ord $char;
710 } else {
711 $self->{nc} = -1;
712 return;
713 }
714 }
715
716 ($self->{line_prev}, $self->{column_prev})
717 = ($self->{line}, $self->{column});
718 $self->{column}++;
719
720 if ($self->{nc} == 0x000A) { # LF
721 !!!cp ('j1');
722 $self->{line}++;
723 $self->{column} = 0;
724 } elsif ($self->{nc} == 0x000D) { # CR
725 !!!cp ('j2');
726 ## TODO: support for abort/streaming
## Look at the character after CR: an LF is dropped, anything else is
## kept in |{next_nc}| for the next call.
727 my $next = '';
728 if ($input->read ($next, 1) and $next ne "\x0A") {
729 $self->{next_nc} = $next;
730 }
731 $self->{nc} = 0x000A; # LF # MUST
732 $self->{line}++;
733 $self->{column} = 0;
734 } elsif ($self->{nc} == 0x0000) { # NULL
735 !!!cp ('j4');
736 !!!parse-error (type => 'NULL');
737 $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
738 }
739 };
740
## Callback that stores in |$_[0]|, from offset |$_[2]|, a run of
## characters that are neither in the character class body |$_[1]| nor
## NULL, LF or CR.  Returns the number of characters stored; returns 0
## without reading while |{next_nc}| is pending.
## NOTE(review): unlike |{set_nc}|, this closure refers to the enclosing
## |$self|, and it is not deleted at the end of this sub as
## |{parse_error}| is; it looks like a remaining reference loop - confirm.
741 $self->{read_until} = sub {
742 #my ($scalar, $specials_range, $offset) = @_;
743 return 0 if defined $self->{next_nc};
744
745 my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
746 my $offset = $_[2] || 0;
747
748 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
## Serve the request from what is left in the character buffer.
749 pos ($self->{char_buffer}) = $self->{char_buffer_pos};
750 if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
751 substr ($_[0], $offset)
752 = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
753 my $count = $+[0] - $-[0];
754 if ($count) {
755 $self->{column} += $count;
756 $self->{char_buffer_pos} += $count;
757 $self->{line_prev} = $self->{line};
758 $self->{column_prev} = $self->{column} - 1;
759 $self->{nc} = -1;
760 }
761 return $count;
762 } else {
763 return 0;
764 }
765 } else {
## The buffer is used up; read from the input handle directly.
766 my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
767 if ($count) {
768 $self->{column} += $count;
769 $self->{line_prev} = $self->{line};
770 $self->{column_prev} = $self->{column} - 1;
771 $self->{nc} = -1;
772 }
773 return $count;
774 }
775 }; # $self->{read_until}
776
## Default error handler: warn with the error type and the position,
## taken from the token when one is given.
777 my $onerror = $_[2] || sub {
778 my (%opt) = @_;
779 my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
780 my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
781 warn "Parse error ($opt{type}) at line $line column $column\n";
782 };
## The current line and column are passed first, so that options given
## by the caller override them.
783 $self->{parse_error} = sub {
784 $onerror->(line => $self->{line}, column => $self->{column}, @_);
785 };
786
787 my $char_onerror = sub {
788 my (undef, $type, %opt) = @_;
789 !!!parse-error (layer => 'encode',
790 line => $self->{line}, column => $self->{column} + 1,
791 %opt, type => $type);
792 }; # $char_onerror
793
## With a wrapper, the wrapped handle always gets the error callback;
## otherwise it is installed only if the handle has none yet.
794 if ($_[3]) {
795 $input = $_[3]->($input);
796 $input->onerror ($char_onerror);
797 } else {
798 $input->onerror ($char_onerror) unless defined $input->onerror;
799 }
800
801 $self->_initialize_tokenizer;
802 $self->_initialize_tree_constructor;
803 $self->_construct_tree;
804 $self->_terminate_tree_constructor;
805
806 delete $self->{parse_error}; # remove loop
807
808 return $self->{document};
809 } # parse_char_stream
810
## Creates a parser object with the default error level codes and
## placeholder callbacks.
##
## Argument: the class name.  Returns the new object.
sub new ($) {
  my $class = shift;
  my $self = bless {
    level => {
      must => 'm',
      should => 's',
      warn => 'w',
      info => 'i',
      uncertain => 'u',
    },
  }, $class;

  ## The default |{set_nc}| only signals the end of input.  It reaches
  ## the parser through a weak reference: a closure stored in the parser
  ## that captured |$self| itself would form a reference loop and keep
  ## an otherwise unused parser alive forever.
  require Scalar::Util;
  my $parser = $self;
  Scalar::Util::weaken ($parser);
  $self->{set_nc} = sub {
    $parser->{nc} = -1;
  };

  $self->{parse_error} = sub {
    #
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new
837
## Content model feature bits.
use constant {
  CM_ENTITY         => 0b001, # & markup in data
  CM_LIMITED_MARKUP => 0b010, # < markup in data (limited)
  CM_FULL_MARKUP    => 0b100, # < markup in data (any)
};

## Content models, expressed as combinations of the feature bits.
use constant {
  PLAINTEXT_CONTENT_MODEL => 0,
  CDATA_CONTENT_MODEL     => CM_LIMITED_MARKUP,
  RCDATA_CONTENT_MODEL    => CM_ENTITY | CM_LIMITED_MARKUP,
  PCDATA_CONTENT_MODEL    => CM_ENTITY | CM_FULL_MARKUP,
};
846
## Tokenizer states.  The numbers 1 and 12 are unused; they belonged to
## ENTITY_DATA_STATE and ENTITY_IN_ATTRIBUTE_VALUE_STATE respectively.
use constant {
  DATA_STATE => 0,
  TAG_OPEN_STATE => 2,
  CLOSE_TAG_OPEN_STATE => 3,
  TAG_NAME_STATE => 4,
  BEFORE_ATTRIBUTE_NAME_STATE => 5,
  ATTRIBUTE_NAME_STATE => 6,
  AFTER_ATTRIBUTE_NAME_STATE => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE => 11,
  MARKUP_DECLARATION_OPEN_STATE => 13,
  COMMENT_START_STATE => 14,
  COMMENT_START_DASH_STATE => 15,
  COMMENT_STATE => 16,
  COMMENT_END_STATE => 17,
  COMMENT_END_DASH_STATE => 18,
  BOGUS_COMMENT_STATE => 19,
  DOCTYPE_STATE => 20,
  BEFORE_DOCTYPE_NAME_STATE => 21,
  DOCTYPE_NAME_STATE => 22,
  AFTER_DOCTYPE_NAME_STATE => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE => 31,
  BOGUS_DOCTYPE_STATE => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE => 33,
  SELF_CLOSING_START_TAG_STATE => 34,
  CDATA_SECTION_STATE => 35,
  MD_HYPHEN_STATE => 36, # "markup declaration open state" in the spec
  MD_DOCTYPE_STATE => 37, # "markup declaration open state" in the spec
  MD_CDATA_STATE => 38, # "markup declaration open state" in the spec
  CDATA_RCDATA_CLOSE_TAG_STATE => 39, # "close tag open state" in the spec
  CDATA_SECTION_MSE1_STATE => 40, # "CDATA section state" in the spec
  CDATA_SECTION_MSE2_STATE => 41, # "CDATA section state" in the spec
  PUBLIC_STATE => 42, # "after DOCTYPE name state" in the spec
  SYSTEM_STATE => 43, # "after DOCTYPE name state" in the spec
  ## NOTE: "Entity data state", "entity in attribute value state", and
  ## "consume a character reference" algorithm are jointly implemented
  ## using the following six states:
  ENTITY_STATE => 44,
  ENTITY_HASH_STATE => 45,
  NCR_NUM_STATE => 46,
  HEXREF_X_STATE => 47,
  HEXREF_HEX_STATE => 48,
  ENTITY_NAME_STATE => 49,
  PCDATA_STATE => 50, # "data state" in the spec
};
901
## Token types.
use constant {
  DOCTYPE_TOKEN     => 1,
  COMMENT_TOKEN     => 2,
  START_TAG_TOKEN   => 3,
  END_TAG_TOKEN     => 4,
  END_OF_FILE_TOKEN => 5,
  CHARACTER_TOKEN   => 6,
};
908
## Insertion mode group bits.
use constant {
  AFTER_HTML_IMS        => 1 << 2,
  HEAD_IMS              => 1 << 3,
  BODY_IMS              => 1 << 4,
  BODY_TABLE_IMS        => 1 << 5,
  TABLE_IMS             => 1 << 6,
  ROW_IMS               => 1 << 7,
  BODY_AFTER_IMS        => 1 << 8,
  FRAME_IMS             => 1 << 9,
  SELECT_IMS            => 1 << 10,
  ## NOTE: "in foreign content" insertion mode is special; it is combined
  ## with the secondary insertion mode.  In this parser, they are stored
  ## together in the bit-or'ed form.
  IN_FOREIGN_CONTENT_IM => 1 << 11,
};

## Insertion modes, built from the group bits above and two low bits
## that tell the modes of one group apart.
## NOTE: "initial" and "before html" insertion modes have no constants.
use constant {
  ## NOTE: "after after body" insertion mode.
  AFTER_HTML_BODY_IM     => AFTER_HTML_IMS | BODY_AFTER_IMS,
  ## NOTE: "after after frameset" insertion mode.
  AFTER_HTML_FRAMESET_IM => AFTER_HTML_IMS | FRAME_IMS,

  IN_HEAD_IM             => HEAD_IMS | 0b00,
  IN_HEAD_NOSCRIPT_IM    => HEAD_IMS | 0b01,
  AFTER_HEAD_IM          => HEAD_IMS | 0b10,
  BEFORE_HEAD_IM         => HEAD_IMS | 0b11,
  IN_BODY_IM             => BODY_IMS,
  IN_CELL_IM             => BODY_IMS | BODY_TABLE_IMS | 0b01,
  IN_CAPTION_IM          => BODY_IMS | BODY_TABLE_IMS | 0b10,
  IN_ROW_IM              => TABLE_IMS | ROW_IMS | 0b01,
  IN_TABLE_BODY_IM       => TABLE_IMS | ROW_IMS | 0b10,
  IN_TABLE_IM            => TABLE_IMS,
  AFTER_BODY_IM          => BODY_AFTER_IMS,
  IN_FRAMESET_IM         => FRAME_IMS | 0b01,
  AFTER_FRAMESET_IM      => FRAME_IMS | 0b10,
  IN_SELECT_IM           => SELECT_IMS | 0b01,
  IN_SELECT_IN_TABLE_IM  => SELECT_IMS | 0b10,
  IN_COLUMN_GROUP_IM     => 0b10,
};
947
948 ## Implementations MUST act as if they used the state machine in the spec.
949
## Resets the tokenizer part of the parser state and reads the first
## input character.
##
## The keys |{s_kwd}| (state keyword), |{entity__value}|,
## |{entity__match}| and |{prev_state}| are initialized when they are
## first used; |{next_nc}| and |{escape}| are not touched here either.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## The tokenizer starts in the data state with the PCDATA content model.
  $self->{state} = DATA_STATE;
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## Current token, current attribute, last emitted start tag name.
  $self->{$_} = undef for qw(ct ca last_stag_name);
  delete $self->{self_closing};

  ## Empty character buffer and no next input character yet; both must
  ## be in place before the first character is read.
  @$self{qw(char_buffer char_buffer_pos)} = ('', 0);
  $self->{nc} = -1;
  !!!next-input-character;

  $self->{token} = [];
} # _initialize_tokenizer
970
971 ## A token has:
972 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
973 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
974 ## ->{name} (DOCTYPE_TOKEN)
975 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
976 ## ->{pubid} (DOCTYPE_TOKEN)
977 ## ->{sysid} (DOCTYPE_TOKEN)
978 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
979 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
980 ## ->{name}
981 ## ->{value}
982 ## ->{has_reference} == 1 or 0
983 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
984 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
985 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
986 ## while the token is pushed back to the stack.
987
988 ## Emitted token MUST immediately be handled by the tree construction state.
989
990 ## Before each step, UA MAY check to see if either one of the scripts in
991 ## "list of scripts that will execute as soon as possible" or the first
992 ## script in the "list of scripts that will execute asynchronously",
993 ## has completed loading. If one has, then it MUST be executed
994 ## and removed from the list.
995
996 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
997 ## (This requirement was dropped from HTML5 spec, unfortunately.)
998
999 my $is_space = { # lookup table: code point => 1 for characters the tokenizer treats as white space (indexed with $self->{nc})
1000 0x0009 => 1, # CHARACTER TABULATION (HT)
1001 0x000A => 1, # LINE FEED (LF)
1002 #0x000B => 0, # LINE TABULATION (VT) - deliberately absent, so VT is not treated as white space
1003 0x000C => 1, # FORM FEED (FF)
1004 #0x000D => 1, # CARRIAGE RETURN (CR) - disabled; NOTE(review): presumably CR is normalized before it reaches the tokenizer - confirm
1005 0x0020 => 1, # SPACE (SP)
1006 };
1007
1008 sub _get_next_token ($) {
1009 my $self = shift;
1010
1011 if ($self->{self_closing}) {
1012 !!!parse-error (type => 'nestc', token => $self->{ct});
1013 ## NOTE: The |self_closing| flag is only set by start tag token.
1014 ## In addition, when a start tag token is emitted, it is always set to
1015 ## |ct|.
1016 delete $self->{self_closing};
1017 }
1018
1019 if (@{$self->{token}}) {
1020 $self->{self_closing} = $self->{token}->[0]->{self_closing};
1021 return shift @{$self->{token}};
1022 }
1023
1024 A: {
1025 if ($self->{state} == PCDATA_STATE) {
1026 ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
1027
1028 if ($self->{nc} == 0x0026) { # &
1029 !!!cp (0.1);
1030 ## NOTE: In the spec, the tokenizer is switched to the
1031 ## "entity data state". In this implementation, the tokenizer
1032 ## is switched to the |ENTITY_STATE|, which is an implementation
1033 ## of the "consume a character reference" algorithm.
1034 $self->{entity_add} = -1;
1035 $self->{prev_state} = DATA_STATE;
1036 $self->{state} = ENTITY_STATE;
1037 !!!next-input-character;
1038 redo A;
1039 } elsif ($self->{nc} == 0x003C) { # <
1040 !!!cp (0.2);
1041 $self->{state} = TAG_OPEN_STATE;
1042 !!!next-input-character;
1043 redo A;
1044 } elsif ($self->{nc} == -1) {
1045 !!!cp (0.3);
1046 !!!emit ({type => END_OF_FILE_TOKEN,
1047 line => $self->{line}, column => $self->{column}});
1048 last A; ## TODO: ok?
1049 } else {
1050 !!!cp (0.4);
1051 #
1052 }
1053
1054 # Anything else
1055 my $token = {type => CHARACTER_TOKEN,
1056 data => chr $self->{nc},
1057 line => $self->{line}, column => $self->{column},
1058 };
1059 $self->{read_until}->($token->{data}, q[<&], length $token->{data});
1060
1061 ## Stay in the state.
1062 !!!next-input-character;
1063 !!!emit ($token);
1064 redo A;
1065 } elsif ($self->{state} == DATA_STATE) {
1066 $self->{s_kwd} = '' unless defined $self->{s_kwd};
1067 if ($self->{nc} == 0x0026) { # &
1068 $self->{s_kwd} = '';
1069 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
1070 not $self->{escape}) {
1071 !!!cp (1);
1072 ## NOTE: In the spec, the tokenizer is switched to the
1073 ## "entity data state". In this implementation, the tokenizer
1074 ## is switched to the |ENTITY_STATE|, which is an implementation
1075 ## of the "consume a character reference" algorithm.
1076 $self->{entity_add} = -1;
1077 $self->{prev_state} = DATA_STATE;
1078 $self->{state} = ENTITY_STATE;
1079 !!!next-input-character;
1080 redo A;
1081 } else {
1082 !!!cp (2);
1083 #
1084 }
1085 } elsif ($self->{nc} == 0x002D) { # -
1086 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1087 $self->{s_kwd} .= '-';
1088
1089 if ($self->{s_kwd} eq '<!--') {
1090 !!!cp (3);
1091 $self->{escape} = 1; # unless $self->{escape};
1092 $self->{s_kwd} = '--';
1093 #
1094 } elsif ($self->{s_kwd} eq '---') {
1095 !!!cp (4);
1096 $self->{s_kwd} = '--';
1097 #
1098 } else {
1099 !!!cp (5);
1100 #
1101 }
1102 }
1103
1104 #
1105 } elsif ($self->{nc} == 0x0021) { # !
1106 if (length $self->{s_kwd}) {
1107 !!!cp (5.1);
1108 $self->{s_kwd} .= '!';
1109 #
1110 } else {
1111 !!!cp (5.2);
1112 #$self->{s_kwd} = '';
1113 #
1114 }
1115 #
1116 } elsif ($self->{nc} == 0x003C) { # <
1117 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
1118 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
1119 not $self->{escape})) {
1120 !!!cp (6);
1121 $self->{state} = TAG_OPEN_STATE;
1122 !!!next-input-character;
1123 redo A;
1124 } else {
1125 !!!cp (7);
1126 $self->{s_kwd} = '';
1127 #
1128 }
1129 } elsif ($self->{nc} == 0x003E) { # >
1130 if ($self->{escape} and
1131 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
1132 if ($self->{s_kwd} eq '--') {
1133 !!!cp (8);
1134 delete $self->{escape};
1135 } else {
1136 !!!cp (9);
1137 }
1138 } else {
1139 !!!cp (10);
1140 }
1141
1142 $self->{s_kwd} = '';
1143 #
1144 } elsif ($self->{nc} == -1) {
1145 !!!cp (11);
1146 $self->{s_kwd} = '';
1147 !!!emit ({type => END_OF_FILE_TOKEN,
1148 line => $self->{line}, column => $self->{column}});
1149 last A; ## TODO: ok?
1150 } else {
1151 !!!cp (12);
1152 $self->{s_kwd} = '';
1153 #
1154 }
1155
1156 # Anything else
1157 my $token = {type => CHARACTER_TOKEN,
1158 data => chr $self->{nc},
1159 line => $self->{line}, column => $self->{column},
1160 };
1161 if ($self->{read_until}->($token->{data}, q[-!<>&],
1162 length $token->{data})) {
1163 $self->{s_kwd} = '';
1164 }
1165
1166 ## Stay in the data state.
1167 if ($self->{content_model} == PCDATA_CONTENT_MODEL) {
1168 !!!cp (13);
1169 $self->{state} = PCDATA_STATE;
1170 } else {
1171 !!!cp (14);
1172 ## Stay in the state.
1173 }
1174 !!!next-input-character;
1175 !!!emit ($token);
1176 redo A;
1177 } elsif ($self->{state} == TAG_OPEN_STATE) {
1178 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1179 if ($self->{nc} == 0x002F) { # /
1180 !!!cp (15);
1181 !!!next-input-character;
1182 $self->{state} = CLOSE_TAG_OPEN_STATE;
1183 redo A;
1184 } elsif ($self->{nc} == 0x0021) { # !
1185 !!!cp (15.1);
1186 $self->{s_kwd} = '<' unless $self->{escape};
1187 #
1188 } else {
1189 !!!cp (16);
1190 #
1191 }
1192
1193 ## reconsume
1194 $self->{state} = DATA_STATE;
1195 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1196 line => $self->{line_prev},
1197 column => $self->{column_prev},
1198 });
1199 redo A;
1200 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
1201 if ($self->{nc} == 0x0021) { # !
1202 !!!cp (17);
1203 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
1204 !!!next-input-character;
1205 redo A;
1206 } elsif ($self->{nc} == 0x002F) { # /
1207 !!!cp (18);
1208 $self->{state} = CLOSE_TAG_OPEN_STATE;
1209 !!!next-input-character;
1210 redo A;
1211 } elsif (0x0041 <= $self->{nc} and
1212 $self->{nc} <= 0x005A) { # A..Z
1213 !!!cp (19);
1214 $self->{ct}
1215 = {type => START_TAG_TOKEN,
1216 tag_name => chr ($self->{nc} + 0x0020),
1217 line => $self->{line_prev},
1218 column => $self->{column_prev}};
1219 $self->{state} = TAG_NAME_STATE;
1220 !!!next-input-character;
1221 redo A;
1222 } elsif (0x0061 <= $self->{nc} and
1223 $self->{nc} <= 0x007A) { # a..z
1224 !!!cp (20);
1225 $self->{ct} = {type => START_TAG_TOKEN,
1226 tag_name => chr ($self->{nc}),
1227 line => $self->{line_prev},
1228 column => $self->{column_prev}};
1229 $self->{state} = TAG_NAME_STATE;
1230 !!!next-input-character;
1231 redo A;
1232 } elsif ($self->{nc} == 0x003E) { # >
1233 !!!cp (21);
1234 !!!parse-error (type => 'empty start tag',
1235 line => $self->{line_prev},
1236 column => $self->{column_prev});
1237 $self->{state} = DATA_STATE;
1238 !!!next-input-character;
1239
1240 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
1241 line => $self->{line_prev},
1242 column => $self->{column_prev},
1243 });
1244
1245 redo A;
1246 } elsif ($self->{nc} == 0x003F) { # ?
1247 !!!cp (22);
1248 !!!parse-error (type => 'pio',
1249 line => $self->{line_prev},
1250 column => $self->{column_prev});
1251 $self->{state} = BOGUS_COMMENT_STATE;
1252 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1253 line => $self->{line_prev},
1254 column => $self->{column_prev},
1255 };
1256 ## $self->{nc} is intentionally left as is
1257 redo A;
1258 } else {
1259 !!!cp (23);
1260 !!!parse-error (type => 'bare stago',
1261 line => $self->{line_prev},
1262 column => $self->{column_prev});
1263 $self->{state} = DATA_STATE;
1264 ## reconsume
1265
1266 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1267 line => $self->{line_prev},
1268 column => $self->{column_prev},
1269 });
1270
1271 redo A;
1272 }
1273 } else {
1274 die "$0: $self->{content_model} in tag open";
1275 }
1276 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
1277 ## NOTE: The "close tag open state" in the spec is implemented as
1278 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
1279
1280 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
1281 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1282 if (defined $self->{last_stag_name}) {
1283 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
1284 $self->{s_kwd} = '';
1285 ## Reconsume.
1286 redo A;
1287 } else {
1288 ## No start tag token has ever been emitted
1289 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
1290 !!!cp (28);
1291 $self->{state} = DATA_STATE;
1292 ## Reconsume.
1293 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1294 line => $l, column => $c,
1295 });
1296 redo A;
1297 }
1298 }
1299
1300 if (0x0041 <= $self->{nc} and
1301 $self->{nc} <= 0x005A) { # A..Z
1302 !!!cp (29);
1303 $self->{ct}
1304 = {type => END_TAG_TOKEN,
1305 tag_name => chr ($self->{nc} + 0x0020),
1306 line => $l, column => $c};
1307 $self->{state} = TAG_NAME_STATE;
1308 !!!next-input-character;
1309 redo A;
1310 } elsif (0x0061 <= $self->{nc} and
1311 $self->{nc} <= 0x007A) { # a..z
1312 !!!cp (30);
1313 $self->{ct} = {type => END_TAG_TOKEN,
1314 tag_name => chr ($self->{nc}),
1315 line => $l, column => $c};
1316 $self->{state} = TAG_NAME_STATE;
1317 !!!next-input-character;
1318 redo A;
1319 } elsif ($self->{nc} == 0x003E) { # >
1320 !!!cp (31);
1321 !!!parse-error (type => 'empty end tag',
1322 line => $self->{line_prev}, ## "<" in "</>"
1323 column => $self->{column_prev} - 1);
1324 $self->{state} = DATA_STATE;
1325 !!!next-input-character;
1326 redo A;
1327 } elsif ($self->{nc} == -1) {
1328 !!!cp (32);
1329 !!!parse-error (type => 'bare etago');
1330 $self->{state} = DATA_STATE;
1331 # reconsume
1332
1333 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1334 line => $l, column => $c,
1335 });
1336
1337 redo A;
1338 } else {
1339 !!!cp (33);
1340 !!!parse-error (type => 'bogus end tag');
1341 $self->{state} = BOGUS_COMMENT_STATE;
1342 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1343 line => $self->{line_prev}, # "<" of "</"
1344 column => $self->{column_prev} - 1,
1345 };
1346 ## NOTE: $self->{nc} is intentionally left as is.
1347 ## Although the "anything else" case of the spec not explicitly
1348 ## states that the next input character is to be reconsumed,
1349 ## it will be included to the |data| of the comment token
1350 ## generated from the bogus end tag, as defined in the
1351 ## "bogus comment state" entry.
1352 redo A;
1353 }
1354 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
1355 my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
1356 if (length $ch) {
1357 my $CH = $ch;
1358 $ch =~ tr/a-z/A-Z/;
1359 my $nch = chr $self->{nc};
1360 if ($nch eq $ch or $nch eq $CH) {
1361 !!!cp (24);
1362 ## Stay in the state.
1363 $self->{s_kwd} .= $nch;
1364 !!!next-input-character;
1365 redo A;
1366 } else {
1367 !!!cp (25);
1368 $self->{state} = DATA_STATE;
1369 ## Reconsume.
1370 !!!emit ({type => CHARACTER_TOKEN,
1371 data => '</' . $self->{s_kwd},
1372 line => $self->{line_prev},
1373 column => $self->{column_prev} - 1 - length $self->{s_kwd},
1374 });
1375 redo A;
1376 }
1377 } else { # after "<{tag-name}"
1378 unless ($is_space->{$self->{nc}} or
1379 {
1380 0x003E => 1, # >
1381 0x002F => 1, # /
1382 -1 => 1, # EOF
1383 }->{$self->{nc}}) {
1384 !!!cp (26);
1385 ## Reconsume.
1386 $self->{state} = DATA_STATE;
1387 !!!emit ({type => CHARACTER_TOKEN,
1388 data => '</' . $self->{s_kwd},
1389 line => $self->{line_prev},
1390 column => $self->{column_prev} - 1 - length $self->{s_kwd},
1391 });
1392 redo A;
1393 } else {
1394 !!!cp (27);
1395 $self->{ct}
1396 = {type => END_TAG_TOKEN,
1397 tag_name => $self->{last_stag_name},
1398 line => $self->{line_prev},
1399 column => $self->{column_prev} - 1 - length $self->{s_kwd}};
1400 $self->{state} = TAG_NAME_STATE;
1401 ## Reconsume.
1402 redo A;
1403 }
1404 }
1405 } elsif ($self->{state} == TAG_NAME_STATE) {
1406 if ($is_space->{$self->{nc}}) {
1407 !!!cp (34);
1408 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1409 !!!next-input-character;
1410 redo A;
1411 } elsif ($self->{nc} == 0x003E) { # >
1412 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1413 !!!cp (35);
1414 $self->{last_stag_name} = $self->{ct}->{tag_name};
1415 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1416 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1417 #if ($self->{ct}->{attributes}) {
1418 # ## NOTE: This should never be reached.
1419 # !!! cp (36);
1420 # !!! parse-error (type => 'end tag attribute');
1421 #} else {
1422 !!!cp (37);
1423 #}
1424 } else {
1425 die "$0: $self->{ct}->{type}: Unknown token type";
1426 }
1427 $self->{state} = DATA_STATE;
1428 !!!next-input-character;
1429
1430 !!!emit ($self->{ct}); # start tag or end tag
1431
1432 redo A;
1433 } elsif (0x0041 <= $self->{nc} and
1434 $self->{nc} <= 0x005A) { # A..Z
1435 !!!cp (38);
1436 $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);
1437 # start tag or end tag
1438 ## Stay in this state
1439 !!!next-input-character;
1440 redo A;
1441 } elsif ($self->{nc} == -1) {
1442 !!!parse-error (type => 'unclosed tag');
1443 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1444 !!!cp (39);
1445 $self->{last_stag_name} = $self->{ct}->{tag_name};
1446 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1447 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1448 #if ($self->{ct}->{attributes}) {
1449 # ## NOTE: This state should never be reached.
1450 # !!! cp (40);
1451 # !!! parse-error (type => 'end tag attribute');
1452 #} else {
1453 !!!cp (41);
1454 #}
1455 } else {
1456 die "$0: $self->{ct}->{type}: Unknown token type";
1457 }
1458 $self->{state} = DATA_STATE;
1459 # reconsume
1460
1461 !!!emit ($self->{ct}); # start tag or end tag
1462
1463 redo A;
1464 } elsif ($self->{nc} == 0x002F) { # /
1465 !!!cp (42);
1466 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1467 !!!next-input-character;
1468 redo A;
1469 } else {
1470 !!!cp (44);
1471 $self->{ct}->{tag_name} .= chr $self->{nc};
1472 # start tag or end tag
1473 ## Stay in the state
1474 !!!next-input-character;
1475 redo A;
1476 }
1477 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1478 if ($is_space->{$self->{nc}}) {
1479 !!!cp (45);
1480 ## Stay in the state
1481 !!!next-input-character;
1482 redo A;
1483 } elsif ($self->{nc} == 0x003E) { # >
1484 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1485 !!!cp (46);
1486 $self->{last_stag_name} = $self->{ct}->{tag_name};
1487 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1488 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1489 if ($self->{ct}->{attributes}) {
1490 !!!cp (47);
1491 !!!parse-error (type => 'end tag attribute');
1492 } else {
1493 !!!cp (48);
1494 }
1495 } else {
1496 die "$0: $self->{ct}->{type}: Unknown token type";
1497 }
1498 $self->{state} = DATA_STATE;
1499 !!!next-input-character;
1500
1501 !!!emit ($self->{ct}); # start tag or end tag
1502
1503 redo A;
1504 } elsif (0x0041 <= $self->{nc} and
1505 $self->{nc} <= 0x005A) { # A..Z
1506 !!!cp (49);
1507 $self->{ca}
1508 = {name => chr ($self->{nc} + 0x0020),
1509 value => '',
1510 line => $self->{line}, column => $self->{column}};
1511 $self->{state} = ATTRIBUTE_NAME_STATE;
1512 !!!next-input-character;
1513 redo A;
1514 } elsif ($self->{nc} == 0x002F) { # /
1515 !!!cp (50);
1516 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1517 !!!next-input-character;
1518 redo A;
1519 } elsif ($self->{nc} == -1) {
1520 !!!parse-error (type => 'unclosed tag');
1521 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1522 !!!cp (52);
1523 $self->{last_stag_name} = $self->{ct}->{tag_name};
1524 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1525 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1526 if ($self->{ct}->{attributes}) {
1527 !!!cp (53);
1528 !!!parse-error (type => 'end tag attribute');
1529 } else {
1530 !!!cp (54);
1531 }
1532 } else {
1533 die "$0: $self->{ct}->{type}: Unknown token type";
1534 }
1535 $self->{state} = DATA_STATE;
1536 # reconsume
1537
1538 !!!emit ($self->{ct}); # start tag or end tag
1539
1540 redo A;
1541 } else {
1542 if ({
1543 0x0022 => 1, # "
1544 0x0027 => 1, # '
1545 0x003D => 1, # =
1546 }->{$self->{nc}}) {
1547 !!!cp (55);
1548 !!!parse-error (type => 'bad attribute name');
1549 } else {
1550 !!!cp (56);
1551 }
1552 $self->{ca}
1553 = {name => chr ($self->{nc}),
1554 value => '',
1555 line => $self->{line}, column => $self->{column}};
1556 $self->{state} = ATTRIBUTE_NAME_STATE;
1557 !!!next-input-character;
1558 redo A;
1559 }
1560 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1561 my $before_leave = sub {
1562 if (exists $self->{ct}->{attributes} # start tag or end tag
1563 ->{$self->{ca}->{name}}) { # MUST
1564 !!!cp (57);
1565 !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1566 ## Discard $self->{ca} # MUST
1567 } else {
1568 !!!cp (58);
1569 $self->{ct}->{attributes}->{$self->{ca}->{name}}
1570 = $self->{ca};
1571 }
1572 }; # $before_leave
1573
1574 if ($is_space->{$self->{nc}}) {
1575 !!!cp (59);
1576 $before_leave->();
1577 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1578 !!!next-input-character;
1579 redo A;
1580 } elsif ($self->{nc} == 0x003D) { # =
1581 !!!cp (60);
1582 $before_leave->();
1583 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1584 !!!next-input-character;
1585 redo A;
1586 } elsif ($self->{nc} == 0x003E) { # >
1587 $before_leave->();
1588 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1589 !!!cp (61);
1590 $self->{last_stag_name} = $self->{ct}->{tag_name};
1591 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1592 !!!cp (62);
1593 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1594 if ($self->{ct}->{attributes}) {
1595 !!!parse-error (type => 'end tag attribute');
1596 }
1597 } else {
1598 die "$0: $self->{ct}->{type}: Unknown token type";
1599 }
1600 $self->{state} = DATA_STATE;
1601 !!!next-input-character;
1602
1603 !!!emit ($self->{ct}); # start tag or end tag
1604
1605 redo A;
1606 } elsif (0x0041 <= $self->{nc} and
1607 $self->{nc} <= 0x005A) { # A..Z
1608 !!!cp (63);
1609 $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);
1610 ## Stay in the state
1611 !!!next-input-character;
1612 redo A;
1613 } elsif ($self->{nc} == 0x002F) { # /
1614 !!!cp (64);
1615 $before_leave->();
1616 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1617 !!!next-input-character;
1618 redo A;
1619 } elsif ($self->{nc} == -1) {
1620 !!!parse-error (type => 'unclosed tag');
1621 $before_leave->();
1622 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1623 !!!cp (66);
1624 $self->{last_stag_name} = $self->{ct}->{tag_name};
1625 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1626 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1627 if ($self->{ct}->{attributes}) {
1628 !!!cp (67);
1629 !!!parse-error (type => 'end tag attribute');
1630 } else {
1631 ## NOTE: This state should never be reached.
1632 !!!cp (68);
1633 }
1634 } else {
1635 die "$0: $self->{ct}->{type}: Unknown token type";
1636 }
1637 $self->{state} = DATA_STATE;
1638 # reconsume
1639
1640 !!!emit ($self->{ct}); # start tag or end tag
1641
1642 redo A;
1643 } else {
1644 if ($self->{nc} == 0x0022 or # "
1645 $self->{nc} == 0x0027) { # '
1646 !!!cp (69);
1647 !!!parse-error (type => 'bad attribute name');
1648 } else {
1649 !!!cp (70);
1650 }
1651 $self->{ca}->{name} .= chr ($self->{nc});
1652 ## Stay in the state
1653 !!!next-input-character;
1654 redo A;
1655 }
1656 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1657 if ($is_space->{$self->{nc}}) {
1658 !!!cp (71);
1659 ## Stay in the state
1660 !!!next-input-character;
1661 redo A;
1662 } elsif ($self->{nc} == 0x003D) { # =
1663 !!!cp (72);
1664 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1665 !!!next-input-character;
1666 redo A;
1667 } elsif ($self->{nc} == 0x003E) { # >
1668 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1669 !!!cp (73);
1670 $self->{last_stag_name} = $self->{ct}->{tag_name};
1671 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1672 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1673 if ($self->{ct}->{attributes}) {
1674 !!!cp (74);
1675 !!!parse-error (type => 'end tag attribute');
1676 } else {
1677 ## NOTE: This state should never be reached.
1678 !!!cp (75);
1679 }
1680 } else {
1681 die "$0: $self->{ct}->{type}: Unknown token type";
1682 }
1683 $self->{state} = DATA_STATE;
1684 !!!next-input-character;
1685
1686 !!!emit ($self->{ct}); # start tag or end tag
1687
1688 redo A;
1689 } elsif (0x0041 <= $self->{nc} and
1690 $self->{nc} <= 0x005A) { # A..Z
1691 !!!cp (76);
1692 $self->{ca}
1693 = {name => chr ($self->{nc} + 0x0020),
1694 value => '',
1695 line => $self->{line}, column => $self->{column}};
1696 $self->{state} = ATTRIBUTE_NAME_STATE;
1697 !!!next-input-character;
1698 redo A;
1699 } elsif ($self->{nc} == 0x002F) { # /
1700 !!!cp (77);
1701 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1702 !!!next-input-character;
1703 redo A;
1704 } elsif ($self->{nc} == -1) {
1705 !!!parse-error (type => 'unclosed tag');
1706 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1707 !!!cp (79);
1708 $self->{last_stag_name} = $self->{ct}->{tag_name};
1709 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1710 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1711 if ($self->{ct}->{attributes}) {
1712 !!!cp (80);
1713 !!!parse-error (type => 'end tag attribute');
1714 } else {
1715 ## NOTE: This state should never be reached.
1716 !!!cp (81);
1717 }
1718 } else {
1719 die "$0: $self->{ct}->{type}: Unknown token type";
1720 }
1721 $self->{state} = DATA_STATE;
1722 # reconsume
1723
1724 !!!emit ($self->{ct}); # start tag or end tag
1725
1726 redo A;
1727 } else {
1728 if ($self->{nc} == 0x0022 or # "
1729 $self->{nc} == 0x0027) { # '
1730 !!!cp (78);
1731 !!!parse-error (type => 'bad attribute name');
1732 } else {
1733 !!!cp (82);
1734 }
1735 $self->{ca}
1736 = {name => chr ($self->{nc}),
1737 value => '',
1738 line => $self->{line}, column => $self->{column}};
1739 $self->{state} = ATTRIBUTE_NAME_STATE;
1740 !!!next-input-character;
1741 redo A;
1742 }
1743 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1744 if ($is_space->{$self->{nc}}) {
1745 !!!cp (83);
1746 ## Stay in the state
1747 !!!next-input-character;
1748 redo A;
1749 } elsif ($self->{nc} == 0x0022) { # "
1750 !!!cp (84);
1751 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1752 !!!next-input-character;
1753 redo A;
1754 } elsif ($self->{nc} == 0x0026) { # &
1755 !!!cp (85);
1756 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1757 ## reconsume
1758 redo A;
1759 } elsif ($self->{nc} == 0x0027) { # '
1760 !!!cp (86);
1761 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1762 !!!next-input-character;
1763 redo A;
1764 } elsif ($self->{nc} == 0x003E) { # >
1765 !!!parse-error (type => 'empty unquoted attribute value');
1766 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1767 !!!cp (87);
1768 $self->{last_stag_name} = $self->{ct}->{tag_name};
1769 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1770 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1771 if ($self->{ct}->{attributes}) {
1772 !!!cp (88);
1773 !!!parse-error (type => 'end tag attribute');
1774 } else {
1775 ## NOTE: This state should never be reached.
1776 !!!cp (89);
1777 }
1778 } else {
1779 die "$0: $self->{ct}->{type}: Unknown token type";
1780 }
1781 $self->{state} = DATA_STATE;
1782 !!!next-input-character;
1783
1784 !!!emit ($self->{ct}); # start tag or end tag
1785
1786 redo A;
1787 } elsif ($self->{nc} == -1) {
1788 !!!parse-error (type => 'unclosed tag');
1789 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1790 !!!cp (90);
1791 $self->{last_stag_name} = $self->{ct}->{tag_name};
1792 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1793 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1794 if ($self->{ct}->{attributes}) {
1795 !!!cp (91);
1796 !!!parse-error (type => 'end tag attribute');
1797 } else {
1798 ## NOTE: This state should never be reached.
1799 !!!cp (92);
1800 }
1801 } else {
1802 die "$0: $self->{ct}->{type}: Unknown token type";
1803 }
1804 $self->{state} = DATA_STATE;
1805 ## reconsume
1806
1807 !!!emit ($self->{ct}); # start tag or end tag
1808
1809 redo A;
1810 } else {
1811 if ($self->{nc} == 0x003D) { # =
1812 !!!cp (93);
1813 !!!parse-error (type => 'bad attribute value');
1814 } else {
1815 !!!cp (94);
1816 }
1817 $self->{ca}->{value} .= chr ($self->{nc});
1818 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1819 !!!next-input-character;
1820 redo A;
1821 }
1822 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1823 if ($self->{nc} == 0x0022) { # "
1824 !!!cp (95);
1825 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1826 !!!next-input-character;
1827 redo A;
1828 } elsif ($self->{nc} == 0x0026) { # &
1829 !!!cp (96);
1830 ## NOTE: In the spec, the tokenizer is switched to the
1831 ## "entity in attribute value state". In this implementation, the
1832 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1833 ## implementation of the "consume a character reference" algorithm.
1834 $self->{prev_state} = $self->{state};
1835 $self->{entity_add} = 0x0022; # "
1836 $self->{state} = ENTITY_STATE;
1837 !!!next-input-character;
1838 redo A;
1839 } elsif ($self->{nc} == -1) {
1840 !!!parse-error (type => 'unclosed attribute value');
1841 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1842 !!!cp (97);
1843 $self->{last_stag_name} = $self->{ct}->{tag_name};
1844 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1845 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1846 if ($self->{ct}->{attributes}) {
1847 !!!cp (98);
1848 !!!parse-error (type => 'end tag attribute');
1849 } else {
1850 ## NOTE: This state should never be reached.
1851 !!!cp (99);
1852 }
1853 } else {
1854 die "$0: $self->{ct}->{type}: Unknown token type";
1855 }
1856 $self->{state} = DATA_STATE;
1857 ## reconsume
1858
1859 !!!emit ($self->{ct}); # start tag or end tag
1860
1861 redo A;
1862 } else {
1863 !!!cp (100);
1864 $self->{ca}->{value} .= chr ($self->{nc});
1865 $self->{read_until}->($self->{ca}->{value},
1866 q["&],
1867 length $self->{ca}->{value});
1868
1869 ## Stay in the state
1870 !!!next-input-character;
1871 redo A;
1872 }
1873 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1874 if ($self->{nc} == 0x0027) { # '
1875 !!!cp (101);
1876 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1877 !!!next-input-character;
1878 redo A;
1879 } elsif ($self->{nc} == 0x0026) { # &
1880 !!!cp (102);
1881 ## NOTE: In the spec, the tokenizer is switched to the
1882 ## "entity in attribute value state". In this implementation, the
1883 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1884 ## implementation of the "consume a character reference" algorithm.
1885 $self->{entity_add} = 0x0027; # '
1886 $self->{prev_state} = $self->{state};
1887 $self->{state} = ENTITY_STATE;
1888 !!!next-input-character;
1889 redo A;
1890 } elsif ($self->{nc} == -1) {
1891 !!!parse-error (type => 'unclosed attribute value');
1892 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1893 !!!cp (103);
1894 $self->{last_stag_name} = $self->{ct}->{tag_name};
1895 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1896 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1897 if ($self->{ct}->{attributes}) {
1898 !!!cp (104);
1899 !!!parse-error (type => 'end tag attribute');
1900 } else {
1901 ## NOTE: This state should never be reached.
1902 !!!cp (105);
1903 }
1904 } else {
1905 die "$0: $self->{ct}->{type}: Unknown token type";
1906 }
1907 $self->{state} = DATA_STATE;
1908 ## reconsume
1909
1910 !!!emit ($self->{ct}); # start tag or end tag
1911
1912 redo A;
1913 } else {
1914 !!!cp (106);
1915 $self->{ca}->{value} .= chr ($self->{nc});
1916 $self->{read_until}->($self->{ca}->{value},
1917 q['&],
1918 length $self->{ca}->{value});
1919
1920 ## Stay in the state
1921 !!!next-input-character;
1922 redo A;
1923 }
1924 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1925 if ($is_space->{$self->{nc}}) {
1926 !!!cp (107);
1927 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1928 !!!next-input-character;
1929 redo A;
1930 } elsif ($self->{nc} == 0x0026) { # &
1931 !!!cp (108);
1932 ## NOTE: In the spec, the tokenizer is switched to the
1933 ## "entity in attribute value state". In this implementation, the
1934 ## tokenizer is switched to the |ENTITY_STATE|, which is an
1935 ## implementation of the "consume a character reference" algorithm.
1936 $self->{entity_add} = -1;
1937 $self->{prev_state} = $self->{state};
1938 $self->{state} = ENTITY_STATE;
1939 !!!next-input-character;
1940 redo A;
1941 } elsif ($self->{nc} == 0x003E) { # >
1942 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1943 !!!cp (109);
1944 $self->{last_stag_name} = $self->{ct}->{tag_name};
1945 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1946 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1947 if ($self->{ct}->{attributes}) {
1948 !!!cp (110);
1949 !!!parse-error (type => 'end tag attribute');
1950 } else {
1951 ## NOTE: This state should never be reached.
1952 !!!cp (111);
1953 }
1954 } else {
1955 die "$0: $self->{ct}->{type}: Unknown token type";
1956 }
1957 $self->{state} = DATA_STATE;
1958 !!!next-input-character;
1959
1960 !!!emit ($self->{ct}); # start tag or end tag
1961
1962 redo A;
1963 } elsif ($self->{nc} == -1) {
1964 !!!parse-error (type => 'unclosed tag');
1965 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1966 !!!cp (112);
1967 $self->{last_stag_name} = $self->{ct}->{tag_name};
1968 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1969 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1970 if ($self->{ct}->{attributes}) {
1971 !!!cp (113);
1972 !!!parse-error (type => 'end tag attribute');
1973 } else {
1974 ## NOTE: This state should never be reached.
1975 !!!cp (114);
1976 }
1977 } else {
1978 die "$0: $self->{ct}->{type}: Unknown token type";
1979 }
1980 $self->{state} = DATA_STATE;
1981 ## reconsume
1982
1983 !!!emit ($self->{ct}); # start tag or end tag
1984
1985 redo A;
1986 } else {
1987 if ({
1988 0x0022 => 1, # "
1989 0x0027 => 1, # '
1990 0x003D => 1, # =
1991 }->{$self->{nc}}) {
1992 !!!cp (115);
1993 !!!parse-error (type => 'bad attribute value');
1994 } else {
1995 !!!cp (116);
1996 }
1997 $self->{ca}->{value} .= chr ($self->{nc});
1998 $self->{read_until}->($self->{ca}->{value},
1999 q["'=& >],
2000 length $self->{ca}->{value});
2001
2002 ## Stay in the state
2003 !!!next-input-character;
2004 redo A;
2005 }
2006 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2007 if ($is_space->{$self->{nc}}) {
2008 !!!cp (118);
2009 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2010 !!!next-input-character;
2011 redo A;
2012 } elsif ($self->{nc} == 0x003E) { # >
2013 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2014 !!!cp (119);
2015 $self->{last_stag_name} = $self->{ct}->{tag_name};
2016 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2017 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2018 if ($self->{ct}->{attributes}) {
2019 !!!cp (120);
2020 !!!parse-error (type => 'end tag attribute');
2021 } else {
2022 ## NOTE: This state should never be reached.
2023 !!!cp (121);
2024 }
2025 } else {
2026 die "$0: $self->{ct}->{type}: Unknown token type";
2027 }
2028 $self->{state} = DATA_STATE;
2029 !!!next-input-character;
2030
2031 !!!emit ($self->{ct}); # start tag or end tag
2032
2033 redo A;
2034 } elsif ($self->{nc} == 0x002F) { # /
2035 !!!cp (122);
2036 $self->{state} = SELF_CLOSING_START_TAG_STATE;
2037 !!!next-input-character;
2038 redo A;
2039 } elsif ($self->{nc} == -1) {
2040 !!!parse-error (type => 'unclosed tag');
2041 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2042 !!!cp (122.3);
2043 $self->{last_stag_name} = $self->{ct}->{tag_name};
2044 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2045 if ($self->{ct}->{attributes}) {
2046 !!!cp (122.1);
2047 !!!parse-error (type => 'end tag attribute');
2048 } else {
2049 ## NOTE: This state should never be reached.
2050 !!!cp (122.2);
2051 }
2052 } else {
2053 die "$0: $self->{ct}->{type}: Unknown token type";
2054 }
2055 $self->{state} = DATA_STATE;
2056 ## Reconsume.
2057 !!!emit ($self->{ct}); # start tag or end tag
2058 redo A;
2059 } else {
2060 !!!cp ('124.1');
2061 !!!parse-error (type => 'no space between attributes');
2062 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2063 ## reconsume
2064 redo A;
2065 }
2066 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2067 if ($self->{nc} == 0x003E) { # >
2068 if ($self->{ct}->{type} == END_TAG_TOKEN) {
2069 !!!cp ('124.2');
2070 !!!parse-error (type => 'nestc', token => $self->{ct});
2071 ## TODO: Different type than slash in start tag
2072 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2073 if ($self->{ct}->{attributes}) {
2074 !!!cp ('124.4');
2075 !!!parse-error (type => 'end tag attribute');
2076 } else {
2077 !!!cp ('124.5');
2078 }
2079 ## TODO: Test |<title></title/>|
2080 } else {
2081 !!!cp ('124.3');
2082 $self->{self_closing} = 1;
2083 }
2084
2085 $self->{state} = DATA_STATE;
2086 !!!next-input-character;
2087
2088 !!!emit ($self->{ct}); # start tag or end tag
2089
2090 redo A;
2091 } elsif ($self->{nc} == -1) {
2092 !!!parse-error (type => 'unclosed tag');
2093 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2094 !!!cp (124.7);
2095 $self->{last_stag_name} = $self->{ct}->{tag_name};
2096 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2097 if ($self->{ct}->{attributes}) {
2098 !!!cp (124.5);
2099 !!!parse-error (type => 'end tag attribute');
2100 } else {
2101 ## NOTE: This state should never be reached.
2102 !!!cp (124.6);
2103 }
2104 } else {
2105 die "$0: $self->{ct}->{type}: Unknown token type";
2106 }
2107 $self->{state} = DATA_STATE;
2108 ## Reconsume.
2109 !!!emit ($self->{ct}); # start tag or end tag
2110 redo A;
2111 } else {
2112 !!!cp ('124.4');
2113 !!!parse-error (type => 'nestc');
2114 ## TODO: This error type is wrong.
2115 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2116 ## Reconsume.
2117 redo A;
2118 }
2119 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2120 ## (only happen if PCDATA state)
2121
2122 ## NOTE: Unlike the spec's "bogus comment state", this implementation
2123 ## consumes characters on a one-by-one basis.
2124
2125 if ($self->{nc} == 0x003E) { # >
2126 !!!cp (124);
2127 $self->{state} = DATA_STATE;
2128 !!!next-input-character;
2129
2130 !!!emit ($self->{ct}); # comment
2131 redo A;
2132 } elsif ($self->{nc} == -1) {
2133 !!!cp (125);
2134 $self->{state} = DATA_STATE;
2135 ## reconsume
2136
2137 !!!emit ($self->{ct}); # comment
2138 redo A;
2139 } else {
2140 !!!cp (126);
2141 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2142 $self->{read_until}->($self->{ct}->{data},
2143 q[>],
2144 length $self->{ct}->{data});
2145
2146 ## Stay in the state.
2147 !!!next-input-character;
2148 redo A;
2149 }
2150 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2151 ## (only happen if PCDATA state)
2152
2153 if ($self->{nc} == 0x002D) { # -
2154 !!!cp (133);
2155 $self->{state} = MD_HYPHEN_STATE;
2156 !!!next-input-character;
2157 redo A;
2158 } elsif ($self->{nc} == 0x0044 or # D
2159 $self->{nc} == 0x0064) { # d
2160 ## ASCII case-insensitive.
2161 !!!cp (130);
2162 $self->{state} = MD_DOCTYPE_STATE;
2163 $self->{s_kwd} = chr $self->{nc};
2164 !!!next-input-character;
2165 redo A;
2166 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2167 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
2168 $self->{nc} == 0x005B) { # [
2169 !!!cp (135.4);
2170 $self->{state} = MD_CDATA_STATE;
2171 $self->{s_kwd} = '[';
2172 !!!next-input-character;
2173 redo A;
2174 } else {
2175 !!!cp (136);
2176 }
2177
2178 !!!parse-error (type => 'bogus comment',
2179 line => $self->{line_prev},
2180 column => $self->{column_prev} - 1);
2181 ## Reconsume.
2182 $self->{state} = BOGUS_COMMENT_STATE;
2183 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2184 line => $self->{line_prev},
2185 column => $self->{column_prev} - 1,
2186 };
2187 redo A;
2188 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2189 if ($self->{nc} == 0x002D) { # -
2190 !!!cp (127);
2191 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2192 line => $self->{line_prev},
2193 column => $self->{column_prev} - 2,
2194 };
2195 $self->{state} = COMMENT_START_STATE;
2196 !!!next-input-character;
2197 redo A;
2198 } else {
2199 !!!cp (128);
2200 !!!parse-error (type => 'bogus comment',
2201 line => $self->{line_prev},
2202 column => $self->{column_prev} - 2);
2203 $self->{state} = BOGUS_COMMENT_STATE;
2204 ## Reconsume.
2205 $self->{ct} = {type => COMMENT_TOKEN,
2206 data => '-',
2207 line => $self->{line_prev},
2208 column => $self->{column_prev} - 2,
2209 };
2210 redo A;
2211 }
2212 } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2213 ## ASCII case-insensitive.
2214 if ($self->{nc} == [
2215 undef,
2216 0x004F, # O
2217 0x0043, # C
2218 0x0054, # T
2219 0x0059, # Y
2220 0x0050, # P
2221 ]->[length $self->{s_kwd}] or
2222 $self->{nc} == [
2223 undef,
2224 0x006F, # o
2225 0x0063, # c
2226 0x0074, # t
2227 0x0079, # y
2228 0x0070, # p
2229 ]->[length $self->{s_kwd}]) {
2230 !!!cp (131);
2231 ## Stay in the state.
2232 $self->{s_kwd} .= chr $self->{nc};
2233 !!!next-input-character;
2234 redo A;
2235 } elsif ((length $self->{s_kwd}) == 6 and
2236 ($self->{nc} == 0x0045 or # E
2237 $self->{nc} == 0x0065)) { # e
2238 !!!cp (129);
2239 $self->{state} = DOCTYPE_STATE;
2240 $self->{ct} = {type => DOCTYPE_TOKEN,
2241 quirks => 1,
2242 line => $self->{line_prev},
2243 column => $self->{column_prev} - 7,
2244 };
2245 !!!next-input-character;
2246 redo A;
2247 } else {
2248 !!!cp (132);
2249 !!!parse-error (type => 'bogus comment',
2250 line => $self->{line_prev},
2251 column => $self->{column_prev} - 1 - length $self->{s_kwd});
2252 $self->{state} = BOGUS_COMMENT_STATE;
2253 ## Reconsume.
2254 $self->{ct} = {type => COMMENT_TOKEN,
2255 data => $self->{s_kwd},
2256 line => $self->{line_prev},
2257 column => $self->{column_prev} - 1 - length $self->{s_kwd},
2258 };
2259 redo A;
2260 }
2261 } elsif ($self->{state} == MD_CDATA_STATE) {
2262 if ($self->{nc} == {
2263 '[' => 0x0043, # C
2264 '[C' => 0x0044, # D
2265 '[CD' => 0x0041, # A
2266 '[CDA' => 0x0054, # T
2267 '[CDAT' => 0x0041, # A
2268 }->{$self->{s_kwd}}) {
2269 !!!cp (135.1);
2270 ## Stay in the state.
2271 $self->{s_kwd} .= chr $self->{nc};
2272 !!!next-input-character;
2273 redo A;
2274 } elsif ($self->{s_kwd} eq '[CDATA' and
2275 $self->{nc} == 0x005B) { # [
2276 !!!cp (135.2);
2277 $self->{ct} = {type => CHARACTER_TOKEN,
2278 data => '',
2279 line => $self->{line_prev},
2280 column => $self->{column_prev} - 7};
2281 $self->{state} = CDATA_SECTION_STATE;
2282 !!!next-input-character;
2283 redo A;
2284 } else {
2285 !!!cp (135.3);
2286 !!!parse-error (type => 'bogus comment',
2287 line => $self->{line_prev},
2288 column => $self->{column_prev} - 1 - length $self->{s_kwd});
2289 $self->{state} = BOGUS_COMMENT_STATE;
2290 ## Reconsume.
2291 $self->{ct} = {type => COMMENT_TOKEN,
2292 data => $self->{s_kwd},
2293 line => $self->{line_prev},
2294 column => $self->{column_prev} - 1 - length $self->{s_kwd},
2295 };
2296 redo A;
2297 }
2298 } elsif ($self->{state} == COMMENT_START_STATE) {
2299 if ($self->{nc} == 0x002D) { # -
2300 !!!cp (137);
2301 $self->{state} = COMMENT_START_DASH_STATE;
2302 !!!next-input-character;
2303 redo A;
2304 } elsif ($self->{nc} == 0x003E) { # >
2305 !!!cp (138);
2306 !!!parse-error (type => 'bogus comment');
2307 $self->{state} = DATA_STATE;
2308 !!!next-input-character;
2309
2310 !!!emit ($self->{ct}); # comment
2311
2312 redo A;
2313 } elsif ($self->{nc} == -1) {
2314 !!!cp (139);
2315 !!!parse-error (type => 'unclosed comment');
2316 $self->{state} = DATA_STATE;
2317 ## reconsume
2318
2319 !!!emit ($self->{ct}); # comment
2320
2321 redo A;
2322 } else {
2323 !!!cp (140);
2324 $self->{ct}->{data} # comment
2325 .= chr ($self->{nc});
2326 $self->{state} = COMMENT_STATE;
2327 !!!next-input-character;
2328 redo A;
2329 }
2330 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2331 if ($self->{nc} == 0x002D) { # -
2332 !!!cp (141);
2333 $self->{state} = COMMENT_END_STATE;
2334 !!!next-input-character;
2335 redo A;
2336 } elsif ($self->{nc} == 0x003E) { # >
2337 !!!cp (142);
2338 !!!parse-error (type => 'bogus comment');
2339 $self->{state} = DATA_STATE;
2340 !!!next-input-character;
2341
2342 !!!emit ($self->{ct}); # comment
2343
2344 redo A;
2345 } elsif ($self->{nc} == -1) {
2346 !!!cp (143);
2347 !!!parse-error (type => 'unclosed comment');
2348 $self->{state} = DATA_STATE;
2349 ## reconsume
2350
2351 !!!emit ($self->{ct}); # comment
2352
2353 redo A;
2354 } else {
2355 !!!cp (144);
2356 $self->{ct}->{data} # comment
2357 .= '-' . chr ($self->{nc});
2358 $self->{state} = COMMENT_STATE;
2359 !!!next-input-character;
2360 redo A;
2361 }
2362 } elsif ($self->{state} == COMMENT_STATE) {
2363 if ($self->{nc} == 0x002D) { # -
2364 !!!cp (145);
2365 $self->{state} = COMMENT_END_DASH_STATE;
2366 !!!next-input-character;
2367 redo A;
2368 } elsif ($self->{nc} == -1) {
2369 !!!cp (146);
2370 !!!parse-error (type => 'unclosed comment');
2371 $self->{state} = DATA_STATE;
2372 ## reconsume
2373
2374 !!!emit ($self->{ct}); # comment
2375
2376 redo A;
2377 } else {
2378 !!!cp (147);
2379 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2380 $self->{read_until}->($self->{ct}->{data},
2381 q[-],
2382 length $self->{ct}->{data});
2383
2384 ## Stay in the state
2385 !!!next-input-character;
2386 redo A;
2387 }
2388 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2389 if ($self->{nc} == 0x002D) { # -
2390 !!!cp (148);
2391 $self->{state} = COMMENT_END_STATE;
2392 !!!next-input-character;
2393 redo A;
2394 } elsif ($self->{nc} == -1) {
2395 !!!cp (149);
2396 !!!parse-error (type => 'unclosed comment');
2397 $self->{state} = DATA_STATE;
2398 ## reconsume
2399
2400 !!!emit ($self->{ct}); # comment
2401
2402 redo A;
2403 } else {
2404 !!!cp (150);
2405 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2406 $self->{state} = COMMENT_STATE;
2407 !!!next-input-character;
2408 redo A;
2409 }
2410 } elsif ($self->{state} == COMMENT_END_STATE) {
2411 if ($self->{nc} == 0x003E) { # >
2412 !!!cp (151);
2413 $self->{state} = DATA_STATE;
2414 !!!next-input-character;
2415
2416 !!!emit ($self->{ct}); # comment
2417
2418 redo A;
2419 } elsif ($self->{nc} == 0x002D) { # -
2420 !!!cp (152);
2421 !!!parse-error (type => 'dash in comment',
2422 line => $self->{line_prev},
2423 column => $self->{column_prev});
2424 $self->{ct}->{data} .= '-'; # comment
2425 ## Stay in the state
2426 !!!next-input-character;
2427 redo A;
2428 } elsif ($self->{nc} == -1) {
2429 !!!cp (153);
2430 !!!parse-error (type => 'unclosed comment');
2431 $self->{state} = DATA_STATE;
2432 ## reconsume
2433
2434 !!!emit ($self->{ct}); # comment
2435
2436 redo A;
2437 } else {
2438 !!!cp (154);
2439 !!!parse-error (type => 'dash in comment',
2440 line => $self->{line_prev},
2441 column => $self->{column_prev});
2442 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2443 $self->{state} = COMMENT_STATE;
2444 !!!next-input-character;
2445 redo A;
2446 }
2447 } elsif ($self->{state} == DOCTYPE_STATE) {
2448 if ($is_space->{$self->{nc}}) {
2449 !!!cp (155);
2450 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2451 !!!next-input-character;
2452 redo A;
2453 } else {
2454 !!!cp (156);
2455 !!!parse-error (type => 'no space before DOCTYPE name');
2456 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2457 ## reconsume
2458 redo A;
2459 }
2460 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2461 if ($is_space->{$self->{nc}}) {
2462 !!!cp (157);
2463 ## Stay in the state
2464 !!!next-input-character;
2465 redo A;
2466 } elsif ($self->{nc} == 0x003E) { # >
2467 !!!cp (158);
2468 !!!parse-error (type => 'no DOCTYPE name');
2469 $self->{state} = DATA_STATE;
2470 !!!next-input-character;
2471
2472 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2473
2474 redo A;
2475 } elsif ($self->{nc} == -1) {
2476 !!!cp (159);
2477 !!!parse-error (type => 'no DOCTYPE name');
2478 $self->{state} = DATA_STATE;
2479 ## reconsume
2480
2481 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2482
2483 redo A;
2484 } else {
2485 !!!cp (160);
2486 $self->{ct}->{name} = chr $self->{nc};
2487 delete $self->{ct}->{quirks};
2488 ## ISSUE: "Set the token's name name to the" in the spec
2489 $self->{state} = DOCTYPE_NAME_STATE;
2490 !!!next-input-character;
2491 redo A;
2492 }
2493 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2494 ## ISSUE: Redundant "First," in the spec.
2495 if ($is_space->{$self->{nc}}) {
2496 !!!cp (161);
2497 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2498 !!!next-input-character;
2499 redo A;
2500 } elsif ($self->{nc} == 0x003E) { # >
2501 !!!cp (162);
2502 $self->{state} = DATA_STATE;
2503 !!!next-input-character;
2504
2505 !!!emit ($self->{ct}); # DOCTYPE
2506
2507 redo A;
2508 } elsif ($self->{nc} == -1) {
2509 !!!cp (163);
2510 !!!parse-error (type => 'unclosed DOCTYPE');
2511 $self->{state} = DATA_STATE;
2512 ## reconsume
2513
2514 $self->{ct}->{quirks} = 1;
2515 !!!emit ($self->{ct}); # DOCTYPE
2516
2517 redo A;
2518 } else {
2519 !!!cp (164);
2520 $self->{ct}->{name}
2521 .= chr ($self->{nc}); # DOCTYPE
2522 ## Stay in the state
2523 !!!next-input-character;
2524 redo A;
2525 }
2526 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2527 if ($is_space->{$self->{nc}}) {
2528 !!!cp (165);
2529 ## Stay in the state
2530 !!!next-input-character;
2531 redo A;
2532 } elsif ($self->{nc} == 0x003E) { # >
2533 !!!cp (166);
2534 $self->{state} = DATA_STATE;
2535 !!!next-input-character;
2536
2537 !!!emit ($self->{ct}); # DOCTYPE
2538
2539 redo A;
2540 } elsif ($self->{nc} == -1) {
2541 !!!cp (167);
2542 !!!parse-error (type => 'unclosed DOCTYPE');
2543 $self->{state} = DATA_STATE;
2544 ## reconsume
2545
2546 $self->{ct}->{quirks} = 1;
2547 !!!emit ($self->{ct}); # DOCTYPE
2548
2549 redo A;
2550 } elsif ($self->{nc} == 0x0050 or # P
2551 $self->{nc} == 0x0070) { # p
2552 $self->{state} = PUBLIC_STATE;
2553 $self->{s_kwd} = chr $self->{nc};
2554 !!!next-input-character;
2555 redo A;
2556 } elsif ($self->{nc} == 0x0053 or # S
2557 $self->{nc} == 0x0073) { # s
2558 $self->{state} = SYSTEM_STATE;
2559 $self->{s_kwd} = chr $self->{nc};
2560 !!!next-input-character;
2561 redo A;
2562 } else {
2563 !!!cp (180);
2564 !!!parse-error (type => 'string after DOCTYPE name');
2565 $self->{ct}->{quirks} = 1;
2566
2567 $self->{state} = BOGUS_DOCTYPE_STATE;
2568 !!!next-input-character;
2569 redo A;
2570 }
2571 } elsif ($self->{state} == PUBLIC_STATE) {
2572 ## ASCII case-insensitive
2573 if ($self->{nc} == [
2574 undef,
2575 0x0055, # U
2576 0x0042, # B
2577 0x004C, # L
2578 0x0049, # I
2579 ]->[length $self->{s_kwd}] or
2580 $self->{nc} == [
2581 undef,
2582 0x0075, # u
2583 0x0062, # b
2584 0x006C, # l
2585 0x0069, # i
2586 ]->[length $self->{s_kwd}]) {
2587 !!!cp (175);
2588 ## Stay in the state.
2589 $self->{s_kwd} .= chr $self->{nc};
2590 !!!next-input-character;
2591 redo A;
2592 } elsif ((length $self->{s_kwd}) == 5 and
2593 ($self->{nc} == 0x0043 or # C
2594 $self->{nc} == 0x0063)) { # c
2595 !!!cp (168);
2596 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2597 !!!next-input-character;
2598 redo A;
2599 } else {
2600 !!!cp (169);
2601 !!!parse-error (type => 'string after DOCTYPE name',
2602 line => $self->{line_prev},
2603 column => $self->{column_prev} + 1 - length $self->{s_kwd});
2604 $self->{ct}->{quirks} = 1;
2605
2606 $self->{state} = BOGUS_DOCTYPE_STATE;
2607 ## Reconsume.
2608 redo A;
2609 }
2610 } elsif ($self->{state} == SYSTEM_STATE) {
2611 ## ASCII case-insensitive
2612 if ($self->{nc} == [
2613 undef,
2614 0x0059, # Y
2615 0x0053, # S
2616 0x0054, # T
2617 0x0045, # E
2618 ]->[length $self->{s_kwd}] or
2619 $self->{nc} == [
2620 undef,
2621 0x0079, # y
2622 0x0073, # s
2623 0x0074, # t
2624 0x0065, # e
2625 ]->[length $self->{s_kwd}]) {
2626 !!!cp (170);
2627 ## Stay in the state.
2628 $self->{s_kwd} .= chr $self->{nc};
2629 !!!next-input-character;
2630 redo A;
2631 } elsif ((length $self->{s_kwd}) == 5 and
2632 ($self->{nc} == 0x004D or # M
2633 $self->{nc} == 0x006D)) { # m
2634 !!!cp (171);
2635 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2636 !!!next-input-character;
2637 redo A;
2638 } else {
2639 !!!cp (172);
2640 !!!parse-error (type => 'string after DOCTYPE name',
2641 line => $self->{line_prev},
2642 column => $self->{column_prev} + 1 - length $self->{s_kwd});
2643 $self->{ct}->{quirks} = 1;
2644
2645 $self->{state} = BOGUS_DOCTYPE_STATE;
2646 ## Reconsume.
2647 redo A;
2648 }
2649 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2650 if ($is_space->{$self->{nc}}) {
2651 !!!cp (181);
2652 ## Stay in the state
2653 !!!next-input-character;
2654 redo A;
2655 } elsif ($self->{nc} eq 0x0022) { # "
2656 !!!cp (182);
2657 $self->{ct}->{pubid} = ''; # DOCTYPE
2658 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2659 !!!next-input-character;
2660 redo A;
2661 } elsif ($self->{nc} eq 0x0027) { # '
2662 !!!cp (183);
2663 $self->{ct}->{pubid} = ''; # DOCTYPE
2664 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2665 !!!next-input-character;
2666 redo A;
2667 } elsif ($self->{nc} eq 0x003E) { # >
2668 !!!cp (184);
2669 !!!parse-error (type => 'no PUBLIC literal');
2670
2671 $self->{state} = DATA_STATE;
2672 !!!next-input-character;
2673
2674 $self->{ct}->{quirks} = 1;
2675 !!!emit ($self->{ct}); # DOCTYPE
2676
2677 redo A;
2678 } elsif ($self->{nc} == -1) {
2679 !!!cp (185);
2680 !!!parse-error (type => 'unclosed DOCTYPE');
2681
2682 $self->{state} = DATA_STATE;
2683 ## reconsume
2684
2685 $self->{ct}->{quirks} = 1;
2686 !!!emit ($self->{ct}); # DOCTYPE
2687
2688 redo A;
2689 } else {
2690 !!!cp (186);
2691 !!!parse-error (type => 'string after PUBLIC');
2692 $self->{ct}->{quirks} = 1;
2693
2694 $self->{state} = BOGUS_DOCTYPE_STATE;
2695 !!!next-input-character;
2696 redo A;
2697 }
2698 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2699 if ($self->{nc} == 0x0022) { # "
2700 !!!cp (187);
2701 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2702 !!!next-input-character;
2703 redo A;
2704 } elsif ($self->{nc} == 0x003E) { # >
2705 !!!cp (188);
2706 !!!parse-error (type => 'unclosed PUBLIC literal');
2707
2708 $self->{state} = DATA_STATE;
2709 !!!next-input-character;
2710
2711 $self->{ct}->{quirks} = 1;
2712 !!!emit ($self->{ct}); # DOCTYPE
2713
2714 redo A;
2715 } elsif ($self->{nc} == -1) {
2716 !!!cp (189);
2717 !!!parse-error (type => 'unclosed PUBLIC literal');
2718
2719 $self->{state} = DATA_STATE;
2720 ## reconsume
2721
2722 $self->{ct}->{quirks} = 1;
2723 !!!emit ($self->{ct}); # DOCTYPE
2724
2725 redo A;
2726 } else {
2727 !!!cp (190);
2728 $self->{ct}->{pubid} # DOCTYPE
2729 .= chr $self->{nc};
2730 $self->{read_until}->($self->{ct}->{pubid}, q[">],
2731 length $self->{ct}->{pubid});
2732
2733 ## Stay in the state
2734 !!!next-input-character;
2735 redo A;
2736 }
2737 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2738 if ($self->{nc} == 0x0027) { # '
2739 !!!cp (191);
2740 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2741 !!!next-input-character;
2742 redo A;
2743 } elsif ($self->{nc} == 0x003E) { # >
2744 !!!cp (192);
2745 !!!parse-error (type => 'unclosed PUBLIC literal');
2746
2747 $self->{state} = DATA_STATE;
2748 !!!next-input-character;
2749
2750 $self->{ct}->{quirks} = 1;
2751 !!!emit ($self->{ct}); # DOCTYPE
2752
2753 redo A;
2754 } elsif ($self->{nc} == -1) {
2755 !!!cp (193);
2756 !!!parse-error (type => 'unclosed PUBLIC literal');
2757
2758 $self->{state} = DATA_STATE;
2759 ## reconsume
2760
2761 $self->{ct}->{quirks} = 1;
2762 !!!emit ($self->{ct}); # DOCTYPE
2763
2764 redo A;
2765 } else {
2766 !!!cp (194);
2767 $self->{ct}->{pubid} # DOCTYPE
2768 .= chr $self->{nc};
2769 $self->{read_until}->($self->{ct}->{pubid}, q['>],
2770 length $self->{ct}->{pubid});
2771
2772 ## Stay in the state
2773 !!!next-input-character;
2774 redo A;
2775 }
2776 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2777 if ($is_space->{$self->{nc}}) {
2778 !!!cp (195);
2779 ## Stay in the state
2780 !!!next-input-character;
2781 redo A;
2782 } elsif ($self->{nc} == 0x0022) { # "
2783 !!!cp (196);
2784 $self->{ct}->{sysid} = ''; # DOCTYPE
2785 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2786 !!!next-input-character;
2787 redo A;
2788 } elsif ($self->{nc} == 0x0027) { # '
2789 !!!cp (197);
2790 $self->{ct}->{sysid} = ''; # DOCTYPE
2791 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2792 !!!next-input-character;
2793 redo A;
2794 } elsif ($self->{nc} == 0x003E) { # >
2795 !!!cp (198);
2796 $self->{state} = DATA_STATE;
2797 !!!next-input-character;
2798
2799 !!!emit ($self->{ct}); # DOCTYPE
2800
2801 redo A;
2802 } elsif ($self->{nc} == -1) {
2803 !!!cp (199);
2804 !!!parse-error (type => 'unclosed DOCTYPE');
2805
2806 $self->{state} = DATA_STATE;
2807 ## reconsume
2808
2809 $self->{ct}->{quirks} = 1;
2810 !!!emit ($self->{ct}); # DOCTYPE
2811
2812 redo A;
2813 } else {
2814 !!!cp (200);
2815 !!!parse-error (type => 'string after PUBLIC literal');
2816 $self->{ct}->{quirks} = 1;
2817
2818 $self->{state} = BOGUS_DOCTYPE_STATE;
2819 !!!next-input-character;
2820 redo A;
2821 }
2822 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2823 if ($is_space->{$self->{nc}}) {
2824 !!!cp (201);
2825 ## Stay in the state
2826 !!!next-input-character;
2827 redo A;
2828 } elsif ($self->{nc} == 0x0022) { # "
2829 !!!cp (202);
2830 $self->{ct}->{sysid} = ''; # DOCTYPE
2831 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2832 !!!next-input-character;
2833 redo A;
2834 } elsif ($self->{nc} == 0x0027) { # '
2835 !!!cp (203);
2836 $self->{ct}->{sysid} = ''; # DOCTYPE
2837 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2838 !!!next-input-character;
2839 redo A;
2840 } elsif ($self->{nc} == 0x003E) { # >
2841 !!!cp (204);
2842 !!!parse-error (type => 'no SYSTEM literal');
2843 $self->{state} = DATA_STATE;
2844 !!!next-input-character;
2845
2846 $self->{ct}->{quirks} = 1;
2847 !!!emit ($self->{ct}); # DOCTYPE
2848
2849 redo A;
2850 } elsif ($self->{nc} == -1) {
2851 !!!cp (205);
2852 !!!parse-error (type => 'unclosed DOCTYPE');
2853
2854 $self->{state} = DATA_STATE;
2855 ## reconsume
2856
2857 $self->{ct}->{quirks} = 1;
2858 !!!emit ($self->{ct}); # DOCTYPE
2859
2860 redo A;
2861 } else {
2862 !!!cp (206);
2863 !!!parse-error (type => 'string after SYSTEM');
2864 $self->{ct}->{quirks} = 1;
2865
2866 $self->{state} = BOGUS_DOCTYPE_STATE;
2867 !!!next-input-character;
2868 redo A;
2869 }
2870 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2871 if ($self->{nc} == 0x0022) { # "
2872 !!!cp (207);
2873 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2874 !!!next-input-character;
2875 redo A;
2876 } elsif ($self->{nc} == 0x003E) { # >
2877 !!!cp (208);
2878 !!!parse-error (type => 'unclosed SYSTEM literal');
2879
2880 $self->{state} = DATA_STATE;
2881 !!!next-input-character;
2882
2883 $self->{ct}->{quirks} = 1;
2884 !!!emit ($self->{ct}); # DOCTYPE
2885
2886 redo A;
2887 } elsif ($self->{nc} == -1) {
2888 !!!cp (209);
2889 !!!parse-error (type => 'unclosed SYSTEM literal');
2890
2891 $self->{state} = DATA_STATE;
2892 ## reconsume
2893
2894 $self->{ct}->{quirks} = 1;
2895 !!!emit ($self->{ct}); # DOCTYPE
2896
2897 redo A;
2898 } else {
2899 !!!cp (210);
2900 $self->{ct}->{sysid} # DOCTYPE
2901 .= chr $self->{nc};
2902 $self->{read_until}->($self->{ct}->{sysid}, q[">],
2903 length $self->{ct}->{sysid});
2904
2905 ## Stay in the state
2906 !!!next-input-character;
2907 redo A;
2908 }
2909 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2910 if ($self->{nc} == 0x0027) { # '
2911 !!!cp (211);
2912 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2913 !!!next-input-character;
2914 redo A;
2915 } elsif ($self->{nc} == 0x003E) { # >
2916 !!!cp (212);
2917 !!!parse-error (type => 'unclosed SYSTEM literal');
2918
2919 $self->{state} = DATA_STATE;
2920 !!!next-input-character;
2921
2922 $self->{ct}->{quirks} = 1;
2923 !!!emit ($self->{ct}); # DOCTYPE
2924
2925 redo A;
2926 } elsif ($self->{nc} == -1) {
2927 !!!cp (213);
2928 !!!parse-error (type => 'unclosed SYSTEM literal');
2929
2930 $self->{state} = DATA_STATE;
2931 ## reconsume
2932
2933 $self->{ct}->{quirks} = 1;
2934 !!!emit ($self->{ct}); # DOCTYPE
2935
2936 redo A;
2937 } else {
2938 !!!cp (214);
2939 $self->{ct}->{sysid} # DOCTYPE
2940 .= chr $self->{nc};
2941 $self->{read_until}->($self->{ct}->{sysid}, q['>],
2942 length $self->{ct}->{sysid});
2943
2944 ## Stay in the state
2945 !!!next-input-character;
2946 redo A;
2947 }
2948 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2949 if ($is_space->{$self->{nc}}) {
2950 !!!cp (215);
2951 ## Stay in the state
2952 !!!next-input-character;
2953 redo A;
2954 } elsif ($self->{nc} == 0x003E) { # >
2955 !!!cp (216);
2956 $self->{state} = DATA_STATE;
2957 !!!next-input-character;
2958
2959 !!!emit ($self->{ct}); # DOCTYPE
2960
2961 redo A;
2962 } elsif ($self->{nc} == -1) {
2963 !!!cp (217);
2964 !!!parse-error (type => 'unclosed DOCTYPE');
2965 $self->{state} = DATA_STATE;
2966 ## reconsume
2967
2968 $self->{ct}->{quirks} = 1;
2969 !!!emit ($self->{ct}); # DOCTYPE
2970
2971 redo A;
2972 } else {
2973 !!!cp (218);
2974 !!!parse-error (type => 'string after SYSTEM literal');
2975 #$self->{ct}->{quirks} = 1;
2976
2977 $self->{state} = BOGUS_DOCTYPE_STATE;
2978 !!!next-input-character;
2979 redo A;
2980 }
2981 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2982 if ($self->{nc} == 0x003E) { # >
2983 !!!cp (219);
2984 $self->{state} = DATA_STATE;
2985 !!!next-input-character;
2986
2987 !!!emit ($self->{ct}); # DOCTYPE
2988
2989 redo A;
2990 } elsif ($self->{nc} == -1) {
2991 !!!cp (220);
2992 !!!parse-error (type => 'unclosed DOCTYPE');
2993 $self->{state} = DATA_STATE;
2994 ## reconsume
2995
2996 !!!emit ($self->{ct}); # DOCTYPE
2997
2998 redo A;
2999 } else {
3000 !!!cp (221);
3001 my $s = '';
3002 $self->{read_until}->($s, q[>], 0);
3003
3004 ## Stay in the state
3005 !!!next-input-character;
3006 redo A;
3007 }
3008 } elsif ($self->{state} == CDATA_SECTION_STATE) {
3009 ## NOTE: "CDATA section state" in the spec is jointly implemented
3010 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3011 ## and |CDATA_SECTION_MSE2_STATE|.
3012
3013 if ($self->{nc} == 0x005D) { # ]
3014 !!!cp (221.1);
3015 $self->{state} = CDATA_SECTION_MSE1_STATE;
3016 !!!next-input-character;
3017 redo A;
3018 } elsif ($self->{nc} == -1) {
3019 $self->{state} = DATA_STATE;
3020 !!!next-input-character;
3021 if (length $self->{ct}->{data}) { # character
3022 !!!cp (221.2);
3023 !!!emit ($self->{ct}); # character
3024 } else {
3025 !!!cp (221.3);
3026 ## No token to emit. $self->{ct} is discarded.
3027 }
3028 redo A;
3029 } else {
3030 !!!cp (221.4);
3031 $self->{ct}->{data} .= chr $self->{nc};
3032 $self->{read_until}->($self->{ct}->{data},
3033 q<]>,
3034 length $self->{ct}->{data});
3035
3036 ## Stay in the state.
3037 !!!next-input-character;
3038 redo A;
3039 }
3040
3041 ## ISSUE: "text tokens" in spec.
3042 } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3043 if ($self->{nc} == 0x005D) { # ]
3044 !!!cp (221.5);
3045 $self->{state} = CDATA_SECTION_MSE2_STATE;
3046 !!!next-input-character;
3047 redo A;
3048 } else {
3049 !!!cp (221.6);
3050 $self->{ct}->{data} .= ']';
3051 $self->{state} = CDATA_SECTION_STATE;
3052 ## Reconsume.
3053 redo A;
3054 }
3055 } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3056 if ($self->{nc} == 0x003E) { # >
3057 $self->{state} = DATA_STATE;
3058 !!!next-input-character;
3059 if (length $self->{ct}->{data}) { # character
3060 !!!cp (221.7);
3061 !!!emit ($self->{ct}); # character
3062 } else {
3063 !!!cp (221.8);
3064 ## No token to emit. $self->{ct} is discarded.
3065 }
3066 redo A;
3067 } elsif ($self->{nc} == 0x005D) { # ]
3068 !!!cp (221.9); # character
3069 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3070 ## Stay in the state.
3071 !!!next-input-character;
3072 redo A;
3073 } else {
3074 !!!cp (221.11);
3075 $self->{ct}->{data} .= ']]'; # character
3076 $self->{state} = CDATA_SECTION_STATE;
3077 ## Reconsume.
3078 redo A;
3079 }
3080 } elsif ($self->{state} == ENTITY_STATE) {
3081 if ($is_space->{$self->{nc}} or
3082 {
3083 0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3084 $self->{entity_add} => 1,
3085 }->{$self->{nc}}) {
3086 !!!cp (1001);
3087 ## Don't consume
3088 ## No error
3089 ## Return nothing.
3090 #
3091 } elsif ($self->{nc} == 0x0023) { # #
3092 !!!cp (999);
3093 $self->{state} = ENTITY_HASH_STATE;
3094 $self->{s_kwd} = '#';
3095 !!!next-input-character;
3096 redo A;
3097 } elsif ((0x0041 <= $self->{nc} and
3098 $self->{nc} <= 0x005A) or # A..Z
3099 (0x0061 <= $self->{nc} and
3100 $self->{nc} <= 0x007A)) { # a..z
3101 !!!cp (998);
3102 require Whatpm::_NamedEntityList;
3103 $self->{state} = ENTITY_NAME_STATE;
3104 $self->{s_kwd} = chr $self->{nc};
3105 $self->{entity__value} = $self->{s_kwd};
3106 $self->{entity__match} = 0;
3107 !!!next-input-character;
3108 redo A;
3109 } else {
3110 !!!cp (1027);
3111 !!!parse-error (type => 'bare ero');
3112 ## Return nothing.
3113 #
3114 }
3115
3116 ## NOTE: No character is consumed by the "consume a character
3117 ## reference" algorithm. In other words, there is an "&" character
3118 ## that does not introduce a character reference, which would be
3119 ## appended to the parent element or the attribute value in later
3120 ## processing by the tokenizer.
3121
3122 if ($self->{prev_state} == DATA_STATE) {
3123 !!!cp (997);
3124 $self->{state} = $self->{prev_state};
3125 ## Reconsume.
3126 !!!emit ({type => CHARACTER_TOKEN, data => '&',
3127 line => $self->{line_prev},
3128 column => $self->{column_prev},
3129 });
3130 redo A;
3131 } else {
3132 !!!cp (996);
3133 $self->{ca}->{value} .= '&';
3134 $self->{state} = $self->{prev_state};
3135 ## Reconsume.
3136 redo A;
3137 }
3138 } elsif ($self->{state} == ENTITY_HASH_STATE) {
3139 if ($self->{nc} == 0x0078 or # x
3140 $self->{nc} == 0x0058) { # X
3141 !!!cp (995);
3142 $self->{state} = HEXREF_X_STATE;
3143 $self->{s_kwd} .= chr $self->{nc};
3144 !!!next-input-character;
3145 redo A;
3146 } elsif (0x0030 <= $self->{nc} and
3147 $self->{nc} <= 0x0039) { # 0..9
3148 !!!cp (994);
3149 $self->{state} = NCR_NUM_STATE;
3150 $self->{s_kwd} = $self->{nc} - 0x0030;
3151 !!!next-input-character;
3152 redo A;
3153 } else {
3154 !!!parse-error (type => 'bare nero',
3155 line => $self->{line_prev},
3156 column => $self->{column_prev} - 1);
3157
3158 ## NOTE: According to the spec algorithm, nothing is returned,
3159 ## and then "&#" is appended to the parent element or the attribute
3160 ## value in the later processing.
3161
3162 if ($self->{prev_state} == DATA_STATE) {
3163 !!!cp (1019);
3164 $self->{state} = $self->{prev_state};
3165 ## Reconsume.
3166 !!!emit ({type => CHARACTER_TOKEN,
3167 data => '&#',
3168 line => $self->{line_prev},
3169 column => $self->{column_prev} - 1,
3170 });
3171 redo A;
3172 } else {
3173 !!!cp (993);
3174 $self->{ca}->{value} .= '&#';
3175 $self->{state} = $self->{prev_state};
3176 ## Reconsume.
3177 redo A;
3178 }
3179 }
3180 } elsif ($self->{state} == NCR_NUM_STATE) {
3181 if (0x0030 <= $self->{nc} and
3182 $self->{nc} <= 0x0039) { # 0..9
3183 !!!cp (1012);
3184 $self->{s_kwd} *= 10;
3185 $self->{s_kwd} += $self->{nc} - 0x0030;
3186
3187 ## Stay in the state.
3188 !!!next-input-character;
3189 redo A;
3190 } elsif ($self->{nc} == 0x003B) { # ;
3191 !!!cp (1013);
3192 !!!next-input-character;
3193 #
3194 } else {
3195 !!!cp (1014);
3196 !!!parse-error (type => 'no refc');
3197 ## Reconsume.
3198 #
3199 }
3200
3201 my $code = $self->{s_kwd};
3202 my $l = $self->{line_prev};
3203 my $c = $self->{column_prev};
3204 if ($charref_map->{$code}) {
3205 !!!cp (1015);
3206 !!!parse-error (type => 'invalid character reference',
3207 text => (sprintf 'U+%04X', $code),
3208 line => $l, column => $c);
3209 $code = $charref_map->{$code};
3210 } elsif ($code > 0x10FFFF) {
3211 !!!cp (1016);
3212 !!!parse-error (type => 'invalid character reference',
3213 text => (sprintf 'U-%08X', $code),
3214 line => $l, column => $c);
3215 $code = 0xFFFD;
3216 }
3217
3218 if ($self->{prev_state} == DATA_STATE) {
3219 !!!cp (992);
3220 $self->{state} = $self->{prev_state};
3221 ## Reconsume.
3222 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3223 line => $l, column => $c,
3224 });
3225 redo A;
3226 } else {
3227 !!!cp (991);
3228 $self->{ca}->{value} .= chr $code;
3229 $self->{ca}->{has_reference} = 1;
3230 $self->{state} = $self->{prev_state};
3231 ## Reconsume.
3232 redo A;
3233 }
3234 } elsif ($self->{state} == HEXREF_X_STATE) {
3235 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3236 (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3237 (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3238 # 0..9, A..F, a..f
3239 !!!cp (990);
3240 $self->{state} = HEXREF_HEX_STATE;
3241 $self->{s_kwd} = 0;
3242 ## Reconsume.
3243 redo A;
3244 } else {
3245 !!!parse-error (type => 'bare hcro',
3246 line => $self->{line_prev},
3247 column => $self->{column_prev} - 2);
3248
3249 ## NOTE: According to the spec algorithm, nothing is returned,
3250 ## and then "&#" followed by "X" or "x" is appended to the parent
3251 ## element or the attribute value in the later processing.
3252
3253 if ($self->{prev_state} == DATA_STATE) {
3254 !!!cp (1005);
3255 $self->{state} = $self->{prev_state};
3256 ## Reconsume.
3257 !!!emit ({type => CHARACTER_TOKEN,
3258 data => '&' . $self->{s_kwd},
3259 line => $self->{line_prev},
3260 column => $self->{column_prev} - length $self->{s_kwd},
3261 });
3262 redo A;
3263 } else {
3264 !!!cp (989);
3265 $self->{ca}->{value} .= '&' . $self->{s_kwd};
3266 $self->{state} = $self->{prev_state};
3267 ## Reconsume.
3268 redo A;
3269 }
3270 }
3271 } elsif ($self->{state} == HEXREF_HEX_STATE) {
3272 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3273 # 0..9
3274 !!!cp (1002);
3275 $self->{s_kwd} *= 0x10;
3276 $self->{s_kwd} += $self->{nc} - 0x0030;
3277 ## Stay in the state.
3278 !!!next-input-character;
3279 redo A;
3280 } elsif (0x0061 <= $self->{nc} and
3281 $self->{nc} <= 0x0066) { # a..f
3282 !!!cp (1003);
3283 $self->{s_kwd} *= 0x10;
3284 $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
3285 ## Stay in the state.
3286 !!!next-input-character;
3287 redo A;
3288 } elsif (0x0041 <= $self->{nc} and
3289 $self->{nc} <= 0x0046) { # A..F
3290 !!!cp (1004);
3291 $self->{s_kwd} *= 0x10;
3292 $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
3293 ## Stay in the state.
3294 !!!next-input-character;
3295 redo A;
3296 } elsif ($self->{nc} == 0x003B) { # ;
3297 !!!cp (1006);
3298 !!!next-input-character;
3299 #
3300 } else {
3301 !!!cp (1007);
3302 !!!parse-error (type => 'no refc',
3303 line => $self->{line},
3304 column => $self->{column});
3305 ## Reconsume.
3306 #
3307 }
3308
3309 my $code = $self->{s_kwd};
3310 my $l = $self->{line_prev};
3311 my $c = $self->{column_prev};
3312 if ($charref_map->{$code}) {
3313 !!!cp (1008);
3314 !!!parse-error (type => 'invalid character reference',
3315 text => (sprintf 'U+%04X', $code),
3316 line => $l, column => $c);
3317 $code = $charref_map->{$code};
3318 } elsif ($code > 0x10FFFF) {
3319 !!!cp (1009);
3320 !!!parse-error (type => 'invalid character reference',
3321 text => (sprintf 'U-%08X', $code),
3322 line => $l, column => $c);
3323 $code = 0xFFFD;
3324 }
3325
3326 if ($self->{prev_state} == DATA_STATE) {
3327 !!!cp (988);
3328 $self->{state} = $self->{prev_state};
3329 ## Reconsume.
3330 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3331 line => $l, column => $c,
3332 });
3333 redo A;
3334 } else {
3335 !!!cp (987);
3336 $self->{ca}->{value} .= chr $code;
3337 $self->{ca}->{has_reference} = 1;
3338 $self->{state} = $self->{prev_state};
3339 ## Reconsume.
3340 redo A;
3341 }
3342 } elsif ($self->{state} == ENTITY_NAME_STATE) {
3343 if (length $self->{s_kwd} < 30 and
3344 ## NOTE: Some number greater than the maximum length of entity name
3345 ((0x0041 <= $self->{nc} and # a
3346 $self->{nc} <= 0x005A) or # x
3347 (0x0061 <= $self->{nc} and # a
3348 $self->{nc} <= 0x007A) or # z
3349 (0x0030 <= $self->{nc} and # 0
3350 $self->{nc} <= 0x0039) or # 9
3351 $self->{nc} == 0x003B)) { # ;
3352 our $EntityChar;
3353 $self->{s_kwd} .= chr $self->{nc};
3354 if (defined $EntityChar->{$self->{s_kwd}}) {
3355 if ($self->{nc} == 0x003B) { # ;
3356 !!!cp (1020);
3357 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
3358 $self->{entity__match} = 1;
3359 !!!next-input-character;
3360 #
3361 } else {
3362 !!!cp (1021);
3363 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
3364 $self->{entity__match} = -1;
3365 ## Stay in the state.
3366 !!!next-input-character;
3367 redo A;
3368 }
3369 } else {
3370 !!!cp (1022);
3371 $self->{entity__value} .= chr $self->{nc};
3372 $self->{entity__match} *= 2;
3373 ## Stay in the state.
3374 !!!next-input-character;
3375 redo A;
3376 }
3377 }
3378
3379 my $data;
3380 my $has_ref;
3381 if ($self->{entity__match} > 0) {
3382 !!!cp (1023);
3383 $data = $self->{entity__value};
3384 $has_ref = 1;
3385 #
3386 } elsif ($self->{entity__match} < 0) {
3387 !!!parse-error (type => 'no refc');
3388 if ($self->{prev_state} != DATA_STATE and # in attribute
3389 $self->{entity__match} < -1) {
3390 !!!cp (1024);
3391 $data = '&' . $self->{s_kwd};
3392 #
3393 } else {
3394 !!!cp (1025);
3395 $data = $self->{entity__value};
3396 $has_ref = 1;
3397 #
3398 }
3399 } else {
3400 !!!cp (1026);
3401 !!!parse-error (type => 'bare ero',
3402 line => $self->{line_prev},
3403 column => $self->{column_prev} - length $self->{s_kwd});
3404 $data = '&' . $self->{s_kwd};
3405 #
3406 }
3407
3408 ## NOTE: In these cases, when a character reference is found,
3409 ## it is consumed and a character token is returned, or, otherwise,
3410 ## nothing is consumed and returned, according to the spec algorithm.
3411 ## In this implementation, anything that has been examined by the
3412 ## tokenizer is appended to the parent element or the attribute value
3413 ## as string, either literal string when no character reference or
3414 ## entity-replaced string otherwise, in this stage, since any characters
3415 ## that would not be consumed are appended in the data state or in an
3416 ## appropriate attribute value state anyway.
3417
3418 if ($self->{prev_state} == DATA_STATE) {
3419 !!!cp (986);
3420 $self->{state} = $self->{prev_state};
3421 ## Reconsume.
3422 !!!emit ({type => CHARACTER_TOKEN,
3423 data => $data,
3424 line => $self->{line_prev},
3425 column => $self->{column_prev} + 1 - length $self->{s_kwd},
3426 });
3427 redo A;
3428 } else {
3429 !!!cp (985);
3430 $self->{ca}->{value} .= $data;
3431 $self->{ca}->{has_reference} = 1 if $has_ref;
3432 $self->{state} = $self->{prev_state};
3433 ## Reconsume.
3434 redo A;
3435 }
3436 } else {
3437 die "$0: $self->{state}: Unknown state";
3438 }
3439 } # A
3440
3441 die "$0: _get_next_token: unexpected case";
3442 } # _get_next_token
3443
## Prepare the document for the tree construction stage: turn DOM
## strict error checking off, flag the document as an HTML document,
## and set the |manakai_source_line| and |manakai_source_column| user
## data to 1.
##
## NOTE: $self->{document} MUST be specified before this method is called.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  $doc->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $doc->manakai_is_html (1); # MUST

  $doc->set_user_data (manakai_source_line => 1);
  $doc->set_user_data (manakai_source_column => 1);
} # _initialize_tree_constructor
3454
## Finish the tree construction stage by turning DOM strict error
## checking on again for $self->{document}.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};
  ## TODO: Turn mutation events on
  $doc->strict_error_checking (1);
} # _terminate_tree_constructor
3460
3461 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
3462
3463 { # tree construction stage
3464 my $token;
3465
## Entry point of the tree construction stage.  Fetches the first
## token, resets the tree construction state kept on $self, and then
## runs the "initial" handler, the "before html" handler and the main
## handler in that order.
sub _construct_tree ($) {
  my $self = shift;

  ## The points at which an interactive UA makes $self->{document}
  ## available to the user, or begins accepting user input, are not
  ## defined.

  ## Appending a character: collect it together with all directly
  ## following characters and insert a single Text node whose data is
  ## the concatenation of all those characters. # MUST

  !!!next-token;

  ## Start from an empty stack of open elements and no remembered
  ## form, head or innerHTML context node.
  $self->{open_elements} = [];
  undef $self->{$_} for qw(form_element head_element inner_html_node);

  ## NOTE: The "initial" insertion mode. # MUST
  $self->_tree_construction_initial;

  ## NOTE: The "before html" insertion mode; "before head" follows it.
  $self->_tree_construction_root_element;
  $self->{insertion_mode} = BEFORE_HEAD_IM;

  ## NOTE: The "before head" insertion mode and so on.
  $self->_tree_construction_main;
} # _construct_tree
3494
## Handle the "initial" insertion mode of the tree construction stage.
##
## Works on the shared lexical |$token| and keeps consuming tokens
## until the insertion mode is left (by returning):
##
##   - DOCTYPE token: reports "not HTML5" / "XSLT-compat" errors where
##     applicable, appends a document type node to $self->{document},
##     selects the document's compat mode from the name, the public
##     identifier and the system identifier, then advances to the next
##     token and returns.
##   - Start tag, end tag or end-of-file token: reports "no DOCTYPE",
##     selects quirks mode and returns, leaving the token in place.
##   - Character token: leading white space is removed; if data
##     remains, it is handled like the previous case.
##   - Comment token: a comment node is appended to the document and
##     the insertion mode is kept.
##
## Dies on any other token type.
sub _tree_construction_initial ($) {
  my $self = shift;

  ## NOTE: "initial" insertion mode

  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/; # ASCII case-insensitive
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{sysid}) {
        !!!cp ('t1');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif ($doctype_name ne 'HTML') {
        !!!cp ('t2');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif (defined $token->{pubid}) {
        if ($token->{pubid} eq 'XSLT-compat') {
          !!!cp ('t1.2');
          !!!parse-error (type => 'XSLT-compat', token => $token,
                          level => $self->{level}->{should});
        } else {
          !!!parse-error (type => 'not HTML5', token => $token);
        }
      } else {
        !!!cp ('t3');
        #
      }

      my $doctype = $self->{document}->create_document_type_definition
        ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      ## NOTE: Default value for both |public_id| and |system_id| attributes
      ## are empty strings, so that we don't set any value in missing cases.
      $doctype->public_id ($token->{pubid}) if defined $token->{pubid};
      $doctype->system_id ($token->{sysid}) if defined $token->{sysid};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if ($token->{quirks} or $doctype_name ne 'HTML') {
        !!!cp ('t4');
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{pubid}) {
        ## The public identifier is compared ASCII case-insensitively,
        ## so it is upper-cased once and every literal below is written
        ## in upper case.
        my $pubid = $token->{pubid};
        $pubid =~ tr/a-z/A-Z/;

        ## Public identifier prefixes that select quirks mode.
        my $prefix = [
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//",
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//IETF//DTD HTML 2.0 LEVEL 1//",
          "-//IETF//DTD HTML 2.0 LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT//",
          "-//IETF//DTD HTML 2.0//",
          "-//IETF//DTD HTML 2.1E//",
          "-//IETF//DTD HTML 3.0//",
          "-//IETF//DTD HTML 3.2 FINAL//",
          "-//IETF//DTD HTML 3.2//",
          "-//IETF//DTD HTML 3//",
          "-//IETF//DTD HTML LEVEL 0//",
          "-//IETF//DTD HTML LEVEL 1//",
          "-//IETF//DTD HTML LEVEL 2//",
          "-//IETF//DTD HTML LEVEL 3//",
          "-//IETF//DTD HTML STRICT LEVEL 0//",
          "-//IETF//DTD HTML STRICT LEVEL 1//",
          "-//IETF//DTD HTML STRICT LEVEL 2//",
          "-//IETF//DTD HTML STRICT LEVEL 3//",
          "-//IETF//DTD HTML STRICT//",
          "-//IETF//DTD HTML//",
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//",
          "-//NETSCAPE COMM. CORP.//DTD HTML//",
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//",
          "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//",
          "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//",
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//",
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//",
          "-//W3C//DTD HTML 3 1995-03-24//",
          "-//W3C//DTD HTML 3.2 DRAFT//",
          "-//W3C//DTD HTML 3.2 FINAL//",
          "-//W3C//DTD HTML 3.2//",
          "-//W3C//DTD HTML 3.2S DRAFT//",
          "-//W3C//DTD HTML 4.0 FRAMESET//",
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//",
          "-//W3C//DTD HTML EXPERIMENTAL 19960712//",
          "-//W3C//DTD HTML EXPERIMENTAL 970421//",
          "-//W3C//DTD W3 HTML//",
          "-//W3O//DTD W3 HTML 3.0//",
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//",
          "-//WEBTECHS//DTD MOZILLA HTML//",
        ]; # $prefix

        ## Does the (upper-cased) public identifier start with any of
        ## the prefixes above?
        my $match;
        for my $candidate (@$prefix) {
          if (substr ($pubid, 0, length $candidate) eq $candidate) {
            $match = 1;
            last;
          }
        }

        if ($match or
            $pubid eq "-//W3O//DTD W3 HTML STRICT 3.0//EN//" or
            $pubid eq "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" or
            $pubid eq "HTML") {
          !!!cp ('t5');
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid =~ m[^-//W3C//DTD HTML 4\.01 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD HTML 4\.01 TRANSITIONAL//]) {
          ## HTML 4.01 Frameset/Transitional: quirks with a system
          ## identifier, limited quirks without one.
          if (defined $token->{sysid}) {
            !!!cp ('t6');
            $self->{document}->manakai_compat_mode ('quirks');
          } else {
            !!!cp ('t7');
            $self->{document}->manakai_compat_mode ('limited quirks');
          }
        } elsif ($pubid =~ m[^-//W3C//DTD XHTML 1\.0 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD XHTML 1\.0 TRANSITIONAL//]) {
          !!!cp ('t8');
          $self->{document}->manakai_compat_mode ('limited quirks');
        } else {
          !!!cp ('t9');
        }
      } else {
        !!!cp ('t10');
      }
      if (defined $token->{sysid}) {
        ## The system identifier is compared ASCII case-insensitively
        ## as well; here the lower-cased form is used.
        my $sysid = $token->{sysid};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          ## NOTE: Ensure that |PUBLIC "(limited quirks)" "(quirks)"| is
          ## marked as quirks.
          $self->{document}->manakai_compat_mode ('quirks');
          !!!cp ('t11');
        } else {
          !!!cp ('t12');
        }
      } else {
        !!!cp ('t13');
      }

      ## Go to the "before html" insertion mode.
      !!!next-token;
      return;
    } elsif ({
              START_TAG_TOKEN, 1,
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t14');
      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      !!!ack-later;
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
        ## Ignore the token

        unless (length $token->{data}) {
          !!!cp ('t15');
          ## Stay in the insertion mode.
          !!!next-token;
          redo INITIAL;
        } else {
          !!!cp ('t16');
        }
      } else {
        !!!cp ('t17');
      }

      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t18');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the insertion mode.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL

  die "$0: _tree_construction_initial: This should be never reached";
} # _tree_construction_initial
3699
## Handle the "before html" insertion mode of the tree construction
## stage.
##
## Works on the shared lexical |$token|:
##
##   - DOCTYPE token: reports an error and ignores the token.
##   - Comment token: a comment node is appended to the document.
##   - Character token: leading white space is removed; if nothing
##     remains, the next token is fetched.
##   - |html| start tag: the root element is created from the token,
##     appended to the document and pushed onto the stack of open
##     elements; the application cache selection callback is invoked
##     with the |manifest| attribute value (or undef when there is no
##     such attribute); then the next token is fetched.
##   - Anything else (including character data left after white space
##     removal): an |html| element is implied, the application cache
##     selection callback is invoked once with undef, and the current
##     token is left in place.
##
## Returns once a root element has been created.  Dies on an unknown
## token type.
sub _tree_construction_root_element ($) {
  my $self = shift;

  ## NOTE: "before html" insertion mode.

  B: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      !!!cp ('t19');
      !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
      ## Ignore the token
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t20');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
        ## Ignore the token.

        unless (length $token->{data}) {
          !!!cp ('t21');
          ## Stay in the insertion mode.
          !!!next-token;
          redo B;
        } else {
          !!!cp ('t22');
        }
      } else {
        !!!cp ('t23');
      }

      ## NOTE: The application cache selection callback is not invoked
      ## here.  It is invoked exactly once, after the implied |html|
      ## element has been created below.

      #
    } elsif ($token->{type} == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html') {
        my $root_element;
        !!!create-element ($root_element, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}},
            [$root_element, $el_category->{html}];

        if ($token->{attributes}->{manifest}) {
          !!!cp ('t24');
          $self->{application_cache_selection}
              ->($token->{attributes}->{manifest}->{value});
          ## ISSUE: Spec is unclear on relative references.
          ## According to Hixie (#whatwg 2008-03-19), it should be
          ## resolved against the base URI of the document in HTML
          ## or xml:base of the element in XHTML.
        } else {
          !!!cp ('t25');
          $self->{application_cache_selection}->(undef);
        }

        !!!nack ('t25c');

        !!!next-token;
        return; ## Go to the "before head" insertion mode.
      } else {
        !!!cp ('t25.1');
        #
      }
    } elsif ({
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t26');
      #
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## "Anything else": imply the |html| root element.
    my $root_element;
    !!!create-element ($root_element, $HTML_NS, 'html',, $token);
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, $el_category->{html}];

    ## No |manifest| attribute can exist on an implied element.
    $self->{application_cache_selection}->(undef);

    ## NOTE: Reprocess the token.
    !!!ack-later;
    return; ## Go to the "before head" insertion mode.

    ## ISSUE: There is an issue in the spec
  } # B

  die "$0: _tree_construction_root_element: This should never be reached";
} # _tree_construction_root_element
3794
## Reset the insertion mode appropriately.
##
## Walks the stack of open elements ($self->{open_elements}) from the
## most recently pushed entry towards the first one and sets
## $self->{insertion_mode} according to the first entry that
## determines a mode.  When the first entry of the stack is reached,
## $self->{inner_html_node} is examined in its place; the method dies
## if that is not defined.
##
## Each examined entry is read as |[node, category flags]|.
## NOTE(review): $self->{inner_html_node} is read the same way, so it
## is presumably stored in that shape too - confirm against its setter.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        !!!cp ('t28');
        $node = $self->{inner_html_node};
      } else {
        die "_reset_insertion_mode: t27";
      }
    }

    ## Step 4..14
    my $new_mode;
    if ($node->[1] & FOREIGN_EL) {
      !!!cp ('t28.1');
      ## NOTE: Strictly speaking, the line below only applies to MathML
      ## and SVG elements.  Currently the HTML syntax supports only
      ## MathML and SVG elements as foreigners.
      $new_mode = IN_BODY_IM | IN_FOREIGN_CONTENT_IM;
    } elsif ($node->[1] & TABLE_CELL_EL) {
      if ($last) {
        !!!cp ('t28.2');
        ## No mode is chosen here; later steps decide.
        #
      } else {
        !!!cp ('t28.3');
        $new_mode = IN_CELL_IM;
      }
    } else {
      !!!cp ('t28.4');
      ## Elements not listed here leave $new_mode undefined.
      $new_mode = {
        select => IN_SELECT_IM,
        ## NOTE: |option| and |optgroup| do not set
        ## insertion mode to "in select" by themselves.
        tr => IN_ROW_IM,
        tbody => IN_TABLE_BODY_IM,
        thead => IN_TABLE_BODY_IM,
        tfoot => IN_TABLE_BODY_IM,
        caption => IN_CAPTION_IM,
        colgroup => IN_COLUMN_GROUP_IM,
        table => IN_TABLE_IM,
        head => IN_BODY_IM, # not in head!
        body => IN_BODY_IM,
        frameset => IN_FRAMESET_IM,
      }->{$node->[0]->manakai_local_name};
    }
    ## Assign and return as two separate statements, so that returning
    ## does not depend on the mode constant being a true value.
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 15
    if ($node->[1] & HTML_EL) {
      unless (defined $self->{head_element}) {
        !!!cp ('t29');
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        ## ISSUE: Can this state be reached?
        !!!cp ('t30');
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    } else {
      !!!cp ('t31');
    }

    ## Step 16
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 17
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 18
    redo S3;
  } # S3

  die "$0: _reset_insertion_mode: This line should never be reached";
} # _reset_insertion_mode
3881
3882 sub _tree_construction_main ($) {
3883 my $self = shift;
3884
3885 my $active_formatting_elements = [];
3886
## Reconstruct the active formatting elements. # MUST
##
## Argument: a code reference; it is called with each newly cloned
## element node so that the caller decides where the node is inserted.
##
## Entries of the list of active formatting elements that are neither
## a marker nor present in the stack of open elements are replaced,
## starting after the last marker/open entry, by shallow clones which
## are also pushed onto the stack of open elements.
my $reconstruct_active_formatting_elements = sub {
  my ($insert) = @_;

  ## Step 1
  return unless @$active_formatting_elements;

  ## Step 3 (prepared first, since Step 2 inspects the entry)
  my $pos = -1;
  my $entry = $active_formatting_elements->[$pos];

  ## Step 2
  return if $entry->[0] eq '#marker';
  for my $open (@{$self->{open_elements}}) {
    next unless $entry->[0] eq $open->[0];
    !!!cp ('t32');
    return;
  }

  ## Steps 4..7: rewind towards the start of the list.
  REWIND: while (1) {
    ## Step 4
    last REWIND if $active_formatting_elements->[0]->[0] eq $entry->[0];

    ## Step 5
    $entry = $active_formatting_elements->[--$pos];

    ## Step 6
    if ($entry->[0] eq '#marker') {
      !!!cp ('t33_1');
      #
    } else {
      my $is_open = 0;
      for my $open (@{$self->{open_elements}}) {
        next unless $entry->[0] eq $open->[0];
        !!!cp ('t33');
        $is_open = 1;
        last;
      }
      unless ($is_open) {
        ## NOTE: <!DOCTYPE HTML><p><b><i><u></p> <p>X
        !!!cp ('t35');
        next REWIND;
      }
      !!!cp ('t34');
      #
    }

    ## Step 7
    $entry = $active_formatting_elements->[++$pos];
    last REWIND;
  } # REWIND

  ## Steps 8..11: clone forward until the last entry has been replaced.
  CLONE: while (1) {
    ## Step 8
    my $clone = [$entry->[0]->clone_node (0), $entry->[1]];

    ## Step 9
    $insert->($clone->[0]);
    push @{$self->{open_elements}}, $clone;

    ## Step 10
    $active_formatting_elements->[$pos] = $self->{open_elements}->[-1];

    ## Step 11
    if ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
      !!!cp ('t37');
      last CLONE;
    }

    !!!cp ('t36');
    ## Step 7'
    $entry = $active_formatting_elements->[++$pos];
  } # CLONE
}; # $reconstruct_active_formatting_elements
3966
## Clear the list of active formatting elements up to the last marker:
## the last '#marker' entry and every entry after it are removed.  The
## list is left untouched when it contains no marker.
my $clear_up_to_marker = sub {
  my $idx = $#$active_formatting_elements;
  while ($idx >= 0) {
    if ($active_formatting_elements->[$idx]->[0] eq '#marker') {
      !!!cp ('t38');
      splice @$active_formatting_elements, $idx;
      return;
    }
    $idx--;
  }

  !!!cp ('t39');
}; # $clear_up_to_marker
3978
3979 my $insert;
3980
## Generic handling of an element whose content is CDATA or RCDATA.
##
## Argument: the content model flag (CDATA_CONTENT_MODEL or
## RCDATA_CONTENT_MODEL) to tokenize the element content with; any
## other value makes the closure die when the element is not closed by
## its own end tag.
##
## An element is created from the current start tag token and handed
## to |$insert|; all directly following character tokens are collected
## into one text node appended to that element.
my $parse_rcdata = sub ($) {
  my $content_model_flag = shift;

  ## Step 1
  my $start_tag_name = $token->{tag_name};
  my $el;
  !!!create-element ($el, $HTML_NS, $start_tag_name, $token->{attributes}, $token);

  ## Step 2
  $insert->($el);

  ## Step 3
  $self->{content_model} = $content_model_flag; # CDATA or RCDATA
  delete $self->{escape}; # MUST

  ## Step 4
  my @chunk;
  !!!nack ('t40.1');
  !!!next-token;
  while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
    !!!cp ('t40');
    push @chunk, $token->{data};
    !!!next-token;
  }
  my $content = join '', @chunk;

  ## Step 5
  if (length $content) {
    !!!cp ('t41');
    $el->append_child ($self->{document}->create_text_node ($content));
  }

  ## Step 6
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## Step 7
  if ($token->{type} == END_TAG_TOKEN and
      $token->{tag_name} eq $start_tag_name) {
    !!!cp ('t42');
    ## Ignore the token
  } elsif ($content_model_flag == CDATA_CONTENT_MODEL) {
    ## NOTE: An end-of-file token.
    !!!cp ('t43');
    !!!parse-error (type => 'in CDATA:#eof', token => $token);
  } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
    ## NOTE: An end-of-file token.
    !!!cp ('t44');
    !!!parse-error (type => 'in RCDATA:#eof', token => $token);
  } else {
    die "$0: $content_model_flag in parse_rcdata";
  }
  !!!next-token;
}; # $parse_rcdata
4035
## Handling of a |script| start tag.
##
## A |script| element is created from the current token, the directly
## following character tokens are tokenized in the CDATA content model
## and appended to the element as text, and - unless
## $self->{inner_html_node} is defined - the element is handed to
## |$insert|.
my $script_start_tag = sub () {
  my $script_el;
  !!!create-element ($script_el, $HTML_NS, 'script', $token->{attributes}, $token);
  ## TODO: mark as "parser-inserted"

  $self->{content_model} = CDATA_CONTENT_MODEL;
  delete $self->{escape}; # MUST

  ## Collect character data until a non-character token arrives or
  ## the tokenizer stops tokenising.
  my @chunk;
  !!!nack ('t45.1');
  !!!next-token;
  while ($token->{type} == CHARACTER_TOKEN) {
    !!!cp ('t45');
    push @chunk, $token->{data};
    !!!next-token;
  }
  my $content = join '', @chunk;
  if (length $content) {
    !!!cp ('t46');
    $script_el->manakai_append_text ($content);
  }

  $self->{content_model} = PCDATA_CONTENT_MODEL;

  if ($token->{type} == END_TAG_TOKEN and
      $token->{tag_name} eq 'script') {
    !!!cp ('t47');
    ## Ignore the token
  } else {
    !!!cp ('t48');
    !!!parse-error (type => 'in CDATA:#eof', token => $token);
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  }

  unless (defined $self->{inner_html_node}) {
    !!!cp ('t50');
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    $insert->($script_el);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  } else {
    !!!cp ('t49');
    ## TODO: mark as "already executed"
  }

  !!!next-token;
}; # $script_start_tag
4087
4088 ## NOTE: $open_tables->[-1]->[0] is the "current table" element node.
4089 ## NOTE: $open_tables->[-1]->[1] is the "tainted" flag.
4090 my $open_tables = [[$self->{open_elements}->[0]->[0]]];
4091
4092 my $formatting_end_tag = sub {
4093 my $end_tag_token = shift;
4094 my $tag_name = $end_tag_token->{tag_name};
4095
4096 ## NOTE: The adoption agency algorithm (AAA).
4097
4098 FET: {
4099 ## Step 1
4100 my $formatting_element;
4101 my $formatting_element_i_in_active;
4102 AFE: for (reverse 0..$#$active_formatting_elements) {
4103 if ($active_formatting_elements->[$_]->[0] eq '#marker') {
4104 !!!cp ('t52');
4105 last AFE;
4106 } elsif ($active_formatting_elements->[$_]->[0]->manakai_local_name
4107 eq $tag_name) {
4108 !!!cp ('t51');
4109 $formatting_element = $active_formatting_elements->[$_];
4110 $formatting_element_i_in_active = $_;
4111 last AFE;
4112 }
4113 } # AFE
4114 unless (defined $formatting_element) {
4115 !!!cp ('t53');
4116 !!!parse-error (type => 'unmatched end tag', text => $tag_name, token => $end_tag_token);
4117 ## Ignore the token
4118 !!!next-token;
4119 return;
4120 }
4121 ## has an element in scope
4122 my $in_scope = 1;
4123 my $formatting_element_i_in_open;
4124 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4125 my $node = $self->{open_elements}->[$_];
4126 if ($node->[0] eq $formatting_element->[0]) {
4127 if ($in_scope) {
4128 !!!cp ('t54');
4129 $formatting_element_i_in_open = $_;
4130 last INSCOPE;
4131 } else { # in open elements but not in scope
4132 !!!cp ('t55');
4133 !!!parse-error (type => 'unmatched end tag',
4134 text => $token->{tag_name},
4135 token => $end_tag_token);
4136 ## Ignore the token
4137 !!!next-token;
4138 return;
4139 }
4140 } elsif ($node->[1] & SCOPING_EL) {
4141 !!!cp ('t56');
4142 $in_scope = 0;
4143 }
4144 } # INSCOPE
4145 unless (defined $formatting_element_i_in_open) {
4146 !!!cp ('t57');
4147 !!!parse-error (type => 'unmatched end tag',
4148 text => $token->{tag_name},
4149 token => $end_tag_token);
4150 pop @$active_formatting_elements; # $formatting_element
4151 !!!next-token; ## TODO: ok?
4152 return;
4153 }
4154 if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
4155 !!!cp ('t58');
4156 !!!parse-error (type => 'not closed',
4157 text => $self->{open_elements}->[-1]->[0]
4158 ->manakai_local_name,
4159 token => $end_tag_token);
4160 }
4161
4162 ## Step 2
4163 my $furthest_block;
4164 my $furthest_block_i_in_open;
4165 OE: for (reverse 0..$#{$self->{open_elements}}) {
4166 my $node = $self->{open_elements}->[$_];
4167 if (not ($node->[1] & FORMATTING_EL) and
4168 #not $phrasing_category->{$node->[1]} and
4169 ($node->[1] & SPECIAL_EL or
4170 $node->[1] & SCOPING_EL)) { ## Scoping is redundant, maybe
4171 !!!cp ('t59');
4172 $furthest_block = $node;
4173 $furthest_block_i_in_open = $_;
4174 } elsif ($node->[0] eq $formatting_element->[0]) {
4175 !!!cp ('t60');
4176 last OE;
4177 }
4178 } # OE
4179
4180 ## Step 3
4181 unless (defined $furthest_block) { # MUST
4182 !!!cp ('t61');
4183 splice @{$self->{open_elements}}, $formatting_element_i_in_open;
4184 splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
4185 !!!next-token;
4186 return;
4187 }
4188
4189 ## Step 4
4190 my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];
4191
4192 ## Step 5
4193 my $furthest_block_parent = $furthest_block->[0]->parent_node;
4194 if (defined $furthest_block_parent) {
4195 !!!cp ('t62');
4196 $furthest_block_parent->remove_child ($furthest_block->[0]);
4197 }
4198
4199 ## Step 6
4200 my $bookmark_prev_el
4201 = $active_formatting_elements->[$formatting_element_i_in_active - 1]
4202 ->[0];
4203
4204 ## Step 7
4205 my $node = $furthest_block;
4206 my $node_i_in_open = $furthest_block_i_in_open;
4207 my $last_node = $furthest_block;
4208 S7: {
4209 ## Step 1
4210 $node_i_in_open--;
4211 $node = $self->{open_elements}->[$node_i_in_open];
4212
4213 ## Step 2
4214 my $node_i_in_active;
4215 S7S2: {
4216 for (reverse 0..$#$active_formatting_elements) {
4217 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
4218 !!!cp ('t63');
4219 $node_i_in_active = $_;
4220 last S7S2;
4221 }
4222 }
4223 splice @{$self->{open_elements}}, $node_i_in_open, 1;
4224 redo S7;
4225 } # S7S2
4226
4227 ## Step 3
4228 last S7 if $node->[0] eq $formatting_element->[0];
4229
4230 ## Step 4
4231 if ($last_node->[0] eq $furthest_block->[0]) {
4232 !!!cp ('t64');
4233 $bookmark_prev_el = $node->[0];
4234 }
4235
4236 ## Step 5
4237 if ($node->[0]->has_child_nodes ()) {
4238 !!!cp ('t65');
4239 my $clone = [$node->[0]->clone_node (0), $node->[1]];
4240 $active_formatting_elements->[$node_i_in_active] = $clone;
4241 $self->{open_elements}->[$node_i_in_open] = $clone;
4242 $node = $clone;
4243 }
4244
4245 ## Step 6
4246 $node->[0]->append_child ($last_node->[0]);
4247
4248 ## Step 7
4249 $last_node = $node;
4250
4251 ## Step 8
4252 redo S7;
4253 } # S7
4254
4255 ## Step 8
4256 if ($common_ancestor_node->[1] & TABLE_ROWS_EL) {
4257 my $foster_parent_element;
4258 my $next_sibling;
4259 OE: for (reverse 0..$#{$self->{open_elements}}) {
4260 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
4261 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4262 if (defined $parent and $parent->node_type == 1) {
4263 !!!cp ('t65.1');
4264 $foster_parent_element = $parent;
4265 $next_sibling = $self->{open_elements}->[$_]->[0];
4266 } else {
4267 !!!cp ('t65.2');
4268 $foster_parent_element
4269 = $self->{open_elements}->[$_ - 1]->[0];
4270 }
4271 last OE;
4272 }
4273 } # OE
4274 $foster_parent_element = $self->{open_elements}->[0]->[0]
4275 unless defined $foster_parent_element;
4276 $foster_parent_element->insert_before ($last_node->[0], $next_sibling);
4277 $open_tables->[-1]->[1] = 1; # tainted
4278 } else {
4279 !!!cp ('t65.3');
4280 $common_ancestor_node->[0]->append_child ($last_node->[0]);
4281 }
4282
4283 ## Step 9
4284 my $clone = [$formatting_element->[0]->clone_node (0),
4285 $formatting_element->[1]];
4286
4287 ## Step 10
4288 my @cn = @{$furthest_block->[0]->child_nodes};
4289 $clone->[0]->append_child ($_) for @cn;
4290
4291 ## Step 11
4292 $furthest_block->[0]->append_child ($clone->[0]);
4293
4294 ## Step 12
4295 my $i;
4296 AFE: for (reverse 0..$#$active_formatting_elements) {
4297 if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
4298 !!!cp ('t66');
4299 splice @$active_formatting_elements, $_, 1;
4300 $i-- and last AFE if defined $i;
4301 } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
4302 !!!cp ('t67');
4303 $i = $_;
4304 }
4305 } # AFE
4306 splice @$active_formatting_elements, $i + 1, 0, $clone;
4307
4308 ## Step 13
4309 undef $i;
4310 OE: for (reverse 0..$#{$self->{open_elements}}) {
4311 if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
4312 !!!cp ('t68');
4313 splice @{$self->{open_elements}}, $_, 1;
4314 $i-- and last OE if defined $i;
4315 } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
4316 !!!cp ('t69');
4317 $i = $_;
4318 }
4319 } # OE
4320 splice @{$self->{open_elements}}, $i + 1, 1, $clone;
4321
4322 ## Step 14
4323 redo FET;
4324 } # FET
4325 }; # $formatting_end_tag
4326
## Default insertion callback: append the given node as the last child
## of the current node, i.e. the element held by the last entry of the
## stack of open elements.  The same closure is installed in |$insert|.
$insert = my $insert_to_current = sub {
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($_[0]);
}; # $insert_to_current
4330
## Insertion callback with foster parenting.
##
## If the category of the current node does not have the
## |TABLE_ROWS_EL| bit set, |$child| is simply appended to the current
## node.  Otherwise a foster parent is chosen by scanning the stack of
## open elements from the top for the nearest |TABLE_EL| entry:
##
##   - if that table has a parent which is an element node (node type
##     1), |$child| is inserted into the parent just before the table;
##   - otherwise |$child| is inserted into the element recorded just
##     below the table in the stack, with no reference node;
##   - if no table entry is found at all, the bottom entry of the
##     stack is used, again with no reference node.
##
## After a foster-parented insertion the innermost entry of
## |$open_tables| is flagged as tainted.
my $insert_to_foster = sub {
  my ($child) = @_;
  my $open_elements = $self->{open_elements};

  ## Ordinary case: no foster parenting required.
  unless ($open_elements->[-1]->[1] & TABLE_ROWS_EL) {
    !!!cp ('t72');
    return $open_elements->[-1]->[0]->append_child ($child);
  }

  # MUST
  my ($foster_parent, $reference_node);
  TABLE: for my $i (reverse 0..$#$open_elements) {
    my $entry = $open_elements->[$i];
    next TABLE unless $entry->[1] & TABLE_EL;

    my $table_parent = $entry->[0]->parent_node;
    if (defined $table_parent and $table_parent->node_type == 1) {
      !!!cp ('t70');
      ## Insert right before the table, inside the table's parent.
      ($foster_parent, $reference_node) = ($table_parent, $entry->[0]);
    } else {
      !!!cp ('t71');
      ## The table has no element parent: use the stack entry just
      ## below the table instead.
      $foster_parent = $open_elements->[$i - 1]->[0];
    }
    last TABLE;
  } # TABLE

  ## No table entry in the stack: fall back to the bottom entry.
  $foster_parent = $open_elements->[0]->[0]
      unless defined $foster_parent;

  $foster_parent->insert_before ($child, $reference_node);
  $open_tables->[-1]->[1] = 1; # tainted
}; # $insert_to_foster
4362
4363 B: while (1) {
4364 if ($token->{type} == DOCTYPE_TOKEN) {
4365 !!!cp ('t73');
4366 !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
4367 ## Ignore the token
4368 ## Stay in the phase
4369 !!!next-token;
4370 next B;
4371 } elsif ($token->{type} == START_TAG_TOKEN and
4372 $token->{tag_name} eq 'html') {
4373 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4374 !!!cp ('t79');
4375 !!!parse-error (type => 'after html', text => 'html', token => $token);
4376 $self->{insertion_mode} = AFTER_BODY_IM;
4377 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4378 !!!cp ('t80');
4379 !!!parse-error (type => 'after html', text => 'html', token => $token);
4380 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4381 } else {
4382 !!!cp ('t81');
4383 }
4384
4385 !!!cp ('t82');
4386 !!!parse-error (type => 'not first start tag', token => $token);
4387 my $top_el = $self->{open_elements}->[0]->[0];
4388 for my $attr_name (keys %{$token->{attributes}}) {
4389 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
4390 !!!cp ('t84');
4391 $top_el->set_attribute_ns
4392 (undef, [undef, $attr_name],
4393 $token->{attributes}->{$attr_name}->{value});
4394 }
4395 }
4396 !!!nack ('t84.1');
4397 !!!next-token;
4398 next B;
4399 } elsif ($token->{type} == COMMENT_TOKEN) {
4400 my $comment = $self->{document}->create_comment ($token->{data});
4401 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
4402 !!!cp ('t85');
4403 $self->{document}->append_child ($comment);
4404 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
4405 !!!cp ('t86');
4406 $self->{open_elements}->[0]->[0]->append_child ($comment);
4407 } else {
4408 !!!cp ('t87');
4409 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4410 }
4411 !!!next-token;
4412 next B;
4413 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
4414 if ($token->{type} == CHARACTER_TOKEN) {
4415 !!!cp ('t87.1');
4416 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4417 !!!next-token;
4418 next B;
4419 } elsif ($token->{type} == START_TAG_TOKEN) {
4420 if ((not {mglyph => 1, malignmark => 1}->{$token->{tag_name}} and
4421 $self->{open_elements}->[-1]->[1] & FOREIGN_FLOW_CONTENT_EL) or
4422 not ($self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
4423 ($token->{tag_name} eq 'svg' and
4424 $self->{open_elements}->[-1]->[1] & MML_AXML_EL)) {
4425 ## NOTE: "using the rules for secondary insertion mode" then "continue"
4426 !!!cp ('t87.2');
4427 #
4428 } elsif ({
4429 b => 1, big => 1, blockquote => 1, body => 1, br => 1,
4430 center => 1, code => 1, dd => 1, div => 1, dl => 1, dt => 1,
4431 em => 1, embed => 1, font => 1, h1 => 1, h2 => 1, h3 => 1,
4432 h4 => 1, h5 => 1, h6 => 1, head => 1, hr => 1, i => 1,
4433 img => 1, li => 1, listing => 1, menu => 1, meta => 1,
4434 nobr => 1, ol => 1, p => 1, pre => 1, ruby => 1, s => 1,
4435 small => 1, span => 1, strong => 1, strike => 1, sub => 1,
4436 sup => 1, table => 1, tt => 1, u => 1, ul => 1, var => 1,
4437 }->{$token->{tag_name}}) {
4438 !!!cp ('t87.2');
4439 !!!parse-error (type => 'not closed',
4440 text => $self->{open_elements}->[-1]->[0]
4441 ->manakai_local_name,
4442 token => $token);
4443
4444 pop @{$self->{open_elements}}
4445 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4446
4447 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4448 ## Reprocess.
4449 next B;
4450 } else {
4451 my $nsuri = $self->{open_elements}->[-1]->[0]->namespace_uri;
4452 my $tag_name = $token->{tag_name};
4453 if ($nsuri eq $SVG_NS) {
4454 $tag_name = {
4455 altglyph => 'altGlyph',
4456 altglyphdef => 'altGlyphDef',
4457 altglyphitem => 'altGlyphItem',
4458 animatecolor => 'animateColor',
4459 animatemotion => 'animateMotion',
4460 animatetransform => 'animateTransform',
4461 clippath => 'clipPath',
4462 feblend => 'feBlend',
4463 fecolormatrix => 'feColorMatrix',
4464 fecomponenttransfer => 'feComponentTransfer',
4465 fecomposite => 'feComposite',
4466 feconvolvematrix => 'feConvolveMatrix',
4467 fediffuselighting => 'feDiffuseLighting',
4468 fedisplacementmap => 'feDisplacementMap',
4469 fedistantlight => 'feDistantLight',
4470 feflood => 'feFlood',
4471 fefunca => 'feFuncA',
4472 fefuncb => 'feFuncB',
4473 fefuncg => 'feFuncG',
4474 fefuncr => 'feFuncR',
4475 fegaussianblur => 'feGaussianBlur',
4476 feimage => 'feImage',
4477 femerge => 'feMerge',
4478 femergenode => 'feMergeNode',
4479 femorphology => 'feMorphology',
4480 feoffset => 'feOffset',
4481 fepointlight => 'fePointLight',
4482 fespecularlighting => 'feSpecularLighting',
4483 fespotlight => 'feSpotLight',
4484 fetile => 'feTile',
4485 feturbulence => 'feTurbulence',
4486 foreignobject => 'foreignObject',
4487 glyphref => 'glyphRef',
4488 lineargradient => 'linearGradient',
4489 radialgradient => 'radialGradient',
4490 #solidcolor => 'solidColor', ## NOTE: Commented in spec (SVG1.2)
4491 textpath => 'textPath',
4492 }->{$tag_name} || $tag_name;
4493 }
4494
4495 ## "adjust SVG attributes" (SVG only) - done in insert-element-f
4496
4497 ## "adjust foreign attributes" - done in insert-element-f
4498
4499 !!!insert-element-f ($nsuri, $tag_name, $token->{attributes}, $token);
4500
4501 if ($self->{self_closing}) {
4502 pop @{$self->{open_elements}};
4503 !!!ack ('t87.3');
4504 } else {
4505 !!!cp ('t87.4');
4506 }
4507
4508 !!!next-token;
4509 next B;
4510 }
4511 } elsif ($token->{type} == END_TAG_TOKEN) {
4512 ## NOTE: "using the rules for secondary insertion mode" then "continue"
4513 !!!cp ('t87.5');
4514 #
4515 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4516 !!!cp ('t87.6');
4517 !!!parse-error (type => 'not closed',
4518 text => $self->{open_elements}->[-1]->[0]
4519 ->manakai_local_name,
4520 token => $token);
4521
4522 pop @{$self->{open_elements}}
4523 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4524
4525 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4526 ## Reprocess.
4527 next B;
4528 } else {
4529 die "$0: $token->{type}: Unknown token type";
4530 }
4531 }
4532
4533 if ($self->{insertion_mode} & HEAD_IMS) {
4534 if ($token->{type} == CHARACTER_TOKEN) {
4535 if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
4536 unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4537 !!!cp ('t88.2');
4538 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4539 #
4540 } else {
4541 !!!cp ('t88.1');
4542 ## Ignore the token.
4543 #
4544 }
4545 unless (length $token->{data}) {
4546 !!!cp ('t88');
4547 !!!next-token;
4548 next B;
4549 }
4550 ## TODO: set $token->{column} appropriately
4551 }
4552
4553 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4554 !!!cp ('t89');
4555 ## As if <head>
4556 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4557 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4558 push @{$self->{open_elements}},
4559 [$self->{head_element}, $el_category->{head}];
4560
4561 ## Reprocess in the "in head" insertion mode...
4562 pop @{$self->{open_elements}};
4563
4564 ## Reprocess in the "after head" insertion mode...
4565 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4566 !!!cp ('t90');
4567 ## As if </noscript>
4568 pop @{$self->{open_elements}};
4569 !!!parse-error (type => 'in noscript:#text', token => $token);
4570
4571 ## Reprocess in the "in head" insertion mode...
4572 ## As if </head>
4573 pop @{$self->{open_elements}};
4574
4575 ## Reprocess in the "after head" insertion mode...
4576 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4577 !!!cp ('t91');
4578 pop @{$self->{open_elements}};
4579
4580 ## Reprocess in the "after head" insertion mode...
4581 } else {
4582 !!!cp ('t92');
4583 }
4584
4585 ## "after head" insertion mode
4586 ## As if <body>
4587 !!!insert-element ('body',, $token);
4588 $self->{insertion_mode} = IN_BODY_IM;
4589 ## reprocess
4590 next B;
4591 } elsif ($token->{type} == START_TAG_TOKEN) {
4592 if ($token->{tag_name} eq 'head') {
4593 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4594 !!!cp ('t93');
4595 !!!create-element ($self->{head_element}, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
4596 $self->{open_elements}->[-1]->[0]->append_child
4597 ($self->{head_element});
4598 push @{$self->{open_elements}},
4599 [$self->{head_element}, $el_category->{head}];
4600 $self->{insertion_mode} = IN_HEAD_IM;
4601 !!!nack ('t93.1');
4602 !!!next-token;
4603 next B;
4604 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4605 !!!cp ('t93.2');
4606 !!!parse-error (type => 'after head', text => 'head',
4607 token => $token);
4608 ## Ignore the token
4609 !!!nack ('t93.3');
4610 !!!next-token;
4611 next B;
4612 } else {
4613 !!!cp ('t95');
4614 !!!parse-error (type => 'in head:head',
4615 token => $token); # or in head noscript
4616 ## Ignore the token
4617 !!!nack ('t95.1');
4618 !!!next-token;
4619 next B;
4620 }
4621 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4622 !!!cp ('t96');
4623 ## As if <head>
4624 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4625 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4626 push @{$self->{open_elements}},
4627 [$self->{head_element}, $el_category->{head}];
4628
4629 $self->{insertion_mode} = IN_HEAD_IM;
4630 ## Reprocess in the "in head" insertion mode...
4631 } else {
4632 !!!cp ('t97');
4633 }
4634
4635 if ($token->{tag_name} eq 'base') {
4636 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4637 !!!cp ('t98');
4638 ## As if </noscript>
4639 pop @{$self->{open_elements}};
4640 !!!parse-error (type => 'in noscript', text => 'base',
4641 token => $token);
4642
4643 $self->{insertion_mode} = IN_HEAD_IM;
4644 ## Reprocess in the "in head" insertion mode...
4645 } else {
4646 !!!cp ('t99');
4647 }
4648
4649 ## NOTE: There is a "as if in head" code clone.
4650 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4651 !!!cp ('t100');
4652 !!!parse-error (type => 'after head',
4653 text => $token->{tag_name}, token => $token);
4654 push @{$self->{open_elements}},
4655 [$self->{head_element}, $el_category->{head}];
4656 } else {
4657 !!!cp ('t101');
4658 }
4659 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4660 pop @{$self->{open_elements}};
4661 pop @{$self->{open_elements}} # <head>
4662 if $self->{insertion_mode} == AFTER_HEAD_IM;
4663 !!!nack ('t101.1');
4664 !!!next-token;
4665 next B;
4666 } elsif ($token->{tag_name} eq 'link') {
4667 ## NOTE: There is a "as if in head" code clone.
4668 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4669 !!!cp ('t102');
4670 !!!parse-error (type => 'after head',
4671 text => $token->{tag_name}, token => $token);
4672 push @{$self->{open_elements}},
4673 [$self->{head_element}, $el_category->{head}];
4674 } else {
4675 !!!cp ('t103');
4676 }
4677 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4678 pop @{$self->{open_elements}};
4679 pop @{$self->{open_elements}} # <head>
4680 if $self->{insertion_mode} == AFTER_HEAD_IM;
4681 !!!ack ('t103.1');
4682 !!!next-token;
4683 next B;
4684 } elsif ($token->{tag_name} eq 'command' or
4685 $token->{tag_name} eq 'eventsource') {
4686 if ($self->{insertion_mode} == IN_HEAD_IM) {
4687 ## NOTE: If the insertion mode at the time of the emission
4688 ## of the token was "before head", $self->{insertion_mode}
4689 ## is already changed to |IN_HEAD_IM|.
4690
4691 ## NOTE: There is a "as if in head" code clone.
4692 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4693 pop @{$self->{open_elements}};
4694 pop @{$self->{open_elements}} # <head>
4695 if $self->{insertion_mode} == AFTER_HEAD_IM;
4696 !!!ack ('t103.2');
4697 !!!next-token;
4698 next B;
4699 } else {
4700 ## NOTE: "in head noscript" or "after head" insertion mode
4701 ## - in these cases, these tags are treated as same as
4702 ## normal in-body tags.
4703 !!!cp ('t103.3');
4704 #
4705 }
4706 } elsif ($token->{tag_name} eq 'meta') {
4707 ## NOTE: There is a "as if in head" code clone.
4708 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4709 !!!cp ('t104');
4710 !!!parse-error (type => 'after head',
4711 text => $token->{tag_name}, token => $token);
4712 push @{$self->{open_elements}},
4713 [$self->{head_element}, $el_category->{head}];
4714 } else {
4715 !!!cp ('t105');
4716 }
4717 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4718 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4719
4720 unless ($self->{confident}) {
4721 if ($token->{attributes}->{charset}) {
4722 !!!cp ('t106');
4723 ## NOTE: Whether the encoding is supported or not is handled
4724 ## in the {change_encoding} callback.
4725 $self->{change_encoding}
4726 ->($self, $token->{attributes}->{charset}->{value},
4727 $token);
4728
4729 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4730 ->set_user_data (manakai_has_reference =>
4731 $token->{attributes}->{charset}
4732 ->{has_reference});
4733 } elsif ($token->{attributes}->{content}) {
4734 if ($token->{attributes}->{content}->{value}
4735 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4736 [\x09\x0A\x0C\x0D\x20]*=
4737 [\x09\x0A\x0C\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4738 ([^"'\x09\x0A\x0C\x0D\x20]
4739 [^\x09\x0A\x0C\x0D\x20\x3B]*))/x) {
4740 !!!cp ('t107');
4741 ## NOTE: Whether the encoding is supported or not is handled
4742 ## in the {change_encoding} callback.
4743 $self->{change_encoding}
4744 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
4745 $token);
4746 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4747 ->set_user_data (manakai_has_reference =>
4748 $token->{attributes}->{content}
4749 ->{has_reference});
4750 } else {
4751 !!!cp ('t108');
4752 }
4753 }
4754 } else {
4755 if ($token->{attributes}->{charset}) {
4756 !!!cp ('t109');
4757 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4758 ->set_user_data (manakai_has_reference =>
4759 $token->{attributes}->{charset}
4760 ->{has_reference});
4761 }
4762 if ($token->{attributes}->{content}) {
4763 !!!cp ('t110');
4764 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4765 ->set_user_data (manakai_has_reference =>
4766 $token->{attributes}->{content}
4767 ->{has_reference});
4768 }
4769 }
4770
4771 pop @{$self->{open_elements}} # <head>
4772 if $self->{insertion_mode} == AFTER_HEAD_IM;
4773 !!!ack ('t110.1');
4774 !!!next-token;
4775 next B;
4776 } elsif ($token->{tag_name} eq 'title') {
4777 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4778 !!!cp ('t111');
4779 ## As if </noscript>
4780 pop @{$self->{open_elements}};
4781 !!!parse-error (type => 'in noscript', text => 'title',
4782 token => $token);
4783
4784 $self->{insertion_mode} = IN_HEAD_IM;
4785 ## Reprocess in the "in head" insertion mode...
4786 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4787 !!!cp ('t112');
4788 !!!parse-error (type => 'after head',
4789 text => $token->{tag_name}, token => $token);
4790 push @{$self->{open_elements}},
4791 [$self->{head_element}, $el_category->{head}];
4792 } else {
4793 !!!cp ('t113');
4794 }
4795
4796 ## NOTE: There is a "as if in head" code clone.
4797 my $parent = defined $self->{head_element} ? $self->{head_element}
4798 : $self->{open_elements}->[-1]->[0];
4799 $parse_rcdata->(RCDATA_CONTENT_MODEL);
4800 pop @{$self->{open_elements}} # <head>
4801 if $self->{insertion_mode} == AFTER_HEAD_IM;
4802 next B;
4803 } elsif ($token->{tag_name} eq 'style' or
4804 $token->{tag_name} eq 'noframes') {
4805 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
4806 ## insertion mode IN_HEAD_IM)
4807 ## NOTE: There is a "as if in head" code clone.
4808 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4809 !!!cp ('t114');
4810 !!!parse-error (type => 'after head',
4811 text => $token->{tag_name}, token => $token);
4812 push @{$self->{open_elements}},
4813 [$self->{head_element}, $el_category->{head}];
4814 } else {
4815 !!!cp ('t115');
4816 }
4817 $parse_rcdata->(CDATA_CONTENT_MODEL);
4818 pop @{$self->{open_elements}} # <head>
4819 if $self->{insertion_mode} == AFTER_HEAD_IM;
4820 next B;
4821 } elsif ($token->{tag_name} eq 'noscript') {
4822 if ($self->{insertion_mode} == IN_HEAD_IM) {
4823 !!!cp ('t116');
4824 ## NOTE: and scripting is disabled
4825 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4826 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
4827 !!!nack ('t116.1');
4828 !!!next-token;
4829 next B;
4830 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4831 !!!cp ('t117');
4832 !!!parse-error (type => 'in noscript', text => 'noscript',
4833 token => $token);
4834 ## Ignore the token
4835 !!!nack ('t117.1');
4836 !!!next-token;
4837 next B;
4838 } else {
4839 !!!cp ('t118');
4840 #
4841 }
4842 } elsif ($token->{tag_name} eq 'script') {
4843 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4844 !!!cp ('t119');
4845 ## As if </noscript>
4846 pop @{$self->{open_elements}};
4847 !!!parse-error (type => 'in noscript', text => 'script',
4848 token => $token);
4849
4850 $self->{insertion_mode} = IN_HEAD_IM;
4851 ## Reprocess in the "in head" insertion mode...
4852 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4853 !!!cp ('t120');
4854 !!!parse-error (type => 'after head',
4855 text => $token->{tag_name}, token => $token);
4856 push @{$self->{open_elements}},
4857 [$self->{head_element}, $el_category->{head}];
4858 } else {
4859 !!!cp ('t121');
4860 }
4861
4862 ## NOTE: There is a "as if in head" code clone.
4863 $script_start_tag->();
4864 pop @{$self->{open_elements}} # <head>
4865 if $self->{insertion_mode} == AFTER_HEAD_IM;
4866 next B;
4867 } elsif ($token->{tag_name} eq 'body' or
4868 $token->{tag_name} eq 'frameset') {
4869 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4870 !!!cp ('t122');
4871 ## As if </noscript>
4872 pop @{$self->{open_elements}};
4873 !!!parse-error (type => 'in noscript',
4874 text => $token->{tag_name}, token => $token);
4875
4876 ## Reprocess in the "in head" insertion mode...
4877 ## As if </head>
4878 pop @{$self->{open_elements}};
4879
4880 ## Reprocess in the "after head" insertion mode...
4881 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4882 !!!cp ('t124');
4883 pop @{$self->{open_elements}};
4884
4885 ## Reprocess in the "after head" insertion mode...
4886 } else {
4887 !!!cp ('t125');
4888 }
4889
4890 ## "after head" insertion mode
4891 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4892 if ($token->{tag_name} eq 'body') {
4893 !!!cp ('t126');
4894 $self->{insertion_mode} = IN_BODY_IM;
4895 } elsif ($token->{tag_name} eq 'frameset') {
4896 !!!cp ('t127');
4897 $self->{insertion_mode} = IN_FRAMESET_IM;
4898 } else {
4899 die "$0: tag name: $self->{tag_name}";
4900 }
4901 !!!nack ('t127.1');
4902 !!!next-token;
4903 next B;
4904 } else {
4905 !!!cp ('t128');
4906 #
4907 }
4908
4909 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4910 !!!cp ('t129');
4911 ## As if </noscript>
4912 pop @{$self->{open_elements}};
4913 !!!parse-error (type => 'in noscript:/',
4914 text => $token->{tag_name}, token => $token);
4915
4916 ## Reprocess in the "in head" insertion mode...
4917 ## As if </head>
4918 pop @{$self->{open_elements}};
4919
4920 ## Reprocess in the "after head" insertion mode...
4921 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4922 !!!cp ('t130');
4923 ## As if </head>
4924 pop @{$self->{open_elements}};
4925
4926 ## Reprocess in the "after head" insertion mode...
4927 } else {
4928 !!!cp ('t131');
4929 }
4930
4931 ## "after head" insertion mode
4932 ## As if <body>
4933 !!!insert-element ('body',, $token);
4934 $self->{insertion_mode} = IN_BODY_IM;
4935 ## reprocess
4936 !!!ack-later;
4937 next B;
4938 } elsif ($token->{type} == END_TAG_TOKEN) {
4939 if ($token->{tag_name} eq 'head') {
4940 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4941 !!!cp ('t132');
4942 ## As if <head>
4943 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4944 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4945 push @{$self->{open_elements}},
4946 [$self->{head_element}, $el_category->{head}];
4947
4948 ## Reprocess in the "in head" insertion mode...
4949 pop @{$self->{open_elements}};
4950 $self->{insertion_mode} = AFTER_HEAD_IM;
4951 !!!next-token;
4952 next B;
4953 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4954 !!!cp ('t133');
4955 ## As if </noscript>
4956 pop @{$self->{open_elements}};
4957 !!!parse-error (type => 'in noscript:/',
4958 text => 'head', token => $token);
4959
4960 ## Reprocess in the "in head" insertion mode...
4961 pop @{$self->{open_elements}};
4962 $self->{insertion_mode} = AFTER_HEAD_IM;
4963 !!!next-token;
4964 next B;
4965 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4966 !!!cp ('t134');
4967 pop @{$self->{open_elements}};
4968 $self->{insertion_mode} = AFTER_HEAD_IM;
4969 !!!next-token;
4970 next B;
4971 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4972 !!!cp ('t134.1');
4973 !!!parse-error (type => 'unmatched end tag', text => 'head',
4974 token => $token);
4975 ## Ignore the token
4976 !!!next-token;
4977 next B;
4978 } else {
4979 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4980 }
4981 } elsif ($token->{tag_name} eq 'noscript') {
4982 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4983 !!!cp ('t136');
4984 pop @{$self->{open_elements}};
4985 $self->{insertion_mode} = IN_HEAD_IM;
4986 !!!next-token;
4987 next B;
4988 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM or
4989 $self->{insertion_mode} == AFTER_HEAD_IM) {
4990 !!!cp ('t137');
4991 !!!parse-error (type => 'unmatched end tag',
4992 text => 'noscript', token => $token);
4993 ## Ignore the token ## ISSUE: An issue in the spec.
4994 !!!next-token;
4995 next B;
4996 } else {
4997 !!!cp ('t138');
4998 #
4999 }
5000 } elsif ({
5001 body => 1, html => 1,
5002 }->{$token->{tag_name}}) {
5003 if ($self->{insertion_mode} == BEFORE_HEAD_IM or
5004 $self->{insertion_mode} == IN_HEAD_IM or
5005 $self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
5006 !!!cp ('t140');
5007 !!!parse-error (type => 'unmatched end tag',
5008 text => $token->{tag_name}, token => $token);
5009 ## Ignore the token
5010 !!!next-token;
5011 next B;
5012 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
5013 !!!cp ('t140.1');
5014 !!!parse-error (type => 'unmatched end tag',
5015 text => $token->{tag_name}, token => $token);
5016 ## Ignore the token
5017 !!!next-token;
5018 next B;
5019 } else {
5020 die "$0: $self->{insertion_mode}: Unknown insertion mode";
5021 }
5022 } elsif ($token->{tag_name} eq 'p') {
5023 !!!cp ('t142');
5024 !!!parse-error (type => 'unmatched end tag',
5025 text => $token->{tag_name}, token => $token);
5026 ## Ignore the token
5027 !!!next-token;
5028 next B;
5029 } elsif ($token->{tag_name} eq 'br') {
5030 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
5031 !!!cp ('t142.2');
5032 ## (before head) as if <head>, (in head) as if </head>
5033 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
5034 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
5035 $self->{insertion_mode} = AFTER_HEAD_IM;
5036
5037 ## Reprocess in the "after head" insertion mode...
5038 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
5039 !!!cp ('t143.2');
5040 ## As if </head>
5041 pop @{$self->{open_elements}};
5042 $self->{insertion_mode} = AFTER_HEAD_IM;
5043
5044 ## Reprocess in the "after head" insertion mode...
5045 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
5046 !!!cp ('t143.3');
5047 ## ISSUE: Two parse errors for <head><noscript></br>
5048 !!!parse-error (type => 'unmatched end tag',
5049 text => 'br', token => $token);
5050 ## As if </noscript>
5051 pop @{$self->{open_elements}};
5052 $self->{insertion_mode} = IN_HEAD_IM;
5053
5054 ## Reprocess in the "in head" insertion mode...
5055 ## As if </head>
5056 pop @{$self->{open_elements}};
5057 $self->{insertion_mode} = AFTER_HEAD_IM;
5058
5059 ## Reprocess in the "after head" insertion mode...
5060 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
5061 !!!cp ('t143.4');
5062 #
5063 } else {
5064 die "$0: $self->{insertion_mode}: Unknown insertion mode";
5065 }
5066
5067 ## ISSUE: does not agree with IE7 - it doesn't ignore </br>.
5068 !!!parse-error (type => 'unmatched end tag',
5069 text => 'br', token => $token);
5070 ## Ignore the token
5071 !!!next-token;
5072 next B;
5073 } else {
5074 !!!cp ('t145');
5075 !!!parse-error (type => 'unmatched end tag',
5076 text => $token->{tag_name}, token => $token);
5077 ## Ignore the token
5078 !!!next-token;
5079 next B;
5080 }
5081
5082 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
5083 !!!cp ('t146');
5084 ## As if </noscript>
5085 pop @{$self->{open_elements}};
5086 !!!parse-error (type => 'in noscript:/',
5087 text => $token->{tag_name}, token => $token);
5088
5089 ## Reprocess in the "in head" insertion mode...
5090 ## As if </head>
5091 pop @{$self->{open_elements}};
5092
5093 ## Reprocess in the "after head" insertion mode...
5094 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
5095 !!!cp ('t147');
5096 ## As if </head>
5097 pop @{$self->{open_elements}};
5098
5099 ## Reprocess in the "after head" insertion mode...
5100 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
5101 ## ISSUE: This case cannot be reached?
5102 !!!cp ('t148');
5103 !!!parse-error (type => 'unmatched end tag',
5104 text => $token->{tag_name}, token => $token);
5105 ## Ignore the token ## ISSUE: An issue in the spec.
5106 !!!next-token;
5107 next B;
5108 } else {
5109 !!!cp ('t149');
5110 }
5111
5112 ## "after head" insertion mode
5113 ## As if <body>
5114 !!!insert-element ('body',, $token);
5115 $self->{insertion_mode} = IN_BODY_IM;
5116 ## reprocess
5117 next B;
5118 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5119 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
5120 !!!cp ('t149.1');
5121
5122 ## NOTE: As if <head>
5123 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
5124 $self->{open_elements}->[-1]->[0]->append_child
5125 ($self->{head_element});
5126 #push @{$self->{open_elements}},
5127 # [$self->{head_element}, $el_category->{head}];
5128 #$self->{insertion_mode} = IN_HEAD_IM;
5129 ## NOTE: Reprocess.
5130
5131 ## NOTE: As if </head>
5132 #pop @{$self->{open_elements}};
5133 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
5134 ## NOTE: Reprocess.
5135
5136 #
5137 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
5138 !!!cp ('t149.2');
5139
5140 ## NOTE: As if </head>
5141 pop @{$self->{open_elements}};
5142 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
5143 ## NOTE: Reprocess.
5144
5145 #
5146 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
5147 !!!cp ('t149.3');
5148
5149 !!!parse-error (type => 'in noscript:#eof', token => $token);
5150
5151 ## As if </noscript>
5152 pop @{$self->{open_elements}};
5153 #$self->{insertion_mode} = IN_HEAD_IM;
5154 ## NOTE: Reprocess.
5155
5156 ## NOTE: As if </head>
5157 pop @{$self->{open_elements}};
5158 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
5159 ## NOTE: Reprocess.
5160
5161 #
5162 } else {
5163 !!!cp ('t149.4');
5164 #
5165 }
5166
5167 ## NOTE: As if <body>
5168 !!!insert-element ('body',, $token);
5169 $self->{insertion_mode} = IN_BODY_IM;
5170 ## NOTE: Reprocess.
5171 next B;
5172 } else {
5173 die "$0: $token->{type}: Unknown token type";
5174 }
5175
5176 ## ISSUE: An issue in the spec.
5177 } elsif ($self->{insertion_mode} & BODY_IMS) {
5178 if ($token->{type} == CHARACTER_TOKEN) {
5179 !!!cp ('t150');
5180 ## NOTE: There is a code clone of "character in body".
5181 $reconstruct_active_formatting_elements->($insert_to_current);
5182
5183 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5184
5185 !!!next-token;
5186 next B;
5187 } elsif ($token->{type} == START_TAG_TOKEN) {
5188 if ({
5189 caption => 1, col => 1, colgroup => 1, tbody => 1,
5190 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
5191 }->{$token->{tag_name}}) {
5192 if ($self->{insertion_mode} == IN_CELL_IM) {
5193 ## have an element in table scope
5194 for (reverse 0..$#{$self->{open_elements}}) {
5195 my $node = $self->{open_elements}->[$_];
5196 if ($node->[1] & TABLE_CELL_EL) {
5197 !!!cp ('t151');
5198
5199 ## Close the cell
5200 !!!back-token; # <x>
5201 $token = {type => END_TAG_TOKEN,
5202 tag_name => $node->[0]->manakai_local_name,
5203 line => $token->{line},
5204 column => $token->{column}};
5205 next B;
5206 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5207 !!!cp ('t152');
5208 ## ISSUE: This case can never be reached, maybe.
5209 last;
5210 }
5211 }
5212
5213 !!!cp ('t153');
5214 !!!parse-error (type => 'start tag not allowed',
5215 text => $token->{tag_name}, token => $token);
5216 ## Ignore the token
5217 !!!nack ('t153.1');
5218 !!!next-token;
5219 next B;
5220 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
5221 !!!parse-error (type => 'not closed', text => 'caption',
5222 token => $token);
5223
5224 ## NOTE: As if </caption>.
5225 ## have a table element in table scope
5226 my $i;
5227 INSCOPE: {
5228 for (reverse 0..$#{$self->{open_elements}}) {
5229 my $node = $self->{open_elements}->[$_];
5230 if ($node->[1] & CAPTION_EL) {
5231 !!!cp ('t155');
5232 $i = $_;
5233 last INSCOPE;
5234 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5235 !!!cp ('t156');
5236 last;
5237 }
5238 }
5239
5240 !!!cp ('t157');
5241 !!!parse-error (type => 'start tag not allowed',
5242 text => $token->{tag_name}, token => $token);
5243 ## Ignore the token
5244 !!!nack ('t157.1');
5245 !!!next-token;
5246 next B;
5247 } # INSCOPE
5248
5249 ## generate implied end tags
5250 while ($self->{open_elements}->[-1]->[1]
5251 & END_TAG_OPTIONAL_EL) {
5252 !!!cp ('t158');
5253 pop @{$self->{open_elements}};
5254 }
5255
5256 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5257 !!!cp ('t159');
5258 !!!parse-error (type => 'not closed',
5259 text => $self->{open_elements}->[-1]->[0]
5260 ->manakai_local_name,
5261 token => $token);
5262 } else {
5263 !!!cp ('t160');
5264 }
5265
5266 splice @{$self->{open_elements}}, $i;
5267
5268 $clear_up_to_marker->();
5269
5270 $self->{insertion_mode} = IN_TABLE_IM;
5271
5272 ## reprocess
5273 !!!ack-later;
5274 next B;
5275 } else {
5276 !!!cp ('t161');
5277 #
5278 }
5279 } else {
5280 !!!cp ('t162');
5281 #
5282 }
5283 } elsif ($token->{type} == END_TAG_TOKEN) {
5284 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
5285 if ($self->{insertion_mode} == IN_CELL_IM) {
5286 ## have an element in table scope
5287 my $i;
5288 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5289 my $node = $self->{open_elements}->[$_];
5290 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5291 !!!cp ('t163');
5292 $i = $_;
5293 last INSCOPE;
5294 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5295 !!!cp ('t164');
5296 last INSCOPE;
5297 }
5298 } # INSCOPE
5299 unless (defined $i) {
5300 !!!cp ('t165');
5301 !!!parse-error (type => 'unmatched end tag',
5302 text => $token->{tag_name},
5303 token => $token);
5304 ## Ignore the token
5305 !!!next-token;
5306 next B;
5307 }
5308
5309 ## generate implied end tags
5310 while ($self->{open_elements}->[-1]->[1]
5311 & END_TAG_OPTIONAL_EL) {
5312 !!!cp ('t166');
5313 pop @{$self->{open_elements}};
5314 }
5315
5316 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
5317 ne $token->{tag_name}) {
5318 !!!cp ('t167');
5319 !!!parse-error (type => 'not closed',
5320 text => $self->{open_elements}->[-1]->[0]
5321 ->manakai_local_name,
5322 token => $token);
5323 } else {
5324 !!!cp ('t168');
5325 }
5326
5327 splice @{$self->{open_elements}}, $i;
5328
5329 $clear_up_to_marker->();
5330
5331 $self->{insertion_mode} = IN_ROW_IM;
5332
5333 !!!next-token;
5334 next B;
5335 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
5336 !!!cp ('t169');
5337 !!!parse-error (type => 'unmatched end tag',
5338 text => $token->{tag_name}, token => $token);
5339 ## Ignore the token
5340 !!!next-token;
5341 next B;
5342 } else {
5343 !!!cp ('t170');
5344 #
5345 }
5346 } elsif ($token->{tag_name} eq 'caption') {
5347 if ($self->{insertion_mode} == IN_CAPTION_IM) {
5348 ## have a table element in table scope
5349 my $i;
5350 INSCOPE: {
5351 for (reverse 0..$#{$self->{open_elements}}) {
5352 my $node = $self->{open_elements}->[$_];
5353 if ($node->[1] & CAPTION_EL) {
5354 !!!cp ('t171');
5355 $i = $_;
5356 last INSCOPE;
5357 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5358 !!!cp ('t172');
5359 last;
5360 }
5361 }
5362
5363 !!!cp ('t173');
5364 !!!parse-error (type => 'unmatched end tag',
5365 text => $token->{tag_name}, token => $token);
5366 ## Ignore the token
5367 !!!next-token;
5368 next B;
5369 } # INSCOPE
5370
5371 ## generate implied end tags
5372 while ($self->{open_elements}->[-1]->[1]
5373 & END_TAG_OPTIONAL_EL) {
5374 !!!cp ('t174');
5375 pop @{$self->{open_elements}};
5376 }
5377
5378 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5379 !!!cp ('t175');
5380 !!!parse-error (type => 'not closed',
5381 text => $self->{open_elements}->[-1]->[0]
5382 ->manakai_local_name,
5383 token => $token);
5384 } else {
5385 !!!cp ('t176');
5386 }
5387
5388 splice @{$self->{open_elements}}, $i;
5389
5390 $clear_up_to_marker->();
5391
5392 $self->{insertion_mode} = IN_TABLE_IM;
5393
5394 !!!next-token;
5395 next B;
5396 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
5397 !!!cp ('t177');
5398 !!!parse-error (type => 'unmatched end tag',
5399 text => $token->{tag_name}, token => $token);
5400 ## Ignore the token
5401 !!!next-token;
5402 next B;
5403 } else {
5404 !!!cp ('t178');
5405 #
5406 }
5407 } elsif ({
5408 table => 1, tbody => 1, tfoot => 1,
5409 thead => 1, tr => 1,
5410 }->{$token->{tag_name}} and
5411 $self->{insertion_mode} == IN_CELL_IM) {
5412 ## have an element in table scope
5413 my $i;
5414 my $tn;
5415 INSCOPE: {
5416 for (reverse 0..$#{$self->{open_elements}}) {
5417 my $node = $self->{open_elements}->[$_];
5418 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5419 !!!cp ('t179');
5420 $i = $_;
5421
5422 ## Close the cell
5423 !!!back-token; # </x>
5424 $token = {type => END_TAG_TOKEN, tag_name => $tn,
5425 line => $token->{line},
5426 column => $token->{column}};
5427 next B;
5428 } elsif ($node->[1] & TABLE_CELL_EL) {
5429 !!!cp ('t180');
5430 $tn = $node->[0]->manakai_local_name;
5431 ## NOTE: There is exactly one |td| or |th| element
5432 ## in scope in the stack of open elements by definition.
5433 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5434 ## ISSUE: Can this be reached?
5435 !!!cp ('t181');
5436 last;
5437 }
5438 }
5439
5440 !!!cp ('t182');
5441 !!!parse-error (type => 'unmatched end tag',
5442 text => $token->{tag_name}, token => $token);
5443 ## Ignore the token
5444 !!!next-token;
5445 next B;
5446 } # INSCOPE
5447 } elsif ($token->{tag_name} eq 'table' and
5448 $self->{insertion_mode} == IN_CAPTION_IM) {
5449 !!!parse-error (type => 'not closed', text => 'caption',
5450 token => $token);
5451
5452 ## As if </caption>
5453 ## have a table element in table scope
5454 my $i;
5455 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5456 my $node = $self->{open_elements}->[$_];
5457 if ($node->[1] & CAPTION_EL) {
5458 !!!cp ('t184');
5459 $i = $_;
5460 last INSCOPE;
5461 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5462 !!!cp ('t185');
5463 last INSCOPE;
5464 }
5465 } # INSCOPE
5466 unless (defined $i) {
5467 !!!cp ('t186');
5468 !!!parse-error (type => 'unmatched end tag',
5469 text => 'caption', token => $token);
5470 ## Ignore the token
5471 !!!next-token;
5472 next B;
5473 }
5474
5475 ## generate implied end tags
5476 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5477 !!!cp ('t187');
5478 pop @{$self->{open_elements}};
5479 }
5480
5481 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5482 !!!cp ('t188');
5483 !!!parse-error (type => 'not closed',
5484 text => $self->{open_elements}->[-1]->[0]
5485 ->manakai_local_name,
5486 token => $token);
5487 } else {
5488 !!!cp ('t189');
5489 }
5490
5491 splice @{$self->{open_elements}}, $i;
5492
5493 $clear_up_to_marker->();
5494
5495 $self->{insertion_mode} = IN_TABLE_IM;
5496
5497 ## reprocess
5498 next B;
5499 } elsif ({
5500 body => 1, col => 1, colgroup => 1, html => 1,
5501 }->{$token->{tag_name}}) {
5502 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
5503 !!!cp ('t190');
5504 !!!parse-error (type => 'unmatched end tag',
5505 text => $token->{tag_name}, token => $token);
5506 ## Ignore the token
5507 !!!next-token;
5508 next B;
5509 } else {
5510 !!!cp ('t191');
5511 #
5512 }
5513 } elsif ({
5514 tbody => 1, tfoot => 1,
5515 thead => 1, tr => 1,
5516 }->{$token->{tag_name}} and
5517 $self->{insertion_mode} == IN_CAPTION_IM) {
5518 !!!cp ('t192');
5519 !!!parse-error (type => 'unmatched end tag',
5520 text => $token->{tag_name}, token => $token);
5521 ## Ignore the token
5522 !!!next-token;
5523 next B;
5524 } else {
5525 !!!cp ('t193');
5526 #
5527 }
5528 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5529 for my $entry (@{$self->{open_elements}}) {
5530 unless ($entry->[1] & ALL_END_TAG_OPTIONAL_EL) {
5531 !!!cp ('t75');
5532 !!!parse-error (type => 'in body:#eof', token => $token);
5533 last;
5534 }
5535 }
5536
5537 ## Stop parsing.
5538 last B;
5539 } else {
5540 die "$0: $token->{type}: Unknown token type";
5541 }
5542
5543 $insert = $insert_to_current;
5544 #
5545 } elsif ($self->{insertion_mode} & TABLE_IMS) {
5546 if ($token->{type} == CHARACTER_TOKEN) {
5547 if (not $open_tables->[-1]->[1] and # tainted
5548 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
5549 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5550
5551 unless (length $token->{data}) {
5552 !!!cp ('t194');
5553 !!!next-token;
5554 next B;
5555 } else {
5556 !!!cp ('t195');
5557 }
5558 }
5559
5560 !!!parse-error (type => 'in table:#text', token => $token);
5561
5562 ## As if in body, but insert into foster parent element
5563 ## ISSUE: Spec says that "whenever a node would be inserted
5564 ## into the current node" while characters might not
5565 ## result in a new Text node.
5566 $reconstruct_active_formatting_elements->($insert_to_foster);
5567
5568 if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
5569 # MUST
5570 my $foster_parent_element;
5571 my $next_sibling;
5572 my $prev_sibling;
5573 OE: for (reverse 0..$#{$self->{open_elements}}) {
5574 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
5575 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
5576 if (defined $parent and $parent->node_type == 1) {
5577 !!!cp ('t196');
5578 $foster_parent_element = $parent;
5579 $next_sibling = $self->{open_elements}->[$_]->[0];
5580 $prev_sibling = $next_sibling->previous_sibling;
5581 } else {
5582 !!!cp ('t197');
5583 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
5584 $prev_sibling = $foster_parent_element->last_child;
5585 }
5586 last OE;
5587 }
5588 } # OE
5589 $foster_parent_element = $self->{open_elements}->[0]->[0] and
5590 $prev_sibling = $foster_parent_element->last_child
5591 unless defined $foster_parent_element;
5592 if (defined $prev_sibling and
5593 $prev_sibling->node_type == 3) {
5594 !!!cp ('t198');
5595 $prev_sibling->manakai_append_text ($token->{data});
5596 } else {
5597 !!!cp ('t199');
5598 $foster_parent_element->insert_before
5599 ($self->{document}->create_text_node ($token->{data}),
5600 $next_sibling);
5601 }
5602 $open_tables->[-1]->[1] = 1; # tainted
5603 } else {
5604 !!!cp ('t200');
5605 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5606 }
5607
5608 !!!next-token;
5609 next B;
5610 } elsif ($token->{type} == START_TAG_TOKEN) {
5611 if ({
5612 tr => ($self->{insertion_mode} != IN_ROW_IM),
5613 th => 1, td => 1,
5614 }->{$token->{tag_name}}) {
5615 if ($self->{insertion_mode} == IN_TABLE_IM) {
5616 ## Clear back to table context
5617 while (not ($self->{open_elements}->[-1]->[1]
5618 & TABLE_SCOPING_EL)) {
5619 !!!cp ('t201');
5620 pop @{$self->{open_elements}};
5621 }
5622
5623 !!!insert-element ('tbody',, $token);
5624 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5625 ## reprocess in the "in table body" insertion mode...
5626 }
5627
5628 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5629 unless ($token->{tag_name} eq 'tr') {
5630 !!!cp ('t202');
5631 !!!parse-error (type => 'missing start tag:tr', token => $token);
5632 }
5633
5634 ## Clear back to table body context
5635 while (not ($self->{open_elements}->[-1]->[1]
5636 & TABLE_ROWS_SCOPING_EL)) {
5637 !!!cp ('t203');
5638 ## ISSUE: Can this case be reached?
5639 pop @{$self->{open_elements}};
5640 }
5641
5642 $self->{insertion_mode} = IN_ROW_IM;
5643 if ($token->{tag_name} eq 'tr') {
5644 !!!cp ('t204');
5645 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5646 !!!nack ('t204');
5647 !!!next-token;
5648 next B;
5649 } else {
5650 !!!cp ('t205');
5651 !!!insert-element ('tr',, $token);
5652 ## reprocess in the "in row" insertion mode
5653 }
5654 } else {
5655 !!!cp ('t206');
5656 }
5657
5658 ## Clear back to table row context
5659 while (not ($self->{open_elements}->[-1]->[1]
5660 & TABLE_ROW_SCOPING_EL)) {
5661 !!!cp ('t207');
5662 pop @{$self->{open_elements}};
5663 }
5664
5665 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5666 $self->{insertion_mode} = IN_CELL_IM;
5667
5668 push @$active_formatting_elements, ['#marker', ''];
5669
5670 !!!nack ('t207.1');
5671 !!!next-token;
5672 next B;
5673 } elsif ({
5674 caption => 1, col => 1, colgroup => 1,
5675 tbody => 1, tfoot => 1, thead => 1,
5676 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5677 }->{$token->{tag_name}}) {
5678 if ($self->{insertion_mode} == IN_ROW_IM) {
5679 ## As if </tr>
5680 ## have an element in table scope
5681 my $i;
5682 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5683 my $node = $self->{open_elements}->[$_];
5684 if ($node->[1] & TABLE_ROW_EL) {
5685 !!!cp ('t208');
5686 $i = $_;
5687 last INSCOPE;
5688 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5689 !!!cp ('t209');
5690 last INSCOPE;
5691 }
5692 } # INSCOPE
5693 unless (defined $i) {
5694 !!!cp ('t210');
5695 ## TODO: This type is wrong (the token is a start tag; it is the implied </tr> that has no |tr| in table scope).
5696 !!!parse-error (type => 'unmatched end tag',
5697 text => $token->{tag_name}, token => $token);
5698 ## Ignore the token
5699 !!!nack ('t210.1');
5700 !!!next-token;
5701 next B;
5702 }
5703
5704 ## Clear back to table row context
5705 while (not ($self->{open_elements}->[-1]->[1]
5706 & TABLE_ROW_SCOPING_EL)) {
5707 !!!cp ('t211');
5708 ## ISSUE: Can this case be reached?
5709 pop @{$self->{open_elements}};
5710 }
5711
5712 pop @{$self->{open_elements}}; # tr
5713 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5714 if ($token->{tag_name} eq 'tr') {
5715 !!!cp ('t212');
5716 ## reprocess
5717 !!!ack-later;
5718 next B;
5719 } else {
5720 !!!cp ('t213');
5721 ## reprocess in the "in table body" insertion mode...
5722 }
5723 }
5724
5725 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5726 ## have an element in table scope
5727 my $i;
5728 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5729 my $node = $self->{open_elements}->[$_];
5730 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5731 !!!cp ('t214');
5732 $i = $_;
5733 last INSCOPE;
5734 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5735 !!!cp ('t215');
5736 last INSCOPE;
5737 }
5738 } # INSCOPE
5739 unless (defined $i) {
5740 !!!cp ('t216');
5741 ## TODO: This error type is wrong.
5742 !!!parse-error (type => 'unmatched end tag',
5743 text => $token->{tag_name}, token => $token);
5744 ## Ignore the token
5745 !!!nack ('t216.1');
5746 !!!next-token;
5747 next B;
5748 }
5749
5750 ## Clear back to table body context
5751 while (not ($self->{open_elements}->[-1]->[1]
5752 & TABLE_ROWS_SCOPING_EL)) {
5753 !!!cp ('t217');
5754 ## ISSUE: Can this state be reached?
5755 pop @{$self->{open_elements}};
5756 }
5757
5758 ## As if <{current node}>
5759 ## have an element in table scope
5760 ## true by definition
5761
5762 ## Clear back to table body context
5763 ## nop by definition
5764
5765 pop @{$self->{open_elements}};
5766 $self->{insertion_mode} = IN_TABLE_IM;
5767 ## reprocess in "in table" insertion mode...
5768 } else {
5769 !!!cp ('t218');
5770 }
5771
5772 if ($token->{tag_name} eq 'col') {
5773 ## Clear back to table context
5774 while (not ($self->{open_elements}->[-1]->[1]
5775 & TABLE_SCOPING_EL)) {
5776 !!!cp ('t219');
5777 ## ISSUE: Can this state be reached?
5778 pop @{$self->{open_elements}};
5779 }
5780
5781 !!!insert-element ('colgroup',, $token);
5782 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
5783 ## reprocess
5784 !!!ack-later;
5785 next B;
5786 } elsif ({
5787 caption => 1,
5788 colgroup => 1,
5789 tbody => 1, tfoot => 1, thead => 1,
5790 }->{$token->{tag_name}}) {
5791 ## Clear back to table context
5792 while (not ($self->{open_elements}->[-1]->[1]
5793 & TABLE_SCOPING_EL)) {
5794 !!!cp ('t220');
5795 ## ISSUE: Can this state be reached?
5796 pop @{$self->{open_elements}};
5797 }
5798
5799 push @$active_formatting_elements, ['#marker', '']
5800 if $token->{tag_name} eq 'caption';
5801
5802 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5803 $self->{insertion_mode} = {
5804 caption => IN_CAPTION_IM,
5805 colgroup => IN_COLUMN_GROUP_IM,
5806 tbody => IN_TABLE_BODY_IM,
5807 tfoot => IN_TABLE_BODY_IM,
5808 thead => IN_TABLE_BODY_IM,
5809 }->{$token->{tag_name}};
5810 !!!next-token;
5811 !!!nack ('t220.1');
5812 next B;
5813 } else {
5814 die "$0: in table: <>: $token->{tag_name}";
5815 }
5816 } elsif ($token->{tag_name} eq 'table') {
5817 !!!parse-error (type => 'not closed',
5818 text => $self->{open_elements}->[-1]->[0]
5819 ->manakai_local_name,
5820 token => $token);
5821
5822 ## As if </table>
5823 ## have a table element in table scope
5824 my $i;
5825 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5826 my $node = $self->{open_elements}->[$_];
5827 if ($node->[1] & TABLE_EL) {
5828 !!!cp ('t221');
5829 $i = $_;
5830 last INSCOPE;
5831 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5832 !!!cp ('t222');
5833 last INSCOPE;
5834 }
5835 } # INSCOPE
5836 unless (defined $i) {
5837 !!!cp ('t223');
5838 ## TODO: The following is wrong, maybe.
5839 !!!parse-error (type => 'unmatched end tag', text => 'table',
5840 token => $token);
5841 ## Ignore tokens </table><table>
5842 !!!nack ('t223.1');
5843 !!!next-token;
5844 next B;
5845 }
5846
5847 ## TODO: The following steps have been removed from the latest spec.
5848 ## generate implied end tags
5849 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5850 !!!cp ('t224');
5851 pop @{$self->{open_elements}};
5852 }
5853
5854 unless ($self->{open_elements}->[-1]->[1] & TABLE_EL) {
5855 !!!cp ('t225');
5856 ## NOTE: |<table><tr><table>|
5857 !!!parse-error (type => 'not closed',
5858 text => $self->{open_elements}->[-1]->[0]
5859 ->manakai_local_name,
5860 token => $token);
5861 } else {
5862 !!!cp ('t226');
5863 }
5864
5865 splice @{$self->{open_elements}}, $i;
5866 pop @{$open_tables};
5867
5868 $self->_reset_insertion_mode;
5869
5870 ## reprocess
5871 !!!ack-later;
5872 next B;
5873 } elsif ($token->{tag_name} eq 'style') {
5874 if (not $open_tables->[-1]->[1]) { # tainted
5875 !!!cp ('t227.8');
5876 ## NOTE: This is a "as if in head" code clone.
5877 $parse_rcdata->(CDATA_CONTENT_MODEL);
5878 next B;
5879 } else {
5880 !!!cp ('t227.7');
5881 #
5882 }
5883 } elsif ($token->{tag_name} eq 'script') {
5884 if (not $open_tables->[-1]->[1]) { # tainted
5885 !!!cp ('t227.6');
5886 ## NOTE: This is a "as if in head" code clone.
5887 $script_start_tag->();
5888 next B;
5889 } else {
5890 !!!cp ('t227.5');
5891 #
5892 }
5893 } elsif ($token->{tag_name} eq 'input') {
5894 if (not $open_tables->[-1]->[1]) { # tainted
5895 if ($token->{attributes}->{type}) { ## TODO: case
5896 my $type = lc $token->{attributes}->{type}->{value};
5897 if ($type eq 'hidden') {
5898 !!!cp ('t227.3');
5899 !!!parse-error (type => 'in table',
5900 text => $token->{tag_name}, token => $token);
5901
5902 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5903
5904 ## TODO: form element pointer
5905
5906 pop @{$self->{open_elements}};
5907
5908 !!!next-token;
5909 !!!ack ('t227.2.1');
5910 next B;
5911 } else {
5912 !!!cp ('t227.2');
5913 #
5914 }
5915 } else {
5916 !!!cp ('t227.1');
5917 #
5918 }
5919 } else {
5920 !!!cp ('t227.4');
5921 #
5922 }
5923 } else {
5924 !!!cp ('t227');
5925 #
5926 }
5927
5928 !!!parse-error (type => 'in table', text => $token->{tag_name},
5929 token => $token);
5930
5931 $insert = $insert_to_foster;
5932 #
5933 } elsif ($token->{type} == END_TAG_TOKEN) {
5934 if ($token->{tag_name} eq 'tr' and
5935 $self->{insertion_mode} == IN_ROW_IM) {
5936 ## have an element in table scope
5937 my $i;
5938 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5939 my $node = $self->{open_elements}->[$_];
5940 if ($node->[1] & TABLE_ROW_EL) {
5941 !!!cp ('t228');
5942 $i = $_;
5943 last INSCOPE;
5944 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5945 !!!cp ('t229');
5946 last INSCOPE;
5947 }
5948 } # INSCOPE
5949 unless (defined $i) {
5950 !!!cp ('t230');
5951 !!!parse-error (type => 'unmatched end tag',
5952 text => $token->{tag_name}, token => $token);
5953 ## Ignore the token
5954 !!!nack ('t230.1');
5955 !!!next-token;
5956 next B;
5957 } else {
5958 !!!cp ('t232');
5959 }
5960
5961 ## Clear back to table row context
5962 while (not ($self->{open_elements}->[-1]->[1]
5963 & TABLE_ROW_SCOPING_EL)) {
5964 !!!cp ('t231');
5965 ## ISSUE: Can this state be reached?
5966 pop @{$self->{open_elements}};
5967 }
5968
5969 pop @{$self->{open_elements}}; # tr
5970 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5971 !!!next-token;
5972 !!!nack ('t231.1');
5973 next B;
5974 } elsif ($token->{tag_name} eq 'table') {
5975 if ($self->{insertion_mode} == IN_ROW_IM) {
5976 ## As if </tr>
5977 ## have an element in table scope
5978 my $i;
5979 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5980 my $node = $self->{open_elements}->[$_];
5981 if ($node->[1] & TABLE_ROW_EL) {
5982 !!!cp ('t233');
5983 $i = $_;
5984 last INSCOPE;
5985 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5986 !!!cp ('t234');
5987 last INSCOPE;
5988 }
5989 } # INSCOPE
5990 unless (defined $i) {
5991 !!!cp ('t235');
5992 ## TODO: The following is wrong.
5993 !!!parse-error (type => 'unmatched end tag',
5994 text => $token->{type}, token => $token);
5995 ## Ignore the token
5996 !!!nack ('t236.1');
5997 !!!next-token;
5998 next B;
5999 }
6000
6001 ## Clear back to table row context
6002 while (not ($self->{open_elements}->[-1]->[1]
6003 & TABLE_ROW_SCOPING_EL)) {
6004 !!!cp ('t236');
6005 ## ISSUE: Can this state be reached?
6006 pop @{$self->{open_elements}};
6007 }
6008
6009 pop @{$self->{open_elements}}; # tr
6010 $self->{insertion_mode} = IN_TABLE_BODY_IM;
6011 ## reprocess in the "in table body" insertion mode...
6012 }
6013
6014 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
6015 ## have an element in table scope
6016 my $i;
6017 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6018 my $node = $self->{open_elements}->[$_];
6019 if ($node->[1] & TABLE_ROW_GROUP_EL) {
6020 !!!cp ('t237');
6021 $i = $_;
6022 last INSCOPE;
6023 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6024 !!!cp ('t238');
6025 last INSCOPE;
6026 }
6027 } # INSCOPE
6028 unless (defined $i) {
6029 !!!cp ('t239');
6030 !!!parse-error (type => 'unmatched end tag',
6031 text => $token->{tag_name}, token => $token);
6032 ## Ignore the token
6033 !!!nack ('t239.1');
6034 !!!next-token;
6035 next B;
6036 }
6037
6038 ## Clear back to table body context
6039 while (not ($self->{open_elements}->[-1]->[1]
6040 & TABLE_ROWS_SCOPING_EL)) {
6041 !!!cp ('t240');
6042 pop @{$self->{open_elements}};
6043 }
6044
6045 ## As if <{current node}>
6046 ## have an element in table scope
6047 ## true by definition
6048
6049 ## Clear back to table body context
6050 ## nop by definition
6051
6052 pop @{$self->{open_elements}};
6053 $self->{insertion_mode} = IN_TABLE_IM;
6054 ## reprocess in the "in table" insertion mode...
6055 }
6056
6057 ## NOTE: </table> in the "in table" insertion mode.
6058 ## When you edit the code fragment below, please ensure that
6059 ## the code for <table> in the "in table" insertion mode
6060 ## is synced with it.
6061
6062 ## have a table element in table scope
6063 my $i;
6064 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6065 my $node = $self->{open_elements}->[$_];
6066 if ($node->[1] & TABLE_EL) {
6067 !!!cp ('t241');
6068 $i = $_;
6069 last INSCOPE;
6070 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6071 !!!cp ('t242');
6072 last INSCOPE;
6073 }
6074 } # INSCOPE
6075 unless (defined $i) {
6076 !!!cp ('t243');
6077 !!!parse-error (type => 'unmatched end tag',
6078 text => $token->{tag_name}, token => $token);
6079 ## Ignore the token
6080 !!!nack ('t243.1');
6081 !!!next-token;
6082 next B;
6083 }
6084
6085 splice @{$self->{open_elements}}, $i;
6086 pop @{$open_tables};
6087
6088 $self->_reset_insertion_mode;
6089
6090 !!!next-token;
6091 next B;
6092 } elsif ({
6093 tbody => 1, tfoot => 1, thead => 1,
6094 }->{$token->{tag_name}} and
6095 $self->{insertion_mode} & ROW_IMS) {
6096 if ($self->{insertion_mode} == IN_ROW_IM) {
6097 ## have an element in table scope
6098 my $i;
6099 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6100 my $node = $self->{open_elements}->[$_];
6101 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6102 !!!cp ('t247');
6103 $i = $_;
6104 last INSCOPE;
6105 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6106 !!!cp ('t248');
6107 last INSCOPE;
6108 }
6109 } # INSCOPE
6110 unless (defined $i) {
6111 !!!cp ('t249');
6112 !!!parse-error (type => 'unmatched end tag',
6113 text => $token->{tag_name}, token => $token);
6114 ## Ignore the token
6115 !!!nack ('t249.1');
6116 !!!next-token;
6117 next B;
6118 }
6119
6120 ## As if </tr>
6121 ## have an element in table scope
6122 my $i;
6123 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6124 my $node = $self->{open_elements}->[$_];
6125 if ($node->[1] & TABLE_ROW_EL) {
6126 !!!cp ('t250');
6127 $i = $_;
6128 last INSCOPE;
6129 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6130 !!!cp ('t251');
6131 last INSCOPE;
6132 }
6133 } # INSCOPE
6134 unless (defined $i) {
6135 !!!cp ('t252');
6136 !!!parse-error (type => 'unmatched end tag',
6137 text => 'tr', token => $token);
6138 ## Ignore the token
6139 !!!nack ('t252.1');
6140 !!!next-token;
6141 next B;
6142 }
6143
6144 ## Clear back to table row context
6145 while (not ($self->{open_elements}->[-1]->[1]
6146 & TABLE_ROW_SCOPING_EL)) {
6147 !!!cp ('t253');
6148 ## ISSUE: Can this case be reached?
6149 pop @{$self->{open_elements}};
6150 }
6151
6152 pop @{$self->{open_elements}}; # tr
6153 $self->{insertion_mode} = IN_TABLE_BODY_IM;
6154 ## reprocess in the "in table body" insertion mode...
6155 }
6156
6157 ## have an element in table scope
6158 my $i;
6159 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6160 my $node = $self->{open_elements}->[$_];
6161 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6162 !!!cp ('t254');
6163 $i = $_;
6164 last INSCOPE;
6165 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6166 !!!cp ('t255');
6167 last INSCOPE;
6168 }
6169 } # INSCOPE
6170 unless (defined $i) {
6171 !!!cp ('t256');
6172 !!!parse-error (type => 'unmatched end tag',
6173 text => $token->{tag_name}, token => $token);
6174 ## Ignore the token
6175 !!!nack ('t256.1');
6176 !!!next-token;
6177 next B;
6178 }
6179
6180 ## Clear back to table body context
6181 while (not ($self->{open_elements}->[-1]->[1]
6182 & TABLE_ROWS_SCOPING_EL)) {
6183 !!!cp ('t257');
6184 ## ISSUE: Can this case be reached?
6185 pop @{$self->{open_elements}};
6186 }
6187
6188 pop @{$self->{open_elements}};
6189 $self->{insertion_mode} = IN_TABLE_IM;
6190 !!!nack ('t257.1');
6191 !!!next-token;
6192 next B;
6193 } elsif ({
6194 body => 1, caption => 1, col => 1, colgroup => 1,
6195 html => 1, td => 1, th => 1,
6196 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
6197 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
6198 }->{$token->{tag_name}}) {
6199 !!!cp ('t258');
6200 !!!parse-error (type => 'unmatched end tag',
6201 text => $token->{tag_name}, token => $token);
6202 ## Ignore the token
6203 !!!nack ('t258.1');
6204 !!!next-token;
6205 next B;
6206 } else {
6207 !!!cp ('t259');
6208 !!!parse-error (type => 'in table:/',
6209 text => $token->{tag_name}, token => $token);
6210
6211 $insert = $insert_to_foster;
6212 #
6213 }
6214 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6215 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6216 @{$self->{open_elements}} == 1) { # redundant, maybe
6217 !!!parse-error (type => 'in body:#eof', token => $token);
6218 !!!cp ('t259.1');
6219 #
6220 } else {
6221 !!!cp ('t259.2');
6222 #
6223 }
6224
6225 ## Stop parsing
6226 last B;
6227 } else {
6228 die "$0: $token->{type}: Unknown token type";
6229 }
6230 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
6231 if ($token->{type} == CHARACTER_TOKEN) {
6232 if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
6233 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6234 unless (length $token->{data}) {
6235 !!!cp ('t260');
6236 !!!next-token;
6237 next B;
6238 }
6239 }
6240
6241 !!!cp ('t261');
6242 #
6243 } elsif ($token->{type} == START_TAG_TOKEN) {
6244 if ($token->{tag_name} eq 'col') {
6245 !!!cp ('t262');
6246 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6247 pop @{$self->{open_elements}};
6248 !!!ack ('t262.1');
6249 !!!next-token;
6250 next B;
6251 } else {
6252 !!!cp ('t263');
6253 #
6254 }
6255 } elsif ($token->{type} == END_TAG_TOKEN) {
6256 if ($token->{tag_name} eq 'colgroup') {
6257 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
6258 !!!cp ('t264');
6259 !!!parse-error (type => 'unmatched end tag',
6260 text => 'colgroup', token => $token);
6261 ## Ignore the token
6262 !!!next-token;
6263 next B;
6264 } else {
6265 !!!cp ('t265');
6266 pop @{$self->{open_elements}}; # colgroup
6267 $self->{insertion_mode} = IN_TABLE_IM;
6268 !!!next-token;
6269 next B;
6270 }
6271 } elsif ($token->{tag_name} eq 'col') {
6272 !!!cp ('t266');
6273 !!!parse-error (type => 'unmatched end tag',
6274 text => 'col', token => $token);
6275 ## Ignore the token
6276 !!!next-token;
6277 next B;
6278 } else {
6279 !!!cp ('t267');
6280 #
6281 }
6282 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6283 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
6284 @{$self->{open_elements}} == 1) { # redundant, maybe
6285 !!!cp ('t270.2');
6286 ## Stop parsing.
6287 last B;
6288 } else {
6289 ## NOTE: As if </colgroup>.
6290 !!!cp ('t270.1');
6291 pop @{$self->{open_elements}}; # colgroup
6292 $self->{insertion_mode} = IN_TABLE_IM;
6293 ## Reprocess.
6294 next B;
6295 }
6296 } else {
6297 die "$0: $token->{type}: Unknown token type";
6298 }
6299
6300 ## As if </colgroup>
6301 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
6302 !!!cp ('t269');
6303 ## TODO: Wrong error type?
6304 !!!parse-error (type => 'unmatched end tag',
6305 text => 'colgroup', token => $token);
6306 ## Ignore the token
6307 !!!nack ('t269.1');
6308 !!!next-token;
6309 next B;
6310 } else {
6311 !!!cp ('t270');
6312 pop @{$self->{open_elements}}; # colgroup
6313 $self->{insertion_mode} = IN_TABLE_IM;
6314 !!!ack-later;
6315 ## reprocess
6316 next B;
6317 }
6318 } elsif ($self->{insertion_mode} & SELECT_IMS) {
6319 if ($token->{type} == CHARACTER_TOKEN) {
6320 !!!cp ('t271');
6321 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
6322 !!!next-token;
6323 next B;
6324 } elsif ($token->{type} == START_TAG_TOKEN) {
6325 if ($token->{tag_name} eq 'option') {
6326 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6327 !!!cp ('t272');
6328 ## As if </option>
6329 pop @{$self->{open_elements}};
6330 } else {
6331 !!!cp ('t273');
6332 }
6333
6334 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6335 !!!nack ('t273.1');
6336 !!!next-token;
6337 next B;
6338 } elsif ($token->{tag_name} eq 'optgroup') {
6339 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6340 !!!cp ('t274');
6341 ## As if </option>
6342 pop @{$self->{open_elements}};
6343 } else {
6344 !!!cp ('t275');
6345 }
6346
6347 if ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
6348 !!!cp ('t276');
6349 ## As if </optgroup>
6350 pop @{$self->{open_elements}};
6351 } else {
6352 !!!cp ('t277');
6353 }
6354
6355 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6356 !!!nack ('t277.1');
6357 !!!next-token;
6358 next B;
6359 } elsif ({
6360 select => 1, input => 1, textarea => 1,
6361 }->{$token->{tag_name}} or
6362 ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
6363 {
6364 caption => 1, table => 1,
6365 tbody => 1, tfoot => 1, thead => 1,
6366 tr => 1, td => 1, th => 1,
6367 }->{$token->{tag_name}})) {
6368 ## TODO: The type below is not good - <select> is replaced by </select>
6369 !!!parse-error (type => 'not closed', text => 'select',
6370 token => $token);
6371 ## NOTE: As if the token were </select> (<select> case) or
6372 ## as if there were </select> (otherwise).
6373 ## have an element in table scope
6374 my $i;
6375 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6376 my $node = $self->{open_elements}->[$_];
6377 if ($node->[1] & SELECT_EL) {
6378 !!!cp ('t278');
6379 $i = $_;
6380 last INSCOPE;
6381 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6382 !!!cp ('t279');
6383 last INSCOPE;
6384 }
6385 } # INSCOPE
6386 unless (defined $i) {
6387 !!!cp ('t280');
6388 !!!parse-error (type => 'unmatched end tag',
6389 text => 'select', token => $token);
6390 ## Ignore the token
6391 !!!nack ('t280.1');
6392 !!!next-token;
6393 next B;
6394 }
6395
6396 !!!cp ('t281');
6397 splice @{$self->{open_elements}}, $i;
6398
6399 $self->_reset_insertion_mode;
6400
6401 if ($token->{tag_name} eq 'select') {
6402 !!!nack ('t281.2');
6403 !!!next-token;
6404 next B;
6405 } else {
6406 !!!cp ('t281.1');
6407 !!!ack-later;
6408 ## Reprocess the token.
6409 next B;
6410 }
6411 } else {
6412 !!!cp ('t282');
6413 !!!parse-error (type => 'in select',
6414 text => $token->{tag_name}, token => $token);
6415 ## Ignore the token
6416 !!!nack ('t282.1');
6417 !!!next-token;
6418 next B;
6419 }
6420 } elsif ($token->{type} == END_TAG_TOKEN) {
6421 if ($token->{tag_name} eq 'optgroup') {
6422 if ($self->{open_elements}->[-1]->[1] & OPTION_EL and
6423 $self->{open_elements}->[-2]->[1] & OPTGROUP_EL) {
6424 !!!cp ('t283');
6425 ## As if </option>
6426 splice @{$self->{open_elements}}, -2;
6427 } elsif ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
6428 !!!cp ('t284');
6429 pop @{$self->{open_elements}};
6430 } else {
6431 !!!cp ('t285');
6432 !!!parse-error (type => 'unmatched end tag',
6433 text => $token->{tag_name}, token => $token);
6434 ## Ignore the token
6435 }
6436 !!!nack ('t285.1');
6437 !!!next-token;
6438 next B;
6439 } elsif ($token->{tag_name} eq 'option') {
6440 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6441 !!!cp ('t286');
6442 pop @{$self->{open_elements}};
6443 } else {
6444 !!!cp ('t287');
6445 !!!parse-error (type => 'unmatched end tag',
6446 text => $token->{tag_name}, token => $token);
6447 ## Ignore the token
6448 }
6449 !!!nack ('t287.1');
6450 !!!next-token;
6451 next B;
6452 } elsif ($token->{tag_name} eq 'select') {
6453 ## have an element in table scope
6454 my $i;
6455 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6456 my $node = $self->{open_elements}->[$_];
6457 if ($node->[1] & SELECT_EL) {
6458 !!!cp ('t288');
6459 $i = $_;
6460 last INSCOPE;
6461 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6462 !!!cp ('t289');
6463 last INSCOPE;
6464 }
6465 } # INSCOPE
6466 unless (defined $i) {
6467 !!!cp ('t290');
6468 !!!parse-error (type => 'unmatched end tag',
6469 text => $token->{tag_name}, token => $token);
6470 ## Ignore the token
6471 !!!nack ('t290.1');
6472 !!!next-token;
6473 next B;
6474 }
6475
6476 !!!cp ('t291');
6477 splice @{$self->{open_elements}}, $i;
6478
6479 $self->_reset_insertion_mode;
6480
6481 !!!nack ('t291.1');
6482 !!!next-token;
6483 next B;
6484 } elsif ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
6485 {
6486 caption => 1, table => 1, tbody => 1,
6487 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
6488 }->{$token->{tag_name}}) {
6489 ## TODO: The following is wrong?
6490 !!!parse-error (type => 'unmatched end tag',
6491 text => $token->{tag_name}, token => $token);
6492
6493 ## have an element in table scope
6494 my $i;
6495 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6496 my $node = $self->{open_elements}->[$_];
6497 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6498 !!!cp ('t292');
6499 $i = $_;
6500 last INSCOPE;
6501 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6502 !!!cp ('t293');
6503 last INSCOPE;
6504 }
6505 } # INSCOPE
6506 unless (defined $i) {
6507 !!!cp ('t294');
6508 ## Ignore the token
6509 !!!nack ('t294.1');
6510 !!!next-token;
6511 next B;
6512 }
6513
6514 ## As if </select>
6515 ## have an element in table scope
6516 undef $i;
6517 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6518 my $node = $self->{open_elements}->[$_];
6519 if ($node->[1] & SELECT_EL) {
6520 !!!cp ('t295');
6521 $i = $_;
6522 last INSCOPE;
6523 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6524 ## ISSUE: Can this state be reached?
6525 !!!cp ('t296');
6526 last INSCOPE;
6527 }
6528 } # INSCOPE
6529 unless (defined $i) {
6530 !!!cp ('t297');
6531 ## TODO: The following error type is correct?
6532 !!!parse-error (type => 'unmatched end tag',
6533 text => 'select', token => $token);
6534 ## Ignore the </select> token
6535 !!!nack ('t297.1');
6536 !!!next-token; ## TODO: ok?
6537 next B;
6538 }
6539
6540 !!!cp ('t298');
6541 splice @{$self->{open_elements}}, $i;
6542
6543 $self->_reset_insertion_mode;
6544
6545 !!!ack-later;
6546 ## reprocess
6547 next B;
6548 } else {
6549 !!!cp ('t299');
6550 !!!parse-error (type => 'in select:/',
6551 text => $token->{tag_name}, token => $token);
6552 ## Ignore the token
6553 !!!nack ('t299.3');
6554 !!!next-token;
6555 next B;
6556 }
6557 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6558 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6559 @{$self->{open_elements}} == 1) { # redundant, maybe
6560 !!!cp ('t299.1');
6561 !!!parse-error (type => 'in body:#eof', token => $token);
6562 } else {
6563 !!!cp ('t299.2');
6564 }
6565
6566 ## Stop parsing.
6567 last B;
6568 } else {
6569 die "$0: $token->{type}: Unknown token type";
6570 }
6571 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
6572 if ($token->{type} == CHARACTER_TOKEN) {
6573 if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
6574 my $data = $1;
6575 ## As if in body
6576 $reconstruct_active_formatting_elements->($insert_to_current);
6577
6578 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6579
6580 unless (length $token->{data}) {
6581 !!!cp ('t300');
6582 !!!next-token;
6583 next B;
6584 }
6585 }
6586
6587 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6588 !!!cp ('t301');
6589 !!!parse-error (type => 'after html:#text', token => $token);
6590 #
6591 } else {
6592 !!!cp ('t302');
6593 ## "after body" insertion mode
6594 !!!parse-error (type => 'after body:#text', token => $token);
6595 #
6596 }
6597
6598 $self->{insertion_mode} = IN_BODY_IM;
6599 ## reprocess
6600 next B;
6601 } elsif ($token->{type} == START_TAG_TOKEN) {
6602 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6603 !!!cp ('t303');
6604 !!!parse-error (type => 'after html',
6605 text => $token->{tag_name}, token => $token);
6606 #
6607 } else {
6608 !!!cp ('t304');
6609 ## "after body" insertion mode
6610 !!!parse-error (type => 'after body',
6611 text => $token->{tag_name}, token => $token);
6612 #
6613 }
6614
6615 $self->{insertion_mode} = IN_BODY_IM;
6616 !!!ack-later;
6617 ## reprocess
6618 next B;
6619 } elsif ($token->{type} == END_TAG_TOKEN) {
6620 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6621 !!!cp ('t305');
6622 !!!parse-error (type => 'after html:/',
6623 text => $token->{tag_name}, token => $token);
6624
6625 $self->{insertion_mode} = IN_BODY_IM;
6626 ## Reprocess.
6627 next B;
6628 } else {
6629 !!!cp ('t306');
6630 }
6631
6632 ## "after body" insertion mode
6633 if ($token->{tag_name} eq 'html') {
6634 if (defined $self->{inner_html_node}) {
6635 !!!cp ('t307');
6636 !!!parse-error (type => 'unmatched end tag',
6637 text => 'html', token => $token);
6638 ## Ignore the token
6639 !!!next-token;
6640 next B;
6641 } else {
6642 !!!cp ('t308');
6643 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
6644 !!!next-token;
6645 next B;
6646 }
6647 } else {
6648 !!!cp ('t309');
6649 !!!parse-error (type => 'after body:/',
6650 text => $token->{tag_name}, token => $token);
6651
6652 $self->{insertion_mode} = IN_BODY_IM;
6653 ## reprocess
6654 next B;
6655 }
6656 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6657 !!!cp ('t309.2');
6658 ## Stop parsing
6659 last B;
6660 } else {
6661 die "$0: $token->{type}: Unknown token type";
6662 }
6663 } elsif ($self->{insertion_mode} & FRAME_IMS) {
6664 if ($token->{type} == CHARACTER_TOKEN) {
6665 if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
6666 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6667
6668 unless (length $token->{data}) {
6669 !!!cp ('t310');
6670 !!!next-token;
6671 next B;
6672 }
6673 }
6674
6675 if ($token->{data} =~ s/^[^\x09\x0A\x0C\x20]+//) {
6676 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6677 !!!cp ('t311');
6678 !!!parse-error (type => 'in frameset:#text', token => $token);
6679 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6680 !!!cp ('t312');
6681 !!!parse-error (type => 'after frameset:#text', token => $token);
6682 } else { # "after after frameset"
6683 !!!cp ('t313');
6684 !!!parse-error (type => 'after html:#text', token => $token);
6685 }
6686
6687 ## Ignore the token.
6688 if (length $token->{data}) {
6689 !!!cp ('t314');
6690 ## reprocess the rest of characters
6691 } else {
6692 !!!cp ('t315');
6693 !!!next-token;
6694 }
6695 next B;
6696 }
6697
6698 die qq[$0: Character "$token->{data}"];
6699 } elsif ($token->{type} == START_TAG_TOKEN) {
6700 if ($token->{tag_name} eq 'frameset' and
6701 $self->{insertion_mode} == IN_FRAMESET_IM) {
6702 !!!cp ('t318');
6703 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6704 !!!nack ('t318.1');
6705 !!!next-token;
6706 next B;
6707 } elsif ($token->{tag_name} eq 'frame' and
6708 $self->{insertion_mode} == IN_FRAMESET_IM) {
6709 !!!cp ('t319');
6710 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6711 pop @{$self->{open_elements}};
6712 !!!ack ('t319.1');
6713 !!!next-token;
6714 next B;
6715 } elsif ($token->{tag_name} eq 'noframes') {
6716 !!!cp ('t320');
6717 ## NOTE: As if in head.
6718 $parse_rcdata->(CDATA_CONTENT_MODEL);
6719 next B;
6720
6721 ## NOTE: |<!DOCTYPE HTML><frameset></frameset></html><noframes></noframes>|
6722 ## has no parse error.
6723 } else {
6724 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6725 !!!cp ('t321');
6726 !!!parse-error (type => 'in frameset',
6727 text => $token->{tag_name}, token => $token);
6728 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6729 !!!cp ('t322');
6730 !!!parse-error (type => 'after frameset',
6731 text => $token->{tag_name}, token => $token);
6732 } else { # "after after frameset"
6733 !!!cp ('t322.2');
6734 !!!parse-error (type => 'after after frameset',
6735 text => $token->{tag_name}, token => $token);
6736 }
6737 ## Ignore the token
6738 !!!nack ('t322.1');
6739 !!!next-token;
6740 next B;
6741 }
6742 } elsif ($token->{type} == END_TAG_TOKEN) {
6743 if ($token->{tag_name} eq 'frameset' and
6744 $self->{insertion_mode} == IN_FRAMESET_IM) {
6745 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
6746 @{$self->{open_elements}} == 1) {
6747 !!!cp ('t325');
6748 !!!parse-error (type => 'unmatched end tag',
6749 text => $token->{tag_name}, token => $token);
6750 ## Ignore the token
6751 !!!next-token;
6752 } else {
6753 !!!cp ('t326');
6754 pop @{$self->{open_elements}};
6755 !!!next-token;
6756 }
6757
6758 if (not defined $self->{inner_html_node} and
6759 not ($self->{open_elements}->[-1]->[1] & FRAMESET_EL)) {
6760 !!!cp ('t327');
6761 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6762 } else {
6763 !!!cp ('t328');
6764 }
6765 next B;
6766 } elsif ($token->{tag_name} eq 'html' and
6767 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
6768 !!!cp ('t329');
6769 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
6770 !!!next-token;
6771 next B;
6772 } else {
6773 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6774 !!!cp ('t330');
6775 !!!parse-error (type => 'in frameset:/',
6776 text => $token->{tag_name}, token => $token);
6777 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6778 !!!cp ('t330.1');
6779 !!!parse-error (type => 'after frameset:/',
6780 text => $token->{tag_name}, token => $token);
6781 } else { # "after after html"
6782 !!!cp ('t331');
6783 !!!parse-error (type => 'after after frameset:/',
6784 text => $token->{tag_name}, token => $token);
6785 }
6786 ## Ignore the token
6787 !!!next-token;
6788 next B;
6789 }
6790 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6791 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6792 @{$self->{open_elements}} == 1) { # redundant, maybe
6793 !!!cp ('t331.1');
6794 !!!parse-error (type => 'in body:#eof', token => $token);
6795 } else {
6796 !!!cp ('t331.2');
6797 }
6798
6799 ## Stop parsing
6800 last B;
6801 } else {
6802 die "$0: $token->{type}: Unknown token type";
6803 }
6804
6805 ## ISSUE: An issue in spec here
6806 } else {
6807 die "$0: $self->{insertion_mode}: Unknown insertion mode";
6808 }
6809
6810 ## "in body" insertion mode
6811 if ($token->{type} == START_TAG_TOKEN) {
6812 if ($token->{tag_name} eq 'script') {
6813 !!!cp ('t332');
6814 ## NOTE: This is an "as if in head" code clone
6815 $script_start_tag->();
6816 next B;
6817 } elsif ($token->{tag_name} eq 'style') {
6818 !!!cp ('t333');
6819 ## NOTE: This is an "as if in head" code clone
6820 $parse_rcdata->(CDATA_CONTENT_MODEL);
6821 next B;
6822 } elsif ({
6823 base => 1, command => 1, eventsource => 1, link => 1,
6824 }->{$token->{tag_name}}) {
6825 !!!cp ('t334');
6826 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6827 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6828 pop @{$self->{open_elements}};
6829 !!!ack ('t334.1');
6830 !!!next-token;
6831 next B;
6832 } elsif ($token->{tag_name} eq 'meta') {
6833 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6834 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6835 my $meta_el = pop @{$self->{open_elements}};
6836
6837 unless ($self->{confident}) {
6838 if ($token->{attributes}->{charset}) {
6839 !!!cp ('t335');
6840 ## NOTE: Whether the encoding is supported or not is handled
6841 ## in the {change_encoding} callback.
6842 $self->{change_encoding}
6843 ->($self, $token->{attributes}->{charset}->{value}, $token);
6844
6845 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6846 ->set_user_data (manakai_has_reference =>
6847 $token->{attributes}->{charset}
6848 ->{has_reference});
6849 } elsif ($token->{attributes}->{content}) {
6850 if ($token->{attributes}->{content}->{value}
6851 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
6852 [\x09\x0A\x0C\x0D\x20]*=
6853 [\x09\x0A\x0C\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
6854 ([^"'\x09\x0A\x0C\x0D\x20][^\x09\x0A\x0C\x0D\x20\x3B]*))
6855 /x) {
6856 !!!cp ('t336');
6857 ## NOTE: Whether the encoding is supported or not is handled
6858 ## in the {change_encoding} callback.
6859 $self->{change_encoding}
6860 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
6861 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6862 ->set_user_data (manakai_has_reference =>
6863 $token->{attributes}->{content}
6864 ->{has_reference});
6865 }
6866 }
6867 } else {
6868 if ($token->{attributes}->{charset}) {
6869 !!!cp ('t337');
6870 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6871 ->set_user_data (manakai_has_reference =>
6872 $token->{attributes}->{charset}
6873 ->{has_reference});
6874 }
6875 if ($token->{attributes}->{content}) {
6876 !!!cp ('t338');
6877 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6878 ->set_user_data (manakai_has_reference =>
6879 $token->{attributes}->{content}
6880 ->{has_reference});
6881 }
6882 }
6883
6884 !!!ack ('t338.1');
6885 !!!next-token;
6886 next B;
6887 } elsif ($token->{tag_name} eq 'title') {
6888 !!!cp ('t341');
6889 ## NOTE: This is an "as if in head" code clone
6890 $parse_rcdata->(RCDATA_CONTENT_MODEL);
6891 next B;
6892 } elsif ($token->{tag_name} eq 'body') {
6893 !!!parse-error (type => 'in body', text => 'body', token => $token);
6894
6895 if (@{$self->{open_elements}} == 1 or
6896 not ($self->{open_elements}->[1]->[1] & BODY_EL)) {
6897 !!!cp ('t342');
6898 ## Ignore the token
6899 } else {
6900 my $body_el = $self->{open_elements}->[1]->[0];
6901 for my $attr_name (keys %{$token->{attributes}}) {
6902 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
6903 !!!cp ('t343');
6904 $body_el->set_attribute_ns
6905 (undef, [undef, $attr_name],
6906 $token->{attributes}->{$attr_name}->{value});
6907 }
6908 }
6909 }
6910 !!!nack ('t343.1');
6911 !!!next-token;
6912 next B;
6913 } elsif ({
6914 ## NOTE: Start tags for non-phrasing flow content elements
6915
6916 ## NOTE: The normal one
6917 address => 1, article => 1, aside => 1, blockquote => 1,
6918 center => 1, datagrid => 1, details => 1, dialog => 1,
6919 dir => 1, div => 1, dl => 1, fieldset => 1, figure => 1,
6920 footer => 1, h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1,
6921 h6 => 1, header => 1, menu => 1, nav => 1, ol => 1, p => 1,
6922 section => 1, ul => 1,
6923 ## NOTE: As normal, but drops leading newline
6924 pre => 1, listing => 1,
6925 ## NOTE: As normal, but interacts with the form element pointer
6926 form => 1,
6927
6928 table => 1,
6929 hr => 1,
6930 }->{$token->{tag_name}}) {
6931 if ($token->{tag_name} eq 'form' and defined $self->{form_element}) {
6932 !!!cp ('t350');
6933 !!!parse-error (type => 'in form:form', token => $token);
6934 ## Ignore the token
6935 !!!nack ('t350.1');
6936 !!!next-token;
6937 next B;
6938 }
6939
6940 ## has a p element in scope
6941 INSCOPE: for (reverse @{$self->{open_elements}}) {
6942 if ($_->[1] & P_EL) {
6943 !!!cp ('t344');
6944 !!!back-token; # <form>
6945 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6946 line => $token->{line}, column => $token->{column}};
6947 next B;
6948 } elsif ($_->[1] & SCOPING_EL) {
6949 !!!cp ('t345');
6950 last INSCOPE;
6951 }
6952 } # INSCOPE
6953
6954 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6955 if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') {
6956 !!!nack ('t346.1');
6957 !!!next-token;
6958 if ($token->{type} == CHARACTER_TOKEN) {
6959 $token->{data} =~ s/^\x0A//;
6960 unless (length $token->{data}) {
6961 !!!cp ('t346');
6962 !!!next-token;
6963 } else {
6964 !!!cp ('t349');
6965 }
6966 } else {
6967 !!!cp ('t348');
6968 }
6969 } elsif ($token->{tag_name} eq 'form') {
6970 !!!cp ('t347.1');
6971 $self->{form_element} = $self->{open_elements}->[-1]->[0];
6972
6973 !!!nack ('t347.2');
6974 !!!next-token;
6975 } elsif ($token->{tag_name} eq 'table') {
6976 !!!cp ('t382');
6977 push @{$open_tables}, [$self->{open_elements}->[-1]->[0]];
6978
6979 $self->{insertion_mode} = IN_TABLE_IM;
6980
6981 !!!nack ('t382.1');
6982 !!!next-token;
6983 } elsif ($token->{tag_name} eq 'hr') {
6984 !!!cp ('t386');
6985 pop @{$self->{open_elements}};
6986
6987 !!!nack ('t386.1');
6988 !!!next-token;
6989 } else {
6990 !!!nack ('t347.1');
6991 !!!next-token;
6992 }
6993 next B;
6994 } elsif ($token->{tag_name} eq 'li') {
6995 ## NOTE: As normal, but imply </li> when there's another <li> ...
6996
6997 ## NOTE: Special, Scope (<li><foo><li> == <li><foo><li/></foo></li>)
6998 ## Interpreted as <li><foo/></li><li/> (non-conforming)
6999 ## blockquote (O9.27), center (O), dd (Fx3, O, S3.1.2, IE7),
7000 ## dt (Fx, O, S, IE), dl (O), fieldset (O, S, IE), form (Fx, O, S),
7001 ## hn (O), pre (O), applet (O, S), button (O, S), marquee (Fx, O, S),
7002 ## object (Fx)
7003 ## Generate non-tree (non-conforming)
7004 ## basefont (IE7 (where basefont is non-void)), center (IE),
7005 ## form (IE), hn (IE)
7006 ## address, div, p (<li><foo><li> == <li><foo/></li><li/>)
7007 ## Interpreted as <li><foo><li/></foo></li> (non-conforming)
7008 ## div (Fx, S)
7009
7010 my $non_optional;
7011 my $i = -1;
7012
7013 ## 1.
7014 for my $node (reverse @{$self->{open_elements}}) {
7015 if ($node->[1] & LI_EL) {
7016 ## 2. (a) As if </li>
7017 {
7018 ## If no </li> - not applied
7019 #
7020
7021 ## Otherwise
7022
7023 ## 1. generate implied end tags, except for </li>
7024 #
7025
7026 ## 2. If current node != "li", parse error
7027 if ($non_optional) {
7028 !!!parse-error (type => 'not closed',
7029 text => $non_optional->[0]->manakai_local_name,
7030 token => $token);
7031 !!!cp ('t355');
7032 } else {
7033 !!!cp ('t356');
7034 }
7035
7036 ## 3. Pop
7037 splice @{$self->{open_elements}}, $i;
7038 }
7039
7040 last; ## 2. (b) goto 5.
7041 } elsif (
7042 ## NOTE: not "formatting" and not "phrasing"
7043 ($node->[1] & SPECIAL_EL or
7044 $node->[1] & SCOPING_EL) and
7045 ## NOTE: "li", "dt", and "dd" are in |SPECIAL_EL|.
7046
7047 (not $node->[1] & ADDRESS_EL) &
7048 (not $node->[1] & DIV_EL) &
7049 (not $node->[1] & P_EL)) {
7050 ## 3.
7051 !!!cp ('t357');
7052 last; ## goto 5.
7053 } elsif ($node->[1] & END_TAG_OPTIONAL_EL) {
7054 !!!cp ('t358');
7055 #
7056 } else {
7057 !!!cp ('t359');
7058 $non_optional ||= $node;
7059 #
7060 }
7061 ## 4.
7062 ## goto 2.
7063 $i--;
7064 }
7065
7066 ## 5. (a) has a |p| element in scope
7067 INSCOPE: for (reverse @{$self->{open_elements}}) {
7068 if ($_->[1] & P_EL) {
7069 !!!cp ('t353');
7070 !!!back-token; # <x>
7071 $token = {type => END_TAG_TOKEN, tag_name => 'p',
7072 line => $token->{line}, column => $token->{column}};
7073 next B;
7074 } elsif ($_->[1] & SCOPING_EL) {
7075 !!!cp ('t354');
7076 last INSCOPE;
7077 }
7078 } # INSCOPE
7079
7080 ## 5. (b) insert
7081 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7082 !!!nack ('t359.1');
7083 !!!next-token;
7084 next B;
7085 } elsif ($token->{tag_name} eq 'dt' or
7086 $token->{tag_name} eq 'dd') {
7087 ## NOTE: As normal, but imply </dt> or </dd> when ...
7088
7089 my $non_optional;
7090 my $i = -1;
7091
7092 ## 1.
7093 for my $node (reverse @{$self->{open_elements}}) {
7094 if ($node->[1] & DT_EL or $node->[1] & DD_EL) {
7095 ## 2. (a) As if </dt> or </dd>
7096 {
7097 ## If no </dt> or </dd> - not applied
7098 #
7099
7100 ## Otherwise
7101
7102 ## 1. generate implied end tags, except for </dt> or </dd>
7103 #
7104
7105 ## 2. If current node != "dt"|"dd", parse error
7106 if ($non_optional) {
7107 !!!parse-error (type => 'not closed',
7108 text => $non_optional->[0]->manakai_local_name,
7109 token => $token);
7110 !!!cp ('t355.1');
7111 } else {
7112 !!!cp ('t356.1');
7113 }
7114
7115 ## 3. Pop
7116 splice @{$self->{open_elements}}, $i;
7117 }
7118
7119 last; ## 2. (b) goto 5.
7120 } elsif (
7121 ## NOTE: not "formatting" and not "phrasing"
7122 ($node->[1] & SPECIAL_EL or
7123 $node->[1] & SCOPING_EL) and
7124 ## NOTE: "li", "dt", and "dd" are in |SPECIAL_EL|.
7125
7126 (not $node->[1] & ADDRESS_EL) &
7127 (not $node->[1] & DIV_EL) &
7128 (not $node->[1] & P_EL)) {
7129 ## 3.
7130 !!!cp ('t357.1');
7131 last; ## goto 5.
7132 } elsif ($node->[1] & END_TAG_OPTIONAL_EL) {
7133 !!!cp ('t358.1');
7134 #
7135 } else {
7136 !!!cp ('t359.1');
7137 $non_optional ||= $node;
7138 #
7139 }
7140 ## 4.
7141 ## goto 2.
7142 $i--;
7143 }
7144
7145 ## 5. (a) has a |p| element in scope
7146 INSCOPE: for (reverse @{$self->{open_elements}}) {
7147 if ($_->[1] & P_EL) {
7148 !!!cp ('t353.1');
7149 !!!back-token; # <x>
7150 $token = {type => END_TAG_TOKEN, tag_name => 'p',
7151 line => $token->{line}, column => $token->{column}};
7152 next B;
7153 } elsif ($_->[1] & SCOPING_EL) {
7154 !!!cp ('t354.1');
7155 last INSCOPE;
7156 }
7157 } # INSCOPE
7158
7159 ## 5. (b) insert
7160 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7161 !!!nack ('t359.2');
7162 !!!next-token;
7163 next B;
7164 } elsif ($token->{tag_name} eq 'plaintext') {
7165 ## NOTE: As normal, but effectively ends parsing
7166
7167 ## has a p element in scope
7168 INSCOPE: for (reverse @{$self->{open_elements}}) {
7169 if ($_->[1] & P_EL) {
7170 !!!cp ('t367');
7171 !!!back-token; # <plaintext>
7172 $token = {type => END_TAG_TOKEN, tag_name => 'p',
7173 line => $token->{line}, column => $token->{column}};
7174 next B;
7175 } elsif ($_->[1] & SCOPING_EL) {
7176 !!!cp ('t368');
7177 last INSCOPE;
7178 }
7179 } # INSCOPE
7180
7181 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7182
7183 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
7184
7185 !!!nack ('t368.1');
7186 !!!next-token;
7187 next B;
7188 } elsif ($token->{tag_name} eq 'a') {
7189 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
7190 my $node = $active_formatting_elements->[$i];
7191 if ($node->[1] & A_EL) {
7192 !!!cp ('t371');
7193 !!!parse-error (type => 'in a:a', token => $token);
7194
7195 !!!back-token; # <a>
7196 $token = {type => END_TAG_TOKEN, tag_name => 'a',
7197 line => $token->{line}, column => $token->{column}};
7198 $formatting_end_tag->($token);
7199
7200 AFE2: for (reverse 0..$#$active_formatting_elements) {
7201 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
7202 !!!cp ('t372');
7203 splice @$active_formatting_elements, $_, 1;
7204 last AFE2;
7205 }
7206 } # AFE2
7207 OE: for (reverse 0..$#{$self->{open_elements}}) {
7208 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
7209 !!!cp ('t373');
7210 splice @{$self->{open_elements}}, $_, 1;
7211 last OE;
7212 }
7213 } # OE
7214 last AFE;
7215 } elsif ($node->[0] eq '#marker') {
7216 !!!cp ('t374');
7217 last AFE;
7218 }
7219 } # AFE
7220
7221 $reconstruct_active_formatting_elements->($insert_to_current);
7222
7223 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7224 push @$active_formatting_elements, $self->{open_elements}->[-1];
7225
7226 !!!nack ('t374.1');
7227 !!!next-token;
7228 next B;
7229 } elsif ($token->{tag_name} eq 'nobr') {
7230 $reconstruct_active_formatting_elements->($insert_to_current);
7231
7232 ## has a |nobr| element in scope
7233 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7234 my $node = $self->{open_elements}->[$_];
7235 if ($node->[1] & NOBR_EL) {
7236 !!!cp ('t376');
7237 !!!parse-error (type => 'in nobr:nobr', token => $token);
7238 !!!back-token; # <nobr>
7239 $token = {type => END_TAG_TOKEN, tag_name => 'nobr',
7240 line => $token->{line}, column => $token->{column}};
7241 next B;
7242 } elsif ($node->[1] & SCOPING_EL) {
7243 !!!cp ('t377');
7244 last INSCOPE;
7245 }
7246 } # INSCOPE
7247
7248 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7249 push @$active_formatting_elements, $self->{open_elements}->[-1];
7250
7251 !!!nack ('t377.1');
7252 !!!next-token;
7253 next B;
7254 } elsif ($token->{tag_name} eq 'button') {
7255 ## has a button element in scope
7256 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7257 my $node = $self->{open_elements}->[$_];
7258 if ($node->[1] & BUTTON_EL) {
7259 !!!cp ('t378');
7260 !!!parse-error (type => 'in button:button', token => $token);
7261 !!!back-token; # <button>
7262 $token = {type => END_TAG_TOKEN, tag_name => 'button',
7263 line => $token->{line}, column => $token->{column}};
7264 next B;
7265 } elsif ($node->[1] & SCOPING_EL) {
7266 !!!cp ('t379');
7267 last INSCOPE;
7268 }
7269 } # INSCOPE
7270
7271 $reconstruct_active_formatting_elements->($insert_to_current);
7272
7273 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7274
7275 ## TODO: associate with $self->{form_element} if defined
7276
7277 push @$active_formatting_elements, ['#marker', ''];
7278
7279 !!!nack ('t379.1');
7280 !!!next-token;
7281 next B;
7282 } elsif ({
7283 xmp => 1,
7284 iframe => 1,
7285 noembed => 1,
7286 noframes => 1, ## NOTE: This is an "as if in head" code clone.
7287 noscript => 0, ## TODO: 1 if scripting is enabled
7288 }->{$token->{tag_name}}) {
7289 if ($token->{tag_name} eq 'xmp') {
7290 !!!cp ('t381');
7291 $reconstruct_active_formatting_elements->($insert_to_current);
7292 } else {
7293 !!!cp ('t399');
7294 }
7295 ## NOTE: There is an "as if in body" code clone.
7296 $parse_rcdata->(CDATA_CONTENT_MODEL);
7297 next B;
7298 } elsif ($token->{tag_name} eq 'isindex') {
7299 !!!parse-error (type => 'isindex', token => $token);
7300
7301 if (defined $self->{form_element}) {
7302 !!!cp ('t389');
7303 ## Ignore the token
7304 !!!nack ('t389'); ## NOTE: Not acknowledged.
7305 !!!next-token;
7306 next B;
7307 } else {
7308 !!!ack ('t391.1');
7309
7310 my $at = $token->{attributes};
7311 my $form_attrs;
7312 $form_attrs->{action} = $at->{action} if $at->{action};
7313 my $prompt_attr = $at->{prompt};
7314 $at->{name} = {name => 'name', value => 'isindex'};
7315 delete $at->{action};
7316 delete $at->{prompt};
7317 my @tokens = (
7318 {type => START_TAG_TOKEN, tag_name => 'form',
7319 attributes => $form_attrs,
7320 line => $token->{line}, column => $token->{column}},
7321 {type => START_TAG_TOKEN, tag_name => 'hr',
7322 line => $token->{line}, column => $token->{column}},
7323 {type => START_TAG_TOKEN, tag_name => 'p',
7324 line => $token->{line}, column => $token->{column}},
7325 {type => START_TAG_TOKEN, tag_name => 'label',
7326 line => $token->{line}, column => $token->{column}},
7327 );
7328 if ($prompt_attr) {
7329 !!!cp ('t390');
7330 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value},
7331 #line => $token->{line}, column => $token->{column},
7332 };
7333 } else {
7334 !!!cp ('t391');
7335 push @tokens, {type => CHARACTER_TOKEN,
7336 data => 'This is a searchable index. Insert your search keywords here: ',
7337 #line => $token->{line}, column => $token->{column},
7338 }; # SHOULD
7339 ## TODO: make this configurable
7340 }
7341 push @tokens,
7342 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at,
7343 line => $token->{line}, column => $token->{column}},
7344 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
7345 {type => END_TAG_TOKEN, tag_name => 'label',
7346 line => $token->{line}, column => $token->{column}},
7347 {type => END_TAG_TOKEN, tag_name => 'p',
7348 line => $token->{line}, column => $token->{column}},
7349 {type => START_TAG_TOKEN, tag_name => 'hr',
7350 line => $token->{line}, column => $token->{column}},
7351 {type => END_TAG_TOKEN, tag_name => 'form',
7352 line => $token->{line}, column => $token->{column}};
7353 !!!back-token (@tokens);
7354 !!!next-token;
7355 next B;
7356 }
7357 } elsif ($token->{tag_name} eq 'textarea') {
7358 my $tag_name = $token->{tag_name};
7359 my $el;
7360 !!!create-element ($el, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
7361
7362 ## TODO: $self->{form_element} if defined
7363 $self->{content_model} = RCDATA_CONTENT_MODEL;
7364 delete $self->{escape}; # MUST
7365
7366 $insert->($el);
7367
7368 my $text = '';
7369 !!!nack ('t392.1');
7370 !!!next-token;
7371 if ($token->{type} == CHARACTER_TOKEN) {
7372 $token->{data} =~ s/^\x0A//;
7373 unless (length $token->{data}) {
7374 !!!cp ('t392');
7375 !!!next-token;
7376 } else {
7377 !!!cp ('t393');
7378 }
7379 } else {
7380 !!!cp ('t394');
7381 }
7382 while ($token->{type} == CHARACTER_TOKEN) {
7383 !!!cp ('t395');
7384 $text .= $token->{data};
7385 !!!next-token;
7386 }
7387 if (length $text) {
7388 !!!cp ('t396');
7389 $el->manakai_append_text ($text);
7390 }
7391
7392 $self->{content_model} = PCDATA_CONTENT_MODEL;
7393
7394 if ($token->{type} == END_TAG_TOKEN and
7395 $token->{tag_name} eq $tag_name) {
7396 !!!cp ('t397');
7397 ## Ignore the token
7398 } else {
7399 !!!cp ('t398');
7400 !!!parse-error (type => 'in RCDATA:#eof', token => $token);
7401 }
7402 !!!next-token;
7403 next B;
7404 } elsif ($token->{tag_name} eq 'rt' or
7405 $token->{tag_name} eq 'rp') {
7406 ## has a |ruby| element in scope
7407 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7408 my $node = $self->{open_elements}->[$_];
7409 if ($node->[1] & RUBY_EL) {
7410 !!!cp ('t398.1');
7411 ## generate implied end tags
7412 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7413 !!!cp ('t398.2');
7414 pop @{$self->{open_elements}};
7415 }
7416 unless ($self->{open_elements}->[-1]->[1] & RUBY_EL) {
7417 !!!cp ('t398.3');
7418 !!!parse-error (type => 'not closed',
7419 text => $self->{open_elements}->[-1]->[0]
7420 ->manakai_local_name,
7421 token => $token);
7422 pop @{$self->{open_elements}}
7423 while not $self->{open_elements}->[-1]->[1] & RUBY_EL;
7424 }
7425 last INSCOPE;
7426 } elsif ($node->[1] & SCOPING_EL) {
7427 !!!cp ('t398.4');
7428 last INSCOPE;
7429 }
7430 } # INSCOPE
7431
7432 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7433
7434 !!!nack ('t398.5');
7435 !!!next-token;
7436 redo B;
7437 } elsif ($token->{tag_name} eq 'math' or
7438 $token->{tag_name} eq 'svg') {
7439 $reconstruct_active_formatting_elements->($insert_to_current);
7440
7441 ## "Adjust MathML attributes" ('math' only) - done in insert-element-f
7442
7443 ## "adjust SVG attributes" ('svg' only) - done in insert-element-f
7444
7445 ## "adjust foreign attributes" - done in insert-element-f
7446
7447 !!!insert-element-f ($token->{tag_name} eq 'math' ? $MML_NS : $SVG_NS, $token->{tag_name}, $token->{attributes}, $token);
7448
7449 if ($self->{self_closing}) {
7450 pop @{$self->{open_elements}};
7451 !!!ack ('t398.1');
7452 } else {
7453 !!!cp ('t398.2');
7454 $self->{insertion_mode} |= IN_FOREIGN_CONTENT_IM;
7455 ## NOTE: |<body><math><mi><svg>| -> "in foreign content" insertion
7456 ## mode, "in body" (not "in foreign content") secondary insertion
7457 ## mode, maybe.
7458 }
7459
7460 !!!next-token;
7461 next B;
7462 } elsif ({
7463 caption => 1, col => 1, colgroup => 1, frame => 1,
7464 frameset => 1, head => 1, option => 1, optgroup => 1,
7465 tbody => 1, td => 1, tfoot => 1, th => 1,
7466 thead => 1, tr => 1,
7467 }->{$token->{tag_name}}) {
7468 !!!cp ('t401');
7469 !!!parse-error (type => 'in body',
7470 text => $token->{tag_name}, token => $token);
7471 ## Ignore the token
7472 !!!nack ('t401.1'); ## NOTE: |<col/>| or |<frame/>| here is an error.
7473 !!!next-token;
7474 next B;
7475
7476 ## ISSUE: An issue on HTML5 new elements in the spec.
7477 } else {
7478 if ($token->{tag_name} eq 'image') {
7479 !!!cp ('t384');
7480 !!!parse-error (type => 'image', token => $token);
7481 $token->{tag_name} = 'img';
7482 } else {
7483 !!!cp ('t385');
7484 }
7485
7486 ## NOTE: There is an "as if <br>" code clone.
7487 $reconstruct_active_formatting_elements->($insert_to_current);
7488
7489 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7490
7491 if ({
7492 applet => 1, marquee => 1, object => 1,
7493 }->{$token->{tag_name}}) {
7494 !!!cp ('t380');
7495 push @$active_formatting_elements, ['#marker', ''];
7496 !!!nack ('t380.1');
7497 } elsif ({
7498 b => 1, big => 1, em => 1, font => 1, i => 1,
7499 s => 1, small => 1, strike => 1,
7500 strong => 1, tt => 1, u => 1,
7501 }->{$token->{tag_name}}) {
7502 !!!cp ('t375');
7503 push @$active_formatting_elements, $self->{open_elements}->[-1];
7504 !!!nack ('t375.1');
7505 } elsif ($token->{tag_name} eq 'input') {
7506 !!!cp ('t388');
7507 ## TODO: associate with $self->{form_element} if defined
7508 pop @{$self->{open_elements}};
7509 !!!ack ('t388.2');
7510 } elsif ({
7511 area => 1, basefont => 1, bgsound => 1, br => 1,
7512 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
7513 #image => 1,
7514 }->{$token->{tag_name}}) {
7515 !!!cp ('t388.1');
7516 pop @{$self->{open_elements}};
7517 !!!ack ('t388.3');
7518 } elsif ($token->{tag_name} eq 'select') {
7519 ## TODO: associate with $self->{form_element} if defined
7520
7521 if ($self->{insertion_mode} & TABLE_IMS or
7522 $self->{insertion_mode} & BODY_TABLE_IMS or
7523 $self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
7524 !!!cp ('t400.1');
7525 $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM;
7526 } else {
7527 !!!cp ('t400.2');
7528 $self->{insertion_mode} = IN_SELECT_IM;
7529 }
7530 !!!nack ('t400.3');
7531 } else {
7532 !!!nack ('t402');
7533 }
7534
7535 !!!next-token;
7536 next B;
7537 }
7538 } elsif ($token->{type} == END_TAG_TOKEN) {
7539 if ($token->{tag_name} eq 'body') {
7540 ## has a |body| element in scope
7541 my $i;
7542 INSCOPE: {
7543 for (reverse @{$self->{open_elements}}) {
7544 if ($_->[1] & BODY_EL) {
7545 !!!cp ('t405');
7546 $i = $_;
7547 last INSCOPE;
7548 } elsif ($_->[1] & SCOPING_EL) {
7549 !!!cp ('t405.1');
7550 last;
7551 }
7552 }
7553
7554 !!!parse-error (type => 'start tag not allowed',
7555 text => $token->{tag_name}, token => $token);
7556 ## NOTE: Ignore the token.
7557 !!!next-token;
7558 next B;
7559 } # INSCOPE
7560
7561 for (@{$self->{open_elements}}) {
7562 unless ($_->[1] & ALL_END_TAG_OPTIONAL_EL) {
7563 !!!cp ('t403');
7564 !!!parse-error (type => 'not closed',
7565 text => $_->[0]->manakai_local_name,
7566 token => $token);
7567 last;
7568 } else {
7569 !!!cp ('t404');
7570 }
7571 }
7572
7573 $self->{insertion_mode} = AFTER_BODY_IM;
7574 !!!next-token;
7575 next B;
7576 } elsif ($token->{tag_name} eq 'html') {
7577 ## TODO: Update this code. It seems that the code below is not
7578 ## up-to-date, though it has same effect as speced.
7579 if (@{$self->{open_elements}} > 1 and
7580 $self->{open_elements}->[1]->[1] & BODY_EL) {
7581 ## ISSUE: There is an issue in the spec.
7582 unless ($self->{open_elements}->[-1]->[1] & BODY_EL) {
7583 !!!cp ('t406');
7584 !!!parse-error (type => 'not closed',
7585 text => $self->{open_elements}->[1]->[0]
7586 ->manakai_local_name,
7587 token => $token);
7588 } else {
7589 !!!cp ('t407');
7590 }
7591 $self->{insertion_mode} = AFTER_BODY_IM;
7592 ## reprocess
7593 next B;
7594 } else {
7595 !!!cp ('t408');
7596 !!!parse-error (type => 'unmatched end tag',
7597 text => $token->{tag_name}, token => $token);
7598 ## Ignore the token
7599 !!!next-token;
7600 next B;
7601 }
7602 } elsif ({
7603 ## NOTE: End tags for non-phrasing flow content elements
7604
7605 ## NOTE: The normal ones
7606 address => 1, article => 1, aside => 1, blockquote => 1,
7607 center => 1, datagrid => 1, details => 1, dialog => 1,
7608 dir => 1, div => 1, dl => 1, fieldset => 1, figure => 1,
7609 footer => 1, header => 1, listing => 1, menu => 1, nav => 1,
7610 ol => 1, pre => 1, section => 1, ul => 1,
7611
7612 ## NOTE: As normal, but ... optional tags
7613 dd => 1, dt => 1, li => 1,
7614
7615 applet => 1, button => 1, marquee => 1, object => 1,
7616 }->{$token->{tag_name}}) {
7617 ## NOTE: Code for <li> start tags includes "as if </li>" code.
7618 ## Code for <dt> or <dd> start tags includes "as if </dt> or
7619 ## </dd>" code.
7620
7621 ## has an element in scope
7622 my $i;
7623 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7624 my $node = $self->{open_elements}->[$_];
7625 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7626 !!!cp ('t410');
7627 $i = $_;
7628 last INSCOPE;
7629 } elsif ($node->[1] & SCOPING_EL) {
7630 !!!cp ('t411');
7631 last INSCOPE;
7632 }
7633 } # INSCOPE
7634
7635 unless (defined $i) { # has an element in scope
7636 !!!cp ('t413');
7637 !!!parse-error (type => 'unmatched end tag',
7638 text => $token->{tag_name}, token => $token);
7639 ## NOTE: Ignore the token.
7640 } else {
7641 ## Step 1. generate implied end tags
7642 while ({
7643 ## END_TAG_OPTIONAL_EL
7644 dd => ($token->{tag_name} ne 'dd'),
7645 dt => ($token->{tag_name} ne 'dt'),
7646 li => ($token->{tag_name} ne 'li'),
7647 option => 1,
7648 optgroup => 1,
7649 p => 1,
7650 rt => 1,
7651 rp => 1,
7652 }->{$self->{open_elements}->[-1]->[0]->manakai_local_name}) {
7653 !!!cp ('t409');
7654 pop @{$self->{open_elements}};
7655 }
7656
7657 ## Step 2.
7658 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7659 ne $token->{tag_name}) {
7660 !!!cp ('t412');
7661 !!!parse-error (type => 'not closed',
7662 text => $self->{open_elements}->[-1]->[0]
7663 ->manakai_local_name,
7664 token => $token);
7665 } else {
7666 !!!cp ('t414');
7667 }
7668
7669 ## Step 3.
7670 splice @{$self->{open_elements}}, $i;
7671
7672 ## Step 4.
7673 $clear_up_to_marker->()
7674 if {
7675 applet => 1, button => 1, marquee => 1, object => 1,
7676 }->{$token->{tag_name}};
7677 }
7678 !!!next-token;
7679 next B;
7680 } elsif ($token->{tag_name} eq 'form') {
7681 ## NOTE: As normal, but interacts with the form element pointer
7682
7683 undef $self->{form_element};
7684
7685 ## has an element in scope
7686 my $i;
7687 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7688 my $node = $self->{open_elements}->[$_];
7689 if ($node->[1] & FORM_EL) {
7690 !!!cp ('t418');
7691 $i = $_;
7692 last INSCOPE;
7693 } elsif ($node->[1] & SCOPING_EL) {
7694 !!!cp ('t419');
7695 last INSCOPE;
7696 }
7697 } # INSCOPE
7698
7699 unless (defined $i) { # has an element in scope
7700 !!!cp ('t421');
7701 !!!parse-error (type => 'unmatched end tag',
7702 text => $token->{tag_name}, token => $token);
7703 ## NOTE: Ignore the token.
7704 } else {
7705 ## Step 1. generate implied end tags
7706 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7707 !!!cp ('t417');
7708 pop @{$self->{open_elements}};
7709 }
7710
7711 ## Step 2.
7712 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7713 ne $token->{tag_name}) {
7714 !!!cp ('t417.1');
7715 !!!parse-error (type => 'not closed',
7716 text => $self->{open_elements}->[-1]->[0]
7717 ->manakai_local_name,
7718 token => $token);
7719 } else {
7720 !!!cp ('t420');
7721 }
7722
7723 ## Step 3.
7724 splice @{$self->{open_elements}}, $i;
7725 }
7726
7727 !!!next-token;
7728 next B;
7729 } elsif ({
7730 ## NOTE: As normal, except acts as a closer for any ...
7731 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
7732 }->{$token->{tag_name}}) {
7733 ## has an element in scope
7734 my $i;
7735 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7736 my $node = $self->{open_elements}->[$_];
7737 if ($node->[1] & HEADING_EL) {
7738 !!!cp ('t423');
7739 $i = $_;
7740 last INSCOPE;
7741 } elsif ($node->[1] & SCOPING_EL) {
7742 !!!cp ('t424');
7743 last INSCOPE;
7744 }
7745 } # INSCOPE
7746
7747 unless (defined $i) { # has an element in scope
7748 !!!cp ('t425.1');
7749 !!!parse-error (type => 'unmatched end tag',
7750 text => $token->{tag_name}, token => $token);
7751 ## NOTE: Ignore the token.
7752 } else {
7753 ## Step 1. generate implied end tags
7754 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7755 !!!cp ('t422');
7756 pop @{$self->{open_elements}};
7757 }
7758
7759 ## Step 2.
7760 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7761 ne $token->{tag_name}) {
7762 !!!cp ('t425');
7763 !!!parse-error (type => 'unmatched end tag',
7764 text => $token->{tag_name}, token => $token);
7765 } else {
7766 !!!cp ('t426');
7767 }
7768
7769 ## Step 3.
7770 splice @{$self->{open_elements}}, $i;
7771 }
7772
7773 !!!next-token;
7774 next B;
7775 } elsif ($token->{tag_name} eq 'p') {
7776 ## NOTE: As normal, except </p> implies <p> and ...
7777
7778 ## has an element in scope
7779 my $non_optional;
7780 my $i;
7781 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7782 my $node = $self->{open_elements}->[$_];
7783 if ($node->[1] & P_EL) {
7784 !!!cp ('t410.1');
7785 $i = $_;
7786 last INSCOPE;
7787 } elsif ($node->[1] & SCOPING_EL) {
7788 !!!cp ('t411.1');
7789 last INSCOPE;
7790 } elsif ($node->[1] & END_TAG_OPTIONAL_EL) {
7791 ## NOTE: |END_TAG_OPTIONAL_EL| includes "p"
7792 !!!cp ('t411.2');
7793 #
7794 } else {
7795 !!!cp ('t411.3');
7796 $non_optional ||= $node;
7797 #
7798 }
7799 } # INSCOPE
7800
7801 if (defined $i) {
7802 ## 1. Generate implied end tags
7803 #
7804
7805 ## 2. If current node != "p", parse error
7806 if ($non_optional) {
7807 !!!cp ('t412.1');
7808 !!!parse-error (type => 'not closed',
7809 text => $non_optional->[0]->manakai_local_name,
7810 token => $token);
7811 } else {
7812 !!!cp ('t414.1');
7813 }
7814
7815 ## 3. Pop
7816 splice @{$self->{open_elements}}, $i;
7817 } else {
7818 !!!cp ('t413.1');
7819 !!!parse-error (type => 'unmatched end tag',
7820 text => $token->{tag_name}, token => $token);
7821
7822 !!!cp ('t415.1');
7823 ## As if <p>, then reprocess the current token
7824 my $el;
7825 !!!create-element ($el, $HTML_NS, 'p',, $token);
7826 $insert->($el);
7827 ## NOTE: Not inserted into |$self->{open_elements}|.
7828 }
7829
7830 !!!next-token;
7831 next B;
7832 } elsif ({
7833 a => 1,
7834 b => 1, big => 1, em => 1, font => 1, i => 1,
7835 nobr => 1, s => 1, small => 1, strike => 1,
7836 strong => 1, tt => 1, u => 1,
7837 }->{$token->{tag_name}}) {
7838 !!!cp ('t427');
7839 $formatting_end_tag->($token);
7840 next B;
7841 } elsif ($token->{tag_name} eq 'br') {
7842 !!!cp ('t428');
7843 !!!parse-error (type => 'unmatched end tag',
7844 text => 'br', token => $token);
7845
7846 ## As if <br>
7847 $reconstruct_active_formatting_elements->($insert_to_current);
7848
7849 my $el;
7850 !!!create-element ($el, $HTML_NS, 'br',, $token);
7851 $insert->($el);
7852
7853 ## Ignore the token.
7854 !!!next-token;
7855 next B;
7856 } elsif ({
7857 caption => 1, col => 1, colgroup => 1, frame => 1,
7858 frameset => 1, head => 1, option => 1, optgroup => 1,
7859 tbody => 1, td => 1, tfoot => 1, th => 1,
7860 thead => 1, tr => 1,
7861 area => 1, basefont => 1, bgsound => 1,
7862 embed => 1, hr => 1, iframe => 1, image => 1,
7863 img => 1, input => 1, isindex => 1, noembed => 1,
7864 noframes => 1, param => 1, select => 1, spacer => 1,
7865 table => 1, textarea => 1, wbr => 1,
7866 noscript => 0, ## TODO: if scripting is enabled
7867 }->{$token->{tag_name}}) {
7868 !!!cp ('t429');
7869 !!!parse-error (type => 'unmatched end tag',
7870 text => $token->{tag_name}, token => $token);
7871 ## Ignore the token
7872 !!!next-token;
7873 next B;
7874 } else {
7875 if ($token->{tag_name} eq 'sarcasm') {
7876 sleep 0.001; # take a deep breath
7877 }
7878
7879 ## Step 1
7880 my $node_i = -1;
7881 my $node = $self->{open_elements}->[$node_i];
7882
7883 ## Step 2
7884 S2: {
7885 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7886 ## Step 1
7887 ## generate implied end tags
7888 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7889 !!!cp ('t430');
7890 ## NOTE: |<ruby><rt></ruby>|.
7891 ## ISSUE: <ruby><rt></rt> will also take this code path,
7892 ## which seems wrong.
7893 pop @{$self->{open_elements}};
7894 $node_i++;
7895 }
7896
7897 ## Step 2
7898 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7899 ne $token->{tag_name}) {
7900 !!!cp ('t431');
7901 ## NOTE: <x><y></x>
7902 !!!parse-error (type => 'not closed',
7903 text => $self->{open_elements}->[-1]->[0]
7904 ->manakai_local_name,
7905 token => $token);
7906 } else {
7907 !!!cp ('t432');
7908 }
7909
7910 ## Step 3
7911 splice @{$self->{open_elements}}, $node_i if $node_i < 0;
7912
7913 !!!next-token;
7914 last S2;
7915 } else {
7916 ## Step 3
7917 if (not ($node->[1] & FORMATTING_EL) and
7918 #not $phrasing_category->{$node->[1]} and
7919 ($node->[1] & SPECIAL_EL or
7920 $node->[1] & SCOPING_EL)) {
7921 !!!cp ('t433');
7922 !!!parse-error (type => 'unmatched end tag',
7923 text => $token->{tag_name}, token => $token);
7924 ## Ignore the token
7925 !!!next-token;
7926 last S2;
7927
7928 ## NOTE: |<span><dd></span>a|: In Safari 3.1.2 and Opera
7929 ## 9.27, "a" is a child of <dd> (conforming). In
7930 ## Firefox 3.0.2, "a" is a child of <body>. In WinIE 7,
7931 ## "a" is a child of both <body> and <dd>.
7932 }
7933
7934 !!!cp ('t434');
7935 }
7936
7937 ## Step 4
7938 $node_i--;
7939 $node = $self->{open_elements}->[$node_i];
7940
7941 ## Step 5;
7942 redo S2;
7943 } # S2
7944 next B;
7945 }
7946 }
7947 next B;
7948 } continue { # B
7949 if ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
7950 ## NOTE: The code below is executed in cases where it does not have
7951 ## to be, but it it is harmless even in those cases.
7952 ## has an element in scope
7953 INSCOPE: {
7954 for (reverse 0..$#{$self->{open_elements}}) {
7955 my $node = $self->{open_elements}->[$_];
7956 if ($node->[1] & FOREIGN_EL) {
7957 last INSCOPE;
7958 } elsif ($node->[1] & SCOPING_EL) {
7959 last;
7960 }
7961 }
7962
7963 ## NOTE: No foreign element in scope.
7964 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
7965 } # INSCOPE
7966 }
7967 } # B
7968
7969 ## Stop parsing # MUST
7970
7971 ## TODO: script stuffs
7972 } # _tree_construct_main
7973
## Implements the |innerHTML| setter: parses a character string and
## replaces the children of |$node| with the result.
##
## Arguments (after the invocant):
##   $node        - The context node.  It must be a document (node type
##                  9) or an element (node type 1); the method |die|s
##                  for any other node type.
##   $_[0]        - The character string to parse.  It is accessed
##                  through |@_| so that the caller's string is
##                  referenced rather than copied.
##   $onerror     - Optional code reference called with name/value
##                  pairs (|type|, |line|, |column|, ...) for each
##                  error.  In the element case, errors are reported by
##                  |warn| if it is omitted.
##   $get_wrapper - Optional code reference that receives the input
##                  handle and returns the handle to read from.  It
##                  defaults to the identity function.
##
## The return value is not meaningful.
sub set_inner_html ($$$$;$) {
  my $class = shift;
  my $node = shift;
  #my $s = \$_[0];
  my $onerror = $_[1];
  my $get_wrapper = $_[2] || sub ($) { return $_[0] };

  ## ISSUE: Should {confident} be true?

  my $nt = $node->node_type;
  if ($nt == 9) {
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## NOTE: The child list is copied first, since children are removed
    ## while iterating.
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }

    ## Step 3, 4, 5 # MUST
    $class->parse_char_string ($_[0] => $node, $onerror, $get_wrapper);
  } elsif ($nt == 1) {
    ## TODO: If non-html element

    ## NOTE: Most of this code is copied from |parse_string|

    ## TODO: Support for $get_wrapper

    ## Step 1 # MUST
    ## NOTE: The string is parsed by a temporary parser |$p| into a
    ## temporary document |$doc|; the resulting nodes are moved into
    ## |$node| in Steps 10 and 11.
    my $this_doc = $node->owner_document;
    my $doc = $this_doc->implementation->create_document;
    $doc->manakai_is_html (1);
    my $p = $class->new;
    $p->{document} = $doc;

    ## Step 8 # MUST
    $p->{line_prev} = $p->{line} = 1;
    $p->{column_prev} = $p->{column} = 0;
    require Whatpm::Charset::DecodeHandle;
    my $input = Whatpm::Charset::DecodeHandle::CharString->new (\($_[0]));
    $input = $get_wrapper->($input);

    ## Sets |$self->{nc}| to the code point of the next input character
    ## (-1 at the end of the input), updating the line/column counters.
    $p->{set_nc} = sub {
      my $self = shift;

      my $char = '';
      if (defined $self->{next_nc}) {
        ## A character read ahead while handling a CR is pending.
        $char = $self->{next_nc};
        delete $self->{next_nc};
        $self->{nc} = ord $char;
      } else {
        $self->{char_buffer} = '';
        $self->{char_buffer_pos} = 0;

        ## NOTE(review): |manakai_read_until| presumably fills the
        ## buffer with a run of characters matching the pattern (i.e.
        ## no NULL, LF, or CR) and returns its length, which is why the
        ## newline/NULL handling below is skipped on this path -
        ## confirm against |Whatpm::Charset::DecodeHandle|.
        my $count = $input->manakai_read_until
            ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/,
             $self->{char_buffer_pos});
        if ($count) {
          $self->{line_prev} = $self->{line};
          $self->{column_prev} = $self->{column};
          $self->{column}++;
          $self->{nc}
              = ord substr ($self->{char_buffer},
                            $self->{char_buffer_pos}++, 1);
          return;
        }

        if ($input->read ($char, 1)) {
          $self->{nc} = ord $char;
        } else {
          $self->{nc} = -1;
          return;
        }
      }

      ($p->{line_prev}, $p->{column_prev}) = ($p->{line}, $p->{column});
      $p->{column}++;

      if ($self->{nc} == 0x000A) { # LF
        $p->{line}++;
        $p->{column} = 0;
        !!!cp ('i1');
      } elsif ($self->{nc} == 0x000D) { # CR
        ## TODO: support for abort/streaming
        ## CR and CR LF are both reported as a single LF; a character
        ## following the CR that is not LF is kept for the next call.
        my $next = '';
        if ($input->read ($next, 1) and $next ne "\x0A") {
          $self->{next_nc} = $next;
        }
        $self->{nc} = 0x000A; # LF # MUST
        $p->{line}++;
        $p->{column} = 0;
        !!!cp ('i2');
      } elsif ($self->{nc} == 0x0000) { # NULL
        !!!cp ('i4');
        !!!parse-error (type => 'NULL');
        $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
      }
    };

    ## Appends to |$_[0]| (at offset |$_[2]|) a run of characters that
    ## are neither in the character class body |$_[1]| nor NULL, LF, or
    ## CR, and returns the number of characters appended (0 if none).
    $p->{read_until} = sub {
      #my ($scalar, $specials_range, $offset) = @_;
      return 0 if defined $p->{next_nc};

      my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
      my $offset = $_[2] || 0;

      if ($p->{char_buffer_pos} < length $p->{char_buffer}) {
        ## Consume what is left in the buffer filled by |set_nc| first.
        pos ($p->{char_buffer}) = $p->{char_buffer_pos};
        if ($p->{char_buffer} =~ /\G(?>$pattern)+/) {
          substr ($_[0], $offset)
              = substr ($p->{char_buffer}, $-[0], $+[0] - $-[0]);
          my $count = $+[0] - $-[0];
          if ($count) {
            $p->{column} += $count;
            $p->{char_buffer_pos} += $count;
            $p->{line_prev} = $p->{line};
            $p->{column_prev} = $p->{column} - 1;
            $p->{nc} = -1;
          }
          return $count;
        } else {
          return 0;
        }
      } else {
        my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
        if ($count) {
          $p->{column} += $count;
          $p->{column_prev} += $count;
          $p->{nc} = -1;
        }
        return $count;
      }
    }; # $p->{read_until}

    ## Default error handler: report by |warn|, preferring the position
    ## of the token, if any, to the current position.
    my $ponerror = $onerror || sub {
      my (%opt) = @_;
      my $line = $opt{line};
      my $column = $opt{column};
      if (defined $opt{token} and defined $opt{token}->{line}) {
        $line = $opt{token}->{line};
        $column = $opt{token}->{column};
      }
      warn "Parse error ($opt{type}) at line $line column $column\n";
    };
    $p->{parse_error} = sub {
      $ponerror->(line => $p->{line}, column => $p->{column}, @_);
    };

    my $char_onerror = sub {
      my (undef, $type, %opt) = @_;
      $ponerror->(layer => 'encode',
                  line => $p->{line}, column => $p->{column} + 1,
                  %opt, type => $type);
    }; # $char_onerror
    $input->onerror ($char_onerror);

    $p->_initialize_tokenizer;
    $p->_initialize_tree_constructor;

    ## Step 2
    my $node_ln = $node->manakai_local_name;
    $p->{content_model} = {
      title => RCDATA_CONTENT_MODEL,
      textarea => RCDATA_CONTENT_MODEL,
      style => CDATA_CONTENT_MODEL,
      script => CDATA_CONTENT_MODEL,
      xmp => CDATA_CONTENT_MODEL,
      iframe => CDATA_CONTENT_MODEL,
      noembed => CDATA_CONTENT_MODEL,
      noframes => CDATA_CONTENT_MODEL,
      noscript => CDATA_CONTENT_MODEL,
      plaintext => PLAINTEXT_CONTENT_MODEL,
    }->{$node_ln};
    $p->{content_model} = PCDATA_CONTENT_MODEL
        unless defined $p->{content_model};
    ## ISSUE: What is "the name of the element"? local name?

    $p->{inner_html_node} = [$node, $el_category->{$node_ln}];
    ## TODO: Foreign element OK?

    ## Step 3
    my $root = $doc->create_element_ns
        ('http://www.w3.org/1999/xhtml', [undef, 'html']);

    ## Step 4 # MUST
    $doc->append_child ($root);

    ## Step 5 # MUST
    push @{$p->{open_elements}}, [$root, $el_category->{html}];

    undef $p->{head_element};

    ## Step 6 # MUST
    $p->_reset_insertion_mode;

    ## Step 7 # MUST
    ## Find the nearest HTML |form| element among |$node| and its
    ## ancestors and use it as the form element pointer.
    my $anode = $node;
    AN: while (defined $anode) {
      if ($anode->node_type == 1) {
        my $nsuri = $anode->namespace_uri;
        if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
          if ($anode->manakai_local_name eq 'form') {
            !!!cp ('i5');
            $p->{form_element} = $anode;
            last AN;
          }
        }
      }
      $anode = $anode->parent_node;
    } # AN

    ## Step 9 # MUST
    {
      ## NOTE: |$self| is declared here because the |!!!next-token|
      ## macro is used with a |$self| in scope everywhere else;
      ## presumably its expansion refers to |$self|.
      my $self = $p;
      !!!next-token;
    }
    $p->_tree_construction_main;

    ## Step 10 # MUST
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }
    ## ISSUE: mutation events? read-only?

    ## Step 11 # MUST
    @cn = @{$root->child_nodes};
    for (@cn) {
      $this_doc->adopt_node ($_);
      $node->append_child ($_);
    }
    ## ISSUE: mutation events?

    $p->_terminate_tree_constructor;

    ## Break the reference cycles that would otherwise keep |$p| alive:
    ## the |set_nc|, |read_until|, and |parse_error| closures are all
    ## stored in |$p| and capture |$p| (the first two also capture
    ## |$input|, whose error handler captures |$p| as well).
    ## NOTE: |parse_error| is deleted last so that the final statement
    ## of this branch is the same as it has always been.
    delete $p->{set_nc};
    delete $p->{read_until};
    delete $p->{parse_error}; # delete loop
  } else {
    die "$0: |set_inner_html| is not defined for node of type $nt";
  }
} # set_inner_html
8218
8219 } # tree construction stage
8220
## An exception class of the |Whatpm::HTML| module; it inherits from
## the |Error| class.
## NOTE(review): Judging from its name, it is presumably thrown to
## request that parsing be restarted - confirm where it is thrown.
package Whatpm::HTML::RestartParser;
our @ISA;
push @ISA, 'Error';

1;
8225 # $Date: 2008/10/04 07:58:58 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24