/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log


Revision 1.150 - (show annotations) (download) (as text)
Sun Jun 1 06:47:08 2008 UTC (17 years, 7 months ago) by wakaba
Branch: MAIN
Changes since 1.149: +3 -7 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	1 Jun 2008 06:46:25 -0000
	* HTML-tokenizer.t, tokenizer-test-1.test: Update for new format.

2008-06-01  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	1 Jun 2008 06:39:50 -0000
	* HTML.pm.src (_get_next_token): A parse error was missing.

2008-06-01  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.149 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 use Error qw(:try);
5
6 ## ISSUE:
7 ## var doc = implementation.createDocument (null, null, null);
8 ## doc.write ('');
9 ## alert (doc.compatMode);
10
11 require IO::Handle;
12
## Namespace URIs used by the lookup tables defined later in this file.
my ($HTML_NS, $MML_NS, $SVG_NS, $XLINK_NS, $XML_NS, $XMLNS_NS) = (
  'http://www.w3.org/1999/xhtml',
  'http://www.w3.org/1998/Math/MathML',
  'http://www.w3.org/2000/svg',
  'http://www.w3.org/1999/xlink',
  'http://www.w3.org/XML/1998/namespace',
  'http://www.w3.org/2000/xmlns/',
);
19
## Element category flags.  Each basic category occupies one bit so that
## categories can be combined and tested with bitwise operators.
sub A_EL ()                    { 1 << 0 }
sub ADDRESS_EL ()              { 1 << 1 }
sub BODY_EL ()                 { 1 << 2 }
sub BUTTON_EL ()               { 1 << 3 }
sub CAPTION_EL ()              { 1 << 4 }
sub DD_EL ()                   { 1 << 5 }
sub DIV_EL ()                  { 1 << 6 }
sub DT_EL ()                   { 1 << 7 }
sub FORM_EL ()                 { 1 << 8 }
sub FORMATTING_EL ()           { 1 << 9 }
sub FRAMESET_EL ()             { 1 << 10 }
sub HEADING_EL ()              { 1 << 11 }
sub HTML_EL ()                 { 1 << 12 }
sub LI_EL ()                   { 1 << 13 }
sub NOBR_EL ()                 { 1 << 14 }
sub OPTION_EL ()               { 1 << 15 }
sub OPTGROUP_EL ()             { 1 << 16 }
sub P_EL ()                    { 1 << 17 }
sub SELECT_EL ()               { 1 << 18 }
sub TABLE_EL ()                { 1 << 19 }
sub TABLE_CELL_EL ()           { 1 << 20 }
sub TABLE_ROW_EL ()            { 1 << 21 }
sub TABLE_ROW_GROUP_EL ()      { 1 << 22 }
sub MISC_SCOPING_EL ()         { 1 << 23 }
sub MISC_SPECIAL_EL ()         { 1 << 24 }
sub FOREIGN_EL ()              { 1 << 25 }
sub FOREIGN_FLOW_CONTENT_EL () { 1 << 26 }
sub MML_AXML_EL ()             { 1 << 27 }

## Composite masks: each one is the bitwise OR of the basic flags above.

## table, tr, and the row group elements.
sub TABLE_ROWS_EL () { TABLE_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL }

## dd, dt, li, and p.
sub END_TAG_OPTIONAL_EL () { DD_EL | DT_EL | LI_EL | P_EL }

## |END_TAG_OPTIONAL_EL| plus body, html, and the table cell, row, and
## row group elements.
sub ALL_END_TAG_OPTIONAL_EL () {
  END_TAG_OPTIONAL_EL | BODY_EL | HTML_EL |
  TABLE_CELL_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL
}

## Scoping elements.
sub SCOPING_EL () {
  BUTTON_EL | CAPTION_EL | HTML_EL | TABLE_EL | TABLE_CELL_EL |
  MISC_SCOPING_EL
}

## Scope-limiting masks used for table-related lookups.
sub TABLE_SCOPING_EL ()      { HTML_EL | TABLE_EL }
sub TABLE_ROWS_SCOPING_EL () { HTML_EL | TABLE_ROW_GROUP_EL }
sub TABLE_ROW_SCOPING_EL ()  { HTML_EL | TABLE_ROW_EL }

## Special elements.
sub SPECIAL_EL () {
  ADDRESS_EL | BODY_EL | DIV_EL | END_TAG_OPTIONAL_EL | FORM_EL |
  FRAMESET_EL | HEADING_EL | OPTION_EL | OPTGROUP_EL | SELECT_EL |
  TABLE_ROW_EL | TABLE_ROW_GROUP_EL | MISC_SPECIAL_EL
}
110
## Maps a lowercase HTML element name to its category flags.
## Entries are grouped by category; the resulting hash is the same as a
## flat name => flags listing.
my $el_category = {
  ## Elements that have a category flag of their own.
  address  => ADDRESS_EL,
  body     => BODY_EL,
  button   => BUTTON_EL,
  caption  => CAPTION_EL,
  dd       => DD_EL,
  div      => DIV_EL,
  dt       => DT_EL,
  form     => FORM_EL,
  frameset => FRAMESET_EL,
  html     => HTML_EL,
  li       => LI_EL,
  optgroup => OPTGROUP_EL,
  option   => OPTION_EL,
  p        => P_EL,
  select   => SELECT_EL,
  table    => TABLE_EL,
  tr       => TABLE_ROW_EL,

  ## Formatting elements; |a| and |nobr| carry an additional flag.
  a    => A_EL | FORMATTING_EL,
  nobr => NOBR_EL | FORMATTING_EL,
  (map { ($_ => FORMATTING_EL) }
       qw(b big em font i s small strike strong tt u)),

  (map { ($_ => HEADING_EL) } qw(h1 h2 h3 h4 h5 h6)),
  (map { ($_ => TABLE_CELL_EL) } qw(td th)),
  (map { ($_ => TABLE_ROW_GROUP_EL) } qw(tbody tfoot thead)),
  (map { ($_ => MISC_SCOPING_EL) } qw(applet marquee object)),

  (map { ($_ => MISC_SPECIAL_EL) } qw(
    area base basefont bgsound blockquote br center col colgroup dir dl
    embed fieldset frame head hr iframe img input isindex link listing
    menu meta noembed noframes noscript ol param plaintext pre script
    spacer style textarea title ul wbr
  )),
};
195
## Category flags for selected foreign elements, indexed first by
## namespace URI and then by local name.
my $el_category_f = {
  $MML_NS => {
    'annotation-xml' => MML_AXML_EL,
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(mi mo mn ms mtext)),
  },
  $SVG_NS => {
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(foreignObject desc title)),
  },
  ## NOTE: In addition, FOREIGN_EL is set to non-HTML elements.
};
212
## Maps an all-lowercase SVG attribute name to its mixed-case spelling.
## Every key is exactly the lowercased form of its value, so the table is
## derived from the list of mixed-case names.
my $svg_attr_name = {
  map { (lc ($_) => $_) } qw(
    attributeName attributeType baseFrequency baseProfile calcMode
    clipPathUnits contentScriptType contentStyleType diffuseConstant
    edgeMode externalResourcesRequired filterRes filterUnits glyphRef
    gradientTransform gradientUnits kernelMatrix kernelUnitLength
    keyPoints keySplines keyTimes lengthAdjust limitingConeAngle
    markerHeight markerUnits markerWidth maskContentUnits maskUnits
    numOctaves pathLength patternContentUnits patternTransform
    patternUnits pointsAtX pointsAtY pointsAtZ preserveAlpha
    preserveAspectRatio primitiveUnits refX refY repeatCount repeatDur
    requiredExtensions requiredFeatures specularConstant specularExponent
    spreadMethod startOffset stdDeviation stitchTiles surfaceScale
    systemLanguage tableValues targetX targetY textLength viewBox
    viewTarget xChannelSelector yChannelSelector zoomAndPan
  )
};
277
## Maps a qualified attribute name as written in the source to
## [namespace URI, [prefix, local name]].
my $foreign_attr_xname = {
  (map { ("xlink:$_" => [$XLINK_NS, ['xlink', $_]]) }
       qw(actuate arcrole href role show title type)),
  (map { ("xml:$_" => [$XML_NS, ['xml', $_]]) } qw(base lang space)),
  ## The namespace declarations themselves; plain |xmlns| has no prefix.
  'xmlns'       => [$XMLNS_NS, [undef, 'xmlns']],
  'xmlns:xlink' => [$XMLNS_NS, ['xmlns', 'xlink']],
};
292
293 ## ISSUE: xmlns:xlink="non-xlink-ns" is not an error.
294
## Replacement code points for the code points 0x80..0x9F.
## The table is positional: entry N (counting from zero) is the
## replacement for code point 0x80 + N.  0xFFFD marks positions that map
## to U+FFFD REPLACEMENT CHARACTER.
my $c1_entity_char = do {
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 80..87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 88..8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 90..97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 98..9F
  );
  +{ map { (0x80 + $_ => $replacement[$_]) } 0 .. $#replacement };
}; # $c1_entity_char
329
## Parses an HTML document held in a byte string.
##
## Arguments after the invocant: the character encoding name, the byte
## string (or a reference to it), and any further arguments, which are
## handed to |parse_byte_stream| unchanged.  The string is wrapped in an
## in-memory read handle; the return value is that of |parse_byte_stream|.
sub parse_byte_string ($$$$;$) {
  my ($self, $charset_name) = @_[0, 1];
  my $octets_ref = ref $_[2] ? $_[2] : \($_[2]);
  open my $input, '<', $octets_ref;
  return $self->parse_byte_stream ($charset_name, $input, @_[3..$#_]);
} # parse_byte_string
336
## Parses an HTML document read from a byte stream.
##
## The invocant may be a parser object or a class name (in which case a
## new parser is created).  Arguments after the invocant:
##   - the character encoding name, or |undef| to let the encoding be
##     sniffed from the first bytes of the stream;
##   - the byte stream, an object supporting |getc|;
##   - the remaining arguments, which are passed on to
##     |parse_char_stream|; the third of them, if true, is used as the
##     error handler.
## Returns whatever |parse_char_stream| returns.  If the
## |change_encoding| hook throws |Whatpm::HTML::RestartParser|, the
## document is parsed again with the newly selected decoder.
sub parse_byte_stream ($$$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $charset_name = shift;
  my $byte_stream = $_[0];

  ## Errors found before |parse_char_stream| installs its own reporter
  ## go to the caller's handler, or to a |warn|-based default.
  my $onerror = $_[2] || sub {
    my %opt = @_;
    warn "Parse error ($opt{type})\n";
  };
  $self->{parse_error} = $onerror; # replaced later by parse_char_stream

  ## HTML5 encoding sniffing algorithm
  require Message::Charset::Info;
  my ($charset, $buffer, $char_stream, $e_status);

  ## Options shared by most |get_decode_handle| calls below.
  my @decode_opt = (allow_error_reporting => 1, allow_fallback => 1);

  SNIFFING: {

    ## Step 1: encoding given by the caller
    if (defined $charset_name) {
      $charset = Message::Charset::Info->get_by_iana_name ($charset_name);

      ## ISSUE: Unsupported encoding is not ignored according to the spec.
      ($char_stream, $e_status)
          = $charset->get_decode_handle ($byte_stream, @decode_opt);
      if ($char_stream) {
        $self->{confident} = 1;
        last SNIFFING;
      }
      ## TODO: unsupported error
    }

    ## Step 2: read ahead at most 1024 bytes
    my $byte_buffer = '';
    my $budget = 1024;
    while ($budget-- > 0) {
      my $octet = $byte_stream->getc;
      last unless defined $octet;
      $byte_buffer .= $octet;
    } ## TODO: timeout

    ## Step 3: byte order marks, tried in this order
    for my $bom ([qr/^\xFE\xFF/, 'utf-16be'],
                 [qr/^\xFF\xFE/, 'utf-16le'],
                 [qr/^\xEF\xBB\xBF/, 'utf-8']) {
      my ($signature, $bom_charset_name) = @$bom;
      next unless $byte_buffer =~ $signature;

      $charset = Message::Charset::Info->get_by_iana_name ($bom_charset_name);
      ($char_stream, $e_status) = $charset->get_decode_handle
          ($byte_stream, @decode_opt, byte_buffer => \$byte_buffer);
      $self->{confident} = 1;
      last SNIFFING;
    }

    ## Step 4
    ## TODO: <meta charset>

    ## Step 5
    ## TODO: from history

    ## Step 6: statistical detection on the buffered bytes
    require Whatpm::Charset::UniversalCharDet;
    $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string
        ($byte_buffer);
    if (defined $charset_name) {
      $charset = Message::Charset::Info->get_by_iana_name ($charset_name);

      ## ISSUE: Unsupported encoding is not ignored according to the spec.
      require Whatpm::Charset::DecodeHandle;
      $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
          ($byte_stream);
      ($char_stream, $e_status) = $charset->get_decode_handle
          ($buffer, @decode_opt, byte_buffer => \$byte_buffer);
      if ($char_stream) {
        $buffer->{buffer} = $byte_buffer;
        !!!parse-error (type => 'sniffing:chardet', ## TODO: type name
                        value => $charset_name,
                        level => $self->{info_level},
                        line => 1, column => 1);
        $self->{confident} = 0;
        last SNIFFING;
      }
    }

    ## Step 7: default
    ## TODO: Make this configurable.
    $charset = Message::Charset::Info->get_by_iana_name ('windows-1252');
        ## NOTE: We choose |windows-1252| here, since |utf-8| should be
        ## detectable in the step 6.
    require Whatpm::Charset::DecodeHandle;
    $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
        ($byte_stream);
    ($char_stream, $e_status) = $charset->get_decode_handle
        ($buffer, @decode_opt, byte_buffer => \$byte_buffer);
    $buffer->{buffer} = $byte_buffer;
    !!!parse-error (type => 'sniffing:default', ## TODO: type name
                    value => 'windows-1252',
                    level => $self->{info_level},
                    line => 1, column => 1);
    $self->{confident} = 0;
  } # SNIFFING

  ## Records the selected encoding and reports limitations of its
  ## decoder.  Run once after sniffing and again after a restart.
  my $report_decoder_status = sub {
    $self->{input_encoding} = $charset->get_iana_name;
    if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
      !!!parse-error (type => 'chardecode:fallback', ## TODO: type name
                      value => $self->{input_encoding},
                      level => $self->{unsupported_level},
                      line => 1, column => 1);
    } elsif (not ($e_status &
                  Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
      !!!parse-error (type => 'chardecode:no error', ## TODO: type name
                      value => $self->{input_encoding},
                      level => $self->{unsupported_level},
                      line => 1, column => 1);
    }
  };
  $report_decoder_status->();

  ## Hook invoked with the parser, the new encoding name, and the token
  ## that declared it.  It rebinds the decoder state shared with this
  ## method.
  $self->{change_encoding} = sub {
    my ($self, $new_charset_name, $token) = @_;
    $charset_name = $new_charset_name;

    $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
    ($char_stream, $e_status) = $charset->get_decode_handle
        ($byte_stream, allow_error_reporting => 1, allow_fallback => 1,
         byte_buffer => \ $buffer->{buffer});

    if ($char_stream) { # if supported
      ## "Change the encoding" algorithm:

      ## Step 1: a UTF-16 label is treated as UTF-8
      if ($charset->{category} &
          Message::Charset::Info::CHARSET_CATEGORY_UTF16 ()) {
        $charset = Message::Charset::Info->get_by_iana_name ('utf-8');
        ($char_stream, $e_status) = $charset->get_decode_handle
            ($byte_stream,
             byte_buffer => \ $buffer->{buffer});
      }
      $charset_name = $charset->get_iana_name;

      ## Step 2: nothing to change if the label matches the encoding in use
      if (defined $self->{input_encoding} and
          $self->{input_encoding} eq $charset_name) {
        !!!parse-error (type => 'charset label:matching', ## TODO: type
                        value => $charset_name,
                        level => $self->{info_level});
        $self->{confident} = 1;
        return;
      }

      !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
          ':'.$charset_name, level => 'w', token => $token);

      ## Step 3
      # if (can) {
        ## change the encoding on the fly.
        #$self->{confident} = 1;
        #return;
      # }

      ## Step 4: parse again from the start with the new decoder
      throw Whatpm::HTML::RestartParser ();
    }
  }; # $self->{change_encoding}

  ## Decoder error callback: reports the error just after the current
  ## position and substitutes U+FFFD for the offending octets.
  my $char_onerror = sub {
    my (undef, $type, %opt) = @_;
    !!!parse-error (%opt, type => $type,
                    line => $self->{line}, column => $self->{column} + 1);
    if ($opt{octets}) {
      ${$opt{octets}} = "\x{FFFD}"; # replacement character
    }
  };
  $char_stream->onerror ($char_onerror);

  my @args = @_[1..$#_]; # everything after the byte stream
  my $return;
  try {
    $return = $self->parse_char_stream ($char_stream, @args);
  } catch Whatpm::HTML::RestartParser with {
    ## NOTE: Invoked after {change_encoding}.

    $report_decoder_status->();
    $self->{confident} = 1;
    $char_stream->onerror ($char_onerror);
    $return = $self->parse_char_stream ($char_stream, @args);
  };
  return $return;
} # parse_byte_stream
554
555 ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
556 ## and the HTML layer MUST ignore it. However, we do strip the BOM in
557 ## the encoding layer and the HTML layer does not ignore any U+FEFF,
558 ## because the core part of our HTML parser expects a string of characters,
559 ## not a string of bytes or code units or anything which might contain a BOM.
560 ## Therefore, any parser interface that accepts a string of bytes,
561 ## such as |parse_byte_string| in this module, must ensure that it does
562 ## strip the BOM and never strips any ZWNBSP.
563
## Parses an HTML document held in a character string.
##
## Arguments after the invocant: the string (or a reference to it) and
## any further arguments, which are handed to |parse_char_stream|
## unchanged.  Returns whatever |parse_char_stream| returns.
##
## In-memory file handles hold bytes only.  Newer perls refuse to open
## one on a scalar containing code points above 0xFF, and hand out the
## downgraded bytes of a UTF-8-flagged scalar that contains none.
## Opening the caller's scalar directly with an optional |:utf8| layer
## therefore yields an empty or mis-decoded stream there.  Instead a
## copy of the string is encoded to UTF-8 octets, and the |:utf8| layer
## turns those back into the original characters.
sub parse_char_string ($$$;$) {
  my $self = shift;
  require utf8;

  ## Work on a copy: the caller's string must not be modified.
  my $octets = ref $_[0] ? ${$_[0]} : $_[0];
  $octets = '' unless defined $octets;
  utf8::encode ($octets);

  open my $input, '<:utf8', \$octets
      or die "parse_char_string: cannot open in-memory handle: $!";
  return $self->parse_char_stream ($input, @_[1..$#_]);
} # parse_char_string
571 *parse_string = \&parse_char_string;
572
## Parses an HTML document read from a character stream.
##
## The invocant may be a parser object or a class name (in which case a
## new parser is created).  Arguments after the invocant:
##   - the input, an object supporting |getc| that returns characters;
##   - the document object to populate; its existing child nodes are
##     removed first;
##   - optionally, an error handler called with a key/value list; a
##     |warn|-based handler is used when none is given.
## Returns the document object.
sub parse_char_stream ($$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $input = $_[0];
  $self->{document} = $_[1];
  @{$self->{document}->child_nodes} = ();

  ## NOTE: |set_inner_html| copies most of this method's code

  $self->{confident} = 1 unless exists $self->{confident};
  $self->{document}->input_encoding ($self->{input_encoding})
      if defined $self->{input_encoding};

  $self->{line_prev} = $self->{line} = 1;
  $self->{column_prev} = $self->{column} = 0;

  ## Advances the input by one character.  Takes the parser as its only
  ## argument; sets |next_char| to the code point, or -1 at end of input,
  ## and keeps |prev_char|, |line| and |column| up to date.
  $self->{set_next_char} = sub {
    my $self = shift;

    ## Shift the current character into the three-character history.
    pop @{$self->{prev_char}};
    unshift @{$self->{prev_char}}, $self->{next_char};

    ## A character read ahead while handling CR takes precedence.
    my $char;
    if (defined $self->{next_next_char}) {
      $char = $self->{next_next_char};
      delete $self->{next_next_char};
    } else {
      $char = $input->getc;
    }
    ## End of input: the assignment yields -1 (true), so |return| runs.
    $self->{next_char} = -1 and return unless defined $char;
    $self->{next_char} = ord $char;

    ($self->{line_prev}, $self->{column_prev})
        = ($self->{line}, $self->{column});
    $self->{column}++;

    if ($self->{next_char} == 0x000A) { # LF
      !!!cp ('j1');
      $self->{line}++;
      $self->{column} = 0;
    } elsif ($self->{next_char} == 0x000D) { # CR
      !!!cp ('j2');
      ## CR LF and a lone CR both become a single LF; a character other
      ## than LF following the CR is kept for the next call.
      my $next = $input->getc;
      if (defined $next and $next ne "\x0A") {
        $self->{next_next_char} = $next;
      }
      $self->{next_char} = 0x000A; # LF # MUST
      $self->{line}++;
      $self->{column} = 0;
    } elsif ($self->{next_char} > 0x10FFFF) {
      !!!cp ('j3');
      $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($self->{next_char} == 0x0000) { # NULL
      !!!cp ('j4');
      !!!parse-error (type => 'NULL');
      $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($self->{next_char} <= 0x0008 or
             (0x000E <= $self->{next_char} and $self->{next_char} <= 0x001F) or
             (0x007F <= $self->{next_char} and $self->{next_char} <= 0x009F) or
             (0xD800 <= $self->{next_char} and $self->{next_char} <= 0xDFFF) or
             ## NOTE(review): Unicode defines U+FDD0..U+FDEF as
             ## noncharacters, while this range stops at U+FDDF; confirm
             ## against the spec revision being tracked before widening.
             (0xFDD0 <= $self->{next_char} and $self->{next_char} <= 0xFDDF) or
             ## U+nFFFE and U+nFFFF of every plane.  The value is at most
             ## 0x10FFFF here, so testing the low 16 bits covers exactly
             ## these 34 code points without building a lookup table for
             ## every character.
             (($self->{next_char} & 0xFFFE) == 0xFFFE)) {
      !!!cp ('j5');
      !!!parse-error (type => 'control char', level => $self->{must_level});
      ## TODO: error type documentation
    }
  };
  $self->{prev_char} = [-1, -1, -1];
  $self->{next_char} = -1;

  my $onerror = $_[2] || sub {
    my (%opt) = @_;
    my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
    my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
    warn "Parse error ($opt{type}) at line $line column $column\n";
  };
  ## The current position comes first so that explicit |line| and
  ## |column| arguments given by the reporter override it.
  $self->{parse_error} = sub {
    $onerror->(line => $self->{line}, column => $self->{column}, @_);
  };

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  ## The reporter above closes over |$self|; drop it to break the
  ## reference loop.
  delete $self->{parse_error}; # remove loop

  return $self->{document};
} # parse_char_stream
671
## Creates a parser object.
##
## The object starts with the default error level codes and with
## placeholder hooks (|set_next_char|, |parse_error|, |change_encoding|,
## |application_cache_selection|).
sub new ($) {
  my $class = shift;
  my $self = bless {
    must_level => 'm',
    should_level => 's',
    good_level => 'w',
    warn_level => 'w',
    info_level => 'i',
    unsupported_level => 'u',
  }, $class;

  ## The default |set_next_char| is stored inside the object, so it must
  ## not hold a strong reference to the object: that would form a
  ## reference cycle and a parser that is created but never run would
  ## never be freed.
  require Scalar::Util;
  my $weak_self = $self;
  Scalar::Util::weaken ($weak_self);
  $self->{set_next_char} = sub {
    $weak_self->{next_char} = -1; # behave as an input that is already at its end
  };

  $self->{parse_error} = sub {
    # errors are ignored by default
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    # no-op by default
  };
  return $self;
} # new
699
## Content model feature bits.
sub CM_ENTITY ()         { 1 << 0 } # & markup in data
sub CM_LIMITED_MARKUP () { 1 << 1 } # < markup in data (limited)
sub CM_FULL_MARKUP ()    { 1 << 2 } # < markup in data (any)

## Content models, expressed as combinations of the feature bits.
sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL ()     { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL ()    { CM_ENTITY | CM_LIMITED_MARKUP }
sub PCDATA_CONTENT_MODEL ()    { CM_ENTITY | CM_FULL_MARKUP }
708
## Tokenizer states.  The values are arbitrary distinct integers.

## Data and tags
sub DATA_STATE ()                           { 0 }
sub ENTITY_DATA_STATE ()                    { 1 }
sub TAG_OPEN_STATE ()                       { 2 }
sub CLOSE_TAG_OPEN_STATE ()                 { 3 }
sub TAG_NAME_STATE ()                       { 4 }

## Attributes
sub BEFORE_ATTRIBUTE_NAME_STATE ()          { 5 }
sub ATTRIBUTE_NAME_STATE ()                 { 6 }
sub AFTER_ATTRIBUTE_NAME_STATE ()           { 7 }
sub BEFORE_ATTRIBUTE_VALUE_STATE ()         { 8 }
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ()  { 9 }
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ()  { 10 }
sub ATTRIBUTE_VALUE_UNQUOTED_STATE ()       { 11 }
sub ENTITY_IN_ATTRIBUTE_VALUE_STATE ()      { 12 }

## Markup declarations and comments
sub MARKUP_DECLARATION_OPEN_STATE ()        { 13 }
sub COMMENT_START_STATE ()                  { 14 }
sub COMMENT_START_DASH_STATE ()             { 15 }
sub COMMENT_STATE ()                        { 16 }
sub COMMENT_END_STATE ()                    { 17 }
sub COMMENT_END_DASH_STATE ()               { 18 }
sub BOGUS_COMMENT_STATE ()                  { 19 }

## DOCTYPE
sub DOCTYPE_STATE ()                                  { 20 }
sub BEFORE_DOCTYPE_NAME_STATE ()                      { 21 }
sub DOCTYPE_NAME_STATE ()                             { 22 }
sub AFTER_DOCTYPE_NAME_STATE ()                       { 23 }
sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE ()         { 24 }
sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE ()  { 25 }
sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE ()  { 26 }
sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE ()          { 27 }
sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE ()         { 28 }
sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE ()  { 29 }
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE ()  { 30 }
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE ()          { 31 }
sub BOGUS_DOCTYPE_STATE ()                            { 32 }

## Later additions
sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE ()   { 33 }
sub SELF_CLOSING_START_TAG_STATE ()         { 34 }
sub CDATA_BLOCK_STATE ()                    { 35 }
745
## Token types, stored in the |type| member of a token.
sub DOCTYPE_TOKEN ()     { 1 }
sub COMMENT_TOKEN ()     { 2 }
sub START_TAG_TOKEN ()   { 3 }
sub END_TAG_TOKEN ()     { 4 }
sub END_OF_FILE_TOKEN () { 5 }
sub CHARACTER_TOKEN ()   { 6 }
752
## Insertion mode class bits.  The two lowest bits are left free to
## tell apart the individual modes within a class.
sub AFTER_HTML_IMS ()        { 1 << 2 }
sub HEAD_IMS ()              { 1 << 3 }
sub BODY_IMS ()              { 1 << 4 }
sub BODY_TABLE_IMS ()        { 1 << 5 }
sub TABLE_IMS ()             { 1 << 6 }
sub ROW_IMS ()               { 1 << 7 }
sub BODY_AFTER_IMS ()        { 1 << 8 }
sub FRAME_IMS ()             { 1 << 9 }
sub SELECT_IMS ()            { 1 << 10 }
sub IN_FOREIGN_CONTENT_IM () { 1 << 11 }
    ## NOTE: "in foreign content" insertion mode is special; it is combined
    ## with the secondary insertion mode.  In this parser, they are stored
    ## together in the bit-or'ed form.

## NOTE: "initial" and "before html" insertion modes have no constants.

## NOTE: "after after body" insertion mode.
sub AFTER_HTML_BODY_IM ()     { AFTER_HTML_IMS | BODY_AFTER_IMS }

## NOTE: "after after frameset" insertion mode.
sub AFTER_HTML_FRAMESET_IM () { AFTER_HTML_IMS | FRAME_IMS }

## Individual insertion modes: class bits plus a discriminator in the
## two lowest bits.
sub IN_HEAD_IM ()            { HEAD_IMS | 0b00 }
sub IN_HEAD_NOSCRIPT_IM ()   { HEAD_IMS | 0b01 }
sub AFTER_HEAD_IM ()         { HEAD_IMS | 0b10 }
sub BEFORE_HEAD_IM ()        { HEAD_IMS | 0b11 }
sub IN_BODY_IM ()            { BODY_IMS }
sub IN_CELL_IM ()            { BODY_IMS | BODY_TABLE_IMS | 0b01 }
sub IN_CAPTION_IM ()         { BODY_IMS | BODY_TABLE_IMS | 0b10 }
sub IN_ROW_IM ()             { TABLE_IMS | ROW_IMS | 0b01 }
sub IN_TABLE_BODY_IM ()      { TABLE_IMS | ROW_IMS | 0b10 }
sub IN_TABLE_IM ()           { TABLE_IMS }
sub AFTER_BODY_IM ()         { BODY_AFTER_IMS }
sub IN_FRAMESET_IM ()        { FRAME_IMS | 0b01 }
sub AFTER_FRAMESET_IM ()     { FRAME_IMS | 0b10 }
sub IN_SELECT_IM ()          { SELECT_IMS | 0b01 }
sub IN_SELECT_IN_TABLE_IM () { SELECT_IMS | 0b10 }
sub IN_COLUMN_GROUP_IM ()    { 0b10 }
791
792 ## Implementations MUST act as if they used the state machine in the spec.
793
## Resets the tokenizer: data state, PCDATA content model, no token or
## attribute under construction, empty token queue.  The first input
## character is fetched as part of the reset.
sub _initialize_tokenizer ($) {
  my ($self) = @_;

  $self->{state} = DATA_STATE; # MUST
  $self->{content_model} = PCDATA_CONTENT_MODEL; # be

  ## |current_token| is a start tag, end tag, comment, or DOCTYPE.
  $self->{$_} = undef for qw(
    current_token
    current_attribute
    last_emitted_start_tag_name
    last_attribute_value_state
  );
  delete $self->{self_closing};

  $self->{char} = [];
  # $self->{next_char}
  !!!next-input-character;
  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
809
810 ## A token has:
811 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
812 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
813 ## ->{name} (DOCTYPE_TOKEN)
814 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
815 ## ->{public_identifier} (DOCTYPE_TOKEN)
816 ## ->{system_identifier} (DOCTYPE_TOKEN)
817 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
818 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
819 ## ->{name}
820 ## ->{value}
821 ## ->{has_reference} == 1 or 0
822 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
823 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
824 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
825 ## while the token is pushed back to the stack.
826
827 ## Emitted token MUST immediately be handled by the tree construction state.
828
829 ## Before each step, UA MAY check to see if either one of the scripts in
830 ## "list of scripts that will execute as soon as possible" or the first
831 ## script in the "list of scripts that will execute asynchronously",
832 ## has completed loading. If one has, then it MUST be executed
833 ## and removed from the list.
834
835 ## NOTE: HTML5 "Writing HTML documents" section, applied to
836 ## documents and not to user agents and conformance checkers,
837 ## contains some requirements that are not detected by the
838 ## parsing algorithm:
839 ## - Some requirements on character encoding declarations. ## TODO
840 ## - "Elements MUST NOT contain content that their content model disallows."
841 ## ... Some are parse error, some are not (will be reported by c.c.).
842 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
843 ## - Text (in elements, attributes, and comments) SHOULD NOT contain
844 ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
845
846 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
847 ## be detected by the HTML5 parsing algorithm:
848 ## - Text,
849
850 sub _get_next_token ($) {
851 my $self = shift;
852
853 if ($self->{self_closing}) {
854 !!!parse-error (type => 'nestc', token => $self->{current_token});
855 ## NOTE: The |self_closing| flag is only set by start tag token.
856 ## In addition, when a start tag token is emitted, it is always set to
857 ## |current_token|.
858 delete $self->{self_closing};
859 }
860
861 if (@{$self->{token}}) {
862 $self->{self_closing} = $self->{token}->[0]->{self_closing};
863 return shift @{$self->{token}};
864 }
865
866 A: {
867 if ($self->{state} == DATA_STATE) {
868 if ($self->{next_char} == 0x0026) { # &
869 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
870 not $self->{escape}) {
871 !!!cp (1);
872 $self->{state} = ENTITY_DATA_STATE;
873 !!!next-input-character;
874 redo A;
875 } else {
876 !!!cp (2);
877 #
878 }
879 } elsif ($self->{next_char} == 0x002D) { # -
880 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
881 unless ($self->{escape}) {
882 if ($self->{prev_char}->[0] == 0x002D and # -
883 $self->{prev_char}->[1] == 0x0021 and # !
884 $self->{prev_char}->[2] == 0x003C) { # <
885 !!!cp (3);
886 $self->{escape} = 1;
887 } else {
888 !!!cp (4);
889 }
890 } else {
891 !!!cp (5);
892 }
893 }
894
895 #
896 } elsif ($self->{next_char} == 0x003C) { # <
897 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
898 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
899 not $self->{escape})) {
900 !!!cp (6);
901 $self->{state} = TAG_OPEN_STATE;
902 !!!next-input-character;
903 redo A;
904 } else {
905 !!!cp (7);
906 #
907 }
908 } elsif ($self->{next_char} == 0x003E) { # >
909 if ($self->{escape} and
910 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
911 if ($self->{prev_char}->[0] == 0x002D and # -
912 $self->{prev_char}->[1] == 0x002D) { # -
913 !!!cp (8);
914 delete $self->{escape};
915 } else {
916 !!!cp (9);
917 }
918 } else {
919 !!!cp (10);
920 }
921
922 #
923 } elsif ($self->{next_char} == -1) {
924 !!!cp (11);
925 !!!emit ({type => END_OF_FILE_TOKEN,
926 line => $self->{line}, column => $self->{column}});
927 last A; ## TODO: ok?
928 } else {
929 !!!cp (12);
930 }
931 # Anything else
932 my $token = {type => CHARACTER_TOKEN,
933 data => chr $self->{next_char},
934 line => $self->{line}, column => $self->{column},
935 };
936 ## Stay in the data state
937 !!!next-input-character;
938
939 !!!emit ($token);
940
941 redo A;
942 } elsif ($self->{state} == ENTITY_DATA_STATE) {
943 ## (cannot happen in CDATA state)
944
945 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
946
947 my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
948
949 $self->{state} = DATA_STATE;
950 # next-input-character is already done
951
952 unless (defined $token) {
953 !!!cp (13);
954 !!!emit ({type => CHARACTER_TOKEN, data => '&',
955 line => $l, column => $c,
956 });
957 } else {
958 !!!cp (14);
959 !!!emit ($token);
960 }
961
962 redo A;
963 } elsif ($self->{state} == TAG_OPEN_STATE) {
964 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
965 if ($self->{next_char} == 0x002F) { # /
966 !!!cp (15);
967 !!!next-input-character;
968 $self->{state} = CLOSE_TAG_OPEN_STATE;
969 redo A;
970 } else {
971 !!!cp (16);
972 ## reconsume
973 $self->{state} = DATA_STATE;
974
975 !!!emit ({type => CHARACTER_TOKEN, data => '<',
976 line => $self->{line_prev},
977 column => $self->{column_prev},
978 });
979
980 redo A;
981 }
982 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
983 if ($self->{next_char} == 0x0021) { # !
984 !!!cp (17);
985 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
986 !!!next-input-character;
987 redo A;
988 } elsif ($self->{next_char} == 0x002F) { # /
989 !!!cp (18);
990 $self->{state} = CLOSE_TAG_OPEN_STATE;
991 !!!next-input-character;
992 redo A;
993 } elsif (0x0041 <= $self->{next_char} and
994 $self->{next_char} <= 0x005A) { # A..Z
995 !!!cp (19);
996 $self->{current_token}
997 = {type => START_TAG_TOKEN,
998 tag_name => chr ($self->{next_char} + 0x0020),
999 line => $self->{line_prev},
1000 column => $self->{column_prev}};
1001 $self->{state} = TAG_NAME_STATE;
1002 !!!next-input-character;
1003 redo A;
1004 } elsif (0x0061 <= $self->{next_char} and
1005 $self->{next_char} <= 0x007A) { # a..z
1006 !!!cp (20);
1007 $self->{current_token} = {type => START_TAG_TOKEN,
1008 tag_name => chr ($self->{next_char}),
1009 line => $self->{line_prev},
1010 column => $self->{column_prev}};
1011 $self->{state} = TAG_NAME_STATE;
1012 !!!next-input-character;
1013 redo A;
1014 } elsif ($self->{next_char} == 0x003E) { # >
1015 !!!cp (21);
1016 !!!parse-error (type => 'empty start tag',
1017 line => $self->{line_prev},
1018 column => $self->{column_prev});
1019 $self->{state} = DATA_STATE;
1020 !!!next-input-character;
1021
1022 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
1023 line => $self->{line_prev},
1024 column => $self->{column_prev},
1025 });
1026
1027 redo A;
1028 } elsif ($self->{next_char} == 0x003F) { # ?
1029 !!!cp (22);
1030 !!!parse-error (type => 'pio',
1031 line => $self->{line_prev},
1032 column => $self->{column_prev});
1033 $self->{state} = BOGUS_COMMENT_STATE;
1034 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1035 line => $self->{line_prev},
1036 column => $self->{column_prev},
1037 };
1038 ## $self->{next_char} is intentionally left as is
1039 redo A;
1040 } else {
1041 !!!cp (23);
1042 !!!parse-error (type => 'bare stago',
1043 line => $self->{line_prev},
1044 column => $self->{column_prev});
1045 $self->{state} = DATA_STATE;
1046 ## reconsume
1047
1048 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1049 line => $self->{line_prev},
1050 column => $self->{column_prev},
1051 });
1052
1053 redo A;
1054 }
1055 } else {
1056 die "$0: $self->{content_model} in tag open";
1057 }
1058 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
1059 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
1060 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1061 if (defined $self->{last_emitted_start_tag_name}) {
1062
1063 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
1064 my @next_char;
1065 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
1066 push @next_char, $self->{next_char};
1067 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
1068 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
1069 if ($self->{next_char} == $c or $self->{next_char} == $C) {
1070 !!!cp (24);
1071 !!!next-input-character;
1072 next TAGNAME;
1073 } else {
1074 !!!cp (25);
1075 $self->{next_char} = shift @next_char; # reconsume
1076 !!!back-next-input-character (@next_char);
1077 $self->{state} = DATA_STATE;
1078
1079 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1080 line => $l, column => $c,
1081 });
1082
1083 redo A;
1084 }
1085 }
1086 push @next_char, $self->{next_char};
1087
1088 unless ($self->{next_char} == 0x0009 or # HT
1089 $self->{next_char} == 0x000A or # LF
1090 $self->{next_char} == 0x000B or # VT
1091 $self->{next_char} == 0x000C or # FF
1092 $self->{next_char} == 0x0020 or # SP
1093 $self->{next_char} == 0x003E or # >
1094 $self->{next_char} == 0x002F or # /
1095 $self->{next_char} == -1) {
1096 !!!cp (26);
1097 $self->{next_char} = shift @next_char; # reconsume
1098 !!!back-next-input-character (@next_char);
1099 $self->{state} = DATA_STATE;
1100 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1101 line => $l, column => $c,
1102 });
1103 redo A;
1104 } else {
1105 !!!cp (27);
1106 $self->{next_char} = shift @next_char;
1107 !!!back-next-input-character (@next_char);
1108 # and consume...
1109 }
1110 } else {
1111 ## No start tag token has ever been emitted
1112 !!!cp (28);
1113 # next-input-character is already done
1114 $self->{state} = DATA_STATE;
1115 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1116 line => $l, column => $c,
1117 });
1118 redo A;
1119 }
1120 }
1121
1122 if (0x0041 <= $self->{next_char} and
1123 $self->{next_char} <= 0x005A) { # A..Z
1124 !!!cp (29);
1125 $self->{current_token}
1126 = {type => END_TAG_TOKEN,
1127 tag_name => chr ($self->{next_char} + 0x0020),
1128 line => $l, column => $c};
1129 $self->{state} = TAG_NAME_STATE;
1130 !!!next-input-character;
1131 redo A;
1132 } elsif (0x0061 <= $self->{next_char} and
1133 $self->{next_char} <= 0x007A) { # a..z
1134 !!!cp (30);
1135 $self->{current_token} = {type => END_TAG_TOKEN,
1136 tag_name => chr ($self->{next_char}),
1137 line => $l, column => $c};
1138 $self->{state} = TAG_NAME_STATE;
1139 !!!next-input-character;
1140 redo A;
1141 } elsif ($self->{next_char} == 0x003E) { # >
1142 !!!cp (31);
1143 !!!parse-error (type => 'empty end tag',
1144 line => $self->{line_prev}, ## "<" in "</>"
1145 column => $self->{column_prev} - 1);
1146 $self->{state} = DATA_STATE;
1147 !!!next-input-character;
1148 redo A;
1149 } elsif ($self->{next_char} == -1) {
1150 !!!cp (32);
1151 !!!parse-error (type => 'bare etago');
1152 $self->{state} = DATA_STATE;
1153 # reconsume
1154
1155 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1156 line => $l, column => $c,
1157 });
1158
1159 redo A;
1160 } else {
1161 !!!cp (33);
1162 !!!parse-error (type => 'bogus end tag');
1163 $self->{state} = BOGUS_COMMENT_STATE;
1164 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1165 line => $self->{line_prev}, # "<" of "</"
1166 column => $self->{column_prev} - 1,
1167 };
1168 ## $self->{next_char} is intentionally left as is
1169 redo A;
1170 }
1171 } elsif ($self->{state} == TAG_NAME_STATE) {
1172 if ($self->{next_char} == 0x0009 or # HT
1173 $self->{next_char} == 0x000A or # LF
1174 $self->{next_char} == 0x000B or # VT
1175 $self->{next_char} == 0x000C or # FF
1176 $self->{next_char} == 0x0020) { # SP
1177 !!!cp (34);
1178 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1179 !!!next-input-character;
1180 redo A;
1181 } elsif ($self->{next_char} == 0x003E) { # >
1182 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1183 !!!cp (35);
1184 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1185 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1186 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1187 #if ($self->{current_token}->{attributes}) {
1188 # ## NOTE: This should never be reached.
1189 # !!! cp (36);
1190 # !!! parse-error (type => 'end tag attribute');
1191 #} else {
1192 !!!cp (37);
1193 #}
1194 } else {
1195 die "$0: $self->{current_token}->{type}: Unknown token type";
1196 }
1197 $self->{state} = DATA_STATE;
1198 !!!next-input-character;
1199
1200 !!!emit ($self->{current_token}); # start tag or end tag
1201
1202 redo A;
1203 } elsif (0x0041 <= $self->{next_char} and
1204 $self->{next_char} <= 0x005A) { # A..Z
1205 !!!cp (38);
1206 $self->{current_token}->{tag_name} .= chr ($self->{next_char} + 0x0020);
1207 # start tag or end tag
1208 ## Stay in this state
1209 !!!next-input-character;
1210 redo A;
1211 } elsif ($self->{next_char} == -1) {
1212 !!!parse-error (type => 'unclosed tag');
1213 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1214 !!!cp (39);
1215 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1216 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1217 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1218 #if ($self->{current_token}->{attributes}) {
1219 # ## NOTE: This state should never be reached.
1220 # !!! cp (40);
1221 # !!! parse-error (type => 'end tag attribute');
1222 #} else {
1223 !!!cp (41);
1224 #}
1225 } else {
1226 die "$0: $self->{current_token}->{type}: Unknown token type";
1227 }
1228 $self->{state} = DATA_STATE;
1229 # reconsume
1230
1231 !!!emit ($self->{current_token}); # start tag or end tag
1232
1233 redo A;
1234 } elsif ($self->{next_char} == 0x002F) { # /
1235 !!!cp (42);
1236 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1237 !!!next-input-character;
1238 redo A;
1239 } else {
1240 !!!cp (44);
1241 $self->{current_token}->{tag_name} .= chr $self->{next_char};
1242 # start tag or end tag
1243 ## Stay in the state
1244 !!!next-input-character;
1245 redo A;
1246 }
1247 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1248 if ($self->{next_char} == 0x0009 or # HT
1249 $self->{next_char} == 0x000A or # LF
1250 $self->{next_char} == 0x000B or # VT
1251 $self->{next_char} == 0x000C or # FF
1252 $self->{next_char} == 0x0020) { # SP
1253 !!!cp (45);
1254 ## Stay in the state
1255 !!!next-input-character;
1256 redo A;
1257 } elsif ($self->{next_char} == 0x003E) { # >
1258 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1259 !!!cp (46);
1260 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1261 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1262 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1263 if ($self->{current_token}->{attributes}) {
1264 !!!cp (47);
1265 !!!parse-error (type => 'end tag attribute');
1266 } else {
1267 !!!cp (48);
1268 }
1269 } else {
1270 die "$0: $self->{current_token}->{type}: Unknown token type";
1271 }
1272 $self->{state} = DATA_STATE;
1273 !!!next-input-character;
1274
1275 !!!emit ($self->{current_token}); # start tag or end tag
1276
1277 redo A;
1278 } elsif (0x0041 <= $self->{next_char} and
1279 $self->{next_char} <= 0x005A) { # A..Z
1280 !!!cp (49);
1281 $self->{current_attribute}
1282 = {name => chr ($self->{next_char} + 0x0020),
1283 value => '',
1284 line => $self->{line}, column => $self->{column}};
1285 $self->{state} = ATTRIBUTE_NAME_STATE;
1286 !!!next-input-character;
1287 redo A;
1288 } elsif ($self->{next_char} == 0x002F) { # /
1289 !!!cp (50);
1290 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1291 !!!next-input-character;
1292 redo A;
1293 } elsif ($self->{next_char} == -1) {
1294 !!!parse-error (type => 'unclosed tag');
1295 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1296 !!!cp (52);
1297 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1298 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1299 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1300 if ($self->{current_token}->{attributes}) {
1301 !!!cp (53);
1302 !!!parse-error (type => 'end tag attribute');
1303 } else {
1304 !!!cp (54);
1305 }
1306 } else {
1307 die "$0: $self->{current_token}->{type}: Unknown token type";
1308 }
1309 $self->{state} = DATA_STATE;
1310 # reconsume
1311
1312 !!!emit ($self->{current_token}); # start tag or end tag
1313
1314 redo A;
1315 } else {
1316 if ({
1317 0x0022 => 1, # "
1318 0x0027 => 1, # '
1319 0x003D => 1, # =
1320 }->{$self->{next_char}}) {
1321 !!!cp (55);
1322 !!!parse-error (type => 'bad attribute name');
1323 } else {
1324 !!!cp (56);
1325 }
1326 $self->{current_attribute}
1327 = {name => chr ($self->{next_char}),
1328 value => '',
1329 line => $self->{line}, column => $self->{column}};
1330 $self->{state} = ATTRIBUTE_NAME_STATE;
1331 !!!next-input-character;
1332 redo A;
1333 }
1334 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1335 my $before_leave = sub {
1336 if (exists $self->{current_token}->{attributes} # start tag or end tag
1337 ->{$self->{current_attribute}->{name}}) { # MUST
1338 !!!cp (57);
1339 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name}, line => $self->{current_attribute}->{line}, column => $self->{current_attribute}->{column});
1340 ## Discard $self->{current_attribute} # MUST
1341 } else {
1342 !!!cp (58);
1343 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
1344 = $self->{current_attribute};
1345 }
1346 }; # $before_leave
1347
1348 if ($self->{next_char} == 0x0009 or # HT
1349 $self->{next_char} == 0x000A or # LF
1350 $self->{next_char} == 0x000B or # VT
1351 $self->{next_char} == 0x000C or # FF
1352 $self->{next_char} == 0x0020) { # SP
1353 !!!cp (59);
1354 $before_leave->();
1355 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1356 !!!next-input-character;
1357 redo A;
1358 } elsif ($self->{next_char} == 0x003D) { # =
1359 !!!cp (60);
1360 $before_leave->();
1361 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1362 !!!next-input-character;
1363 redo A;
1364 } elsif ($self->{next_char} == 0x003E) { # >
1365 $before_leave->();
1366 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1367 !!!cp (61);
1368 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1369 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1370 !!!cp (62);
1371 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1372 if ($self->{current_token}->{attributes}) {
1373 !!!parse-error (type => 'end tag attribute');
1374 }
1375 } else {
1376 die "$0: $self->{current_token}->{type}: Unknown token type";
1377 }
1378 $self->{state} = DATA_STATE;
1379 !!!next-input-character;
1380
1381 !!!emit ($self->{current_token}); # start tag or end tag
1382
1383 redo A;
1384 } elsif (0x0041 <= $self->{next_char} and
1385 $self->{next_char} <= 0x005A) { # A..Z
1386 !!!cp (63);
1387 $self->{current_attribute}->{name} .= chr ($self->{next_char} + 0x0020);
1388 ## Stay in the state
1389 !!!next-input-character;
1390 redo A;
1391 } elsif ($self->{next_char} == 0x002F) { # /
1392 !!!cp (64);
1393 $before_leave->();
1394 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1395 !!!next-input-character;
1396 redo A;
1397 } elsif ($self->{next_char} == -1) {
1398 !!!parse-error (type => 'unclosed tag');
1399 $before_leave->();
1400 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1401 !!!cp (66);
1402 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1403 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1404 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1405 if ($self->{current_token}->{attributes}) {
1406 !!!cp (67);
1407 !!!parse-error (type => 'end tag attribute');
1408 } else {
1409 ## NOTE: This state should never be reached.
1410 !!!cp (68);
1411 }
1412 } else {
1413 die "$0: $self->{current_token}->{type}: Unknown token type";
1414 }
1415 $self->{state} = DATA_STATE;
1416 # reconsume
1417
1418 !!!emit ($self->{current_token}); # start tag or end tag
1419
1420 redo A;
1421 } else {
1422 if ($self->{next_char} == 0x0022 or # "
1423 $self->{next_char} == 0x0027) { # '
1424 !!!cp (69);
1425 !!!parse-error (type => 'bad attribute name');
1426 } else {
1427 !!!cp (70);
1428 }
1429 $self->{current_attribute}->{name} .= chr ($self->{next_char});
1430 ## Stay in the state
1431 !!!next-input-character;
1432 redo A;
1433 }
1434 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1435 if ($self->{next_char} == 0x0009 or # HT
1436 $self->{next_char} == 0x000A or # LF
1437 $self->{next_char} == 0x000B or # VT
1438 $self->{next_char} == 0x000C or # FF
1439 $self->{next_char} == 0x0020) { # SP
1440 !!!cp (71);
1441 ## Stay in the state
1442 !!!next-input-character;
1443 redo A;
1444 } elsif ($self->{next_char} == 0x003D) { # =
1445 !!!cp (72);
1446 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1447 !!!next-input-character;
1448 redo A;
1449 } elsif ($self->{next_char} == 0x003E) { # >
1450 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1451 !!!cp (73);
1452 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1453 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1454 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1455 if ($self->{current_token}->{attributes}) {
1456 !!!cp (74);
1457 !!!parse-error (type => 'end tag attribute');
1458 } else {
1459 ## NOTE: This state should never be reached.
1460 !!!cp (75);
1461 }
1462 } else {
1463 die "$0: $self->{current_token}->{type}: Unknown token type";
1464 }
1465 $self->{state} = DATA_STATE;
1466 !!!next-input-character;
1467
1468 !!!emit ($self->{current_token}); # start tag or end tag
1469
1470 redo A;
1471 } elsif (0x0041 <= $self->{next_char} and
1472 $self->{next_char} <= 0x005A) { # A..Z
1473 !!!cp (76);
1474 $self->{current_attribute}
1475 = {name => chr ($self->{next_char} + 0x0020),
1476 value => '',
1477 line => $self->{line}, column => $self->{column}};
1478 $self->{state} = ATTRIBUTE_NAME_STATE;
1479 !!!next-input-character;
1480 redo A;
1481 } elsif ($self->{next_char} == 0x002F) { # /
1482 !!!cp (77);
1483 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1484 !!!next-input-character;
1485 redo A;
1486 } elsif ($self->{next_char} == -1) {
1487 !!!parse-error (type => 'unclosed tag');
1488 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1489 !!!cp (79);
1490 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1491 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1492 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1493 if ($self->{current_token}->{attributes}) {
1494 !!!cp (80);
1495 !!!parse-error (type => 'end tag attribute');
1496 } else {
1497 ## NOTE: This state should never be reached.
1498 !!!cp (81);
1499 }
1500 } else {
1501 die "$0: $self->{current_token}->{type}: Unknown token type";
1502 }
1503 $self->{state} = DATA_STATE;
1504 # reconsume
1505
1506 !!!emit ($self->{current_token}); # start tag or end tag
1507
1508 redo A;
1509 } else {
1510 !!!cp (82);
1511 $self->{current_attribute}
1512 = {name => chr ($self->{next_char}),
1513 value => '',
1514 line => $self->{line}, column => $self->{column}};
1515 $self->{state} = ATTRIBUTE_NAME_STATE;
1516 !!!next-input-character;
1517 redo A;
1518 }
1519 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1520 if ($self->{next_char} == 0x0009 or # HT
1521 $self->{next_char} == 0x000A or # LF
1522 $self->{next_char} == 0x000B or # VT
1523 $self->{next_char} == 0x000C or # FF
1524 $self->{next_char} == 0x0020) { # SP
1525 !!!cp (83);
1526 ## Stay in the state
1527 !!!next-input-character;
1528 redo A;
1529 } elsif ($self->{next_char} == 0x0022) { # "
1530 !!!cp (84);
1531 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1532 !!!next-input-character;
1533 redo A;
1534 } elsif ($self->{next_char} == 0x0026) { # &
1535 !!!cp (85);
1536 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1537 ## reconsume
1538 redo A;
1539 } elsif ($self->{next_char} == 0x0027) { # '
1540 !!!cp (86);
1541 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1542 !!!next-input-character;
1543 redo A;
1544 } elsif ($self->{next_char} == 0x003E) { # >
1545 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1546 !!!cp (87);
1547 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1548 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1549 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1550 if ($self->{current_token}->{attributes}) {
1551 !!!cp (88);
1552 !!!parse-error (type => 'end tag attribute');
1553 } else {
1554 ## NOTE: This state should never be reached.
1555 !!!cp (89);
1556 }
1557 } else {
1558 die "$0: $self->{current_token}->{type}: Unknown token type";
1559 }
1560 $self->{state} = DATA_STATE;
1561 !!!next-input-character;
1562
1563 !!!emit ($self->{current_token}); # start tag or end tag
1564
1565 redo A;
1566 } elsif ($self->{next_char} == -1) {
1567 !!!parse-error (type => 'unclosed tag');
1568 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1569 !!!cp (90);
1570 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1571 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1572 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1573 if ($self->{current_token}->{attributes}) {
1574 !!!cp (91);
1575 !!!parse-error (type => 'end tag attribute');
1576 } else {
1577 ## NOTE: This state should never be reached.
1578 !!!cp (92);
1579 }
1580 } else {
1581 die "$0: $self->{current_token}->{type}: Unknown token type";
1582 }
1583 $self->{state} = DATA_STATE;
1584 ## reconsume
1585
1586 !!!emit ($self->{current_token}); # start tag or end tag
1587
1588 redo A;
1589 } else {
1590 if ($self->{next_char} == 0x003D) { # =
1591 !!!cp (93);
1592 !!!parse-error (type => 'bad attribute value');
1593 } else {
1594 !!!cp (94);
1595 }
1596 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1597 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1598 !!!next-input-character;
1599 redo A;
1600 }
1601 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1602 if ($self->{next_char} == 0x0022) { # "
1603 !!!cp (95);
1604 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1605 !!!next-input-character;
1606 redo A;
1607 } elsif ($self->{next_char} == 0x0026) { # &
1608 !!!cp (96);
1609 $self->{last_attribute_value_state} = $self->{state};
1610 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1611 !!!next-input-character;
1612 redo A;
1613 } elsif ($self->{next_char} == -1) {
1614 !!!parse-error (type => 'unclosed attribute value');
1615 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1616 !!!cp (97);
1617 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1618 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1619 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1620 if ($self->{current_token}->{attributes}) {
1621 !!!cp (98);
1622 !!!parse-error (type => 'end tag attribute');
1623 } else {
1624 ## NOTE: This state should never be reached.
1625 !!!cp (99);
1626 }
1627 } else {
1628 die "$0: $self->{current_token}->{type}: Unknown token type";
1629 }
1630 $self->{state} = DATA_STATE;
1631 ## reconsume
1632
1633 !!!emit ($self->{current_token}); # start tag or end tag
1634
1635 redo A;
1636 } else {
1637 !!!cp (100);
1638 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1639 ## Stay in the state
1640 !!!next-input-character;
1641 redo A;
1642 }
1643 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1644 if ($self->{next_char} == 0x0027) { # '
1645 !!!cp (101);
1646 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1647 !!!next-input-character;
1648 redo A;
1649 } elsif ($self->{next_char} == 0x0026) { # &
1650 !!!cp (102);
1651 $self->{last_attribute_value_state} = $self->{state};
1652 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1653 !!!next-input-character;
1654 redo A;
1655 } elsif ($self->{next_char} == -1) {
1656 !!!parse-error (type => 'unclosed attribute value');
1657 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1658 !!!cp (103);
1659 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1660 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1661 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1662 if ($self->{current_token}->{attributes}) {
1663 !!!cp (104);
1664 !!!parse-error (type => 'end tag attribute');
1665 } else {
1666 ## NOTE: This state should never be reached.
1667 !!!cp (105);
1668 }
1669 } else {
1670 die "$0: $self->{current_token}->{type}: Unknown token type";
1671 }
1672 $self->{state} = DATA_STATE;
1673 ## reconsume
1674
1675 !!!emit ($self->{current_token}); # start tag or end tag
1676
1677 redo A;
1678 } else {
1679 !!!cp (106);
1680 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1681 ## Stay in the state
1682 !!!next-input-character;
1683 redo A;
1684 }
1685 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1686 if ($self->{next_char} == 0x0009 or # HT
1687 $self->{next_char} == 0x000A or # LF
1688 $self->{next_char} == 0x000B or # VT
1689 $self->{next_char} == 0x000C or # FF
1690 $self->{next_char} == 0x0020) { # SP
1691 !!!cp (107);
1692 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1693 !!!next-input-character;
1694 redo A;
1695 } elsif ($self->{next_char} == 0x0026) { # &
1696 !!!cp (108);
1697 $self->{last_attribute_value_state} = $self->{state};
1698 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1699 !!!next-input-character;
1700 redo A;
1701 } elsif ($self->{next_char} == 0x003E) { # >
1702 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1703 !!!cp (109);
1704 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1705 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1706 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1707 if ($self->{current_token}->{attributes}) {
1708 !!!cp (110);
1709 !!!parse-error (type => 'end tag attribute');
1710 } else {
1711 ## NOTE: This state should never be reached.
1712 !!!cp (111);
1713 }
1714 } else {
1715 die "$0: $self->{current_token}->{type}: Unknown token type";
1716 }
1717 $self->{state} = DATA_STATE;
1718 !!!next-input-character;
1719
1720 !!!emit ($self->{current_token}); # start tag or end tag
1721
1722 redo A;
1723 } elsif ($self->{next_char} == -1) {
1724 !!!parse-error (type => 'unclosed tag');
1725 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1726 !!!cp (112);
1727 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1728 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1729 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1730 if ($self->{current_token}->{attributes}) {
1731 !!!cp (113);
1732 !!!parse-error (type => 'end tag attribute');
1733 } else {
1734 ## NOTE: This state should never be reached.
1735 !!!cp (114);
1736 }
1737 } else {
1738 die "$0: $self->{current_token}->{type}: Unknown token type";
1739 }
1740 $self->{state} = DATA_STATE;
1741 ## reconsume
1742
1743 !!!emit ($self->{current_token}); # start tag or end tag
1744
1745 redo A;
1746 } else {
1747 if ({
1748 0x0022 => 1, # "
1749 0x0027 => 1, # '
1750 0x003D => 1, # =
1751 }->{$self->{next_char}}) {
1752 !!!cp (115);
1753 !!!parse-error (type => 'bad attribute value');
1754 } else {
1755 !!!cp (116);
1756 }
1757 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1758 ## Stay in the state
1759 !!!next-input-character;
1760 redo A;
1761 }
1762 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1763 my $token = $self->_tokenize_attempt_to_consume_an_entity
1764 (1,
1765 $self->{last_attribute_value_state}
1766 == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
1767 $self->{last_attribute_value_state}
1768 == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
1769 -1);
1770
1771 unless (defined $token) {
1772 !!!cp (117);
1773 $self->{current_attribute}->{value} .= '&';
1774 } else {
1775 !!!cp (118);
1776 $self->{current_attribute}->{value} .= $token->{data};
1777 $self->{current_attribute}->{has_reference} = $token->{has_reference};
1778 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1779 }
1780
1781 $self->{state} = $self->{last_attribute_value_state};
1782 # next-input-character is already done
1783 redo A;
1784 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1785 if ($self->{next_char} == 0x0009 or # HT
1786 $self->{next_char} == 0x000A or # LF
1787 $self->{next_char} == 0x000B or # VT
1788 $self->{next_char} == 0x000C or # FF
1789 $self->{next_char} == 0x0020) { # SP
1790 !!!cp (118);
1791 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1792 !!!next-input-character;
1793 redo A;
1794 } elsif ($self->{next_char} == 0x003E) { # >
1795 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1796 !!!cp (119);
1797 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1798 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1799 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1800 if ($self->{current_token}->{attributes}) {
1801 !!!cp (120);
1802 !!!parse-error (type => 'end tag attribute');
1803 } else {
1804 ## NOTE: This state should never be reached.
1805 !!!cp (121);
1806 }
1807 } else {
1808 die "$0: $self->{current_token}->{type}: Unknown token type";
1809 }
1810 $self->{state} = DATA_STATE;
1811 !!!next-input-character;
1812
1813 !!!emit ($self->{current_token}); # start tag or end tag
1814
1815 redo A;
1816 } elsif ($self->{next_char} == 0x002F) { # /
1817 !!!cp (122);
1818 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1819 !!!next-input-character;
1820 redo A;
1821 } elsif ($self->{next_char} == -1) {
1822 !!!parse-error (type => 'unclosed tag');
1823 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1824 !!!cp (122.3);
1825 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1826 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1827 if ($self->{current_token}->{attributes}) {
1828 !!!cp (122.1);
1829 !!!parse-error (type => 'end tag attribute');
1830 } else {
1831 ## NOTE: This state should never be reached.
1832 !!!cp (122.2);
1833 }
1834 } else {
1835 die "$0: $self->{current_token}->{type}: Unknown token type";
1836 }
1837 $self->{state} = DATA_STATE;
1838 ## Reconsume.
1839 !!!emit ($self->{current_token}); # start tag or end tag
1840 redo A;
1841 } else {
1842 !!!cp ('124.1');
1843 !!!parse-error (type => 'no space between attributes');
1844 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1845 ## reconsume
1846 redo A;
1847 }
1848 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1849 if ($self->{next_char} == 0x003E) { # >
1850 if ($self->{current_token}->{type} == END_TAG_TOKEN) {
1851 !!!cp ('124.2');
1852 !!!parse-error (type => 'nestc', token => $self->{current_token});
1853 ## TODO: Different type than slash in start tag
1854 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1855 if ($self->{current_token}->{attributes}) {
1856 !!!cp ('124.4');
1857 !!!parse-error (type => 'end tag attribute');
1858 } else {
1859 !!!cp ('124.5');
1860 }
1861 ## TODO: Test |<title></title/>|
1862 } else {
1863 !!!cp ('124.3');
1864 $self->{self_closing} = 1;
1865 }
1866
1867 $self->{state} = DATA_STATE;
1868 !!!next-input-character;
1869
1870 !!!emit ($self->{current_token}); # start tag or end tag
1871
1872 redo A;
1873 } elsif ($self->{next_char} == -1) {
1874 !!!parse-error (type => 'unclosed tag');
1875 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1876 !!!cp (124.7);
1877 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1878 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1879 if ($self->{current_token}->{attributes}) {
1880 !!!cp (124.5);
1881 !!!parse-error (type => 'end tag attribute');
1882 } else {
1883 ## NOTE: This state should never be reached.
1884 !!!cp (124.6);
1885 }
1886 } else {
1887 die "$0: $self->{current_token}->{type}: Unknown token type";
1888 }
1889 $self->{state} = DATA_STATE;
1890 ## Reconsume.
1891 !!!emit ($self->{current_token}); # start tag or end tag
1892 redo A;
1893 } else {
1894 !!!cp ('124.4');
1895 !!!parse-error (type => 'nestc');
1896 ## TODO: This error type is wrong.
1897 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1898 ## Reconsume.
1899 redo A;
1900 }
1901 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1902 ## (only happens in the PCDATA state)
1903
1904 ## NOTE: Set by the previous state
1905 #my $token = {type => COMMENT_TOKEN, data => ''};
1906
1907 BC: {
1908 if ($self->{next_char} == 0x003E) { # >
1909 !!!cp (124);
1910 $self->{state} = DATA_STATE;
1911 !!!next-input-character;
1912
1913 !!!emit ($self->{current_token}); # comment
1914
1915 redo A;
1916 } elsif ($self->{next_char} == -1) {
1917 !!!cp (125);
1918 $self->{state} = DATA_STATE;
1919 ## reconsume
1920
1921 !!!emit ($self->{current_token}); # comment
1922
1923 redo A;
1924 } else {
1925 !!!cp (126);
1926 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
1927 !!!next-input-character;
1928 redo BC;
1929 }
1930 } # BC
1931
1932 die "$0: _get_next_token: unexpected case [BC]";
1933 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1934 ## (only happen if PCDATA state)
1935
1936 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1);
1937
1938 my @next_char;
1939 push @next_char, $self->{next_char};
1940
1941 if ($self->{next_char} == 0x002D) { # -
1942 !!!next-input-character;
1943 push @next_char, $self->{next_char};
1944 if ($self->{next_char} == 0x002D) { # -
1945 !!!cp (127);
1946 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1947 line => $l, column => $c,
1948 };
1949 $self->{state} = COMMENT_START_STATE;
1950 !!!next-input-character;
1951 redo A;
1952 } else {
1953 !!!cp (128);
1954 }
1955 } elsif ($self->{next_char} == 0x0044 or # D
1956 $self->{next_char} == 0x0064) { # d
1957 !!!next-input-character;
1958 push @next_char, $self->{next_char};
1959 if ($self->{next_char} == 0x004F or # O
1960 $self->{next_char} == 0x006F) { # o
1961 !!!next-input-character;
1962 push @next_char, $self->{next_char};
1963 if ($self->{next_char} == 0x0043 or # C
1964 $self->{next_char} == 0x0063) { # c
1965 !!!next-input-character;
1966 push @next_char, $self->{next_char};
1967 if ($self->{next_char} == 0x0054 or # T
1968 $self->{next_char} == 0x0074) { # t
1969 !!!next-input-character;
1970 push @next_char, $self->{next_char};
1971 if ($self->{next_char} == 0x0059 or # Y
1972 $self->{next_char} == 0x0079) { # y
1973 !!!next-input-character;
1974 push @next_char, $self->{next_char};
1975 if ($self->{next_char} == 0x0050 or # P
1976 $self->{next_char} == 0x0070) { # p
1977 !!!next-input-character;
1978 push @next_char, $self->{next_char};
1979 if ($self->{next_char} == 0x0045 or # E
1980 $self->{next_char} == 0x0065) { # e
1981 !!!cp (129);
1982 ## TODO: What a stupid code this is!
1983 $self->{state} = DOCTYPE_STATE;
1984 $self->{current_token} = {type => DOCTYPE_TOKEN,
1985 quirks => 1,
1986 line => $l, column => $c,
1987 };
1988 !!!next-input-character;
1989 redo A;
1990 } else {
1991 !!!cp (130);
1992 }
1993 } else {
1994 !!!cp (131);
1995 }
1996 } else {
1997 !!!cp (132);
1998 }
1999 } else {
2000 !!!cp (133);
2001 }
2002 } else {
2003 !!!cp (134);
2004 }
2005 } else {
2006 !!!cp (135);
2007 }
2008 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2009 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
2010 $self->{next_char} == 0x005B) { # [
2011 !!!next-input-character;
2012 push @next_char, $self->{next_char};
2013 if ($self->{next_char} == 0x0043) { # C
2014 !!!next-input-character;
2015 push @next_char, $self->{next_char};
2016 if ($self->{next_char} == 0x0044) { # D
2017 !!!next-input-character;
2018 push @next_char, $self->{next_char};
2019 if ($self->{next_char} == 0x0041) { # A
2020 !!!next-input-character;
2021 push @next_char, $self->{next_char};
2022 if ($self->{next_char} == 0x0054) { # T
2023 !!!next-input-character;
2024 push @next_char, $self->{next_char};
2025 if ($self->{next_char} == 0x0041) { # A
2026 !!!next-input-character;
2027 push @next_char, $self->{next_char};
2028 if ($self->{next_char} == 0x005B) { # [
2029 !!!cp (135.1);
2030 $self->{state} = CDATA_BLOCK_STATE;
2031 !!!next-input-character;
2032 redo A;
2033 } else {
2034 !!!cp (135.2);
2035 }
2036 } else {
2037 !!!cp (135.3);
2038 }
2039 } else {
2040 !!!cp (135.4);
2041 }
2042 } else {
2043 !!!cp (135.5);
2044 }
2045 } else {
2046 !!!cp (135.6);
2047 }
2048 } else {
2049 !!!cp (135.7);
2050 }
2051 } else {
2052 !!!cp (136);
2053 }
2054
2055 !!!parse-error (type => 'bogus comment');
2056 $self->{next_char} = shift @next_char;
2057 !!!back-next-input-character (@next_char);
2058 $self->{state} = BOGUS_COMMENT_STATE;
2059 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
2060 line => $l, column => $c,
2061 };
2062 redo A;
2063
2064 ## ISSUE: typos in spec: chacacters, is is a parse error
2065 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
2066 } elsif ($self->{state} == COMMENT_START_STATE) {
2067 if ($self->{next_char} == 0x002D) { # -
2068 !!!cp (137);
2069 $self->{state} = COMMENT_START_DASH_STATE;
2070 !!!next-input-character;
2071 redo A;
2072 } elsif ($self->{next_char} == 0x003E) { # >
2073 !!!cp (138);
2074 !!!parse-error (type => 'bogus comment');
2075 $self->{state} = DATA_STATE;
2076 !!!next-input-character;
2077
2078 !!!emit ($self->{current_token}); # comment
2079
2080 redo A;
2081 } elsif ($self->{next_char} == -1) {
2082 !!!cp (139);
2083 !!!parse-error (type => 'unclosed comment');
2084 $self->{state} = DATA_STATE;
2085 ## reconsume
2086
2087 !!!emit ($self->{current_token}); # comment
2088
2089 redo A;
2090 } else {
2091 !!!cp (140);
2092 $self->{current_token}->{data} # comment
2093 .= chr ($self->{next_char});
2094 $self->{state} = COMMENT_STATE;
2095 !!!next-input-character;
2096 redo A;
2097 }
2098 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2099 if ($self->{next_char} == 0x002D) { # -
2100 !!!cp (141);
2101 $self->{state} = COMMENT_END_STATE;
2102 !!!next-input-character;
2103 redo A;
2104 } elsif ($self->{next_char} == 0x003E) { # >
2105 !!!cp (142);
2106 !!!parse-error (type => 'bogus comment');
2107 $self->{state} = DATA_STATE;
2108 !!!next-input-character;
2109
2110 !!!emit ($self->{current_token}); # comment
2111
2112 redo A;
2113 } elsif ($self->{next_char} == -1) {
2114 !!!cp (143);
2115 !!!parse-error (type => 'unclosed comment');
2116 $self->{state} = DATA_STATE;
2117 ## reconsume
2118
2119 !!!emit ($self->{current_token}); # comment
2120
2121 redo A;
2122 } else {
2123 !!!cp (144);
2124 $self->{current_token}->{data} # comment
2125 .= '-' . chr ($self->{next_char});
2126 $self->{state} = COMMENT_STATE;
2127 !!!next-input-character;
2128 redo A;
2129 }
2130 } elsif ($self->{state} == COMMENT_STATE) {
2131 if ($self->{next_char} == 0x002D) { # -
2132 !!!cp (145);
2133 $self->{state} = COMMENT_END_DASH_STATE;
2134 !!!next-input-character;
2135 redo A;
2136 } elsif ($self->{next_char} == -1) {
2137 !!!cp (146);
2138 !!!parse-error (type => 'unclosed comment');
2139 $self->{state} = DATA_STATE;
2140 ## reconsume
2141
2142 !!!emit ($self->{current_token}); # comment
2143
2144 redo A;
2145 } else {
2146 !!!cp (147);
2147 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
2148 ## Stay in the state
2149 !!!next-input-character;
2150 redo A;
2151 }
2152 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2153 if ($self->{next_char} == 0x002D) { # -
2154 !!!cp (148);
2155 $self->{state} = COMMENT_END_STATE;
2156 !!!next-input-character;
2157 redo A;
2158 } elsif ($self->{next_char} == -1) {
2159 !!!cp (149);
2160 !!!parse-error (type => 'unclosed comment');
2161 $self->{state} = DATA_STATE;
2162 ## reconsume
2163
2164 !!!emit ($self->{current_token}); # comment
2165
2166 redo A;
2167 } else {
2168 !!!cp (150);
2169 $self->{current_token}->{data} .= '-' . chr ($self->{next_char}); # comment
2170 $self->{state} = COMMENT_STATE;
2171 !!!next-input-character;
2172 redo A;
2173 }
2174 } elsif ($self->{state} == COMMENT_END_STATE) {
2175 if ($self->{next_char} == 0x003E) { # >
2176 !!!cp (151);
2177 $self->{state} = DATA_STATE;
2178 !!!next-input-character;
2179
2180 !!!emit ($self->{current_token}); # comment
2181
2182 redo A;
2183 } elsif ($self->{next_char} == 0x002D) { # -
2184 !!!cp (152);
2185 !!!parse-error (type => 'dash in comment',
2186 line => $self->{line_prev},
2187 column => $self->{column_prev});
2188 $self->{current_token}->{data} .= '-'; # comment
2189 ## Stay in the state
2190 !!!next-input-character;
2191 redo A;
2192 } elsif ($self->{next_char} == -1) {
2193 !!!cp (153);
2194 !!!parse-error (type => 'unclosed comment');
2195 $self->{state} = DATA_STATE;
2196 ## reconsume
2197
2198 !!!emit ($self->{current_token}); # comment
2199
2200 redo A;
2201 } else {
2202 !!!cp (154);
2203 !!!parse-error (type => 'dash in comment',
2204 line => $self->{line_prev},
2205 column => $self->{column_prev});
2206 $self->{current_token}->{data} .= '--' . chr ($self->{next_char}); # comment
2207 $self->{state} = COMMENT_STATE;
2208 !!!next-input-character;
2209 redo A;
2210 }
2211 } elsif ($self->{state} == DOCTYPE_STATE) {
2212 if ($self->{next_char} == 0x0009 or # HT
2213 $self->{next_char} == 0x000A or # LF
2214 $self->{next_char} == 0x000B or # VT
2215 $self->{next_char} == 0x000C or # FF
2216 $self->{next_char} == 0x0020) { # SP
2217 !!!cp (155);
2218 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2219 !!!next-input-character;
2220 redo A;
2221 } else {
2222 !!!cp (156);
2223 !!!parse-error (type => 'no space before DOCTYPE name');
2224 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2225 ## reconsume
2226 redo A;
2227 }
2228 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2229 if ($self->{next_char} == 0x0009 or # HT
2230 $self->{next_char} == 0x000A or # LF
2231 $self->{next_char} == 0x000B or # VT
2232 $self->{next_char} == 0x000C or # FF
2233 $self->{next_char} == 0x0020) { # SP
2234 !!!cp (157);
2235 ## Stay in the state
2236 !!!next-input-character;
2237 redo A;
2238 } elsif ($self->{next_char} == 0x003E) { # >
2239 !!!cp (158);
2240 !!!parse-error (type => 'no DOCTYPE name');
2241 $self->{state} = DATA_STATE;
2242 !!!next-input-character;
2243
2244 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2245
2246 redo A;
2247 } elsif ($self->{next_char} == -1) {
2248 !!!cp (159);
2249 !!!parse-error (type => 'no DOCTYPE name');
2250 $self->{state} = DATA_STATE;
2251 ## reconsume
2252
2253 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2254
2255 redo A;
2256 } else {
2257 !!!cp (160);
2258 $self->{current_token}->{name} = chr $self->{next_char};
2259 delete $self->{current_token}->{quirks};
2260 ## ISSUE: "Set the token's name name to the" in the spec
2261 $self->{state} = DOCTYPE_NAME_STATE;
2262 !!!next-input-character;
2263 redo A;
2264 }
2265 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2266 ## ISSUE: Redundant "First," in the spec.
2267 if ($self->{next_char} == 0x0009 or # HT
2268 $self->{next_char} == 0x000A or # LF
2269 $self->{next_char} == 0x000B or # VT
2270 $self->{next_char} == 0x000C or # FF
2271 $self->{next_char} == 0x0020) { # SP
2272 !!!cp (161);
2273 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2274 !!!next-input-character;
2275 redo A;
2276 } elsif ($self->{next_char} == 0x003E) { # >
2277 !!!cp (162);
2278 $self->{state} = DATA_STATE;
2279 !!!next-input-character;
2280
2281 !!!emit ($self->{current_token}); # DOCTYPE
2282
2283 redo A;
2284 } elsif ($self->{next_char} == -1) {
2285 !!!cp (163);
2286 !!!parse-error (type => 'unclosed DOCTYPE');
2287 $self->{state} = DATA_STATE;
2288 ## reconsume
2289
2290 $self->{current_token}->{quirks} = 1;
2291 !!!emit ($self->{current_token}); # DOCTYPE
2292
2293 redo A;
2294 } else {
2295 !!!cp (164);
2296 $self->{current_token}->{name}
2297 .= chr ($self->{next_char}); # DOCTYPE
2298 ## Stay in the state
2299 !!!next-input-character;
2300 redo A;
2301 }
2302 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2303 if ($self->{next_char} == 0x0009 or # HT
2304 $self->{next_char} == 0x000A or # LF
2305 $self->{next_char} == 0x000B or # VT
2306 $self->{next_char} == 0x000C or # FF
2307 $self->{next_char} == 0x0020) { # SP
2308 !!!cp (165);
2309 ## Stay in the state
2310 !!!next-input-character;
2311 redo A;
2312 } elsif ($self->{next_char} == 0x003E) { # >
2313 !!!cp (166);
2314 $self->{state} = DATA_STATE;
2315 !!!next-input-character;
2316
2317 !!!emit ($self->{current_token}); # DOCTYPE
2318
2319 redo A;
2320 } elsif ($self->{next_char} == -1) {
2321 !!!cp (167);
2322 !!!parse-error (type => 'unclosed DOCTYPE');
2323 $self->{state} = DATA_STATE;
2324 ## reconsume
2325
2326 $self->{current_token}->{quirks} = 1;
2327 !!!emit ($self->{current_token}); # DOCTYPE
2328
2329 redo A;
2330 } elsif ($self->{next_char} == 0x0050 or # P
2331 $self->{next_char} == 0x0070) { # p
2332 !!!next-input-character;
2333 if ($self->{next_char} == 0x0055 or # U
2334 $self->{next_char} == 0x0075) { # u
2335 !!!next-input-character;
2336 if ($self->{next_char} == 0x0042 or # B
2337 $self->{next_char} == 0x0062) { # b
2338 !!!next-input-character;
2339 if ($self->{next_char} == 0x004C or # L
2340 $self->{next_char} == 0x006C) { # l
2341 !!!next-input-character;
2342 if ($self->{next_char} == 0x0049 or # I
2343 $self->{next_char} == 0x0069) { # i
2344 !!!next-input-character;
2345 if ($self->{next_char} == 0x0043 or # C
2346 $self->{next_char} == 0x0063) { # c
2347 !!!cp (168);
2348 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2349 !!!next-input-character;
2350 redo A;
2351 } else {
2352 !!!cp (169);
2353 }
2354 } else {
2355 !!!cp (170);
2356 }
2357 } else {
2358 !!!cp (171);
2359 }
2360 } else {
2361 !!!cp (172);
2362 }
2363 } else {
2364 !!!cp (173);
2365 }
2366
2367 #
2368 } elsif ($self->{next_char} == 0x0053 or # S
2369 $self->{next_char} == 0x0073) { # s
2370 !!!next-input-character;
2371 if ($self->{next_char} == 0x0059 or # Y
2372 $self->{next_char} == 0x0079) { # y
2373 !!!next-input-character;
2374 if ($self->{next_char} == 0x0053 or # S
2375 $self->{next_char} == 0x0073) { # s
2376 !!!next-input-character;
2377 if ($self->{next_char} == 0x0054 or # T
2378 $self->{next_char} == 0x0074) { # t
2379 !!!next-input-character;
2380 if ($self->{next_char} == 0x0045 or # E
2381 $self->{next_char} == 0x0065) { # e
2382 !!!next-input-character;
2383 if ($self->{next_char} == 0x004D or # M
2384 $self->{next_char} == 0x006D) { # m
2385 !!!cp (174);
2386 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2387 !!!next-input-character;
2388 redo A;
2389 } else {
2390 !!!cp (175);
2391 }
2392 } else {
2393 !!!cp (176);
2394 }
2395 } else {
2396 !!!cp (177);
2397 }
2398 } else {
2399 !!!cp (178);
2400 }
2401 } else {
2402 !!!cp (179);
2403 }
2404
2405 #
2406 } else {
2407 !!!cp (180);
2408 !!!next-input-character;
2409 #
2410 }
2411
2412 !!!parse-error (type => 'string after DOCTYPE name');
2413 $self->{current_token}->{quirks} = 1;
2414
2415 $self->{state} = BOGUS_DOCTYPE_STATE;
2416 # next-input-character is already done
2417 redo A;
2418 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2419 if ({
2420 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2421 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2422 }->{$self->{next_char}}) {
2423 !!!cp (181);
2424 ## Stay in the state
2425 !!!next-input-character;
2426 redo A;
2427 } elsif ($self->{next_char} eq 0x0022) { # "
2428 !!!cp (182);
2429 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2430 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2431 !!!next-input-character;
2432 redo A;
2433 } elsif ($self->{next_char} eq 0x0027) { # '
2434 !!!cp (183);
2435 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2436 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2437 !!!next-input-character;
2438 redo A;
2439 } elsif ($self->{next_char} eq 0x003E) { # >
2440 !!!cp (184);
2441 !!!parse-error (type => 'no PUBLIC literal');
2442
2443 $self->{state} = DATA_STATE;
2444 !!!next-input-character;
2445
2446 $self->{current_token}->{quirks} = 1;
2447 !!!emit ($self->{current_token}); # DOCTYPE
2448
2449 redo A;
2450 } elsif ($self->{next_char} == -1) {
2451 !!!cp (185);
2452 !!!parse-error (type => 'unclosed DOCTYPE');
2453
2454 $self->{state} = DATA_STATE;
2455 ## reconsume
2456
2457 $self->{current_token}->{quirks} = 1;
2458 !!!emit ($self->{current_token}); # DOCTYPE
2459
2460 redo A;
2461 } else {
2462 !!!cp (186);
2463 !!!parse-error (type => 'string after PUBLIC');
2464 $self->{current_token}->{quirks} = 1;
2465
2466 $self->{state} = BOGUS_DOCTYPE_STATE;
2467 !!!next-input-character;
2468 redo A;
2469 }
2470 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2471 if ($self->{next_char} == 0x0022) { # "
2472 !!!cp (187);
2473 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2474 !!!next-input-character;
2475 redo A;
2476 } elsif ($self->{next_char} == 0x003E) { # >
2477 !!!cp (188);
2478 !!!parse-error (type => 'unclosed PUBLIC literal');
2479
2480 $self->{state} = DATA_STATE;
2481 !!!next-input-character;
2482
2483 $self->{current_token}->{quirks} = 1;
2484 !!!emit ($self->{current_token}); # DOCTYPE
2485
2486 redo A;
2487 } elsif ($self->{next_char} == -1) {
2488 !!!cp (189);
2489 !!!parse-error (type => 'unclosed PUBLIC literal');
2490
2491 $self->{state} = DATA_STATE;
2492 ## reconsume
2493
2494 $self->{current_token}->{quirks} = 1;
2495 !!!emit ($self->{current_token}); # DOCTYPE
2496
2497 redo A;
2498 } else {
2499 !!!cp (190);
2500 $self->{current_token}->{public_identifier} # DOCTYPE
2501 .= chr $self->{next_char};
2502 ## Stay in the state
2503 !!!next-input-character;
2504 redo A;
2505 }
2506 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2507 if ($self->{next_char} == 0x0027) { # '
2508 !!!cp (191);
2509 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2510 !!!next-input-character;
2511 redo A;
2512 } elsif ($self->{next_char} == 0x003E) { # >
2513 !!!cp (192);
2514 !!!parse-error (type => 'unclosed PUBLIC literal');
2515
2516 $self->{state} = DATA_STATE;
2517 !!!next-input-character;
2518
2519 $self->{current_token}->{quirks} = 1;
2520 !!!emit ($self->{current_token}); # DOCTYPE
2521
2522 redo A;
2523 } elsif ($self->{next_char} == -1) {
2524 !!!cp (193);
2525 !!!parse-error (type => 'unclosed PUBLIC literal');
2526
2527 $self->{state} = DATA_STATE;
2528 ## reconsume
2529
2530 $self->{current_token}->{quirks} = 1;
2531 !!!emit ($self->{current_token}); # DOCTYPE
2532
2533 redo A;
2534 } else {
2535 !!!cp (194);
2536 $self->{current_token}->{public_identifier} # DOCTYPE
2537 .= chr $self->{next_char};
2538 ## Stay in the state
2539 !!!next-input-character;
2540 redo A;
2541 }
2542 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2543 if ({
2544 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2545 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2546 }->{$self->{next_char}}) {
2547 !!!cp (195);
2548 ## Stay in the state
2549 !!!next-input-character;
2550 redo A;
2551 } elsif ($self->{next_char} == 0x0022) { # "
2552 !!!cp (196);
2553 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2554 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2555 !!!next-input-character;
2556 redo A;
2557 } elsif ($self->{next_char} == 0x0027) { # '
2558 !!!cp (197);
2559 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2560 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2561 !!!next-input-character;
2562 redo A;
2563 } elsif ($self->{next_char} == 0x003E) { # >
2564 !!!cp (198);
2565 $self->{state} = DATA_STATE;
2566 !!!next-input-character;
2567
2568 !!!emit ($self->{current_token}); # DOCTYPE
2569
2570 redo A;
2571 } elsif ($self->{next_char} == -1) {
2572 !!!cp (199);
2573 !!!parse-error (type => 'unclosed DOCTYPE');
2574
2575 $self->{state} = DATA_STATE;
2576 ## reconsume
2577
2578 $self->{current_token}->{quirks} = 1;
2579 !!!emit ($self->{current_token}); # DOCTYPE
2580
2581 redo A;
2582 } else {
2583 !!!cp (200);
2584 !!!parse-error (type => 'string after PUBLIC literal');
2585 $self->{current_token}->{quirks} = 1;
2586
2587 $self->{state} = BOGUS_DOCTYPE_STATE;
2588 !!!next-input-character;
2589 redo A;
2590 }
2591 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2592 if ({
2593 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2594 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2595 }->{$self->{next_char}}) {
2596 !!!cp (201);
2597 ## Stay in the state
2598 !!!next-input-character;
2599 redo A;
2600 } elsif ($self->{next_char} == 0x0022) { # "
2601 !!!cp (202);
2602 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2603 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2604 !!!next-input-character;
2605 redo A;
2606 } elsif ($self->{next_char} == 0x0027) { # '
2607 !!!cp (203);
2608 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2609 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2610 !!!next-input-character;
2611 redo A;
2612 } elsif ($self->{next_char} == 0x003E) { # >
2613 !!!cp (204);
2614 !!!parse-error (type => 'no SYSTEM literal');
2615 $self->{state} = DATA_STATE;
2616 !!!next-input-character;
2617
2618 $self->{current_token}->{quirks} = 1;
2619 !!!emit ($self->{current_token}); # DOCTYPE
2620
2621 redo A;
2622 } elsif ($self->{next_char} == -1) {
2623 !!!cp (205);
2624 !!!parse-error (type => 'unclosed DOCTYPE');
2625
2626 $self->{state} = DATA_STATE;
2627 ## reconsume
2628
2629 $self->{current_token}->{quirks} = 1;
2630 !!!emit ($self->{current_token}); # DOCTYPE
2631
2632 redo A;
2633 } else {
2634 !!!cp (206);
2635 !!!parse-error (type => 'string after SYSTEM');
2636 $self->{current_token}->{quirks} = 1;
2637
2638 $self->{state} = BOGUS_DOCTYPE_STATE;
2639 !!!next-input-character;
2640 redo A;
2641 }
2642 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2643 if ($self->{next_char} == 0x0022) { # "
2644 !!!cp (207);
2645 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2646 !!!next-input-character;
2647 redo A;
2648 } elsif ($self->{next_char} == 0x003E) { # >
2649 !!!cp (208);
2650 !!!parse-error (type => 'unclosed PUBLIC literal');
2651
2652 $self->{state} = DATA_STATE;
2653 !!!next-input-character;
2654
2655 $self->{current_token}->{quirks} = 1;
2656 !!!emit ($self->{current_token}); # DOCTYPE
2657
2658 redo A;
2659 } elsif ($self->{next_char} == -1) {
2660 !!!cp (209);
2661 !!!parse-error (type => 'unclosed SYSTEM literal');
2662
2663 $self->{state} = DATA_STATE;
2664 ## reconsume
2665
2666 $self->{current_token}->{quirks} = 1;
2667 !!!emit ($self->{current_token}); # DOCTYPE
2668
2669 redo A;
2670 } else {
2671 !!!cp (210);
2672 $self->{current_token}->{system_identifier} # DOCTYPE
2673 .= chr $self->{next_char};
2674 ## Stay in the state
2675 !!!next-input-character;
2676 redo A;
2677 }
2678 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2679 if ($self->{next_char} == 0x0027) { # '
2680 !!!cp (211);
2681 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2682 !!!next-input-character;
2683 redo A;
2684 } elsif ($self->{next_char} == 0x003E) { # >
2685 !!!cp (212);
2686 !!!parse-error (type => 'unclosed PUBLIC literal');
2687
2688 $self->{state} = DATA_STATE;
2689 !!!next-input-character;
2690
2691 $self->{current_token}->{quirks} = 1;
2692 !!!emit ($self->{current_token}); # DOCTYPE
2693
2694 redo A;
2695 } elsif ($self->{next_char} == -1) {
2696 !!!cp (213);
2697 !!!parse-error (type => 'unclosed SYSTEM literal');
2698
2699 $self->{state} = DATA_STATE;
2700 ## reconsume
2701
2702 $self->{current_token}->{quirks} = 1;
2703 !!!emit ($self->{current_token}); # DOCTYPE
2704
2705 redo A;
2706 } else {
2707 !!!cp (214);
2708 $self->{current_token}->{system_identifier} # DOCTYPE
2709 .= chr $self->{next_char};
2710 ## Stay in the state
2711 !!!next-input-character;
2712 redo A;
2713 }
2714 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2715 if ({
2716 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2717 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2718 }->{$self->{next_char}}) {
2719 !!!cp (215);
2720 ## Stay in the state
2721 !!!next-input-character;
2722 redo A;
2723 } elsif ($self->{next_char} == 0x003E) { # >
2724 !!!cp (216);
2725 $self->{state} = DATA_STATE;
2726 !!!next-input-character;
2727
2728 !!!emit ($self->{current_token}); # DOCTYPE
2729
2730 redo A;
2731 } elsif ($self->{next_char} == -1) {
2732 !!!cp (217);
2733 !!!parse-error (type => 'unclosed DOCTYPE');
2734 $self->{state} = DATA_STATE;
2735 ## reconsume
2736
2737 $self->{current_token}->{quirks} = 1;
2738 !!!emit ($self->{current_token}); # DOCTYPE
2739
2740 redo A;
2741 } else {
2742 !!!cp (218);
2743 !!!parse-error (type => 'string after SYSTEM literal');
2744 #$self->{current_token}->{quirks} = 1;
2745
2746 $self->{state} = BOGUS_DOCTYPE_STATE;
2747 !!!next-input-character;
2748 redo A;
2749 }
2750 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2751 if ($self->{next_char} == 0x003E) { # >
2752 !!!cp (219);
2753 $self->{state} = DATA_STATE;
2754 !!!next-input-character;
2755
2756 !!!emit ($self->{current_token}); # DOCTYPE
2757
2758 redo A;
2759 } elsif ($self->{next_char} == -1) {
2760 !!!cp (220);
2761 !!!parse-error (type => 'unclosed DOCTYPE');
2762 $self->{state} = DATA_STATE;
2763 ## reconsume
2764
2765 !!!emit ($self->{current_token}); # DOCTYPE
2766
2767 redo A;
2768 } else {
2769 !!!cp (221);
2770 ## Stay in the state
2771 !!!next-input-character;
2772 redo A;
2773 }
2774 } elsif ($self->{state} == CDATA_BLOCK_STATE) {
2775 my $s = '';
2776
2777 my ($l, $c) = ($self->{line}, $self->{column});
2778
2779 CS: while ($self->{next_char} != -1) {
2780 if ($self->{next_char} == 0x005D) { # ]
2781 !!!next-input-character;
2782 if ($self->{next_char} == 0x005D) { # ]
2783 !!!next-input-character;
2784 MDC: {
2785 if ($self->{next_char} == 0x003E) { # >
2786 !!!cp (221.1);
2787 !!!next-input-character;
2788 last CS;
2789 } elsif ($self->{next_char} == 0x005D) { # ]
2790 !!!cp (221.2);
2791 $s .= ']';
2792 !!!next-input-character;
2793 redo MDC;
2794 } else {
2795 !!!cp (221.3);
2796 $s .= ']]';
2797 #
2798 }
2799 } # MDC
2800 } else {
2801 !!!cp (221.4);
2802 $s .= ']';
2803 #
2804 }
2805 } else {
2806 !!!cp (221.5);
2807 #
2808 }
2809 $s .= chr $self->{next_char};
2810 !!!next-input-character;
2811 } # CS
2812
2813 $self->{state} = DATA_STATE;
2814 ## next-input-character done or EOF, which is reconsumed.
2815
2816 if (length $s) {
2817 !!!cp (221.6);
2818 !!!emit ({type => CHARACTER_TOKEN, data => $s,
2819 line => $l, column => $c});
2820 } else {
2821 !!!cp (221.7);
2822 }
2823
2824 redo A;
2825
2826 ## ISSUE: "text tokens" in spec.
2827 ## TODO: Streaming support
2828 } else {
2829 die "$0: $self->{state}: Unknown state";
2830 }
2831 } # A
2832
2833 die "$0: _get_next_token: unexpected case";
2834 } # _get_next_token
2835
## Try to consume a character reference from the input and return it as
## a character token.
##
## Arguments:
##   $self       - the parser; reads and updates $self->{next_char}.
##   $in_attr    - true value changes the result for a named reference
##                 that lacks its ";" and is followed by more name
##                 characters (see the $match < 0 branch below).
##   $additional - an extra code point that, like whitespace, "<", "&"
##                 and EOF, makes the method give up without an error.
##
## Returns a hash reference {type => CHARACTER_TOKEN, data => ...,
## line => ..., column => ...} (with has_reference => 1 when a reference
## was actually resolved), or undef when nothing usable was found.
##
## NOTE(review): presumably called with $self->{next_char} holding the
## character right after an already-consumed "&" - the "&" itself is
## never read here, only re-added to the returned data.  Confirm against
## the callers.
## NOTE(review): the |!!!...| statements are macros of this .src file,
## not Perl; their expansion is not visible here.  From their use below,
## |!!!next-input-character| advances $self->{next_char} and
## |!!!back-next-input-character| pushes characters back onto the input.
## NOTE(review): |return undef| yields a one-element list in list
## context; callers presumably use the result in scalar context.
2836 sub _tokenize_attempt_to_consume_an_entity ($$$) {
2837 my ($self, $in_attr, $additional) = @_;
2838
## Position remembered on entry; every parse error and token produced
## below is reported at this position.
2839 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
2840
## Characters that cannot start a reference: return without consuming
## anything and without reporting an error.
2841 if ({
2842 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
2843 0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, &, EOF # 0x000D # CR
2844 $additional => 1,
2845 }->{$self->{next_char}}) {
2846 !!!cp (1001);
2847 ## Don't consume
2848 ## No error
2849 return undef;
2850 } elsif ($self->{next_char} == 0x0023) { # #
2851 !!!next-input-character;
2852 if ($self->{next_char} == 0x0078 or # x
2853 $self->{next_char} == 0x0058) { # X
## Hexadecimal reference.  $code stays undefined until the first
## hexadecimal digit has been seen; that is how the "no digit at all"
## case is recognized below.
2854 my $code;
2855 X: {
## On the first pass this is the "x" or "X"; it is only used to push
## that character back when no digit follows.
2856 my $x_char = $self->{next_char};
2857 !!!next-input-character;
2858 if (0x0030 <= $self->{next_char} and
2859 $self->{next_char} <= 0x0039) { # 0..9
2860 !!!cp (1002);
2861 $code ||= 0;
2862 $code *= 0x10;
2863 $code += $self->{next_char} - 0x0030;
2864 redo X;
2865 } elsif (0x0061 <= $self->{next_char} and
2866 $self->{next_char} <= 0x0066) { # a..f
2867 !!!cp (1003);
2868 $code ||= 0;
2869 $code *= 0x10;
2870 $code += $self->{next_char} - 0x0060 + 9;
2871 redo X;
2872 } elsif (0x0041 <= $self->{next_char} and
2873 $self->{next_char} <= 0x0046) { # A..F
2874 !!!cp (1004);
2875 $code ||= 0;
2876 $code *= 0x10;
2877 $code += $self->{next_char} - 0x0040 + 9;
2878 redo X;
2879 } elsif (not defined $code) { # no hexadecimal digit
2880 !!!cp (1005);
2881 !!!parse-error (type => 'bare hcro', line => $l, column => $c);
## Undo the lookahead: push back the "x"/"X" and the current character,
## and make "#" the next character again.
2882 !!!back-next-input-character ($x_char, $self->{next_char});
2883 $self->{next_char} = 0x0023; # #
2884 return undef;
2885 } elsif ($self->{next_char} == 0x003B) { # ;
2886 !!!cp (1006);
2887 !!!next-input-character;
2888 } else {
2889 !!!cp (1007);
2890 !!!parse-error (type => 'no refc', line => $l, column => $c);
2891 }
2892
## Replace code points that must not be produced: zero and surrogates
## and values above U+10FFFF become U+FFFD, CR becomes LF, and
## 0x80..0x9F are looked up in $c1_entity_char (defined elsewhere in
## this file; presumably a Windows-1252 style mapping - TODO confirm).
2893 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2894 !!!cp (1008);
2895 !!!parse-error (type => (sprintf 'invalid character reference:U+%04X', $code), line => $l, column => $c);
2896 $code = 0xFFFD;
2897 } elsif ($code > 0x10FFFF) {
2898 !!!cp (1009);
2899 !!!parse-error (type => (sprintf 'invalid character reference:U-%08X', $code), line => $l, column => $c);
2900 $code = 0xFFFD;
2901 } elsif ($code == 0x000D) {
2902 !!!cp (1010);
2903 !!!parse-error (type => 'CR character reference', line => $l, column => $c);
2904 $code = 0x000A;
2905 } elsif (0x80 <= $code and $code <= 0x9F) {
2906 !!!cp (1011);
2907 !!!parse-error (type => (sprintf 'C1 character reference:U+%04X', $code), line => $l, column => $c);
2908 $code = $c1_entity_char->{$code};
2909 }
2910
2911 return {type => CHARACTER_TOKEN, data => chr $code,
2912 has_reference => 1,
2913 line => $l, column => $c,
2914 };
2915 } # X
2916 } elsif (0x0030 <= $self->{next_char} and
2917 $self->{next_char} <= 0x0039) { # 0..9
## Decimal reference: the first digit is already known to exist, so
## there is no "bare" case inside this branch.
2918 my $code = $self->{next_char} - 0x0030;
2919 !!!next-input-character;
2920
2921 while (0x0030 <= $self->{next_char} and
2922 $self->{next_char} <= 0x0039) { # 0..9
2923 !!!cp (1012);
2924 $code *= 10;
2925 $code += $self->{next_char} - 0x0030;
2926
2927 !!!next-input-character;
2928 }
2929
2930 if ($self->{next_char} == 0x003B) { # ;
2931 !!!cp (1013);
2932 !!!next-input-character;
2933 } else {
2934 !!!cp (1014);
2935 !!!parse-error (type => 'no refc', line => $l, column => $c);
2936 }
2937
## Same code point replacement rules as in the hexadecimal branch.
2938 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2939 !!!cp (1015);
2940 !!!parse-error (type => (sprintf 'invalid character reference:U+%04X', $code), line => $l, column => $c);
2941 $code = 0xFFFD;
2942 } elsif ($code > 0x10FFFF) {
2943 !!!cp (1016);
2944 !!!parse-error (type => (sprintf 'invalid character reference:U-%08X', $code), line => $l, column => $c);
2945 $code = 0xFFFD;
2946 } elsif ($code == 0x000D) {
2947 !!!cp (1017);
2948 !!!parse-error (type => 'CR character reference', line => $l, column => $c);
2949 $code = 0x000A;
2950 } elsif (0x80 <= $code and $code <= 0x9F) {
2951 !!!cp (1018);
2952 !!!parse-error (type => (sprintf 'C1 character reference:U+%04X', $code), line => $l, column => $c);
2953 $code = $c1_entity_char->{$code};
2954 }
2955
2956 return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1,
2957 line => $l, column => $c,
2958 };
2959 } else {
## "&#" followed by neither "x"/"X" nor a digit: push the current
## character back and make "#" the next character again.
2960 !!!cp (1019);
2961 !!!parse-error (type => 'bare nero', line => $l, column => $c);
2962 !!!back-next-input-character ($self->{next_char});
2963 $self->{next_char} = 0x0023; # #
2964 return undef;
2965 }
2966 } elsif ((0x0041 <= $self->{next_char} and
2967 $self->{next_char} <= 0x005A) or
2968 (0x0061 <= $self->{next_char} and
2969 $self->{next_char} <= 0x007A)) {
## Named reference.  $entity_name accumulates every character read so
## far (including a ";"); $value is the text that will be returned.
2970 my $entity_name = chr $self->{next_char};
2971 !!!next-input-character;
2972
## $match encodes the lookup state:
##    0  no known name has been seen yet;
##    1  a known name ending in ";" was found (the loop stops there);
##   -1  a known name without ";" was just found;
##  < -1 a known name without ";" was found and further characters
##       that do not form a longer known name were read after it
##       (each such character doubles the value).
2973 my $value = $entity_name;
2974 my $match = 0;
2975 require Whatpm::_NamedEntityList;
2976 our $EntityChar;
2977
2978 while (length $entity_name < 30 and
2979 ## NOTE: Some number greater than the maximum length of entity name
2980 ((0x0041 <= $self->{next_char} and # A
2981 $self->{next_char} <= 0x005A) or # Z
2982 (0x0061 <= $self->{next_char} and # a
2983 $self->{next_char} <= 0x007A) or # z
2984 (0x0030 <= $self->{next_char} and # 0
2985 $self->{next_char} <= 0x0039) or # 9
2986 $self->{next_char} == 0x003B)) { # ;
2987 $entity_name .= chr $self->{next_char};
2988 if (defined $EntityChar->{$entity_name}) {
2989 if ($self->{next_char} == 0x003B) { # ;
2990 !!!cp (1020);
2991 $value = $EntityChar->{$entity_name};
2992 $match = 1;
2993 !!!next-input-character;
2994 last;
2995 } else {
2996 !!!cp (1021);
2997 $value = $EntityChar->{$entity_name};
2998 $match = -1;
2999 !!!next-input-character;
3000 }
3001 } else {
3002 !!!cp (1022);
3003 $value .= chr $self->{next_char};
3004 $match *= 2;
3005 !!!next-input-character;
3006 }
3007 }
3008
3009 if ($match > 0) {
3010 !!!cp (1023);
3011 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
3012 line => $l, column => $c,
3013 };
3014 } elsif ($match < 0) {
3015 !!!parse-error (type => 'no refc', line => $l, column => $c);
## In an attribute value, a known name followed by more name characters
## is returned as the literal text that was read; otherwise the
## replacement (plus any trailing characters appended to $value) is
## returned as a resolved reference.
3016 if ($in_attr and $match < -1) {
3017 !!!cp (1024);
3018 return {type => CHARACTER_TOKEN, data => '&'.$entity_name,
3019 line => $l, column => $c,
3020 };
3021 } else {
3022 !!!cp (1025);
3023 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
3024 line => $l, column => $c,
3025 };
3026 }
3027 } else {
## No known name at all: the characters already read are returned as
## plain text behind a literal "&" instead of being pushed back.
3028 !!!cp (1026);
3029 !!!parse-error (type => 'bare ero', line => $l, column => $c);
3030 ## NOTE: "No characters are consumed" in the spec.
3031 return {type => CHARACTER_TOKEN, data => '&'.$value,
3032 line => $l, column => $c,
3033 };
3034 }
3035 } else {
3036 !!!cp (1027);
3037 ## no characters are consumed
3038 !!!parse-error (type => 'bare ero', line => $l, column => $c);
3039 return undef;
3040 }
3041 } # _tokenize_attempt_to_consume_an_entity
3042
## Prepare the document for the tree construction stage: switch DOM
## strict error checking off and flag the document as an HTML document.
##
## NOTE: |$self->{document}| MUST have been set by the caller before
## this method is invoked.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  $doc->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $doc->manakai_is_html (1); # MUST
} # _initialize_tree_constructor
3051
## Finish the tree construction stage: switch DOM strict error checking
## of |$self->{document}| back on.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  ## TODO: Turn mutation events on
  $doc->strict_error_checking (1);
} # _terminate_tree_constructor
3057
3058 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
3059
3060 { # tree construction stage
3061 my $token;
3062
## Entry point of the tree construction stage: fetch the first token,
## reset the per-document parser state and run the insertion modes in
## order ("initial", "before html", then "before head" and the rest).
sub _construct_tree ($) {
  my $self = shift;

  ## NOTE: It is not defined when an interactive UA makes
  ## |$self->{document}| available to the user, or when it begins
  ## accepting user input.

  ## NOTE: Appending a character means collecting it together with all
  ## the consecutive characters that follow and inserting a single Text
  ## node whose data is their concatenation. # MUST

  !!!next-token;

  ## Start from a clean parser state.
  $self->{$_} = undef for qw(form_element head_element);
  $self->{open_elements} = [];
  $self->{inner_html_node} = undef;

  ## NOTE: The "initial" insertion mode.
  $self->_tree_construction_initial; # MUST

  ## NOTE: The "before html" insertion mode.
  $self->_tree_construction_root_element;

  ## NOTE: The "before head" insertion mode and so on.
  $self->{insertion_mode} = BEFORE_HEAD_IM;
  $self->_tree_construction_main;
} # _construct_tree
3091
## Run the "initial" insertion mode of the tree construction stage.
##
## Works on the enclosing block's lexical |$token|.  White space
## character tokens and comment tokens are consumed here (comments are
## appended to the document).  A DOCTYPE token produces a DocumentType
## node appended to the document and may select the document's compat
## mode ("quirks" or "limited quirks").  Any other token is reported as
## a "no DOCTYPE" parse error and puts the document into "quirks" mode.
##
## The method returns once the parser has to proceed to the
## "before html" insertion mode; |$token| then holds the token to be
## processed in that mode.  Dies on an unknown token type.
sub _tree_construction_initial ($) {
  my $self = shift;

  ## NOTE: "initial" insertion mode

  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!cp ('t1');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif ($doctype_name ne 'HTML') {
        !!!cp ('t2');
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5', token => $token);
      } else {
        !!!cp ('t3');
      }

      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      ## NOTE: Default value for both |public_id| and |system_id| attributes
      ## are empty strings, so that we don't set any value in missing cases.
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if ($token->{quirks} or $doctype_name ne 'HTML') {
        !!!cp ('t4');
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## Public identifiers are compared ASCII case-insensitively, so
        ## work on an upper-cased copy.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        ## Upper-cased public identifier prefixes that select quirks mode.
        my $prefix = [
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//",
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//IETF//DTD HTML 2.0 LEVEL 1//",
          "-//IETF//DTD HTML 2.0 LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT//",
          "-//IETF//DTD HTML 2.0//",
          "-//IETF//DTD HTML 2.1E//",
          "-//IETF//DTD HTML 3.0//",
          "-//IETF//DTD HTML 3.2 FINAL//",
          "-//IETF//DTD HTML 3.2//",
          "-//IETF//DTD HTML 3//",
          "-//IETF//DTD HTML LEVEL 0//",
          "-//IETF//DTD HTML LEVEL 1//",
          "-//IETF//DTD HTML LEVEL 2//",
          "-//IETF//DTD HTML LEVEL 3//",
          "-//IETF//DTD HTML STRICT LEVEL 0//",
          "-//IETF//DTD HTML STRICT LEVEL 1//",
          "-//IETF//DTD HTML STRICT LEVEL 2//",
          "-//IETF//DTD HTML STRICT LEVEL 3//",
          "-//IETF//DTD HTML STRICT//",
          "-//IETF//DTD HTML//",
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//",
          "-//NETSCAPE COMM. CORP.//DTD HTML//",
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//",
          "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//",
          "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//",
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//",
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//",
          "-//W3C//DTD HTML 3 1995-03-24//",
          "-//W3C//DTD HTML 3.2 DRAFT//",
          "-//W3C//DTD HTML 3.2 FINAL//",
          "-//W3C//DTD HTML 3.2//",
          "-//W3C//DTD HTML 3.2S DRAFT//",
          "-//W3C//DTD HTML 4.0 FRAMESET//",
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//",
          "-//W3C//DTD HTML EXPERIMENTAL 19960712//",
          "-//W3C//DTD HTML EXPERIMENTAL 970421//",
          "-//W3C//DTD W3 HTML//",
          "-//W3O//DTD W3 HTML 3.0//",
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//",
          "-//WEBTECHS//DTD MOZILLA HTML//",
        ]; # $prefix
        my $match;
        for my $quirks_prefix (@$prefix) {
          ## The public identifier itself (not the list) has to start
          ## with the prefix.
          if (substr ($pubid, 0, length $quirks_prefix) eq $quirks_prefix) {
            $match = 1;
            last;
          }
        }
        if ($match or
            $pubid eq "-//W3O//DTD W3 HTML STRICT 3.0//EN//" or
            $pubid eq "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" or
            $pubid eq "HTML") {
          !!!cp ('t5');
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid =~ m[^-//W3C//DTD HTML 4.01 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD HTML 4.01 TRANSITIONAL//]) {
          ## NOTE: These DOCTYPEs select quirks mode only when the system
          ## identifier is missing; with a system identifier they select
          ## limited quirks mode.
          if (not defined $token->{system_identifier}) {
            !!!cp ('t6');
            $self->{document}->manakai_compat_mode ('quirks');
          } else {
            !!!cp ('t7');
            $self->{document}->manakai_compat_mode ('limited quirks');
          }
        } elsif ($pubid =~ m[^-//W3C//DTD XHTML 1.0 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD XHTML 1.0 TRANSITIONAL//]) {
          !!!cp ('t8');
          $self->{document}->manakai_compat_mode ('limited quirks');
        } else {
          !!!cp ('t9');
        }
      } else {
        !!!cp ('t10');
      }
      if (defined $token->{system_identifier}) {
        ## System identifiers are compared on a lower-cased copy.
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          ## NOTE: Ensure that |PUBLIC "(limited quirks)" "(quirks)"| is
          ## marked as quirks.
          $self->{document}->manakai_compat_mode ('quirks');
          !!!cp ('t11');
        } else {
          !!!cp ('t12');
        }
      } else {
        !!!cp ('t13');
      }

      ## Go to the "before html" insertion mode.
      !!!next-token;
      return;
    } elsif ({
              START_TAG_TOKEN, 1,
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t14');
      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      !!!ack-later;
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      ## Strip leading white space; what remains (if anything) is
      ## treated as a non-DOCTYPE token.
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          !!!cp ('t15');
          ## Stay in the insertion mode.
          !!!next-token;
          redo INITIAL;
        } else {
          !!!cp ('t16');
        }
      } else {
        !!!cp ('t17');
      }

      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t18');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the insertion mode.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL

  die "$0: _tree_construction_initial: This should be never reached";
} # _tree_construction_initial
3291
## Run the "before html" insertion mode.
##
## Works on the enclosing block's lexical |$token|.  DOCTYPE tokens are
## ignored with a parse error, comments are appended to the document and
## leading white space is dropped.  On an |html| start tag the root
## element is created from that tag and the next token is fetched; on
## any other token an |html| root element is created with an empty
## attribute argument and the token is left in |$token| to be
## reprocessed.  In both cases the root element is appended to the
## document and pushed onto |$self->{open_elements}|, and the
## |$self->{application_cache_selection}| callback is invoked exactly
## once (with the |manifest| attribute value if that attribute entry is
## true, otherwise with |undef|).
##
## Returns when the parser should go to the "before head" insertion
## mode.  Dies on an unknown token type.
sub _tree_construction_root_element ($) {
  my $self = shift;

  ## NOTE: "before html" insertion mode.

  B: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      !!!cp ('t19');
      !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
      ## Ignore the token
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t20');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token.

        unless (length $token->{data}) {
          !!!cp ('t21');
          ## Stay in the insertion mode.
          !!!next-token;
          redo B;
        } else {
          !!!cp ('t22');
        }
      } else {
        !!!cp ('t23');
      }

      ## NOTE: The application cache selection callback is invoked by
      ## the common "anything else" code below, after the root element
      ## has been created; it must not be invoked here as well.

      #
    } elsif ($token->{type} == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html') {
        my $root_element;
        !!!create-element ($root_element, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}},
            [$root_element, $el_category->{html}];

        if ($token->{attributes}->{manifest}) {
          !!!cp ('t24');
          $self->{application_cache_selection}
              ->($token->{attributes}->{manifest}->{value});
          ## ISSUE: Spec is unclear on relative references.
          ## According to Hixie (#whatwg 2008-03-19), it should be
          ## resolved against the base URI of the document in HTML
          ## or xml:base of the element in XHTML.
        } else {
          !!!cp ('t25');
          $self->{application_cache_selection}->(undef);
        }

        !!!nack ('t25c');

        !!!next-token;
        return; ## Go to the "before head" insertion mode.
      } else {
        !!!cp ('t25.1');
        #
      }
    } elsif ({
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t26');
      #
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## "Anything else": create the root element implicitly.
    my $root_element;
    !!!create-element ($root_element, $HTML_NS, 'html',, $token);
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, $el_category->{html}];

    $self->{application_cache_selection}->(undef);

    ## NOTE: Reprocess the token.
    !!!ack-later;
    return; ## Go to the "before head" insertion mode.

    ## ISSUE: There is an issue in the spec
  } # B

  die "$0: _tree_construction_root_element: This should never be reached";
} # _tree_construction_root_element
3386
## Reset the insertion mode appropriately.
##
## Walks |$self->{open_elements}| from the current node (the last
## entry) toward the first entry and sets |$self->{insertion_mode}|
## from the first entry that determines a mode.  When the first entry
## of the stack is reached, |$self->{inner_html_node}| is examined in
## its place (presumably the context element of the fragment case --
## confirm against the caller); the method dies with "t27" if it is
## not defined at that point.
##
## The return value is not meaningful.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        !!!cp ('t28');
        $node = $self->{inner_html_node};
      } else {
        die "_reset_insertion_mode: t27";
      }
    }

    ## Step 4..14
    my $new_mode;
    if ($node->[1] & FOREIGN_EL) {
      !!!cp ('t28.1');
      ## NOTE: Strictly speaking, the line below only applies to MathML and
      ## SVG elements.  Currently the HTML syntax supports only MathML and
      ## SVG elements as foreigners.
      $new_mode = IN_BODY_IM | IN_FOREIGN_CONTENT_IM;
    } elsif ($node->[1] & TABLE_CELL_EL) {
      if ($last) {
        !!!cp ('t28.2');
        #
      } else {
        !!!cp ('t28.3');
        $new_mode = IN_CELL_IM;
      }
    } else {
      !!!cp ('t28.4');
      $new_mode = {
        select => IN_SELECT_IM,
        ## NOTE: |option| and |optgroup| do not set
        ## insertion mode to "in select" by themselves.
        tr => IN_ROW_IM,
        tbody => IN_TABLE_BODY_IM,
        thead => IN_TABLE_BODY_IM,
        tfoot => IN_TABLE_BODY_IM,
        caption => IN_CAPTION_IM,
        colgroup => IN_COLUMN_GROUP_IM,
        table => IN_TABLE_IM,
        head => IN_BODY_IM, # not in head!
        body => IN_BODY_IM,
        frameset => IN_FRAMESET_IM,
      }->{$node->[0]->manakai_local_name};
    }
    ## NOTE: Assign and return as separate statements so that the return
    ## does not depend on the truth value of the mode constant.
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 15
    if ($node->[1] & HTML_EL) {
      unless (defined $self->{head_element}) {
        !!!cp ('t29');
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        ## ISSUE: Can this state be reached?
        !!!cp ('t30');
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    } else {
      !!!cp ('t31');
    }

    ## Step 16
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 17
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 18
    redo S3;
  } # S3

  die "$0: _reset_insertion_mode: This line should never be reached";
} # _reset_insertion_mode
3473
3474 sub _tree_construction_main ($) {
3475 my $self = shift;
3476
3477 my $active_formatting_elements = [];
3478
3479 my $reconstruct_active_formatting_elements = sub { # MUST
3480 my $insert = shift;
3481
3482 ## Step 1
3483 return unless @$active_formatting_elements;
3484
3485 ## Step 3
3486 my $i = -1;
3487 my $entry = $active_formatting_elements->[$i];
3488
3489 ## Step 2
3490 return if $entry->[0] eq '#marker';
3491 for (@{$self->{open_elements}}) {
3492 if ($entry->[0] eq $_->[0]) {
3493 !!!cp ('t32');
3494 return;
3495 }
3496 }
3497
3498 S4: {
3499 ## Step 4
3500 last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
3501
3502 ## Step 5
3503 $i--;
3504 $entry = $active_formatting_elements->[$i];
3505
3506 ## Step 6
3507 if ($entry->[0] eq '#marker') {
3508 !!!cp ('t33_1');
3509 #
3510 } else {
3511 my $in_open_elements;
3512 OE: for (@{$self->{open_elements}}) {
3513 if ($entry->[0] eq $_->[0]) {
3514 !!!cp ('t33');
3515 $in_open_elements = 1;
3516 last OE;
3517 }
3518 }
3519 if ($in_open_elements) {
3520 !!!cp ('t34');
3521 #
3522 } else {
3523 ## NOTE: <!DOCTYPE HTML><p><b><i><u></p> <p>X
3524 !!!cp ('t35');
3525 redo S4;
3526 }
3527 }
3528
3529 ## Step 7
3530 $i++;
3531 $entry = $active_formatting_elements->[$i];
3532 } # S4
3533
3534 S7: {
3535 ## Step 8
3536 my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
3537
3538 ## Step 9
3539 $insert->($clone->[0]);
3540 push @{$self->{open_elements}}, $clone;
3541
3542 ## Step 10
3543 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
3544
3545 ## Step 11
3546 unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
3547 !!!cp ('t36');
3548 ## Step 7'
3549 $i++;
3550 $entry = $active_formatting_elements->[$i];
3551
3552 redo S7;
3553 }
3554
3555 !!!cp ('t37');
3556 } # S7
3557 }; # $reconstruct_active_formatting_elements
3558
3559 my $clear_up_to_marker = sub {
3560 for (reverse 0..$#$active_formatting_elements) {
3561 if ($active_formatting_elements->[$_]->[0] eq '#marker') {
3562 !!!cp ('t38');
3563 splice @$active_formatting_elements, $_;
3564 return;
3565 }
3566 }
3567
3568 !!!cp ('t39');
3569 }; # $clear_up_to_marker
3570
3571 my $insert;
3572
3573 my $parse_rcdata = sub ($) {
3574 my ($content_model_flag) = @_;
3575
3576 ## Step 1
3577 my $start_tag_name = $token->{tag_name};
3578 my $el;
3579 !!!create-element ($el, $HTML_NS, $start_tag_name, $token->{attributes}, $token);
3580
3581 ## Step 2
3582 $insert->($el);
3583
3584 ## Step 3
3585 $self->{content_model} = $content_model_flag; # CDATA or RCDATA
3586 delete $self->{escape}; # MUST
3587
3588 ## Step 4
3589 my $text = '';
3590 !!!nack ('t40.1');
3591 !!!next-token;
3592 while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
3593 !!!cp ('t40');
3594 $text .= $token->{data};
3595 !!!next-token;
3596 }
3597
3598 ## Step 5
3599 if (length $text) {
3600 !!!cp ('t41');
3601 my $text = $self->{document}->create_text_node ($text);
3602 $el->append_child ($text);
3603 }
3604
3605 ## Step 6
3606 $self->{content_model} = PCDATA_CONTENT_MODEL;
3607
3608 ## Step 7
3609 if ($token->{type} == END_TAG_TOKEN and
3610 $token->{tag_name} eq $start_tag_name) {
3611 !!!cp ('t42');
3612 ## Ignore the token
3613 } else {
3614 ## NOTE: An end-of-file token.
3615 if ($content_model_flag == CDATA_CONTENT_MODEL) {
3616 !!!cp ('t43');
3617 !!!parse-error (type => 'in CDATA:#'.$token->{type}, token => $token);
3618 } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
3619 !!!cp ('t44');
3620 !!!parse-error (type => 'in RCDATA:#'.$token->{type}, token => $token);
3621 } else {
3622 die "$0: $content_model_flag in parse_rcdata";
3623 }
3624 }
3625 !!!next-token;
3626 }; # $parse_rcdata
3627
3628 my $script_start_tag = sub () {
3629 my $script_el;
3630 !!!create-element ($script_el, $HTML_NS, 'script', $token->{attributes}, $token);
3631 ## TODO: mark as "parser-inserted"
3632
3633 $self->{content_model} = CDATA_CONTENT_MODEL;
3634 delete $self->{escape}; # MUST
3635
3636 my $text = '';
3637 !!!nack ('t45.1');
3638 !!!next-token;
3639 while ($token->{type} == CHARACTER_TOKEN) {
3640 !!!cp ('t45');
3641 $text .= $token->{data};
3642 !!!next-token;
3643 } # stop if non-character token or tokenizer stops tokenising
3644 if (length $text) {
3645 !!!cp ('t46');
3646 $script_el->manakai_append_text ($text);
3647 }
3648
3649 $self->{content_model} = PCDATA_CONTENT_MODEL;
3650
3651 if ($token->{type} == END_TAG_TOKEN and
3652 $token->{tag_name} eq 'script') {
3653 !!!cp ('t47');
3654 ## Ignore the token
3655 } else {
3656 !!!cp ('t48');
3657 !!!parse-error (type => 'in CDATA:#'.$token->{type}, token => $token);
3658 ## ISSUE: And ignore?
3659 ## TODO: mark as "already executed"
3660 }
3661
3662 if (defined $self->{inner_html_node}) {
3663 !!!cp ('t49');
3664 ## TODO: mark as "already executed"
3665 } else {
3666 !!!cp ('t50');
3667 ## TODO: $old_insertion_point = current insertion point
3668 ## TODO: insertion point = just before the next input character
3669
3670 $insert->($script_el);
3671
3672 ## TODO: insertion point = $old_insertion_point (might be "undefined")
3673
3674 ## TODO: if there is a script that will execute as soon as the parser resume, then...
3675 }
3676
3677 !!!next-token;
3678 }; # $script_start_tag
3679
3680 ## NOTE: $open_tables->[-1]->[0] is the "current table" element node.
3681 ## NOTE: $open_tables->[-1]->[1] is the "tainted" flag.
3682 my $open_tables = [[$self->{open_elements}->[0]->[0]]];
3683
3684 my $formatting_end_tag = sub {
3685 my $end_tag_token = shift;
3686 my $tag_name = $end_tag_token->{tag_name};
3687
3688 ## NOTE: The adoption agency algorithm (AAA).
3689
3690 FET: {
3691 ## Step 1
3692 my $formatting_element;
3693 my $formatting_element_i_in_active;
3694 AFE: for (reverse 0..$#$active_formatting_elements) {
3695 if ($active_formatting_elements->[$_]->[0] eq '#marker') {
3696 !!!cp ('t52');
3697 last AFE;
3698 } elsif ($active_formatting_elements->[$_]->[0]->manakai_local_name
3699 eq $tag_name) {
3700 !!!cp ('t51');
3701 $formatting_element = $active_formatting_elements->[$_];
3702 $formatting_element_i_in_active = $_;
3703 last AFE;
3704 }
3705 } # AFE
3706 unless (defined $formatting_element) {
3707 !!!cp ('t53');
3708 !!!parse-error (type => 'unmatched end tag:'.$tag_name, token => $end_tag_token);
3709 ## Ignore the token
3710 !!!next-token;
3711 return;
3712 }
3713 ## has an element in scope
3714 my $in_scope = 1;
3715 my $formatting_element_i_in_open;
3716 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3717 my $node = $self->{open_elements}->[$_];
3718 if ($node->[0] eq $formatting_element->[0]) {
3719 if ($in_scope) {
3720 !!!cp ('t54');
3721 $formatting_element_i_in_open = $_;
3722 last INSCOPE;
3723 } else { # in open elements but not in scope
3724 !!!cp ('t55');
3725 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name},
3726 token => $end_tag_token);
3727 ## Ignore the token
3728 !!!next-token;
3729 return;
3730 }
3731 } elsif ($node->[1] & SCOPING_EL) {
3732 !!!cp ('t56');
3733 $in_scope = 0;
3734 }
3735 } # INSCOPE
3736 unless (defined $formatting_element_i_in_open) {
3737 !!!cp ('t57');
3738 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name},
3739 token => $end_tag_token);
3740 pop @$active_formatting_elements; # $formatting_element
3741 !!!next-token; ## TODO: ok?
3742 return;
3743 }
3744 if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
3745 !!!cp ('t58');
3746 !!!parse-error (type => 'not closed',
3747 value => $self->{open_elements}->[-1]->[0]
3748 ->manakai_local_name,
3749 token => $end_tag_token);
3750 }
3751
3752 ## Step 2
3753 my $furthest_block;
3754 my $furthest_block_i_in_open;
3755 OE: for (reverse 0..$#{$self->{open_elements}}) {
3756 my $node = $self->{open_elements}->[$_];
3757 if (not ($node->[1] & FORMATTING_EL) and
3758 #not $phrasing_category->{$node->[1]} and
3759 ($node->[1] & SPECIAL_EL or
3760 $node->[1] & SCOPING_EL)) { ## Scoping is redundant, maybe
3761 !!!cp ('t59');
3762 $furthest_block = $node;
3763 $furthest_block_i_in_open = $_;
3764 } elsif ($node->[0] eq $formatting_element->[0]) {
3765 !!!cp ('t60');
3766 last OE;
3767 }
3768 } # OE
3769
3770 ## Step 3
3771 unless (defined $furthest_block) { # MUST
3772 !!!cp ('t61');
3773 splice @{$self->{open_elements}}, $formatting_element_i_in_open;
3774 splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
3775 !!!next-token;
3776 return;
3777 }
3778
3779 ## Step 4
3780 my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];
3781
3782 ## Step 5
3783 my $furthest_block_parent = $furthest_block->[0]->parent_node;
3784 if (defined $furthest_block_parent) {
3785 !!!cp ('t62');
3786 $furthest_block_parent->remove_child ($furthest_block->[0]);
3787 }
3788
3789 ## Step 6
3790 my $bookmark_prev_el
3791 = $active_formatting_elements->[$formatting_element_i_in_active - 1]
3792 ->[0];
3793
3794 ## Step 7
3795 my $node = $furthest_block;
3796 my $node_i_in_open = $furthest_block_i_in_open;
3797 my $last_node = $furthest_block;
3798 S7: {
3799 ## Step 1
3800 $node_i_in_open--;
3801 $node = $self->{open_elements}->[$node_i_in_open];
3802
3803 ## Step 2
3804 my $node_i_in_active;
3805 S7S2: {
3806 for (reverse 0..$#$active_formatting_elements) {
3807 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
3808 !!!cp ('t63');
3809 $node_i_in_active = $_;
3810 last S7S2;
3811 }
3812 }
3813 splice @{$self->{open_elements}}, $node_i_in_open, 1;
3814 redo S7;
3815 } # S7S2
3816
3817 ## Step 3
3818 last S7 if $node->[0] eq $formatting_element->[0];
3819
3820 ## Step 4
3821 if ($last_node->[0] eq $furthest_block->[0]) {
3822 !!!cp ('t64');
3823 $bookmark_prev_el = $node->[0];
3824 }
3825
3826 ## Step 5
3827 if ($node->[0]->has_child_nodes ()) {
3828 !!!cp ('t65');
3829 my $clone = [$node->[0]->clone_node (0), $node->[1]];
3830 $active_formatting_elements->[$node_i_in_active] = $clone;
3831 $self->{open_elements}->[$node_i_in_open] = $clone;
3832 $node = $clone;
3833 }
3834
3835 ## Step 6
3836 $node->[0]->append_child ($last_node->[0]);
3837
3838 ## Step 7
3839 $last_node = $node;
3840
3841 ## Step 8
3842 redo S7;
3843 } # S7
3844
3845 ## Step 8
3846 if ($common_ancestor_node->[1] & TABLE_ROWS_EL) {
3847 my $foster_parent_element;
3848 my $next_sibling;
3849 OE: for (reverse 0..$#{$self->{open_elements}}) {
3850 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
3851 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3852 if (defined $parent and $parent->node_type == 1) {
3853 !!!cp ('t65.1');
3854 $foster_parent_element = $parent;
3855 $next_sibling = $self->{open_elements}->[$_]->[0];
3856 } else {
3857 !!!cp ('t65.2');
3858 $foster_parent_element
3859 = $self->{open_elements}->[$_ - 1]->[0];
3860 }
3861 last OE;
3862 }
3863 } # OE
3864 $foster_parent_element = $self->{open_elements}->[0]->[0]
3865 unless defined $foster_parent_element;
3866 $foster_parent_element->insert_before ($last_node->[0], $next_sibling);
3867 $open_tables->[-1]->[1] = 1; # tainted
3868 } else {
3869 !!!cp ('t65.3');
3870 $common_ancestor_node->[0]->append_child ($last_node->[0]);
3871 }
3872
3873 ## Step 9
3874 my $clone = [$formatting_element->[0]->clone_node (0),
3875 $formatting_element->[1]];
3876
3877 ## Step 10
3878 my @cn = @{$furthest_block->[0]->child_nodes};
3879 $clone->[0]->append_child ($_) for @cn;
3880
3881 ## Step 11
3882 $furthest_block->[0]->append_child ($clone->[0]);
3883
3884 ## Step 12
3885 my $i;
3886 AFE: for (reverse 0..$#$active_formatting_elements) {
3887 if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
3888 !!!cp ('t66');
3889 splice @$active_formatting_elements, $_, 1;
3890 $i-- and last AFE if defined $i;
3891 } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
3892 !!!cp ('t67');
3893 $i = $_;
3894 }
3895 } # AFE
3896 splice @$active_formatting_elements, $i + 1, 0, $clone;
3897
3898 ## Step 13
3899 undef $i;
3900 OE: for (reverse 0..$#{$self->{open_elements}}) {
3901 if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
3902 !!!cp ('t68');
3903 splice @{$self->{open_elements}}, $_, 1;
3904 $i-- and last OE if defined $i;
3905 } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
3906 !!!cp ('t69');
3907 $i = $_;
3908 }
3909 } # OE
3910 splice @{$self->{open_elements}}, $i + 1, 1, $clone;
3911
3912 ## Step 14
3913 redo FET;
3914 } # FET
3915 }; # $formatting_end_tag
3916
3917 $insert = my $insert_to_current = sub {
3918 $self->{open_elements}->[-1]->[0]->append_child ($_[0]);
3919 }; # $insert_to_current
3920
3921 my $insert_to_foster = sub {
3922 my $child = shift;
3923 if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
3924 # MUST
3925 my $foster_parent_element;
3926 my $next_sibling;
3927 OE: for (reverse 0..$#{$self->{open_elements}}) {
3928 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
3929 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3930 if (defined $parent and $parent->node_type == 1) {
3931 !!!cp ('t70');
3932 $foster_parent_element = $parent;
3933 $next_sibling = $self->{open_elements}->[$_]->[0];
3934 } else {
3935 !!!cp ('t71');
3936 $foster_parent_element
3937 = $self->{open_elements}->[$_ - 1]->[0];
3938 }
3939 last OE;
3940 }
3941 } # OE
3942 $foster_parent_element = $self->{open_elements}->[0]->[0]
3943 unless defined $foster_parent_element;
3944 $foster_parent_element->insert_before
3945 ($child, $next_sibling);
3946 $open_tables->[-1]->[1] = 1; # tainted
3947 } else {
3948 !!!cp ('t72');
3949 $self->{open_elements}->[-1]->[0]->append_child ($child);
3950 }
3951 }; # $insert_to_foster
3952
3953 B: while (1) {
3954 if ($token->{type} == DOCTYPE_TOKEN) {
3955 !!!cp ('t73');
3956 !!!parse-error (type => 'DOCTYPE in the middle', token => $token);
3957 ## Ignore the token
3958 ## Stay in the phase
3959 !!!next-token;
3960 next B;
3961 } elsif ($token->{type} == START_TAG_TOKEN and
3962 $token->{tag_name} eq 'html') {
3963 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
3964 !!!cp ('t79');
3965 !!!parse-error (type => 'after html:html', token => $token);
3966 $self->{insertion_mode} = AFTER_BODY_IM;
3967 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
3968 !!!cp ('t80');
3969 !!!parse-error (type => 'after html:html', token => $token);
3970 $self->{insertion_mode} = AFTER_FRAMESET_IM;
3971 } else {
3972 !!!cp ('t81');
3973 }
3974
3975 !!!cp ('t82');
3976 !!!parse-error (type => 'not first start tag', token => $token);
3977 my $top_el = $self->{open_elements}->[0]->[0];
3978 for my $attr_name (keys %{$token->{attributes}}) {
3979 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
3980 !!!cp ('t84');
3981 $top_el->set_attribute_ns
3982 (undef, [undef, $attr_name],
3983 $token->{attributes}->{$attr_name}->{value});
3984 }
3985 }
3986 !!!nack ('t84.1');
3987 !!!next-token;
3988 next B;
3989 } elsif ($token->{type} == COMMENT_TOKEN) {
3990 my $comment = $self->{document}->create_comment ($token->{data});
3991 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
3992 !!!cp ('t85');
3993 $self->{document}->append_child ($comment);
3994 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
3995 !!!cp ('t86');
3996 $self->{open_elements}->[0]->[0]->append_child ($comment);
3997 } else {
3998 !!!cp ('t87');
3999 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4000 }
4001 !!!next-token;
4002 next B;
4003 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
4004 if ($token->{type} == CHARACTER_TOKEN) {
4005 !!!cp ('t87.1');
4006 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4007 !!!next-token;
4008 next B;
4009 } elsif ($token->{type} == START_TAG_TOKEN) {
4010 if ((not {mglyph => 1, malignmark => 1}->{$token->{tag_name}} and
4011 $self->{open_elements}->[-1]->[1] & FOREIGN_FLOW_CONTENT_EL) or
4012 not ($self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
4013 ($token->{tag_name} eq 'svg' and
4014 $self->{open_elements}->[-1]->[1] & MML_AXML_EL)) {
4015 ## NOTE: "using the rules for secondary insertion mode"then"continue"
4016 !!!cp ('t87.2');
4017 #
4018 } elsif ({
4019 b => 1, big => 1, blockquote => 1, body => 1, br => 1,
4020 center => 1, code => 1, dd => 1, div => 1, dl => 1, dt => 1,
4021 em => 1, embed => 1, font => 1, h1 => 1, h2 => 1, h3 => 1,
4022 h4 => 1, h5 => 1, h6 => 1, head => 1, hr => 1, i => 1,
4023 img => 1, li => 1, listing => 1, menu => 1, meta => 1,
4024 nobr => 1, ol => 1, p => 1, pre => 1, ruby => 1, s => 1,
4025 small => 1, span => 1, strong => 1, strike => 1, sub => 1,
4026 sup => 1, table => 1, tt => 1, u => 1, ul => 1, var => 1,
4027 }->{$token->{tag_name}}) {
4028 !!!cp ('t87.2');
4029 !!!parse-error (type => 'not closed',
4030 value => $self->{open_elements}->[-1]->[0]
4031 ->manakai_local_name,
4032 token => $token);
4033
4034 pop @{$self->{open_elements}}
4035 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4036
4037 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4038 ## Reprocess.
4039 next B;
4040 } else {
4041 my $nsuri = $self->{open_elements}->[-1]->[0]->namespace_uri;
4042 my $tag_name = $token->{tag_name};
4043 if ($nsuri eq $SVG_NS) {
4044 $tag_name = {
4045 altglyph => 'altGlyph',
4046 altglyphdef => 'altGlyphDef',
4047 altglyphitem => 'altGlyphItem',
4048 animatecolor => 'animateColor',
4049 animatemotion => 'animateMotion',
4050 animatetransform => 'animateTransform',
4051 clippath => 'clipPath',
4052 feblend => 'feBlend',
4053 fecolormatrix => 'feColorMatrix',
4054 fecomponenttransfer => 'feComponentTransfer',
4055 fecomposite => 'feComposite',
4056 feconvolvematrix => 'feConvolveMatrix',
4057 fediffuselighting => 'feDiffuseLighting',
4058 fedisplacementmap => 'feDisplacementMap',
4059 fedistantlight => 'feDistantLight',
4060 feflood => 'feFlood',
4061 fefunca => 'feFuncA',
4062 fefuncb => 'feFuncB',
4063 fefuncg => 'feFuncG',
4064 fefuncr => 'feFuncR',
4065 fegaussianblur => 'feGaussianBlur',
4066 feimage => 'feImage',
4067 femerge => 'feMerge',
4068 femergenode => 'feMergeNode',
4069 femorphology => 'feMorphology',
4070 feoffset => 'feOffset',
4071 fepointlight => 'fePointLight',
4072 fespecularlighting => 'feSpecularLighting',
4073 fespotlight => 'feSpotLight',
4074 fetile => 'feTile',
4075 feturbulence => 'feTurbulence',
4076 foreignobject => 'foreignObject',
4077 glyphref => 'glyphRef',
4078 lineargradient => 'linearGradient',
4079 radialgradient => 'radialGradient',
4080 #solidcolor => 'solidColor', ## NOTE: Commented in spec (SVG1.2)
4081 textpath => 'textPath',
4082 }->{$tag_name} || $tag_name;
4083 }
4084
4085 ## "adjust SVG attributes" (SVG only) - done in insert-element-f
4086
4087 ## "adjust foreign attributes" - done in insert-element-f
4088
4089 !!!insert-element-f ($nsuri, $tag_name, $token->{attributes}, $token);
4090
4091 if ($self->{self_closing}) {
4092 pop @{$self->{open_elements}};
4093 !!!ack ('t87.3');
4094 } else {
4095 !!!cp ('t87.4');
4096 }
4097
4098 !!!next-token;
4099 next B;
4100 }
4101 } elsif ($token->{type} == END_TAG_TOKEN) {
4102 ## NOTE: "using the rules for secondary insertion mode" then "continue"
4103 !!!cp ('t87.5');
4104 #
4105 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4106 !!!cp ('t87.6');
4107 !!!parse-error (type => 'not closed',
4108 value => $self->{open_elements}->[-1]->[0]
4109 ->manakai_local_name,
4110 token => $token);
4111
4112 pop @{$self->{open_elements}}
4113 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4114
4115 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4116 ## Reprocess.
4117 next B;
4118 } else {
4119 die "$0: $token->{type}: Unknown token type";
4120 }
4121 }
4122
4123 if ($self->{insertion_mode} & HEAD_IMS) {
4124 if ($token->{type} == CHARACTER_TOKEN) {
4125 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4126 unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4127 !!!cp ('t88.2');
4128 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4129 } else {
4130 !!!cp ('t88.1');
4131 ## Ignore the token.
4132 !!!next-token;
4133 next B;
4134 }
4135 unless (length $token->{data}) {
4136 !!!cp ('t88');
4137 !!!next-token;
4138 next B;
4139 }
4140 }
4141
4142 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4143 !!!cp ('t89');
4144 ## As if <head>
4145 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4146 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4147 push @{$self->{open_elements}},
4148 [$self->{head_element}, $el_category->{head}];
4149
4150 ## Reprocess in the "in head" insertion mode...
4151 pop @{$self->{open_elements}};
4152
4153 ## Reprocess in the "after head" insertion mode...
4154 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4155 !!!cp ('t90');
4156 ## As if </noscript>
4157 pop @{$self->{open_elements}};
4158 !!!parse-error (type => 'in noscript:#character', token => $token);
4159
4160 ## Reprocess in the "in head" insertion mode...
4161 ## As if </head>
4162 pop @{$self->{open_elements}};
4163
4164 ## Reprocess in the "after head" insertion mode...
4165 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4166 !!!cp ('t91');
4167 pop @{$self->{open_elements}};
4168
4169 ## Reprocess in the "after head" insertion mode...
4170 } else {
4171 !!!cp ('t92');
4172 }
4173
4174 ## "after head" insertion mode
4175 ## As if <body>
4176 !!!insert-element ('body',, $token);
4177 $self->{insertion_mode} = IN_BODY_IM;
4178 ## reprocess
4179 next B;
4180 } elsif ($token->{type} == START_TAG_TOKEN) {
4181 if ($token->{tag_name} eq 'head') {
4182 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4183 !!!cp ('t93');
4184 !!!create-element ($self->{head_element}, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
4185 $self->{open_elements}->[-1]->[0]->append_child
4186 ($self->{head_element});
4187 push @{$self->{open_elements}},
4188 [$self->{head_element}, $el_category->{head}];
4189 $self->{insertion_mode} = IN_HEAD_IM;
4190 !!!nack ('t93.1');
4191 !!!next-token;
4192 next B;
4193 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4194 !!!cp ('t93.2');
4195 !!!parse-error (type => 'after head:head', token => $token); ## TODO: error type
4196 ## Ignore the token
4197 !!!nack ('t93.3');
4198 !!!next-token;
4199 next B;
4200 } else {
4201 !!!cp ('t95');
4202 !!!parse-error (type => 'in head:head', token => $token); # or in head noscript
4203 ## Ignore the token
4204 !!!nack ('t95.1');
4205 !!!next-token;
4206 next B;
4207 }
4208 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4209 !!!cp ('t96');
4210 ## As if <head>
4211 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4212 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4213 push @{$self->{open_elements}},
4214 [$self->{head_element}, $el_category->{head}];
4215
4216 $self->{insertion_mode} = IN_HEAD_IM;
4217 ## Reprocess in the "in head" insertion mode...
4218 } else {
4219 !!!cp ('t97');
4220 }
4221
4222 if ($token->{tag_name} eq 'base') {
4223 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4224 !!!cp ('t98');
4225 ## As if </noscript>
4226 pop @{$self->{open_elements}};
4227 !!!parse-error (type => 'in noscript:base', token => $token);
4228
4229 $self->{insertion_mode} = IN_HEAD_IM;
4230 ## Reprocess in the "in head" insertion mode...
4231 } else {
4232 !!!cp ('t99');
4233 }
4234
4235 ## NOTE: There is a "as if in head" code clone.
4236 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4237 !!!cp ('t100');
4238 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4239 push @{$self->{open_elements}},
4240 [$self->{head_element}, $el_category->{head}];
4241 } else {
4242 !!!cp ('t101');
4243 }
4244 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4245 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4246 pop @{$self->{open_elements}} # <head>
4247 if $self->{insertion_mode} == AFTER_HEAD_IM;
4248 !!!nack ('t101.1');
4249 !!!next-token;
4250 next B;
4251 } elsif ($token->{tag_name} eq 'link') {
4252 ## NOTE: There is a "as if in head" code clone.
4253 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4254 !!!cp ('t102');
4255 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4256 push @{$self->{open_elements}},
4257 [$self->{head_element}, $el_category->{head}];
4258 } else {
4259 !!!cp ('t103');
4260 }
4261 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4262 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4263 pop @{$self->{open_elements}} # <head>
4264 if $self->{insertion_mode} == AFTER_HEAD_IM;
4265 !!!ack ('t103.1');
4266 !!!next-token;
4267 next B;
4268 } elsif ($token->{tag_name} eq 'meta') {
4269 ## NOTE: There is a "as if in head" code clone.
4270 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4271 !!!cp ('t104');
4272 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4273 push @{$self->{open_elements}},
4274 [$self->{head_element}, $el_category->{head}];
4275 } else {
4276 !!!cp ('t105');
4277 }
4278 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4279 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4280
4281 unless ($self->{confident}) {
4282 if ($token->{attributes}->{charset}) {
4283 !!!cp ('t106');
4284 ## NOTE: Whether the encoding is supported or not is handled
4285 ## in the {change_encoding} callback.
4286 $self->{change_encoding}
4287 ->($self, $token->{attributes}->{charset}->{value},
4288 $token);
4289
4290 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4291 ->set_user_data (manakai_has_reference =>
4292 $token->{attributes}->{charset}
4293 ->{has_reference});
4294 } elsif ($token->{attributes}->{content}) {
4295 if ($token->{attributes}->{content}->{value}
4296 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4297 [\x09-\x0D\x20]*=
4298 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4299 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
4300 !!!cp ('t107');
4301 ## NOTE: Whether the encoding is supported or not is handled
4302 ## in the {change_encoding} callback.
4303 $self->{change_encoding}
4304 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
4305 $token);
4306 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4307 ->set_user_data (manakai_has_reference =>
4308 $token->{attributes}->{content}
4309 ->{has_reference});
4310 } else {
4311 !!!cp ('t108');
4312 }
4313 }
4314 } else {
4315 if ($token->{attributes}->{charset}) {
4316 !!!cp ('t109');
4317 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4318 ->set_user_data (manakai_has_reference =>
4319 $token->{attributes}->{charset}
4320 ->{has_reference});
4321 }
4322 if ($token->{attributes}->{content}) {
4323 !!!cp ('t110');
4324 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4325 ->set_user_data (manakai_has_reference =>
4326 $token->{attributes}->{content}
4327 ->{has_reference});
4328 }
4329 }
4330
4331 pop @{$self->{open_elements}} # <head>
4332 if $self->{insertion_mode} == AFTER_HEAD_IM;
4333 !!!ack ('t110.1');
4334 !!!next-token;
4335 next B;
4336 } elsif ($token->{tag_name} eq 'title') {
4337 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4338 !!!cp ('t111');
4339 ## As if </noscript>
4340 pop @{$self->{open_elements}};
4341 !!!parse-error (type => 'in noscript:title', token => $token);
4342
4343 $self->{insertion_mode} = IN_HEAD_IM;
4344 ## Reprocess in the "in head" insertion mode...
4345 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4346 !!!cp ('t112');
4347 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4348 push @{$self->{open_elements}},
4349 [$self->{head_element}, $el_category->{head}];
4350 } else {
4351 !!!cp ('t113');
4352 }
4353
4354 ## NOTE: There is a "as if in head" code clone.
4355 my $parent = defined $self->{head_element} ? $self->{head_element}
4356 : $self->{open_elements}->[-1]->[0];
4357 $parse_rcdata->(RCDATA_CONTENT_MODEL);
4358 pop @{$self->{open_elements}} # <head>
4359 if $self->{insertion_mode} == AFTER_HEAD_IM;
4360 next B;
4361 } elsif ($token->{tag_name} eq 'style' or
4362 $token->{tag_name} eq 'noframes') {
4363 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
4364 ## insertion mode IN_HEAD_IM)
4365 ## NOTE: There is a "as if in head" code clone.
4366 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4367 !!!cp ('t114');
4368 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4369 push @{$self->{open_elements}},
4370 [$self->{head_element}, $el_category->{head}];
4371 } else {
4372 !!!cp ('t115');
4373 }
4374 $parse_rcdata->(CDATA_CONTENT_MODEL);
4375 pop @{$self->{open_elements}} # <head>
4376 if $self->{insertion_mode} == AFTER_HEAD_IM;
4377 next B;
4378 } elsif ($token->{tag_name} eq 'noscript') {
4379 if ($self->{insertion_mode} == IN_HEAD_IM) {
4380 !!!cp ('t116');
4381 ## NOTE: and scripting is disabled
4382 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4383 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
4384 !!!nack ('t116.1');
4385 !!!next-token;
4386 next B;
4387 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4388 !!!cp ('t117');
4389 !!!parse-error (type => 'in noscript:noscript', token => $token);
4390 ## Ignore the token
4391 !!!nack ('t117.1');
4392 !!!next-token;
4393 next B;
4394 } else {
4395 !!!cp ('t118');
4396 #
4397 }
4398 } elsif ($token->{tag_name} eq 'script') {
4399 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4400 !!!cp ('t119');
4401 ## As if </noscript>
4402 pop @{$self->{open_elements}};
4403 !!!parse-error (type => 'in noscript:script', token => $token);
4404
4405 $self->{insertion_mode} = IN_HEAD_IM;
4406 ## Reprocess in the "in head" insertion mode...
4407 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4408 !!!cp ('t120');
4409 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4410 push @{$self->{open_elements}},
4411 [$self->{head_element}, $el_category->{head}];
4412 } else {
4413 !!!cp ('t121');
4414 }
4415
4416 ## NOTE: There is a "as if in head" code clone.
4417 $script_start_tag->();
4418 pop @{$self->{open_elements}} # <head>
4419 if $self->{insertion_mode} == AFTER_HEAD_IM;
4420 next B;
4421 } elsif ($token->{tag_name} eq 'body' or
4422 $token->{tag_name} eq 'frameset') {
4423 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4424 !!!cp ('t122');
4425 ## As if </noscript>
4426 pop @{$self->{open_elements}};
4427 !!!parse-error (type => 'in noscript:'.$token->{tag_name}, token => $token);
4428
4429 ## Reprocess in the "in head" insertion mode...
4430 ## As if </head>
4431 pop @{$self->{open_elements}};
4432
4433 ## Reprocess in the "after head" insertion mode...
4434 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4435 !!!cp ('t124');
4436 pop @{$self->{open_elements}};
4437
4438 ## Reprocess in the "after head" insertion mode...
4439 } else {
4440 !!!cp ('t125');
4441 }
4442
4443 ## "after head" insertion mode
4444 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4445 if ($token->{tag_name} eq 'body') {
4446 !!!cp ('t126');
4447 $self->{insertion_mode} = IN_BODY_IM;
4448 } elsif ($token->{tag_name} eq 'frameset') {
4449 !!!cp ('t127');
4450 $self->{insertion_mode} = IN_FRAMESET_IM;
4451 } else {
4452 die "$0: tag name: $self->{tag_name}";
4453 }
4454 !!!nack ('t127.1');
4455 !!!next-token;
4456 next B;
4457 } else {
4458 !!!cp ('t128');
4459 #
4460 }
4461
4462 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4463 !!!cp ('t129');
4464 ## As if </noscript>
4465 pop @{$self->{open_elements}};
4466 !!!parse-error (type => 'in noscript:/'.$token->{tag_name}, token => $token);
4467
4468 ## Reprocess in the "in head" insertion mode...
4469 ## As if </head>
4470 pop @{$self->{open_elements}};
4471
4472 ## Reprocess in the "after head" insertion mode...
4473 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4474 !!!cp ('t130');
4475 ## As if </head>
4476 pop @{$self->{open_elements}};
4477
4478 ## Reprocess in the "after head" insertion mode...
4479 } else {
4480 !!!cp ('t131');
4481 }
4482
4483 ## "after head" insertion mode
4484 ## As if <body>
4485 !!!insert-element ('body',, $token);
4486 $self->{insertion_mode} = IN_BODY_IM;
4487 ## reprocess
4488 !!!ack-later;
4489 next B;
4490 } elsif ($token->{type} == END_TAG_TOKEN) {
4491 if ($token->{tag_name} eq 'head') {
4492 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4493 !!!cp ('t132');
4494 ## As if <head>
4495 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4496 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4497 push @{$self->{open_elements}},
4498 [$self->{head_element}, $el_category->{head}];
4499
4500 ## Reprocess in the "in head" insertion mode...
4501 pop @{$self->{open_elements}};
4502 $self->{insertion_mode} = AFTER_HEAD_IM;
4503 !!!next-token;
4504 next B;
4505 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4506 !!!cp ('t133');
4507 ## As if </noscript>
4508 pop @{$self->{open_elements}};
4509 !!!parse-error (type => 'in noscript:/head', token => $token);
4510
4511 ## Reprocess in the "in head" insertion mode...
4512 pop @{$self->{open_elements}};
4513 $self->{insertion_mode} = AFTER_HEAD_IM;
4514 !!!next-token;
4515 next B;
4516 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4517 !!!cp ('t134');
4518 pop @{$self->{open_elements}};
4519 $self->{insertion_mode} = AFTER_HEAD_IM;
4520 !!!next-token;
4521 next B;
4522 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4523 !!!cp ('t134.1');
4524 !!!parse-error (type => 'unmatched end tag:head', token => $token);
4525 ## Ignore the token
4526 !!!next-token;
4527 next B;
4528 } else {
4529 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4530 }
4531 } elsif ($token->{tag_name} eq 'noscript') {
4532 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4533 !!!cp ('t136');
4534 pop @{$self->{open_elements}};
4535 $self->{insertion_mode} = IN_HEAD_IM;
4536 !!!next-token;
4537 next B;
4538 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM or
4539 $self->{insertion_mode} == AFTER_HEAD_IM) {
4540 !!!cp ('t137');
4541 !!!parse-error (type => 'unmatched end tag:noscript', token => $token);
4542 ## Ignore the token ## ISSUE: An issue in the spec.
4543 !!!next-token;
4544 next B;
4545 } else {
4546 !!!cp ('t138');
4547 #
4548 }
4549 } elsif ({
4550 body => 1, html => 1,
4551 }->{$token->{tag_name}}) {
4552 if ($self->{insertion_mode} == BEFORE_HEAD_IM or
4553 $self->{insertion_mode} == IN_HEAD_IM or
4554 $self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4555 !!!cp ('t140');
4556 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4557 ## Ignore the token
4558 !!!next-token;
4559 next B;
4560 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4561 !!!cp ('t140.1');
4562 !!!parse-error (type => 'unmatched end tag:' . $token->{tag_name}, token => $token);
4563 ## Ignore the token
4564 !!!next-token;
4565 next B;
4566 } else {
4567 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4568 }
4569 } elsif ($token->{tag_name} eq 'p') {
4570 !!!cp ('t142');
4571 !!!parse-error (type => 'unmatched end tag:p', token => $token);
4572 ## Ignore the token
4573 !!!next-token;
4574 next B;
4575 } elsif ($token->{tag_name} eq 'br') {
4576 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4577 !!!cp ('t142.2');
4578 ## (before head) as if <head>, (in head) as if </head>
4579 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4580 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4581 $self->{insertion_mode} = AFTER_HEAD_IM;
4582
4583 ## Reprocess in the "after head" insertion mode...
4584 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4585 !!!cp ('t143.2');
4586 ## As if </head>
4587 pop @{$self->{open_elements}};
4588 $self->{insertion_mode} = AFTER_HEAD_IM;
4589
4590 ## Reprocess in the "after head" insertion mode...
4591 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4592 !!!cp ('t143.3');
4593 ## ISSUE: Two parse errors for <head><noscript></br>
4594 !!!parse-error (type => 'unmatched end tag:br', token => $token);
4595 ## As if </noscript>
4596 pop @{$self->{open_elements}};
4597 $self->{insertion_mode} = IN_HEAD_IM;
4598
4599 ## Reprocess in the "in head" insertion mode...
4600 ## As if </head>
4601 pop @{$self->{open_elements}};
4602 $self->{insertion_mode} = AFTER_HEAD_IM;
4603
4604 ## Reprocess in the "after head" insertion mode...
4605 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4606 !!!cp ('t143.4');
4607 #
4608 } else {
4609 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4610 }
4611
4612 ## ISSUE: does not agree with IE7 - it doesn't ignore </br>.
4613 !!!parse-error (type => 'unmatched end tag:br', token => $token);
4614 ## Ignore the token
4615 !!!next-token;
4616 next B;
4617 } else {
4618 !!!cp ('t145');
4619 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4620 ## Ignore the token
4621 !!!next-token;
4622 next B;
4623 }
4624
4625 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4626 !!!cp ('t146');
4627 ## As if </noscript>
4628 pop @{$self->{open_elements}};
4629 !!!parse-error (type => 'in noscript:/'.$token->{tag_name}, token => $token);
4630
4631 ## Reprocess in the "in head" insertion mode...
4632 ## As if </head>
4633 pop @{$self->{open_elements}};
4634
4635 ## Reprocess in the "after head" insertion mode...
4636 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4637 !!!cp ('t147');
4638 ## As if </head>
4639 pop @{$self->{open_elements}};
4640
4641 ## Reprocess in the "after head" insertion mode...
4642 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4643 ## ISSUE: This case cannot be reached?
4644 !!!cp ('t148');
4645 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4646 ## Ignore the token ## ISSUE: An issue in the spec.
4647 !!!next-token;
4648 next B;
4649 } else {
4650 !!!cp ('t149');
4651 }
4652
4653 ## "after head" insertion mode
4654 ## As if <body>
4655 !!!insert-element ('body',, $token);
4656 $self->{insertion_mode} = IN_BODY_IM;
4657 ## reprocess
4658 next B;
4659 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4660 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4661 !!!cp ('t149.1');
4662
4663 ## NOTE: As if <head>
4664 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4665 $self->{open_elements}->[-1]->[0]->append_child
4666 ($self->{head_element});
4667 #push @{$self->{open_elements}},
4668 # [$self->{head_element}, $el_category->{head}];
4669 #$self->{insertion_mode} = IN_HEAD_IM;
4670 ## NOTE: Reprocess.
4671
4672 ## NOTE: As if </head>
4673 #pop @{$self->{open_elements}};
4674 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4675 ## NOTE: Reprocess.
4676
4677 #
4678 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4679 !!!cp ('t149.2');
4680
4681 ## NOTE: As if </head>
4682 pop @{$self->{open_elements}};
4683 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4684 ## NOTE: Reprocess.
4685
4686 #
4687 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4688 !!!cp ('t149.3');
4689
4690 !!!parse-error (type => 'in noscript:#eof', token => $token);
4691
4692 ## As if </noscript>
4693 pop @{$self->{open_elements}};
4694 #$self->{insertion_mode} = IN_HEAD_IM;
4695 ## NOTE: Reprocess.
4696
4697 ## NOTE: As if </head>
4698 pop @{$self->{open_elements}};
4699 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4700 ## NOTE: Reprocess.
4701
4702 #
4703 } else {
4704 !!!cp ('t149.4');
4705 #
4706 }
4707
4708 ## NOTE: As if <body>
4709 !!!insert-element ('body',, $token);
4710 $self->{insertion_mode} = IN_BODY_IM;
4711 ## NOTE: Reprocess.
4712 next B;
4713 } else {
4714 die "$0: $token->{type}: Unknown token type";
4715 }
4716
4717 ## ISSUE: An issue in the spec.
4718 } elsif ($self->{insertion_mode} & BODY_IMS) {
4719 if ($token->{type} == CHARACTER_TOKEN) {
4720 !!!cp ('t150');
4721 ## NOTE: There is a code clone of "character in body".
4722 $reconstruct_active_formatting_elements->($insert_to_current);
4723
4724 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4725
4726 !!!next-token;
4727 next B;
4728 } elsif ($token->{type} == START_TAG_TOKEN) {
4729 if ({
4730 caption => 1, col => 1, colgroup => 1, tbody => 1,
4731 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
4732 }->{$token->{tag_name}}) {
4733 if ($self->{insertion_mode} == IN_CELL_IM) {
4734 ## have an element in table scope
4735 for (reverse 0..$#{$self->{open_elements}}) {
4736 my $node = $self->{open_elements}->[$_];
4737 if ($node->[1] & TABLE_CELL_EL) {
4738 !!!cp ('t151');
4739
4740 ## Close the cell
4741 !!!back-token; # <x>
4742 $token = {type => END_TAG_TOKEN,
4743 tag_name => $node->[0]->manakai_local_name,
4744 line => $token->{line},
4745 column => $token->{column}};
4746 next B;
4747 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4748 !!!cp ('t152');
4749 ## ISSUE: This case can never be reached, maybe.
4750 last;
4751 }
4752 }
4753
4754 !!!cp ('t153');
4755 !!!parse-error (type => 'start tag not allowed',
4756 value => $token->{tag_name}, token => $token);
4757 ## Ignore the token
4758 !!!nack ('t153.1');
4759 !!!next-token;
4760 next B;
4761 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4762 !!!parse-error (type => 'not closed:caption', token => $token);
4763
4764 ## NOTE: As if </caption>.
4765 ## have a table element in table scope
4766 my $i;
4767 INSCOPE: {
4768 for (reverse 0..$#{$self->{open_elements}}) {
4769 my $node = $self->{open_elements}->[$_];
4770 if ($node->[1] & CAPTION_EL) {
4771 !!!cp ('t155');
4772 $i = $_;
4773 last INSCOPE;
4774 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4775 !!!cp ('t156');
4776 last;
4777 }
4778 }
4779
4780 !!!cp ('t157');
4781 !!!parse-error (type => 'start tag not allowed',
4782 value => $token->{tag_name}, token => $token);
4783 ## Ignore the token
4784 !!!nack ('t157.1');
4785 !!!next-token;
4786 next B;
4787 } # INSCOPE
4788
4789 ## generate implied end tags
4790 while ($self->{open_elements}->[-1]->[1]
4791 & END_TAG_OPTIONAL_EL) {
4792 !!!cp ('t158');
4793 pop @{$self->{open_elements}};
4794 }
4795
4796 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4797 !!!cp ('t159');
4798 !!!parse-error (type => 'not closed',
4799 value => $self->{open_elements}->[-1]->[0]
4800 ->manakai_local_name,
4801 token => $token);
4802 } else {
4803 !!!cp ('t160');
4804 }
4805
4806 splice @{$self->{open_elements}}, $i;
4807
4808 $clear_up_to_marker->();
4809
4810 $self->{insertion_mode} = IN_TABLE_IM;
4811
4812 ## reprocess
4813 !!!ack-later;
4814 next B;
4815 } else {
4816 !!!cp ('t161');
4817 #
4818 }
4819 } else {
4820 !!!cp ('t162');
4821 #
4822 }
4823 } elsif ($token->{type} == END_TAG_TOKEN) {
4824 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
4825 if ($self->{insertion_mode} == IN_CELL_IM) {
4826 ## have an element in table scope
4827 my $i;
4828 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4829 my $node = $self->{open_elements}->[$_];
4830 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
4831 !!!cp ('t163');
4832 $i = $_;
4833 last INSCOPE;
4834 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4835 !!!cp ('t164');
4836 last INSCOPE;
4837 }
4838 } # INSCOPE
4839 unless (defined $i) {
4840 !!!cp ('t165');
4841 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4842 ## Ignore the token
4843 !!!next-token;
4844 next B;
4845 }
4846
4847 ## generate implied end tags
4848 while ($self->{open_elements}->[-1]->[1]
4849 & END_TAG_OPTIONAL_EL) {
4850 !!!cp ('t166');
4851 pop @{$self->{open_elements}};
4852 }
4853
4854 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
4855 ne $token->{tag_name}) {
4856 !!!cp ('t167');
4857 !!!parse-error (type => 'not closed',
4858 value => $self->{open_elements}->[-1]->[0]
4859 ->manakai_local_name,
4860 token => $token);
4861 } else {
4862 !!!cp ('t168');
4863 }
4864
4865 splice @{$self->{open_elements}}, $i;
4866
4867 $clear_up_to_marker->();
4868
4869 $self->{insertion_mode} = IN_ROW_IM;
4870
4871 !!!next-token;
4872 next B;
4873 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4874 !!!cp ('t169');
4875 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4876 ## Ignore the token
4877 !!!next-token;
4878 next B;
4879 } else {
4880 !!!cp ('t170');
4881 #
4882 }
4883 } elsif ($token->{tag_name} eq 'caption') {
4884 if ($self->{insertion_mode} == IN_CAPTION_IM) {
4885 ## have a table element in table scope
4886 my $i;
4887 INSCOPE: {
4888 for (reverse 0..$#{$self->{open_elements}}) {
4889 my $node = $self->{open_elements}->[$_];
4890 if ($node->[1] & CAPTION_EL) {
4891 !!!cp ('t171');
4892 $i = $_;
4893 last INSCOPE;
4894 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4895 !!!cp ('t172');
4896 last;
4897 }
4898 }
4899
4900 !!!cp ('t173');
4901 !!!parse-error (type => 'unmatched end tag',
4902 value => $token->{tag_name}, token => $token);
4903 ## Ignore the token
4904 !!!next-token;
4905 next B;
4906 } # INSCOPE
4907
4908 ## generate implied end tags
4909 while ($self->{open_elements}->[-1]->[1]
4910 & END_TAG_OPTIONAL_EL) {
4911 !!!cp ('t174');
4912 pop @{$self->{open_elements}};
4913 }
4914
4915 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4916 !!!cp ('t175');
4917 !!!parse-error (type => 'not closed',
4918 value => $self->{open_elements}->[-1]->[0]
4919 ->manakai_local_name,
4920 token => $token);
4921 } else {
4922 !!!cp ('t176');
4923 }
4924
4925 splice @{$self->{open_elements}}, $i;
4926
4927 $clear_up_to_marker->();
4928
4929 $self->{insertion_mode} = IN_TABLE_IM;
4930
4931 !!!next-token;
4932 next B;
4933 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
4934 !!!cp ('t177');
4935 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4936 ## Ignore the token
4937 !!!next-token;
4938 next B;
4939 } else {
4940 !!!cp ('t178');
4941 #
4942 }
4943 } elsif ({
4944 table => 1, tbody => 1, tfoot => 1,
4945 thead => 1, tr => 1,
4946 }->{$token->{tag_name}} and
4947 $self->{insertion_mode} == IN_CELL_IM) {
4948 ## have an element in table scope
4949 my $i;
4950 my $tn;
4951 INSCOPE: {
4952 for (reverse 0..$#{$self->{open_elements}}) {
4953 my $node = $self->{open_elements}->[$_];
4954 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
4955 !!!cp ('t179');
4956 $i = $_;
4957
4958 ## Close the cell
4959 !!!back-token; # </x>
4960 $token = {type => END_TAG_TOKEN, tag_name => $tn,
4961 line => $token->{line},
4962 column => $token->{column}};
4963 next B;
4964 } elsif ($node->[1] & TABLE_CELL_EL) {
4965 !!!cp ('t180');
4966 $tn = $node->[0]->manakai_local_name;
4967 ## NOTE: There is exactly one |td| or |th| element
4968 ## in scope in the stack of open elements by definition.
4969 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4970 ## ISSUE: Can this be reached?
4971 !!!cp ('t181');
4972 last;
4973 }
4974 }
4975
4976 !!!cp ('t182');
4977 !!!parse-error (type => 'unmatched end tag',
4978 value => $token->{tag_name}, token => $token);
4979 ## Ignore the token
4980 !!!next-token;
4981 next B;
4982 } # INSCOPE
4983 } elsif ($token->{tag_name} eq 'table' and
4984 $self->{insertion_mode} == IN_CAPTION_IM) {
4985 !!!parse-error (type => 'not closed:caption', token => $token);
4986
4987 ## As if </caption>
4988 ## have a table element in table scope
4989 my $i;
4990 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4991 my $node = $self->{open_elements}->[$_];
4992 if ($node->[1] & CAPTION_EL) {
4993 !!!cp ('t184');
4994 $i = $_;
4995 last INSCOPE;
4996 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4997 !!!cp ('t185');
4998 last INSCOPE;
4999 }
5000 } # INSCOPE
5001 unless (defined $i) {
5002 !!!cp ('t186');
5003 !!!parse-error (type => 'unmatched end tag:caption', token => $token);
5004 ## Ignore the token
5005 !!!next-token;
5006 next B;
5007 }
5008
5009 ## generate implied end tags
5010 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5011 !!!cp ('t187');
5012 pop @{$self->{open_elements}};
5013 }
5014
5015 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5016 !!!cp ('t188');
5017 !!!parse-error (type => 'not closed',
5018 value => $self->{open_elements}->[-1]->[0]
5019 ->manakai_local_name,
5020 token => $token);
5021 } else {
5022 !!!cp ('t189');
5023 }
5024
5025 splice @{$self->{open_elements}}, $i;
5026
5027 $clear_up_to_marker->();
5028
5029 $self->{insertion_mode} = IN_TABLE_IM;
5030
5031 ## reprocess
5032 next B;
5033 } elsif ({
5034 body => 1, col => 1, colgroup => 1, html => 1,
5035 }->{$token->{tag_name}}) {
5036 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
5037 !!!cp ('t190');
5038 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5039 ## Ignore the token
5040 !!!next-token;
5041 next B;
5042 } else {
5043 !!!cp ('t191');
5044 #
5045 }
5046 } elsif ({
5047 tbody => 1, tfoot => 1,
5048 thead => 1, tr => 1,
5049 }->{$token->{tag_name}} and
5050 $self->{insertion_mode} == IN_CAPTION_IM) {
5051 !!!cp ('t192');
5052 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5053 ## Ignore the token
5054 !!!next-token;
5055 next B;
5056 } else {
5057 !!!cp ('t193');
5058 #
5059 }
5060 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5061 for my $entry (@{$self->{open_elements}}) {
5062 unless ($entry->[1] & ALL_END_TAG_OPTIONAL_EL) {
5063 !!!cp ('t75');
5064 !!!parse-error (type => 'in body:#eof', token => $token);
5065 last;
5066 }
5067 }
5068
5069 ## Stop parsing.
5070 last B;
5071 } else {
5072 die "$0: $token->{type}: Unknown token type";
5073 }
5074
5075 $insert = $insert_to_current;
5076 #
5077 } elsif ($self->{insertion_mode} & TABLE_IMS) {
5078 if ($token->{type} == CHARACTER_TOKEN) {
5079 if (not $open_tables->[-1]->[1] and # tainted
5080 $token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5081 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5082
5083 unless (length $token->{data}) {
5084 !!!cp ('t194');
5085 !!!next-token;
5086 next B;
5087 } else {
5088 !!!cp ('t195');
5089 }
5090 }
5091
5092 !!!parse-error (type => 'in table:#character', token => $token);
5093
5094 ## As if in body, but insert into foster parent element
5095 ## ISSUE: Spec says that "whenever a node would be inserted
5096 ## into the current node" while characters might not
5097 ## result in a new Text node.
5098 $reconstruct_active_formatting_elements->($insert_to_foster);
5099
5100 if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
5101 # MUST
5102 my $foster_parent_element;
5103 my $next_sibling;
5104 my $prev_sibling;
5105 OE: for (reverse 0..$#{$self->{open_elements}}) {
5106 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
5107 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
5108 if (defined $parent and $parent->node_type == 1) {
5109 !!!cp ('t196');
5110 $foster_parent_element = $parent;
5111 $next_sibling = $self->{open_elements}->[$_]->[0];
5112 $prev_sibling = $next_sibling->previous_sibling;
5113 } else {
5114 !!!cp ('t197');
5115 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
5116 $prev_sibling = $foster_parent_element->last_child;
5117 }
5118 last OE;
5119 }
5120 } # OE
5121 $foster_parent_element = $self->{open_elements}->[0]->[0] and
5122 $prev_sibling = $foster_parent_element->last_child
5123 unless defined $foster_parent_element;
5124 if (defined $prev_sibling and
5125 $prev_sibling->node_type == 3) {
5126 !!!cp ('t198');
5127 $prev_sibling->manakai_append_text ($token->{data});
5128 } else {
5129 !!!cp ('t199');
5130 $foster_parent_element->insert_before
5131 ($self->{document}->create_text_node ($token->{data}),
5132 $next_sibling);
5133 }
5134 $open_tables->[-1]->[1] = 1; # tainted
5135 } else {
5136 !!!cp ('t200');
5137 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5138 }
5139
5140 !!!next-token;
5141 next B;
5142 } elsif ($token->{type} == START_TAG_TOKEN) {
5143 if ({
5144 tr => ($self->{insertion_mode} != IN_ROW_IM),
5145 th => 1, td => 1,
5146 }->{$token->{tag_name}}) {
5147 if ($self->{insertion_mode} == IN_TABLE_IM) {
5148 ## Clear back to table context
5149 while (not ($self->{open_elements}->[-1]->[1]
5150 & TABLE_SCOPING_EL)) {
5151 !!!cp ('t201');
5152 pop @{$self->{open_elements}};
5153 }
5154
5155 !!!insert-element ('tbody',, $token);
5156 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5157 ## reprocess in the "in table body" insertion mode...
5158 }
5159
5160 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5161 unless ($token->{tag_name} eq 'tr') {
5162 !!!cp ('t202');
5163 !!!parse-error (type => 'missing start tag:tr', token => $token);
5164 }
5165
5166 ## Clear back to table body context
5167 while (not ($self->{open_elements}->[-1]->[1]
5168 & TABLE_ROWS_SCOPING_EL)) {
5169 !!!cp ('t203');
5170 ## ISSUE: Can this case be reached?
5171 pop @{$self->{open_elements}};
5172 }
5173
5174 $self->{insertion_mode} = IN_ROW_IM;
5175 if ($token->{tag_name} eq 'tr') {
5176 !!!cp ('t204');
5177 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5178 !!!nack ('t204');
5179 !!!next-token;
5180 next B;
5181 } else {
5182 !!!cp ('t205');
5183 !!!insert-element ('tr',, $token);
5184 ## reprocess in the "in row" insertion mode
5185 }
5186 } else {
5187 !!!cp ('t206');
5188 }
5189
5190 ## Clear back to table row context
5191 while (not ($self->{open_elements}->[-1]->[1]
5192 & TABLE_ROW_SCOPING_EL)) {
5193 !!!cp ('t207');
5194 pop @{$self->{open_elements}};
5195 }
5196
5197 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5198 $self->{insertion_mode} = IN_CELL_IM;
5199
5200 push @$active_formatting_elements, ['#marker', ''];
5201
5202 !!!nack ('t207.1');
5203 !!!next-token;
5204 next B;
5205 } elsif ({
5206 caption => 1, col => 1, colgroup => 1,
5207 tbody => 1, tfoot => 1, thead => 1,
5208 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5209 }->{$token->{tag_name}}) {
5210 if ($self->{insertion_mode} == IN_ROW_IM) {
5211 ## As if </tr>
5212 ## have an element in table scope
5213 my $i;
5214 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5215 my $node = $self->{open_elements}->[$_];
5216 if ($node->[1] & TABLE_ROW_EL) {
5217 !!!cp ('t208');
5218 $i = $_;
5219 last INSCOPE;
5220 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5221 !!!cp ('t209');
5222 last INSCOPE;
5223 }
5224 } # INSCOPE
5225 unless (defined $i) {
5226 !!!cp ('t210');
5227 ## TODO: This type is wrong.
5228 !!!parse-error (type => 'unmacthed end tag:'.$token->{tag_name}, token => $token);
5229 ## Ignore the token
5230 !!!nack ('t210.1');
5231 !!!next-token;
5232 next B;
5233 }
5234
5235 ## Clear back to table row context
5236 while (not ($self->{open_elements}->[-1]->[1]
5237 & TABLE_ROW_SCOPING_EL)) {
5238 !!!cp ('t211');
5239 ## ISSUE: Can this case be reached?
5240 pop @{$self->{open_elements}};
5241 }
5242
5243 pop @{$self->{open_elements}}; # tr
5244 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5245 if ($token->{tag_name} eq 'tr') {
5246 !!!cp ('t212');
5247 ## reprocess
5248 !!!ack-later;
5249 next B;
5250 } else {
5251 !!!cp ('t213');
5252 ## reprocess in the "in table body" insertion mode...
5253 }
5254 }
5255
5256 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5257 ## have an element in table scope
5258 my $i;
5259 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5260 my $node = $self->{open_elements}->[$_];
5261 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5262 !!!cp ('t214');
5263 $i = $_;
5264 last INSCOPE;
5265 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5266 !!!cp ('t215');
5267 last INSCOPE;
5268 }
5269 } # INSCOPE
5270 unless (defined $i) {
5271 !!!cp ('t216');
5272 ## TODO: This error type is wrong.
5273 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5274 ## Ignore the token
5275 !!!nack ('t216.1');
5276 !!!next-token;
5277 next B;
5278 }
5279
5280 ## Clear back to table body context
5281 while (not ($self->{open_elements}->[-1]->[1]
5282 & TABLE_ROWS_SCOPING_EL)) {
5283 !!!cp ('t217');
5284 ## ISSUE: Can this state be reached?
5285 pop @{$self->{open_elements}};
5286 }
5287
5288 ## As if <{current node}>
5289 ## have an element in table scope
5290 ## true by definition
5291
5292 ## Clear back to table body context
5293 ## nop by definition
5294
5295 pop @{$self->{open_elements}};
5296 $self->{insertion_mode} = IN_TABLE_IM;
5297 ## reprocess in "in table" insertion mode...
5298 } else {
5299 !!!cp ('t218');
5300 }
5301
5302 if ($token->{tag_name} eq 'col') {
5303 ## Clear back to table context
5304 while (not ($self->{open_elements}->[-1]->[1]
5305 & TABLE_SCOPING_EL)) {
5306 !!!cp ('t219');
5307 ## ISSUE: Can this state be reached?
5308 pop @{$self->{open_elements}};
5309 }
5310
5311 !!!insert-element ('colgroup',, $token);
5312 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
5313 ## reprocess
5314 !!!ack-later;
5315 next B;
5316 } elsif ({
5317 caption => 1,
5318 colgroup => 1,
5319 tbody => 1, tfoot => 1, thead => 1,
5320 }->{$token->{tag_name}}) {
5321 ## Clear back to table context
5322 while (not ($self->{open_elements}->[-1]->[1]
5323 & TABLE_SCOPING_EL)) {
5324 !!!cp ('t220');
5325 ## ISSUE: Can this state be reached?
5326 pop @{$self->{open_elements}};
5327 }
5328
5329 push @$active_formatting_elements, ['#marker', '']
5330 if $token->{tag_name} eq 'caption';
5331
5332 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5333 $self->{insertion_mode} = {
5334 caption => IN_CAPTION_IM,
5335 colgroup => IN_COLUMN_GROUP_IM,
5336 tbody => IN_TABLE_BODY_IM,
5337 tfoot => IN_TABLE_BODY_IM,
5338 thead => IN_TABLE_BODY_IM,
5339 }->{$token->{tag_name}};
5340 !!!next-token;
5341 !!!nack ('t220.1');
5342 next B;
5343 } else {
5344 die "$0: in table: <>: $token->{tag_name}";
5345 }
5346 } elsif ($token->{tag_name} eq 'table') {
5347 !!!parse-error (type => 'not closed',
5348 value => $self->{open_elements}->[-1]->[0]
5349 ->manakai_local_name,
5350 token => $token);
5351
5352 ## As if </table>
5353 ## have a table element in table scope
5354 my $i;
5355 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5356 my $node = $self->{open_elements}->[$_];
5357 if ($node->[1] & TABLE_EL) {
5358 !!!cp ('t221');
5359 $i = $_;
5360 last INSCOPE;
5361 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5362 !!!cp ('t222');
5363 last INSCOPE;
5364 }
5365 } # INSCOPE
5366 unless (defined $i) {
5367 !!!cp ('t223');
5368 ## TODO: The following is wrong, maybe.
5369 !!!parse-error (type => 'unmatched end tag:table', token => $token);
5370 ## Ignore tokens </table><table>
5371 !!!nack ('t223.1');
5372 !!!next-token;
5373 next B;
5374 }
5375
5376 ## TODO: The following steps have been removed from the latest spec.
5377 ## generate implied end tags
5378 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5379 !!!cp ('t224');
5380 pop @{$self->{open_elements}};
5381 }
5382
5383 unless ($self->{open_elements}->[-1]->[1] & TABLE_EL) {
5384 !!!cp ('t225');
5385 ## NOTE: |<table><tr><table>|
5386 !!!parse-error (type => 'not closed',
5387 value => $self->{open_elements}->[-1]->[0]
5388 ->manakai_local_name,
5389 token => $token);
5390 } else {
5391 !!!cp ('t226');
5392 }
5393
5394 splice @{$self->{open_elements}}, $i;
5395 pop @{$open_tables};
5396
5397 $self->_reset_insertion_mode;
5398
5399 ## reprocess
5400 !!!ack-later;
5401 next B;
5402 } elsif ($token->{tag_name} eq 'style') {
5403 if (not $open_tables->[-1]->[1]) { # tainted
5404 !!!cp ('t227.8');
5405 ## NOTE: This is a "as if in head" code clone.
5406 $parse_rcdata->(CDATA_CONTENT_MODEL);
5407 next B;
5408 } else {
5409 !!!cp ('t227.7');
5410 #
5411 }
5412 } elsif ($token->{tag_name} eq 'script') {
5413 if (not $open_tables->[-1]->[1]) { # tainted
5414 !!!cp ('t227.6');
5415 ## NOTE: This is a "as if in head" code clone.
5416 $script_start_tag->();
5417 next B;
5418 } else {
5419 !!!cp ('t227.5');
5420 #
5421 }
5422 } elsif ($token->{tag_name} eq 'input') {
5423 if (not $open_tables->[-1]->[1]) { # tainted
5424 if ($token->{attributes}->{type}) { ## TODO: case
5425 my $type = lc $token->{attributes}->{type}->{value};
5426 if ($type eq 'hidden') {
5427 !!!cp ('t227.3');
5428 !!!parse-error (type => 'in table:'.$token->{tag_name}, token => $token);
5429
5430 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5431
5432 ## TODO: form element pointer
5433
5434 pop @{$self->{open_elements}};
5435
5436 !!!next-token;
5437 !!!ack ('t227.2.1');
5438 next B;
5439 } else {
5440 !!!cp ('t227.2');
5441 #
5442 }
5443 } else {
5444 !!!cp ('t227.1');
5445 #
5446 }
5447 } else {
5448 !!!cp ('t227.4');
5449 #
5450 }
5451 } else {
5452 !!!cp ('t227');
5453 #
5454 }
5455
5456 !!!parse-error (type => 'in table:'.$token->{tag_name}, token => $token);
5457
5458 $insert = $insert_to_foster;
5459 #
5460 } elsif ($token->{type} == END_TAG_TOKEN) {
5461 if ($token->{tag_name} eq 'tr' and
5462 $self->{insertion_mode} == IN_ROW_IM) {
5463 ## have an element in table scope
5464 my $i;
5465 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5466 my $node = $self->{open_elements}->[$_];
5467 if ($node->[1] & TABLE_ROW_EL) {
5468 !!!cp ('t228');
5469 $i = $_;
5470 last INSCOPE;
5471 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5472 !!!cp ('t229');
5473 last INSCOPE;
5474 }
5475 } # INSCOPE
5476 unless (defined $i) {
5477 !!!cp ('t230');
5478 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5479 ## Ignore the token
5480 !!!nack ('t230.1');
5481 !!!next-token;
5482 next B;
5483 } else {
5484 !!!cp ('t232');
5485 }
5486
5487 ## Clear back to table row context
5488 while (not ($self->{open_elements}->[-1]->[1]
5489 & TABLE_ROW_SCOPING_EL)) {
5490 !!!cp ('t231');
5491 ## ISSUE: Can this state be reached?
5492 pop @{$self->{open_elements}};
5493 }
5494
5495 pop @{$self->{open_elements}}; # tr
5496 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5497 !!!next-token;
5498 !!!nack ('t231.1');
5499 next B;
5500 } elsif ($token->{tag_name} eq 'table') {
5501 if ($self->{insertion_mode} == IN_ROW_IM) {
5502 ## As if </tr>
5503 ## have an element in table scope
5504 my $i;
5505 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5506 my $node = $self->{open_elements}->[$_];
5507 if ($node->[1] & TABLE_ROW_EL) {
5508 !!!cp ('t233');
5509 $i = $_;
5510 last INSCOPE;
5511 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5512 !!!cp ('t234');
5513 last INSCOPE;
5514 }
5515 } # INSCOPE
5516 unless (defined $i) {
5517 !!!cp ('t235');
5518 ## TODO: The following is wrong.
5519 !!!parse-error (type => 'unmatched end tag:'.$token->{type}, token => $token);
5520 ## Ignore the token
5521 !!!nack ('t236.1');
5522 !!!next-token;
5523 next B;
5524 }
5525
5526 ## Clear back to table row context
5527 while (not ($self->{open_elements}->[-1]->[1]
5528 & TABLE_ROW_SCOPING_EL)) {
5529 !!!cp ('t236');
5530 ## ISSUE: Can this state be reached?
5531 pop @{$self->{open_elements}};
5532 }
5533
5534 pop @{$self->{open_elements}}; # tr
5535 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5536 ## reprocess in the "in table body" insertion mode...
5537 }
5538
5539 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5540 ## have an element in table scope
5541 my $i;
5542 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5543 my $node = $self->{open_elements}->[$_];
5544 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5545 !!!cp ('t237');
5546 $i = $_;
5547 last INSCOPE;
5548 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5549 !!!cp ('t238');
5550 last INSCOPE;
5551 }
5552 } # INSCOPE
5553 unless (defined $i) {
5554 !!!cp ('t239');
5555 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5556 ## Ignore the token
5557 !!!nack ('t239.1');
5558 !!!next-token;
5559 next B;
5560 }
5561
5562 ## Clear back to table body context
5563 while (not ($self->{open_elements}->[-1]->[1]
5564 & TABLE_ROWS_SCOPING_EL)) {
5565 !!!cp ('t240');
5566 pop @{$self->{open_elements}};
5567 }
5568
5569 ## As if <{current node}>
5570 ## have an element in table scope
5571 ## true by definition
5572
5573 ## Clear back to table body context
5574 ## nop by definition
5575
5576 pop @{$self->{open_elements}};
5577 $self->{insertion_mode} = IN_TABLE_IM;
5578 ## reprocess in the "in table" insertion mode...
5579 }
5580
5581 ## NOTE: </table> in the "in table" insertion mode.
5582 ## When you edit the code fragment below, please ensure that
5583 ## the code for <table> in the "in table" insertion mode
5584 ## is synced with it.
5585
5586 ## have a table element in table scope
5587 my $i;
5588 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5589 my $node = $self->{open_elements}->[$_];
5590 if ($node->[1] & TABLE_EL) {
5591 !!!cp ('t241');
5592 $i = $_;
5593 last INSCOPE;
5594 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5595 !!!cp ('t242');
5596 last INSCOPE;
5597 }
5598 } # INSCOPE
5599 unless (defined $i) {
5600 !!!cp ('t243');
5601 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5602 ## Ignore the token
5603 !!!nack ('t243.1');
5604 !!!next-token;
5605 next B;
5606 }
5607
5608 splice @{$self->{open_elements}}, $i;
5609 pop @{$open_tables};
5610
5611 $self->_reset_insertion_mode;
5612
5613 !!!next-token;
5614 next B;
5615 } elsif ({
5616 tbody => 1, tfoot => 1, thead => 1,
5617 }->{$token->{tag_name}} and
5618 $self->{insertion_mode} & ROW_IMS) {
5619 if ($self->{insertion_mode} == IN_ROW_IM) {
5620 ## have an element in table scope
5621 my $i;
5622 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5623 my $node = $self->{open_elements}->[$_];
5624 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5625 !!!cp ('t247');
5626 $i = $_;
5627 last INSCOPE;
5628 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5629 !!!cp ('t248');
5630 last INSCOPE;
5631 }
5632 } # INSCOPE
5633 unless (defined $i) {
5634 !!!cp ('t249');
5635 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5636 ## Ignore the token
5637 !!!nack ('t249.1');
5638 !!!next-token;
5639 next B;
5640 }
5641
5642 ## As if </tr>
5643 ## have an element in table scope
5644 my $i;
5645 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5646 my $node = $self->{open_elements}->[$_];
5647 if ($node->[1] & TABLE_ROW_EL) {
5648 !!!cp ('t250');
5649 $i = $_;
5650 last INSCOPE;
5651 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5652 !!!cp ('t251');
5653 last INSCOPE;
5654 }
5655 } # INSCOPE
5656 unless (defined $i) {
5657 !!!cp ('t252');
5658 !!!parse-error (type => 'unmatched end tag:tr', token => $token);
5659 ## Ignore the token
5660 !!!nack ('t252.1');
5661 !!!next-token;
5662 next B;
5663 }
5664
5665 ## Clear back to table row context
5666 while (not ($self->{open_elements}->[-1]->[1]
5667 & TABLE_ROW_SCOPING_EL)) {
5668 !!!cp ('t253');
5669 ## ISSUE: Can this case be reached?
5670 pop @{$self->{open_elements}};
5671 }
5672
5673 pop @{$self->{open_elements}}; # tr
5674 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5675 ## reprocess in the "in table body" insertion mode...
5676 }
5677
5678 ## have an element in table scope
5679 my $i;
5680 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5681 my $node = $self->{open_elements}->[$_];
5682 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5683 !!!cp ('t254');
5684 $i = $_;
5685 last INSCOPE;
5686 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5687 !!!cp ('t255');
5688 last INSCOPE;
5689 }
5690 } # INSCOPE
5691 unless (defined $i) {
5692 !!!cp ('t256');
5693 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5694 ## Ignore the token
5695 !!!nack ('t256.1');
5696 !!!next-token;
5697 next B;
5698 }
5699
5700 ## Clear back to table body context
5701 while (not ($self->{open_elements}->[-1]->[1]
5702 & TABLE_ROWS_SCOPING_EL)) {
5703 !!!cp ('t257');
5704 ## ISSUE: Can this case be reached?
5705 pop @{$self->{open_elements}};
5706 }
5707
5708 pop @{$self->{open_elements}};
5709 $self->{insertion_mode} = IN_TABLE_IM;
5710 !!!nack ('t257.1');
5711 !!!next-token;
5712 next B;
5713 } elsif ({
5714 body => 1, caption => 1, col => 1, colgroup => 1,
5715 html => 1, td => 1, th => 1,
5716 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5717 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
5718 }->{$token->{tag_name}}) {
5719 !!!cp ('t258');
5720 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5721 ## Ignore the token
5722 !!!nack ('t258.1');
5723 !!!next-token;
5724 next B;
5725 } else {
5726 !!!cp ('t259');
5727 !!!parse-error (type => 'in table:/'.$token->{tag_name}, token => $token);
5728
5729 $insert = $insert_to_foster;
5730 #
5731 }
5732 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5733 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
5734 @{$self->{open_elements}} == 1) { # redundant, maybe
5735 !!!parse-error (type => 'in body:#eof', token => $token);
5736 !!!cp ('t259.1');
5737 #
5738 } else {
5739 !!!cp ('t259.2');
5740 #
5741 }
5742
5743 ## Stop parsing
5744 last B;
5745 } else {
5746 die "$0: $token->{type}: Unknown token type";
5747 }
5748 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
5749 if ($token->{type} == CHARACTER_TOKEN) {
5750 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5751 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5752 unless (length $token->{data}) {
5753 !!!cp ('t260');
5754 !!!next-token;
5755 next B;
5756 }
5757 }
5758
5759 !!!cp ('t261');
5760 #
5761 } elsif ($token->{type} == START_TAG_TOKEN) {
5762 if ($token->{tag_name} eq 'col') {
5763 !!!cp ('t262');
5764 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5765 pop @{$self->{open_elements}};
5766 !!!ack ('t262.1');
5767 !!!next-token;
5768 next B;
5769 } else {
5770 !!!cp ('t263');
5771 #
5772 }
5773 } elsif ($token->{type} == END_TAG_TOKEN) {
5774 if ($token->{tag_name} eq 'colgroup') {
5775 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
5776 !!!cp ('t264');
5777 !!!parse-error (type => 'unmatched end tag:colgroup', token => $token);
5778 ## Ignore the token
5779 !!!next-token;
5780 next B;
5781 } else {
5782 !!!cp ('t265');
5783 pop @{$self->{open_elements}}; # colgroup
5784 $self->{insertion_mode} = IN_TABLE_IM;
5785 !!!next-token;
5786 next B;
5787 }
5788 } elsif ($token->{tag_name} eq 'col') {
5789 !!!cp ('t266');
5790 !!!parse-error (type => 'unmatched end tag:col', token => $token);
5791 ## Ignore the token
5792 !!!next-token;
5793 next B;
5794 } else {
5795 !!!cp ('t267');
5796 #
5797 }
5798 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5799 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
5800 @{$self->{open_elements}} == 1) { # redundant, maybe
5801 !!!cp ('t270.2');
5802 ## Stop parsing.
5803 last B;
5804 } else {
5805 ## NOTE: As if </colgroup>.
5806 !!!cp ('t270.1');
5807 pop @{$self->{open_elements}}; # colgroup
5808 $self->{insertion_mode} = IN_TABLE_IM;
5809 ## Reprocess.
5810 next B;
5811 }
5812 } else {
5813 die "$0: $token->{type}: Unknown token type";
5814 }
5815
5816 ## As if </colgroup>
5817 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
5818 !!!cp ('t269');
5819 ## TODO: Wrong error type?
5820 !!!parse-error (type => 'unmatched end tag:colgroup', token => $token);
5821 ## Ignore the token
5822 !!!nack ('t269.1');
5823 !!!next-token;
5824 next B;
5825 } else {
5826 !!!cp ('t270');
5827 pop @{$self->{open_elements}}; # colgroup
5828 $self->{insertion_mode} = IN_TABLE_IM;
5829 !!!ack-later;
5830 ## reprocess
5831 next B;
5832 }
5833 } elsif ($self->{insertion_mode} & SELECT_IMS) {
5834 if ($token->{type} == CHARACTER_TOKEN) {
5835 !!!cp ('t271');
5836 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5837 !!!next-token;
5838 next B;
5839 } elsif ($token->{type} == START_TAG_TOKEN) {
5840 if ($token->{tag_name} eq 'option') {
5841 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5842 !!!cp ('t272');
5843 ## As if </option>
5844 pop @{$self->{open_elements}};
5845 } else {
5846 !!!cp ('t273');
5847 }
5848
5849 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5850 !!!nack ('t273.1');
5851 !!!next-token;
5852 next B;
5853 } elsif ($token->{tag_name} eq 'optgroup') {
5854 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5855 !!!cp ('t274');
5856 ## As if </option>
5857 pop @{$self->{open_elements}};
5858 } else {
5859 !!!cp ('t275');
5860 }
5861
5862 if ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
5863 !!!cp ('t276');
5864 ## As if </optgroup>
5865 pop @{$self->{open_elements}};
5866 } else {
5867 !!!cp ('t277');
5868 }
5869
5870 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5871 !!!nack ('t277.1');
5872 !!!next-token;
5873 next B;
5874 } elsif ({
5875 select => 1, input => 1, textarea => 1,
5876 }->{$token->{tag_name}} or
5877 ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5878 {
5879 caption => 1, table => 1,
5880 tbody => 1, tfoot => 1, thead => 1,
5881 tr => 1, td => 1, th => 1,
5882 }->{$token->{tag_name}})) {
5883 ## TODO: The type below is not good - <select> is replaced by </select>
5884 !!!parse-error (type => 'not closed:select', token => $token);
5885 ## NOTE: As if the token were </select> (<select> case) or
5886 ## as if there were </select> (otherwise).
5887 ## have an element in table scope
5888 my $i;
5889 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5890 my $node = $self->{open_elements}->[$_];
5891 if ($node->[1] & SELECT_EL) {
5892 !!!cp ('t278');
5893 $i = $_;
5894 last INSCOPE;
5895 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5896 !!!cp ('t279');
5897 last INSCOPE;
5898 }
5899 } # INSCOPE
5900 unless (defined $i) {
5901 !!!cp ('t280');
5902 !!!parse-error (type => 'unmatched end tag:select', token => $token);
5903 ## Ignore the token
5904 !!!nack ('t280.1');
5905 !!!next-token;
5906 next B;
5907 }
5908
5909 !!!cp ('t281');
5910 splice @{$self->{open_elements}}, $i;
5911
5912 $self->_reset_insertion_mode;
5913
5914 if ($token->{tag_name} eq 'select') {
5915 !!!nack ('t281.2');
5916 !!!next-token;
5917 next B;
5918 } else {
5919 !!!cp ('t281.1');
5920 !!!ack-later;
5921 ## Reprocess the token.
5922 next B;
5923 }
5924 } else {
5925 !!!cp ('t282');
5926 !!!parse-error (type => 'in select:'.$token->{tag_name}, token => $token);
5927 ## Ignore the token
5928 !!!nack ('t282.1');
5929 !!!next-token;
5930 next B;
5931 }
5932 } elsif ($token->{type} == END_TAG_TOKEN) {
5933 if ($token->{tag_name} eq 'optgroup') {
5934 if ($self->{open_elements}->[-1]->[1] & OPTION_EL and
5935 $self->{open_elements}->[-2]->[1] & OPTGROUP_EL) {
5936 !!!cp ('t283');
5937 ## As if </option>
5938 splice @{$self->{open_elements}}, -2;
5939 } elsif ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
5940 !!!cp ('t284');
5941 pop @{$self->{open_elements}};
5942 } else {
5943 !!!cp ('t285');
5944 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5945 ## Ignore the token
5946 }
5947 !!!nack ('t285.1');
5948 !!!next-token;
5949 next B;
5950 } elsif ($token->{tag_name} eq 'option') {
5951 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5952 !!!cp ('t286');
5953 pop @{$self->{open_elements}};
5954 } else {
5955 !!!cp ('t287');
5956 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5957 ## Ignore the token
5958 }
5959 !!!nack ('t287.1');
5960 !!!next-token;
5961 next B;
5962 } elsif ($token->{tag_name} eq 'select') {
5963 ## have an element in table scope
5964 my $i;
5965 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5966 my $node = $self->{open_elements}->[$_];
5967 if ($node->[1] & SELECT_EL) {
5968 !!!cp ('t288');
5969 $i = $_;
5970 last INSCOPE;
5971 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5972 !!!cp ('t289');
5973 last INSCOPE;
5974 }
5975 } # INSCOPE
5976 unless (defined $i) {
5977 !!!cp ('t290');
5978 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5979 ## Ignore the token
5980 !!!nack ('t290.1');
5981 !!!next-token;
5982 next B;
5983 }
5984
5985 !!!cp ('t291');
5986 splice @{$self->{open_elements}}, $i;
5987
5988 $self->_reset_insertion_mode;
5989
5990 !!!nack ('t291.1');
5991 !!!next-token;
5992 next B;
5993 } elsif ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5994 {
5995 caption => 1, table => 1, tbody => 1,
5996 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
5997 }->{$token->{tag_name}}) {
5998 ## TODO: The following is wrong?
5999 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6000
6001 ## have an element in table scope
6002 my $i;
6003 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6004 my $node = $self->{open_elements}->[$_];
6005 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6006 !!!cp ('t292');
6007 $i = $_;
6008 last INSCOPE;
6009 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6010 !!!cp ('t293');
6011 last INSCOPE;
6012 }
6013 } # INSCOPE
6014 unless (defined $i) {
6015 !!!cp ('t294');
6016 ## Ignore the token
6017 !!!nack ('t294.1');
6018 !!!next-token;
6019 next B;
6020 }
6021
6022 ## As if </select>
6023 ## have an element in table scope
6024 undef $i;
6025 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6026 my $node = $self->{open_elements}->[$_];
6027 if ($node->[1] & SELECT_EL) {
6028 !!!cp ('t295');
6029 $i = $_;
6030 last INSCOPE;
6031 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6032 ## ISSUE: Can this state be reached?
6033 !!!cp ('t296');
6034 last INSCOPE;
6035 }
6036 } # INSCOPE
6037 unless (defined $i) {
6038 !!!cp ('t297');
6039 ## TODO: Is the following error type correct?
6040 !!!parse-error (type => 'unmatched end tag:select', token => $token);
6041 ## Ignore the </select> token
6042 !!!nack ('t297.1');
6043 !!!next-token; ## TODO: ok?
6044 next B;
6045 }
6046
6047 !!!cp ('t298');
6048 splice @{$self->{open_elements}}, $i;
6049
6050 $self->_reset_insertion_mode;
6051
6052 !!!ack-later;
6053 ## reprocess
6054 next B;
6055 } else {
6056 !!!cp ('t299');
6057 !!!parse-error (type => 'in select:/'.$token->{tag_name}, token => $token);
6058 ## Ignore the token
6059 !!!nack ('t299.3');
6060 !!!next-token;
6061 next B;
6062 }
6063 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6064 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6065 @{$self->{open_elements}} == 1) { # redundant, maybe
6066 !!!cp ('t299.1');
6067 !!!parse-error (type => 'in body:#eof', token => $token);
6068 } else {
6069 !!!cp ('t299.2');
6070 }
6071
6072 ## Stop parsing.
6073 last B;
6074 } else {
6075 die "$0: $token->{type}: Unknown token type";
6076 }
6077 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
6078 if ($token->{type} == CHARACTER_TOKEN) {
6079 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6080 my $data = $1;
6081 ## As if in body
6082 $reconstruct_active_formatting_elements->($insert_to_current);
6083
6084 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6085
6086 unless (length $token->{data}) {
6087 !!!cp ('t300');
6088 !!!next-token;
6089 next B;
6090 }
6091 }
6092
6093 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6094 !!!cp ('t301');
6095 !!!parse-error (type => 'after html:#character', token => $token);
6096
6097 ## Reprocess in the "after body" insertion mode.
6098 } else {
6099 !!!cp ('t302');
6100 }
6101
6102 ## "after body" insertion mode
6103 !!!parse-error (type => 'after body:#character', token => $token);
6104
6105 $self->{insertion_mode} = IN_BODY_IM;
6106 ## reprocess
6107 next B;
6108 } elsif ($token->{type} == START_TAG_TOKEN) {
6109 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6110 !!!cp ('t303');
6111 !!!parse-error (type => 'after html:'.$token->{tag_name}, token => $token);
6112
6113 ## Reprocess in the "after body" insertion mode.
6114 } else {
6115 !!!cp ('t304');
6116 }
6117
6118 ## "after body" insertion mode
6119 !!!parse-error (type => 'after body:'.$token->{tag_name}, token => $token);
6120
6121 $self->{insertion_mode} = IN_BODY_IM;
6122 !!!ack-later;
6123 ## reprocess
6124 next B;
6125 } elsif ($token->{type} == END_TAG_TOKEN) {
6126 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6127 !!!cp ('t305');
6128 !!!parse-error (type => 'after html:/'.$token->{tag_name}, token => $token);
6129
6130 $self->{insertion_mode} = AFTER_BODY_IM;
6131 ## Reprocess in the "after body" insertion mode.
6132 } else {
6133 !!!cp ('t306');
6134 }
6135
6136 ## "after body" insertion mode
6137 if ($token->{tag_name} eq 'html') {
6138 if (defined $self->{inner_html_node}) {
6139 !!!cp ('t307');
6140 !!!parse-error (type => 'unmatched end tag:html', token => $token);
6141 ## Ignore the token
6142 !!!next-token;
6143 next B;
6144 } else {
6145 !!!cp ('t308');
6146 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
6147 !!!next-token;
6148 next B;
6149 }
6150 } else {
6151 !!!cp ('t309');
6152 !!!parse-error (type => 'after body:/'.$token->{tag_name}, token => $token);
6153
6154 $self->{insertion_mode} = IN_BODY_IM;
6155 ## reprocess
6156 next B;
6157 }
6158 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6159 !!!cp ('t309.2');
6160 ## Stop parsing
6161 last B;
6162 } else {
6163 die "$0: $token->{type}: Unknown token type";
6164 }
6165 } elsif ($self->{insertion_mode} & FRAME_IMS) {
6166 if ($token->{type} == CHARACTER_TOKEN) {
6167 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6168 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6169
6170 unless (length $token->{data}) {
6171 !!!cp ('t310');
6172 !!!next-token;
6173 next B;
6174 }
6175 }
6176
6177 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
6178 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6179 !!!cp ('t311');
6180 !!!parse-error (type => 'in frameset:#character', token => $token);
6181 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6182 !!!cp ('t312');
6183 !!!parse-error (type => 'after frameset:#character', token => $token);
6184 } else { # "after html frameset"
6185 !!!cp ('t313');
6186 !!!parse-error (type => 'after html:#character', token => $token);
6187
6188 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6189 ## Reprocess in the "after frameset" insertion mode.
6190 !!!parse-error (type => 'after frameset:#character', token => $token);
6191 }
6192
6193 ## Ignore the token.
6194 if (length $token->{data}) {
6195 !!!cp ('t314');
6196 ## reprocess the rest of characters
6197 } else {
6198 !!!cp ('t315');
6199 !!!next-token;
6200 }
6201 next B;
6202 }
6203
6204 die qq[$0: Character "$token->{data}"];
6205 } elsif ($token->{type} == START_TAG_TOKEN) {
6206 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
6207 !!!cp ('t316');
6208 !!!parse-error (type => 'after html:'.$token->{tag_name}, token => $token);
6209
6210 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6211 ## Process in the "after frameset" insertion mode.
6212 } else {
6213 !!!cp ('t317');
6214 }
6215
6216 if ($token->{tag_name} eq 'frameset' and
6217 $self->{insertion_mode} == IN_FRAMESET_IM) {
6218 !!!cp ('t318');
6219 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6220 !!!nack ('t318.1');
6221 !!!next-token;
6222 next B;
6223 } elsif ($token->{tag_name} eq 'frame' and
6224 $self->{insertion_mode} == IN_FRAMESET_IM) {
6225 !!!cp ('t319');
6226 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6227 pop @{$self->{open_elements}};
6228 !!!ack ('t319.1');
6229 !!!next-token;
6230 next B;
6231 } elsif ($token->{tag_name} eq 'noframes') {
6232 !!!cp ('t320');
6233 ## NOTE: As if in head.
6234 $parse_rcdata->(CDATA_CONTENT_MODEL);
6235 next B;
6236 } else {
6237 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6238 !!!cp ('t321');
6239 !!!parse-error (type => 'in frameset:'.$token->{tag_name}, token => $token);
6240 } else {
6241 !!!cp ('t322');
6242 !!!parse-error (type => 'after frameset:'.$token->{tag_name}, token => $token);
6243 }
6244 ## Ignore the token
6245 !!!nack ('t322.1');
6246 !!!next-token;
6247 next B;
6248 }
6249 } elsif ($token->{type} == END_TAG_TOKEN) {
6250 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
6251 !!!cp ('t323');
6252 !!!parse-error (type => 'after html:/'.$token->{tag_name}, token => $token);
6253
6254 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6255 ## Process in the "after frameset" insertion mode.
6256 } else {
6257 !!!cp ('t324');
6258 }
6259
6260 if ($token->{tag_name} eq 'frameset' and
6261 $self->{insertion_mode} == IN_FRAMESET_IM) {
6262 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
6263 @{$self->{open_elements}} == 1) {
6264 !!!cp ('t325');
6265 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6266 ## Ignore the token
6267 !!!next-token;
6268 } else {
6269 !!!cp ('t326');
6270 pop @{$self->{open_elements}};
6271 !!!next-token;
6272 }
6273
6274 if (not defined $self->{inner_html_node} and
6275 not ($self->{open_elements}->[-1]->[1] & FRAMESET_EL)) {
6276 !!!cp ('t327');
6277 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6278 } else {
6279 !!!cp ('t328');
6280 }
6281 next B;
6282 } elsif ($token->{tag_name} eq 'html' and
6283 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
6284 !!!cp ('t329');
6285 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
6286 !!!next-token;
6287 next B;
6288 } else {
6289 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6290 !!!cp ('t330');
6291 !!!parse-error (type => 'in frameset:/'.$token->{tag_name}, token => $token);
6292 } else {
6293 !!!cp ('t331');
6294 !!!parse-error (type => 'after frameset:/'.$token->{tag_name}, token => $token);
6295 }
6296 ## Ignore the token
6297 !!!next-token;
6298 next B;
6299 }
6300 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6301 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6302 @{$self->{open_elements}} == 1) { # redundant, maybe
6303 !!!cp ('t331.1');
6304 !!!parse-error (type => 'in body:#eof', token => $token);
6305 } else {
6306 !!!cp ('t331.2');
6307 }
6308
6309 ## Stop parsing
6310 last B;
6311 } else {
6312 die "$0: $token->{type}: Unknown token type";
6313 }
6314
6315 ## ISSUE: An issue in spec here
6316 } else {
6317 die "$0: $self->{insertion_mode}: Unknown insertion mode";
6318 }
6319
6320 ## "in body" insertion mode
6321 if ($token->{type} == START_TAG_TOKEN) {
6322 if ($token->{tag_name} eq 'script') {
6323 !!!cp ('t332');
6324 ## NOTE: This is an "as if in head" code clone
6325 $script_start_tag->();
6326 next B;
6327 } elsif ($token->{tag_name} eq 'style') {
6328 !!!cp ('t333');
6329 ## NOTE: This is an "as if in head" code clone
6330 $parse_rcdata->(CDATA_CONTENT_MODEL);
6331 next B;
6332 } elsif ({
6333 base => 1, link => 1,
6334 }->{$token->{tag_name}}) {
6335 !!!cp ('t334');
6336 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6337 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6338 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6339 !!!ack ('t334.1');
6340 !!!next-token;
6341 next B;
6342 } elsif ($token->{tag_name} eq 'meta') {
6343 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6344 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6345 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6346
6347 unless ($self->{confident}) {
6348 if ($token->{attributes}->{charset}) {
6349 !!!cp ('t335');
6350 ## NOTE: Whether the encoding is supported or not is handled
6351 ## in the {change_encoding} callback.
6352 $self->{change_encoding}
6353 ->($self, $token->{attributes}->{charset}->{value}, $token);
6354
6355 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6356 ->set_user_data (manakai_has_reference =>
6357 $token->{attributes}->{charset}
6358 ->{has_reference});
6359 } elsif ($token->{attributes}->{content}) {
6360 if ($token->{attributes}->{content}->{value}
6361 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
6362 [\x09-\x0D\x20]*=
6363 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
6364 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
6365 !!!cp ('t336');
6366 ## NOTE: Whether the encoding is supported or not is handled
6367 ## in the {change_encoding} callback.
6368 $self->{change_encoding}
6369 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
6370 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6371 ->set_user_data (manakai_has_reference =>
6372 $token->{attributes}->{content}
6373 ->{has_reference});
6374 }
6375 }
6376 } else {
6377 if ($token->{attributes}->{charset}) {
6378 !!!cp ('t337');
6379 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6380 ->set_user_data (manakai_has_reference =>
6381 $token->{attributes}->{charset}
6382 ->{has_reference});
6383 }
6384 if ($token->{attributes}->{content}) {
6385 !!!cp ('t338');
6386 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6387 ->set_user_data (manakai_has_reference =>
6388 $token->{attributes}->{content}
6389 ->{has_reference});
6390 }
6391 }
6392
6393 !!!ack ('t338.1');
6394 !!!next-token;
6395 next B;
6396 } elsif ($token->{tag_name} eq 'title') {
6397 !!!cp ('t341');
6398 ## NOTE: This is an "as if in head" code clone
6399 $parse_rcdata->(RCDATA_CONTENT_MODEL);
6400 next B;
6401 } elsif ($token->{tag_name} eq 'body') {
6402 !!!parse-error (type => 'in body:body', token => $token);
6403
6404 if (@{$self->{open_elements}} == 1 or
6405 not ($self->{open_elements}->[1]->[1] & BODY_EL)) {
6406 !!!cp ('t342');
6407 ## Ignore the token
6408 } else {
6409 my $body_el = $self->{open_elements}->[1]->[0];
6410 for my $attr_name (keys %{$token->{attributes}}) {
6411 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
6412 !!!cp ('t343');
6413 $body_el->set_attribute_ns
6414 (undef, [undef, $attr_name],
6415 $token->{attributes}->{$attr_name}->{value});
6416 }
6417 }
6418 }
6419 !!!nack ('t343.1');
6420 !!!next-token;
6421 next B;
6422 } elsif ({
6423 address => 1, blockquote => 1, center => 1, dir => 1,
6424 div => 1, dl => 1, fieldset => 1,
6425 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6426 menu => 1, ol => 1, p => 1, ul => 1,
6427 pre => 1, listing => 1,
6428 form => 1,
6429 table => 1,
6430 hr => 1,
6431 }->{$token->{tag_name}}) {
6432 if ($token->{tag_name} eq 'form' and defined $self->{form_element}) {
6433 !!!cp ('t350');
6434 !!!parse-error (type => 'in form:form', token => $token);
6435 ## Ignore the token
6436 !!!nack ('t350.1');
6437 !!!next-token;
6438 next B;
6439 }
6440
6441 ## has a p element in scope
6442 INSCOPE: for (reverse @{$self->{open_elements}}) {
6443 if ($_->[1] & P_EL) {
6444 !!!cp ('t344');
6445 !!!back-token; # <form>
6446 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6447 line => $token->{line}, column => $token->{column}};
6448 next B;
6449 } elsif ($_->[1] & SCOPING_EL) {
6450 !!!cp ('t345');
6451 last INSCOPE;
6452 }
6453 } # INSCOPE
6454
6455 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6456 if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') {
6457 !!!nack ('t346.1');
6458 !!!next-token;
6459 if ($token->{type} == CHARACTER_TOKEN) {
6460 $token->{data} =~ s/^\x0A//;
6461 unless (length $token->{data}) {
6462 !!!cp ('t346');
6463 !!!next-token;
6464 } else {
6465 !!!cp ('t349');
6466 }
6467 } else {
6468 !!!cp ('t348');
6469 }
6470 } elsif ($token->{tag_name} eq 'form') {
6471 !!!cp ('t347.1');
6472 $self->{form_element} = $self->{open_elements}->[-1]->[0];
6473
6474 !!!nack ('t347.2');
6475 !!!next-token;
6476 } elsif ($token->{tag_name} eq 'table') {
6477 !!!cp ('t382');
6478 push @{$open_tables}, [$self->{open_elements}->[-1]->[0]];
6479
6480 $self->{insertion_mode} = IN_TABLE_IM;
6481
6482 !!!nack ('t382.1');
6483 !!!next-token;
6484 } elsif ($token->{tag_name} eq 'hr') {
6485 !!!cp ('t386');
6486 pop @{$self->{open_elements}};
6487
6488 !!!nack ('t386.1');
6489 !!!next-token;
6490 } else {
6491 !!!nack ('t347.1');
6492 !!!next-token;
6493 }
6494 next B;
6495 } elsif ({li => 1, dt => 1, dd => 1}->{$token->{tag_name}}) {
6496 ## has a p element in scope
6497 INSCOPE: for (reverse @{$self->{open_elements}}) {
6498 if ($_->[1] & P_EL) {
6499 !!!cp ('t353');
6500 !!!back-token; # <x>
6501 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6502 line => $token->{line}, column => $token->{column}};
6503 next B;
6504 } elsif ($_->[1] & SCOPING_EL) {
6505 !!!cp ('t354');
6506 last INSCOPE;
6507 }
6508 } # INSCOPE
6509
6510 ## Step 1
6511 my $i = -1;
6512 my $node = $self->{open_elements}->[$i];
6513 my $li_or_dtdd = {li => {li => 1},
6514 dt => {dt => 1, dd => 1},
6515 dd => {dt => 1, dd => 1}}->{$token->{tag_name}};
6516 LI: {
6517 ## Step 2
6518 if ($li_or_dtdd->{$node->[0]->manakai_local_name}) {
6519 if ($i != -1) {
6520 !!!cp ('t355');
6521 !!!parse-error (type => 'not closed',
6522 value => $self->{open_elements}->[-1]->[0]
6523 ->manakai_local_name,
6524 token => $token);
6525 } else {
6526 !!!cp ('t356');
6527 }
6528 splice @{$self->{open_elements}}, $i;
6529 last LI;
6530 } else {
6531 !!!cp ('t357');
6532 }
6533
6534 ## Step 3
6535 if (not ($node->[1] & FORMATTING_EL) and
6536 #not $phrasing_category->{$node->[1]} and
6537 ($node->[1] & SPECIAL_EL or
6538 $node->[1] & SCOPING_EL) and
6539 not ($node->[1] & ADDRESS_EL) and
6540 not ($node->[1] & DIV_EL)) {
6541 !!!cp ('t358');
6542 last LI;
6543 }
6544
6545 !!!cp ('t359');
6546 ## Step 4
6547 $i--;
6548 $node = $self->{open_elements}->[$i];
6549 redo LI;
6550 } # LI
6551
6552 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6553 !!!nack ('t359.1');
6554 !!!next-token;
6555 next B;
6556 } elsif ($token->{tag_name} eq 'plaintext') {
6557 ## has a p element in scope
6558 INSCOPE: for (reverse @{$self->{open_elements}}) {
6559 if ($_->[1] & P_EL) {
6560 !!!cp ('t367');
6561 !!!back-token; # <plaintext>
6562 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6563 line => $token->{line}, column => $token->{column}};
6564 next B;
6565 } elsif ($_->[1] & SCOPING_EL) {
6566 !!!cp ('t368');
6567 last INSCOPE;
6568 }
6569 } # INSCOPE
6570
6571 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6572
6573 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
6574
6575 !!!nack ('t368.1');
6576 !!!next-token;
6577 next B;
6578 } elsif ($token->{tag_name} eq 'a') {
6579 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
6580 my $node = $active_formatting_elements->[$i];
6581 if ($node->[1] & A_EL) {
6582 !!!cp ('t371');
6583 !!!parse-error (type => 'in a:a', token => $token);
6584
6585 !!!back-token; # <a>
6586 $token = {type => END_TAG_TOKEN, tag_name => 'a',
6587 line => $token->{line}, column => $token->{column}};
6588 $formatting_end_tag->($token);
6589
6590 AFE2: for (reverse 0..$#$active_formatting_elements) {
6591 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
6592 !!!cp ('t372');
6593 splice @$active_formatting_elements, $_, 1;
6594 last AFE2;
6595 }
6596 } # AFE2
6597 OE: for (reverse 0..$#{$self->{open_elements}}) {
6598 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
6599 !!!cp ('t373');
6600 splice @{$self->{open_elements}}, $_, 1;
6601 last OE;
6602 }
6603 } # OE
6604 last AFE;
6605 } elsif ($node->[0] eq '#marker') {
6606 !!!cp ('t374');
6607 last AFE;
6608 }
6609 } # AFE
6610
6611 $reconstruct_active_formatting_elements->($insert_to_current);
6612
6613 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6614 push @$active_formatting_elements, $self->{open_elements}->[-1];
6615
6616 !!!nack ('t374.1');
6617 !!!next-token;
6618 next B;
6619 } elsif ($token->{tag_name} eq 'nobr') {
6620 $reconstruct_active_formatting_elements->($insert_to_current);
6621
6622 ## has a |nobr| element in scope
6623 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6624 my $node = $self->{open_elements}->[$_];
6625 if ($node->[1] & NOBR_EL) {
6626 !!!cp ('t376');
6627 !!!parse-error (type => 'in nobr:nobr', token => $token);
6628 !!!back-token; # <nobr>
6629 $token = {type => END_TAG_TOKEN, tag_name => 'nobr',
6630 line => $token->{line}, column => $token->{column}};
6631 next B;
6632 } elsif ($node->[1] & SCOPING_EL) {
6633 !!!cp ('t377');
6634 last INSCOPE;
6635 }
6636 } # INSCOPE
6637
6638 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6639 push @$active_formatting_elements, $self->{open_elements}->[-1];
6640
6641 !!!nack ('t377.1');
6642 !!!next-token;
6643 next B;
6644 } elsif ($token->{tag_name} eq 'button') {
6645 ## has a button element in scope
6646 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6647 my $node = $self->{open_elements}->[$_];
6648 if ($node->[1] & BUTTON_EL) {
6649 !!!cp ('t378');
6650 !!!parse-error (type => 'in button:button', token => $token);
6651 !!!back-token; # <button>
6652 $token = {type => END_TAG_TOKEN, tag_name => 'button',
6653 line => $token->{line}, column => $token->{column}};
6654 next B;
6655 } elsif ($node->[1] & SCOPING_EL) {
6656 !!!cp ('t379');
6657 last INSCOPE;
6658 }
6659 } # INSCOPE
6660
6661 $reconstruct_active_formatting_elements->($insert_to_current);
6662
6663 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6664
6665 ## TODO: associate with $self->{form_element} if defined
6666
6667 push @$active_formatting_elements, ['#marker', ''];
6668
6669 !!!nack ('t379.1');
6670 !!!next-token;
6671 next B;
6672 } elsif ({
6673 xmp => 1,
6674 iframe => 1,
6675 noembed => 1,
6676 noframes => 1, ## NOTE: This is an "as if in head" code clone.
6677 noscript => 0, ## TODO: 1 if scripting is enabled
6678 }->{$token->{tag_name}}) {
6679 if ($token->{tag_name} eq 'xmp') {
6680 !!!cp ('t381');
6681 $reconstruct_active_formatting_elements->($insert_to_current);
6682 } else {
6683 !!!cp ('t399');
6684 }
6685 ## NOTE: There is an "as if in body" code clone.
6686 $parse_rcdata->(CDATA_CONTENT_MODEL);
6687 next B;
6688 } elsif ($token->{tag_name} eq 'isindex') {
6689 !!!parse-error (type => 'isindex', token => $token);
6690
6691 if (defined $self->{form_element}) {
6692 !!!cp ('t389');
6693 ## Ignore the token
6694 !!!nack ('t389'); ## NOTE: Not acknowledged.
6695 !!!next-token;
6696 next B;
6697 } else {
6698 !!!ack ('t391.1');
6699
6700 my $at = $token->{attributes};
6701 my $form_attrs;
6702 $form_attrs->{action} = $at->{action} if $at->{action};
6703 my $prompt_attr = $at->{prompt};
6704 $at->{name} = {name => 'name', value => 'isindex'};
6705 delete $at->{action};
6706 delete $at->{prompt};
6707 my @tokens = (
6708 {type => START_TAG_TOKEN, tag_name => 'form',
6709 attributes => $form_attrs,
6710 line => $token->{line}, column => $token->{column}},
6711 {type => START_TAG_TOKEN, tag_name => 'hr',
6712 line => $token->{line}, column => $token->{column}},
6713 {type => START_TAG_TOKEN, tag_name => 'p',
6714 line => $token->{line}, column => $token->{column}},
6715 {type => START_TAG_TOKEN, tag_name => 'label',
6716 line => $token->{line}, column => $token->{column}},
6717 );
6718 if ($prompt_attr) {
6719 !!!cp ('t390');
6720 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value},
6721 #line => $token->{line}, column => $token->{column},
6722 };
6723 } else {
6724 !!!cp ('t391');
6725 push @tokens, {type => CHARACTER_TOKEN,
6726 data => 'This is a searchable index. Insert your search keywords here: ',
6727 #line => $token->{line}, column => $token->{column},
6728 }; # SHOULD
6729 ## TODO: make this configurable
6730 }
6731 push @tokens,
6732 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at,
6733 line => $token->{line}, column => $token->{column}},
6734 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
6735 {type => END_TAG_TOKEN, tag_name => 'label',
6736 line => $token->{line}, column => $token->{column}},
6737 {type => END_TAG_TOKEN, tag_name => 'p',
6738 line => $token->{line}, column => $token->{column}},
6739 {type => START_TAG_TOKEN, tag_name => 'hr',
6740 line => $token->{line}, column => $token->{column}},
6741 {type => END_TAG_TOKEN, tag_name => 'form',
6742 line => $token->{line}, column => $token->{column}};
6743 !!!back-token (@tokens);
6744 !!!next-token;
6745 next B;
6746 }
6747 } elsif ($token->{tag_name} eq 'textarea') {
6748 my $tag_name = $token->{tag_name};
6749 my $el;
6750 !!!create-element ($el, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
6751
6752 ## TODO: $self->{form_element} if defined
6753 $self->{content_model} = RCDATA_CONTENT_MODEL;
6754 delete $self->{escape}; # MUST
6755
6756 $insert->($el);
6757
6758 my $text = '';
6759 !!!nack ('t392.1');
6760 !!!next-token;
6761 if ($token->{type} == CHARACTER_TOKEN) {
6762 $token->{data} =~ s/^\x0A//;
6763 unless (length $token->{data}) {
6764 !!!cp ('t392');
6765 !!!next-token;
6766 } else {
6767 !!!cp ('t393');
6768 }
6769 } else {
6770 !!!cp ('t394');
6771 }
6772 while ($token->{type} == CHARACTER_TOKEN) {
6773 !!!cp ('t395');
6774 $text .= $token->{data};
6775 !!!next-token;
6776 }
6777 if (length $text) {
6778 !!!cp ('t396');
6779 $el->manakai_append_text ($text);
6780 }
6781
6782 $self->{content_model} = PCDATA_CONTENT_MODEL;
6783
6784 if ($token->{type} == END_TAG_TOKEN and
6785 $token->{tag_name} eq $tag_name) {
6786 !!!cp ('t397');
6787 ## Ignore the token
6788 } else {
6789 !!!cp ('t398');
6790 !!!parse-error (type => 'in RCDATA:#'.$token->{type}, token => $token);
6791 }
6792 !!!next-token;
6793 next B;
6794 } elsif ($token->{tag_name} eq 'math' or
6795 $token->{tag_name} eq 'svg') {
6796 $reconstruct_active_formatting_elements->($insert_to_current);
6797
6798 ## "adjust SVG attributes" ('svg' only) - done in insert-element-f
6799
6800 ## "adjust foreign attributes" - done in insert-element-f
6801
6802 !!!insert-element-f ($token->{tag_name} eq 'math' ? $MML_NS : $SVG_NS, $token->{tag_name}, $token->{attributes}, $token);
6803
6804 if ($self->{self_closing}) {
6805 pop @{$self->{open_elements}};
6806 !!!ack ('t398.1');
6807 } else {
6808 !!!cp ('t398.2');
6809 $self->{insertion_mode} |= IN_FOREIGN_CONTENT_IM;
6810 ## NOTE: |<body><math><mi><svg>| -> "in foreign content" insertion
6811 ## mode, "in body" (not "in foreign content") secondary insertion
6812 ## mode, maybe.
6813 }
6814
6815 !!!next-token;
6816 next B;
6817 } elsif ({
6818 caption => 1, col => 1, colgroup => 1, frame => 1,
6819 frameset => 1, head => 1, option => 1, optgroup => 1,
6820 tbody => 1, td => 1, tfoot => 1, th => 1,
6821 thead => 1, tr => 1,
6822 }->{$token->{tag_name}}) {
6823 !!!cp ('t401');
6824 !!!parse-error (type => 'in body:'.$token->{tag_name}, token => $token);
6825 ## Ignore the token
6826 !!!nack ('t401.1'); ## NOTE: |<col/>| or |<frame/>| here is an error.
6827 !!!next-token;
6828 next B;
6829
6830 ## ISSUE: An issue on HTML5 new elements in the spec.
6831 } else {
6832 if ($token->{tag_name} eq 'image') {
6833 !!!cp ('t384');
6834 !!!parse-error (type => 'image', token => $token);
6835 $token->{tag_name} = 'img';
6836 } else {
6837 !!!cp ('t385');
6838 }
6839
6840 ## NOTE: There is an "as if <br>" code clone.
6841 $reconstruct_active_formatting_elements->($insert_to_current);
6842
6843 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6844
6845 if ({
6846 applet => 1, marquee => 1, object => 1,
6847 }->{$token->{tag_name}}) {
6848 !!!cp ('t380');
6849 push @$active_formatting_elements, ['#marker', ''];
6850 !!!nack ('t380.1');
6851 } elsif ({
6852 b => 1, big => 1, em => 1, font => 1, i => 1,
6853 s => 1, small => 1, strile => 1,
6854 strong => 1, tt => 1, u => 1,
6855 }->{$token->{tag_name}}) {
6856 !!!cp ('t375');
6857 push @$active_formatting_elements, $self->{open_elements}->[-1];
6858 !!!nack ('t375.1');
6859 } elsif ($token->{tag_name} eq 'input') {
6860 !!!cp ('t388');
6861 ## TODO: associate with $self->{form_element} if defined
6862 pop @{$self->{open_elements}};
6863 !!!ack ('t388.2');
6864 } elsif ({
6865 area => 1, basefont => 1, bgsound => 1, br => 1,
6866 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
6867 #image => 1,
6868 }->{$token->{tag_name}}) {
6869 !!!cp ('t388.1');
6870 pop @{$self->{open_elements}};
6871 !!!ack ('t388.3');
6872 } elsif ($token->{tag_name} eq 'select') {
6873 ## TODO: associate with $self->{form_element} if defined
6874
6875 if ($self->{insertion_mode} & TABLE_IMS or
6876 $self->{insertion_mode} & BODY_TABLE_IMS or
6877 $self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
6878 !!!cp ('t400.1');
6879 $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM;
6880 } else {
6881 !!!cp ('t400.2');
6882 $self->{insertion_mode} = IN_SELECT_IM;
6883 }
6884 !!!nack ('t400.3');
6885 } else {
6886 !!!nack ('t402');
6887 }
6888
6889 !!!next-token;
6890 next B;
6891 }
6892 } elsif ($token->{type} == END_TAG_TOKEN) {
6893 if ($token->{tag_name} eq 'body') {
6894 ## has a |body| element in scope
6895 my $i;
6896 INSCOPE: {
6897 for (reverse @{$self->{open_elements}}) {
6898 if ($_->[1] & BODY_EL) {
6899 !!!cp ('t405');
6900 $i = $_;
6901 last INSCOPE;
6902 } elsif ($_->[1] & SCOPING_EL) {
6903 !!!cp ('t405.1');
6904 last;
6905 }
6906 }
6907
6908 !!!parse-error (type => 'start tag not allowed',
6909 value => $token->{tag_name}, token => $token);
6910 ## NOTE: Ignore the token.
6911 !!!next-token;
6912 next B;
6913 } # INSCOPE
6914
6915 for (@{$self->{open_elements}}) {
6916 unless ($_->[1] & ALL_END_TAG_OPTIONAL_EL) {
6917 !!!cp ('t403');
6918 !!!parse-error (type => 'not closed',
6919 value => $_->[0]->manakai_local_name,
6920 token => $token);
6921 last;
6922 } else {
6923 !!!cp ('t404');
6924 }
6925 }
6926
6927 $self->{insertion_mode} = AFTER_BODY_IM;
6928 !!!next-token;
6929 next B;
6930 } elsif ($token->{tag_name} eq 'html') {
6931 ## TODO: Update this code. It seems that the code below is not
6932 ## up-to-date, though it has the same effect as specified.
6933 if (@{$self->{open_elements}} > 1 and
6934 $self->{open_elements}->[1]->[1] & BODY_EL) {
6935 ## ISSUE: There is an issue in the spec.
6936 unless ($self->{open_elements}->[-1]->[1] & BODY_EL) {
6937 !!!cp ('t406');
6938 !!!parse-error (type => 'not closed',
6939 value => $self->{open_elements}->[1]->[0]
6940 ->manakai_local_name,
6941 token => $token);
6942 } else {
6943 !!!cp ('t407');
6944 }
6945 $self->{insertion_mode} = AFTER_BODY_IM;
6946 ## reprocess
6947 next B;
6948 } else {
6949 !!!cp ('t408');
6950 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6951 ## Ignore the token
6952 !!!next-token;
6953 next B;
6954 }
6955 } elsif ({
6956 address => 1, blockquote => 1, center => 1, dir => 1,
6957 div => 1, dl => 1, fieldset => 1, listing => 1,
6958 menu => 1, ol => 1, pre => 1, ul => 1,
6959 dd => 1, dt => 1, li => 1,
6960 applet => 1, button => 1, marquee => 1, object => 1,
6961 }->{$token->{tag_name}}) {
6962 ## has an element in scope
6963 my $i;
6964 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6965 my $node = $self->{open_elements}->[$_];
6966 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6967 !!!cp ('t410');
6968 $i = $_;
6969 last INSCOPE;
6970 } elsif ($node->[1] & SCOPING_EL) {
6971 !!!cp ('t411');
6972 last INSCOPE;
6973 }
6974 } # INSCOPE
6975
6976 unless (defined $i) { # has an element in scope
6977 !!!cp ('t413');
6978 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6979 } else {
6980 ## Step 1. generate implied end tags
6981 while ({
6982 dd => ($token->{tag_name} ne 'dd'),
6983 dt => ($token->{tag_name} ne 'dt'),
6984 li => ($token->{tag_name} ne 'li'),
6985 p => 1,
6986 }->{$self->{open_elements}->[-1]->[0]->manakai_local_name}) {
6987 !!!cp ('t409');
6988 pop @{$self->{open_elements}};
6989 }
6990
6991 ## Step 2.
6992 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6993 ne $token->{tag_name}) {
6994 !!!cp ('t412');
6995 !!!parse-error (type => 'not closed',
6996 value => $self->{open_elements}->[-1]->[0]
6997 ->manakai_local_name,
6998 token => $token);
6999 } else {
7000 !!!cp ('t414');
7001 }
7002
7003 ## Step 3.
7004 splice @{$self->{open_elements}}, $i;
7005
7006 ## Step 4.
7007 $clear_up_to_marker->()
7008 if {
7009 applet => 1, button => 1, marquee => 1, object => 1,
7010 }->{$token->{tag_name}};
7011 }
7012 !!!next-token;
7013 next B;
7014 } elsif ($token->{tag_name} eq 'form') {
7015 undef $self->{form_element};
7016
7017 ## has an element in scope
7018 my $i;
7019 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7020 my $node = $self->{open_elements}->[$_];
7021 if ($node->[1] & FORM_EL) {
7022 !!!cp ('t418');
7023 $i = $_;
7024 last INSCOPE;
7025 } elsif ($node->[1] & SCOPING_EL) {
7026 !!!cp ('t419');
7027 last INSCOPE;
7028 }
7029 } # INSCOPE
7030
7031 unless (defined $i) { # has an element in scope
7032 !!!cp ('t421');
7033 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
7034 } else {
7035 ## Step 1. generate implied end tags
7036 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7037 !!!cp ('t417');
7038 pop @{$self->{open_elements}};
7039 }
7040
7041 ## Step 2.
7042 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7043 ne $token->{tag_name}) {
7044 !!!cp ('t417.1');
7045 !!!parse-error (type => 'not closed',
7046 value => $self->{open_elements}->[-1]->[0]
7047 ->manakai_local_name,
7048 token => $token);
7049 } else {
7050 !!!cp ('t420');
7051 }
7052
7053 ## Step 3.
7054 splice @{$self->{open_elements}}, $i;
7055 }
7056
7057 !!!next-token;
7058 next B;
7059 } elsif ({
7060 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
7061 }->{$token->{tag_name}}) {
7062 ## has an element in scope
7063 my $i;
7064 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7065 my $node = $self->{open_elements}->[$_];
7066 if ($node->[1] & HEADING_EL) {
7067 !!!cp ('t423');
7068 $i = $_;
7069 last INSCOPE;
7070 } elsif ($node->[1] & SCOPING_EL) {
7071 !!!cp ('t424');
7072 last INSCOPE;
7073 }
7074 } # INSCOPE
7075
7076 unless (defined $i) { # has an element in scope
7077 !!!cp ('t425.1');
7078 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
7079 } else {
7080 ## Step 1. generate implied end tags
7081 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7082 !!!cp ('t422');
7083 pop @{$self->{open_elements}};
7084 }
7085
7086 ## Step 2.
7087 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7088 ne $token->{tag_name}) {
7089 !!!cp ('t425');
7090 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
7091 } else {
7092 !!!cp ('t426');
7093 }
7094
7095 ## Step 3.
7096 splice @{$self->{open_elements}}, $i;
7097 }
7098
7099 !!!next-token;
7100 next B;
7101 } elsif ($token->{tag_name} eq 'p') {
7102 ## has an element in scope
7103 my $i;
7104 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7105 my $node = $self->{open_elements}->[$_];
7106 if ($node->[1] & P_EL) {
7107 !!!cp ('t410.1');
7108 $i = $_;
7109 last INSCOPE;
7110 } elsif ($node->[1] & SCOPING_EL) {
7111 !!!cp ('t411.1');
7112 last INSCOPE;
7113 }
7114 } # INSCOPE
7115
7116 if (defined $i) {
7117 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7118 ne $token->{tag_name}) {
7119 !!!cp ('t412.1');
7120 !!!parse-error (type => 'not closed',
7121 value => $self->{open_elements}->[-1]->[0]
7122 ->manakai_local_name,
7123 token => $token);
7124 } else {
7125 !!!cp ('t414.1');
7126 }
7127
7128 splice @{$self->{open_elements}}, $i;
7129 } else {
7130 !!!cp ('t413.1');
7131 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
7132
7133 !!!cp ('t415.1');
7134 ## As if <p>, then reprocess the current token
7135 my $el;
7136 !!!create-element ($el, $HTML_NS, 'p',, $token);
7137 $insert->($el);
7138 ## NOTE: Not inserted into |$self->{open_elements}|.
7139 }
7140
7141 !!!next-token;
7142 next B;
7143 } elsif ({
7144 a => 1,
7145 b => 1, big => 1, em => 1, font => 1, i => 1,
7146 nobr => 1, s => 1, small => 1, strile => 1,
7147 strong => 1, tt => 1, u => 1,
7148 }->{$token->{tag_name}}) {
7149 !!!cp ('t427');
7150 $formatting_end_tag->($token);
7151 next B;
7152 } elsif ($token->{tag_name} eq 'br') {
7153 !!!cp ('t428');
7154 !!!parse-error (type => 'unmatched end tag:br', token => $token);
7155
7156 ## As if <br>
7157 $reconstruct_active_formatting_elements->($insert_to_current);
7158
7159 my $el;
7160 !!!create-element ($el, $HTML_NS, 'br',, $token);
7161 $insert->($el);
7162
7163 ## Ignore the token.
7164 !!!next-token;
7165 next B;
7166 } elsif ({
7167 caption => 1, col => 1, colgroup => 1, frame => 1,
7168 frameset => 1, head => 1, option => 1, optgroup => 1,
7169 tbody => 1, td => 1, tfoot => 1, th => 1,
7170 thead => 1, tr => 1,
7171 area => 1, basefont => 1, bgsound => 1,
7172 embed => 1, hr => 1, iframe => 1, image => 1,
7173 img => 1, input => 1, isindex => 1, noembed => 1,
7174 noframes => 1, param => 1, select => 1, spacer => 1,
7175 table => 1, textarea => 1, wbr => 1,
7176 noscript => 0, ## TODO: if scripting is enabled
7177 }->{$token->{tag_name}}) {
7178 !!!cp ('t429');
7179 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
7180 ## Ignore the token
7181 !!!next-token;
7182 next B;
7183
7184 ## ISSUE: Issue on HTML5 new elements in spec
7185
7186 } else {
7187 ## Step 1
7188 my $node_i = -1;
7189 my $node = $self->{open_elements}->[$node_i];
7190
7191 ## Step 2
7192 S2: {
7193 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7194 ## Step 1
7195 ## generate implied end tags
7196 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7197 !!!cp ('t430');
7198 ## ISSUE: Can this case be reached?
7199 pop @{$self->{open_elements}};
7200 }
7201
7202 ## Step 2
7203 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7204 ne $token->{tag_name}) {
7205 !!!cp ('t431');
7206 ## NOTE: <x><y></x>
7207 !!!parse-error (type => 'not closed',
7208 value => $self->{open_elements}->[-1]->[0]
7209 ->manakai_local_name,
7210 token => $token);
7211 } else {
7212 !!!cp ('t432');
7213 }
7214
7215 ## Step 3
7216 splice @{$self->{open_elements}}, $node_i;
7217
7218 !!!next-token;
7219 last S2;
7220 } else {
7221 ## Step 3
7222 if (not ($node->[1] & FORMATTING_EL) and
7223 #not $phrasing_category->{$node->[1]} and
7224 ($node->[1] & SPECIAL_EL or
7225 $node->[1] & SCOPING_EL)) {
7226 !!!cp ('t433');
7227 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
7228 ## Ignore the token
7229 !!!next-token;
7230 last S2;
7231 }
7232
7233 !!!cp ('t434');
7234 }
7235
7236 ## Step 4
7237 $node_i--;
7238 $node = $self->{open_elements}->[$node_i];
7239
7240 ## Step 5;
7241 redo S2;
7242 } # S2
7243 next B;
7244 }
7245 }
7246 next B;
7247 } continue { # B
7248 if ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
7249 ## NOTE: The code below is executed in cases where it does not have
7250 ## to be, but it is harmless even in those cases.
7251 ## has an element in scope
7252 INSCOPE: {
7253 for (reverse 0..$#{$self->{open_elements}}) {
7254 my $node = $self->{open_elements}->[$_];
7255 if ($node->[1] & FOREIGN_EL) {
7256 last INSCOPE;
7257 } elsif ($node->[1] & SCOPING_EL) {
7258 last;
7259 }
7260 }
7261
7262 ## NOTE: No foreign element in scope.
7263 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
7264 } # INSCOPE
7265 }
7266 } # B
7267
7268 ## Stop parsing # MUST
7269
7270 ## TODO: script stuffs
7271 } # _tree_construct_main
7272
## set_inner_html ($class, $node, $s, $onerror)
##
## Replaces the children of |$node| by the result of parsing the string
## |$s| as HTML.
##
##   $node     A node whose |node_type| is 9 (document) or 1 (element).
##   $s        The source string.  It is accessed through a reference to
##             the caller's argument (|\$_[0]|); it is not copied.
##   $onerror  Optional code reference.  For a document node it is handed
##             to |parse_string| as is.  For an element node it is called
##             with key/value pairs (|line|, |column|, followed by the
##             pairs supplied at the error site, e.g. |type|, |token|);
##             if omitted, errors are reported by |warn|.
##
## Dies if |$node| is of any other node type.
sub set_inner_html ($$$) {
  my $class = shift;
  my $node = shift;
  my $s = \$_[0];
  my $onerror = $_[1];

  ## ISSUE: Should {confident} be true?

  my $nt = $node->node_type;
  if ($nt == 9) {
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## Iterate over a copy of the child list, since the list is modified
    ## while children are removed.
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }

    ## Step 3, 4, 5 # MUST
    $class->parse_string ($$s => $node, $onerror);
  } elsif ($nt == 1) {
    ## TODO: If non-html element

    ## NOTE: Most of this code is copied from |parse_string|

    ## Step 1 # MUST
    ## The fragment is parsed into a temporary document created from the
    ## implementation of the context node's owner document.
    my $this_doc = $node->owner_document;
    my $doc = $this_doc->implementation->create_document;
    $doc->manakai_is_html (1);
    my $p = $class->new;
    $p->{document} = $doc;

    ## Step 8 # MUST
    my $i = 0; # Offset of the next character to read from |$$s|
    $p->{line_prev} = $p->{line} = 1;
    $p->{column_prev} = $p->{column} = 0;

    ## Input stream reader: shifts the previous-character window, stores
    ## the next code point (or -1 at the end of the string) in
    ## |{next_char}|, and maintains the line/column position.
    ##
    ## NOTE: All parser state is accessed through |$self|, the first
    ## argument, which is the parser whose |{prev_char}| and |{next_char}|
    ## are initialized below (i.e. |$p|).  The closure must not refer to
    ## the outer |$p|: it is stored in |$p|, so capturing |$p| would form
    ## a reference cycle that is never broken and would leak the parser.
    $p->{set_next_char} = sub {
      my $self = shift;

      pop @{$self->{prev_char}};
      unshift @{$self->{prev_char}}, $self->{next_char};

      ## End of input: -1 is true, so |and return| always returns here.
      $self->{next_char} = -1 and return if $i >= length $$s;
      $self->{next_char} = ord substr $$s, $i++, 1;

      ($self->{line_prev}, $self->{column_prev})
          = ($self->{line}, $self->{column});
      $self->{column}++;

      if ($self->{next_char} == 0x000A) { # LF
        $self->{line}++;
        $self->{column} = 0;
        !!!cp ('i1');
      } elsif ($self->{next_char} == 0x000D) { # CR
        ## CR LF and a lone CR are both reported as a single LF.
        $i++ if substr ($$s, $i, 1) eq "\x0A";
        $self->{next_char} = 0x000A; # LF # MUST
        $self->{line}++;
        $self->{column} = 0;
        !!!cp ('i2');
      } elsif ($self->{next_char} > 0x10FFFF) {
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
        !!!cp ('i3');
      } elsif ($self->{next_char} == 0x0000) { # NULL
        !!!cp ('i4');
        !!!parse-error (type => 'NULL');
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
      } elsif ($self->{next_char} <= 0x0008 or
               (0x000E <= $self->{next_char} and
                $self->{next_char} <= 0x001F) or
               (0x007F <= $self->{next_char} and
                $self->{next_char} <= 0x009F) or
               (0xD800 <= $self->{next_char} and
                $self->{next_char} <= 0xDFFF) or
               (0xFDD0 <= $self->{next_char} and
                $self->{next_char} <= 0xFDDF) or
               {
                0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
                0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
                0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
                0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
                0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
                0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
                0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
                0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
                0x10FFFE => 1, 0x10FFFF => 1,
               }->{$self->{next_char}}) {
        ## The character is reported but passed through unchanged.
        !!!cp ('i4.1');
        !!!parse-error (type => 'control char', level => $self->{must_level});
## TODO: error type documentation
      }
    };
    $p->{prev_char} = [-1, -1, -1];
    $p->{next_char} = -1;

    ## Default error handler: prefer the position recorded in the token,
    ## if any, over the current input position.
    my $ponerror = $onerror || sub {
      my (%opt) = @_;
      my $line = $opt{line};
      my $column = $opt{column};
      if (defined $opt{token} and defined $opt{token}->{line}) {
        $line = $opt{token}->{line};
        $column = $opt{token}->{column};
      }
      warn "Parse error ($opt{type}) at line $line column $column\n";
    };
    ## NOTE: This closure refers to |$p| and is stored in |$p|; the cycle
    ## is broken by the |delete| at the end of this branch.
    $p->{parse_error} = sub {
      $ponerror->(line => $p->{line}, column => $p->{column}, @_);
    };

    $p->_initialize_tokenizer;
    $p->_initialize_tree_constructor;

    ## Step 2
    ## The initial content model of the tokenizer depends on the local
    ## name of the context element; anything not listed is PCDATA.
    my $node_ln = $node->manakai_local_name;
    $p->{content_model} = {
      title => RCDATA_CONTENT_MODEL,
      textarea => RCDATA_CONTENT_MODEL,
      style => CDATA_CONTENT_MODEL,
      script => CDATA_CONTENT_MODEL,
      xmp => CDATA_CONTENT_MODEL,
      iframe => CDATA_CONTENT_MODEL,
      noembed => CDATA_CONTENT_MODEL,
      noframes => CDATA_CONTENT_MODEL,
      noscript => CDATA_CONTENT_MODEL,
      plaintext => PLAINTEXT_CONTENT_MODEL,
    }->{$node_ln};
    $p->{content_model} = PCDATA_CONTENT_MODEL
        unless defined $p->{content_model};
    ## ISSUE: What is "the name of the element"? local name?

    $p->{inner_html_node} = [$node, $el_category->{$node_ln}];
      ## TODO: Foreign element OK?

    ## Step 3
    my $root = $doc->create_element_ns
      ('http://www.w3.org/1999/xhtml', [undef, 'html']);

    ## Step 4 # MUST
    $doc->append_child ($root);

    ## Step 5 # MUST
    push @{$p->{open_elements}}, [$root, $el_category->{html}];

    undef $p->{head_element};

    ## Step 6 # MUST
    $p->_reset_insertion_mode;

    ## Step 7 # MUST
    ## The nearest HTML |form| element among the context node and its
    ## ancestors, if any, becomes the parser's form element.
    my $anode = $node;
    AN: while (defined $anode) {
      if ($anode->node_type == 1) {
        my $nsuri = $anode->namespace_uri;
        if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
          if ($anode->manakai_local_name eq 'form') {
            !!!cp ('i5');
            $p->{form_element} = $anode;
            last AN;
          }
        }
      }
      $anode = $anode->parent_node;
    } # AN

    ## Step 9 # MUST
    {
      ## The macro below operates on a variable named |$self|.
      my $self = $p;
      !!!next-token;
    }
    $p->_tree_construction_main;

    ## Step 10 # MUST
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }
    ## ISSUE: mutation events? read-only?

    ## Step 11 # MUST
    ## Move the parsed nodes from the temporary root element into the
    ## context node, adopting them into its owner document first.
    @cn = @{$root->child_nodes};
    for (@cn) {
      $this_doc->adopt_node ($_);
      $node->append_child ($_);
    }
    ## ISSUE: mutation events?

    $p->_terminate_tree_constructor;

    delete $p->{parse_error}; # delete loop
  } else {
    die "$0: |set_inner_html| is not defined for node of type $nt";
  }
} # set_inner_html
7468
7469 } # tree construction stage
7470
## Exception class derived from |Error|.
## NOTE(review): presumably thrown to make the parser restart; confirm
## against the throw/catch sites elsewhere in this file.
package Whatpm::HTML::RestartParser;
our @ISA;
push @ISA, 'Error';

1;
7475 # $Date: 2008/05/25 08:53:49 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24