/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.149 - (show annotations) (download) (as text)
Sun May 25 08:53:49 2008 UTC (16 years, 5 months ago) by wakaba
Branch: MAIN
Changes since 1.148: +4 -3 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	25 May 2008 08:53:05 -0000
	* ContentType.t: Test result related to UTF-32 updated (HTML5
	revision 1701).

2008-05-25  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	25 May 2008 08:53:43 -0000
	* ContentType.pm: Drop support for UTF-32 (HTML5 revision 1701).

	* HTML.pm.src: UTF-16BE and UTF-16LE should be considered
	as UTF-16 (HTML5 revision 1701).

2008-05-25  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.148 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 use Error qw(:try);
5
6 ## ISSUE:
7 ## var doc = implementation.createDocument (null, null, null);
8 ## doc.write ('');
9 ## alert (doc.compatMode);
10
11 require IO::Handle;
12
## Well-known namespace URIs: XHTML, MathML, SVG, XLink, XML and XMLNS.
## The MathML, SVG, XLink, XML and XMLNS values are used as keys/values of
## the |$el_category_f| and |$foreign_attr_xname| tables defined below.
13 my $HTML_NS = q<http://www.w3.org/1999/xhtml>;
14 my $MML_NS = q<http://www.w3.org/1998/Math/MathML>;
15 my $SVG_NS = q<http://www.w3.org/2000/svg>;
16 my $XLINK_NS = q<http://www.w3.org/1999/xlink>;
17 my $XML_NS = q<http://www.w3.org/XML/1998/namespace>;
18 my $XMLNS_NS = q<http://www.w3.org/2000/xmlns/>;
19
## Element category flags.  Each constant owns exactly one bit, so the
## flags can be combined with |...| and tested with |&|.  The bit index
## is spelled out as a shift instead of a long binary literal.
sub A_EL ()                    { 1 << 0 }
sub ADDRESS_EL ()              { 1 << 1 }
sub BODY_EL ()                 { 1 << 2 }
sub BUTTON_EL ()               { 1 << 3 }
sub CAPTION_EL ()              { 1 << 4 }
sub DD_EL ()                   { 1 << 5 }
sub DIV_EL ()                  { 1 << 6 }
sub DT_EL ()                   { 1 << 7 }
sub FORM_EL ()                 { 1 << 8 }
sub FORMATTING_EL ()           { 1 << 9 }
sub FRAMESET_EL ()             { 1 << 10 }
sub HEADING_EL ()              { 1 << 11 }
sub HTML_EL ()                 { 1 << 12 }
sub LI_EL ()                   { 1 << 13 }
sub NOBR_EL ()                 { 1 << 14 }
sub OPTION_EL ()               { 1 << 15 }
sub OPTGROUP_EL ()             { 1 << 16 }
sub P_EL ()                    { 1 << 17 }
sub SELECT_EL ()               { 1 << 18 }
sub TABLE_EL ()                { 1 << 19 }
sub TABLE_CELL_EL ()           { 1 << 20 }
sub TABLE_ROW_EL ()            { 1 << 21 }
sub TABLE_ROW_GROUP_EL ()      { 1 << 22 }
sub MISC_SCOPING_EL ()         { 1 << 23 }
sub MISC_SPECIAL_EL ()         { 1 << 24 }
sub FOREIGN_EL ()              { 1 << 25 }
sub FOREIGN_FLOW_CONTENT_EL () { 1 << 26 }
sub MML_AXML_EL ()             { 1 << 27 }
48
## Composite masks: each one is the bitwise OR of the single-bit element
## category flags (or of another composite mask) listed in its body.

sub TABLE_ROWS_EL () { TABLE_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL }

sub END_TAG_OPTIONAL_EL () { DD_EL | DT_EL | LI_EL | P_EL }

sub ALL_END_TAG_OPTIONAL_EL () {
  END_TAG_OPTIONAL_EL | BODY_EL | HTML_EL |
  TABLE_CELL_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL
}

sub SCOPING_EL () {
  BUTTON_EL | CAPTION_EL | HTML_EL |
  TABLE_EL | TABLE_CELL_EL | MISC_SCOPING_EL
}

sub TABLE_SCOPING_EL () { HTML_EL | TABLE_EL }

sub TABLE_ROWS_SCOPING_EL () { HTML_EL | TABLE_ROW_GROUP_EL }

sub TABLE_ROW_SCOPING_EL () { HTML_EL | TABLE_ROW_EL }

sub SPECIAL_EL () {
  ADDRESS_EL | BODY_EL | DIV_EL | END_TAG_OPTIONAL_EL |
  FORM_EL | FRAMESET_EL | HEADING_EL |
  OPTION_EL | OPTGROUP_EL | SELECT_EL |
  TABLE_ROW_EL | TABLE_ROW_GROUP_EL | MISC_SPECIAL_EL
}
110
## Map from an HTML element's local name to its category bits.
## Names that have a category of their own are listed in the literal;
## names sharing a category are filled in group by group afterwards.
my $el_category = {
  a => A_EL | FORMATTING_EL,
  address => ADDRESS_EL,
  body => BODY_EL,
  button => BUTTON_EL,
  caption => CAPTION_EL,
  dd => DD_EL,
  div => DIV_EL,
  dt => DT_EL,
  form => FORM_EL,
  frameset => FRAMESET_EL,
  html => HTML_EL,
  li => LI_EL,
  nobr => NOBR_EL | FORMATTING_EL,
  optgroup => OPTGROUP_EL,
  option => OPTION_EL,
  p => P_EL,
  select => SELECT_EL,
  table => TABLE_EL,
  tr => TABLE_ROW_EL,
};
$el_category->{$_} = FORMATTING_EL
    for qw(b big em font i s small strike strong tt u);
$el_category->{$_} = HEADING_EL
    for qw(h1 h2 h3 h4 h5 h6);
$el_category->{$_} = MISC_SCOPING_EL
    for qw(applet marquee object);
$el_category->{$_} = TABLE_CELL_EL
    for qw(td th);
$el_category->{$_} = TABLE_ROW_GROUP_EL
    for qw(tbody tfoot thead);
$el_category->{$_} = MISC_SPECIAL_EL
    for qw(
      area base basefont bgsound blockquote br center col colgroup
      dir dl embed fieldset frame head hr iframe img input isindex
      link listing menu meta noembed noframes noscript ol param
      plaintext pre script spacer style textarea title ul wbr
    );
195
## Category bits for non-HTML elements, keyed first by namespace URI and
## then by local name.
my $el_category_f = {
  $MML_NS => {
    'annotation-xml' => MML_AXML_EL,
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(mi mo mn ms mtext)),
  },
  $SVG_NS => {
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(foreignObject desc title)),
  },
  ## NOTE: In addition, FOREIGN_EL is set to non-HTML elements.
};
212
## Map from an all-lowercase SVG attribute name to its mixed-case form.
## Each key is exactly the lowercased value, so the table is generated
## from the list of mixed-case names.
my $svg_attr_name = {
  map { (lc ($_) => $_) } qw(
    attributeName attributeType baseFrequency baseProfile calcMode
    clipPathUnits contentScriptType contentStyleType diffuseConstant
    edgeMode externalResourcesRequired filterRes filterUnits glyphRef
    gradientTransform gradientUnits kernelMatrix kernelUnitLength
    keyPoints keySplines keyTimes lengthAdjust limitingConeAngle
    markerHeight markerUnits markerWidth maskContentUnits maskUnits
    numOctaves pathLength patternContentUnits patternTransform
    patternUnits pointsAtX pointsAtY pointsAtZ preserveAlpha
    preserveAspectRatio primitiveUnits refX refY repeatCount repeatDur
    requiredExtensions requiredFeatures specularConstant
    specularExponent spreadMethod startOffset stdDeviation stitchTiles
    surfaceScale systemLanguage tableValues targetX targetY textLength
    viewBox viewTarget xChannelSelector yChannelSelector zoomAndPan
  )
};
277
## Map from a qualified attribute name to
## |[namespace URI, [prefix, local name]]|.
my $foreign_attr_xname = {
  (map { ("xlink:$_" => [$XLINK_NS, ['xlink', $_]]) }
       qw(actuate arcrole href role show title type)),
  (map { ("xml:$_" => [$XML_NS, ['xml', $_]]) }
       qw(base lang space)),
  'xmlns' => [$XMLNS_NS, [undef, 'xmlns']],
  'xmlns:xlink' => [$XMLNS_NS, ['xmlns', 'xlink']],
};
292
293 ## ISSUE: xmlns:xlink="non-xlink-ns" is not an error.
294
## Map from a code point in the range 0x80..0x9F to the code point that
## replaces it.  The replacements are listed in key order, eight per row,
## and the keys are derived from the position in the list.
my $c1_entity_char = do {
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80-0x87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88-0x8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90-0x97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98-0x9F
  );
  +{ map { (0x80 + $_ => $replacement[$_]) } 0 .. $#replacement };
}; # $c1_entity_char
329
## Parse an HTML document held in a byte string.
##
## Arguments (after the invocant): the charset name (may be |undef|),
## the byte string or a reference to it, and any further arguments,
## which are forwarded unchanged to |parse_byte_stream|.
##
## Returns whatever |parse_byte_stream| returns.  Dies if the string
## cannot be opened as an in-memory file; previously such a failure was
## ignored and an unopened handle was handed to |parse_byte_stream|.
sub parse_byte_string ($$$$;$) {
  my $self = shift;
  my $charset_name = shift;
  my $bytes_ref = ref $_[0] ? $_[0] : \($_[0]);
  open my $input, '<', $bytes_ref
      or die "parse_byte_string: cannot open in-memory byte stream: $!";
  return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]);
} # parse_byte_string
336
## Parse an HTML document from a byte stream, first deciding which
## character encoding to decode it with.
##
## Invocant: a parser object, or a class name (a parser is then created
## with |new|).
## Arguments (after the invocant):
##   - the charset name to try first, or |undef|;
##   - the byte stream (an object providing |getc|);
##   - the remaining arguments, forwarded to |parse_char_stream|; the
##     third argument overall, if true, is the error callback.
##
## Sets |$self->{input_encoding}| and |$self->{confident}|, installs the
## |$self->{change_encoding}| callback, and returns the value returned
## by |parse_char_stream|.
337 sub parse_byte_stream ($$$$;$) {
338 my $self = ref $_[0] ? shift : shift->new;
339 my $charset_name = shift;
340 my $byte_stream = $_[0];
341
## Default error handler: emit a warning carrying the error type.
342 my $onerror = $_[2] || sub {
343 my (%opt) = @_;
344 warn "Parse error ($opt{type})\n";
345 };
346 $self->{parse_error} = $onerror; # updated later by parse_char_string
347
348 ## HTML5 encoding sniffing algorithm
349 require Message::Charset::Info;
350 my $charset;
351 my $buffer;
352 my ($char_stream, $e_status);
353
354 SNIFFING: {
355
356 ## Step 1
## A charset name supplied by the caller is tried first; if a decode
## handle is obtained the result is marked as confident.
357 if (defined $charset_name) {
358 $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
359
360 ## ISSUE: Unsupported encoding is not ignored according to the spec.
361 ($char_stream, $e_status) = $charset->get_decode_handle
362 ($byte_stream, allow_error_reporting => 1,
363 allow_fallback => 1);
364 if ($char_stream) {
365 $self->{confident} = 1;
366 last SNIFFING;
367 } else {
368 ## TODO: unsupported error
369 }
370 }
371
372 ## Step 2
## Read at most 1024 bytes from the stream into |$byte_buffer|.
373 my $byte_buffer = '';
374 for (1..1024) {
375 my $char = $byte_stream->getc;
376 last unless defined $char;
377 $byte_buffer .= $char;
378 } ## TODO: timeout
379
380 ## Step 3
## A leading byte order mark selects utf-16be, utf-16le or utf-8 and
## marks the result as confident.
381 if ($byte_buffer =~ /^\xFE\xFF/) {
382 $charset = Message::Charset::Info->get_by_iana_name ('utf-16be');
383 ($char_stream, $e_status) = $charset->get_decode_handle
384 ($byte_stream, allow_error_reporting => 1,
385 allow_fallback => 1, byte_buffer => \$byte_buffer);
386 $self->{confident} = 1;
387 last SNIFFING;
388 } elsif ($byte_buffer =~ /^\xFF\xFE/) {
389 $charset = Message::Charset::Info->get_by_iana_name ('utf-16le');
390 ($char_stream, $e_status) = $charset->get_decode_handle
391 ($byte_stream, allow_error_reporting => 1,
392 allow_fallback => 1, byte_buffer => \$byte_buffer);
393 $self->{confident} = 1;
394 last SNIFFING;
395 } elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) {
396 $charset = Message::Charset::Info->get_by_iana_name ('utf-8');
397 ($char_stream, $e_status) = $charset->get_decode_handle
398 ($byte_stream, allow_error_reporting => 1,
399 allow_fallback => 1, byte_buffer => \$byte_buffer);
400 $self->{confident} = 1;
401 last SNIFFING;
402 }
403
404 ## Step 4
405 ## TODO: <meta charset>
406
407 ## Step 5
408 ## TODO: from history
409
410 ## Step 6
## Ask the charset detector about the buffered bytes.  A result from
## here is reported as a parse error at |info_level| and is not marked
## as confident.
411 require Whatpm::Charset::UniversalCharDet;
412 $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string
413 ($byte_buffer);
414 if (defined $charset_name) {
415 $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
416
417 ## ISSUE: Unsupported encoding is not ignored according to the spec.
418 require Whatpm::Charset::DecodeHandle;
419 $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
420 ($byte_stream);
421 ($char_stream, $e_status) = $charset->get_decode_handle
422 ($buffer, allow_error_reporting => 1,
423 allow_fallback => 1, byte_buffer => \$byte_buffer);
424 if ($char_stream) {
425 $buffer->{buffer} = $byte_buffer;
426 !!!parse-error (type => 'sniffing:chardet', ## TODO: type name
427 value => $charset_name,
428 level => $self->{info_level},
429 line => 1, column => 1);
430 $self->{confident} = 0;
431 last SNIFFING;
432 }
433 }
434
435 ## Step 7: default
436 ## TODO: Make this configurable.
437 $charset = Message::Charset::Info->get_by_iana_name ('windows-1252');
438 ## NOTE: We choose |windows-1252| here, since |utf-8| should be
439 ## detectable in the step 6.
440 require Whatpm::Charset::DecodeHandle;
441 $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
442 ($byte_stream);
443 ($char_stream, $e_status)
444 = $charset->get_decode_handle ($buffer,
445 allow_error_reporting => 1,
446 allow_fallback => 1,
447 byte_buffer => \$byte_buffer);
448 $buffer->{buffer} = $byte_buffer;
449 !!!parse-error (type => 'sniffing:default', ## TODO: type name
450 value => 'windows-1252',
451 level => $self->{info_level},
452 line => 1, column => 1);
453 $self->{confident} = 0;
454 } # SNIFFING
455
## Record the chosen encoding and report, at |unsupported_level|, when
## the decoder status flags indicate a fallback implementation or one
## without error reporting.
456 $self->{input_encoding} = $charset->get_iana_name;
457 if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
458 !!!parse-error (type => 'chardecode:fallback', ## TODO: type name
459 value => $self->{input_encoding},
460 level => $self->{unsupported_level},
461 line => 1, column => 1);
462 } elsif (not ($e_status &
463 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
464 !!!parse-error (type => 'chardecode:no error', ## TODO: type name
465 value => $self->{input_encoding},
466 level => $self->{unsupported_level},
467 line => 1, column => 1);
468 }
469
## Callback taking (parser, charset name, token).  It does nothing when
## no decode handle can be obtained for the requested charset.
## Otherwise it either confirms the current encoding and returns, or
## throws |Whatpm::HTML::RestartParser|.  It shares |$charset|,
## |$char_stream| and |$e_status| with the |catch| clause below.
470 $self->{change_encoding} = sub {
471 my $self = shift;
472 $charset_name = shift;
473 my $token = shift;
474
475 $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
476 ($char_stream, $e_status) = $charset->get_decode_handle
477 ($byte_stream, allow_error_reporting => 1, allow_fallback => 1,
478 byte_buffer => \ $buffer->{buffer});
479
480 if ($char_stream) { # if supported
481 ## "Change the encoding" algorithm:
482
483 ## Step 1
## A charset in the UTF-16 category is replaced by utf-8.
484 if ($charset->{category} &
485 Message::Charset::Info::CHARSET_CATEGORY_UTF16 ()) {
486 $charset = Message::Charset::Info->get_by_iana_name ('utf-8');
487 ($char_stream, $e_status) = $charset->get_decode_handle
488 ($byte_stream,
489 byte_buffer => \ $buffer->{buffer});
490 }
491 $charset_name = $charset->get_iana_name;
492
493 ## Step 2
## The requested encoding is the one already in use: nothing to redo.
494 if (defined $self->{input_encoding} and
495 $self->{input_encoding} eq $charset_name) {
496 !!!parse-error (type => 'charset label:matching', ## TODO: type
497 value => $charset_name,
498 level => $self->{info_level});
499 $self->{confident} = 1;
500 return;
501 }
502
503 !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
504 ':'.$charset_name, level => 'w', token => $token);
505
506 ## Step 3
507 # if (can) {
508 ## change the encoding on the fly.
509 #$self->{confident} = 1;
510 #return;
511 # }
512
513 ## Step 4
514 throw Whatpm::HTML::RestartParser ();
515 }
516 }; # $self->{change_encoding}
517
## Decoder error callback: report the error at the current position and,
## when the decoder passes an |octets| reference, overwrite the
## referenced value with U+FFFD.
518 my $char_onerror = sub {
519 my (undef, $type, %opt) = @_;
520 !!!parse-error (%opt, type => $type,
521 line => $self->{line}, column => $self->{column} + 1);
522 if ($opt{octets}) {
523 ${$opt{octets}} = "\x{FFFD}"; # replacement character
524 }
525 };
526 $char_stream->onerror ($char_onerror);
527
## Forward every argument except the byte stream to |parse_char_stream|.
528 my @args = @_; shift @args; # $s
529 my $return;
530 try {
531 $return = $self->parse_char_stream ($char_stream, @args);
532 } catch Whatpm::HTML::RestartParser with {
533 ## NOTE: Invoked after {change_encoding}.
534
## Parse again with the decode handle chosen by |change_encoding|.
535 $self->{input_encoding} = $charset->get_iana_name;
536 if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
537 !!!parse-error (type => 'chardecode:fallback', ## TODO: type name
538 value => $self->{input_encoding},
539 level => $self->{unsupported_level},
540 line => 1, column => 1);
541 } elsif (not ($e_status &
542 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
543 !!!parse-error (type => 'chardecode:no error', ## TODO: type name
544 value => $self->{input_encoding},
545 level => $self->{unsupported_level},
546 line => 1, column => 1);
547 }
548 $self->{confident} = 1;
549 $char_stream->onerror ($char_onerror);
550 $return = $self->parse_char_stream ($char_stream, @args);
551 };
552 return $return;
553 } # parse_byte_stream
554
555 ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
556 ## and the HTML layer MUST ignore it. However, we do strip the BOM in
557 ## the encoding layer and the HTML layer does not ignore any U+FEFF,
558 ## because the core part of our HTML parser expects a string of characters,
559 ## not a string of bytes or code units or anything which might contain a BOM.
560 ## Therefore, any parser interface that accepts a string of bytes,
561 ## such as |parse_byte_string| in this module, must ensure that it does
562 ## strip the BOM and never strips any ZWNBSP.
563
## Parse an HTML document held in a character string.
##
## Arguments (after the invocant): the string or a reference to it, and
## any further arguments, which are forwarded unchanged to
## |parse_char_stream|.
##
## Returns whatever |parse_char_stream| returns.  Dies if the in-memory
## handle cannot be opened.
##
## In-memory file handles treat the underlying scalar as a string of
## octets.  A UTF-8-flagged string is therefore first copied and encoded
## to UTF-8 octets, and the copy is read back through the |:utf8| layer;
## opening the flagged scalar directly fails for code points above 0xFF
## and decodes other upgraded strings incorrectly.  A string without the
## flag is read as is, one character per byte.
sub parse_char_string ($$$;$) {
  my $self = shift;
  require utf8;
  my $s = ref $_[0] ? $_[0] : \($_[0]);
  my $input;
  if (utf8::is_utf8 ($$s)) {
    my $octets = $$s;
    utf8::encode ($octets);
    ## The handle keeps its own reference to |$octets|, so the copy
    ## outlives this block.
    open $input, '<:utf8', \$octets
        or die "parse_char_string: cannot open in-memory character stream: $!";
  } else {
    open $input, '<', $s
        or die "parse_char_string: cannot open in-memory character stream: $!";
  }
  return $self->parse_char_stream ($input, @_[1..$#_]);
} # parse_char_string
*parse_string = \&parse_char_string;
572
## Parse an HTML document from a character stream into a document.
##
## Invocant: a parser object, or a class name (a parser is then created
## with |new|).
## Arguments (after the invocant):
##   - the input stream (an object providing |getc|);
##   - the document object; its child node list is emptied first;
##   - optionally, an error callback.  It receives key/value pairs,
##     starting with the current |line| and |column|.
##
## Returns the document object.
573 sub parse_char_stream ($$$;$) {
574 my $self = ref $_[0] ? shift : shift->new;
575 my $input = $_[0];
576 $self->{document} = $_[1];
577 @{$self->{document}->child_nodes} = ();
578
579 ## NOTE: |set_inner_html| copies most of this method's code
580
581 $self->{confident} = 1 unless exists $self->{confident};
582 $self->{document}->input_encoding ($self->{input_encoding})
583 if defined $self->{input_encoding};
584
585 my $i = 0;
586 $self->{line_prev} = $self->{line} = 1;
587 $self->{column_prev} = $self->{column} = 0;
## Input callback, called with the parser as its first argument.  It
## stores the code point of the next input character in
## |$self->{next_char}| (-1 at the end of the input) and keeps the
## |prev_char| history and the line/column counters up to date.
588 $self->{set_next_char} = sub {
589 my $self = shift;
590
## Shift the previous value of |next_char| into the history.
591 pop @{$self->{prev_char}};
592 unshift @{$self->{prev_char}}, $self->{next_char};
593
## Use the character read ahead by the CR handling below, if any.
594 my $char;
595 if (defined $self->{next_next_char}) {
596 $char = $self->{next_next_char};
597 delete $self->{next_next_char};
598 } else {
599 $char = $input->getc;
600 }
## No more input: set |next_char| to -1 and return.
601 $self->{next_char} = -1 and return unless defined $char;
602 $self->{next_char} = ord $char;
603
604 ($self->{line_prev}, $self->{column_prev})
605 = ($self->{line}, $self->{column});
606 $self->{column}++;
607
608 if ($self->{next_char} == 0x000A) { # LF
609 !!!cp ('j1');
610 $self->{line}++;
611 $self->{column} = 0;
612 } elsif ($self->{next_char} == 0x000D) { # CR
613 !!!cp ('j2');
## Read one character ahead: an LF directly after the CR is dropped,
## any other character is kept for the next call.  Either way the CR
## itself is reported as LF.
614 my $next = $input->getc;
615 if (defined $next and $next ne "\x0A") {
616 $self->{next_next_char} = $next;
617 }
618 $self->{next_char} = 0x000A; # LF # MUST
619 $self->{line}++;
620 $self->{column} = 0;
621 } elsif ($self->{next_char} > 0x10FFFF) {
622 !!!cp ('j3');
623 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
624 } elsif ($self->{next_char} == 0x0000) { # NULL
625 !!!cp ('j4');
626 !!!parse-error (type => 'NULL');
627 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
## The code points tested below are reported as a 'control char' parse
## error, but |next_char| is left unchanged.
628 } elsif ($self->{next_char} <= 0x0008 or
629 (0x000E <= $self->{next_char} and $self->{next_char} <= 0x001F) or
630 (0x007F <= $self->{next_char} and $self->{next_char} <= 0x009F) or
631 (0xD800 <= $self->{next_char} and $self->{next_char} <= 0xDFFF) or
632 (0xFDD0 <= $self->{next_char} and $self->{next_char} <= 0xFDDF) or
633 {
634 0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
635 0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
636 0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
637 0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
638 0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
639 0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
640 0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
641 0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
642 0x10FFFE => 1, 0x10FFFF => 1,
643 }->{$self->{next_char}}) {
644 !!!cp ('j5');
645 !!!parse-error (type => 'control char', level => $self->{must_level});
646 ## TODO: error type documentation
647 }
648 };
## The history starts out as three "no character" (-1) entries.
649 $self->{prev_char} = [-1, -1, -1];
650 $self->{next_char} = -1;
651
## Default error handler: warn with the error type and the position,
## taken from the token when one is given.
652 my $onerror = $_[2] || sub {
653 my (%opt) = @_;
654 my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
655 my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
656 warn "Parse error ($opt{type}) at line $line column $column\n";
657 };
## The current line and column are passed first, so a caller-supplied
## |line| or |column| comes later in the argument list.
658 $self->{parse_error} = sub {
659 $onerror->(line => $self->{line}, column => $self->{column}, @_);
660 };
661
662 $self->_initialize_tokenizer;
663 $self->_initialize_tree_constructor;
664 $self->_construct_tree;
665 $self->_terminate_tree_constructor;
666
## The |parse_error| closure refers to |$self|, which in turn holds the
## closure; deleting it breaks that cycle.
667 delete $self->{parse_error}; # remove loop
668
669 return $self->{document};
670 } # parse_char_stream
671
## Create a new parser object.
##
## The object starts out with the default error level codes and with
## placeholder callbacks: |set_next_char| reports the end of the input,
## while |parse_error|, |change_encoding| and
## |application_cache_selection| do nothing.
##
## The |set_next_char| placeholder takes the parser as its first
## argument, like the callbacks installed by |parse_char_stream| and
## |parse_byte_stream|.  It must not close over |$self|: the object
## would then hold a closure that holds the object, and a parser that is
## never used for parsing would never be freed.
sub new ($) {
  my $class = shift;
  my $self = bless {
    must_level => 'm',
    should_level => 's',
    good_level => 'w',
    warn_level => 'w',
    info_level => 'i',
    unsupported_level => 'u',
  }, $class;
  $self->{set_next_char} = sub {
    my $parser = shift;
    $parser->{next_char} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new
699
## Content model feature bits.
sub CM_ENTITY ()         { 1 << 0 } # & markup in data
sub CM_LIMITED_MARKUP () { 1 << 1 } # < markup in data (limited)
sub CM_FULL_MARKUP ()    { 1 << 2 } # < markup in data (any)

## Content models, each the combination of the feature bits it enables.
sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL ()     { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL ()    { CM_LIMITED_MARKUP | CM_ENTITY }
sub PCDATA_CONTENT_MODEL ()    { CM_FULL_MARKUP | CM_ENTITY }
708
## Tokenizer state identifiers (the values |$self->{state}| is compared
## against), numbered consecutively from 0.
use constant {
  DATA_STATE                                    => 0,
  ENTITY_DATA_STATE                             => 1,
  TAG_OPEN_STATE                                => 2,
  CLOSE_TAG_OPEN_STATE                          => 3,
  TAG_NAME_STATE                                => 4,
  BEFORE_ATTRIBUTE_NAME_STATE                   => 5,
  ATTRIBUTE_NAME_STATE                          => 6,
  AFTER_ATTRIBUTE_NAME_STATE                    => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE                  => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE                => 11,
  ENTITY_IN_ATTRIBUTE_VALUE_STATE               => 12,
  MARKUP_DECLARATION_OPEN_STATE                 => 13,
  COMMENT_START_STATE                           => 14,
  COMMENT_START_DASH_STATE                      => 15,
  COMMENT_STATE                                 => 16,
  COMMENT_END_STATE                             => 17,
  COMMENT_END_DASH_STATE                        => 18,
  BOGUS_COMMENT_STATE                           => 19,
  DOCTYPE_STATE                                 => 20,
  BEFORE_DOCTYPE_NAME_STATE                     => 21,
  DOCTYPE_NAME_STATE                            => 22,
  AFTER_DOCTYPE_NAME_STATE                      => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         => 31,
  BOGUS_DOCTYPE_STATE                           => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE            => 33,
  SELF_CLOSING_START_TAG_STATE                  => 34,
  CDATA_BLOCK_STATE                             => 35,
};
745
## Token type identifiers (the values stored in a token's |type| field).
use constant {
  DOCTYPE_TOKEN     => 1,
  COMMENT_TOKEN     => 2,
  START_TAG_TOKEN   => 3,
  END_TAG_TOKEN     => 4,
  END_OF_FILE_TOKEN => 5,
  CHARACTER_TOKEN   => 6,
};
752
## Insertion mode group bits.  Bits 0 and 1 are left free; they tell
## apart the individual modes that share a group.
sub AFTER_HTML_IMS ()        { 1 << 2 }
sub HEAD_IMS ()              { 1 << 3 }
sub BODY_IMS ()              { 1 << 4 }
sub BODY_TABLE_IMS ()        { 1 << 5 }
sub TABLE_IMS ()             { 1 << 6 }
sub ROW_IMS ()               { 1 << 7 }
sub BODY_AFTER_IMS ()        { 1 << 8 }
sub FRAME_IMS ()             { 1 << 9 }
sub SELECT_IMS ()            { 1 << 10 }
sub IN_FOREIGN_CONTENT_IM () { 1 << 11 }
  ## NOTE: "in foreign content" insertion mode is special; it is combined
  ## with the secondary insertion mode.  In this parser, they are stored
  ## together in the bit-or'ed form.

## NOTE: "initial" and "before html" insertion modes have no constants.

## NOTE: "after after body" insertion mode.
sub AFTER_HTML_BODY_IM () { AFTER_HTML_IMS | BODY_AFTER_IMS }

## NOTE: "after after frameset" insertion mode.
sub AFTER_HTML_FRAMESET_IM () { AFTER_HTML_IMS | FRAME_IMS }

## Individual insertion modes: group bits plus a number in the low bits.
sub IN_HEAD_IM ()            { HEAD_IMS | 0 }
sub IN_HEAD_NOSCRIPT_IM ()   { HEAD_IMS | 1 }
sub AFTER_HEAD_IM ()         { HEAD_IMS | 2 }
sub BEFORE_HEAD_IM ()        { HEAD_IMS | 3 }
sub IN_BODY_IM ()            { BODY_IMS }
sub IN_CELL_IM ()            { BODY_IMS | BODY_TABLE_IMS | 1 }
sub IN_CAPTION_IM ()         { BODY_IMS | BODY_TABLE_IMS | 2 }
sub IN_ROW_IM ()             { TABLE_IMS | ROW_IMS | 1 }
sub IN_TABLE_BODY_IM ()      { TABLE_IMS | ROW_IMS | 2 }
sub IN_TABLE_IM ()           { TABLE_IMS }
sub AFTER_BODY_IM ()         { BODY_AFTER_IMS }
sub IN_FRAMESET_IM ()        { FRAME_IMS | 1 }
sub AFTER_FRAMESET_IM ()     { FRAME_IMS | 2 }
sub IN_SELECT_IM ()          { SELECT_IMS | 1 }
sub IN_SELECT_IN_TABLE_IM () { SELECT_IMS | 2 }
sub IN_COLUMN_GROUP_IM ()    { 2 }
791
792 ## Implementations MUST act as if state machine in the spec
793
## Reset the tokenizer: start in the data state with the PCDATA content
## model, clear the per-token fields and the self-closing flag, empty
## the |char| and |token| lists, and read the first input character.
sub _initialize_tokenizer ($) {
  my $self = shift;

  @$self{qw(state content_model)}
      = (DATA_STATE, PCDATA_CONTENT_MODEL); # MUST be

  ## |current_token| is a start tag, end tag, comment, or DOCTYPE.
  $self->{$_} = undef
      for qw(current_token current_attribute
             last_emitted_start_tag_name last_attribute_value_state);
  delete $self->{self_closing};

  $self->{char} = [];
  # $self->{next_char}
  !!!next-input-character;
  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
809
810 ## A token has:
811 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
812 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
813 ## ->{name} (DOCTYPE_TOKEN)
814 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
815 ## ->{public_identifier} (DOCTYPE_TOKEN)
816 ## ->{system_identifier} (DOCTYPE_TOKEN)
817 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
818 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
819 ## ->{name}
820 ## ->{value}
821 ## ->{has_reference} == 1 or 0
822 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
823 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
824 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
825 ## while the token is pushed back to the stack.
826
827 ## ISSUE: "When a DOCTYPE token is created, its
828 ## <i>self-closing flag</i> must be unset (its other state is that it
829 ## be set), and its attributes list must be empty.": Wrong subject?
830
831 ## Emitted token MUST immediately be handled by the tree construction state.
832
833 ## Before each step, UA MAY check to see if either one of the scripts in
834 ## "list of scripts that will execute as soon as possible" or the first
835 ## script in the "list of scripts that will execute asynchronously",
836 ## has completed loading. If one has, then it MUST be executed
837 ## and removed from the list.
838
839 ## NOTE: HTML5 "Writing HTML documents" section, applied to
840 ## documents and not to user agents and conformance checkers,
841 ## contains some requirements that are not detected by the
842 ## parsing algorithm:
843 ## - Some requirements on character encoding declarations. ## TODO
844 ## - "Elements MUST NOT contain content that their content model disallows."
845 ## ... Some are parse error, some are not (will be reported by c.c.).
846 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
847 ## - Text (in elements, attributes, and comments) SHOULD NOT contain
848 ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
849
850 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
851 ## be detected by the HTML5 parsing algorithm:
852 ## - Text,
853
854 sub _get_next_token ($) {
855 my $self = shift;
856
857 if ($self->{self_closing}) {
858 !!!parse-error (type => 'nestc', token => $self->{current_token});
859 ## NOTE: The |self_closing| flag is only set by start tag token.
860 ## In addition, when a start tag token is emitted, it is always set to
861 ## |current_token|.
862 delete $self->{self_closing};
863 }
864
865 if (@{$self->{token}}) {
866 $self->{self_closing} = $self->{token}->[0]->{self_closing};
867 return shift @{$self->{token}};
868 }
869
870 A: {
871 if ($self->{state} == DATA_STATE) {
872 if ($self->{next_char} == 0x0026) { # &
873 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
874 not $self->{escape}) {
875 !!!cp (1);
876 $self->{state} = ENTITY_DATA_STATE;
877 !!!next-input-character;
878 redo A;
879 } else {
880 !!!cp (2);
881 #
882 }
883 } elsif ($self->{next_char} == 0x002D) { # -
884 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
885 unless ($self->{escape}) {
886 if ($self->{prev_char}->[0] == 0x002D and # -
887 $self->{prev_char}->[1] == 0x0021 and # !
888 $self->{prev_char}->[2] == 0x003C) { # <
889 !!!cp (3);
890 $self->{escape} = 1;
891 } else {
892 !!!cp (4);
893 }
894 } else {
895 !!!cp (5);
896 }
897 }
898
899 #
900 } elsif ($self->{next_char} == 0x003C) { # <
901 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
902 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
903 not $self->{escape})) {
904 !!!cp (6);
905 $self->{state} = TAG_OPEN_STATE;
906 !!!next-input-character;
907 redo A;
908 } else {
909 !!!cp (7);
910 #
911 }
912 } elsif ($self->{next_char} == 0x003E) { # >
913 if ($self->{escape} and
914 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
915 if ($self->{prev_char}->[0] == 0x002D and # -
916 $self->{prev_char}->[1] == 0x002D) { # -
917 !!!cp (8);
918 delete $self->{escape};
919 } else {
920 !!!cp (9);
921 }
922 } else {
923 !!!cp (10);
924 }
925
926 #
927 } elsif ($self->{next_char} == -1) {
928 !!!cp (11);
929 !!!emit ({type => END_OF_FILE_TOKEN,
930 line => $self->{line}, column => $self->{column}});
931 last A; ## TODO: ok?
932 } else {
933 !!!cp (12);
934 }
935 # Anything else
936 my $token = {type => CHARACTER_TOKEN,
937 data => chr $self->{next_char},
938 line => $self->{line}, column => $self->{column},
939 };
940 ## Stay in the data state
941 !!!next-input-character;
942
943 !!!emit ($token);
944
945 redo A;
946 } elsif ($self->{state} == ENTITY_DATA_STATE) {
947 ## (cannot happen in CDATA state)
948
949 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
950
951 my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
952
953 $self->{state} = DATA_STATE;
954 # next-input-character is already done
955
956 unless (defined $token) {
957 !!!cp (13);
958 !!!emit ({type => CHARACTER_TOKEN, data => '&',
959 line => $l, column => $c,
960 });
961 } else {
962 !!!cp (14);
963 !!!emit ($token);
964 }
965
966 redo A;
967 } elsif ($self->{state} == TAG_OPEN_STATE) {
968 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
969 if ($self->{next_char} == 0x002F) { # /
970 !!!cp (15);
971 !!!next-input-character;
972 $self->{state} = CLOSE_TAG_OPEN_STATE;
973 redo A;
974 } else {
975 !!!cp (16);
976 ## reconsume
977 $self->{state} = DATA_STATE;
978
979 !!!emit ({type => CHARACTER_TOKEN, data => '<',
980 line => $self->{line_prev},
981 column => $self->{column_prev},
982 });
983
984 redo A;
985 }
986 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
987 if ($self->{next_char} == 0x0021) { # !
988 !!!cp (17);
989 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
990 !!!next-input-character;
991 redo A;
992 } elsif ($self->{next_char} == 0x002F) { # /
993 !!!cp (18);
994 $self->{state} = CLOSE_TAG_OPEN_STATE;
995 !!!next-input-character;
996 redo A;
997 } elsif (0x0041 <= $self->{next_char} and
998 $self->{next_char} <= 0x005A) { # A..Z
999 !!!cp (19);
1000 $self->{current_token}
1001 = {type => START_TAG_TOKEN,
1002 tag_name => chr ($self->{next_char} + 0x0020),
1003 line => $self->{line_prev},
1004 column => $self->{column_prev}};
1005 $self->{state} = TAG_NAME_STATE;
1006 !!!next-input-character;
1007 redo A;
1008 } elsif (0x0061 <= $self->{next_char} and
1009 $self->{next_char} <= 0x007A) { # a..z
1010 !!!cp (20);
1011 $self->{current_token} = {type => START_TAG_TOKEN,
1012 tag_name => chr ($self->{next_char}),
1013 line => $self->{line_prev},
1014 column => $self->{column_prev}};
1015 $self->{state} = TAG_NAME_STATE;
1016 !!!next-input-character;
1017 redo A;
1018 } elsif ($self->{next_char} == 0x003E) { # >
1019 !!!cp (21);
1020 !!!parse-error (type => 'empty start tag',
1021 line => $self->{line_prev},
1022 column => $self->{column_prev});
1023 $self->{state} = DATA_STATE;
1024 !!!next-input-character;
1025
1026 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
1027 line => $self->{line_prev},
1028 column => $self->{column_prev},
1029 });
1030
1031 redo A;
1032 } elsif ($self->{next_char} == 0x003F) { # ?
1033 !!!cp (22);
1034 !!!parse-error (type => 'pio',
1035 line => $self->{line_prev},
1036 column => $self->{column_prev});
1037 $self->{state} = BOGUS_COMMENT_STATE;
1038 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1039 line => $self->{line_prev},
1040 column => $self->{column_prev},
1041 };
1042 ## $self->{next_char} is intentionally left as is
1043 redo A;
1044 } else {
1045 !!!cp (23);
1046 !!!parse-error (type => 'bare stago',
1047 line => $self->{line_prev},
1048 column => $self->{column_prev});
1049 $self->{state} = DATA_STATE;
1050 ## reconsume
1051
1052 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1053 line => $self->{line_prev},
1054 column => $self->{column_prev},
1055 });
1056
1057 redo A;
1058 }
1059 } else {
1060 die "$0: $self->{content_model} in tag open";
1061 }
1062 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
1063 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
1064 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1065 if (defined $self->{last_emitted_start_tag_name}) {
1066
1067 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
1068 my @next_char;
1069 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) { # compare the input with the last start tag name, one character per iteration
1070 push @next_char, $self->{next_char}; # keep what was read so it can be pushed back on a mismatch
1071 my $name_char = ord substr ($self->{last_emitted_start_tag_name}, $i, 1); # deliberately not named $c: that would shadow the column of "<" used by the emit below
1072 my $name_char_uc = 0x0061 <= $name_char && $name_char <= 0x007A ? $name_char - 0x0020 : $name_char; # a..z is also accepted as A..Z
1073 if ($self->{next_char} == $name_char or $self->{next_char} == $name_char_uc) {
1074 !!!cp (24);
1075 !!!next-input-character;
1076 next TAGNAME;
1077 } else {
1078 !!!cp (25);
1079 $self->{next_char} = shift @next_char; # reconsume
1080 !!!back-next-input-character (@next_char);
1081 $self->{state} = DATA_STATE;
1082 ## Not the expected end tag: "</" is emitted as text, located at the "<" ($l, $c from the enclosing state).
1083 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1084 line => $l, column => $c,
1085 });
1086
1087 redo A;
1088 }
1089 }
1090 push @next_char, $self->{next_char};
1091
1092 unless ($self->{next_char} == 0x0009 or # HT
1093 $self->{next_char} == 0x000A or # LF
1094 $self->{next_char} == 0x000B or # VT
1095 $self->{next_char} == 0x000C or # FF
1096 $self->{next_char} == 0x0020 or # SP
1097 $self->{next_char} == 0x003E or # >
1098 $self->{next_char} == 0x002F or # /
1099 $self->{next_char} == -1) {
1100 !!!cp (26);
1101 $self->{next_char} = shift @next_char; # reconsume
1102 !!!back-next-input-character (@next_char);
1103 $self->{state} = DATA_STATE;
1104 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1105 line => $l, column => $c,
1106 });
1107 redo A;
1108 } else {
1109 !!!cp (27);
1110 $self->{next_char} = shift @next_char;
1111 !!!back-next-input-character (@next_char);
1112 # and consume...
1113 }
1114 } else {
1115 ## No start tag token has ever been emitted
1116 !!!cp (28);
1117 # next-input-character is already done
1118 $self->{state} = DATA_STATE;
1119 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1120 line => $l, column => $c,
1121 });
1122 redo A;
1123 }
1124 }
1125
1126 if (0x0041 <= $self->{next_char} and
1127 $self->{next_char} <= 0x005A) { # A..Z
1128 !!!cp (29);
1129 $self->{current_token}
1130 = {type => END_TAG_TOKEN,
1131 tag_name => chr ($self->{next_char} + 0x0020),
1132 line => $l, column => $c};
1133 $self->{state} = TAG_NAME_STATE;
1134 !!!next-input-character;
1135 redo A;
1136 } elsif (0x0061 <= $self->{next_char} and
1137 $self->{next_char} <= 0x007A) { # a..z
1138 !!!cp (30);
1139 $self->{current_token} = {type => END_TAG_TOKEN,
1140 tag_name => chr ($self->{next_char}),
1141 line => $l, column => $c};
1142 $self->{state} = TAG_NAME_STATE;
1143 !!!next-input-character;
1144 redo A;
1145 } elsif ($self->{next_char} == 0x003E) { # >
1146 !!!cp (31);
1147 !!!parse-error (type => 'empty end tag',
1148 line => $self->{line_prev}, ## "<" in "</>"
1149 column => $self->{column_prev} - 1);
1150 $self->{state} = DATA_STATE;
1151 !!!next-input-character;
1152 redo A;
1153 } elsif ($self->{next_char} == -1) {
1154 !!!cp (32);
1155 !!!parse-error (type => 'bare etago');
1156 $self->{state} = DATA_STATE;
1157 # reconsume
1158
1159 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1160 line => $l, column => $c,
1161 });
1162
1163 redo A;
1164 } else {
1165 !!!cp (33);
1166 !!!parse-error (type => 'bogus end tag');
1167 $self->{state} = BOGUS_COMMENT_STATE;
1168 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1169 line => $self->{line_prev}, # "<" of "</"
1170 column => $self->{column_prev} - 1,
1171 };
1172 ## $self->{next_char} is intentionally left as is
1173 redo A;
1174 }
1175 } elsif ($self->{state} == TAG_NAME_STATE) {
1176 if ($self->{next_char} == 0x0009 or # HT
1177 $self->{next_char} == 0x000A or # LF
1178 $self->{next_char} == 0x000B or # VT
1179 $self->{next_char} == 0x000C or # FF
1180 $self->{next_char} == 0x0020) { # SP
1181 !!!cp (34);
1182 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1183 !!!next-input-character;
1184 redo A;
1185 } elsif ($self->{next_char} == 0x003E) { # >
1186 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1187 !!!cp (35);
1188 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1189 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1190 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1191 #if ($self->{current_token}->{attributes}) {
1192 # ## NOTE: This should never be reached.
1193 # !!! cp (36);
1194 # !!! parse-error (type => 'end tag attribute');
1195 #} else {
1196 !!!cp (37);
1197 #}
1198 } else {
1199 die "$0: $self->{current_token}->{type}: Unknown token type";
1200 }
1201 $self->{state} = DATA_STATE;
1202 !!!next-input-character;
1203
1204 !!!emit ($self->{current_token}); # start tag or end tag
1205
1206 redo A;
1207 } elsif (0x0041 <= $self->{next_char} and
1208 $self->{next_char} <= 0x005A) { # A..Z
1209 !!!cp (38);
1210 $self->{current_token}->{tag_name} .= chr ($self->{next_char} + 0x0020);
1211 # start tag or end tag
1212 ## Stay in this state
1213 !!!next-input-character;
1214 redo A;
1215 } elsif ($self->{next_char} == -1) {
1216 !!!parse-error (type => 'unclosed tag');
1217 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1218 !!!cp (39);
1219 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1220 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1221 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1222 #if ($self->{current_token}->{attributes}) {
1223 # ## NOTE: This state should never be reached.
1224 # !!! cp (40);
1225 # !!! parse-error (type => 'end tag attribute');
1226 #} else {
1227 !!!cp (41);
1228 #}
1229 } else {
1230 die "$0: $self->{current_token}->{type}: Unknown token type";
1231 }
1232 $self->{state} = DATA_STATE;
1233 # reconsume
1234
1235 !!!emit ($self->{current_token}); # start tag or end tag
1236
1237 redo A;
1238 } elsif ($self->{next_char} == 0x002F) { # /
1239 !!!cp (42);
1240 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1241 !!!next-input-character;
1242 redo A;
1243 } else {
1244 !!!cp (44);
1245 $self->{current_token}->{tag_name} .= chr $self->{next_char};
1246 # start tag or end tag
1247 ## Stay in the state
1248 !!!next-input-character;
1249 redo A;
1250 }
1251 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1252 if ($self->{next_char} == 0x0009 or # HT
1253 $self->{next_char} == 0x000A or # LF
1254 $self->{next_char} == 0x000B or # VT
1255 $self->{next_char} == 0x000C or # FF
1256 $self->{next_char} == 0x0020) { # SP
1257 !!!cp (45);
1258 ## Stay in the state
1259 !!!next-input-character;
1260 redo A;
1261 } elsif ($self->{next_char} == 0x003E) { # >
1262 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1263 !!!cp (46);
1264 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1265 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1266 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1267 if ($self->{current_token}->{attributes}) {
1268 !!!cp (47);
1269 !!!parse-error (type => 'end tag attribute');
1270 } else {
1271 !!!cp (48);
1272 }
1273 } else {
1274 die "$0: $self->{current_token}->{type}: Unknown token type";
1275 }
1276 $self->{state} = DATA_STATE;
1277 !!!next-input-character;
1278
1279 !!!emit ($self->{current_token}); # start tag or end tag
1280
1281 redo A;
1282 } elsif (0x0041 <= $self->{next_char} and
1283 $self->{next_char} <= 0x005A) { # A..Z
1284 !!!cp (49);
1285 $self->{current_attribute}
1286 = {name => chr ($self->{next_char} + 0x0020),
1287 value => '',
1288 line => $self->{line}, column => $self->{column}};
1289 $self->{state} = ATTRIBUTE_NAME_STATE;
1290 !!!next-input-character;
1291 redo A;
1292 } elsif ($self->{next_char} == 0x002F) { # /
1293 !!!cp (50);
1294 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1295 !!!next-input-character;
1296 redo A;
1297 } elsif ($self->{next_char} == -1) {
1298 !!!parse-error (type => 'unclosed tag');
1299 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1300 !!!cp (52);
1301 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1302 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1303 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1304 if ($self->{current_token}->{attributes}) {
1305 !!!cp (53);
1306 !!!parse-error (type => 'end tag attribute');
1307 } else {
1308 !!!cp (54);
1309 }
1310 } else {
1311 die "$0: $self->{current_token}->{type}: Unknown token type";
1312 }
1313 $self->{state} = DATA_STATE;
1314 # reconsume
1315
1316 !!!emit ($self->{current_token}); # start tag or end tag
1317
1318 redo A;
1319 } else {
1320 if ({
1321 0x0022 => 1, # "
1322 0x0027 => 1, # '
1323 0x003D => 1, # =
1324 }->{$self->{next_char}}) {
1325 !!!cp (55);
1326 !!!parse-error (type => 'bad attribute name');
1327 } else {
1328 !!!cp (56);
1329 }
1330 $self->{current_attribute}
1331 = {name => chr ($self->{next_char}),
1332 value => '',
1333 line => $self->{line}, column => $self->{column}};
1334 $self->{state} = ATTRIBUTE_NAME_STATE;
1335 !!!next-input-character;
1336 redo A;
1337 }
1338 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1339 my $before_leave = sub {
1340 if (exists $self->{current_token}->{attributes} # start tag or end tag
1341 ->{$self->{current_attribute}->{name}}) { # MUST
1342 !!!cp (57);
1343 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name}, line => $self->{current_attribute}->{line}, column => $self->{current_attribute}->{column});
1344 ## Discard $self->{current_attribute} # MUST
1345 } else {
1346 !!!cp (58);
1347 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
1348 = $self->{current_attribute};
1349 }
1350 }; # $before_leave
1351
1352 if ($self->{next_char} == 0x0009 or # HT
1353 $self->{next_char} == 0x000A or # LF
1354 $self->{next_char} == 0x000B or # VT
1355 $self->{next_char} == 0x000C or # FF
1356 $self->{next_char} == 0x0020) { # SP
1357 !!!cp (59);
1358 $before_leave->();
1359 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1360 !!!next-input-character;
1361 redo A;
1362 } elsif ($self->{next_char} == 0x003D) { # =
1363 !!!cp (60);
1364 $before_leave->();
1365 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1366 !!!next-input-character;
1367 redo A;
1368 } elsif ($self->{next_char} == 0x003E) { # >
1369 $before_leave->();
1370 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1371 !!!cp (61);
1372 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1373 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1374 !!!cp (62);
1375 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1376 if ($self->{current_token}->{attributes}) {
1377 !!!parse-error (type => 'end tag attribute');
1378 }
1379 } else {
1380 die "$0: $self->{current_token}->{type}: Unknown token type";
1381 }
1382 $self->{state} = DATA_STATE;
1383 !!!next-input-character;
1384
1385 !!!emit ($self->{current_token}); # start tag or end tag
1386
1387 redo A;
1388 } elsif (0x0041 <= $self->{next_char} and
1389 $self->{next_char} <= 0x005A) { # A..Z
1390 !!!cp (63);
1391 $self->{current_attribute}->{name} .= chr ($self->{next_char} + 0x0020);
1392 ## Stay in the state
1393 !!!next-input-character;
1394 redo A;
1395 } elsif ($self->{next_char} == 0x002F) { # /
1396 !!!cp (64);
1397 $before_leave->();
1398 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1399 !!!next-input-character;
1400 redo A;
1401 } elsif ($self->{next_char} == -1) {
1402 !!!parse-error (type => 'unclosed tag');
1403 $before_leave->();
1404 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1405 !!!cp (66);
1406 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1407 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1408 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1409 if ($self->{current_token}->{attributes}) {
1410 !!!cp (67);
1411 !!!parse-error (type => 'end tag attribute');
1412 } else {
1413 ## NOTE: This state should never be reached.
1414 !!!cp (68);
1415 }
1416 } else {
1417 die "$0: $self->{current_token}->{type}: Unknown token type";
1418 }
1419 $self->{state} = DATA_STATE;
1420 # reconsume
1421
1422 !!!emit ($self->{current_token}); # start tag or end tag
1423
1424 redo A;
1425 } else {
1426 if ($self->{next_char} == 0x0022 or # "
1427 $self->{next_char} == 0x0027) { # '
1428 !!!cp (69);
1429 !!!parse-error (type => 'bad attribute name');
1430 } else {
1431 !!!cp (70);
1432 }
1433 $self->{current_attribute}->{name} .= chr ($self->{next_char});
1434 ## Stay in the state
1435 !!!next-input-character;
1436 redo A;
1437 }
1438 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1439 if ($self->{next_char} == 0x0009 or # HT
1440 $self->{next_char} == 0x000A or # LF
1441 $self->{next_char} == 0x000B or # VT
1442 $self->{next_char} == 0x000C or # FF
1443 $self->{next_char} == 0x0020) { # SP
1444 !!!cp (71);
1445 ## Stay in the state
1446 !!!next-input-character;
1447 redo A;
1448 } elsif ($self->{next_char} == 0x003D) { # =
1449 !!!cp (72);
1450 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1451 !!!next-input-character;
1452 redo A;
1453 } elsif ($self->{next_char} == 0x003E) { # >
1454 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1455 !!!cp (73);
1456 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1457 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1458 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1459 if ($self->{current_token}->{attributes}) {
1460 !!!cp (74);
1461 !!!parse-error (type => 'end tag attribute');
1462 } else {
1463 ## NOTE: This state should never be reached.
1464 !!!cp (75);
1465 }
1466 } else {
1467 die "$0: $self->{current_token}->{type}: Unknown token type";
1468 }
1469 $self->{state} = DATA_STATE;
1470 !!!next-input-character;
1471
1472 !!!emit ($self->{current_token}); # start tag or end tag
1473
1474 redo A;
1475 } elsif (0x0041 <= $self->{next_char} and
1476 $self->{next_char} <= 0x005A) { # A..Z
1477 !!!cp (76);
1478 $self->{current_attribute}
1479 = {name => chr ($self->{next_char} + 0x0020),
1480 value => '',
1481 line => $self->{line}, column => $self->{column}};
1482 $self->{state} = ATTRIBUTE_NAME_STATE;
1483 !!!next-input-character;
1484 redo A;
1485 } elsif ($self->{next_char} == 0x002F) { # /
1486 !!!cp (77);
1487 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1488 !!!next-input-character;
1489 redo A;
1490 } elsif ($self->{next_char} == -1) {
1491 !!!parse-error (type => 'unclosed tag');
1492 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1493 !!!cp (79);
1494 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1495 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1496 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1497 if ($self->{current_token}->{attributes}) {
1498 !!!cp (80);
1499 !!!parse-error (type => 'end tag attribute');
1500 } else {
1501 ## NOTE: This state should never be reached.
1502 !!!cp (81);
1503 }
1504 } else {
1505 die "$0: $self->{current_token}->{type}: Unknown token type";
1506 }
1507 $self->{state} = DATA_STATE;
1508 # reconsume
1509
1510 !!!emit ($self->{current_token}); # start tag or end tag
1511
1512 redo A;
1513 } else {
1514 !!!cp (82);
1515 $self->{current_attribute}
1516 = {name => chr ($self->{next_char}),
1517 value => '',
1518 line => $self->{line}, column => $self->{column}};
1519 $self->{state} = ATTRIBUTE_NAME_STATE;
1520 !!!next-input-character;
1521 redo A;
1522 }
1523 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1524 if ($self->{next_char} == 0x0009 or # HT
1525 $self->{next_char} == 0x000A or # LF
1526 $self->{next_char} == 0x000B or # VT
1527 $self->{next_char} == 0x000C or # FF
1528 $self->{next_char} == 0x0020) { # SP
1529 !!!cp (83);
1530 ## Stay in the state
1531 !!!next-input-character;
1532 redo A;
1533 } elsif ($self->{next_char} == 0x0022) { # "
1534 !!!cp (84);
1535 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1536 !!!next-input-character;
1537 redo A;
1538 } elsif ($self->{next_char} == 0x0026) { # &
1539 !!!cp (85);
1540 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1541 ## reconsume
1542 redo A;
1543 } elsif ($self->{next_char} == 0x0027) { # '
1544 !!!cp (86);
1545 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1546 !!!next-input-character;
1547 redo A;
1548 } elsif ($self->{next_char} == 0x003E) { # >
1549 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1550 !!!cp (87);
1551 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1552 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1553 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1554 if ($self->{current_token}->{attributes}) {
1555 !!!cp (88);
1556 !!!parse-error (type => 'end tag attribute');
1557 } else {
1558 ## NOTE: This state should never be reached.
1559 !!!cp (89);
1560 }
1561 } else {
1562 die "$0: $self->{current_token}->{type}: Unknown token type";
1563 }
1564 $self->{state} = DATA_STATE;
1565 !!!next-input-character;
1566
1567 !!!emit ($self->{current_token}); # start tag or end tag
1568
1569 redo A;
1570 } elsif ($self->{next_char} == -1) {
1571 !!!parse-error (type => 'unclosed tag');
1572 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1573 !!!cp (90);
1574 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1575 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1576 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1577 if ($self->{current_token}->{attributes}) {
1578 !!!cp (91);
1579 !!!parse-error (type => 'end tag attribute');
1580 } else {
1581 ## NOTE: This state should never be reached.
1582 !!!cp (92);
1583 }
1584 } else {
1585 die "$0: $self->{current_token}->{type}: Unknown token type";
1586 }
1587 $self->{state} = DATA_STATE;
1588 ## reconsume
1589
1590 !!!emit ($self->{current_token}); # start tag or end tag
1591
1592 redo A;
1593 } else {
1594 if ($self->{next_char} == 0x003D) { # =
1595 !!!cp (93);
1596 !!!parse-error (type => 'bad attribute value');
1597 } else {
1598 !!!cp (94);
1599 }
1600 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1601 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1602 !!!next-input-character;
1603 redo A;
1604 }
1605 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1606 if ($self->{next_char} == 0x0022) { # "
1607 !!!cp (95);
1608 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1609 !!!next-input-character;
1610 redo A;
1611 } elsif ($self->{next_char} == 0x0026) { # &
1612 !!!cp (96);
1613 $self->{last_attribute_value_state} = $self->{state};
1614 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1615 !!!next-input-character;
1616 redo A;
1617 } elsif ($self->{next_char} == -1) {
1618 !!!parse-error (type => 'unclosed attribute value');
1619 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1620 !!!cp (97);
1621 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1622 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1623 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1624 if ($self->{current_token}->{attributes}) {
1625 !!!cp (98);
1626 !!!parse-error (type => 'end tag attribute');
1627 } else {
1628 ## NOTE: This state should never be reached.
1629 !!!cp (99);
1630 }
1631 } else {
1632 die "$0: $self->{current_token}->{type}: Unknown token type";
1633 }
1634 $self->{state} = DATA_STATE;
1635 ## reconsume
1636
1637 !!!emit ($self->{current_token}); # start tag or end tag
1638
1639 redo A;
1640 } else {
1641 !!!cp (100);
1642 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1643 ## Stay in the state
1644 !!!next-input-character;
1645 redo A;
1646 }
1647 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1648 if ($self->{next_char} == 0x0027) { # '
1649 !!!cp (101);
1650 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1651 !!!next-input-character;
1652 redo A;
1653 } elsif ($self->{next_char} == 0x0026) { # &
1654 !!!cp (102);
1655 $self->{last_attribute_value_state} = $self->{state};
1656 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1657 !!!next-input-character;
1658 redo A;
1659 } elsif ($self->{next_char} == -1) {
1660 !!!parse-error (type => 'unclosed attribute value');
1661 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1662 !!!cp (103);
1663 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1664 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1665 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1666 if ($self->{current_token}->{attributes}) {
1667 !!!cp (104);
1668 !!!parse-error (type => 'end tag attribute');
1669 } else {
1670 ## NOTE: This state should never be reached.
1671 !!!cp (105);
1672 }
1673 } else {
1674 die "$0: $self->{current_token}->{type}: Unknown token type";
1675 }
1676 $self->{state} = DATA_STATE;
1677 ## reconsume
1678
1679 !!!emit ($self->{current_token}); # start tag or end tag
1680
1681 redo A;
1682 } else {
1683 !!!cp (106);
1684 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1685 ## Stay in the state
1686 !!!next-input-character;
1687 redo A;
1688 }
1689 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) { # reading an attribute value that is not enclosed in quotes
1690 if ($self->{next_char} == 0x0009 or # HT
1691 $self->{next_char} == 0x000A or # LF
1692 $self->{next_char} == 0x000B or # VT
1693 $self->{next_char} == 0x000C or # FF
1694 $self->{next_char} == 0x0020) { # SP -- white space ends the value
1695 !!!cp (107);
1696 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1697 !!!next-input-character;
1698 redo A;
1699 } elsif ($self->{next_char} == 0x0026) { # &
1700 !!!cp (108);
1701 $self->{last_attribute_value_state} = $self->{state}; # ENTITY_IN_ATTRIBUTE_VALUE_STATE switches back to this saved state when it is done
1702 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1703 !!!next-input-character;
1704 redo A;
1705 } elsif ($self->{next_char} == 0x003E) { # > -- end of the tag: emit the current token and go back to the data state
1706 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1707 !!!cp (109);
1708 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name}; # CLOSE_TAG_OPEN_STATE compares end tag names against this
1709 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1710 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1711 if ($self->{current_token}->{attributes}) {
1712 !!!cp (110);
1713 !!!parse-error (type => 'end tag attribute');
1714 } else {
1715 ## NOTE: This state should never be reached.
1716 !!!cp (111);
1717 }
1718 } else {
1719 die "$0: $self->{current_token}->{type}: Unknown token type";
1720 }
1721 $self->{state} = DATA_STATE;
1722 !!!next-input-character;
1723
1724 !!!emit ($self->{current_token}); # start tag or end tag
1725
1726 redo A;
1727 } elsif ($self->{next_char} == -1) { # end of input inside the tag: report it, then emit the unfinished tag anyway
1728 !!!parse-error (type => 'unclosed tag');
1729 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1730 !!!cp (112);
1731 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1732 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1733 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1734 if ($self->{current_token}->{attributes}) {
1735 !!!cp (113);
1736 !!!parse-error (type => 'end tag attribute');
1737 } else {
1738 ## NOTE: This state should never be reached.
1739 !!!cp (114);
1740 }
1741 } else {
1742 die "$0: $self->{current_token}->{type}: Unknown token type";
1743 }
1744 $self->{state} = DATA_STATE;
1745 ## reconsume (the -1 is not consumed, so the data state sees it next)
1746
1747 !!!emit ($self->{current_token}); # start tag or end tag
1748
1749 redo A;
1750 } else { # any other character becomes part of the value
1751 if ({
1752 0x0022 => 1, # "
1753 0x0027 => 1, # '
1754 0x003D => 1, # =
1755 }->{$self->{next_char}}) { # these three are reported as errors but still appended below
1756 !!!cp (115);
1757 !!!parse-error (type => 'bad attribute value');
1758 } else {
1759 !!!cp (116);
1760 }
1761 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1762 ## Stay in the state
1763 !!!next-input-character;
1764 redo A;
1765 }
1766 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1767 my $token = $self->_tokenize_attempt_to_consume_an_entity
1768 (1,
1769 $self->{last_attribute_value_state}
1770 == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
1771 $self->{last_attribute_value_state}
1772 == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
1773 -1);
1774
1775 unless (defined $token) {
1776 !!!cp (117);
1777 $self->{current_attribute}->{value} .= '&';
1778 } else {
1779 !!!cp (118);
1780 $self->{current_attribute}->{value} .= $token->{data};
1781 $self->{current_attribute}->{has_reference} = $token->{has_reference};
1782 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1783 }
1784
1785 $self->{state} = $self->{last_attribute_value_state};
1786 # next-input-character is already done
1787 redo A;
1788 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1789 if ($self->{next_char} == 0x0009 or # HT
1790 $self->{next_char} == 0x000A or # LF
1791 $self->{next_char} == 0x000B or # VT
1792 $self->{next_char} == 0x000C or # FF
1793 $self->{next_char} == 0x0020) { # SP
1794 !!!cp (118);
1795 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1796 !!!next-input-character;
1797 redo A;
1798 } elsif ($self->{next_char} == 0x003E) { # >
1799 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1800 !!!cp (119);
1801 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1802 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1803 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1804 if ($self->{current_token}->{attributes}) {
1805 !!!cp (120);
1806 !!!parse-error (type => 'end tag attribute');
1807 } else {
1808 ## NOTE: This state should never be reached.
1809 !!!cp (121);
1810 }
1811 } else {
1812 die "$0: $self->{current_token}->{type}: Unknown token type";
1813 }
1814 $self->{state} = DATA_STATE;
1815 !!!next-input-character;
1816
1817 !!!emit ($self->{current_token}); # start tag or end tag
1818
1819 redo A;
1820 } elsif ($self->{next_char} == 0x002F) { # /
1821 !!!cp (122);
1822 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1823 !!!next-input-character;
1824 redo A;
1825 } elsif ($self->{next_char} == -1) {
1826 !!!parse-error (type => 'unclosed tag');
1827 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1828 !!!cp (122.3);
1829 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1830 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1831 if ($self->{current_token}->{attributes}) {
1832 !!!cp (122.1);
1833 !!!parse-error (type => 'end tag attribute');
1834 } else {
1835 ## NOTE: This state should never be reached.
1836 !!!cp (122.2);
1837 }
1838 } else {
1839 die "$0: $self->{current_token}->{type}: Unknown token type";
1840 }
1841 $self->{state} = DATA_STATE;
1842 ## Reconsume.
1843 !!!emit ($self->{current_token}); # start tag or end tag
1844 redo A;
1845 } else {
1846 !!!cp ('124.1');
1847 !!!parse-error (type => 'no space between attributes');
1848 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1849 ## reconsume
1850 redo A;
1851 }
1852 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) { # a "/" was seen inside a tag; what follows decides how the tag ends
1853 if ($self->{next_char} == 0x003E) { # > -- "/>" closes the tag
1854 if ($self->{current_token}->{type} == END_TAG_TOKEN) {
1855 !!!cp ('124.2');
1856 !!!parse-error (type => 'nestc', token => $self->{current_token});
1857 ## TODO: Different type than slash in start tag
1858 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1859 if ($self->{current_token}->{attributes}) {
1860 !!!cp ('124.4');
1861 !!!parse-error (type => 'end tag attribute');
1862 } else {
1863 !!!cp ('124.5');
1864 }
1865 ## TODO: Test |<title></title/>|
1866 } else {
1867 !!!cp ('124.3');
1868 $self->{self_closing} = 1;
1869 }
1870 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name} if $self->{current_token}->{type} == START_TAG_TOKEN; # CLOSE_TAG_OPEN_STATE matches RCDATA/CDATA end tags against this name, so a start tag ended by "/>" must record it like any other emitted start tag
1871 $self->{state} = DATA_STATE;
1872 !!!next-input-character;
1873
1874 !!!emit ($self->{current_token}); # start tag or end tag
1875
1876 redo A;
1877 } elsif ($self->{next_char} == -1) { # end of input right after the "/": report it, then emit the unfinished tag
1878 !!!parse-error (type => 'unclosed tag');
1879 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1880 !!!cp (124.7);
1881 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1882 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1883 if ($self->{current_token}->{attributes}) {
1884 !!!cp (124.5);
1885 !!!parse-error (type => 'end tag attribute');
1886 } else {
1887 ## NOTE: This state should never be reached.
1888 !!!cp (124.6);
1889 }
1890 } else {
1891 die "$0: $self->{current_token}->{type}: Unknown token type";
1892 }
1893 $self->{state} = DATA_STATE;
1894 ## Reconsume.
1895 !!!emit ($self->{current_token}); # start tag or end tag
1896 redo A;
1897 } else { # "/" not followed by ">": report it and continue as if before an attribute name
1898 !!!cp ('124.4');
1899 !!!parse-error (type => 'nestc');
1900 ## TODO: This error type is wrong. NOTE(review): cp id '124.4' is also used in the ">" branch of this state -- confirm whether the ids are meant to be unique.
1901 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1902 ## Reconsume.
1903 redo A;
1904 }
1905 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1906 ## (only happen if PCDATA state)
1907
1908 ## NOTE: Set by the previous state
1909 #my $token = {type => COMMENT_TOKEN, data => ''};
1910
1911 BC: {
1912 if ($self->{next_char} == 0x003E) { # >
1913 !!!cp (124);
1914 $self->{state} = DATA_STATE;
1915 !!!next-input-character;
1916
1917 !!!emit ($self->{current_token}); # comment
1918
1919 redo A;
1920 } elsif ($self->{next_char} == -1) {
1921 !!!cp (125);
1922 $self->{state} = DATA_STATE;
1923 ## reconsume
1924
1925 !!!emit ($self->{current_token}); # comment
1926
1927 redo A;
1928 } else {
1929 !!!cp (126);
1930 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
1931 !!!next-input-character;
1932 redo BC;
1933 }
1934 } # BC
1935
1936 die "$0: _get_next_token: unexpected case [BC]";
1937 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1938 ## (only happen if PCDATA state)
1939
1940 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1);
1941
1942 my @next_char;
1943 push @next_char, $self->{next_char};
1944
1945 if ($self->{next_char} == 0x002D) { # -
1946 !!!next-input-character;
1947 push @next_char, $self->{next_char};
1948 if ($self->{next_char} == 0x002D) { # -
1949 !!!cp (127);
1950 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1951 line => $l, column => $c,
1952 };
1953 $self->{state} = COMMENT_START_STATE;
1954 !!!next-input-character;
1955 redo A;
1956 } else {
1957 !!!cp (128);
1958 }
1959 } elsif ($self->{next_char} == 0x0044 or # D
1960 $self->{next_char} == 0x0064) { # d
1961 !!!next-input-character;
1962 push @next_char, $self->{next_char};
1963 if ($self->{next_char} == 0x004F or # O
1964 $self->{next_char} == 0x006F) { # o
1965 !!!next-input-character;
1966 push @next_char, $self->{next_char};
1967 if ($self->{next_char} == 0x0043 or # C
1968 $self->{next_char} == 0x0063) { # c
1969 !!!next-input-character;
1970 push @next_char, $self->{next_char};
1971 if ($self->{next_char} == 0x0054 or # T
1972 $self->{next_char} == 0x0074) { # t
1973 !!!next-input-character;
1974 push @next_char, $self->{next_char};
1975 if ($self->{next_char} == 0x0059 or # Y
1976 $self->{next_char} == 0x0079) { # y
1977 !!!next-input-character;
1978 push @next_char, $self->{next_char};
1979 if ($self->{next_char} == 0x0050 or # P
1980 $self->{next_char} == 0x0070) { # p
1981 !!!next-input-character;
1982 push @next_char, $self->{next_char};
1983 if ($self->{next_char} == 0x0045 or # E
1984 $self->{next_char} == 0x0065) { # e
1985 !!!cp (129);
1986 ## TODO: What a stupid code this is!
1987 $self->{state} = DOCTYPE_STATE;
1988 $self->{current_token} = {type => DOCTYPE_TOKEN,
1989 quirks => 1,
1990 line => $l, column => $c,
1991 };
1992 !!!next-input-character;
1993 redo A;
1994 } else {
1995 !!!cp (130);
1996 }
1997 } else {
1998 !!!cp (131);
1999 }
2000 } else {
2001 !!!cp (132);
2002 }
2003 } else {
2004 !!!cp (133);
2005 }
2006 } else {
2007 !!!cp (134);
2008 }
2009 } else {
2010 !!!cp (135);
2011 }
2012 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2013 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
2014 $self->{next_char} == 0x005B) { # [
2015 !!!next-input-character;
2016 push @next_char, $self->{next_char};
2017 if ($self->{next_char} == 0x0043) { # C
2018 !!!next-input-character;
2019 push @next_char, $self->{next_char};
2020 if ($self->{next_char} == 0x0044) { # D
2021 !!!next-input-character;
2022 push @next_char, $self->{next_char};
2023 if ($self->{next_char} == 0x0041) { # A
2024 !!!next-input-character;
2025 push @next_char, $self->{next_char};
2026 if ($self->{next_char} == 0x0054) { # T
2027 !!!next-input-character;
2028 push @next_char, $self->{next_char};
2029 if ($self->{next_char} == 0x0041) { # A
2030 !!!next-input-character;
2031 push @next_char, $self->{next_char};
2032 if ($self->{next_char} == 0x005B) { # [
2033 !!!cp (135.1);
2034 $self->{state} = CDATA_BLOCK_STATE;
2035 !!!next-input-character;
2036 redo A;
2037 } else {
2038 !!!cp (135.2);
2039 }
2040 } else {
2041 !!!cp (135.3);
2042 }
2043 } else {
2044 !!!cp (135.4);
2045 }
2046 } else {
2047 !!!cp (135.5);
2048 }
2049 } else {
2050 !!!cp (135.6);
2051 }
2052 } else {
2053 !!!cp (135.7);
2054 }
2055 } else {
2056 !!!cp (136);
2057 }
2058
2059 !!!parse-error (type => 'bogus comment');
2060 $self->{next_char} = shift @next_char;
2061 !!!back-next-input-character (@next_char);
2062 $self->{state} = BOGUS_COMMENT_STATE;
2063 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
2064 line => $l, column => $c,
2065 };
2066 redo A;
2067
2068 ## ISSUE: typos in spec: chacacters, is is a parse error
2069 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
2070 } elsif ($self->{state} == COMMENT_START_STATE) {
2071 if ($self->{next_char} == 0x002D) { # -
2072 !!!cp (137);
2073 $self->{state} = COMMENT_START_DASH_STATE;
2074 !!!next-input-character;
2075 redo A;
2076 } elsif ($self->{next_char} == 0x003E) { # >
2077 !!!cp (138);
2078 !!!parse-error (type => 'bogus comment');
2079 $self->{state} = DATA_STATE;
2080 !!!next-input-character;
2081
2082 !!!emit ($self->{current_token}); # comment
2083
2084 redo A;
2085 } elsif ($self->{next_char} == -1) {
2086 !!!cp (139);
2087 !!!parse-error (type => 'unclosed comment');
2088 $self->{state} = DATA_STATE;
2089 ## reconsume
2090
2091 !!!emit ($self->{current_token}); # comment
2092
2093 redo A;
2094 } else {
2095 !!!cp (140);
2096 $self->{current_token}->{data} # comment
2097 .= chr ($self->{next_char});
2098 $self->{state} = COMMENT_STATE;
2099 !!!next-input-character;
2100 redo A;
2101 }
2102 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2103 if ($self->{next_char} == 0x002D) { # -
2104 !!!cp (141);
2105 $self->{state} = COMMENT_END_STATE;
2106 !!!next-input-character;
2107 redo A;
2108 } elsif ($self->{next_char} == 0x003E) { # >
2109 !!!cp (142);
2110 !!!parse-error (type => 'bogus comment');
2111 $self->{state} = DATA_STATE;
2112 !!!next-input-character;
2113
2114 !!!emit ($self->{current_token}); # comment
2115
2116 redo A;
2117 } elsif ($self->{next_char} == -1) {
2118 !!!cp (143);
2119 !!!parse-error (type => 'unclosed comment');
2120 $self->{state} = DATA_STATE;
2121 ## reconsume
2122
2123 !!!emit ($self->{current_token}); # comment
2124
2125 redo A;
2126 } else {
2127 !!!cp (144);
2128 $self->{current_token}->{data} # comment
2129 .= '-' . chr ($self->{next_char});
2130 $self->{state} = COMMENT_STATE;
2131 !!!next-input-character;
2132 redo A;
2133 }
2134 } elsif ($self->{state} == COMMENT_STATE) {
2135 if ($self->{next_char} == 0x002D) { # -
2136 !!!cp (145);
2137 $self->{state} = COMMENT_END_DASH_STATE;
2138 !!!next-input-character;
2139 redo A;
2140 } elsif ($self->{next_char} == -1) {
2141 !!!cp (146);
2142 !!!parse-error (type => 'unclosed comment');
2143 $self->{state} = DATA_STATE;
2144 ## reconsume
2145
2146 !!!emit ($self->{current_token}); # comment
2147
2148 redo A;
2149 } else {
2150 !!!cp (147);
2151 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
2152 ## Stay in the state
2153 !!!next-input-character;
2154 redo A;
2155 }
2156 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2157 if ($self->{next_char} == 0x002D) { # -
2158 !!!cp (148);
2159 $self->{state} = COMMENT_END_STATE;
2160 !!!next-input-character;
2161 redo A;
2162 } elsif ($self->{next_char} == -1) {
2163 !!!cp (149);
2164 !!!parse-error (type => 'unclosed comment');
2165 $self->{state} = DATA_STATE;
2166 ## reconsume
2167
2168 !!!emit ($self->{current_token}); # comment
2169
2170 redo A;
2171 } else {
2172 !!!cp (150);
2173 $self->{current_token}->{data} .= '-' . chr ($self->{next_char}); # comment
2174 $self->{state} = COMMENT_STATE;
2175 !!!next-input-character;
2176 redo A;
2177 }
2178 } elsif ($self->{state} == COMMENT_END_STATE) {
2179 if ($self->{next_char} == 0x003E) { # >
2180 !!!cp (151);
2181 $self->{state} = DATA_STATE;
2182 !!!next-input-character;
2183
2184 !!!emit ($self->{current_token}); # comment
2185
2186 redo A;
2187 } elsif ($self->{next_char} == 0x002D) { # -
2188 !!!cp (152);
2189 !!!parse-error (type => 'dash in comment',
2190 line => $self->{line_prev},
2191 column => $self->{column_prev});
2192 $self->{current_token}->{data} .= '-'; # comment
2193 ## Stay in the state
2194 !!!next-input-character;
2195 redo A;
2196 } elsif ($self->{next_char} == -1) {
2197 !!!cp (153);
2198 !!!parse-error (type => 'unclosed comment');
2199 $self->{state} = DATA_STATE;
2200 ## reconsume
2201
2202 !!!emit ($self->{current_token}); # comment
2203
2204 redo A;
2205 } else {
2206 !!!cp (154);
2207 !!!parse-error (type => 'dash in comment',
2208 line => $self->{line_prev},
2209 column => $self->{column_prev});
2210 $self->{current_token}->{data} .= '--' . chr ($self->{next_char}); # comment
2211 $self->{state} = COMMENT_STATE;
2212 !!!next-input-character;
2213 redo A;
2214 }
2215 } elsif ($self->{state} == DOCTYPE_STATE) {
2216 if ($self->{next_char} == 0x0009 or # HT
2217 $self->{next_char} == 0x000A or # LF
2218 $self->{next_char} == 0x000B or # VT
2219 $self->{next_char} == 0x000C or # FF
2220 $self->{next_char} == 0x0020) { # SP
2221 !!!cp (155);
2222 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2223 !!!next-input-character;
2224 redo A;
2225 } else {
2226 !!!cp (156);
2227 !!!parse-error (type => 'no space before DOCTYPE name');
2228 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2229 ## reconsume
2230 redo A;
2231 }
2232 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2233 if ($self->{next_char} == 0x0009 or # HT
2234 $self->{next_char} == 0x000A or # LF
2235 $self->{next_char} == 0x000B or # VT
2236 $self->{next_char} == 0x000C or # FF
2237 $self->{next_char} == 0x0020) { # SP
2238 !!!cp (157);
2239 ## Stay in the state
2240 !!!next-input-character;
2241 redo A;
2242 } elsif ($self->{next_char} == 0x003E) { # >
2243 !!!cp (158);
2244 !!!parse-error (type => 'no DOCTYPE name');
2245 $self->{state} = DATA_STATE;
2246 !!!next-input-character;
2247
2248 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2249
2250 redo A;
2251 } elsif ($self->{next_char} == -1) {
2252 !!!cp (159);
2253 !!!parse-error (type => 'no DOCTYPE name');
2254 $self->{state} = DATA_STATE;
2255 ## reconsume
2256
2257 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2258
2259 redo A;
2260 } else {
2261 !!!cp (160);
2262 $self->{current_token}->{name} = chr $self->{next_char};
2263 delete $self->{current_token}->{quirks};
2264 ## ISSUE: "Set the token's name name to the" in the spec
2265 $self->{state} = DOCTYPE_NAME_STATE;
2266 !!!next-input-character;
2267 redo A;
2268 }
2269 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2270 ## ISSUE: Redundant "First," in the spec.
2271 if ($self->{next_char} == 0x0009 or # HT
2272 $self->{next_char} == 0x000A or # LF
2273 $self->{next_char} == 0x000B or # VT
2274 $self->{next_char} == 0x000C or # FF
2275 $self->{next_char} == 0x0020) { # SP
2276 !!!cp (161);
2277 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2278 !!!next-input-character;
2279 redo A;
2280 } elsif ($self->{next_char} == 0x003E) { # >
2281 !!!cp (162);
2282 $self->{state} = DATA_STATE;
2283 !!!next-input-character;
2284
2285 !!!emit ($self->{current_token}); # DOCTYPE
2286
2287 redo A;
2288 } elsif ($self->{next_char} == -1) {
2289 !!!cp (163);
2290 !!!parse-error (type => 'unclosed DOCTYPE');
2291 $self->{state} = DATA_STATE;
2292 ## reconsume
2293
2294 $self->{current_token}->{quirks} = 1;
2295 !!!emit ($self->{current_token}); # DOCTYPE
2296
2297 redo A;
2298 } else {
2299 !!!cp (164);
2300 $self->{current_token}->{name}
2301 .= chr ($self->{next_char}); # DOCTYPE
2302 ## Stay in the state
2303 !!!next-input-character;
2304 redo A;
2305 }
2306 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2307 if ($self->{next_char} == 0x0009 or # HT
2308 $self->{next_char} == 0x000A or # LF
2309 $self->{next_char} == 0x000B or # VT
2310 $self->{next_char} == 0x000C or # FF
2311 $self->{next_char} == 0x0020) { # SP
2312 !!!cp (165);
2313 ## Stay in the state
2314 !!!next-input-character;
2315 redo A;
2316 } elsif ($self->{next_char} == 0x003E) { # >
2317 !!!cp (166);
2318 $self->{state} = DATA_STATE;
2319 !!!next-input-character;
2320
2321 !!!emit ($self->{current_token}); # DOCTYPE
2322
2323 redo A;
2324 } elsif ($self->{next_char} == -1) {
2325 !!!cp (167);
2326 !!!parse-error (type => 'unclosed DOCTYPE');
2327 $self->{state} = DATA_STATE;
2328 ## reconsume
2329
2330 $self->{current_token}->{quirks} = 1;
2331 !!!emit ($self->{current_token}); # DOCTYPE
2332
2333 redo A;
2334 } elsif ($self->{next_char} == 0x0050 or # P
2335 $self->{next_char} == 0x0070) { # p
2336 !!!next-input-character;
2337 if ($self->{next_char} == 0x0055 or # U
2338 $self->{next_char} == 0x0075) { # u
2339 !!!next-input-character;
2340 if ($self->{next_char} == 0x0042 or # B
2341 $self->{next_char} == 0x0062) { # b
2342 !!!next-input-character;
2343 if ($self->{next_char} == 0x004C or # L
2344 $self->{next_char} == 0x006C) { # l
2345 !!!next-input-character;
2346 if ($self->{next_char} == 0x0049 or # I
2347 $self->{next_char} == 0x0069) { # i
2348 !!!next-input-character;
2349 if ($self->{next_char} == 0x0043 or # C
2350 $self->{next_char} == 0x0063) { # c
2351 !!!cp (168);
2352 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2353 !!!next-input-character;
2354 redo A;
2355 } else {
2356 !!!cp (169);
2357 }
2358 } else {
2359 !!!cp (170);
2360 }
2361 } else {
2362 !!!cp (171);
2363 }
2364 } else {
2365 !!!cp (172);
2366 }
2367 } else {
2368 !!!cp (173);
2369 }
2370
2371 #
2372 } elsif ($self->{next_char} == 0x0053 or # S
2373 $self->{next_char} == 0x0073) { # s
2374 !!!next-input-character;
2375 if ($self->{next_char} == 0x0059 or # Y
2376 $self->{next_char} == 0x0079) { # y
2377 !!!next-input-character;
2378 if ($self->{next_char} == 0x0053 or # S
2379 $self->{next_char} == 0x0073) { # s
2380 !!!next-input-character;
2381 if ($self->{next_char} == 0x0054 or # T
2382 $self->{next_char} == 0x0074) { # t
2383 !!!next-input-character;
2384 if ($self->{next_char} == 0x0045 or # E
2385 $self->{next_char} == 0x0065) { # e
2386 !!!next-input-character;
2387 if ($self->{next_char} == 0x004D or # M
2388 $self->{next_char} == 0x006D) { # m
2389 !!!cp (174);
2390 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2391 !!!next-input-character;
2392 redo A;
2393 } else {
2394 !!!cp (175);
2395 }
2396 } else {
2397 !!!cp (176);
2398 }
2399 } else {
2400 !!!cp (177);
2401 }
2402 } else {
2403 !!!cp (178);
2404 }
2405 } else {
2406 !!!cp (179);
2407 }
2408
2409 #
2410 } else {
2411 !!!cp (180);
2412 !!!next-input-character;
2413 #
2414 }
2415
2416 !!!parse-error (type => 'string after DOCTYPE name');
2417 $self->{current_token}->{quirks} = 1;
2418
2419 $self->{state} = BOGUS_DOCTYPE_STATE;
2420 # next-input-character is already done
2421 redo A;
2422 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2423 if ({
2424 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2425 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2426 }->{$self->{next_char}}) {
2427 !!!cp (181);
2428 ## Stay in the state
2429 !!!next-input-character;
2430 redo A;
2431 } elsif ($self->{next_char} eq 0x0022) { # "
2432 !!!cp (182);
2433 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2434 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2435 !!!next-input-character;
2436 redo A;
2437 } elsif ($self->{next_char} eq 0x0027) { # '
2438 !!!cp (183);
2439 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2440 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2441 !!!next-input-character;
2442 redo A;
2443 } elsif ($self->{next_char} eq 0x003E) { # >
2444 !!!cp (184);
2445 !!!parse-error (type => 'no PUBLIC literal');
2446
2447 $self->{state} = DATA_STATE;
2448 !!!next-input-character;
2449
2450 $self->{current_token}->{quirks} = 1;
2451 !!!emit ($self->{current_token}); # DOCTYPE
2452
2453 redo A;
2454 } elsif ($self->{next_char} == -1) {
2455 !!!cp (185);
2456 !!!parse-error (type => 'unclosed DOCTYPE');
2457
2458 $self->{state} = DATA_STATE;
2459 ## reconsume
2460
2461 $self->{current_token}->{quirks} = 1;
2462 !!!emit ($self->{current_token}); # DOCTYPE
2463
2464 redo A;
2465 } else {
2466 !!!cp (186);
2467 !!!parse-error (type => 'string after PUBLIC');
2468 $self->{current_token}->{quirks} = 1;
2469
2470 $self->{state} = BOGUS_DOCTYPE_STATE;
2471 !!!next-input-character;
2472 redo A;
2473 }
2474 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2475 if ($self->{next_char} == 0x0022) { # "
2476 !!!cp (187);
2477 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2478 !!!next-input-character;
2479 redo A;
2480 } elsif ($self->{next_char} == 0x003E) { # >
2481 !!!cp (188);
2482 !!!parse-error (type => 'unclosed PUBLIC literal');
2483
2484 $self->{state} = DATA_STATE;
2485 !!!next-input-character;
2486
2487 $self->{current_token}->{quirks} = 1;
2488 !!!emit ($self->{current_token}); # DOCTYPE
2489
2490 redo A;
2491 } elsif ($self->{next_char} == -1) {
2492 !!!cp (189);
2493 !!!parse-error (type => 'unclosed PUBLIC literal');
2494
2495 $self->{state} = DATA_STATE;
2496 ## reconsume
2497
2498 $self->{current_token}->{quirks} = 1;
2499 !!!emit ($self->{current_token}); # DOCTYPE
2500
2501 redo A;
2502 } else {
2503 !!!cp (190);
2504 $self->{current_token}->{public_identifier} # DOCTYPE
2505 .= chr $self->{next_char};
2506 ## Stay in the state
2507 !!!next-input-character;
2508 redo A;
2509 }
2510 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2511 if ($self->{next_char} == 0x0027) { # '
2512 !!!cp (191);
2513 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2514 !!!next-input-character;
2515 redo A;
2516 } elsif ($self->{next_char} == 0x003E) { # >
2517 !!!cp (192);
2518 !!!parse-error (type => 'unclosed PUBLIC literal');
2519
2520 $self->{state} = DATA_STATE;
2521 !!!next-input-character;
2522
2523 $self->{current_token}->{quirks} = 1;
2524 !!!emit ($self->{current_token}); # DOCTYPE
2525
2526 redo A;
2527 } elsif ($self->{next_char} == -1) {
2528 !!!cp (193);
2529 !!!parse-error (type => 'unclosed PUBLIC literal');
2530
2531 $self->{state} = DATA_STATE;
2532 ## reconsume
2533
2534 $self->{current_token}->{quirks} = 1;
2535 !!!emit ($self->{current_token}); # DOCTYPE
2536
2537 redo A;
2538 } else {
2539 !!!cp (194);
2540 $self->{current_token}->{public_identifier} # DOCTYPE
2541 .= chr $self->{next_char};
2542 ## Stay in the state
2543 !!!next-input-character;
2544 redo A;
2545 }
2546 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2547 if ({
2548 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2549 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2550 }->{$self->{next_char}}) {
2551 !!!cp (195);
2552 ## Stay in the state
2553 !!!next-input-character;
2554 redo A;
2555 } elsif ($self->{next_char} == 0x0022) { # "
2556 !!!cp (196);
2557 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2558 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2559 !!!next-input-character;
2560 redo A;
2561 } elsif ($self->{next_char} == 0x0027) { # '
2562 !!!cp (197);
2563 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2564 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2565 !!!next-input-character;
2566 redo A;
2567 } elsif ($self->{next_char} == 0x003E) { # >
2568 !!!cp (198);
2569 $self->{state} = DATA_STATE;
2570 !!!next-input-character;
2571
2572 !!!emit ($self->{current_token}); # DOCTYPE
2573
2574 redo A;
2575 } elsif ($self->{next_char} == -1) {
2576 !!!cp (199);
2577 !!!parse-error (type => 'unclosed DOCTYPE');
2578
2579 $self->{state} = DATA_STATE;
2580 ## reconsume
2581
2582 $self->{current_token}->{quirks} = 1;
2583 !!!emit ($self->{current_token}); # DOCTYPE
2584
2585 redo A;
2586 } else {
2587 !!!cp (200);
2588 !!!parse-error (type => 'string after PUBLIC literal');
2589 $self->{current_token}->{quirks} = 1;
2590
2591 $self->{state} = BOGUS_DOCTYPE_STATE;
2592 !!!next-input-character;
2593 redo A;
2594 }
2595 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2596 if ({
2597 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2598 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2599 }->{$self->{next_char}}) {
2600 !!!cp (201);
2601 ## Stay in the state
2602 !!!next-input-character;
2603 redo A;
2604 } elsif ($self->{next_char} == 0x0022) { # "
2605 !!!cp (202);
2606 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2607 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2608 !!!next-input-character;
2609 redo A;
2610 } elsif ($self->{next_char} == 0x0027) { # '
2611 !!!cp (203);
2612 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2613 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2614 !!!next-input-character;
2615 redo A;
2616 } elsif ($self->{next_char} == 0x003E) { # >
2617 !!!cp (204);
2618 !!!parse-error (type => 'no SYSTEM literal');
2619 $self->{state} = DATA_STATE;
2620 !!!next-input-character;
2621
2622 $self->{current_token}->{quirks} = 1;
2623 !!!emit ($self->{current_token}); # DOCTYPE
2624
2625 redo A;
2626 } elsif ($self->{next_char} == -1) {
2627 !!!cp (205);
2628 !!!parse-error (type => 'unclosed DOCTYPE');
2629
2630 $self->{state} = DATA_STATE;
2631 ## reconsume
2632
2633 $self->{current_token}->{quirks} = 1;
2634 !!!emit ($self->{current_token}); # DOCTYPE
2635
2636 redo A;
2637 } else {
2638 !!!cp (206);
2639 !!!parse-error (type => 'string after SYSTEM');
2640 $self->{current_token}->{quirks} = 1;
2641
2642 $self->{state} = BOGUS_DOCTYPE_STATE;
2643 !!!next-input-character;
2644 redo A;
2645 }
2646 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2647 if ($self->{next_char} == 0x0022) { # "
2648 !!!cp (207);
2649 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2650 !!!next-input-character;
2651 redo A;
2652 } elsif ($self->{next_char} == 0x003E) { # >
2653 !!!cp (208);
2654 !!!parse-error (type => 'unclosed PUBLIC literal');
2655
2656 $self->{state} = DATA_STATE;
2657 !!!next-input-character;
2658
2659 $self->{current_token}->{quirks} = 1;
2660 !!!emit ($self->{current_token}); # DOCTYPE
2661
2662 redo A;
2663 } elsif ($self->{next_char} == -1) {
2664 !!!cp (209);
2665 !!!parse-error (type => 'unclosed SYSTEM literal');
2666
2667 $self->{state} = DATA_STATE;
2668 ## reconsume
2669
2670 $self->{current_token}->{quirks} = 1;
2671 !!!emit ($self->{current_token}); # DOCTYPE
2672
2673 redo A;
2674 } else {
2675 !!!cp (210);
2676 $self->{current_token}->{system_identifier} # DOCTYPE
2677 .= chr $self->{next_char};
2678 ## Stay in the state
2679 !!!next-input-character;
2680 redo A;
2681 }
2682 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2683 if ($self->{next_char} == 0x0027) { # '
2684 !!!cp (211);
2685 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2686 !!!next-input-character;
2687 redo A;
2688 } elsif ($self->{next_char} == 0x003E) { # >
2689 !!!cp (212);
2690 !!!parse-error (type => 'unclosed PUBLIC literal');
2691
2692 $self->{state} = DATA_STATE;
2693 !!!next-input-character;
2694
2695 $self->{current_token}->{quirks} = 1;
2696 !!!emit ($self->{current_token}); # DOCTYPE
2697
2698 redo A;
2699 } elsif ($self->{next_char} == -1) {
2700 !!!cp (213);
2701 !!!parse-error (type => 'unclosed SYSTEM literal');
2702
2703 $self->{state} = DATA_STATE;
2704 ## reconsume
2705
2706 $self->{current_token}->{quirks} = 1;
2707 !!!emit ($self->{current_token}); # DOCTYPE
2708
2709 redo A;
2710 } else {
2711 !!!cp (214);
2712 $self->{current_token}->{system_identifier} # DOCTYPE
2713 .= chr $self->{next_char};
2714 ## Stay in the state
2715 !!!next-input-character;
2716 redo A;
2717 }
2718 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2719 if ({
2720 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2721 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2722 }->{$self->{next_char}}) {
2723 !!!cp (215);
2724 ## Stay in the state
2725 !!!next-input-character;
2726 redo A;
2727 } elsif ($self->{next_char} == 0x003E) { # >
2728 !!!cp (216);
2729 $self->{state} = DATA_STATE;
2730 !!!next-input-character;
2731
2732 !!!emit ($self->{current_token}); # DOCTYPE
2733
2734 redo A;
2735 } elsif ($self->{next_char} == -1) {
2736 !!!cp (217);
2737
2738 $self->{state} = DATA_STATE;
2739 ## reconsume
2740
2741 $self->{current_token}->{quirks} = 1;
2742 !!!emit ($self->{current_token}); # DOCTYPE
2743
2744 redo A;
2745 } else {
2746 !!!cp (218);
2747 !!!parse-error (type => 'string after SYSTEM literal');
2748 #$self->{current_token}->{quirks} = 1;
2749
2750 $self->{state} = BOGUS_DOCTYPE_STATE;
2751 !!!next-input-character;
2752 redo A;
2753 }
2754 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2755 if ($self->{next_char} == 0x003E) { # >
2756 !!!cp (219);
2757 $self->{state} = DATA_STATE;
2758 !!!next-input-character;
2759
2760 !!!emit ($self->{current_token}); # DOCTYPE
2761
2762 redo A;
2763 } elsif ($self->{next_char} == -1) {
2764 !!!cp (220);
2765 !!!parse-error (type => 'unclosed DOCTYPE');
2766 $self->{state} = DATA_STATE;
2767 ## reconsume
2768
2769 !!!emit ($self->{current_token}); # DOCTYPE
2770
2771 redo A;
2772 } else {
2773 !!!cp (221);
2774 ## Stay in the state
2775 !!!next-input-character;
2776 redo A;
2777 }
2778 } elsif ($self->{state} == CDATA_BLOCK_STATE) {
2779 my $s = '';
2780
2781 my ($l, $c) = ($self->{line}, $self->{column});
2782
2783 CS: while ($self->{next_char} != -1) {
2784 if ($self->{next_char} == 0x005D) { # ]
2785 !!!next-input-character;
2786 if ($self->{next_char} == 0x005D) { # ]
2787 !!!next-input-character;
2788 MDC: {
2789 if ($self->{next_char} == 0x003E) { # >
2790 !!!cp (221.1);
2791 !!!next-input-character;
2792 last CS;
2793 } elsif ($self->{next_char} == 0x005D) { # ]
2794 !!!cp (221.2);
2795 $s .= ']';
2796 !!!next-input-character;
2797 redo MDC;
2798 } else {
2799 !!!cp (221.3);
2800 $s .= ']]';
2801 #
2802 }
2803 } # MDC
2804 } else {
2805 !!!cp (221.4);
2806 $s .= ']';
2807 #
2808 }
2809 } else {
2810 !!!cp (221.5);
2811 #
2812 }
2813 $s .= chr $self->{next_char};
2814 !!!next-input-character;
2815 } # CS
2816
2817 $self->{state} = DATA_STATE;
2818 ## next-input-character done or EOF, which is reconsumed.
2819
2820 if (length $s) {
2821 !!!cp (221.6);
2822 !!!emit ({type => CHARACTER_TOKEN, data => $s,
2823 line => $l, column => $c});
2824 } else {
2825 !!!cp (221.7);
2826 }
2827
2828 redo A;
2829
2830 ## ISSUE: "text tokens" in spec.
2831 ## TODO: Streaming support
2832 } else {
2833 die "$0: $self->{state}: Unknown state";
2834 }
2835 } # A
2836
2837 die "$0: _get_next_token: unexpected case";
2838 } # _get_next_token
2839
## Try to read a character reference starting at the current next
## input character (presumably the character right after an "&" the
## caller has already consumed -- the fallback tokens below prepend "&"
## themselves).
##
## Arguments:
##   $in_attr    - true value changes how a name that matched a known
##                 entity without ";" and is then followed by more name
##                 characters is returned (see the "no refc" case).
##   $additional - one more character code after which no reference is
##                 attempted and no error is reported.
##
## Returns a hash reference describing a CHARACTER_TOKEN (|data|,
## |line|, |column| and, when a reference was actually resolved,
## |has_reference|), or undef when nothing is to be emitted by this
## method.
sub _tokenize_attempt_to_consume_an_entity ($$$) {
  my ($self, $in_attr, $additional) = @_;

  ## Position stamped on every token and parse error produced below;
  ## captured before any further character is consumed.
  my ($ref_line, $ref_column) = ($self->{line_prev}, $self->{column_prev});

  ## HT, LF, VT, FF, SP, "<", "&", EOF (-1) and the caller-supplied
  ## additional character.  CR (0x000D) is not in the list.
  my %no_reference = map { $_ => 1 } (
    0x0009, 0x000A, 0x000B, 0x000C, 0x0020, 0x003C, 0x0026, -1,
    $additional,
  );
  if ($no_reference{$self->{next_char}}) {
    !!!cp (1001);
    ## Don't consume
    ## No error
    return undef;
  }

  ##
  ## Numeric character references
  ##
  if ($self->{next_char} == 0x0023) { # #
    !!!next-input-character;

    if ($self->{next_char} == 0x0058 or # X
        $self->{next_char} == 0x0078) { # x
      my $code_point;

      HEX: while (1) {
        ## The character being stepped over: "x"/"X" on the first
        ## pass, a hexadecimal digit on later passes.  Only the first
        ## pass can reach the push-back below.
        my $prev_char = $self->{next_char};
        !!!next-input-character;

        if (0x0030 <= $self->{next_char} and
            $self->{next_char} <= 0x0039) { # 0..9
          !!!cp (1002);
          $code_point = ($code_point || 0) * 0x10
              + ($self->{next_char} - 0x0030);
          next HEX;
        }

        if (0x0061 <= $self->{next_char} and
            $self->{next_char} <= 0x0066) { # a..f
          !!!cp (1003);
          $code_point = ($code_point || 0) * 0x10
              + ($self->{next_char} - 0x0060 + 9);
          next HEX;
        }

        if (0x0041 <= $self->{next_char} and
            $self->{next_char} <= 0x0046) { # A..F
          !!!cp (1004);
          $code_point = ($code_point || 0) * 0x10
              + ($self->{next_char} - 0x0040 + 9);
          next HEX;
        }

        unless (defined $code_point) { # no hexadecimal digit
          !!!cp (1005);
          !!!parse-error (type => 'bare hcro', line => $ref_line, column => $ref_column);
          !!!back-next-input-character ($prev_char, $self->{next_char});
          $self->{next_char} = 0x0023; # #
          return undef;
        }

        if ($self->{next_char} == 0x003B) { # ;
          !!!cp (1006);
          !!!next-input-character;
        } else {
          !!!cp (1007);
          !!!parse-error (type => 'no refc', line => $ref_line, column => $ref_column);
        }

        last HEX;
      } # HEX

      ## Replace code points that must not be emitted as is.
      if ($code_point == 0 or
          (0xD800 <= $code_point and $code_point <= 0xDFFF)) {
        !!!cp (1008);
        !!!parse-error (type => (sprintf 'invalid character reference:U+%04X', $code_point), line => $ref_line, column => $ref_column);
        $code_point = 0xFFFD;
      } elsif ($code_point > 0x10FFFF) {
        !!!cp (1009);
        !!!parse-error (type => (sprintf 'invalid character reference:U-%08X', $code_point), line => $ref_line, column => $ref_column);
        $code_point = 0xFFFD;
      } elsif ($code_point == 0x000D) {
        !!!cp (1010);
        !!!parse-error (type => 'CR character reference', line => $ref_line, column => $ref_column);
        $code_point = 0x000A;
      } elsif (0x80 <= $code_point and $code_point <= 0x9F) {
        !!!cp (1011);
        !!!parse-error (type => (sprintf 'C1 character reference:U+%04X', $code_point), line => $ref_line, column => $ref_column);
        $code_point = $c1_entity_char->{$code_point};
      }

      my $hex_char = chr ($code_point);
      return {
        type => CHARACTER_TOKEN,
        data => $hex_char,
        has_reference => 1,
        line => $ref_line,
        column => $ref_column,
      };
    }

    unless (0x0030 <= $self->{next_char} and
            $self->{next_char} <= 0x0039) { # not 0..9
      !!!cp (1019);
      !!!parse-error (type => 'bare nero', line => $ref_line, column => $ref_column);
      !!!back-next-input-character ($self->{next_char});
      $self->{next_char} = 0x0023; # #
      return undef;
    }

    ## Decimal digits
    my $code_point = $self->{next_char} - 0x0030;
    !!!next-input-character;

    while (0x0030 <= $self->{next_char} and
           $self->{next_char} <= 0x0039) { # 0..9
      !!!cp (1012);
      $code_point = $code_point * 10 + ($self->{next_char} - 0x0030);

      !!!next-input-character;
    }

    if ($self->{next_char} == 0x003B) { # ;
      !!!cp (1013);
      !!!next-input-character;
    } else {
      !!!cp (1014);
      !!!parse-error (type => 'no refc', line => $ref_line, column => $ref_column);
    }

    ## Replace code points that must not be emitted as is.
    if ($code_point == 0 or
        (0xD800 <= $code_point and $code_point <= 0xDFFF)) {
      !!!cp (1015);
      !!!parse-error (type => (sprintf 'invalid character reference:U+%04X', $code_point), line => $ref_line, column => $ref_column);
      $code_point = 0xFFFD;
    } elsif ($code_point > 0x10FFFF) {
      !!!cp (1016);
      !!!parse-error (type => (sprintf 'invalid character reference:U-%08X', $code_point), line => $ref_line, column => $ref_column);
      $code_point = 0xFFFD;
    } elsif ($code_point == 0x000D) {
      !!!cp (1017);
      !!!parse-error (type => 'CR character reference', line => $ref_line, column => $ref_column);
      $code_point = 0x000A;
    } elsif (0x80 <= $code_point and $code_point <= 0x9F) {
      !!!cp (1018);
      !!!parse-error (type => (sprintf 'C1 character reference:U+%04X', $code_point), line => $ref_line, column => $ref_column);
      $code_point = $c1_entity_char->{$code_point};
    }

    my $dec_char = chr ($code_point);
    return {
      type => CHARACTER_TOKEN,
      data => $dec_char,
      has_reference => 1,
      line => $ref_line,
      column => $ref_column,
    };
  }

  ##
  ## Anything that does not start with an ASCII letter is not a
  ## reference at all.
  ##
  unless ((0x0041 <= $self->{next_char} and # A
           $self->{next_char} <= 0x005A) or # Z
          (0x0061 <= $self->{next_char} and # a
           $self->{next_char} <= 0x007A)) { # z
    !!!cp (1027);
    ## no characters are consumed
    !!!parse-error (type => 'bare ero', line => $ref_line, column => $ref_column);
    return undef;
  }

  ##
  ## Named character references
  ##
  my $name = chr $self->{next_char};
  !!!next-input-character;

  ## $text is what will be emitted: the replacement of the most recent
  ## matching name plus anything read after it, or simply the
  ## characters read if nothing matched.
  my $text = $name;

  ## $state: 0 = no name matched so far, 1 = matched a name ending with
  ## ";", -1 = matched a name without ";", below -1 = matched a name
  ## without ";" and then read further non-matching characters.
  my $state = 0;

  require Whatpm::_NamedEntityList;
  our $EntityChar;

  NAME: while (length ($name) < 30 and
               ## NOTE: Some number greater than the maximum length of
               ## entity name
               ((0x0041 <= $self->{next_char} and # A
                 $self->{next_char} <= 0x005A) or # Z
                (0x0061 <= $self->{next_char} and # a
                 $self->{next_char} <= 0x007A) or # z
                (0x0030 <= $self->{next_char} and # 0
                 $self->{next_char} <= 0x0039) or # 9
                $self->{next_char} == 0x003B)) { # ;
    $name .= chr $self->{next_char};
    my $replacement = $EntityChar->{$name};

    unless (defined $replacement) {
      !!!cp (1022);
      $text .= chr $self->{next_char};
      $state *= 2;
      !!!next-input-character;
      next NAME;
    }

    if ($self->{next_char} == 0x003B) { # ;
      !!!cp (1020);
      $text = $replacement;
      $state = 1;
      !!!next-input-character;
      last NAME;
    }

    !!!cp (1021);
    $text = $replacement;
    $state = -1;
    !!!next-input-character;
  } # NAME

  if ($state > 0) {
    !!!cp (1023);
    return {
      type => CHARACTER_TOKEN,
      data => $text,
      has_reference => 1,
      line => $ref_line,
      column => $ref_column,
    };
  }

  if ($state == 0) {
    !!!cp (1026);
    !!!parse-error (type => 'bare ero', line => $ref_line, column => $ref_column);
    ## NOTE: "No characters are consumed" in the spec.
    return {
      type => CHARACTER_TOKEN,
      data => '&'.$text,
      line => $ref_line,
      column => $ref_column,
    };
  }

  ## $state < 0: a known name was seen, but never with a ";".
  !!!parse-error (type => 'no refc', line => $ref_line, column => $ref_column);
  if ($in_attr and $state < -1) {
    !!!cp (1024);
    ## The characters read are returned literally, without
    ## |has_reference|.
    return {
      type => CHARACTER_TOKEN,
      data => '&'.$name,
      line => $ref_line,
      column => $ref_column,
    };
  }

  !!!cp (1025);
  return {
    type => CHARACTER_TOKEN,
    data => $text,
    has_reference => 1,
    line => $ref_line,
    column => $ref_column,
  };
} # _tokenize_attempt_to_consume_an_entity
3046
sub _initialize_tree_constructor ($) {
  my ($self) = @_;

  ## Prepares |$self->{document}| for the tree construction stage.
  ## NOTE: |$self->{document}| MUST be set before this method is called.
  my $doc = $self->{document};

  ## Strict error checking is switched off here;
  ## |_terminate_tree_constructor| switches it on again.
  $doc->strict_error_checking (0);

  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on

  ## Flag the document as an HTML document. # MUST
  $doc->manakai_is_html (1);
} # _initialize_tree_constructor
3055
sub _terminate_tree_constructor ($) {
  my ($self) = @_;

  ## Counterpart of |_initialize_tree_constructor|: re-enables strict
  ## error checking on the document once tree construction is over.
  my $doc = $self->{document};
  $doc->strict_error_checking (1);

  ## TODO: Turn mutation events on
} # _terminate_tree_constructor
3061
3062 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
3063
3064 { # tree construction stage
3065 my $token;
3066
sub _construct_tree ($) {
  my $self = shift;

  ## Entry point of the tree construction stage: fetches the first
  ## token, resets the per-parse state and then runs the insertion
  ## modes in order.

  ## NOTE: When an interactive UA renders |$self->{document}| to the
  ## user, or when it begins accepting user input, is not defined.

  ## NOTE: Appending a character means collecting it and all subsequent
  ## consecutive characters and inserting one Text node whose data is
  ## the concatenation of all those characters. # MUST

  !!!next-token;

  ## Reset the state shared by the insertion modes.
  $self->{form_element} = undef;
  $self->{head_element} = undef;
  $self->{open_elements} = [];
  $self->{inner_html_node} = undef;

  ## NOTE: The "initial" insertion mode.
  $self->_tree_construction_initial; # MUST

  ## NOTE: The "before html" insertion mode.
  $self->_tree_construction_root_element;
  $self->{insertion_mode} = BEFORE_HEAD_IM;

  ## NOTE: The "before head" insertion mode and so on.
  $self->_tree_construction_main;
} # _construct_tree
3095
sub _tree_construction_initial ($) {
  my $self = shift;

  ## NOTE: "initial" insertion mode.
  ##
  ## Processes the shared |$token| until a token that ends the
  ## "initial" insertion mode has been seen.  For a DOCTYPE token a
  ## document type node is appended to |$self->{document}| and the
  ## compatibility mode is derived from the name, public identifier
  ## and system identifier.  For any other token except whitespace and
  ## comments a "no DOCTYPE" parse error is raised and the document is
  ## put into quirks mode.  Returns nothing; the caller continues with
  ## the "before html" insertion mode.

  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/;
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{public_identifier} or
          defined $token->{system_identifier}) {
        !!!cp ('t1');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif ($doctype_name ne 'HTML') {
        !!!cp ('t2');
        ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
        !!!parse-error (type => 'not HTML5', token => $token);
      } else {
        !!!cp ('t3');
      }

      my $doctype = $self->{document}->create_document_type_definition
        ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      ## NOTE: Default value for both |public_id| and |system_id| attributes
      ## are empty strings, so that we don't set any value in missing cases.
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if ($token->{quirks} or $doctype_name ne 'HTML') {
        !!!cp ('t4');
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## The public identifier is compared in upper case, so every
        ## literal below is written in upper case, too.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        my $prefix = [
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//",
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//IETF//DTD HTML 2.0 LEVEL 1//",
          "-//IETF//DTD HTML 2.0 LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT//",
          "-//IETF//DTD HTML 2.0//",
          "-//IETF//DTD HTML 2.1E//",
          "-//IETF//DTD HTML 3.0//",
          "-//IETF//DTD HTML 3.2 FINAL//",
          "-//IETF//DTD HTML 3.2//",
          "-//IETF//DTD HTML 3//",
          "-//IETF//DTD HTML LEVEL 0//",
          "-//IETF//DTD HTML LEVEL 1//",
          "-//IETF//DTD HTML LEVEL 2//",
          "-//IETF//DTD HTML LEVEL 3//",
          "-//IETF//DTD HTML STRICT LEVEL 0//",
          "-//IETF//DTD HTML STRICT LEVEL 1//",
          "-//IETF//DTD HTML STRICT LEVEL 2//",
          "-//IETF//DTD HTML STRICT LEVEL 3//",
          "-//IETF//DTD HTML STRICT//",
          "-//IETF//DTD HTML//",
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//",
          "-//NETSCAPE COMM. CORP.//DTD HTML//",
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//",
          "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//",
          "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//",
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//",
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//",
          "-//W3C//DTD HTML 3 1995-03-24//",
          "-//W3C//DTD HTML 3.2 DRAFT//",
          "-//W3C//DTD HTML 3.2 FINAL//",
          "-//W3C//DTD HTML 3.2//",
          "-//W3C//DTD HTML 3.2S DRAFT//",
          "-//W3C//DTD HTML 4.0 FRAMESET//",
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//",
          "-//W3C//DTD HTML EXPERIMENTAL 19960712//",
          "-//W3C//DTD HTML EXPERIMENTAL 970421//",
          "-//W3C//DTD W3 HTML//",
          "-//W3O//DTD W3 HTML 3.0//",
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//",
          "-//WEBTECHS//DTD MOZILLA HTML//",
        ]; # $prefix
        ## Does the public identifier start with one of the prefixes?
        ## NOTE: The comparison has to be made against |$pubid|, not
        ## against the |$prefix| array reference itself.
        my $match;
        for my $known_prefix (@$prefix) {
          if (substr ($pubid, 0, length $known_prefix) eq $known_prefix) {
            $match = 1;
            last;
          }
        }
        if ($match or
            $pubid eq "-//W3O//DTD W3 HTML STRICT 3.0//EN//" or
            $pubid eq "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" or
            $pubid eq "HTML") {
          !!!cp ('t5');
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid =~ m[^-//W3C//DTD HTML 4\.01 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD HTML 4\.01 TRANSITIONAL//]) {
          ## NOTE: HTML 4.01 Frameset/Transitional is quirks if the
          ## system identifier is missing and limited quirks if a
          ## system identifier is given.
          unless (defined $token->{system_identifier}) {
            !!!cp ('t6');
            $self->{document}->manakai_compat_mode ('quirks');
          } else {
            !!!cp ('t7');
            $self->{document}->manakai_compat_mode ('limited quirks');
          }
        } elsif ($pubid =~ m[^-//W3C//DTD XHTML 1\.0 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD XHTML 1\.0 TRANSITIONAL//]) {
          !!!cp ('t8');
          $self->{document}->manakai_compat_mode ('limited quirks');
        } else {
          !!!cp ('t9');
        }
      } else {
        !!!cp ('t10');
      }
      if (defined $token->{system_identifier}) {
        ## The system identifier is compared in lower case.
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          ## NOTE: Ensure that |PUBLIC "(limited quirks)" "(quirks)"| is
          ## marked as quirks.
          $self->{document}->manakai_compat_mode ('quirks');
          !!!cp ('t11');
        } else {
          !!!cp ('t12');
        }
      } else {
        !!!cp ('t13');
      }

      ## Go to the "before html" insertion mode.
      !!!next-token;
      return;
    } elsif ({
              START_TAG_TOKEN, 1,
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t14');
      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      !!!ack-later;
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      ## Leading whitespace is dropped from the token.
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          !!!cp ('t15');
          ## Stay in the insertion mode.
          !!!next-token;
          redo INITIAL;
        } else {
          !!!cp ('t16');
        }
      } else {
        !!!cp ('t17');
      }

      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t18');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the insertion mode.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL

  die "$0: _tree_construction_initial: This should be never reached";
} # _tree_construction_initial
3295
sub _tree_construction_root_element ($) {
  my $self = shift;

  ## NOTE: "before html" insertion mode.
  ##
  ## Processes the shared |$token| until the root |html| element has
  ## been created, appended to |$self->{document}| and pushed onto
  ## |$self->{open_elements}|.  DOCTYPE tokens are ignored with a parse
  ## error, comments are appended to the document and leading
  ## whitespace is dropped.  The |$self->{application_cache_selection}|
  ## callback is invoked exactly once, with the value of the
  ## |manifest| attribute of an |html| start tag or with |undef|.
  ## Returns nothing; the caller continues with the "before head"
  ## insertion mode.

  B: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      !!!cp ('t19');
      !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
      ## Ignore the token
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t20');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token.

        unless (length $token->{data}) {
          !!!cp ('t21');
          ## Stay in the insertion mode.
          !!!next-token;
          redo B;
        } else {
          !!!cp ('t22');
        }
      } else {
        !!!cp ('t23');
      }

      ## NOTE: The application cache selection callback is invoked by
      ## the common code below, after the root element has been
      ## created; it must not be invoked here as well.

      #
    } elsif ($token->{type} == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html') {
        my $root_element;
        !!!create-element ($root_element, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}},
            [$root_element, $el_category->{html}];

        if ($token->{attributes}->{manifest}) {
          !!!cp ('t24');
          $self->{application_cache_selection}
              ->($token->{attributes}->{manifest}->{value});
          ## ISSUE: Spec is unclear on relative references.
          ## According to Hixie (#whatwg 2008-03-19), it should be
          ## resolved against the base URI of the document in HTML
          ## or xml:base of the element in XHTML.
        } else {
          !!!cp ('t25');
          $self->{application_cache_selection}->(undef);
        }

        !!!nack ('t25c');

        !!!next-token;
        return; ## Go to the "before head" insertion mode.
      } else {
        !!!cp ('t25.1');
        #
      }
    } elsif ({
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t26');
      #
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## Any token that falls through to here implies the |html| element.
    my $root_element;
    !!!create-element ($root_element, $HTML_NS, 'html',, $token);
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, $el_category->{html}];

    $self->{application_cache_selection}->(undef);

    ## NOTE: Reprocess the token.
    !!!ack-later;
    return; ## Go to the "before head" insertion mode.

    ## ISSUE: There is an issue in the spec
  } # B

  die "$0: _tree_construction_root_element: This should never be reached";
} # _tree_construction_root_element
3390
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Resets |$self->{insertion_mode}| appropriately: walks
  ## |$self->{open_elements}| from the current node towards the root
  ## and selects the insertion mode from the first node that
  ## determines one.  Dies if the root is reached while
  ## |$self->{inner_html_node}| is not defined.

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        !!!cp ('t28');
        $node = $self->{inner_html_node};
      } else {
        die "_reset_insertion_mode: t27";
      }
    }

    ## Step 4..14
    my $new_mode;
    if ($node->[1] & FOREIGN_EL) {
      !!!cp ('t28.1');
      ## NOTE: Strictly speaking, the line below only applies to MathML and
      ## SVG elements.  Currently the HTML syntax supports only MathML and
      ## SVG elements as foreigners.
      $new_mode = IN_BODY_IM | IN_FOREIGN_CONTENT_IM;
    } elsif ($node->[1] & TABLE_CELL_EL) {
      if ($last) {
        !!!cp ('t28.2');
        #
      } else {
        !!!cp ('t28.3');
        $new_mode = IN_CELL_IM;
      }
    } else {
      !!!cp ('t28.4');
      $new_mode = {
        select => IN_SELECT_IM,
        ## NOTE: |option| and |optgroup| do not set
        ## insertion mode to "in select" by themselves.
        tr => IN_ROW_IM,
        tbody => IN_TABLE_BODY_IM,
        thead => IN_TABLE_BODY_IM,
        tfoot => IN_TABLE_BODY_IM,
        caption => IN_CAPTION_IM,
        colgroup => IN_COLUMN_GROUP_IM,
        table => IN_TABLE_IM,
        head => IN_BODY_IM, # not in head!
        body => IN_BODY_IM,
        frameset => IN_FRAMESET_IM,
      }->{$node->[0]->manakai_local_name};
    }
    ## NOTE: The return has to be unconditional once a mode is chosen;
    ## |$x = $mode and return| would fall through for a false mode value.
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 15
    if ($node->[1] & HTML_EL) {
      unless (defined $self->{head_element}) {
        !!!cp ('t29');
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        ## ISSUE: Can this state be reached?
        !!!cp ('t30');
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    } else {
      !!!cp ('t31');
    }

    ## Step 16
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 17
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 18
    redo S3;
  } # S3

  die "$0: _reset_insertion_mode: This line should never be reached";
} # _reset_insertion_mode
3477
3478 sub _tree_construction_main ($) {
3479 my $self = shift;
3480
3481 my $active_formatting_elements = [];
3482
my $reconstruct_active_formatting_elements = sub { # MUST
  ## Reconstructs the active formatting elements.  The argument is a
  ## code reference used to insert each newly cloned element node.
  my ($insert_node) = @_;

  ## Step 1
  return unless @$active_formatting_elements;

  ## Step 3
  my $pos = -1;
  my $entry = $active_formatting_elements->[$pos];

  ## Step 2
  return if $entry->[0] eq '#marker';
  for my $open (@{$self->{open_elements}}) {
    if ($entry->[0] eq $open->[0]) {
      !!!cp ('t32');
      return;
    }
  }

  ## Steps 4-7: rewind towards the start of the list until the first
  ## entry, a marker or an entry that is still open is found.
  REWIND: while (not ($active_formatting_elements->[0]->[0] eq $entry->[0])) {
    ## Step 5
    $pos--;
    $entry = $active_formatting_elements->[$pos];

    ## Step 6
    my $stop_here;
    if ($entry->[0] eq '#marker') {
      !!!cp ('t33_1');
      $stop_here = 1;
    } else {
      OE: for my $open (@{$self->{open_elements}}) {
        if ($entry->[0] eq $open->[0]) {
          !!!cp ('t33');
          $stop_here = 1;
          last OE;
        }
      }
      if ($stop_here) {
        !!!cp ('t34');
      } else {
        ## NOTE: <!DOCTYPE HTML><p><b><i><u></p> <p>X
        !!!cp ('t35');
      }
    }
    next REWIND unless $stop_here;

    ## Step 7
    $pos++;
    $entry = $active_formatting_elements->[$pos];
    last REWIND;
  } # REWIND

  ## Steps 8-11: clone and re-insert every entry from |$pos| to the
  ## end of the list.
  CLONE: while (1) {
    ## Step 8
    my $clone = [$entry->[0]->clone_node (0), $entry->[1]];

    ## Step 9
    $insert_node->($clone->[0]);
    push @{$self->{open_elements}}, $clone;

    ## Step 10
    $active_formatting_elements->[$pos] = $self->{open_elements}->[-1];

    ## Step 11
    last CLONE if $clone->[0] eq $active_formatting_elements->[-1]->[0];

    !!!cp ('t36');
    ## Step 7'
    $pos++;
    $entry = $active_formatting_elements->[$pos];
  } # CLONE

  !!!cp ('t37');
}; # $reconstruct_active_formatting_elements
3562
my $clear_up_to_marker = sub {
  ## Removes the last marker in the list of active formatting elements
  ## together with every entry after it.  The list is left untouched
  ## if it contains no marker.
  my $index = $#$active_formatting_elements;
  while ($index >= 0) {
    if ($active_formatting_elements->[$index]->[0] eq '#marker') {
      !!!cp ('t38');
      splice @$active_formatting_elements, $index;
      return;
    }
    $index--;
  }

  !!!cp ('t39');
}; # $clear_up_to_marker
3574
3575 my $insert;
3576
my $parse_rcdata = sub ($) {
  ## Generic CDATA/RCDATA element parsing.  The argument is the
  ## content model flag (|CDATA_CONTENT_MODEL| or
  ## |RCDATA_CONTENT_MODEL|) to be used while the content of the
  ## element opened by the current start tag token is collected.
  my $content_model_flag = shift;

  ## Step 1
  my $start_tag_name = $token->{tag_name};
  my $el;
  !!!create-element ($el, $HTML_NS, $start_tag_name, $token->{attributes}, $token);

  ## Step 2
  $insert->($el);

  ## Step 3
  $self->{content_model} = $content_model_flag; # CDATA or RCDATA
  delete $self->{escape}; # MUST

  ## Step 4: gather the data of all consecutive character tokens.
  my @chunk;
  !!!nack ('t40.1');
  !!!next-token;
  while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
    !!!cp ('t40');
    push @chunk, $token->{data};
    !!!next-token;
  }
  my $text = join '', @chunk;

  ## Step 5
  if (length $text) {
    !!!cp ('t41');
    my $text_node = $self->{document}->create_text_node ($text);
    $el->append_child ($text_node);
  }

  ## Step 6
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## Step 7
  my $is_matching_end_tag = ($token->{type} == END_TAG_TOKEN and
                             $token->{tag_name} eq $start_tag_name);
  if ($is_matching_end_tag) {
    !!!cp ('t42');
    ## Ignore the token
  } elsif ($content_model_flag == CDATA_CONTENT_MODEL) {
    ## NOTE: An end-of-file token.
    !!!cp ('t43');
    !!!parse-error (type => 'in CDATA:#'.$token->{type}, token => $token);
  } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
    ## NOTE: An end-of-file token.
    !!!cp ('t44');
    !!!parse-error (type => 'in RCDATA:#'.$token->{type}, token => $token);
  } else {
    die "$0: $content_model_flag in parse_rcdata";
  }
  !!!next-token;
}; # $parse_rcdata
3631
my $script_start_tag = sub () {
  ## Handles a |script| start tag: creates the element, collects its
  ## character data in the CDATA content model and inserts the element
  ## unless |$self->{inner_html_node}| is defined.
  my $script_el;
  !!!create-element ($script_el, $HTML_NS, 'script', $token->{attributes}, $token);
  ## TODO: mark as "parser-inserted"

  $self->{content_model} = CDATA_CONTENT_MODEL;
  delete $self->{escape}; # MUST

  ## Gather the data of all consecutive character tokens.
  my @chunk;
  !!!nack ('t45.1');
  !!!next-token;
  while ($token->{type} == CHARACTER_TOKEN) {
    !!!cp ('t45');
    push @chunk, $token->{data};
    !!!next-token;
  } # stop if non-character token or tokenizer stops tokenising
  my $script_text = join '', @chunk;
  if (length $script_text) {
    !!!cp ('t46');
    $script_el->manakai_append_text ($script_text);
  }

  $self->{content_model} = PCDATA_CONTENT_MODEL;

  my $is_script_end_tag = ($token->{type} == END_TAG_TOKEN and
                           $token->{tag_name} eq 'script');
  if ($is_script_end_tag) {
    !!!cp ('t47');
    ## Ignore the token
  } else {
    !!!cp ('t48');
    !!!parse-error (type => 'in CDATA:#'.$token->{type}, token => $token);
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  }

  if (defined $self->{inner_html_node}) {
    !!!cp ('t49');
    ## TODO: mark as "already executed"
  } else {
    !!!cp ('t50');
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    $insert->($script_el);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  }

  !!!next-token;
}; # $script_start_tag
3683
3684 ## NOTE: $open_tables->[-1]->[0] is the "current table" element node.
3685 ## NOTE: $open_tables->[-1]->[1] is the "tainted" flag.
3686 my $open_tables = [[$self->{open_elements}->[0]->[0]]];
3687
my $formatting_end_tag = sub {
  ## Handles the end tag of a formatting element.  The argument is the
  ## end tag token; its |tag_name| selects the formatting element.
  ## Every exit path fetches the next token before returning.
  my $end_tag_token = shift;
  my $tag_name = $end_tag_token->{tag_name};

  ## NOTE: The adoption agency algorithm (AAA).

  FET: {
    ## Step 1
    ## Find the last entry after the last marker whose local name
    ## matches the end tag.
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq '#marker') {
        !!!cp ('t52');
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0]->manakai_local_name
                   eq $tag_name) {
        !!!cp ('t51');
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!cp ('t53');
      !!!parse-error (type => 'unmatched end tag:'.$tag_name, token => $end_tag_token);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          !!!cp ('t54');
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!cp ('t55');
          !!!parse-error (type => 'unmatched end tag:'.$tag_name,
                          token => $end_tag_token);
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ($node->[1] & SCOPING_EL) {
        !!!cp ('t56');
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!cp ('t57');
      !!!parse-error (type => 'unmatched end tag:'.$tag_name,
                      token => $end_tag_token);
      ## Remove the formatting element itself, which is not
      ## necessarily the last entry of the list.
      splice @$active_formatting_elements,
          $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!cp ('t58');
      !!!parse-error (type => 'not closed',
                      value => $self->{open_elements}->[-1]->[0]
                          ->manakai_local_name,
                      token => $end_tag_token);
    }

    ## Step 2
    ## The loop runs from the current node down to the formatting
    ## element, so the last assignment wins, i.e. the candidate
    ## closest to the formatting element.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not ($node->[1] & FORMATTING_EL) and
          #not $phrasing_category->{$node->[1]} and
          ($node->[1] & SPECIAL_EL or
           $node->[1] & SCOPING_EL)) { ## Scoping is redundant, maybe
        !!!cp ('t59');
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        !!!cp ('t60');
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      !!!cp ('t61');
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      !!!cp ('t62');
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
            ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      ## A node that is not an active formatting element is dropped
      ## from the stack of open elements and the previous one is tried.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            !!!cp ('t63');
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        !!!cp ('t64');
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      if ($node->[0]->has_child_nodes ()) {
        !!!cp ('t65');
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    if ($common_ancestor_node->[1] & TABLE_ROWS_EL) {
      ## Foster parenting.
      my $foster_parent_element;
      my $next_sibling;
      OE: for (reverse 0..$#{$self->{open_elements}}) {
        if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
          my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
          if (defined $parent and $parent->node_type == 1) {
            !!!cp ('t65.1');
            $foster_parent_element = $parent;
            $next_sibling = $self->{open_elements}->[$_]->[0];
          } else {
            !!!cp ('t65.2');
            $foster_parent_element
                = $self->{open_elements}->[$_ - 1]->[0];
          }
          last OE;
        }
      } # OE
      $foster_parent_element = $self->{open_elements}->[0]->[0]
          unless defined $foster_parent_element;
      $foster_parent_element->insert_before ($last_node->[0], $next_sibling);
      $open_tables->[-1]->[1] = 1; # tainted
    } else {
      !!!cp ('t65.3');
      $common_ancestor_node->[0]->append_child ($last_node->[0]);
    }

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## Copy the child list first, since it changes while the children
    ## are moved.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## Remove the formatting element from the list of active formatting
    ## elements and insert the clone just after the bookmark.  |$i| is
    ## the index of the bookmark entry, adjusted for the removal.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t66');
        splice @$active_formatting_elements, $_, 1;
        if (defined $i) {
          $i--;
          last AFE;
        }
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        !!!cp ('t67');
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    ## Remove the formatting element from the stack of open elements
    ## and insert (not overwrite) the clone just after the furthest
    ## block.
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t68');
        splice @{$self->{open_elements}}, $_, 1;
        if (defined $i) {
          $i--;
          last OE;
        }
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        !!!cp ('t69');
        $i = $_;
      }
    } # OE
    splice @{$self->{open_elements}}, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
3920
## Default insertion: append the given node to the current node, i.e.
## the last entry of the stack of open elements.
$insert = my $insert_to_current = sub {
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($_[0]);
}; # $insert_to_current
3924
my $insert_to_foster = sub {
  ## Inserts the given node with foster parenting: if the current node
  ## is a |TABLE_ROWS_EL| element, the node is inserted via the last
  ## |TABLE_EL| entry of the stack of open elements and the current
  ## table is marked as tainted; otherwise the node is appended to the
  ## current node.
  my ($child) = @_;
  my $stack = $self->{open_elements};
  if (not ($stack->[-1]->[1] & TABLE_ROWS_EL)) {
    !!!cp ('t72');
    $stack->[-1]->[0]->append_child ($child);
  } else {
    # MUST
    my $foster_parent_element;
    my $next_sibling;
    my $index = $#$stack;
    OE: while ($index >= 0) {
      my $candidate = $stack->[$index];
      if ($candidate->[1] & TABLE_EL) {
        my $parent = $candidate->[0]->parent_node;
        if (defined $parent and $parent->node_type == 1) {
          ## The table has an element parent: insert before the table.
          !!!cp ('t70');
          $foster_parent_element = $parent;
          $next_sibling = $candidate->[0];
        } else {
          ## Otherwise use the entry just before the table in the stack.
          !!!cp ('t71');
          $foster_parent_element = $stack->[$index - 1]->[0];
        }
        last OE;
      }
      $index--;
    } # OE
    ## No table in the stack: fall back to the first entry.
    $foster_parent_element = $stack->[0]->[0]
        unless defined $foster_parent_element;
    $foster_parent_element->insert_before
        ($child, $next_sibling);
    $open_tables->[-1]->[1] = 1; # tainted
  }
}; # $insert_to_foster
3956
3957 B: while (1) {
3958 if ($token->{type} == DOCTYPE_TOKEN) {
3959 !!!cp ('t73');
3960 !!!parse-error (type => 'DOCTYPE in the middle', token => $token);
3961 ## Ignore the token
3962 ## Stay in the phase
3963 !!!next-token;
3964 next B;
3965 } elsif ($token->{type} == START_TAG_TOKEN and
3966 $token->{tag_name} eq 'html') {
3967 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
3968 !!!cp ('t79');
3969 !!!parse-error (type => 'after html:html', token => $token);
3970 $self->{insertion_mode} = AFTER_BODY_IM;
3971 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
3972 !!!cp ('t80');
3973 !!!parse-error (type => 'after html:html', token => $token);
3974 $self->{insertion_mode} = AFTER_FRAMESET_IM;
3975 } else {
3976 !!!cp ('t81');
3977 }
3978
3979 !!!cp ('t82');
3980 !!!parse-error (type => 'not first start tag', token => $token);
3981 my $top_el = $self->{open_elements}->[0]->[0];
3982 for my $attr_name (keys %{$token->{attributes}}) {
3983 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
3984 !!!cp ('t84');
3985 $top_el->set_attribute_ns
3986 (undef, [undef, $attr_name],
3987 $token->{attributes}->{$attr_name}->{value});
3988 }
3989 }
3990 !!!nack ('t84.1');
3991 !!!next-token;
3992 next B;
3993 } elsif ($token->{type} == COMMENT_TOKEN) {
3994 my $comment = $self->{document}->create_comment ($token->{data});
3995 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
3996 !!!cp ('t85');
3997 $self->{document}->append_child ($comment);
3998 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
3999 !!!cp ('t86');
4000 $self->{open_elements}->[0]->[0]->append_child ($comment);
4001 } else {
4002 !!!cp ('t87');
4003 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4004 }
4005 !!!next-token;
4006 next B;
4007 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
4008 if ($token->{type} == CHARACTER_TOKEN) {
4009 !!!cp ('t87.1');
4010 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4011 !!!next-token;
4012 next B;
4013 } elsif ($token->{type} == START_TAG_TOKEN) {
4014 if ((not {mglyph => 1, malignmark => 1}->{$token->{tag_name}} and
4015 $self->{open_elements}->[-1]->[1] & FOREIGN_FLOW_CONTENT_EL) or
4016 not ($self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
4017 ($token->{tag_name} eq 'svg' and
4018 $self->{open_elements}->[-1]->[1] & MML_AXML_EL)) {
4019 ## NOTE: "using the rules for secondary insertion mode"then"continue"
4020 !!!cp ('t87.2');
4021 #
4022 } elsif ({
4023 b => 1, big => 1, blockquote => 1, body => 1, br => 1,
4024 center => 1, code => 1, dd => 1, div => 1, dl => 1, dt => 1,
4025 em => 1, embed => 1, font => 1, h1 => 1, h2 => 1, h3 => 1,
4026 h4 => 1, h5 => 1, h6 => 1, head => 1, hr => 1, i => 1,
4027 img => 1, li => 1, listing => 1, menu => 1, meta => 1,
4028 nobr => 1, ol => 1, p => 1, pre => 1, ruby => 1, s => 1,
4029 small => 1, span => 1, strong => 1, strike => 1, sub => 1,
4030 sup => 1, table => 1, tt => 1, u => 1, ul => 1, var => 1,
4031 }->{$token->{tag_name}}) {
4032 !!!cp ('t87.2');
4033 !!!parse-error (type => 'not closed',
4034 value => $self->{open_elements}->[-1]->[0]
4035 ->manakai_local_name,
4036 token => $token);
4037
4038 pop @{$self->{open_elements}}
4039 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4040
4041 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4042 ## Reprocess.
4043 next B;
4044 } else {
4045 my $nsuri = $self->{open_elements}->[-1]->[0]->namespace_uri;
4046 my $tag_name = $token->{tag_name};
4047 if ($nsuri eq $SVG_NS) {
4048 $tag_name = {
4049 altglyph => 'altGlyph',
4050 altglyphdef => 'altGlyphDef',
4051 altglyphitem => 'altGlyphItem',
4052 animatecolor => 'animateColor',
4053 animatemotion => 'animateMotion',
4054 animatetransform => 'animateTransform',
4055 clippath => 'clipPath',
4056 feblend => 'feBlend',
4057 fecolormatrix => 'feColorMatrix',
4058 fecomponenttransfer => 'feComponentTransfer',
4059 fecomposite => 'feComposite',
4060 feconvolvematrix => 'feConvolveMatrix',
4061 fediffuselighting => 'feDiffuseLighting',
4062 fedisplacementmap => 'feDisplacementMap',
4063 fedistantlight => 'feDistantLight',
4064 feflood => 'feFlood',
4065 fefunca => 'feFuncA',
4066 fefuncb => 'feFuncB',
4067 fefuncg => 'feFuncG',
4068 fefuncr => 'feFuncR',
4069 fegaussianblur => 'feGaussianBlur',
4070 feimage => 'feImage',
4071 femerge => 'feMerge',
4072 femergenode => 'feMergeNode',
4073 femorphology => 'feMorphology',
4074 feoffset => 'feOffset',
4075 fepointlight => 'fePointLight',
4076 fespecularlighting => 'feSpecularLighting',
4077 fespotlight => 'feSpotLight',
4078 fetile => 'feTile',
4079 feturbulence => 'feTurbulence',
4080 foreignobject => 'foreignObject',
4081 glyphref => 'glyphRef',
4082 lineargradient => 'linearGradient',
4083 radialgradient => 'radialGradient',
4084 #solidcolor => 'solidColor', ## NOTE: Commented in spec (SVG1.2)
4085 textpath => 'textPath',
4086 }->{$tag_name} || $tag_name;
4087 }
4088
4089 ## "adjust SVG attributes" (SVG only) - done in insert-element-f
4090
4091 ## "adjust foreign attributes" - done in insert-element-f
4092
4093 !!!insert-element-f ($nsuri, $tag_name, $token->{attributes}, $token);
4094
4095 if ($self->{self_closing}) {
4096 pop @{$self->{open_elements}};
4097 !!!ack ('t87.3');
4098 } else {
4099 !!!cp ('t87.4');
4100 }
4101
4102 !!!next-token;
4103 next B;
4104 }
4105 } elsif ($token->{type} == END_TAG_TOKEN) {
4106 ## NOTE: "using the rules for secondary insertion mode" then "continue"
4107 !!!cp ('t87.5');
4108 #
4109 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4110 !!!cp ('t87.6');
4111 !!!parse-error (type => 'not closed',
4112 value => $self->{open_elements}->[-1]->[0]
4113 ->manakai_local_name,
4114 token => $token);
4115
4116 pop @{$self->{open_elements}}
4117 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4118
4119 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4120 ## Reprocess.
4121 next B;
4122 } else {
4123 die "$0: $token->{type}: Unknown token type";
4124 }
4125 }
4126
4127 if ($self->{insertion_mode} & HEAD_IMS) {
4128 if ($token->{type} == CHARACTER_TOKEN) {
4129 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4130 unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4131 !!!cp ('t88.2');
4132 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4133 } else {
4134 !!!cp ('t88.1');
4135 ## Ignore the token.
4136 !!!next-token;
4137 next B;
4138 }
4139 unless (length $token->{data}) {
4140 !!!cp ('t88');
4141 !!!next-token;
4142 next B;
4143 }
4144 }
4145
4146 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4147 !!!cp ('t89');
4148 ## As if <head>
4149 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4150 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4151 push @{$self->{open_elements}},
4152 [$self->{head_element}, $el_category->{head}];
4153
4154 ## Reprocess in the "in head" insertion mode...
4155 pop @{$self->{open_elements}};
4156
4157 ## Reprocess in the "after head" insertion mode...
4158 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4159 !!!cp ('t90');
4160 ## As if </noscript>
4161 pop @{$self->{open_elements}};
4162 !!!parse-error (type => 'in noscript:#character', token => $token);
4163
4164 ## Reprocess in the "in head" insertion mode...
4165 ## As if </head>
4166 pop @{$self->{open_elements}};
4167
4168 ## Reprocess in the "after head" insertion mode...
4169 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4170 !!!cp ('t91');
4171 pop @{$self->{open_elements}};
4172
4173 ## Reprocess in the "after head" insertion mode...
4174 } else {
4175 !!!cp ('t92');
4176 }
4177
4178 ## "after head" insertion mode
4179 ## As if <body>
4180 !!!insert-element ('body',, $token);
4181 $self->{insertion_mode} = IN_BODY_IM;
4182 ## reprocess
4183 next B;
4184 } elsif ($token->{type} == START_TAG_TOKEN) {
4185 if ($token->{tag_name} eq 'head') {
4186 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4187 !!!cp ('t93');
4188 !!!create-element ($self->{head_element}, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
4189 $self->{open_elements}->[-1]->[0]->append_child
4190 ($self->{head_element});
4191 push @{$self->{open_elements}},
4192 [$self->{head_element}, $el_category->{head}];
4193 $self->{insertion_mode} = IN_HEAD_IM;
4194 !!!nack ('t93.1');
4195 !!!next-token;
4196 next B;
4197 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4198 !!!cp ('t93.2');
4199 !!!parse-error (type => 'after head:head', token => $token); ## TODO: error type
4200 ## Ignore the token
4201 !!!nack ('t93.3');
4202 !!!next-token;
4203 next B;
4204 } else {
4205 !!!cp ('t95');
4206 !!!parse-error (type => 'in head:head', token => $token); # or in head noscript
4207 ## Ignore the token
4208 !!!nack ('t95.1');
4209 !!!next-token;
4210 next B;
4211 }
4212 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4213 !!!cp ('t96');
4214 ## As if <head>
4215 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4216 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4217 push @{$self->{open_elements}},
4218 [$self->{head_element}, $el_category->{head}];
4219
4220 $self->{insertion_mode} = IN_HEAD_IM;
4221 ## Reprocess in the "in head" insertion mode...
4222 } else {
4223 !!!cp ('t97');
4224 }
4225
4226 if ($token->{tag_name} eq 'base') {
4227 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4228 !!!cp ('t98');
4229 ## As if </noscript>
4230 pop @{$self->{open_elements}};
4231 !!!parse-error (type => 'in noscript:base', token => $token);
4232
4233 $self->{insertion_mode} = IN_HEAD_IM;
4234 ## Reprocess in the "in head" insertion mode...
4235 } else {
4236 !!!cp ('t99');
4237 }
4238
4239 ## NOTE: There is a "as if in head" code clone.
4240 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4241 !!!cp ('t100');
4242 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4243 push @{$self->{open_elements}},
4244 [$self->{head_element}, $el_category->{head}];
4245 } else {
4246 !!!cp ('t101');
4247 }
4248 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4249 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4250 pop @{$self->{open_elements}} # <head>
4251 if $self->{insertion_mode} == AFTER_HEAD_IM;
4252 !!!nack ('t101.1');
4253 !!!next-token;
4254 next B;
4255 } elsif ($token->{tag_name} eq 'link') {
4256 ## NOTE: There is a "as if in head" code clone.
4257 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4258 !!!cp ('t102');
4259 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4260 push @{$self->{open_elements}},
4261 [$self->{head_element}, $el_category->{head}];
4262 } else {
4263 !!!cp ('t103');
4264 }
4265 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4266 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4267 pop @{$self->{open_elements}} # <head>
4268 if $self->{insertion_mode} == AFTER_HEAD_IM;
4269 !!!ack ('t103.1');
4270 !!!next-token;
4271 next B;
4272 } elsif ($token->{tag_name} eq 'meta') {
4273 ## NOTE: There is a "as if in head" code clone.
4274 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4275 !!!cp ('t104');
4276 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4277 push @{$self->{open_elements}},
4278 [$self->{head_element}, $el_category->{head}];
4279 } else {
4280 !!!cp ('t105');
4281 }
4282 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4283 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4284
4285 unless ($self->{confident}) {
4286 if ($token->{attributes}->{charset}) {
4287 !!!cp ('t106');
4288 ## NOTE: Whether the encoding is supported or not is handled
4289 ## in the {change_encoding} callback.
4290 $self->{change_encoding}
4291 ->($self, $token->{attributes}->{charset}->{value},
4292 $token);
4293
4294 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4295 ->set_user_data (manakai_has_reference =>
4296 $token->{attributes}->{charset}
4297 ->{has_reference});
4298 } elsif ($token->{attributes}->{content}) {
4299 if ($token->{attributes}->{content}->{value}
4300 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4301 [\x09-\x0D\x20]*=
4302 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4303 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
4304 !!!cp ('t107');
4305 ## NOTE: Whether the encoding is supported or not is handled
4306 ## in the {change_encoding} callback.
4307 $self->{change_encoding}
4308 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
4309 $token);
4310 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4311 ->set_user_data (manakai_has_reference =>
4312 $token->{attributes}->{content}
4313 ->{has_reference});
4314 } else {
4315 !!!cp ('t108');
4316 }
4317 }
4318 } else {
4319 if ($token->{attributes}->{charset}) {
4320 !!!cp ('t109');
4321 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4322 ->set_user_data (manakai_has_reference =>
4323 $token->{attributes}->{charset}
4324 ->{has_reference});
4325 }
4326 if ($token->{attributes}->{content}) {
4327 !!!cp ('t110');
4328 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4329 ->set_user_data (manakai_has_reference =>
4330 $token->{attributes}->{content}
4331 ->{has_reference});
4332 }
4333 }
4334
4335 pop @{$self->{open_elements}} # <head>
4336 if $self->{insertion_mode} == AFTER_HEAD_IM;
4337 !!!ack ('t110.1');
4338 !!!next-token;
4339 next B;
4340 } elsif ($token->{tag_name} eq 'title') {
4341 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4342 !!!cp ('t111');
4343 ## As if </noscript>
4344 pop @{$self->{open_elements}};
4345 !!!parse-error (type => 'in noscript:title', token => $token);
4346
4347 $self->{insertion_mode} = IN_HEAD_IM;
4348 ## Reprocess in the "in head" insertion mode...
4349 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4350 !!!cp ('t112');
4351 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4352 push @{$self->{open_elements}},
4353 [$self->{head_element}, $el_category->{head}];
4354 } else {
4355 !!!cp ('t113');
4356 }
4357
4358 ## NOTE: There is a "as if in head" code clone.
4359 my $parent = defined $self->{head_element} ? $self->{head_element}
4360 : $self->{open_elements}->[-1]->[0];
4361 $parse_rcdata->(RCDATA_CONTENT_MODEL);
4362 pop @{$self->{open_elements}} # <head>
4363 if $self->{insertion_mode} == AFTER_HEAD_IM;
4364 next B;
4365 } elsif ($token->{tag_name} eq 'style' or
4366 $token->{tag_name} eq 'noframes') {
4367 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
4368 ## insertion mode IN_HEAD_IM)
4369 ## NOTE: There is a "as if in head" code clone.
4370 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4371 !!!cp ('t114');
4372 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4373 push @{$self->{open_elements}},
4374 [$self->{head_element}, $el_category->{head}];
4375 } else {
4376 !!!cp ('t115');
4377 }
4378 $parse_rcdata->(CDATA_CONTENT_MODEL);
4379 pop @{$self->{open_elements}} # <head>
4380 if $self->{insertion_mode} == AFTER_HEAD_IM;
4381 next B;
4382 } elsif ($token->{tag_name} eq 'noscript') {
4383 if ($self->{insertion_mode} == IN_HEAD_IM) {
4384 !!!cp ('t116');
4385 ## NOTE: and scripting is disabled
4386 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4387 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
4388 !!!nack ('t116.1');
4389 !!!next-token;
4390 next B;
4391 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4392 !!!cp ('t117');
4393 !!!parse-error (type => 'in noscript:noscript', token => $token);
4394 ## Ignore the token
4395 !!!nack ('t117.1');
4396 !!!next-token;
4397 next B;
4398 } else {
4399 !!!cp ('t118');
4400 #
4401 }
4402 } elsif ($token->{tag_name} eq 'script') {
4403 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4404 !!!cp ('t119');
4405 ## As if </noscript>
4406 pop @{$self->{open_elements}};
4407 !!!parse-error (type => 'in noscript:script', token => $token);
4408
4409 $self->{insertion_mode} = IN_HEAD_IM;
4410 ## Reprocess in the "in head" insertion mode...
4411 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4412 !!!cp ('t120');
4413 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4414 push @{$self->{open_elements}},
4415 [$self->{head_element}, $el_category->{head}];
4416 } else {
4417 !!!cp ('t121');
4418 }
4419
4420 ## NOTE: There is a "as if in head" code clone.
4421 $script_start_tag->();
4422 pop @{$self->{open_elements}} # <head>
4423 if $self->{insertion_mode} == AFTER_HEAD_IM;
4424 next B;
4425 } elsif ($token->{tag_name} eq 'body' or
4426 $token->{tag_name} eq 'frameset') {
4427 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4428 !!!cp ('t122');
4429 ## As if </noscript>
4430 pop @{$self->{open_elements}};
4431 !!!parse-error (type => 'in noscript:'.$token->{tag_name}, token => $token);
4432
4433 ## Reprocess in the "in head" insertion mode...
4434 ## As if </head>
4435 pop @{$self->{open_elements}};
4436
4437 ## Reprocess in the "after head" insertion mode...
4438 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4439 !!!cp ('t124');
4440 pop @{$self->{open_elements}};
4441
4442 ## Reprocess in the "after head" insertion mode...
4443 } else {
4444 !!!cp ('t125');
4445 }
4446
4447 ## "after head" insertion mode
4448 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4449 if ($token->{tag_name} eq 'body') {
4450 !!!cp ('t126');
4451 $self->{insertion_mode} = IN_BODY_IM;
4452 } elsif ($token->{tag_name} eq 'frameset') {
4453 !!!cp ('t127');
4454 $self->{insertion_mode} = IN_FRAMESET_IM;
4455 } else {
4456 die "$0: tag name: $self->{tag_name}";
4457 }
4458 !!!nack ('t127.1');
4459 !!!next-token;
4460 next B;
4461 } else {
4462 !!!cp ('t128');
4463 #
4464 }
4465
4466 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4467 !!!cp ('t129');
4468 ## As if </noscript>
4469 pop @{$self->{open_elements}};
4470 !!!parse-error (type => 'in noscript:/'.$token->{tag_name}, token => $token);
4471
4472 ## Reprocess in the "in head" insertion mode...
4473 ## As if </head>
4474 pop @{$self->{open_elements}};
4475
4476 ## Reprocess in the "after head" insertion mode...
4477 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4478 !!!cp ('t130');
4479 ## As if </head>
4480 pop @{$self->{open_elements}};
4481
4482 ## Reprocess in the "after head" insertion mode...
4483 } else {
4484 !!!cp ('t131');
4485 }
4486
4487 ## "after head" insertion mode
4488 ## As if <body>
4489 !!!insert-element ('body',, $token);
4490 $self->{insertion_mode} = IN_BODY_IM;
4491 ## reprocess
4492 !!!ack-later;
4493 next B;
4494 } elsif ($token->{type} == END_TAG_TOKEN) {
4495 if ($token->{tag_name} eq 'head') {
4496 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4497 !!!cp ('t132');
4498 ## As if <head>
4499 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4500 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4501 push @{$self->{open_elements}},
4502 [$self->{head_element}, $el_category->{head}];
4503
4504 ## Reprocess in the "in head" insertion mode...
4505 pop @{$self->{open_elements}};
4506 $self->{insertion_mode} = AFTER_HEAD_IM;
4507 !!!next-token;
4508 next B;
4509 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4510 !!!cp ('t133');
4511 ## As if </noscript>
4512 pop @{$self->{open_elements}};
4513 !!!parse-error (type => 'in noscript:/head', token => $token);
4514
4515 ## Reprocess in the "in head" insertion mode...
4516 pop @{$self->{open_elements}};
4517 $self->{insertion_mode} = AFTER_HEAD_IM;
4518 !!!next-token;
4519 next B;
4520 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4521 !!!cp ('t134');
4522 pop @{$self->{open_elements}};
4523 $self->{insertion_mode} = AFTER_HEAD_IM;
4524 !!!next-token;
4525 next B;
4526 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4527 !!!cp ('t134.1');
4528 !!!parse-error (type => 'unmatched end tag:head', token => $token);
4529 ## Ignore the token
4530 !!!next-token;
4531 next B;
4532 } else {
4533 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4534 }
4535 } elsif ($token->{tag_name} eq 'noscript') {
4536 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4537 !!!cp ('t136');
4538 pop @{$self->{open_elements}};
4539 $self->{insertion_mode} = IN_HEAD_IM;
4540 !!!next-token;
4541 next B;
4542 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM or
4543 $self->{insertion_mode} == AFTER_HEAD_IM) {
4544 !!!cp ('t137');
4545 !!!parse-error (type => 'unmatched end tag:noscript', token => $token);
4546 ## Ignore the token ## ISSUE: An issue in the spec.
4547 !!!next-token;
4548 next B;
4549 } else {
4550 !!!cp ('t138');
4551 #
4552 }
4553 } elsif ({
4554 body => 1, html => 1,
4555 }->{$token->{tag_name}}) {
4556 if ($self->{insertion_mode} == BEFORE_HEAD_IM or
4557 $self->{insertion_mode} == IN_HEAD_IM or
4558 $self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4559 !!!cp ('t140');
4560 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4561 ## Ignore the token
4562 !!!next-token;
4563 next B;
4564 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4565 !!!cp ('t140.1');
4566 !!!parse-error (type => 'unmatched end tag:' . $token->{tag_name}, token => $token);
4567 ## Ignore the token
4568 !!!next-token;
4569 next B;
4570 } else {
4571 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4572 }
4573 } elsif ($token->{tag_name} eq 'p') {
4574 !!!cp ('t142');
4575 !!!parse-error (type => 'unmatched end tag:p', token => $token);
4576 ## Ignore the token
4577 !!!next-token;
4578 next B;
4579 } elsif ($token->{tag_name} eq 'br') {
4580 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4581 !!!cp ('t142.2');
4582 ## (before head) as if <head>, (in head) as if </head>
4583 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4584 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4585 $self->{insertion_mode} = AFTER_HEAD_IM;
4586
4587 ## Reprocess in the "after head" insertion mode...
4588 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4589 !!!cp ('t143.2');
4590 ## As if </head>
4591 pop @{$self->{open_elements}};
4592 $self->{insertion_mode} = AFTER_HEAD_IM;
4593
4594 ## Reprocess in the "after head" insertion mode...
4595 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4596 !!!cp ('t143.3');
4597 ## ISSUE: Two parse errors for <head><noscript></br>
4598 !!!parse-error (type => 'unmatched end tag:br', token => $token);
4599 ## As if </noscript>
4600 pop @{$self->{open_elements}};
4601 $self->{insertion_mode} = IN_HEAD_IM;
4602
4603 ## Reprocess in the "in head" insertion mode...
4604 ## As if </head>
4605 pop @{$self->{open_elements}};
4606 $self->{insertion_mode} = AFTER_HEAD_IM;
4607
4608 ## Reprocess in the "after head" insertion mode...
4609 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4610 !!!cp ('t143.4');
4611 #
4612 } else {
4613 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4614 }
4615
4616 ## ISSUE: does not agree with IE7 - it doesn't ignore </br>.
4617 !!!parse-error (type => 'unmatched end tag:br', token => $token);
4618 ## Ignore the token
4619 !!!next-token;
4620 next B;
4621 } else {
4622 !!!cp ('t145');
4623 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4624 ## Ignore the token
4625 !!!next-token;
4626 next B;
4627 }
4628
4629 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4630 !!!cp ('t146');
4631 ## As if </noscript>
4632 pop @{$self->{open_elements}};
4633 !!!parse-error (type => 'in noscript:/'.$token->{tag_name}, token => $token);
4634
4635 ## Reprocess in the "in head" insertion mode...
4636 ## As if </head>
4637 pop @{$self->{open_elements}};
4638
4639 ## Reprocess in the "after head" insertion mode...
4640 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4641 !!!cp ('t147');
4642 ## As if </head>
4643 pop @{$self->{open_elements}};
4644
4645 ## Reprocess in the "after head" insertion mode...
4646 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4647 ## ISSUE: This case cannot be reached?
4648 !!!cp ('t148');
4649 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4650 ## Ignore the token ## ISSUE: An issue in the spec.
4651 !!!next-token;
4652 next B;
4653 } else {
4654 !!!cp ('t149');
4655 }
4656
4657 ## "after head" insertion mode
4658 ## As if <body>
4659 !!!insert-element ('body',, $token);
4660 $self->{insertion_mode} = IN_BODY_IM;
4661 ## reprocess
4662 next B;
4663 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4664 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4665 !!!cp ('t149.1');
4666
4667 ## NOTE: As if <head>
4668 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4669 $self->{open_elements}->[-1]->[0]->append_child
4670 ($self->{head_element});
4671 #push @{$self->{open_elements}},
4672 # [$self->{head_element}, $el_category->{head}];
4673 #$self->{insertion_mode} = IN_HEAD_IM;
4674 ## NOTE: Reprocess.
4675
4676 ## NOTE: As if </head>
4677 #pop @{$self->{open_elements}};
4678 #$self->{insertion_mode} = AFTER_HEAD_IM;
4679 ## NOTE: Reprocess.
4680
4681 #
4682 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4683 !!!cp ('t149.2');
4684
4685 ## NOTE: As if </head>
4686 pop @{$self->{open_elements}};
4687 #$self->{insertion_mode} = AFTER_HEAD_IM;
4688 ## NOTE: Reprocess.
4689
4690 #
4691 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4692 !!!cp ('t149.3');
4693
4694 !!!parse-error (type => 'in noscript:#eof', token => $token);
4695
4696 ## As if </noscript>
4697 pop @{$self->{open_elements}};
4698 #$self->{insertion_mode} = IN_HEAD_IM;
4699 ## NOTE: Reprocess.
4700
4701 ## NOTE: As if </head>
4702 pop @{$self->{open_elements}};
4703 #$self->{insertion_mode} = AFTER_HEAD_IM;
4704 ## NOTE: Reprocess.
4705
4706 #
4707 } else {
4708 !!!cp ('t149.4');
4709 #
4710 }
4711
4712 ## NOTE: As if <body>
4713 !!!insert-element ('body',, $token);
4714 $self->{insertion_mode} = IN_BODY_IM;
4715 ## NOTE: Reprocess.
4716 next B;
4717 } else {
4718 die "$0: $token->{type}: Unknown token type";
4719 }
4720
4721 ## ISSUE: An issue in the spec.
4722 } elsif ($self->{insertion_mode} & BODY_IMS) {
4723 if ($token->{type} == CHARACTER_TOKEN) {
4724 !!!cp ('t150');
4725 ## NOTE: There is a code clone of "character in body".
4726 $reconstruct_active_formatting_elements->($insert_to_current);
4727
4728 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4729
4730 !!!next-token;
4731 next B;
4732 } elsif ($token->{type} == START_TAG_TOKEN) {
4733 if ({
4734 caption => 1, col => 1, colgroup => 1, tbody => 1,
4735 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
4736 }->{$token->{tag_name}}) {
4737 if ($self->{insertion_mode} == IN_CELL_IM) {
4738 ## have an element in table scope
4739 for (reverse 0..$#{$self->{open_elements}}) {
4740 my $node = $self->{open_elements}->[$_];
4741 if ($node->[1] & TABLE_CELL_EL) {
4742 !!!cp ('t151');
4743
4744 ## Close the cell
4745 !!!back-token; # <x>
4746 $token = {type => END_TAG_TOKEN,
4747 tag_name => $node->[0]->manakai_local_name,
4748 line => $token->{line},
4749 column => $token->{column}};
4750 next B;
4751 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4752 !!!cp ('t152');
4753 ## ISSUE: This case can never be reached, maybe.
4754 last;
4755 }
4756 }
4757
4758 !!!cp ('t153');
4759 !!!parse-error (type => 'start tag not allowed',
4760 value => $token->{tag_name}, token => $token);
4761 ## Ignore the token
4762 !!!nack ('t153.1');
4763 !!!next-token;
4764 next B;
4765 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4766 !!!parse-error (type => 'not closed:caption', token => $token);
4767
4768 ## NOTE: As if </caption>.
4769 ## have a table element in table scope
4770 my $i;
4771 INSCOPE: {
4772 for (reverse 0..$#{$self->{open_elements}}) {
4773 my $node = $self->{open_elements}->[$_];
4774 if ($node->[1] & CAPTION_EL) {
4775 !!!cp ('t155');
4776 $i = $_;
4777 last INSCOPE;
4778 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4779 !!!cp ('t156');
4780 last;
4781 }
4782 }
4783
4784 !!!cp ('t157');
4785 !!!parse-error (type => 'start tag not allowed',
4786 value => $token->{tag_name}, token => $token);
4787 ## Ignore the token
4788 !!!nack ('t157.1');
4789 !!!next-token;
4790 next B;
4791 } # INSCOPE
4792
4793 ## generate implied end tags
4794 while ($self->{open_elements}->[-1]->[1]
4795 & END_TAG_OPTIONAL_EL) {
4796 !!!cp ('t158');
4797 pop @{$self->{open_elements}};
4798 }
4799
4800 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4801 !!!cp ('t159');
4802 !!!parse-error (type => 'not closed',
4803 value => $self->{open_elements}->[-1]->[0]
4804 ->manakai_local_name,
4805 token => $token);
4806 } else {
4807 !!!cp ('t160');
4808 }
4809
4810 splice @{$self->{open_elements}}, $i;
4811
4812 $clear_up_to_marker->();
4813
4814 $self->{insertion_mode} = IN_TABLE_IM;
4815
4816 ## reprocess
4817 !!!ack-later;
4818 next B;
4819 } else {
4820 !!!cp ('t161');
4821 #
4822 }
4823 } else {
4824 !!!cp ('t162');
4825 #
4826 }
4827 } elsif ($token->{type} == END_TAG_TOKEN) {
4828 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
4829 if ($self->{insertion_mode} == IN_CELL_IM) {
4830 ## have an element in table scope
4831 my $i;
4832 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4833 my $node = $self->{open_elements}->[$_];
4834 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
4835 !!!cp ('t163');
4836 $i = $_;
4837 last INSCOPE;
4838 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4839 !!!cp ('t164');
4840 last INSCOPE;
4841 }
4842 } # INSCOPE
4843 unless (defined $i) {
4844 !!!cp ('t165');
4845 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4846 ## Ignore the token
4847 !!!next-token;
4848 next B;
4849 }
4850
4851 ## generate implied end tags
4852 while ($self->{open_elements}->[-1]->[1]
4853 & END_TAG_OPTIONAL_EL) {
4854 !!!cp ('t166');
4855 pop @{$self->{open_elements}};
4856 }
4857
4858 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
4859 ne $token->{tag_name}) {
4860 !!!cp ('t167');
4861 !!!parse-error (type => 'not closed',
4862 value => $self->{open_elements}->[-1]->[0]
4863 ->manakai_local_name,
4864 token => $token);
4865 } else {
4866 !!!cp ('t168');
4867 }
4868
4869 splice @{$self->{open_elements}}, $i;
4870
4871 $clear_up_to_marker->();
4872
4873 $self->{insertion_mode} = IN_ROW_IM;
4874
4875 !!!next-token;
4876 next B;
4877 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4878 !!!cp ('t169');
4879 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4880 ## Ignore the token
4881 !!!next-token;
4882 next B;
4883 } else {
4884 !!!cp ('t170');
4885 #
4886 }
4887 } elsif ($token->{tag_name} eq 'caption') {
4888 if ($self->{insertion_mode} == IN_CAPTION_IM) {
4889 ## have a table element in table scope
4890 my $i;
4891 INSCOPE: {
4892 for (reverse 0..$#{$self->{open_elements}}) {
4893 my $node = $self->{open_elements}->[$_];
4894 if ($node->[1] & CAPTION_EL) {
4895 !!!cp ('t171');
4896 $i = $_;
4897 last INSCOPE;
4898 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4899 !!!cp ('t172');
4900 last;
4901 }
4902 }
4903
4904 !!!cp ('t173');
4905 !!!parse-error (type => 'unmatched end tag',
4906 value => $token->{tag_name}, token => $token);
4907 ## Ignore the token
4908 !!!next-token;
4909 next B;
4910 } # INSCOPE
4911
4912 ## generate implied end tags
4913 while ($self->{open_elements}->[-1]->[1]
4914 & END_TAG_OPTIONAL_EL) {
4915 !!!cp ('t174');
4916 pop @{$self->{open_elements}};
4917 }
4918
4919 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4920 !!!cp ('t175');
4921 !!!parse-error (type => 'not closed',
4922 value => $self->{open_elements}->[-1]->[0]
4923 ->manakai_local_name,
4924 token => $token);
4925 } else {
4926 !!!cp ('t176');
4927 }
4928
4929 splice @{$self->{open_elements}}, $i;
4930
4931 $clear_up_to_marker->();
4932
4933 $self->{insertion_mode} = IN_TABLE_IM;
4934
4935 !!!next-token;
4936 next B;
4937 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
4938 !!!cp ('t177');
4939 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4940 ## Ignore the token
4941 !!!next-token;
4942 next B;
4943 } else {
4944 !!!cp ('t178');
4945 #
4946 }
4947 } elsif ({
4948 table => 1, tbody => 1, tfoot => 1,
4949 thead => 1, tr => 1,
4950 }->{$token->{tag_name}} and
4951 $self->{insertion_mode} == IN_CELL_IM) {
4952 ## have an element in table scope
4953 my $i;
4954 my $tn;
4955 INSCOPE: {
4956 for (reverse 0..$#{$self->{open_elements}}) {
4957 my $node = $self->{open_elements}->[$_];
4958 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
4959 !!!cp ('t179');
4960 $i = $_;
4961
4962 ## Close the cell
4963 !!!back-token; # </x>
4964 $token = {type => END_TAG_TOKEN, tag_name => $tn,
4965 line => $token->{line},
4966 column => $token->{column}};
4967 next B;
4968 } elsif ($node->[1] & TABLE_CELL_EL) {
4969 !!!cp ('t180');
4970 $tn = $node->[0]->manakai_local_name;
4971 ## NOTE: There is exactly one |td| or |th| element
4972 ## in scope in the stack of open elements by definition.
4973 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4974 ## ISSUE: Can this be reached?
4975 !!!cp ('t181');
4976 last;
4977 }
4978 }
4979
4980 !!!cp ('t182');
4981 !!!parse-error (type => 'unmatched end tag',
4982 value => $token->{tag_name}, token => $token);
4983 ## Ignore the token
4984 !!!next-token;
4985 next B;
4986 } # INSCOPE
4987 } elsif ($token->{tag_name} eq 'table' and
4988 $self->{insertion_mode} == IN_CAPTION_IM) {
4989 !!!parse-error (type => 'not closed:caption', token => $token);
4990
4991 ## As if </caption>
4992 ## have a table element in table scope
4993 my $i;
4994 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4995 my $node = $self->{open_elements}->[$_];
4996 if ($node->[1] & CAPTION_EL) {
4997 !!!cp ('t184');
4998 $i = $_;
4999 last INSCOPE;
5000 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5001 !!!cp ('t185');
5002 last INSCOPE;
5003 }
5004 } # INSCOPE
5005 unless (defined $i) {
5006 !!!cp ('t186');
5007 !!!parse-error (type => 'unmatched end tag:caption', token => $token);
5008 ## Ignore the token
5009 !!!next-token;
5010 next B;
5011 }
5012
5013 ## generate implied end tags
5014 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5015 !!!cp ('t187');
5016 pop @{$self->{open_elements}};
5017 }
5018
5019 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5020 !!!cp ('t188');
5021 !!!parse-error (type => 'not closed',
5022 value => $self->{open_elements}->[-1]->[0]
5023 ->manakai_local_name,
5024 token => $token);
5025 } else {
5026 !!!cp ('t189');
5027 }
5028
5029 splice @{$self->{open_elements}}, $i;
5030
5031 $clear_up_to_marker->();
5032
5033 $self->{insertion_mode} = IN_TABLE_IM;
5034
5035 ## reprocess
5036 next B;
5037 } elsif ({
5038 body => 1, col => 1, colgroup => 1, html => 1,
5039 }->{$token->{tag_name}}) {
5040 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
5041 !!!cp ('t190');
5042 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5043 ## Ignore the token
5044 !!!next-token;
5045 next B;
5046 } else {
5047 !!!cp ('t191');
5048 #
5049 }
5050 } elsif ({
5051 tbody => 1, tfoot => 1,
5052 thead => 1, tr => 1,
5053 }->{$token->{tag_name}} and
5054 $self->{insertion_mode} == IN_CAPTION_IM) {
5055 !!!cp ('t192');
5056 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5057 ## Ignore the token
5058 !!!next-token;
5059 next B;
5060 } else {
5061 !!!cp ('t193');
5062 #
5063 }
5064 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5065 for my $entry (@{$self->{open_elements}}) {
5066 unless ($entry->[1] & ALL_END_TAG_OPTIONAL_EL) {
5067 !!!cp ('t75');
5068 !!!parse-error (type => 'in body:#eof', token => $token);
5069 last;
5070 }
5071 }
5072
5073 ## Stop parsing.
5074 last B;
5075 } else {
5076 die "$0: $token->{type}: Unknown token type";
5077 }
5078
5079 $insert = $insert_to_current;
5080 #
5081 } elsif ($self->{insertion_mode} & TABLE_IMS) {
5082 if ($token->{type} == CHARACTER_TOKEN) {
5083 if (not $open_tables->[-1]->[1] and # tainted
5084 $token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5085 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5086
5087 unless (length $token->{data}) {
5088 !!!cp ('t194');
5089 !!!next-token;
5090 next B;
5091 } else {
5092 !!!cp ('t195');
5093 }
5094 }
5095
5096 !!!parse-error (type => 'in table:#character', token => $token);
5097
5098 ## As if in body, but insert into foster parent element
5099 ## ISSUE: Spec says that "whenever a node would be inserted
5100 ## into the current node" while characters might not
5101 ## result in a new Text node.
5102 $reconstruct_active_formatting_elements->($insert_to_foster);
5103
5104 if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
5105 # MUST
5106 my $foster_parent_element;
5107 my $next_sibling;
5108 my $prev_sibling;
5109 OE: for (reverse 0..$#{$self->{open_elements}}) {
5110 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
5111 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
5112 if (defined $parent and $parent->node_type == 1) {
5113 !!!cp ('t196');
5114 $foster_parent_element = $parent;
5115 $next_sibling = $self->{open_elements}->[$_]->[0];
5116 $prev_sibling = $next_sibling->previous_sibling;
5117 } else {
5118 !!!cp ('t197');
5119 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
5120 $prev_sibling = $foster_parent_element->last_child;
5121 }
5122 last OE;
5123 }
5124 } # OE
5125 $foster_parent_element = $self->{open_elements}->[0]->[0] and
5126 $prev_sibling = $foster_parent_element->last_child
5127 unless defined $foster_parent_element;
5128 if (defined $prev_sibling and
5129 $prev_sibling->node_type == 3) {
5130 !!!cp ('t198');
5131 $prev_sibling->manakai_append_text ($token->{data});
5132 } else {
5133 !!!cp ('t199');
5134 $foster_parent_element->insert_before
5135 ($self->{document}->create_text_node ($token->{data}),
5136 $next_sibling);
5137 }
5138 $open_tables->[-1]->[1] = 1; # tainted
5139 } else {
5140 !!!cp ('t200');
5141 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5142 }
5143
5144 !!!next-token;
5145 next B;
5146 } elsif ($token->{type} == START_TAG_TOKEN) {
5147 if ({
5148 tr => ($self->{insertion_mode} != IN_ROW_IM),
5149 th => 1, td => 1,
5150 }->{$token->{tag_name}}) {
5151 if ($self->{insertion_mode} == IN_TABLE_IM) {
5152 ## Clear back to table context
5153 while (not ($self->{open_elements}->[-1]->[1]
5154 & TABLE_SCOPING_EL)) {
5155 !!!cp ('t201');
5156 pop @{$self->{open_elements}};
5157 }
5158
5159 !!!insert-element ('tbody',, $token);
5160 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5161 ## reprocess in the "in table body" insertion mode...
5162 }
5163
5164 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5165 unless ($token->{tag_name} eq 'tr') {
5166 !!!cp ('t202');
5167 !!!parse-error (type => 'missing start tag:tr', token => $token);
5168 }
5169
5170 ## Clear back to table body context
5171 while (not ($self->{open_elements}->[-1]->[1]
5172 & TABLE_ROWS_SCOPING_EL)) {
5173 !!!cp ('t203');
5174 ## ISSUE: Can this case be reached?
5175 pop @{$self->{open_elements}};
5176 }
5177
5178 $self->{insertion_mode} = IN_ROW_IM;
5179 if ($token->{tag_name} eq 'tr') {
5180 !!!cp ('t204');
5181 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5182 !!!nack ('t204');
5183 !!!next-token;
5184 next B;
5185 } else {
5186 !!!cp ('t205');
5187 !!!insert-element ('tr',, $token);
5188 ## reprocess in the "in row" insertion mode
5189 }
5190 } else {
5191 !!!cp ('t206');
5192 }
5193
5194 ## Clear back to table row context
5195 while (not ($self->{open_elements}->[-1]->[1]
5196 & TABLE_ROW_SCOPING_EL)) {
5197 !!!cp ('t207');
5198 pop @{$self->{open_elements}};
5199 }
5200
5201 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5202 $self->{insertion_mode} = IN_CELL_IM;
5203
5204 push @$active_formatting_elements, ['#marker', ''];
5205
5206 !!!nack ('t207.1');
5207 !!!next-token;
5208 next B;
5209 } elsif ({
5210 caption => 1, col => 1, colgroup => 1,
5211 tbody => 1, tfoot => 1, thead => 1,
5212 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5213 }->{$token->{tag_name}}) {
5214 if ($self->{insertion_mode} == IN_ROW_IM) {
5215 ## As if </tr>
5216 ## have an element in table scope
5217 my $i;
5218 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5219 my $node = $self->{open_elements}->[$_];
5220 if ($node->[1] & TABLE_ROW_EL) {
5221 !!!cp ('t208');
5222 $i = $_;
5223 last INSCOPE;
5224 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5225 !!!cp ('t209');
5226 last INSCOPE;
5227 }
5228 } # INSCOPE
5229 unless (defined $i) {
5230 !!!cp ('t210');
5231 ## TODO: This type is wrong (the token is a start tag; no |tr| is in table scope for the implied </tr>).
5232 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5233 ## Ignore the token
5234 !!!nack ('t210.1');
5235 !!!next-token;
5236 next B;
5237 }
5238
5239 ## Clear back to table row context
5240 while (not ($self->{open_elements}->[-1]->[1]
5241 & TABLE_ROW_SCOPING_EL)) {
5242 !!!cp ('t211');
5243 ## ISSUE: Can this case be reached?
5244 pop @{$self->{open_elements}};
5245 }
5246
5247 pop @{$self->{open_elements}}; # tr
5248 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5249 if ($token->{tag_name} eq 'tr') {
5250 !!!cp ('t212');
5251 ## reprocess
5252 !!!ack-later;
5253 next B;
5254 } else {
5255 !!!cp ('t213');
5256 ## reprocess in the "in table body" insertion mode...
5257 }
5258 }
5259
5260 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5261 ## have an element in table scope
5262 my $i;
5263 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5264 my $node = $self->{open_elements}->[$_];
5265 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5266 !!!cp ('t214');
5267 $i = $_;
5268 last INSCOPE;
5269 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5270 !!!cp ('t215');
5271 last INSCOPE;
5272 }
5273 } # INSCOPE
5274 unless (defined $i) {
5275 !!!cp ('t216');
5276 ## TODO: This error type is wrong.
5277 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5278 ## Ignore the token
5279 !!!nack ('t216.1');
5280 !!!next-token;
5281 next B;
5282 }
5283
5284 ## Clear back to table body context
5285 while (not ($self->{open_elements}->[-1]->[1]
5286 & TABLE_ROWS_SCOPING_EL)) {
5287 !!!cp ('t217');
5288 ## ISSUE: Can this state be reached?
5289 pop @{$self->{open_elements}};
5290 }
5291
5292 ## As if <{current node}>
5293 ## have an element in table scope
5294 ## true by definition
5295
5296 ## Clear back to table body context
5297 ## nop by definition
5298
5299 pop @{$self->{open_elements}};
5300 $self->{insertion_mode} = IN_TABLE_IM;
5301 ## reprocess in "in table" insertion mode...
5302 } else {
5303 !!!cp ('t218');
5304 }
5305
5306 if ($token->{tag_name} eq 'col') {
5307 ## Clear back to table context
5308 while (not ($self->{open_elements}->[-1]->[1]
5309 & TABLE_SCOPING_EL)) {
5310 !!!cp ('t219');
5311 ## ISSUE: Can this state be reached?
5312 pop @{$self->{open_elements}};
5313 }
5314
5315 !!!insert-element ('colgroup',, $token);
5316 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
5317 ## reprocess
5318 !!!ack-later;
5319 next B;
5320 } elsif ({
5321 caption => 1,
5322 colgroup => 1,
5323 tbody => 1, tfoot => 1, thead => 1,
5324 }->{$token->{tag_name}}) {
5325 ## Clear back to table context
5326 while (not ($self->{open_elements}->[-1]->[1]
5327 & TABLE_SCOPING_EL)) {
5328 !!!cp ('t220');
5329 ## ISSUE: Can this state be reached?
5330 pop @{$self->{open_elements}};
5331 }
5332
5333 push @$active_formatting_elements, ['#marker', '']
5334 if $token->{tag_name} eq 'caption';
5335
5336 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5337 $self->{insertion_mode} = {
5338 caption => IN_CAPTION_IM,
5339 colgroup => IN_COLUMN_GROUP_IM,
5340 tbody => IN_TABLE_BODY_IM,
5341 tfoot => IN_TABLE_BODY_IM,
5342 thead => IN_TABLE_BODY_IM,
5343 }->{$token->{tag_name}};
5344 !!!next-token;
5345 !!!nack ('t220.1');
5346 next B;
5347 } else {
5348 die "$0: in table: <>: $token->{tag_name}";
5349 }
5350 } elsif ($token->{tag_name} eq 'table') {
5351 !!!parse-error (type => 'not closed',
5352 value => $self->{open_elements}->[-1]->[0]
5353 ->manakai_local_name,
5354 token => $token);
5355
5356 ## As if </table>
5357 ## have a table element in table scope
5358 my $i;
5359 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5360 my $node = $self->{open_elements}->[$_];
5361 if ($node->[1] & TABLE_EL) {
5362 !!!cp ('t221');
5363 $i = $_;
5364 last INSCOPE;
5365 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5366 !!!cp ('t222');
5367 last INSCOPE;
5368 }
5369 } # INSCOPE
5370 unless (defined $i) {
5371 !!!cp ('t223');
5372 ## TODO: The following is wrong, maybe.
5373 !!!parse-error (type => 'unmatched end tag:table', token => $token);
5374 ## Ignore tokens </table><table>
5375 !!!nack ('t223.1');
5376 !!!next-token;
5377 next B;
5378 }
5379
5380 ## TODO: Followings are removed from the latest spec.
5381 ## generate implied end tags
5382 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5383 !!!cp ('t224');
5384 pop @{$self->{open_elements}};
5385 }
5386
5387 unless ($self->{open_elements}->[-1]->[1] & TABLE_EL) {
5388 !!!cp ('t225');
5389 ## NOTE: |<table><tr><table>|
5390 !!!parse-error (type => 'not closed',
5391 value => $self->{open_elements}->[-1]->[0]
5392 ->manakai_local_name,
5393 token => $token);
5394 } else {
5395 !!!cp ('t226');
5396 }
5397
5398 splice @{$self->{open_elements}}, $i;
5399 pop @{$open_tables};
5400
5401 $self->_reset_insertion_mode;
5402
5403 ## reprocess
5404 !!!ack-later;
5405 next B;
5406 } elsif ($token->{tag_name} eq 'style') {
5407 if (not $open_tables->[-1]->[1]) { # tainted
5408 !!!cp ('t227.8');
5409 ## NOTE: This is a "as if in head" code clone.
5410 $parse_rcdata->(CDATA_CONTENT_MODEL);
5411 next B;
5412 } else {
5413 !!!cp ('t227.7');
5414 #
5415 }
5416 } elsif ($token->{tag_name} eq 'script') {
5417 if (not $open_tables->[-1]->[1]) { # tainted
5418 !!!cp ('t227.6');
5419 ## NOTE: This is a "as if in head" code clone.
5420 $script_start_tag->();
5421 next B;
5422 } else {
5423 !!!cp ('t227.5');
5424 #
5425 }
5426 } elsif ($token->{tag_name} eq 'input') {
5427 if (not $open_tables->[-1]->[1]) { # tainted
5428 if ($token->{attributes}->{type}) { ## TODO: case
5429 my $type = lc $token->{attributes}->{type}->{value};
5430 if ($type eq 'hidden') {
5431 !!!cp ('t227.3');
5432 !!!parse-error (type => 'in table:'.$token->{tag_name}, token => $token);
5433
5434 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5435
5436 ## TODO: form element pointer
5437
5438 pop @{$self->{open_elements}};
5439
5440 !!!next-token;
5441 !!!ack ('t227.2.1');
5442 next B;
5443 } else {
5444 !!!cp ('t227.2');
5445 #
5446 }
5447 } else {
5448 !!!cp ('t227.1');
5449 #
5450 }
5451 } else {
5452 !!!cp ('t227.4');
5453 #
5454 }
5455 } else {
5456 !!!cp ('t227');
5457 #
5458 }
5459
5460 !!!parse-error (type => 'in table:'.$token->{tag_name}, token => $token);
5461
5462 $insert = $insert_to_foster;
5463 #
5464 } elsif ($token->{type} == END_TAG_TOKEN) {
5465 if ($token->{tag_name} eq 'tr' and
5466 $self->{insertion_mode} == IN_ROW_IM) {
5467 ## have an element in table scope
5468 my $i;
5469 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5470 my $node = $self->{open_elements}->[$_];
5471 if ($node->[1] & TABLE_ROW_EL) {
5472 !!!cp ('t228');
5473 $i = $_;
5474 last INSCOPE;
5475 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5476 !!!cp ('t229');
5477 last INSCOPE;
5478 }
5479 } # INSCOPE
5480 unless (defined $i) {
5481 !!!cp ('t230');
5482 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5483 ## Ignore the token
5484 !!!nack ('t230.1');
5485 !!!next-token;
5486 next B;
5487 } else {
5488 !!!cp ('t232');
5489 }
5490
5491 ## Clear back to table row context
5492 while (not ($self->{open_elements}->[-1]->[1]
5493 & TABLE_ROW_SCOPING_EL)) {
5494 !!!cp ('t231');
5495 ## ISSUE: Can this state be reached?
5496 pop @{$self->{open_elements}};
5497 }
5498
5499 pop @{$self->{open_elements}}; # tr
5500 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5501 !!!next-token;
5502 !!!nack ('t231.1');
5503 next B;
5504 } elsif ($token->{tag_name} eq 'table') {
5505 if ($self->{insertion_mode} == IN_ROW_IM) {
5506 ## As if </tr>
5507 ## have an element in table scope
5508 my $i;
5509 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5510 my $node = $self->{open_elements}->[$_];
5511 if ($node->[1] & TABLE_ROW_EL) {
5512 !!!cp ('t233');
5513 $i = $_;
5514 last INSCOPE;
5515 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5516 !!!cp ('t234');
5517 last INSCOPE;
5518 }
5519 } # INSCOPE
5520 unless (defined $i) {
5521 !!!cp ('t235');
5522 ## TODO: The following error type is wrong (no |tr| is in table scope for the implied </tr>).
5523 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5524 ## Ignore the token
5525 !!!nack ('t236.1');
5526 !!!next-token;
5527 next B;
5528 }
5529
5530 ## Clear back to table row context
5531 while (not ($self->{open_elements}->[-1]->[1]
5532 & TABLE_ROW_SCOPING_EL)) {
5533 !!!cp ('t236');
5534 ## ISSUE: Can this state be reached?
5535 pop @{$self->{open_elements}};
5536 }
5537
5538 pop @{$self->{open_elements}}; # tr
5539 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5540 ## reprocess in the "in table body" insertion mode...
5541 }
5542
5543 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5544 ## have an element in table scope
5545 my $i;
5546 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5547 my $node = $self->{open_elements}->[$_];
5548 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5549 !!!cp ('t237');
5550 $i = $_;
5551 last INSCOPE;
5552 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5553 !!!cp ('t238');
5554 last INSCOPE;
5555 }
5556 } # INSCOPE
5557 unless (defined $i) {
5558 !!!cp ('t239');
5559 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5560 ## Ignore the token
5561 !!!nack ('t239.1');
5562 !!!next-token;
5563 next B;
5564 }
5565
5566 ## Clear back to table body context
5567 while (not ($self->{open_elements}->[-1]->[1]
5568 & TABLE_ROWS_SCOPING_EL)) {
5569 !!!cp ('t240');
5570 pop @{$self->{open_elements}};
5571 }
5572
5573 ## As if <{current node}>
5574 ## have an element in table scope
5575 ## true by definition
5576
5577 ## Clear back to table body context
5578 ## nop by definition
5579
5580 pop @{$self->{open_elements}};
5581 $self->{insertion_mode} = IN_TABLE_IM;
5582 ## reprocess in the "in table" insertion mode...
5583 }
5584
5585 ## NOTE: </table> in the "in table" insertion mode.
5586 ## When you edit the code fragment below, please ensure that
5587 ## the code for <table> in the "in table" insertion mode
5588 ## is synced with it.
5589
5590 ## have a table element in table scope
5591 my $i;
5592 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5593 my $node = $self->{open_elements}->[$_];
5594 if ($node->[1] & TABLE_EL) {
5595 !!!cp ('t241');
5596 $i = $_;
5597 last INSCOPE;
5598 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5599 !!!cp ('t242');
5600 last INSCOPE;
5601 }
5602 } # INSCOPE
5603 unless (defined $i) {
5604 !!!cp ('t243');
5605 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5606 ## Ignore the token
5607 !!!nack ('t243.1');
5608 !!!next-token;
5609 next B;
5610 }
5611
5612 splice @{$self->{open_elements}}, $i;
5613 pop @{$open_tables};
5614
5615 $self->_reset_insertion_mode;
5616
5617 !!!next-token;
5618 next B;
5619 } elsif ({
5620 tbody => 1, tfoot => 1, thead => 1,
5621 }->{$token->{tag_name}} and
5622 $self->{insertion_mode} & ROW_IMS) {
5623 if ($self->{insertion_mode} == IN_ROW_IM) {
5624 ## have an element in table scope
5625 my $i;
5626 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5627 my $node = $self->{open_elements}->[$_];
5628 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5629 !!!cp ('t247');
5630 $i = $_;
5631 last INSCOPE;
5632 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5633 !!!cp ('t248');
5634 last INSCOPE;
5635 }
5636 } # INSCOPE
5637 unless (defined $i) {
5638 !!!cp ('t249');
5639 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5640 ## Ignore the token
5641 !!!nack ('t249.1');
5642 !!!next-token;
5643 next B;
5644 }
5645
5646 ## As if </tr>
5647 ## have an element in table scope
5648 undef $i; ## reuse the $i declared earlier in this block instead of redeclaring it
5649 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5650 my $node = $self->{open_elements}->[$_];
5651 if ($node->[1] & TABLE_ROW_EL) {
5652 !!!cp ('t250');
5653 $i = $_;
5654 last INSCOPE;
5655 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5656 !!!cp ('t251');
5657 last INSCOPE;
5658 }
5659 } # INSCOPE
5660 unless (defined $i) {
5661 !!!cp ('t252');
5662 !!!parse-error (type => 'unmatched end tag:tr', token => $token);
5663 ## Ignore the token
5664 !!!nack ('t252.1');
5665 !!!next-token;
5666 next B;
5667 }
5668
5669 ## Clear back to table row context
5670 while (not ($self->{open_elements}->[-1]->[1]
5671 & TABLE_ROW_SCOPING_EL)) {
5672 !!!cp ('t253');
5673 ## ISSUE: Can this case be reached?
5674 pop @{$self->{open_elements}};
5675 }
5676
5677 pop @{$self->{open_elements}}; # tr
5678 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5679 ## reprocess in the "in table body" insertion mode...
5680 }
5681
5682 ## have an element in table scope
5683 my $i;
5684 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5685 my $node = $self->{open_elements}->[$_];
5686 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5687 !!!cp ('t254');
5688 $i = $_;
5689 last INSCOPE;
5690 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5691 !!!cp ('t255');
5692 last INSCOPE;
5693 }
5694 } # INSCOPE
5695 unless (defined $i) {
5696 !!!cp ('t256');
5697 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5698 ## Ignore the token
5699 !!!nack ('t256.1');
5700 !!!next-token;
5701 next B;
5702 }
5703
5704 ## Clear back to table body context
5705 while (not ($self->{open_elements}->[-1]->[1]
5706 & TABLE_ROWS_SCOPING_EL)) {
5707 !!!cp ('t257');
5708 ## ISSUE: Can this case be reached?
5709 pop @{$self->{open_elements}};
5710 }
5711
5712 pop @{$self->{open_elements}};
5713 $self->{insertion_mode} = IN_TABLE_IM;
5714 !!!nack ('t257.1');
5715 !!!next-token;
5716 next B;
5717 } elsif ({
5718 body => 1, caption => 1, col => 1, colgroup => 1,
5719 html => 1, td => 1, th => 1,
5720 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5721 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
5722 }->{$token->{tag_name}}) {
5723 !!!cp ('t258');
5724 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5725 ## Ignore the token
5726 !!!nack ('t258.1');
5727 !!!next-token;
5728 next B;
5729 } else {
5730 !!!cp ('t259');
5731 !!!parse-error (type => 'in table:/'.$token->{tag_name}, token => $token);
5732
5733 $insert = $insert_to_foster;
5734 #
5735 }
5736 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5737 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
5738 @{$self->{open_elements}} == 1) { # redundant, maybe
5739 !!!parse-error (type => 'in body:#eof', token => $token);
5740 !!!cp ('t259.1');
5741 #
5742 } else {
5743 !!!cp ('t259.2');
5744 #
5745 }
5746
5747 ## Stop parsing
5748 last B;
5749 } else {
5750 die "$0: $token->{type}: Unknown token type";
5751 }
5752 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
5753 if ($token->{type} == CHARACTER_TOKEN) {
5754 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5755 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5756 unless (length $token->{data}) {
5757 !!!cp ('t260');
5758 !!!next-token;
5759 next B;
5760 }
5761 }
5762
5763 !!!cp ('t261');
5764 #
5765 } elsif ($token->{type} == START_TAG_TOKEN) {
5766 if ($token->{tag_name} eq 'col') {
5767 !!!cp ('t262');
5768 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5769 pop @{$self->{open_elements}};
5770 !!!ack ('t262.1');
5771 !!!next-token;
5772 next B;
5773 } else {
5774 !!!cp ('t263');
5775 #
5776 }
5777 } elsif ($token->{type} == END_TAG_TOKEN) {
5778 if ($token->{tag_name} eq 'colgroup') {
5779 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
5780 !!!cp ('t264');
5781 !!!parse-error (type => 'unmatched end tag:colgroup', token => $token);
5782 ## Ignore the token
5783 !!!next-token;
5784 next B;
5785 } else {
5786 !!!cp ('t265');
5787 pop @{$self->{open_elements}}; # colgroup
5788 $self->{insertion_mode} = IN_TABLE_IM;
5789 !!!next-token;
5790 next B;
5791 }
5792 } elsif ($token->{tag_name} eq 'col') {
5793 !!!cp ('t266');
5794 !!!parse-error (type => 'unmatched end tag:col', token => $token);
5795 ## Ignore the token
5796 !!!next-token;
5797 next B;
5798 } else {
5799 !!!cp ('t267');
5800 #
5801 }
5802 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5803 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
5804 @{$self->{open_elements}} == 1) { # redundant, maybe
5805 !!!cp ('t270.2');
5806 ## Stop parsing.
5807 last B;
5808 } else {
5809 ## NOTE: As if </colgroup>.
5810 !!!cp ('t270.1');
5811 pop @{$self->{open_elements}}; # colgroup
5812 $self->{insertion_mode} = IN_TABLE_IM;
5813 ## Reprocess.
5814 next B;
5815 }
5816 } else {
5817 die "$0: $token->{type}: Unknown token type";
5818 }
5819
5820 ## As if </colgroup>
5821 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
5822 !!!cp ('t269');
5823 ## TODO: Wrong error type?
5824 !!!parse-error (type => 'unmatched end tag:colgroup', token => $token);
5825 ## Ignore the token
5826 !!!nack ('t269.1');
5827 !!!next-token;
5828 next B;
5829 } else {
5830 !!!cp ('t270');
5831 pop @{$self->{open_elements}}; # colgroup
5832 $self->{insertion_mode} = IN_TABLE_IM;
5833 !!!ack-later;
5834 ## reprocess
5835 next B;
5836 }
5837 } elsif ($self->{insertion_mode} & SELECT_IMS) {
5838 if ($token->{type} == CHARACTER_TOKEN) {
5839 !!!cp ('t271');
5840 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5841 !!!next-token;
5842 next B;
5843 } elsif ($token->{type} == START_TAG_TOKEN) {
5844 if ($token->{tag_name} eq 'option') {
5845 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5846 !!!cp ('t272');
5847 ## As if </option>
5848 pop @{$self->{open_elements}};
5849 } else {
5850 !!!cp ('t273');
5851 }
5852
5853 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5854 !!!nack ('t273.1');
5855 !!!next-token;
5856 next B;
5857 } elsif ($token->{tag_name} eq 'optgroup') {
5858 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5859 !!!cp ('t274');
5860 ## As if </option>
5861 pop @{$self->{open_elements}};
5862 } else {
5863 !!!cp ('t275');
5864 }
5865
5866 if ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
5867 !!!cp ('t276');
5868 ## As if </optgroup>
5869 pop @{$self->{open_elements}};
5870 } else {
5871 !!!cp ('t277');
5872 }
5873
5874 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5875 !!!nack ('t277.1');
5876 !!!next-token;
5877 next B;
5878 } elsif ({
5879 select => 1, input => 1, textarea => 1,
5880 }->{$token->{tag_name}} or
5881 ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5882 {
5883 caption => 1, table => 1,
5884 tbody => 1, tfoot => 1, thead => 1,
5885 tr => 1, td => 1, th => 1,
5886 }->{$token->{tag_name}})) {
5887 ## TODO: The type below is not good - <select> is replaced by </select>
5888 !!!parse-error (type => 'not closed:select', token => $token);
5889 ## NOTE: As if the token were </select> (<select> case) or
5890 ## as if there were </select> (otherwise).
5891 ## have an element in table scope
5892 my $i;
5893 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5894 my $node = $self->{open_elements}->[$_];
5895 if ($node->[1] & SELECT_EL) {
5896 !!!cp ('t278');
5897 $i = $_;
5898 last INSCOPE;
5899 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5900 !!!cp ('t279');
5901 last INSCOPE;
5902 }
5903 } # INSCOPE
5904 unless (defined $i) {
5905 !!!cp ('t280');
5906 !!!parse-error (type => 'unmatched end tag:select', token => $token);
5907 ## Ignore the token
5908 !!!nack ('t280.1');
5909 !!!next-token;
5910 next B;
5911 }
5912
5913 !!!cp ('t281');
5914 splice @{$self->{open_elements}}, $i;
5915
5916 $self->_reset_insertion_mode;
5917
5918 if ($token->{tag_name} eq 'select') {
5919 !!!nack ('t281.2');
5920 !!!next-token;
5921 next B;
5922 } else {
5923 !!!cp ('t281.1');
5924 !!!ack-later;
5925 ## Reprocess the token.
5926 next B;
5927 }
5928 } else {
5929 !!!cp ('t282');
5930 !!!parse-error (type => 'in select:'.$token->{tag_name}, token => $token);
5931 ## Ignore the token
5932 !!!nack ('t282.1');
5933 !!!next-token;
5934 next B;
5935 }
5936 } elsif ($token->{type} == END_TAG_TOKEN) {
5937 if ($token->{tag_name} eq 'optgroup') {
5938 if ($self->{open_elements}->[-1]->[1] & OPTION_EL and
5939 $self->{open_elements}->[-2]->[1] & OPTGROUP_EL) {
5940 !!!cp ('t283');
5941 ## As if </option>
5942 splice @{$self->{open_elements}}, -2;
5943 } elsif ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
5944 !!!cp ('t284');
5945 pop @{$self->{open_elements}};
5946 } else {
5947 !!!cp ('t285');
5948 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5949 ## Ignore the token
5950 }
5951 !!!nack ('t285.1');
5952 !!!next-token;
5953 next B;
5954 } elsif ($token->{tag_name} eq 'option') {
5955 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5956 !!!cp ('t286');
5957 pop @{$self->{open_elements}};
5958 } else {
5959 !!!cp ('t287');
5960 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5961 ## Ignore the token
5962 }
5963 !!!nack ('t287.1');
5964 !!!next-token;
5965 next B;
5966 } elsif ($token->{tag_name} eq 'select') {
5967 ## have an element in table scope
5968 my $i;
5969 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5970 my $node = $self->{open_elements}->[$_];
5971 if ($node->[1] & SELECT_EL) {
5972 !!!cp ('t288');
5973 $i = $_;
5974 last INSCOPE;
5975 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5976 !!!cp ('t289');
5977 last INSCOPE;
5978 }
5979 } # INSCOPE
5980 unless (defined $i) {
5981 !!!cp ('t290');
5982 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5983 ## Ignore the token
5984 !!!nack ('t290.1');
5985 !!!next-token;
5986 next B;
5987 }
5988
5989 !!!cp ('t291');
5990 splice @{$self->{open_elements}}, $i;
5991
5992 $self->_reset_insertion_mode;
5993
5994 !!!nack ('t291.1');
5995 !!!next-token;
5996 next B;
5997 } elsif ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5998 {
5999 caption => 1, table => 1, tbody => 1,
6000 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
6001 }->{$token->{tag_name}}) {
6002 ## TODO: The following is wrong?
6003 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6004
6005 ## have an element in table scope
6006 my $i;
6007 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6008 my $node = $self->{open_elements}->[$_];
6009 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6010 !!!cp ('t292');
6011 $i = $_;
6012 last INSCOPE;
6013 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6014 !!!cp ('t293');
6015 last INSCOPE;
6016 }
6017 } # INSCOPE
6018 unless (defined $i) {
6019 !!!cp ('t294');
6020 ## Ignore the token
6021 !!!nack ('t294.1');
6022 !!!next-token;
6023 next B;
6024 }
6025
6026 ## As if </select>
6027 ## have an element in table scope
6028 undef $i;
6029 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6030 my $node = $self->{open_elements}->[$_];
6031 if ($node->[1] & SELECT_EL) {
6032 !!!cp ('t295');
6033 $i = $_;
6034 last INSCOPE;
6035 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6036 ## ISSUE: Can this state be reached?
6037 !!!cp ('t296');
6038 last INSCOPE;
6039 }
6040 } # INSCOPE
6041 unless (defined $i) {
6042 !!!cp ('t297');
6043 ## TODO: The following error type is correct?
6044 !!!parse-error (type => 'unmatched end tag:select', token => $token);
6045 ## Ignore the </select> token
6046 !!!nack ('t297.1');
6047 !!!next-token; ## TODO: ok?
6048 next B;
6049 }
6050
6051 !!!cp ('t298');
6052 splice @{$self->{open_elements}}, $i;
6053
6054 $self->_reset_insertion_mode;
6055
6056 !!!ack-later;
6057 ## reprocess
6058 next B;
6059 } else {
6060 !!!cp ('t299');
6061 !!!parse-error (type => 'in select:/'.$token->{tag_name}, token => $token);
6062 ## Ignore the token
6063 !!!nack ('t299.3');
6064 !!!next-token;
6065 next B;
6066 }
6067 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6068 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6069 @{$self->{open_elements}} == 1) { # redundant, maybe
6070 !!!cp ('t299.1');
6071 !!!parse-error (type => 'in body:#eof', token => $token);
6072 } else {
6073 !!!cp ('t299.2');
6074 }
6075
6076 ## Stop parsing.
6077 last B;
6078 } else {
6079 die "$0: $token->{type}: Unknown token type";
6080 }
6081 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
6082 if ($token->{type} == CHARACTER_TOKEN) {
6083 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6084 my $data = $1; ## keep the stripped leading white space before calling other code
6085 ## As if in body
6086 $reconstruct_active_formatting_elements->($insert_to_current);
6087
6088 $self->{open_elements}->[-1]->[0]->manakai_append_text ($data);
6089
6090 unless (length $token->{data}) {
6091 !!!cp ('t300');
6092 !!!next-token;
6093 next B;
6094 }
6095 }
6096
6097 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6098 !!!cp ('t301');
6099 !!!parse-error (type => 'after html:#character', token => $token);
6100
6101 ## Reprocess in the "after body" insertion mode.
6102 } else {
6103 !!!cp ('t302');
6104 }
6105
6106 ## "after body" insertion mode
6107 !!!parse-error (type => 'after body:#character', token => $token);
6108
6109 $self->{insertion_mode} = IN_BODY_IM;
6110 ## reprocess
6111 next B;
6112 } elsif ($token->{type} == START_TAG_TOKEN) {
6113 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6114 !!!cp ('t303');
6115 !!!parse-error (type => 'after html:'.$token->{tag_name}, token => $token);
6116
6117 ## Reprocess in the "after body" insertion mode.
6118 } else {
6119 !!!cp ('t304');
6120 }
6121
6122 ## "after body" insertion mode
6123 !!!parse-error (type => 'after body:'.$token->{tag_name}, token => $token);
6124
6125 $self->{insertion_mode} = IN_BODY_IM;
6126 !!!ack-later;
6127 ## reprocess
6128 next B;
6129 } elsif ($token->{type} == END_TAG_TOKEN) {
6130 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6131 !!!cp ('t305');
6132 !!!parse-error (type => 'after html:/'.$token->{tag_name}, token => $token);
6133
6134 $self->{insertion_mode} = AFTER_BODY_IM;
6135 ## Reprocess in the "after body" insertion mode.
6136 } else {
6137 !!!cp ('t306');
6138 }
6139
6140 ## "after body" insertion mode
6141 if ($token->{tag_name} eq 'html') {
6142 if (defined $self->{inner_html_node}) {
6143 !!!cp ('t307');
6144 !!!parse-error (type => 'unmatched end tag:html', token => $token);
6145 ## Ignore the token
6146 !!!next-token;
6147 next B;
6148 } else {
6149 !!!cp ('t308');
6150 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
6151 !!!next-token;
6152 next B;
6153 }
6154 } else {
6155 !!!cp ('t309');
6156 !!!parse-error (type => 'after body:/'.$token->{tag_name}, token => $token);
6157
6158 $self->{insertion_mode} = IN_BODY_IM;
6159 ## reprocess
6160 next B;
6161 }
6162 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6163 !!!cp ('t309.2');
6164 ## Stop parsing
6165 last B;
6166 } else {
6167 die "$0: $token->{type}: Unknown token type";
6168 }
6169 } elsif ($self->{insertion_mode} & FRAME_IMS) {
6170 if ($token->{type} == CHARACTER_TOKEN) {
6171 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6172 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6173
6174 unless (length $token->{data}) {
6175 !!!cp ('t310');
6176 !!!next-token;
6177 next B;
6178 }
6179 }
6180
6181 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
6182 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6183 !!!cp ('t311');
6184 !!!parse-error (type => 'in frameset:#character', token => $token);
6185 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6186 !!!cp ('t312');
6187 !!!parse-error (type => 'after frameset:#character', token => $token);
6188 } else { # "after html frameset"
6189 !!!cp ('t313');
6190 !!!parse-error (type => 'after html:#character', token => $token);
6191
6192 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6193 ## Reprocess in the "after frameset" insertion mode.
6194 !!!parse-error (type => 'after frameset:#character', token => $token);
6195 }
6196
6197 ## Ignore the token.
6198 if (length $token->{data}) {
6199 !!!cp ('t314');
6200 ## reprocess the rest of characters
6201 } else {
6202 !!!cp ('t315');
6203 !!!next-token;
6204 }
6205 next B;
6206 }
6207
6208 die qq[$0: Character "$token->{data}"];
6209 } elsif ($token->{type} == START_TAG_TOKEN) {
6210 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
6211 !!!cp ('t316');
6212 !!!parse-error (type => 'after html:'.$token->{tag_name}, token => $token);
6213
6214 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6215 ## Process in the "after frameset" insertion mode.
6216 } else {
6217 !!!cp ('t317');
6218 }
6219
6220 if ($token->{tag_name} eq 'frameset' and
6221 $self->{insertion_mode} == IN_FRAMESET_IM) {
6222 !!!cp ('t318');
6223 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6224 !!!nack ('t318.1');
6225 !!!next-token;
6226 next B;
6227 } elsif ($token->{tag_name} eq 'frame' and
6228 $self->{insertion_mode} == IN_FRAMESET_IM) {
6229 !!!cp ('t319');
6230 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6231 pop @{$self->{open_elements}};
6232 !!!ack ('t319.1');
6233 !!!next-token;
6234 next B;
6235 } elsif ($token->{tag_name} eq 'noframes') {
6236 !!!cp ('t320');
6237 ## NOTE: As if in head.
6238 $parse_rcdata->(CDATA_CONTENT_MODEL);
6239 next B;
6240 } else {
6241 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6242 !!!cp ('t321');
6243 !!!parse-error (type => 'in frameset:'.$token->{tag_name}, token => $token);
6244 } else {
6245 !!!cp ('t322');
6246 !!!parse-error (type => 'after frameset:'.$token->{tag_name}, token => $token);
6247 }
6248 ## Ignore the token
6249 !!!nack ('t322.1');
6250 !!!next-token;
6251 next B;
6252 }
6253 } elsif ($token->{type} == END_TAG_TOKEN) {
6254 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
6255 !!!cp ('t323');
6256 !!!parse-error (type => 'after html:/'.$token->{tag_name}, token => $token);
6257
6258 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6259 ## Process in the "after frameset" insertion mode.
6260 } else {
6261 !!!cp ('t324');
6262 }
6263
6264 if ($token->{tag_name} eq 'frameset' and
6265 $self->{insertion_mode} == IN_FRAMESET_IM) {
6266 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
6267 @{$self->{open_elements}} == 1) {
6268 !!!cp ('t325');
6269 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6270 ## Ignore the token
6271 !!!next-token;
6272 } else {
6273 !!!cp ('t326');
6274 pop @{$self->{open_elements}};
6275 !!!next-token;
6276 }
6277
6278 if (not defined $self->{inner_html_node} and
6279 not ($self->{open_elements}->[-1]->[1] & FRAMESET_EL)) {
6280 !!!cp ('t327');
6281 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6282 } else {
6283 !!!cp ('t328');
6284 }
6285 next B;
6286 } elsif ($token->{tag_name} eq 'html' and
6287 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
6288 !!!cp ('t329');
6289 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
6290 !!!next-token;
6291 next B;
6292 } else {
6293 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6294 !!!cp ('t330');
6295 !!!parse-error (type => 'in frameset:/'.$token->{tag_name}, token => $token);
6296 } else {
6297 !!!cp ('t331');
6298 !!!parse-error (type => 'after frameset:/'.$token->{tag_name}, token => $token);
6299 }
6300 ## Ignore the token
6301 !!!next-token;
6302 next B;
6303 }
6304 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6305 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6306 @{$self->{open_elements}} == 1) { # redundant, maybe
6307 !!!cp ('t331.1');
6308 !!!parse-error (type => 'in body:#eof', token => $token);
6309 } else {
6310 !!!cp ('t331.2');
6311 }
6312
6313 ## Stop parsing
6314 last B;
6315 } else {
6316 die "$0: $token->{type}: Unknown token type";
6317 }
6318
6319 ## ISSUE: An issue in spec here
6320 } else {
6321 die "$0: $self->{insertion_mode}: Unknown insertion mode";
6322 }
6323
6324 ## "in body" insertion mode
6325 if ($token->{type} == START_TAG_TOKEN) {
6326 if ($token->{tag_name} eq 'script') {
6327 !!!cp ('t332');
6328 ## NOTE: This is an "as if in head" code clone
6329 $script_start_tag->();
6330 next B;
6331 } elsif ($token->{tag_name} eq 'style') {
6332 !!!cp ('t333');
6333 ## NOTE: This is an "as if in head" code clone
6334 $parse_rcdata->(CDATA_CONTENT_MODEL);
6335 next B;
6336 } elsif ({
6337 base => 1, link => 1,
6338 }->{$token->{tag_name}}) {
6339 !!!cp ('t334');
6340 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6341 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6342 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6343 !!!ack ('t334.1');
6344 !!!next-token;
6345 next B;
6346 } elsif ($token->{tag_name} eq 'meta') {
6347 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6348 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6349 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6350
6351 unless ($self->{confident}) {
6352 if ($token->{attributes}->{charset}) {
6353 !!!cp ('t335');
6354 ## NOTE: Whether the encoding is supported or not is handled
6355 ## in the {change_encoding} callback.
6356 $self->{change_encoding}
6357 ->($self, $token->{attributes}->{charset}->{value}, $token);
6358
6359 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6360 ->set_user_data (manakai_has_reference =>
6361 $token->{attributes}->{charset}
6362 ->{has_reference});
6363 } elsif ($token->{attributes}->{content}) {
6364 if ($token->{attributes}->{content}->{value}
6365 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
6366 [\x09-\x0D\x20]*=
6367 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
6368 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
6369 !!!cp ('t336');
6370 ## NOTE: Whether the encoding is supported or not is handled
6371 ## in the {change_encoding} callback.
6372 $self->{change_encoding}
6373 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
6374 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6375 ->set_user_data (manakai_has_reference =>
6376 $token->{attributes}->{content}
6377 ->{has_reference});
6378 }
6379 }
6380 } else {
6381 if ($token->{attributes}->{charset}) {
6382 !!!cp ('t337');
6383 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6384 ->set_user_data (manakai_has_reference =>
6385 $token->{attributes}->{charset}
6386 ->{has_reference});
6387 }
6388 if ($token->{attributes}->{content}) {
6389 !!!cp ('t338');
6390 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6391 ->set_user_data (manakai_has_reference =>
6392 $token->{attributes}->{content}
6393 ->{has_reference});
6394 }
6395 }
6396
6397 !!!ack ('t338.1');
6398 !!!next-token;
6399 next B;
6400 } elsif ($token->{tag_name} eq 'title') {
6401 !!!cp ('t341');
6402 ## NOTE: This is an "as if in head" code clone
6403 $parse_rcdata->(RCDATA_CONTENT_MODEL);
6404 next B;
6405 } elsif ($token->{tag_name} eq 'body') {
6406 !!!parse-error (type => 'in body:body', token => $token);
6407
6408 if (@{$self->{open_elements}} == 1 or
6409 not ($self->{open_elements}->[1]->[1] & BODY_EL)) {
6410 !!!cp ('t342');
6411 ## Ignore the token
6412 } else {
6413 my $body_el = $self->{open_elements}->[1]->[0];
6414 for my $attr_name (keys %{$token->{attributes}}) {
6415 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
6416 !!!cp ('t343');
6417 $body_el->set_attribute_ns
6418 (undef, [undef, $attr_name],
6419 $token->{attributes}->{$attr_name}->{value});
6420 }
6421 }
6422 }
6423 !!!nack ('t343.1');
6424 !!!next-token;
6425 next B;
6426 } elsif ({
6427 address => 1, blockquote => 1, center => 1, dir => 1,
6428 div => 1, dl => 1, fieldset => 1,
6429 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6430 menu => 1, ol => 1, p => 1, ul => 1,
6431 pre => 1, listing => 1,
6432 form => 1,
6433 table => 1,
6434 hr => 1,
6435 }->{$token->{tag_name}}) {
6436 if ($token->{tag_name} eq 'form' and defined $self->{form_element}) {
6437 !!!cp ('t350');
6438 !!!parse-error (type => 'in form:form', token => $token);
6439 ## Ignore the token
6440 !!!nack ('t350.1');
6441 !!!next-token;
6442 next B;
6443 }
6444
6445 ## has a p element in scope
6446 INSCOPE: for (reverse @{$self->{open_elements}}) {
6447 if ($_->[1] & P_EL) {
6448 !!!cp ('t344');
6449 !!!back-token; # <form>
6450 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6451 line => $token->{line}, column => $token->{column}};
6452 next B;
6453 } elsif ($_->[1] & SCOPING_EL) {
6454 !!!cp ('t345');
6455 last INSCOPE;
6456 }
6457 } # INSCOPE
6458
6459 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6460 if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') {
6461 !!!nack ('t346.1');
6462 !!!next-token;
6463 if ($token->{type} == CHARACTER_TOKEN) {
6464 $token->{data} =~ s/^\x0A//;
6465 unless (length $token->{data}) {
6466 !!!cp ('t346');
6467 !!!next-token;
6468 } else {
6469 !!!cp ('t349');
6470 }
6471 } else {
6472 !!!cp ('t348');
6473 }
6474 } elsif ($token->{tag_name} eq 'form') {
6475 !!!cp ('t347.1');
6476 $self->{form_element} = $self->{open_elements}->[-1]->[0];
6477
6478 !!!nack ('t347.2');
6479 !!!next-token;
6480 } elsif ($token->{tag_name} eq 'table') {
6481 !!!cp ('t382');
6482 push @{$open_tables}, [$self->{open_elements}->[-1]->[0]];
6483
6484 $self->{insertion_mode} = IN_TABLE_IM;
6485
6486 !!!nack ('t382.1');
6487 !!!next-token;
6488 } elsif ($token->{tag_name} eq 'hr') {
6489 !!!cp ('t386');
6490 pop @{$self->{open_elements}};
6491
6492 !!!nack ('t386.1');
6493 !!!next-token;
6494 } else {
6495 !!!nack ('t347.1');
6496 !!!next-token;
6497 }
6498 next B;
6499 } elsif ({li => 1, dt => 1, dd => 1}->{$token->{tag_name}}) {
6500 ## has a p element in scope
6501 INSCOPE: for (reverse @{$self->{open_elements}}) {
6502 if ($_->[1] & P_EL) {
6503 !!!cp ('t353');
6504 !!!back-token; # <x>
6505 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6506 line => $token->{line}, column => $token->{column}};
6507 next B;
6508 } elsif ($_->[1] & SCOPING_EL) {
6509 !!!cp ('t354');
6510 last INSCOPE;
6511 }
6512 } # INSCOPE
6513
6514 ## Step 1
6515 my $i = -1;
6516 my $node = $self->{open_elements}->[$i];
6517 my $li_or_dtdd = {li => {li => 1},
6518 dt => {dt => 1, dd => 1},
6519 dd => {dt => 1, dd => 1}}->{$token->{tag_name}};
6520 LI: {
6521 ## Step 2
6522 if ($li_or_dtdd->{$node->[0]->manakai_local_name}) {
6523 if ($i != -1) {
6524 !!!cp ('t355');
6525 !!!parse-error (type => 'not closed',
6526 value => $self->{open_elements}->[-1]->[0]
6527 ->manakai_local_name,
6528 token => $token);
6529 } else {
6530 !!!cp ('t356');
6531 }
6532 splice @{$self->{open_elements}}, $i;
6533 last LI;
6534 } else {
6535 !!!cp ('t357');
6536 }
6537
6538 ## Step 3
6539 if (not ($node->[1] & FORMATTING_EL) and
6540 #not $phrasing_category->{$node->[1]} and
6541 ($node->[1] & SPECIAL_EL or
6542 $node->[1] & SCOPING_EL) and
6543 not ($node->[1] & ADDRESS_EL) and
6544 not ($node->[1] & DIV_EL)) {
6545 !!!cp ('t358');
6546 last LI;
6547 }
6548
6549 !!!cp ('t359');
6550 ## Step 4
6551 $i--;
6552 $node = $self->{open_elements}->[$i];
6553 redo LI;
6554 } # LI
6555
6556 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6557 !!!nack ('t359.1');
6558 !!!next-token;
6559 next B;
6560 } elsif ($token->{tag_name} eq 'plaintext') {
6561 ## has a p element in scope
6562 INSCOPE: for (reverse @{$self->{open_elements}}) {
6563 if ($_->[1] & P_EL) {
6564 !!!cp ('t367');
6565 !!!back-token; # <plaintext>
6566 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6567 line => $token->{line}, column => $token->{column}};
6568 next B;
6569 } elsif ($_->[1] & SCOPING_EL) {
6570 !!!cp ('t368');
6571 last INSCOPE;
6572 }
6573 } # INSCOPE
6574
6575 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6576
6577 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
6578
6579 !!!nack ('t368.1');
6580 !!!next-token;
6581 next B;
6582 } elsif ($token->{tag_name} eq 'a') {
6583 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
6584 my $node = $active_formatting_elements->[$i];
6585 if ($node->[1] & A_EL) {
6586 !!!cp ('t371');
6587 !!!parse-error (type => 'in a:a', token => $token);
6588
6589 !!!back-token; # <a>
6590 $token = {type => END_TAG_TOKEN, tag_name => 'a',
6591 line => $token->{line}, column => $token->{column}};
6592 $formatting_end_tag->($token);
6593
6594 AFE2: for (reverse 0..$#$active_formatting_elements) {
6595 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
6596 !!!cp ('t372');
6597 splice @$active_formatting_elements, $_, 1;
6598 last AFE2;
6599 }
6600 } # AFE2
6601 OE: for (reverse 0..$#{$self->{open_elements}}) {
6602 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
6603 !!!cp ('t373');
6604 splice @{$self->{open_elements}}, $_, 1;
6605 last OE;
6606 }
6607 } # OE
6608 last AFE;
6609 } elsif ($node->[0] eq '#marker') {
6610 !!!cp ('t374');
6611 last AFE;
6612 }
6613 } # AFE
6614
6615 $reconstruct_active_formatting_elements->($insert_to_current);
6616
6617 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6618 push @$active_formatting_elements, $self->{open_elements}->[-1];
6619
6620 !!!nack ('t374.1');
6621 !!!next-token;
6622 next B;
6623 } elsif ($token->{tag_name} eq 'nobr') {
6624 $reconstruct_active_formatting_elements->($insert_to_current);
6625
6626 ## has a |nobr| element in scope
6627 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6628 my $node = $self->{open_elements}->[$_];
6629 if ($node->[1] & NOBR_EL) {
6630 !!!cp ('t376');
6631 !!!parse-error (type => 'in nobr:nobr', token => $token);
6632 !!!back-token; # <nobr>
6633 $token = {type => END_TAG_TOKEN, tag_name => 'nobr',
6634 line => $token->{line}, column => $token->{column}};
6635 next B;
6636 } elsif ($node->[1] & SCOPING_EL) {
6637 !!!cp ('t377');
6638 last INSCOPE;
6639 }
6640 } # INSCOPE
6641
6642 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6643 push @$active_formatting_elements, $self->{open_elements}->[-1];
6644
6645 !!!nack ('t377.1');
6646 !!!next-token;
6647 next B;
6648 } elsif ($token->{tag_name} eq 'button') {
6649 ## has a button element in scope
6650 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6651 my $node = $self->{open_elements}->[$_];
6652 if ($node->[1] & BUTTON_EL) {
6653 !!!cp ('t378');
6654 !!!parse-error (type => 'in button:button', token => $token);
6655 !!!back-token; # <button>
6656 $token = {type => END_TAG_TOKEN, tag_name => 'button',
6657 line => $token->{line}, column => $token->{column}};
6658 next B;
6659 } elsif ($node->[1] & SCOPING_EL) {
6660 !!!cp ('t379');
6661 last INSCOPE;
6662 }
6663 } # INSCOPE
6664
6665 $reconstruct_active_formatting_elements->($insert_to_current);
6666
6667 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6668
6669 ## TODO: associate with $self->{form_element} if defined
6670
6671 push @$active_formatting_elements, ['#marker', ''];
6672
6673 !!!nack ('t379.1');
6674 !!!next-token;
6675 next B;
6676 } elsif ({
6677 xmp => 1,
6678 iframe => 1,
6679 noembed => 1,
6680 noframes => 1, ## NOTE: This is an "as if in head" code clone.
6681 noscript => 0, ## TODO: 1 if scripting is enabled
6682 }->{$token->{tag_name}}) {
6683 if ($token->{tag_name} eq 'xmp') {
6684 !!!cp ('t381');
6685 $reconstruct_active_formatting_elements->($insert_to_current);
6686 } else {
6687 !!!cp ('t399');
6688 }
6689 ## NOTE: There is an "as if in body" code clone.
6690 $parse_rcdata->(CDATA_CONTENT_MODEL);
6691 next B;
6692 } elsif ($token->{tag_name} eq 'isindex') {
6693 !!!parse-error (type => 'isindex', token => $token);
6694
6695 if (defined $self->{form_element}) {
6696 !!!cp ('t389');
6697 ## Ignore the token
6698 !!!nack ('t389'); ## NOTE: Not acknowledged.
6699 !!!next-token;
6700 next B;
6701 } else {
6702 !!!ack ('t391.1');
6703
6704 my $at = $token->{attributes};
6705 my $form_attrs;
6706 $form_attrs->{action} = $at->{action} if $at->{action};
6707 my $prompt_attr = $at->{prompt};
6708 $at->{name} = {name => 'name', value => 'isindex'};
6709 delete $at->{action};
6710 delete $at->{prompt};
6711 my @tokens = (
6712 {type => START_TAG_TOKEN, tag_name => 'form',
6713 attributes => $form_attrs,
6714 line => $token->{line}, column => $token->{column}},
6715 {type => START_TAG_TOKEN, tag_name => 'hr',
6716 line => $token->{line}, column => $token->{column}},
6717 {type => START_TAG_TOKEN, tag_name => 'p',
6718 line => $token->{line}, column => $token->{column}},
6719 {type => START_TAG_TOKEN, tag_name => 'label',
6720 line => $token->{line}, column => $token->{column}},
6721 );
6722 if ($prompt_attr) {
6723 !!!cp ('t390');
6724 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value},
6725 #line => $token->{line}, column => $token->{column},
6726 };
6727 } else {
6728 !!!cp ('t391');
6729 push @tokens, {type => CHARACTER_TOKEN,
6730 data => 'This is a searchable index. Insert your search keywords here: ',
6731 #line => $token->{line}, column => $token->{column},
6732 }; # SHOULD
6733 ## TODO: make this configurable
6734 }
6735 push @tokens,
6736 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at,
6737 line => $token->{line}, column => $token->{column}},
6738 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
6739 {type => END_TAG_TOKEN, tag_name => 'label',
6740 line => $token->{line}, column => $token->{column}},
6741 {type => END_TAG_TOKEN, tag_name => 'p',
6742 line => $token->{line}, column => $token->{column}},
6743 {type => START_TAG_TOKEN, tag_name => 'hr',
6744 line => $token->{line}, column => $token->{column}},
6745 {type => END_TAG_TOKEN, tag_name => 'form',
6746 line => $token->{line}, column => $token->{column}};
6747 !!!back-token (@tokens);
6748 !!!next-token;
6749 next B;
6750 }
6751 } elsif ($token->{tag_name} eq 'textarea') {
6752 my $tag_name = $token->{tag_name};
6753 my $el;
6754 !!!create-element ($el, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
6755
6756 ## TODO: $self->{form_element} if defined
6757 $self->{content_model} = RCDATA_CONTENT_MODEL;
6758 delete $self->{escape}; # MUST
6759
6760 $insert->($el);
6761
6762 my $text = '';
6763 !!!nack ('t392.1');
6764 !!!next-token;
6765 if ($token->{type} == CHARACTER_TOKEN) {
6766 $token->{data} =~ s/^\x0A//;
6767 unless (length $token->{data}) {
6768 !!!cp ('t392');
6769 !!!next-token;
6770 } else {
6771 !!!cp ('t393');
6772 }
6773 } else {
6774 !!!cp ('t394');
6775 }
6776 while ($token->{type} == CHARACTER_TOKEN) {
6777 !!!cp ('t395');
6778 $text .= $token->{data};
6779 !!!next-token;
6780 }
6781 if (length $text) {
6782 !!!cp ('t396');
6783 $el->manakai_append_text ($text);
6784 }
6785
6786 $self->{content_model} = PCDATA_CONTENT_MODEL;
6787
6788 if ($token->{type} == END_TAG_TOKEN and
6789 $token->{tag_name} eq $tag_name) {
6790 !!!cp ('t397');
6791 ## Ignore the token
6792 } else {
6793 !!!cp ('t398');
6794 !!!parse-error (type => 'in RCDATA:#'.$token->{type}, token => $token);
6795 }
6796 !!!next-token;
6797 next B;
6798 } elsif ($token->{tag_name} eq 'math' or
6799 $token->{tag_name} eq 'svg') {
6800 $reconstruct_active_formatting_elements->($insert_to_current);
6801
6802 ## "adjust SVG attributes" ('svg' only) - done in insert-element-f
6803
6804 ## "adjust foreign attributes" - done in insert-element-f
6805
6806 !!!insert-element-f ($token->{tag_name} eq 'math' ? $MML_NS : $SVG_NS, $token->{tag_name}, $token->{attributes}, $token);
6807
6808 if ($self->{self_closing}) {
6809 pop @{$self->{open_elements}};
6810 !!!ack ('t398.1');
6811 } else {
6812 !!!cp ('t398.2');
6813 $self->{insertion_mode} |= IN_FOREIGN_CONTENT_IM;
6814 ## NOTE: |<body><math><mi><svg>| -> "in foreign content" insertion
6815 ## mode, "in body" (not "in foreign content") secondary insertion
6816 ## mode, maybe.
6817 }
6818
6819 !!!next-token;
6820 next B;
6821 } elsif ({
6822 caption => 1, col => 1, colgroup => 1, frame => 1,
6823 frameset => 1, head => 1, option => 1, optgroup => 1,
6824 tbody => 1, td => 1, tfoot => 1, th => 1,
6825 thead => 1, tr => 1,
6826 }->{$token->{tag_name}}) {
6827 !!!cp ('t401');
6828 !!!parse-error (type => 'in body:'.$token->{tag_name}, token => $token);
6829 ## Ignore the token
6830 !!!nack ('t401.1'); ## NOTE: |<col/>| or |<frame/>| here is an error.
6831 !!!next-token;
6832 next B;
6833
6834 ## ISSUE: An issue on HTML5 new elements in the spec.
6835 } else {
## "in body" insertion mode, start tag, "anything else" arm.
##
## Handles every start tag not claimed by an earlier arm of this
## |if|/|elsif| chain: inserts the element, then applies the
## per-element follow-up (marker, formatting list, void element pop,
## or |select| insertion mode switch).

## |<image>| is reported as a parse error and handled as |<img>|.
if ($token->{tag_name} eq 'image') {
  !!!cp ('t384');
  !!!parse-error (type => 'image', token => $token);
  $token->{tag_name} = 'img';
} else {
  !!!cp ('t385');
}

## NOTE: There is an "as if <br>" code clone.
$reconstruct_active_formatting_elements->($insert_to_current);

!!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);

if ({
     applet => 1, marquee => 1, object => 1,
    }->{$token->{tag_name}}) {
  ## These elements push a marker onto the list of active formatting
  ## elements.
  !!!cp ('t380');
  push @$active_formatting_elements, ['#marker', ''];
  !!!nack ('t380.1');
} elsif ({
          b => 1, big => 1, em => 1, font => 1, i => 1,
          ## NOTE: This key used to be misspelled |strile|, so
          ## |<strike>| was never recorded as a formatting element.
          s => 1, small => 1, strike => 1,
          strong => 1, tt => 1, u => 1,
         }->{$token->{tag_name}}) {
  ## Formatting elements: remember the just-inserted element so that
  ## it can be reconstructed / closed by the formatting machinery.
  !!!cp ('t375');
  push @$active_formatting_elements, $self->{open_elements}->[-1];
  !!!nack ('t375.1');
} elsif ($token->{tag_name} eq 'input') {
  !!!cp ('t388');
  ## TODO: associate with $self->{form_element} if defined
  ## The element is popped immediately; it never stays open.
  pop @{$self->{open_elements}};
  !!!ack ('t388.2');
} elsif ({
          area => 1, basefont => 1, bgsound => 1, br => 1,
          embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
          #image => 1,
         }->{$token->{tag_name}}) {
  ## These elements are popped immediately after insertion and the
  ## self-closing flag, if any, is acknowledged.
  !!!cp ('t388.1');
  pop @{$self->{open_elements}};
  !!!ack ('t388.3');
} elsif ($token->{tag_name} eq 'select') {
  ## TODO: associate with $self->{form_element} if defined

  ## Choose the |select| insertion mode depending on whether the
  ## current insertion mode is one of the table-related modes.
  if ($self->{insertion_mode} & TABLE_IMS or
      $self->{insertion_mode} & BODY_TABLE_IMS or
      $self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
    !!!cp ('t400.1');
    $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM;
  } else {
    !!!cp ('t400.2');
    $self->{insertion_mode} = IN_SELECT_IM;
  }
  !!!nack ('t400.3');
} else {
  !!!nack ('t402');
}

!!!next-token;
next B;
6895 }
6896 } elsif ($token->{type} == END_TAG_TOKEN) {
6897 if ($token->{tag_name} eq 'body') {
## "in body" insertion mode, |</body>| end tag.
##
## If no |body| element is in scope the token is a stray end tag: it
## is reported and ignored.  Otherwise the first open element whose
## end tag is not optional is reported as "not closed", and the parser
## switches to the "after body" insertion mode.

## has a |body| element in scope
my $i;
INSCOPE: {
  for (reverse @{$self->{open_elements}}) {
    if ($_->[1] & BODY_EL) {
      !!!cp ('t405');
      $i = $_;
      last INSCOPE;
    } elsif ($_->[1] & SCOPING_EL) {
      !!!cp ('t405.1');
      last;
    }
  }

  ## No |body| in scope.  The token is an END tag, so report it with
  ## the same error type as the other stray end tags handled in this
  ## insertion mode (this used to say 'start tag not allowed').
  !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
  ## NOTE: Ignore the token.
  !!!next-token;
  next B;
} # INSCOPE

## Report only the first element that should have been closed
## explicitly before |</body>|.
for (@{$self->{open_elements}}) {
  unless ($_->[1] & ALL_END_TAG_OPTIONAL_EL) {
    !!!cp ('t403');
    !!!parse-error (type => 'not closed',
                    value => $_->[0]->manakai_local_name,
                    token => $token);
    last;
  } else {
    !!!cp ('t404');
  }
}

$self->{insertion_mode} = AFTER_BODY_IM;
!!!next-token;
next B;
6934 } elsif ($token->{tag_name} eq 'html') {
## TODO: Update this code.  It seems that the code below is not
## up-to-date, though it has the same effect as specified.
6937 if (@{$self->{open_elements}} > 1 and
6938 $self->{open_elements}->[1]->[1] & BODY_EL) {
6939 ## ISSUE: There is an issue in the spec.
6940 unless ($self->{open_elements}->[-1]->[1] & BODY_EL) {
6941 !!!cp ('t406');
6942 !!!parse-error (type => 'not closed',
6943 value => $self->{open_elements}->[1]->[0]
6944 ->manakai_local_name,
6945 token => $token);
6946 } else {
6947 !!!cp ('t407');
6948 }
6949 $self->{insertion_mode} = AFTER_BODY_IM;
6950 ## reprocess
6951 next B;
6952 } else {
6953 !!!cp ('t408');
6954 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6955 ## Ignore the token
6956 !!!next-token;
6957 next B;
6958 }
6959 } elsif ({
6960 address => 1, blockquote => 1, center => 1, dir => 1,
6961 div => 1, dl => 1, fieldset => 1, listing => 1,
6962 menu => 1, ol => 1, pre => 1, ul => 1,
6963 dd => 1, dt => 1, li => 1,
6964 applet => 1, button => 1, marquee => 1, object => 1,
6965 }->{$token->{tag_name}}) {
6966 ## has an element in scope
6967 my $i;
6968 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6969 my $node = $self->{open_elements}->[$_];
6970 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6971 !!!cp ('t410');
6972 $i = $_;
6973 last INSCOPE;
6974 } elsif ($node->[1] & SCOPING_EL) {
6975 !!!cp ('t411');
6976 last INSCOPE;
6977 }
6978 } # INSCOPE
6979
6980 unless (defined $i) { # has an element in scope
6981 !!!cp ('t413');
6982 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6983 } else {
6984 ## Step 1. generate implied end tags
6985 while ({
6986 dd => ($token->{tag_name} ne 'dd'),
6987 dt => ($token->{tag_name} ne 'dt'),
6988 li => ($token->{tag_name} ne 'li'),
6989 p => 1,
6990 }->{$self->{open_elements}->[-1]->[0]->manakai_local_name}) {
6991 !!!cp ('t409');
6992 pop @{$self->{open_elements}};
6993 }
6994
6995 ## Step 2.
6996 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6997 ne $token->{tag_name}) {
6998 !!!cp ('t412');
6999 !!!parse-error (type => 'not closed',
7000 value => $self->{open_elements}->[-1]->[0]
7001 ->manakai_local_name,
7002 token => $token);
7003 } else {
7004 !!!cp ('t414');
7005 }
7006
7007 ## Step 3.
7008 splice @{$self->{open_elements}}, $i;
7009
7010 ## Step 4.
7011 $clear_up_to_marker->()
7012 if {
7013 applet => 1, button => 1, marquee => 1, object => 1,
7014 }->{$token->{tag_name}};
7015 }
7016 !!!next-token;
7017 next B;
7018 } elsif ($token->{tag_name} eq 'form') {
7019 undef $self->{form_element};
7020
7021 ## has an element in scope
7022 my $i;
7023 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7024 my $node = $self->{open_elements}->[$_];
7025 if ($node->[1] & FORM_EL) {
7026 !!!cp ('t418');
7027 $i = $_;
7028 last INSCOPE;
7029 } elsif ($node->[1] & SCOPING_EL) {
7030 !!!cp ('t419');
7031 last INSCOPE;
7032 }
7033 } # INSCOPE
7034
7035 unless (defined $i) { # has an element in scope
7036 !!!cp ('t421');
7037 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
7038 } else {
7039 ## Step 1. generate implied end tags
7040 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7041 !!!cp ('t417');
7042 pop @{$self->{open_elements}};
7043 }
7044
7045 ## Step 2.
7046 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7047 ne $token->{tag_name}) {
7048 !!!cp ('t417.1');
7049 !!!parse-error (type => 'not closed',
7050 value => $self->{open_elements}->[-1]->[0]
7051 ->manakai_local_name,
7052 token => $token);
7053 } else {
7054 !!!cp ('t420');
7055 }
7056
7057 ## Step 3.
7058 splice @{$self->{open_elements}}, $i;
7059 }
7060
7061 !!!next-token;
7062 next B;
7063 } elsif ({
7064 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
7065 }->{$token->{tag_name}}) {
7066 ## has an element in scope
7067 my $i;
7068 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7069 my $node = $self->{open_elements}->[$_];
7070 if ($node->[1] & HEADING_EL) {
7071 !!!cp ('t423');
7072 $i = $_;
7073 last INSCOPE;
7074 } elsif ($node->[1] & SCOPING_EL) {
7075 !!!cp ('t424');
7076 last INSCOPE;
7077 }
7078 } # INSCOPE
7079
7080 unless (defined $i) { # has an element in scope
7081 !!!cp ('t425.1');
7082 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
7083 } else {
7084 ## Step 1. generate implied end tags
7085 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7086 !!!cp ('t422');
7087 pop @{$self->{open_elements}};
7088 }
7089
7090 ## Step 2.
7091 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7092 ne $token->{tag_name}) {
7093 !!!cp ('t425');
7094 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
7095 } else {
7096 !!!cp ('t426');
7097 }
7098
7099 ## Step 3.
7100 splice @{$self->{open_elements}}, $i;
7101 }
7102
7103 !!!next-token;
7104 next B;
7105 } elsif ($token->{tag_name} eq 'p') {
7106 ## has an element in scope
7107 my $i;
7108 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7109 my $node = $self->{open_elements}->[$_];
7110 if ($node->[1] & P_EL) {
7111 !!!cp ('t410.1');
7112 $i = $_;
7113 last INSCOPE;
7114 } elsif ($node->[1] & SCOPING_EL) {
7115 !!!cp ('t411.1');
7116 last INSCOPE;
7117 }
7118 } # INSCOPE
7119
7120 if (defined $i) {
7121 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7122 ne $token->{tag_name}) {
7123 !!!cp ('t412.1');
7124 !!!parse-error (type => 'not closed',
7125 value => $self->{open_elements}->[-1]->[0]
7126 ->manakai_local_name,
7127 token => $token);
7128 } else {
7129 !!!cp ('t414.1');
7130 }
7131
7132 splice @{$self->{open_elements}}, $i;
7133 } else {
7134 !!!cp ('t413.1');
7135 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
7136
7137 !!!cp ('t415.1');
7138 ## As if <p>, then reprocess the current token
7139 my $el;
7140 !!!create-element ($el, $HTML_NS, 'p',, $token);
7141 $insert->($el);
7142 ## NOTE: Not inserted into |$self->{open_elements}|.
7143 }
7144
7145 !!!next-token;
7146 next B;
} elsif ({
          a => 1,
          b => 1, big => 1, em => 1, font => 1, i => 1,
          ## NOTE: This key used to be misspelled |strile|, so
          ## |</strike>| fell through to the generic end tag handling
          ## instead of being passed to |$formatting_end_tag|.
          nobr => 1, s => 1, small => 1, strike => 1,
          strong => 1, tt => 1, u => 1,
         }->{$token->{tag_name}}) {
  ## "in body" insertion mode, end tag of a formatting element:
  ## delegate the token to |$formatting_end_tag|, then continue the
  ## main loop.
  !!!cp ('t427');
  $formatting_end_tag->($token);
  next B;
7156 } elsif ($token->{tag_name} eq 'br') {
7157 !!!cp ('t428');
7158 !!!parse-error (type => 'unmatched end tag:br', token => $token);
7159
7160 ## As if <br>
7161 $reconstruct_active_formatting_elements->($insert_to_current);
7162
7163 my $el;
7164 !!!create-element ($el, $HTML_NS, 'br',, $token);
7165 $insert->($el);
7166
7167 ## Ignore the token.
7168 !!!next-token;
7169 next B;
7170 } elsif ({
7171 caption => 1, col => 1, colgroup => 1, frame => 1,
7172 frameset => 1, head => 1, option => 1, optgroup => 1,
7173 tbody => 1, td => 1, tfoot => 1, th => 1,
7174 thead => 1, tr => 1,
7175 area => 1, basefont => 1, bgsound => 1,
7176 embed => 1, hr => 1, iframe => 1, image => 1,
7177 img => 1, input => 1, isindex => 1, noembed => 1,
7178 noframes => 1, param => 1, select => 1, spacer => 1,
7179 table => 1, textarea => 1, wbr => 1,
7180 noscript => 0, ## TODO: if scripting is enabled
7181 }->{$token->{tag_name}}) {
7182 !!!cp ('t429');
7183 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
7184 ## Ignore the token
7185 !!!next-token;
7186 next B;
7187
7188 ## ISSUE: Issue on HTML5 new elements in spec
7189
7190 } else {
7191 ## Step 1
7192 my $node_i = -1;
7193 my $node = $self->{open_elements}->[$node_i];
7194
7195 ## Step 2
7196 S2: {
7197 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7198 ## Step 1
7199 ## generate implied end tags
7200 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7201 !!!cp ('t430');
7202 ## ISSUE: Can this case be reached?
7203 pop @{$self->{open_elements}};
7204 }
7205
7206 ## Step 2
7207 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7208 ne $token->{tag_name}) {
7209 !!!cp ('t431');
7210 ## NOTE: <x><y></x>
7211 !!!parse-error (type => 'not closed',
7212 value => $self->{open_elements}->[-1]->[0]
7213 ->manakai_local_name,
7214 token => $token);
7215 } else {
7216 !!!cp ('t432');
7217 }
7218
7219 ## Step 3
7220 splice @{$self->{open_elements}}, $node_i;
7221
7222 !!!next-token;
7223 last S2;
7224 } else {
7225 ## Step 3
7226 if (not ($node->[1] & FORMATTING_EL) and
7227 #not $phrasing_category->{$node->[1]} and
7228 ($node->[1] & SPECIAL_EL or
7229 $node->[1] & SCOPING_EL)) {
7230 !!!cp ('t433');
7231 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
7232 ## Ignore the token
7233 !!!next-token;
7234 last S2;
7235 }
7236
7237 !!!cp ('t434');
7238 }
7239
7240 ## Step 4
7241 $node_i--;
7242 $node = $self->{open_elements}->[$node_i];
7243
7244 ## Step 5;
7245 redo S2;
7246 } # S2
7247 next B;
7248 }
7249 }
7250 next B;
7251 } continue { # B
7252 if ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
7253 ## NOTE: The code below is executed in cases where it does not have
7254 ## to be, but it is harmless even in those cases.
7255 ## has an element in scope
7256 INSCOPE: {
7257 for (reverse 0..$#{$self->{open_elements}}) {
7258 my $node = $self->{open_elements}->[$_];
7259 if ($node->[1] & FOREIGN_EL) {
7260 last INSCOPE;
7261 } elsif ($node->[1] & SCOPING_EL) {
7262 last;
7263 }
7264 }
7265
7266 ## NOTE: No foreign element in scope.
7267 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
7268 } # INSCOPE
7269 }
7270 } # B
7271
7272 ## Stop parsing # MUST
7273
7274 ## TODO: script stuffs
7275 } # _tree_construct_main
7276
## Parse a string as HTML and replace the children of a node by the
## result.
##
##   Whatpm::HTML->set_inner_html ($node, $string, $onerror);
##
## Arguments:
##   $node    - The node whose children are replaced.  Its |node_type|
##              has to be 9 or 1 (document or element in DOM terms).
##   $string  - The markup to parse.  It is read through a reference to
##              the caller's argument and is not copied.
##   $onerror - Optional error handler (code reference).  For a document
##              node it is handed to |parse_string| as is.  For an
##              element node a false value selects a default handler
##              that reports the error by |warn|.
##
## Dies if |$node| is neither a document nor an element.
##
## NOTE: Lines beginning with |!!!| are presumably macros expanded when
## the module is generated from this source file; they have to be kept
## in their exact form.
sub set_inner_html ($$$) {
  my $class = shift;
  my $node = shift;
  my $s = \$_[0];
  my $onerror = $_[1];

  ## ISSUE: Should {confident} be true?

  my $nt = $node->node_type;
  if ($nt == 9) {
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## NOTE: Iterate over a copy of the child list, since the nodes are
    ## removed while looping.
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }

    ## Step 3, 4, 5 # MUST
    $class->parse_string ($$s => $node, $onerror);
  } elsif ($nt == 1) {
    ## TODO: If non-html element

    ## NOTE: Most of this code is copied from |parse_string|

    ## Step 1 # MUST
    ## The fragment is parsed into a separate, temporary document.
    my $this_doc = $node->owner_document;
    my $doc = $this_doc->implementation->create_document;
    $doc->manakai_is_html (1);
    my $p = $class->new;
    $p->{document} = $doc;

    ## Code points that are reported as 'control char' in addition to
    ## the ranges tested in |set_next_char| below.  The table is built
    ## once here rather than for every input character.
    my %is_nonchar = (
      0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
      0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
      0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
      0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
      0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
      0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
      0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
      0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
      0x10FFFE => 1, 0x10FFFF => 1,
    );

    ## Step 8 # MUST
    my $i = 0; # index of the next character of |$$s| to be read
    $p->{line_prev} = $p->{line} = 1;
    $p->{column_prev} = $p->{column} = 0;
    ## Character source: shifts the previous-character history, stores
    ## the next code point (or -1 at the end of the string) into
    ## |{next_char}| and keeps the line and column counters up to date.
    $p->{set_next_char} = sub {
      my $self = shift;

      pop @{$self->{prev_char}};
      unshift @{$self->{prev_char}}, $self->{next_char};

      ## End of input.  (-1 is a true value, so the |return| is always
      ## reached.)
      $self->{next_char} = -1 and return if $i >= length $$s;
      $self->{next_char} = ord substr $$s, $i++, 1;

      ($p->{line_prev}, $p->{column_prev}) = ($p->{line}, $p->{column});
      $p->{column}++;

      if ($self->{next_char} == 0x000A) { # LF
        $p->{line}++;
        $p->{column} = 0;
        !!!cp ('i1');
      } elsif ($self->{next_char} == 0x000D) { # CR
        ## CR and CR LF are both reported as a single LF.
        $i++ if substr ($$s, $i, 1) eq "\x0A";
        $self->{next_char} = 0x000A; # LF # MUST
        $p->{line}++;
        $p->{column} = 0;
        !!!cp ('i2');
      } elsif ($self->{next_char} > 0x10FFFF) {
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
        !!!cp ('i3');
      } elsif ($self->{next_char} == 0x0000) { # NULL
        !!!cp ('i4');
        !!!parse-error (type => 'NULL');
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
      } elsif ($self->{next_char} <= 0x0008 or
               (0x000E <= $self->{next_char} and
                $self->{next_char} <= 0x001F) or
               (0x007F <= $self->{next_char} and
                $self->{next_char} <= 0x009F) or
               (0xD800 <= $self->{next_char} and
                $self->{next_char} <= 0xDFFF) or
               (0xFDD0 <= $self->{next_char} and
                $self->{next_char} <= 0xFDDF) or
               $is_nonchar{$self->{next_char}}) {
        !!!cp ('i4.1');
        !!!parse-error (type => 'control char', level => $self->{must_level});
        ## TODO: error type documentation
      }
    };
    $p->{prev_char} = [-1, -1, -1];
    $p->{next_char} = -1;

    ## Error reporting: the caller's handler, or a default one that
    ## |warn|s.  The position of the token, if any, takes precedence
    ## over the current position of the parser.
    my $ponerror = $onerror || sub {
      my (%opt) = @_;
      my $line = $opt{line};
      my $column = $opt{column};
      if (defined $opt{token} and defined $opt{token}->{line}) {
        $line = $opt{token}->{line};
        $column = $opt{token}->{column};
      }
      warn "Parse error ($opt{type}) at line $line column $column\n";
    };
    $p->{parse_error} = sub {
      $ponerror->(line => $p->{line}, column => $p->{column}, @_);
    };

    $p->_initialize_tokenizer;
    $p->_initialize_tree_constructor;

    ## Step 2
    ## The initial content model depends on the name of the context
    ## element; PCDATA is used for any name not listed.
    my $node_ln = $node->manakai_local_name;
    $p->{content_model} = {
      title => RCDATA_CONTENT_MODEL,
      textarea => RCDATA_CONTENT_MODEL,
      style => CDATA_CONTENT_MODEL,
      script => CDATA_CONTENT_MODEL,
      xmp => CDATA_CONTENT_MODEL,
      iframe => CDATA_CONTENT_MODEL,
      noembed => CDATA_CONTENT_MODEL,
      noframes => CDATA_CONTENT_MODEL,
      noscript => CDATA_CONTENT_MODEL,
      plaintext => PLAINTEXT_CONTENT_MODEL,
    }->{$node_ln};
    $p->{content_model} = PCDATA_CONTENT_MODEL
        unless defined $p->{content_model};
    ## ISSUE: What is "the name of the element"? local name?

    $p->{inner_html_node} = [$node, $el_category->{$node_ln}];
    ## TODO: Foreign element OK?

    ## Step 3
    my $root = $doc->create_element_ns
        ('http://www.w3.org/1999/xhtml', [undef, 'html']);

    ## Step 4 # MUST
    $doc->append_child ($root);

    ## Step 5 # MUST
    push @{$p->{open_elements}}, [$root, $el_category->{html}];

    undef $p->{head_element};

    ## Step 6 # MUST
    $p->_reset_insertion_mode;

    ## Step 7 # MUST
    ## Find the nearest |form| element in the XHTML namespace, starting
    ## at the context node itself and walking up through its parents.
    my $anode = $node;
    AN: while (defined $anode) {
      if ($anode->node_type == 1) {
        my $nsuri = $anode->namespace_uri;
        if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
          if ($anode->manakai_local_name eq 'form') {
            !!!cp ('i5');
            $p->{form_element} = $anode;
            last AN;
          }
        }
      }
      $anode = $anode->parent_node;
    } # AN

    ## Step 9 # MUST
    {
      ## The macro below refers to a variable named |$self|.
      my $self = $p;
      !!!next-token;
    }
    $p->_tree_construction_main;

    ## Step 10 # MUST
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }
    ## ISSUE: mutation events? read-only?

    ## Step 11 # MUST
    ## Move the parsed nodes from the temporary root into |$node|.
    @cn = @{$root->child_nodes};
    for (@cn) {
      $this_doc->adopt_node ($_);
      $node->append_child ($_);
    }
    ## ISSUE: mutation events?

    $p->_terminate_tree_constructor;

    ## Both closures capture |$p| and are stored in |$p| itself.  Those
    ## reference cycles have to be broken by hand, or the parser and the
    ## temporary document are never freed.
    delete $p->{parse_error}; # delete loop
    delete $p->{set_next_char}; # delete loop
  } else {
    die "$0: |set_inner_html| is not defined for node of type $nt";
  }
} # set_inner_html
7472
7473 } # tree construction stage
7474
## |Whatpm::HTML::RestartParser| - a class that inherits from |Error|.
## NOTE(review): Presumably thrown to request that parsing be restarted;
## confirm against the code that throws it.
package Whatpm::HTML::RestartParser;
our @ISA;
push @ISA, 'Error';

## True value for |require|.
1;
7479 # $Date: 2008/05/25 07:54:33 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24