/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.132 - (show annotations) (download) (as text)
Sun Apr 13 10:36:40 2008 UTC (17 years, 9 months ago) by wakaba
Branch: MAIN
Changes since 1.131: +48 -3 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	13 Apr 2008 10:11:49 -0000
	* HTML-tokenizer.t: Raise a parse error if there are disallowed
	characters (for compatibility with existing html5lib test data).

	* tokenizer-test-1.test: Some test results are updated with
	regard to parse errors on disallowed characters.

	* tokenizer-test-2.dat: Test data for disallowed characters
	are added (HTML5 revision 1263).

2008-04-13  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	13 Apr 2008 10:12:20 -0000
	* HTML.pm.src: Raise a parse error for any disallowed
	character (HTML5 revision 1263).

2008-04-13  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.131 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 use Error qw(:try);
5
6 ## ISSUE:
7 ## var doc = implementation.createDocument (null, null, null);
8 ## doc.write ('');
9 ## alert (doc.compatMode);
10
11 ## TODO: 1252 parse error (revision 1264)
12 ## TODO: 8859-11 = 874 (revision 1271)
13
## Namespace URIs referred to by the tables in this file.
my $HTML_NS  = 'http://www.w3.org/1999/xhtml';
my $MML_NS   = 'http://www.w3.org/1998/Math/MathML';
my $SVG_NS   = 'http://www.w3.org/2000/svg';
my $XLINK_NS = 'http://www.w3.org/1999/xlink';
my $XML_NS   = 'http://www.w3.org/XML/1998/namespace';
my $XMLNS_NS = 'http://www.w3.org/2000/xmlns/';
20
## Element category flags.  Every constant occupies exactly one bit
## (bit 0 .. bit 27), so categories can be combined with |\|| and
## tested with |&|.
sub A_EL                    () { 1 << 0 }
sub ADDRESS_EL              () { 1 << 1 }
sub BODY_EL                 () { 1 << 2 }
sub BUTTON_EL               () { 1 << 3 }
sub CAPTION_EL              () { 1 << 4 }
sub DD_EL                   () { 1 << 5 }
sub DIV_EL                  () { 1 << 6 }
sub DT_EL                   () { 1 << 7 }
sub FORM_EL                 () { 1 << 8 }
sub FORMATTING_EL           () { 1 << 9 }
sub FRAMESET_EL             () { 1 << 10 }
sub HEADING_EL              () { 1 << 11 }
sub HTML_EL                 () { 1 << 12 }
sub LI_EL                   () { 1 << 13 }
sub NOBR_EL                 () { 1 << 14 }
sub OPTION_EL               () { 1 << 15 }
sub OPTGROUP_EL             () { 1 << 16 }
sub P_EL                    () { 1 << 17 }
sub SELECT_EL               () { 1 << 18 }
sub TABLE_EL                () { 1 << 19 }
sub TABLE_CELL_EL           () { 1 << 20 }
sub TABLE_ROW_EL            () { 1 << 21 }
sub TABLE_ROW_GROUP_EL      () { 1 << 22 }
sub MISC_SCOPING_EL         () { 1 << 23 }
sub MISC_SPECIAL_EL         () { 1 << 24 }
sub FOREIGN_EL              () { 1 << 25 }
sub FOREIGN_FLOW_CONTENT_EL () { 1 << 26 }
sub MML_AXML_EL             () { 1 << 27 }
49
## Composite element categories.  Each one is simply the bitwise OR of
## the single-bit element flags listed in its body.

## Table, table row and table row group elements.
sub TABLE_ROWS_EL () { TABLE_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL }

## dd, dt, li and p.
sub END_TAG_OPTIONAL_EL () { DD_EL | DT_EL | LI_EL | P_EL }

## |END_TAG_OPTIONAL_EL| plus body, html and the table cell, row and
## row group elements.
sub ALL_END_TAG_OPTIONAL_EL () {
  END_TAG_OPTIONAL_EL | BODY_EL | HTML_EL |
  TABLE_CELL_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL
}

sub SCOPING_EL () {
  BUTTON_EL | CAPTION_EL | HTML_EL |
  TABLE_EL | TABLE_CELL_EL | MISC_SCOPING_EL
}

sub TABLE_SCOPING_EL () { HTML_EL | TABLE_EL }

sub TABLE_ROWS_SCOPING_EL () { HTML_EL | TABLE_ROW_GROUP_EL }

sub TABLE_ROW_SCOPING_EL () { HTML_EL | TABLE_ROW_EL }

## NOTE: Includes every member of |END_TAG_OPTIONAL_EL|.
sub SPECIAL_EL () {
  ADDRESS_EL | BODY_EL | DIV_EL | END_TAG_OPTIONAL_EL |
  FORM_EL | FRAMESET_EL | HEADING_EL |
  OPTION_EL | OPTGROUP_EL | SELECT_EL |
  TABLE_ROW_EL | TABLE_ROW_GROUP_EL | MISC_SPECIAL_EL
}
111
## Maps an HTML element's local name to its element category flags.
## Entries are grouped by category; names sharing one flag are
## generated from a word list.
my $el_category = {
  ## Elements carrying two flags at once.
  a => A_EL | FORMATTING_EL,
  nobr => NOBR_EL | FORMATTING_EL,

  ## Elements that are the only member of their category.
  address => ADDRESS_EL,
  body => BODY_EL,
  button => BUTTON_EL,
  caption => CAPTION_EL,
  dd => DD_EL,
  div => DIV_EL,
  dt => DT_EL,
  form => FORM_EL,
  frameset => FRAMESET_EL,
  html => HTML_EL,
  li => LI_EL,
  optgroup => OPTGROUP_EL,
  option => OPTION_EL,
  p => P_EL,
  select => SELECT_EL,
  table => TABLE_EL,
  tr => TABLE_ROW_EL,

  ## Categories shared by several elements.
  (map { ($_ => HEADING_EL) } qw(h1 h2 h3 h4 h5 h6)),
  (map { ($_ => TABLE_CELL_EL) } qw(td th)),
  (map { ($_ => TABLE_ROW_GROUP_EL) } qw(tbody tfoot thead)),
  (map { ($_ => MISC_SCOPING_EL) } qw(applet marquee object)),
  (map { ($_ => FORMATTING_EL) } qw(
    b big em font i s small strike strong tt u
  )),
  (map { ($_ => MISC_SPECIAL_EL) } qw(
    area base basefont bgsound blockquote br center col colgroup
    dir dl embed fieldset frame head hr iframe img input isindex
    link listing menu meta noembed noframes noscript ol param
    plaintext pre script spacer style textarea title ul wbr
  )),
};
196
## Maps namespace URI -> local name -> element category flags for
## the MathML and SVG elements that need a flag of their own.
my $el_category_f = {
  $MML_NS => {
    'annotation-xml' => MML_AXML_EL,
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(mi mo mn ms mtext)),
  },
  $SVG_NS => {
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(foreignObject desc title)),
  },
  ## NOTE: In addition, FOREIGN_EL is set to non-HTML elements.
};
213
## Maps the all-lowercase spelling of an SVG attribute name to its
## mixed-case spelling.  Every key is exactly |lc| of its value, so the
## table is derived from the list of mixed-case names.
my $svg_attr_name = {
  map { (lc ($_) => $_) } qw(
    attributeType baseFrequency baseProfile calcMode clipPathUnits
    contentScriptType contentStyleType diffuseConstant edgeMode
    externalResourcesRequired feColorMatrix feComposite feGaussianBlur
    feMorphology feTile filterRes filterUnits glyphRef
    gradientTransform gradientUnits kernelMatrix kernelUnitLength
    keyPoints keySplines keyTimes lengthAdjust limitingConeAngle
    markerHeight markerUnits markerWidth maskContentUnits maskUnits
    numOctaves pathLength patternContentUnits patternTransform
    patternUnits pointsAtX pointsAtY pointsAtZ preserveAlpha
    preserveAspectRatio primitiveUnits refX refY repeatCount repeatDur
    requiredExtensions specularConstant specularExponent spreadMethod
    startOffset stdDeviation stitchTiles surfaceScale systemLanguage
    tableValues targetX targetY textLength viewBox viewTarget
    xChannelSelector yChannelSelector zoomAndPan
  )
};
281
## Maps a qualified attribute name to
## |[namespace URI, [prefix, local name]]|.
my $foreign_attr_xname = {
  (map { ("xlink:$_" => [$XLINK_NS, ['xlink', $_]]) }
       qw(actuate arcrole href role show title type)),
  (map { ("xml:$_" => [$XML_NS, ['xml', $_]]) } qw(base lang space)),
  ## The bare |xmlns| attribute has no prefix.
  'xmlns' => [$XMLNS_NS, [undef, 'xmlns']],
  'xmlns:xlink' => [$XMLNS_NS, ['xmlns', 'xlink']],
};
296
297 ## ISSUE: xmlns:xlink="non-xlink-ns" is not an error.
298
## Maps each code point in the range 0x80..0x9F to a replacement code
## point.  The replacements are listed in order of the source code
## point, eight per row, starting at 0x80.
my $c1_entity_char = do {
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88..
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98..
  );
  +{ map { (0x80 + $_ => $replacement[$_]) } 0 .. $#replacement };
}; # $c1_entity_char
333
## Decodes a byte string and parses the result as an HTML document.
##
##   $doc = $parser_or_class->parse_byte_string
##       ($charset, $bytes_or_ref, @rest);
##
## - Invocant: a parser object, or a class name (a parser is then
##   created with |new|).
## - $charset: encoding name handed to |Encode::decode|.  If undefined,
##   the encoding is guessed from the first 1024 bytes, falling back
##   to |windows-1252|, and the parser is marked as not confident.
## - $bytes_or_ref: the byte string, or a reference to it.
## - @rest: passed through unchanged to |parse_char_string| after the
##   decoded string.
##
## Installs a |change_encoding| callback which throws
## |Whatpm::HTML::RestartParser| when a different encoding is requested;
## in that case the bytes are decoded again with the new encoding and
## parsed a second time.  Returns whatever |parse_char_string| returns.
sub parse_byte_string ($$$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $charset = shift;
  my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);
  my $s;

  ## |Encode::decode| is called on every path below (explicit charset,
  ## detected charset and parser restart), so the module is loaded up
  ## front rather than only in the explicit-charset branch.
  require Encode; ## TODO: decode(utf8) don't delete BOM

  if (defined $charset) {
    $s = \ (Encode::decode ($charset, $$bytes_s));
    $self->{input_encoding} = lc $charset; ## TODO: normalize name
    $self->{confident} = 1;
  } else {
    ## TODO: Implement HTML5 detection algorithm
    require Whatpm::Charset::UniversalCharDet;
    $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string
        (substr ($$bytes_s, 0, 1024));
    $charset ||= 'windows-1252';
    $s = \ (Encode::decode ($charset, $$bytes_s));
    ## NOTE(review): Unlike the branch above, the name is stored without
    ## |lc|, while |change_encoding| compares it with a lowercased name.
    ## Confirm that the detector only returns lowercase names.
    $self->{input_encoding} = $charset;
    $self->{confident} = 0;
  }

  $self->{change_encoding} = sub {
    my $self = shift;
    my $charset = lc shift;
    my $token = shift;
    ## TODO: if $charset is supported
    ## TODO: normalize charset name

    ## "Change the encoding" algorithm:

    ## Step 1
    if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
      $charset = 'utf-8';
    }

    ## Step 2
    if (defined $self->{input_encoding} and
        $self->{input_encoding} eq $charset) {
      $self->{confident} = 1;
      return;
    }

    !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
        ':'.$charset, level => 'w', token => $token);

    ## Step 3
    # if (can) {
      ## change the encoding on the fly.
      #$self->{confident} = 1;
      #return;
    # }

    ## Step 4
    Whatpm::HTML::RestartParser->throw (charset => $charset);
  }; # $self->{change_encoding}

  ## Everything after the byte string is forwarded to the parser.
  my @args = @_; shift @args; # $s
  my $return;
  try {
    $return = $self->parse_char_string ($s, @args);
  } catch Whatpm::HTML::RestartParser with {
    ## Restart: decode the same bytes with the requested encoding.
    my $charset = shift->{charset};
    $s = \ (Encode::decode ($charset, $$bytes_s));
    $self->{input_encoding} = $charset; ## TODO: normalize
    $self->{confident} = 1;
    $return = $self->parse_char_string ($s, @args);
  };
  return $return;
} # parse_byte_string
404
405 ## NOTE: The HTML5 spec says that the encoding layer MUST NOT strip the
406 ## BOM and the HTML layer MUST ignore it. However, we do strip the BOM in
407 ## the encoding layer, and the HTML layer does not ignore any U+FEFF,
408 ## because the core part of our HTML parser expects a string of characters,
409 ## not a string of bytes or code units or anything that might contain a BOM.
410 ## Therefore, any parser interface that accepts a string of bytes,
411 ## such as |parse_byte_string| in this module, must ensure that it
412 ## strips the BOM and never strips any ZWNBSP.
413
414 *parse_char_string = \&parse_string;
415
## Parses a character string into a document.
##
##   $doc = $parser_or_class->parse_string ($string_or_ref, $doc, $onerror);
##
## - Invocant: a parser object, or a class name (a parser is then
##   created with |new|).
## - $string_or_ref: the character string, or a reference to it.
## - $doc: the document object to fill.  Its child node list is emptied
##   before parsing starts.
## - $onerror (optional): code reference called with named arguments
##   (|type|, |line|, |column|, ...) for every parse error.  The default
##   handler prints a warning.
##
## Returns the document object.
sub parse_string ($$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $s = ref $_[0] ? $_[0] : \($_[0]);
  $self->{document} = $_[1];
  @{$self->{document}->child_nodes} = ();

  ## NOTE: |set_inner_html| copies most of this method's code

  $self->{confident} = 1 unless exists $self->{confident};
  $self->{document}->input_encoding ($self->{input_encoding})
      if defined $self->{input_encoding};

  ## Character reader: each call moves the previous |next_char| into
  ## the |prev_char| history, reads one character from $$s and keeps the
  ## line/column counters up to date.
  my $i = 0;
  $self->{line_prev} = $self->{line} = 1;
  $self->{column_prev} = $self->{column} = 0;
  $self->{set_next_char} = sub {
    my $self = shift;

    pop @{$self->{prev_char}};
    unshift @{$self->{prev_char}}, $self->{next_char};

    ## End of input is reported as -1.
    if ($i >= length $$s) {
      $self->{next_char} = -1;
      return;
    }
    $self->{next_char} = ord substr $$s, $i++, 1;

    ($self->{line_prev}, $self->{column_prev})
        = ($self->{line}, $self->{column});
    $self->{column}++;

    if ($self->{next_char} == 0x000A) { # LF
      !!!cp ('j1');
      $self->{line}++;
      $self->{column} = 0;
    } elsif ($self->{next_char} == 0x000D) { # CR
      !!!cp ('j2');
      ## CR LF and a lone CR are both reported as a single LF.
      $i++ if substr ($$s, $i, 1) eq "\x0A";
      $self->{next_char} = 0x000A; # LF # MUST
      $self->{line}++;
      $self->{column} = 0;
    } elsif ($self->{next_char} > 0x10FFFF) {
      !!!cp ('j3');
      $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($self->{next_char} == 0x0000) { # NULL
      !!!cp ('j4');
      !!!parse-error (type => 'NULL');
      $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($self->{next_char} <= 0x0008 or
             (0x000E <= $self->{next_char} and $self->{next_char} <= 0x001F) or
             (0x007F <= $self->{next_char} and $self->{next_char} <= 0x009F) or
             (0xD800 <= $self->{next_char} and $self->{next_char} <= 0xDFFF) or
             (0xFDD0 <= $self->{next_char} and $self->{next_char} <= 0xFDDF) or
             ## U+xxFFFE and U+xxFFFF of every plane.  Code points above
             ## U+10FFFF were already handled, so testing the low 16 bits
             ## covers exactly U+FFFE, U+FFFF, U+1FFFE, ..., U+10FFFF
             ## without building a lookup table for each character.
             ($self->{next_char} & 0xFFFE) == 0xFFFE) {
      !!!cp ('j5');
      !!!parse-error (type => 'control char', level => $self->{must_level});
      ## TODO: error type documentation
    }
  };
  $self->{prev_char} = [-1, -1, -1];
  $self->{next_char} = -1;

  ## Default error handler: warn with the position of the token if one
  ## is supplied, or with the reported line and column otherwise.
  my $onerror = $_[2] || sub {
    my (%opt) = @_;
    my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
    my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
    warn "Parse error ($opt{type}) at line $line column $column\n";
  };
  ## The current position is passed first so that explicit |line| and
  ## |column| arguments from the caller take precedence.
  $self->{parse_error} = sub {
    $onerror->(line => $self->{line}, column => $self->{column}, @_);
  };

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  ## The closure above refers to $self; drop it to break the cycle.
  delete $self->{parse_error}; # remove loop

  return $self->{document};
} # parse_string
504
## Creates a new parser object with placeholder callbacks.
##
##   $parser = Whatpm::HTML->new;
##
## The object starts with four code references:
## - |set_next_char|: sets |next_char| to -1.
## - |parse_error|, |change_encoding|, |application_cache_selection|:
##   do nothing.
sub new ($) {
  my $class = shift;
  my $self = bless {}, $class;

  ## The default |set_next_char| has to reach the object it is stored
  ## in.  Capturing $self directly would create a reference cycle and
  ## keep the parser alive forever, so the closure holds a weakened
  ## copy instead.
  require Scalar::Util;
  my $weak_self = $self;
  Scalar::Util::weaken ($weak_self);

  $self->{set_next_char} = sub {
    $weak_self->{next_char} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new
525
## Content model feature bits.
sub CM_ENTITY         () { 1 << 0 } # & markup in data
sub CM_LIMITED_MARKUP () { 1 << 1 } # < markup in data (limited)
sub CM_FULL_MARKUP    () { 1 << 2 } # < markup in data (any)

## Content models, expressed as combinations of the feature bits.
sub PLAINTEXT_CONTENT_MODEL () { 0 }
sub CDATA_CONTENT_MODEL     () { CM_LIMITED_MARKUP }
sub RCDATA_CONTENT_MODEL    () { CM_LIMITED_MARKUP | CM_ENTITY }
sub PCDATA_CONTENT_MODEL    () { CM_FULL_MARKUP | CM_ENTITY }
534
## Tokenizer states, numbered consecutively from 0.
use constant {
  DATA_STATE                                    => 0,
  ENTITY_DATA_STATE                             => 1,
  TAG_OPEN_STATE                                => 2,
  CLOSE_TAG_OPEN_STATE                          => 3,
  TAG_NAME_STATE                                => 4,
  BEFORE_ATTRIBUTE_NAME_STATE                   => 5,
  ATTRIBUTE_NAME_STATE                          => 6,
  AFTER_ATTRIBUTE_NAME_STATE                    => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE                  => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE                => 11,
  ENTITY_IN_ATTRIBUTE_VALUE_STATE               => 12,
  MARKUP_DECLARATION_OPEN_STATE                 => 13,
  COMMENT_START_STATE                           => 14,
  COMMENT_START_DASH_STATE                      => 15,
  COMMENT_STATE                                 => 16,
  COMMENT_END_STATE                             => 17,
  COMMENT_END_DASH_STATE                        => 18,
  BOGUS_COMMENT_STATE                           => 19,
  DOCTYPE_STATE                                 => 20,
  BEFORE_DOCTYPE_NAME_STATE                     => 21,
  DOCTYPE_NAME_STATE                            => 22,
  AFTER_DOCTYPE_NAME_STATE                      => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         => 31,
  BOGUS_DOCTYPE_STATE                           => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE            => 33,
  SELF_CLOSING_START_TAG_STATE                  => 34,
  CDATA_BLOCK_STATE                             => 35,
};
571
## Token types, stored in a token's |type| member.
use constant {
  DOCTYPE_TOKEN     => 1,
  COMMENT_TOKEN     => 2,
  START_TAG_TOKEN   => 3,
  END_TAG_TOKEN     => 4,
  END_OF_FILE_TOKEN => 5,
  CHARACTER_TOKEN   => 6,
};
578
## Insertion mode group bits.  The two lowest bits are left free; the
## insertion modes below use them to tell the members of a group apart.
sub AFTER_HTML_IMS        () { 1 << 2 }
sub HEAD_IMS              () { 1 << 3 }
sub BODY_IMS              () { 1 << 4 }
sub BODY_TABLE_IMS        () { 1 << 5 }
sub TABLE_IMS             () { 1 << 6 }
sub ROW_IMS               () { 1 << 7 }
sub BODY_AFTER_IMS        () { 1 << 8 }
sub FRAME_IMS             () { 1 << 9 }
sub SELECT_IMS            () { 1 << 10 }
sub IN_FOREIGN_CONTENT_IM () { 1 << 11 }
  ## NOTE: "in foreign content" insertion mode is special; it is combined
  ## with the secondary insertion mode.  In this parser, they are stored
  ## together in the bit-or'ed form.

## NOTE: "initial" and "before html" insertion modes have no constants.

## NOTE: "after after body" insertion mode.
sub AFTER_HTML_BODY_IM () { AFTER_HTML_IMS | BODY_AFTER_IMS }

## NOTE: "after after frameset" insertion mode.
sub AFTER_HTML_FRAMESET_IM () { AFTER_HTML_IMS | FRAME_IMS }

## Insertion modes: group bits plus a number in the two lowest bits.
sub IN_HEAD_IM            () { HEAD_IMS | 0 }
sub IN_HEAD_NOSCRIPT_IM   () { HEAD_IMS | 1 }
sub AFTER_HEAD_IM         () { HEAD_IMS | 2 }
sub BEFORE_HEAD_IM        () { HEAD_IMS | 3 }
sub IN_BODY_IM            () { BODY_IMS }
sub IN_CELL_IM            () { BODY_IMS | BODY_TABLE_IMS | 1 }
sub IN_CAPTION_IM         () { BODY_IMS | BODY_TABLE_IMS | 2 }
sub IN_ROW_IM             () { TABLE_IMS | ROW_IMS | 1 }
sub IN_TABLE_BODY_IM      () { TABLE_IMS | ROW_IMS | 2 }
sub IN_TABLE_IM           () { TABLE_IMS }
sub AFTER_BODY_IM         () { BODY_AFTER_IMS }
sub IN_FRAMESET_IM        () { FRAME_IMS | 1 }
sub AFTER_FRAMESET_IM     () { FRAME_IMS | 2 }
sub IN_SELECT_IM          () { SELECT_IMS | 1 }
sub IN_SELECT_IN_TABLE_IM () { SELECT_IMS | 2 }
sub IN_COLUMN_GROUP_IM    () { 2 }
617
618 ## Implementations MUST act as if state machine in the spec
619
## Resets the tokenizer part of the parser object and reads the first
## input character.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## Start in the data state with the PCDATA content model.
  $self->{state} = DATA_STATE;
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## These members are kept in the object but start out undefined.
  ## |current_token| holds a start tag, end tag, comment, or DOCTYPE.
  $self->{$_} = undef
      for qw(current_token current_attribute
             last_emitted_start_tag_name last_attribute_value_state);
  delete $self->{self_closing};

  ## |char| is set up before the first character is read; this sets
  ## |$self->{next_char}|.
  $self->{char} = [];
  !!!next-input-character;

  $self->{token} = [];
  ## NOTE: |$self->{escape}| is not touched here.
} # _initialize_tokenizer
635
636 ## A token has:
637 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
638 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
639 ## ->{name} (DOCTYPE_TOKEN)
640 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
641 ## ->{public_identifier} (DOCTYPE_TOKEN)
642 ## ->{system_identifier} (DOCTYPE_TOKEN)
643 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
644 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
645 ## ->{name}
646 ## ->{value}
647 ## ->{has_reference} == 1 or 0
648 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
649 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
650 ## The token's |->{self_closing}| is used to save the value of
651 ## |$self->{self_closing}| while the token is pushed back to the stack.
652
653 ## ISSUE: "When a DOCTYPE token is created, its
654 ## <i>self-closing flag</i> must be unset (its other state is that it
655 ## be set), and its attributes list must be empty.": Wrong subject?
656
657 ## Emitted token MUST immediately be handled by the tree construction state.
658
659 ## Before each step, UA MAY check to see if either one of the scripts in
660 ## "list of scripts that will execute as soon as possible" or the first
661 ## script in the "list of scripts that will execute asynchronously",
662 ## has completed loading. If one has, then it MUST be executed
663 ## and removed from the list.
664
665 ## NOTE: HTML5 "Writing HTML documents" section, applied to
666 ## documents and not to user agents and conformance checkers,
667 ## contains some requirements that are not detected by the
668 ## parsing algorithm:
669 ## - Some requirements on character encoding declarations. ## TODO
670 ## - "Elements MUST NOT contain content that their content model disallows."
671 ## ... Some are parse error, some are not (will be reported by c.c.).
672 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
673 ## - Text (in elements, attributes, and comments) SHOULD NOT contain
674 ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
675
676 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
677 ## be detected by the HTML5 parsing algorithm:
678 ## - Text,
679
680 sub _get_next_token ($) {
681 my $self = shift;
682
683 if ($self->{self_closing}) {
684 !!!parse-error (type => 'nestc', token => $self->{current_token});
685 ## NOTE: The |self_closing| flag is only set by start tag token.
686 ## In addition, when a start tag token is emitted, it is always set to
687 ## |current_token|.
688 delete $self->{self_closing};
689 }
690
691 if (@{$self->{token}}) {
692 $self->{self_closing} = $self->{token}->[0]->{self_closing};
693 return shift @{$self->{token}};
694 }
695
696 A: {
697 if ($self->{state} == DATA_STATE) {
698 if ($self->{next_char} == 0x0026) { # &
699 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
700 not $self->{escape}) {
701 !!!cp (1);
702 $self->{state} = ENTITY_DATA_STATE;
703 !!!next-input-character;
704 redo A;
705 } else {
706 !!!cp (2);
707 #
708 }
709 } elsif ($self->{next_char} == 0x002D) { # -
710 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
711 unless ($self->{escape}) {
712 if ($self->{prev_char}->[0] == 0x002D and # -
713 $self->{prev_char}->[1] == 0x0021 and # !
714 $self->{prev_char}->[2] == 0x003C) { # <
715 !!!cp (3);
716 $self->{escape} = 1;
717 } else {
718 !!!cp (4);
719 }
720 } else {
721 !!!cp (5);
722 }
723 }
724
725 #
726 } elsif ($self->{next_char} == 0x003C) { # <
727 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
728 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
729 not $self->{escape})) {
730 !!!cp (6);
731 $self->{state} = TAG_OPEN_STATE;
732 !!!next-input-character;
733 redo A;
734 } else {
735 !!!cp (7);
736 #
737 }
738 } elsif ($self->{next_char} == 0x003E) { # >
739 if ($self->{escape} and
740 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
741 if ($self->{prev_char}->[0] == 0x002D and # -
742 $self->{prev_char}->[1] == 0x002D) { # -
743 !!!cp (8);
744 delete $self->{escape};
745 } else {
746 !!!cp (9);
747 }
748 } else {
749 !!!cp (10);
750 }
751
752 #
753 } elsif ($self->{next_char} == -1) {
754 !!!cp (11);
755 !!!emit ({type => END_OF_FILE_TOKEN,
756 line => $self->{line}, column => $self->{column}});
757 last A; ## TODO: ok?
758 } else {
759 !!!cp (12);
760 }
761 # Anything else
762 my $token = {type => CHARACTER_TOKEN,
763 data => chr $self->{next_char},
764 line => $self->{line}, column => $self->{column},
765 };
766 ## Stay in the data state
767 !!!next-input-character;
768
769 !!!emit ($token);
770
771 redo A;
772 } elsif ($self->{state} == ENTITY_DATA_STATE) {
773 ## (cannot happen in CDATA state)
774
775 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
776
777 my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
778
779 $self->{state} = DATA_STATE;
780 # next-input-character is already done
781
782 unless (defined $token) {
783 !!!cp (13);
784 !!!emit ({type => CHARACTER_TOKEN, data => '&',
785 line => $l, column => $c,
786 });
787 } else {
788 !!!cp (14);
789 !!!emit ($token);
790 }
791
792 redo A;
793 } elsif ($self->{state} == TAG_OPEN_STATE) {
794 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
795 if ($self->{next_char} == 0x002F) { # /
796 !!!cp (15);
797 !!!next-input-character;
798 $self->{state} = CLOSE_TAG_OPEN_STATE;
799 redo A;
800 } else {
801 !!!cp (16);
802 ## reconsume
803 $self->{state} = DATA_STATE;
804
805 !!!emit ({type => CHARACTER_TOKEN, data => '<',
806 line => $self->{line_prev},
807 column => $self->{column_prev},
808 });
809
810 redo A;
811 }
812 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
813 if ($self->{next_char} == 0x0021) { # !
814 !!!cp (17);
815 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
816 !!!next-input-character;
817 redo A;
818 } elsif ($self->{next_char} == 0x002F) { # /
819 !!!cp (18);
820 $self->{state} = CLOSE_TAG_OPEN_STATE;
821 !!!next-input-character;
822 redo A;
823 } elsif (0x0041 <= $self->{next_char} and
824 $self->{next_char} <= 0x005A) { # A..Z
825 !!!cp (19);
826 $self->{current_token}
827 = {type => START_TAG_TOKEN,
828 tag_name => chr ($self->{next_char} + 0x0020),
829 line => $self->{line_prev},
830 column => $self->{column_prev}};
831 $self->{state} = TAG_NAME_STATE;
832 !!!next-input-character;
833 redo A;
834 } elsif (0x0061 <= $self->{next_char} and
835 $self->{next_char} <= 0x007A) { # a..z
836 !!!cp (20);
837 $self->{current_token} = {type => START_TAG_TOKEN,
838 tag_name => chr ($self->{next_char}),
839 line => $self->{line_prev},
840 column => $self->{column_prev}};
841 $self->{state} = TAG_NAME_STATE;
842 !!!next-input-character;
843 redo A;
844 } elsif ($self->{next_char} == 0x003E) { # >
845 !!!cp (21);
846 !!!parse-error (type => 'empty start tag',
847 line => $self->{line_prev},
848 column => $self->{column_prev});
849 $self->{state} = DATA_STATE;
850 !!!next-input-character;
851
852 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
853 line => $self->{line_prev},
854 column => $self->{column_prev},
855 });
856
857 redo A;
858 } elsif ($self->{next_char} == 0x003F) { # ?
859 !!!cp (22);
860 !!!parse-error (type => 'pio',
861 line => $self->{line_prev},
862 column => $self->{column_prev});
863 $self->{state} = BOGUS_COMMENT_STATE;
864 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
865 line => $self->{line_prev},
866 column => $self->{column_prev},
867 };
868 ## $self->{next_char} is intentionally left as is
869 redo A;
870 } else {
871 !!!cp (23);
872 !!!parse-error (type => 'bare stago');
873 $self->{state} = DATA_STATE;
874 ## reconsume
875
876 !!!emit ({type => CHARACTER_TOKEN, data => '<',
877 line => $self->{line_prev},
878 column => $self->{column_prev},
879 });
880
881 redo A;
882 }
883 } else {
884 die "$0: $self->{content_model} in tag open";
885 }
886 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
887 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
888 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
889 if (defined $self->{last_emitted_start_tag_name}) {
890
891 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
892 my @next_char;
893 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
894 push @next_char, $self->{next_char};
895 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
896 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
897 if ($self->{next_char} == $c or $self->{next_char} == $C) {
898 !!!cp (24);
899 !!!next-input-character;
900 next TAGNAME;
901 } else {
902 !!!cp (25);
903 $self->{next_char} = shift @next_char; # reconsume
904 !!!back-next-input-character (@next_char);
905 $self->{state} = DATA_STATE;
906
907 !!!emit ({type => CHARACTER_TOKEN, data => '</',
908 line => $l, column => $c,
909 });
910
911 redo A;
912 }
913 }
914 push @next_char, $self->{next_char};
915
916 unless ($self->{next_char} == 0x0009 or # HT
917 $self->{next_char} == 0x000A or # LF
918 $self->{next_char} == 0x000B or # VT
919 $self->{next_char} == 0x000C or # FF
920 $self->{next_char} == 0x0020 or # SP
921 $self->{next_char} == 0x003E or # >
922 $self->{next_char} == 0x002F or # /
923 $self->{next_char} == -1) {
924 !!!cp (26);
925 $self->{next_char} = shift @next_char; # reconsume
926 !!!back-next-input-character (@next_char);
927 $self->{state} = DATA_STATE;
928 !!!emit ({type => CHARACTER_TOKEN, data => '</',
929 line => $l, column => $c,
930 });
931 redo A;
932 } else {
933 !!!cp (27);
934 $self->{next_char} = shift @next_char;
935 !!!back-next-input-character (@next_char);
936 # and consume...
937 }
938 } else {
939 ## No start tag token has ever been emitted
940 !!!cp (28);
941 # next-input-character is already done
942 $self->{state} = DATA_STATE;
943 !!!emit ({type => CHARACTER_TOKEN, data => '</',
944 line => $l, column => $c,
945 });
946 redo A;
947 }
948 }
949
950 if (0x0041 <= $self->{next_char} and
951 $self->{next_char} <= 0x005A) { # A..Z
952 !!!cp (29);
953 $self->{current_token}
954 = {type => END_TAG_TOKEN,
955 tag_name => chr ($self->{next_char} + 0x0020),
956 line => $l, column => $c};
957 $self->{state} = TAG_NAME_STATE;
958 !!!next-input-character;
959 redo A;
960 } elsif (0x0061 <= $self->{next_char} and
961 $self->{next_char} <= 0x007A) { # a..z
962 !!!cp (30);
963 $self->{current_token} = {type => END_TAG_TOKEN,
964 tag_name => chr ($self->{next_char}),
965 line => $l, column => $c};
966 $self->{state} = TAG_NAME_STATE;
967 !!!next-input-character;
968 redo A;
969 } elsif ($self->{next_char} == 0x003E) { # >
970 !!!cp (31);
971 !!!parse-error (type => 'empty end tag',
972 line => $self->{line_prev}, ## "<" in "</>"
973 column => $self->{column_prev} - 1);
974 $self->{state} = DATA_STATE;
975 !!!next-input-character;
976 redo A;
977 } elsif ($self->{next_char} == -1) {
978 !!!cp (32);
979 !!!parse-error (type => 'bare etago');
980 $self->{state} = DATA_STATE;
981 # reconsume
982
983 !!!emit ({type => CHARACTER_TOKEN, data => '</',
984 line => $l, column => $c,
985 });
986
987 redo A;
988 } else {
989 !!!cp (33);
990 !!!parse-error (type => 'bogus end tag');
991 $self->{state} = BOGUS_COMMENT_STATE;
992 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
993 line => $self->{line_prev}, # "<" of "</"
994 column => $self->{column_prev} - 1,
995 };
996 ## $self->{next_char} is intentionally left as is
997 redo A;
998 }
999 } elsif ($self->{state} == TAG_NAME_STATE) {
1000 if ($self->{next_char} == 0x0009 or # HT
1001 $self->{next_char} == 0x000A or # LF
1002 $self->{next_char} == 0x000B or # VT
1003 $self->{next_char} == 0x000C or # FF
1004 $self->{next_char} == 0x0020) { # SP
1005 !!!cp (34);
1006 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1007 !!!next-input-character;
1008 redo A;
1009 } elsif ($self->{next_char} == 0x003E) { # >
1010 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1011 !!!cp (35);
1012 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1013 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1014 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1015 #if ($self->{current_token}->{attributes}) {
1016 # ## NOTE: This should never be reached.
1017 # !!! cp (36);
1018 # !!! parse-error (type => 'end tag attribute');
1019 #} else {
1020 !!!cp (37);
1021 #}
1022 } else {
1023 die "$0: $self->{current_token}->{type}: Unknown token type";
1024 }
1025 $self->{state} = DATA_STATE;
1026 !!!next-input-character;
1027
1028 !!!emit ($self->{current_token}); # start tag or end tag
1029
1030 redo A;
1031 } elsif (0x0041 <= $self->{next_char} and
1032 $self->{next_char} <= 0x005A) { # A..Z
1033 !!!cp (38);
1034 $self->{current_token}->{tag_name} .= chr ($self->{next_char} + 0x0020);
1035 # start tag or end tag
1036 ## Stay in this state
1037 !!!next-input-character;
1038 redo A;
1039 } elsif ($self->{next_char} == -1) {
1040 !!!parse-error (type => 'unclosed tag');
1041 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1042 !!!cp (39);
1043 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1044 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1045 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1046 #if ($self->{current_token}->{attributes}) {
1047 # ## NOTE: This state should never be reached.
1048 # !!! cp (40);
1049 # !!! parse-error (type => 'end tag attribute');
1050 #} else {
1051 !!!cp (41);
1052 #}
1053 } else {
1054 die "$0: $self->{current_token}->{type}: Unknown token type";
1055 }
1056 $self->{state} = DATA_STATE;
1057 # reconsume
1058
1059 !!!emit ($self->{current_token}); # start tag or end tag
1060
1061 redo A;
1062 } elsif ($self->{next_char} == 0x002F) { # /
1063 !!!cp (42);
1064 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1065 !!!next-input-character;
1066 redo A;
1067 } else {
1068 !!!cp (44);
1069 $self->{current_token}->{tag_name} .= chr $self->{next_char};
1070 # start tag or end tag
1071 ## Stay in the state
1072 !!!next-input-character;
1073 redo A;
1074 }
1075 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1076 if ($self->{next_char} == 0x0009 or # HT
1077 $self->{next_char} == 0x000A or # LF
1078 $self->{next_char} == 0x000B or # VT
1079 $self->{next_char} == 0x000C or # FF
1080 $self->{next_char} == 0x0020) { # SP
1081 !!!cp (45);
1082 ## Stay in the state
1083 !!!next-input-character;
1084 redo A;
1085 } elsif ($self->{next_char} == 0x003E) { # >
1086 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1087 !!!cp (46);
1088 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1089 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1090 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1091 if ($self->{current_token}->{attributes}) {
1092 !!!cp (47);
1093 !!!parse-error (type => 'end tag attribute');
1094 } else {
1095 !!!cp (48);
1096 }
1097 } else {
1098 die "$0: $self->{current_token}->{type}: Unknown token type";
1099 }
1100 $self->{state} = DATA_STATE;
1101 !!!next-input-character;
1102
1103 !!!emit ($self->{current_token}); # start tag or end tag
1104
1105 redo A;
1106 } elsif (0x0041 <= $self->{next_char} and
1107 $self->{next_char} <= 0x005A) { # A..Z
1108 !!!cp (49);
1109 $self->{current_attribute}
1110 = {name => chr ($self->{next_char} + 0x0020),
1111 value => '',
1112 line => $self->{line}, column => $self->{column}};
1113 $self->{state} = ATTRIBUTE_NAME_STATE;
1114 !!!next-input-character;
1115 redo A;
1116 } elsif ($self->{next_char} == 0x002F) { # /
1117 !!!cp (50);
1118 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1119 !!!next-input-character;
1120 redo A;
1121 } elsif ($self->{next_char} == -1) {
1122 !!!parse-error (type => 'unclosed tag');
1123 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1124 !!!cp (52);
1125 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1126 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1127 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1128 if ($self->{current_token}->{attributes}) {
1129 !!!cp (53);
1130 !!!parse-error (type => 'end tag attribute');
1131 } else {
1132 !!!cp (54);
1133 }
1134 } else {
1135 die "$0: $self->{current_token}->{type}: Unknown token type";
1136 }
1137 $self->{state} = DATA_STATE;
1138 # reconsume
1139
1140 !!!emit ($self->{current_token}); # start tag or end tag
1141
1142 redo A;
1143 } else {
1144 if ({
1145 0x0022 => 1, # "
1146 0x0027 => 1, # '
1147 0x003D => 1, # =
1148 }->{$self->{next_char}}) {
1149 !!!cp (55);
1150 !!!parse-error (type => 'bad attribute name');
1151 } else {
1152 !!!cp (56);
1153 }
1154 $self->{current_attribute}
1155 = {name => chr ($self->{next_char}),
1156 value => '',
1157 line => $self->{line}, column => $self->{column}};
1158 $self->{state} = ATTRIBUTE_NAME_STATE;
1159 !!!next-input-character;
1160 redo A;
1161 }
1162 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1163 my $before_leave = sub {
1164 if (exists $self->{current_token}->{attributes} # start tag or end tag
1165 ->{$self->{current_attribute}->{name}}) { # MUST
1166 !!!cp (57);
1167 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name}, line => $self->{current_attribute}->{line}, column => $self->{current_attribute}->{column});
1168 ## Discard $self->{current_attribute} # MUST
1169 } else {
1170 !!!cp (58);
1171 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
1172 = $self->{current_attribute};
1173 }
1174 }; # $before_leave
1175
1176 if ($self->{next_char} == 0x0009 or # HT
1177 $self->{next_char} == 0x000A or # LF
1178 $self->{next_char} == 0x000B or # VT
1179 $self->{next_char} == 0x000C or # FF
1180 $self->{next_char} == 0x0020) { # SP
1181 !!!cp (59);
1182 $before_leave->();
1183 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1184 !!!next-input-character;
1185 redo A;
1186 } elsif ($self->{next_char} == 0x003D) { # =
1187 !!!cp (60);
1188 $before_leave->();
1189 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1190 !!!next-input-character;
1191 redo A;
1192 } elsif ($self->{next_char} == 0x003E) { # >
1193 $before_leave->();
1194 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1195 !!!cp (61);
1196 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1197 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1198 !!!cp (62);
1199 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1200 if ($self->{current_token}->{attributes}) {
1201 !!!parse-error (type => 'end tag attribute');
1202 }
1203 } else {
1204 die "$0: $self->{current_token}->{type}: Unknown token type";
1205 }
1206 $self->{state} = DATA_STATE;
1207 !!!next-input-character;
1208
1209 !!!emit ($self->{current_token}); # start tag or end tag
1210
1211 redo A;
1212 } elsif (0x0041 <= $self->{next_char} and
1213 $self->{next_char} <= 0x005A) { # A..Z
1214 !!!cp (63);
1215 $self->{current_attribute}->{name} .= chr ($self->{next_char} + 0x0020);
1216 ## Stay in the state
1217 !!!next-input-character;
1218 redo A;
1219 } elsif ($self->{next_char} == 0x002F) { # /
1220 !!!cp (64);
1221 $before_leave->();
1222 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1223 !!!next-input-character;
1224 redo A;
1225 } elsif ($self->{next_char} == -1) {
1226 !!!parse-error (type => 'unclosed tag');
1227 $before_leave->();
1228 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1229 !!!cp (66);
1230 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1231 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1232 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1233 if ($self->{current_token}->{attributes}) {
1234 !!!cp (67);
1235 !!!parse-error (type => 'end tag attribute');
1236 } else {
1237 ## NOTE: This state should never be reached.
1238 !!!cp (68);
1239 }
1240 } else {
1241 die "$0: $self->{current_token}->{type}: Unknown token type";
1242 }
1243 $self->{state} = DATA_STATE;
1244 # reconsume
1245
1246 !!!emit ($self->{current_token}); # start tag or end tag
1247
1248 redo A;
1249 } else {
1250 if ($self->{next_char} == 0x0022 or # "
1251 $self->{next_char} == 0x0027) { # '
1252 !!!cp (69);
1253 !!!parse-error (type => 'bad attribute name');
1254 } else {
1255 !!!cp (70);
1256 }
1257 $self->{current_attribute}->{name} .= chr ($self->{next_char});
1258 ## Stay in the state
1259 !!!next-input-character;
1260 redo A;
1261 }
1262 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1263 if ($self->{next_char} == 0x0009 or # HT
1264 $self->{next_char} == 0x000A or # LF
1265 $self->{next_char} == 0x000B or # VT
1266 $self->{next_char} == 0x000C or # FF
1267 $self->{next_char} == 0x0020) { # SP
1268 !!!cp (71);
1269 ## Stay in the state
1270 !!!next-input-character;
1271 redo A;
1272 } elsif ($self->{next_char} == 0x003D) { # =
1273 !!!cp (72);
1274 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1275 !!!next-input-character;
1276 redo A;
1277 } elsif ($self->{next_char} == 0x003E) { # >
1278 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1279 !!!cp (73);
1280 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1281 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1282 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1283 if ($self->{current_token}->{attributes}) {
1284 !!!cp (74);
1285 !!!parse-error (type => 'end tag attribute');
1286 } else {
1287 ## NOTE: This state should never be reached.
1288 !!!cp (75);
1289 }
1290 } else {
1291 die "$0: $self->{current_token}->{type}: Unknown token type";
1292 }
1293 $self->{state} = DATA_STATE;
1294 !!!next-input-character;
1295
1296 !!!emit ($self->{current_token}); # start tag or end tag
1297
1298 redo A;
1299 } elsif (0x0041 <= $self->{next_char} and
1300 $self->{next_char} <= 0x005A) { # A..Z
1301 !!!cp (76);
1302 $self->{current_attribute}
1303 = {name => chr ($self->{next_char} + 0x0020),
1304 value => '',
1305 line => $self->{line}, column => $self->{column}};
1306 $self->{state} = ATTRIBUTE_NAME_STATE;
1307 !!!next-input-character;
1308 redo A;
1309 } elsif ($self->{next_char} == 0x002F) { # /
1310 !!!cp (77);
1311 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1312 !!!next-input-character;
1313 redo A;
1314 } elsif ($self->{next_char} == -1) {
1315 !!!parse-error (type => 'unclosed tag');
1316 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1317 !!!cp (79);
1318 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1319 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1320 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1321 if ($self->{current_token}->{attributes}) {
1322 !!!cp (80);
1323 !!!parse-error (type => 'end tag attribute');
1324 } else {
1325 ## NOTE: This state should never be reached.
1326 !!!cp (81);
1327 }
1328 } else {
1329 die "$0: $self->{current_token}->{type}: Unknown token type";
1330 }
1331 $self->{state} = DATA_STATE;
1332 # reconsume
1333
1334 !!!emit ($self->{current_token}); # start tag or end tag
1335
1336 redo A;
1337 } else {
1338 !!!cp (82);
1339 $self->{current_attribute}
1340 = {name => chr ($self->{next_char}),
1341 value => '',
1342 line => $self->{line}, column => $self->{column}};
1343 $self->{state} = ATTRIBUTE_NAME_STATE;
1344 !!!next-input-character;
1345 redo A;
1346 }
1347 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1348 if ($self->{next_char} == 0x0009 or # HT
1349 $self->{next_char} == 0x000A or # LF
1350 $self->{next_char} == 0x000B or # VT
1351 $self->{next_char} == 0x000C or # FF
1352 $self->{next_char} == 0x0020) { # SP
1353 !!!cp (83);
1354 ## Stay in the state
1355 !!!next-input-character;
1356 redo A;
1357 } elsif ($self->{next_char} == 0x0022) { # "
1358 !!!cp (84);
1359 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1360 !!!next-input-character;
1361 redo A;
1362 } elsif ($self->{next_char} == 0x0026) { # &
1363 !!!cp (85);
1364 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1365 ## reconsume
1366 redo A;
1367 } elsif ($self->{next_char} == 0x0027) { # '
1368 !!!cp (86);
1369 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1370 !!!next-input-character;
1371 redo A;
1372 } elsif ($self->{next_char} == 0x003E) { # >
1373 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1374 !!!cp (87);
1375 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1376 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1377 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1378 if ($self->{current_token}->{attributes}) {
1379 !!!cp (88);
1380 !!!parse-error (type => 'end tag attribute');
1381 } else {
1382 ## NOTE: This state should never be reached.
1383 !!!cp (89);
1384 }
1385 } else {
1386 die "$0: $self->{current_token}->{type}: Unknown token type";
1387 }
1388 $self->{state} = DATA_STATE;
1389 !!!next-input-character;
1390
1391 !!!emit ($self->{current_token}); # start tag or end tag
1392
1393 redo A;
1394 } elsif ($self->{next_char} == -1) {
1395 !!!parse-error (type => 'unclosed tag');
1396 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1397 !!!cp (90);
1398 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1399 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1400 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1401 if ($self->{current_token}->{attributes}) {
1402 !!!cp (91);
1403 !!!parse-error (type => 'end tag attribute');
1404 } else {
1405 ## NOTE: This state should never be reached.
1406 !!!cp (92);
1407 }
1408 } else {
1409 die "$0: $self->{current_token}->{type}: Unknown token type";
1410 }
1411 $self->{state} = DATA_STATE;
1412 ## reconsume
1413
1414 !!!emit ($self->{current_token}); # start tag or end tag
1415
1416 redo A;
1417 } else {
1418 if ($self->{next_char} == 0x003D) { # =
1419 !!!cp (93);
1420 !!!parse-error (type => 'bad attribute value');
1421 } else {
1422 !!!cp (94);
1423 }
1424 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1425 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1426 !!!next-input-character;
1427 redo A;
1428 }
1429 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1430 if ($self->{next_char} == 0x0022) { # "
1431 !!!cp (95);
1432 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1433 !!!next-input-character;
1434 redo A;
1435 } elsif ($self->{next_char} == 0x0026) { # &
1436 !!!cp (96);
1437 $self->{last_attribute_value_state} = $self->{state};
1438 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1439 !!!next-input-character;
1440 redo A;
1441 } elsif ($self->{next_char} == -1) {
1442 !!!parse-error (type => 'unclosed attribute value');
1443 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1444 !!!cp (97);
1445 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1446 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1447 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1448 if ($self->{current_token}->{attributes}) {
1449 !!!cp (98);
1450 !!!parse-error (type => 'end tag attribute');
1451 } else {
1452 ## NOTE: This state should never be reached.
1453 !!!cp (99);
1454 }
1455 } else {
1456 die "$0: $self->{current_token}->{type}: Unknown token type";
1457 }
1458 $self->{state} = DATA_STATE;
1459 ## reconsume
1460
1461 !!!emit ($self->{current_token}); # start tag or end tag
1462
1463 redo A;
1464 } else {
1465 !!!cp (100);
1466 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1467 ## Stay in the state
1468 !!!next-input-character;
1469 redo A;
1470 }
1471 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1472 if ($self->{next_char} == 0x0027) { # '
1473 !!!cp (101);
1474 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1475 !!!next-input-character;
1476 redo A;
1477 } elsif ($self->{next_char} == 0x0026) { # &
1478 !!!cp (102);
1479 $self->{last_attribute_value_state} = $self->{state};
1480 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1481 !!!next-input-character;
1482 redo A;
1483 } elsif ($self->{next_char} == -1) {
1484 !!!parse-error (type => 'unclosed attribute value');
1485 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1486 !!!cp (103);
1487 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1488 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1489 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1490 if ($self->{current_token}->{attributes}) {
1491 !!!cp (104);
1492 !!!parse-error (type => 'end tag attribute');
1493 } else {
1494 ## NOTE: This state should never be reached.
1495 !!!cp (105);
1496 }
1497 } else {
1498 die "$0: $self->{current_token}->{type}: Unknown token type";
1499 }
1500 $self->{state} = DATA_STATE;
1501 ## reconsume
1502
1503 !!!emit ($self->{current_token}); # start tag or end tag
1504
1505 redo A;
1506 } else {
1507 !!!cp (106);
1508 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1509 ## Stay in the state
1510 !!!next-input-character;
1511 redo A;
1512 }
1513 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1514 if ($self->{next_char} == 0x0009 or # HT
1515 $self->{next_char} == 0x000A or # LF
1516 $self->{next_char} == 0x000B or # VT
1517 $self->{next_char} == 0x000C or # FF
1518 $self->{next_char} == 0x0020) { # SP
1519 !!!cp (107);
1520 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1521 !!!next-input-character;
1522 redo A;
1523 } elsif ($self->{next_char} == 0x0026) { # &
1524 !!!cp (108);
1525 $self->{last_attribute_value_state} = $self->{state};
1526 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1527 !!!next-input-character;
1528 redo A;
1529 } elsif ($self->{next_char} == 0x003E) { # >
1530 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1531 !!!cp (109);
1532 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1533 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1534 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1535 if ($self->{current_token}->{attributes}) {
1536 !!!cp (110);
1537 !!!parse-error (type => 'end tag attribute');
1538 } else {
1539 ## NOTE: This state should never be reached.
1540 !!!cp (111);
1541 }
1542 } else {
1543 die "$0: $self->{current_token}->{type}: Unknown token type";
1544 }
1545 $self->{state} = DATA_STATE;
1546 !!!next-input-character;
1547
1548 !!!emit ($self->{current_token}); # start tag or end tag
1549
1550 redo A;
1551 } elsif ($self->{next_char} == -1) {
1552 !!!parse-error (type => 'unclosed tag');
1553 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1554 !!!cp (112);
1555 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1556 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1557 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1558 if ($self->{current_token}->{attributes}) {
1559 !!!cp (113);
1560 !!!parse-error (type => 'end tag attribute');
1561 } else {
1562 ## NOTE: This state should never be reached.
1563 !!!cp (114);
1564 }
1565 } else {
1566 die "$0: $self->{current_token}->{type}: Unknown token type";
1567 }
1568 $self->{state} = DATA_STATE;
1569 ## reconsume
1570
1571 !!!emit ($self->{current_token}); # start tag or end tag
1572
1573 redo A;
1574 } else {
1575 if ({
1576 0x0022 => 1, # "
1577 0x0027 => 1, # '
1578 0x003D => 1, # =
1579 }->{$self->{next_char}}) {
1580 !!!cp (115);
1581 !!!parse-error (type => 'bad attribute value');
1582 } else {
1583 !!!cp (116);
1584 }
1585 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1586 ## Stay in the state
1587 !!!next-input-character;
1588 redo A;
1589 }
1590 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1591 my $token = $self->_tokenize_attempt_to_consume_an_entity
1592 (1,
1593 $self->{last_attribute_value_state}
1594 == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
1595 $self->{last_attribute_value_state}
1596 == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
1597 -1);
1598
1599 unless (defined $token) {
1600 !!!cp (117);
1601 $self->{current_attribute}->{value} .= '&';
1602 } else {
1603 !!!cp (118);
1604 $self->{current_attribute}->{value} .= $token->{data};
1605 $self->{current_attribute}->{has_reference} = $token->{has_reference};
1606 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1607 }
1608
1609 $self->{state} = $self->{last_attribute_value_state};
1610 # next-input-character is already done
1611 redo A;
1612 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1613 if ($self->{next_char} == 0x0009 or # HT
1614 $self->{next_char} == 0x000A or # LF
1615 $self->{next_char} == 0x000B or # VT
1616 $self->{next_char} == 0x000C or # FF
1617 $self->{next_char} == 0x0020) { # SP
1618 !!!cp (118);
1619 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1620 !!!next-input-character;
1621 redo A;
1622 } elsif ($self->{next_char} == 0x003E) { # >
1623 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1624 !!!cp (119);
1625 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1626 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1627 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1628 if ($self->{current_token}->{attributes}) {
1629 !!!cp (120);
1630 !!!parse-error (type => 'end tag attribute');
1631 } else {
1632 ## NOTE: This state should never be reached.
1633 !!!cp (121);
1634 }
1635 } else {
1636 die "$0: $self->{current_token}->{type}: Unknown token type";
1637 }
1638 $self->{state} = DATA_STATE;
1639 !!!next-input-character;
1640
1641 !!!emit ($self->{current_token}); # start tag or end tag
1642
1643 redo A;
1644 } elsif ($self->{next_char} == 0x002F) { # /
1645 !!!cp (122);
1646 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1647 !!!next-input-character;
1648 redo A;
1649 } else {
1650 !!!cp ('124.1');
1651 !!!parse-error (type => 'no space between attributes');
1652 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1653 ## reconsume
1654 redo A;
1655 }
1656 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1657 if ($self->{next_char} == 0x003E) { # >
1658 if ($self->{current_token}->{type} == END_TAG_TOKEN) {
1659 !!!cp ('124.2');
1660 !!!parse-error (type => 'nestc', token => $self->{current_token});
1661 ## TODO: Different type than slash in start tag
1662 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1663 if ($self->{current_token}->{attributes}) {
1664 !!!cp ('124.4');
1665 !!!parse-error (type => 'end tag attribute');
1666 } else {
1667 !!!cp ('124.5');
1668 }
1669 ## TODO: Test |<title></title/>|
1670 } else {
1671 !!!cp ('124.3');
1672 $self->{self_closing} = 1;
1673 }
1674
1675 $self->{state} = DATA_STATE;
1676 !!!next-input-character;
1677
1678 !!!emit ($self->{current_token}); # start tag or end tag
1679
1680 redo A;
1681 } else {
1682 !!!cp ('124.4');
1683 !!!parse-error (type => 'nestc');
1684 ## TODO: This error type is wrong.
1685 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1686 ## Reconsume.
1687 redo A;
1688 }
1689 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1690 ## (only happens in the PCDATA state)
1691
1692 ## NOTE: Set by the previous state
1693 #my $token = {type => COMMENT_TOKEN, data => ''};
1694
1695 BC: {
1696 if ($self->{next_char} == 0x003E) { # >
1697 !!!cp (124);
1698 $self->{state} = DATA_STATE;
1699 !!!next-input-character;
1700
1701 !!!emit ($self->{current_token}); # comment
1702
1703 redo A;
1704 } elsif ($self->{next_char} == -1) {
1705 !!!cp (125);
1706 $self->{state} = DATA_STATE;
1707 ## reconsume
1708
1709 !!!emit ($self->{current_token}); # comment
1710
1711 redo A;
1712 } else {
1713 !!!cp (126);
1714 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
1715 !!!next-input-character;
1716 redo BC;
1717 }
1718 } # BC
1719
1720 die "$0: _get_next_token: unexpected case [BC]";
1721 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1722 ## (only happens in the PCDATA state)
1723
1724 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1);
1725
1726 my @next_char;
1727 push @next_char, $self->{next_char};
1728
1729 if ($self->{next_char} == 0x002D) { # -
1730 !!!next-input-character;
1731 push @next_char, $self->{next_char};
1732 if ($self->{next_char} == 0x002D) { # -
1733 !!!cp (127);
1734 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1735 line => $l, column => $c,
1736 };
1737 $self->{state} = COMMENT_START_STATE;
1738 !!!next-input-character;
1739 redo A;
1740 } else {
1741 !!!cp (128);
1742 }
1743 } elsif ($self->{next_char} == 0x0044 or # D
1744 $self->{next_char} == 0x0064) { # d
1745 !!!next-input-character;
1746 push @next_char, $self->{next_char};
1747 if ($self->{next_char} == 0x004F or # O
1748 $self->{next_char} == 0x006F) { # o
1749 !!!next-input-character;
1750 push @next_char, $self->{next_char};
1751 if ($self->{next_char} == 0x0043 or # C
1752 $self->{next_char} == 0x0063) { # c
1753 !!!next-input-character;
1754 push @next_char, $self->{next_char};
1755 if ($self->{next_char} == 0x0054 or # T
1756 $self->{next_char} == 0x0074) { # t
1757 !!!next-input-character;
1758 push @next_char, $self->{next_char};
1759 if ($self->{next_char} == 0x0059 or # Y
1760 $self->{next_char} == 0x0079) { # y
1761 !!!next-input-character;
1762 push @next_char, $self->{next_char};
1763 if ($self->{next_char} == 0x0050 or # P
1764 $self->{next_char} == 0x0070) { # p
1765 !!!next-input-character;
1766 push @next_char, $self->{next_char};
1767 if ($self->{next_char} == 0x0045 or # E
1768 $self->{next_char} == 0x0065) { # e
1769 !!!cp (129);
1770 ## TODO: Simplify this deeply nested, character-by-character match of "DOCTYPE".
1771 $self->{state} = DOCTYPE_STATE;
1772 $self->{current_token} = {type => DOCTYPE_TOKEN,
1773 quirks => 1,
1774 line => $l, column => $c,
1775 };
1776 !!!next-input-character;
1777 redo A;
1778 } else {
1779 !!!cp (130);
1780 }
1781 } else {
1782 !!!cp (131);
1783 }
1784 } else {
1785 !!!cp (132);
1786 }
1787 } else {
1788 !!!cp (133);
1789 }
1790 } else {
1791 !!!cp (134);
1792 }
1793 } else {
1794 !!!cp (135);
1795 }
1796 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1797 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
1798 $self->{next_char} == 0x005B) { # [
1799 !!!next-input-character;
1800 push @next_char, $self->{next_char};
1801 if ($self->{next_char} == 0x0043) { # C
1802 !!!next-input-character;
1803 push @next_char, $self->{next_char};
1804 if ($self->{next_char} == 0x0044) { # D
1805 !!!next-input-character;
1806 push @next_char, $self->{next_char};
1807 if ($self->{next_char} == 0x0041) { # A
1808 !!!next-input-character;
1809 push @next_char, $self->{next_char};
1810 if ($self->{next_char} == 0x0054) { # T
1811 !!!next-input-character;
1812 push @next_char, $self->{next_char};
1813 if ($self->{next_char} == 0x0041) { # A
1814 !!!next-input-character;
1815 push @next_char, $self->{next_char};
1816 if ($self->{next_char} == 0x005B) { # [
1817 !!!cp (135.1);
1818 $self->{state} = CDATA_BLOCK_STATE;
1819 !!!next-input-character;
1820 redo A;
1821 } else {
1822 !!!cp (135.2);
1823 }
1824 } else {
1825 !!!cp (135.3);
1826 }
1827 } else {
1828 !!!cp (135.4);
1829 }
1830 } else {
1831 !!!cp (135.5);
1832 }
1833 } else {
1834 !!!cp (135.6);
1835 }
1836 } else {
1837 !!!cp (135.7);
1838 }
1839 } else {
1840 !!!cp (136);
1841 }
1842
1843 !!!parse-error (type => 'bogus comment');
1844 $self->{next_char} = shift @next_char;
1845 !!!back-next-input-character (@next_char);
1846 $self->{state} = BOGUS_COMMENT_STATE;
1847 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1848 line => $l, column => $c,
1849 };
1850 redo A;
1851
1852 ## ISSUE: typos in spec: chacacters, is is a parse error
1853 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1854 } elsif ($self->{state} == COMMENT_START_STATE) {
1855 if ($self->{next_char} == 0x002D) { # -
1856 !!!cp (137);
1857 $self->{state} = COMMENT_START_DASH_STATE;
1858 !!!next-input-character;
1859 redo A;
1860 } elsif ($self->{next_char} == 0x003E) { # >
1861 !!!cp (138);
1862 !!!parse-error (type => 'bogus comment');
1863 $self->{state} = DATA_STATE;
1864 !!!next-input-character;
1865
1866 !!!emit ($self->{current_token}); # comment
1867
1868 redo A;
1869 } elsif ($self->{next_char} == -1) {
1870 !!!cp (139);
1871 !!!parse-error (type => 'unclosed comment');
1872 $self->{state} = DATA_STATE;
1873 ## reconsume
1874
1875 !!!emit ($self->{current_token}); # comment
1876
1877 redo A;
1878 } else {
1879 !!!cp (140);
1880 $self->{current_token}->{data} # comment
1881 .= chr ($self->{next_char});
1882 $self->{state} = COMMENT_STATE;
1883 !!!next-input-character;
1884 redo A;
1885 }
1886 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1887 if ($self->{next_char} == 0x002D) { # -
1888 !!!cp (141);
1889 $self->{state} = COMMENT_END_STATE;
1890 !!!next-input-character;
1891 redo A;
1892 } elsif ($self->{next_char} == 0x003E) { # >
1893 !!!cp (142);
1894 !!!parse-error (type => 'bogus comment');
1895 $self->{state} = DATA_STATE;
1896 !!!next-input-character;
1897
1898 !!!emit ($self->{current_token}); # comment
1899
1900 redo A;
1901 } elsif ($self->{next_char} == -1) {
1902 !!!cp (143);
1903 !!!parse-error (type => 'unclosed comment');
1904 $self->{state} = DATA_STATE;
1905 ## reconsume
1906
1907 !!!emit ($self->{current_token}); # comment
1908
1909 redo A;
1910 } else {
1911 !!!cp (144);
1912 $self->{current_token}->{data} # comment
1913 .= '-' . chr ($self->{next_char});
1914 $self->{state} = COMMENT_STATE;
1915 !!!next-input-character;
1916 redo A;
1917 }
1918 } elsif ($self->{state} == COMMENT_STATE) {
1919 if ($self->{next_char} == 0x002D) { # -
1920 !!!cp (145);
1921 $self->{state} = COMMENT_END_DASH_STATE;
1922 !!!next-input-character;
1923 redo A;
1924 } elsif ($self->{next_char} == -1) {
1925 !!!cp (146);
1926 !!!parse-error (type => 'unclosed comment');
1927 $self->{state} = DATA_STATE;
1928 ## reconsume
1929
1930 !!!emit ($self->{current_token}); # comment
1931
1932 redo A;
1933 } else {
1934 !!!cp (147);
1935 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
1936 ## Stay in the state
1937 !!!next-input-character;
1938 redo A;
1939 }
1940 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1941 if ($self->{next_char} == 0x002D) { # -
1942 !!!cp (148);
1943 $self->{state} = COMMENT_END_STATE;
1944 !!!next-input-character;
1945 redo A;
1946 } elsif ($self->{next_char} == -1) {
1947 !!!cp (149);
1948 !!!parse-error (type => 'unclosed comment');
1949 $self->{state} = DATA_STATE;
1950 ## reconsume
1951
1952 !!!emit ($self->{current_token}); # comment
1953
1954 redo A;
1955 } else {
1956 !!!cp (150);
1957 $self->{current_token}->{data} .= '-' . chr ($self->{next_char}); # comment
1958 $self->{state} = COMMENT_STATE;
1959 !!!next-input-character;
1960 redo A;
1961 }
1962 } elsif ($self->{state} == COMMENT_END_STATE) {
1963 if ($self->{next_char} == 0x003E) { # >
1964 !!!cp (151);
1965 $self->{state} = DATA_STATE;
1966 !!!next-input-character;
1967
1968 !!!emit ($self->{current_token}); # comment
1969
1970 redo A;
1971 } elsif ($self->{next_char} == 0x002D) { # -
1972 !!!cp (152);
1973 !!!parse-error (type => 'dash in comment',
1974 line => $self->{line_prev},
1975 column => $self->{column_prev});
1976 $self->{current_token}->{data} .= '-'; # comment
1977 ## Stay in the state
1978 !!!next-input-character;
1979 redo A;
1980 } elsif ($self->{next_char} == -1) {
1981 !!!cp (153);
1982 !!!parse-error (type => 'unclosed comment');
1983 $self->{state} = DATA_STATE;
1984 ## reconsume
1985
1986 !!!emit ($self->{current_token}); # comment
1987
1988 redo A;
1989 } else {
1990 !!!cp (154);
1991 !!!parse-error (type => 'dash in comment',
1992 line => $self->{line_prev},
1993 column => $self->{column_prev});
1994 $self->{current_token}->{data} .= '--' . chr ($self->{next_char}); # comment
1995 $self->{state} = COMMENT_STATE;
1996 !!!next-input-character;
1997 redo A;
1998 }
1999 } elsif ($self->{state} == DOCTYPE_STATE) {
2000 if ($self->{next_char} == 0x0009 or # HT
2001 $self->{next_char} == 0x000A or # LF
2002 $self->{next_char} == 0x000B or # VT
2003 $self->{next_char} == 0x000C or # FF
2004 $self->{next_char} == 0x0020) { # SP
2005 !!!cp (155);
2006 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2007 !!!next-input-character;
2008 redo A;
2009 } else {
2010 !!!cp (156);
2011 !!!parse-error (type => 'no space before DOCTYPE name');
2012 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2013 ## reconsume
2014 redo A;
2015 }
2016 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2017 if ($self->{next_char} == 0x0009 or # HT
2018 $self->{next_char} == 0x000A or # LF
2019 $self->{next_char} == 0x000B or # VT
2020 $self->{next_char} == 0x000C or # FF
2021 $self->{next_char} == 0x0020) { # SP
2022 !!!cp (157);
2023 ## Stay in the state
2024 !!!next-input-character;
2025 redo A;
2026 } elsif ($self->{next_char} == 0x003E) { # >
2027 !!!cp (158);
2028 !!!parse-error (type => 'no DOCTYPE name');
2029 $self->{state} = DATA_STATE;
2030 !!!next-input-character;
2031
2032 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2033
2034 redo A;
2035 } elsif ($self->{next_char} == -1) {
2036 !!!cp (159);
2037 !!!parse-error (type => 'no DOCTYPE name');
2038 $self->{state} = DATA_STATE;
2039 ## reconsume
2040
2041 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2042
2043 redo A;
2044 } else {
2045 !!!cp (160);
2046 $self->{current_token}->{name} = chr $self->{next_char};
2047 delete $self->{current_token}->{quirks};
2048 ## ISSUE: "Set the token's name name to the" in the spec
2049 $self->{state} = DOCTYPE_NAME_STATE;
2050 !!!next-input-character;
2051 redo A;
2052 }
2053 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2054 ## ISSUE: Redundant "First," in the spec.
2055 if ($self->{next_char} == 0x0009 or # HT
2056 $self->{next_char} == 0x000A or # LF
2057 $self->{next_char} == 0x000B or # VT
2058 $self->{next_char} == 0x000C or # FF
2059 $self->{next_char} == 0x0020) { # SP
2060 !!!cp (161);
2061 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2062 !!!next-input-character;
2063 redo A;
2064 } elsif ($self->{next_char} == 0x003E) { # >
2065 !!!cp (162);
2066 $self->{state} = DATA_STATE;
2067 !!!next-input-character;
2068
2069 !!!emit ($self->{current_token}); # DOCTYPE
2070
2071 redo A;
2072 } elsif ($self->{next_char} == -1) {
2073 !!!cp (163);
2074 !!!parse-error (type => 'unclosed DOCTYPE');
2075 $self->{state} = DATA_STATE;
2076 ## reconsume
2077
2078 $self->{current_token}->{quirks} = 1;
2079 !!!emit ($self->{current_token}); # DOCTYPE
2080
2081 redo A;
2082 } else {
2083 !!!cp (164);
2084 $self->{current_token}->{name}
2085 .= chr ($self->{next_char}); # DOCTYPE
2086 ## Stay in the state
2087 !!!next-input-character;
2088 redo A;
2089 }
2090 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2091 if ($self->{next_char} == 0x0009 or # HT
2092 $self->{next_char} == 0x000A or # LF
2093 $self->{next_char} == 0x000B or # VT
2094 $self->{next_char} == 0x000C or # FF
2095 $self->{next_char} == 0x0020) { # SP
2096 !!!cp (165);
2097 ## Stay in the state
2098 !!!next-input-character;
2099 redo A;
2100 } elsif ($self->{next_char} == 0x003E) { # >
2101 !!!cp (166);
2102 $self->{state} = DATA_STATE;
2103 !!!next-input-character;
2104
2105 !!!emit ($self->{current_token}); # DOCTYPE
2106
2107 redo A;
2108 } elsif ($self->{next_char} == -1) {
2109 !!!cp (167);
2110 !!!parse-error (type => 'unclosed DOCTYPE');
2111 $self->{state} = DATA_STATE;
2112 ## reconsume
2113
2114 $self->{current_token}->{quirks} = 1;
2115 !!!emit ($self->{current_token}); # DOCTYPE
2116
2117 redo A;
2118 } elsif ($self->{next_char} == 0x0050 or # P
2119 $self->{next_char} == 0x0070) { # p
2120 !!!next-input-character;
2121 if ($self->{next_char} == 0x0055 or # U
2122 $self->{next_char} == 0x0075) { # u
2123 !!!next-input-character;
2124 if ($self->{next_char} == 0x0042 or # B
2125 $self->{next_char} == 0x0062) { # b
2126 !!!next-input-character;
2127 if ($self->{next_char} == 0x004C or # L
2128 $self->{next_char} == 0x006C) { # l
2129 !!!next-input-character;
2130 if ($self->{next_char} == 0x0049 or # I
2131 $self->{next_char} == 0x0069) { # i
2132 !!!next-input-character;
2133 if ($self->{next_char} == 0x0043 or # C
2134 $self->{next_char} == 0x0063) { # c
2135 !!!cp (168);
2136 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2137 !!!next-input-character;
2138 redo A;
2139 } else {
2140 !!!cp (169);
2141 }
2142 } else {
2143 !!!cp (170);
2144 }
2145 } else {
2146 !!!cp (171);
2147 }
2148 } else {
2149 !!!cp (172);
2150 }
2151 } else {
2152 !!!cp (173);
2153 }
2154
2155 #
2156 } elsif ($self->{next_char} == 0x0053 or # S
2157 $self->{next_char} == 0x0073) { # s
2158 !!!next-input-character;
2159 if ($self->{next_char} == 0x0059 or # Y
2160 $self->{next_char} == 0x0079) { # y
2161 !!!next-input-character;
2162 if ($self->{next_char} == 0x0053 or # S
2163 $self->{next_char} == 0x0073) { # s
2164 !!!next-input-character;
2165 if ($self->{next_char} == 0x0054 or # T
2166 $self->{next_char} == 0x0074) { # t
2167 !!!next-input-character;
2168 if ($self->{next_char} == 0x0045 or # E
2169 $self->{next_char} == 0x0065) { # e
2170 !!!next-input-character;
2171 if ($self->{next_char} == 0x004D or # M
2172 $self->{next_char} == 0x006D) { # m
2173 !!!cp (174);
2174 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2175 !!!next-input-character;
2176 redo A;
2177 } else {
2178 !!!cp (175);
2179 }
2180 } else {
2181 !!!cp (176);
2182 }
2183 } else {
2184 !!!cp (177);
2185 }
2186 } else {
2187 !!!cp (178);
2188 }
2189 } else {
2190 !!!cp (179);
2191 }
2192
2193 #
2194 } else {
2195 !!!cp (180);
2196 !!!next-input-character;
2197 #
2198 }
2199
2200 !!!parse-error (type => 'string after DOCTYPE name');
2201 $self->{current_token}->{quirks} = 1;
2202
2203 $self->{state} = BOGUS_DOCTYPE_STATE;
2204 # next-input-character is already done
2205 redo A;
2206 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2207 if ({
2208 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2209 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2210 }->{$self->{next_char}}) {
2211 !!!cp (181);
2212 ## Stay in the state
2213 !!!next-input-character;
2214 redo A;
2215 } elsif ($self->{next_char} eq 0x0022) { # "
2216 !!!cp (182);
2217 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2218 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2219 !!!next-input-character;
2220 redo A;
2221 } elsif ($self->{next_char} eq 0x0027) { # '
2222 !!!cp (183);
2223 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2224 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2225 !!!next-input-character;
2226 redo A;
2227 } elsif ($self->{next_char} eq 0x003E) { # >
2228 !!!cp (184);
2229 !!!parse-error (type => 'no PUBLIC literal');
2230
2231 $self->{state} = DATA_STATE;
2232 !!!next-input-character;
2233
2234 $self->{current_token}->{quirks} = 1;
2235 !!!emit ($self->{current_token}); # DOCTYPE
2236
2237 redo A;
2238 } elsif ($self->{next_char} == -1) {
2239 !!!cp (185);
2240 !!!parse-error (type => 'unclosed DOCTYPE');
2241
2242 $self->{state} = DATA_STATE;
2243 ## reconsume
2244
2245 $self->{current_token}->{quirks} = 1;
2246 !!!emit ($self->{current_token}); # DOCTYPE
2247
2248 redo A;
2249 } else {
2250 !!!cp (186);
2251 !!!parse-error (type => 'string after PUBLIC');
2252 $self->{current_token}->{quirks} = 1;
2253
2254 $self->{state} = BOGUS_DOCTYPE_STATE;
2255 !!!next-input-character;
2256 redo A;
2257 }
2258 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2259 if ($self->{next_char} == 0x0022) { # "
2260 !!!cp (187);
2261 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2262 !!!next-input-character;
2263 redo A;
2264 } elsif ($self->{next_char} == 0x003E) { # >
2265 !!!cp (188);
2266 !!!parse-error (type => 'unclosed PUBLIC literal');
2267
2268 $self->{state} = DATA_STATE;
2269 !!!next-input-character;
2270
2271 $self->{current_token}->{quirks} = 1;
2272 !!!emit ($self->{current_token}); # DOCTYPE
2273
2274 redo A;
2275 } elsif ($self->{next_char} == -1) {
2276 !!!cp (189);
2277 !!!parse-error (type => 'unclosed PUBLIC literal');
2278
2279 $self->{state} = DATA_STATE;
2280 ## reconsume
2281
2282 $self->{current_token}->{quirks} = 1;
2283 !!!emit ($self->{current_token}); # DOCTYPE
2284
2285 redo A;
2286 } else {
2287 !!!cp (190);
2288 $self->{current_token}->{public_identifier} # DOCTYPE
2289 .= chr $self->{next_char};
2290 ## Stay in the state
2291 !!!next-input-character;
2292 redo A;
2293 }
2294 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2295 if ($self->{next_char} == 0x0027) { # '
2296 !!!cp (191);
2297 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2298 !!!next-input-character;
2299 redo A;
2300 } elsif ($self->{next_char} == 0x003E) { # >
2301 !!!cp (192);
2302 !!!parse-error (type => 'unclosed PUBLIC literal');
2303
2304 $self->{state} = DATA_STATE;
2305 !!!next-input-character;
2306
2307 $self->{current_token}->{quirks} = 1;
2308 !!!emit ($self->{current_token}); # DOCTYPE
2309
2310 redo A;
2311 } elsif ($self->{next_char} == -1) {
2312 !!!cp (193);
2313 !!!parse-error (type => 'unclosed PUBLIC literal');
2314
2315 $self->{state} = DATA_STATE;
2316 ## reconsume
2317
2318 $self->{current_token}->{quirks} = 1;
2319 !!!emit ($self->{current_token}); # DOCTYPE
2320
2321 redo A;
2322 } else {
2323 !!!cp (194);
2324 $self->{current_token}->{public_identifier} # DOCTYPE
2325 .= chr $self->{next_char};
2326 ## Stay in the state
2327 !!!next-input-character;
2328 redo A;
2329 }
2330 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2331 if ({
2332 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2333 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2334 }->{$self->{next_char}}) {
2335 !!!cp (195);
2336 ## Stay in the state
2337 !!!next-input-character;
2338 redo A;
2339 } elsif ($self->{next_char} == 0x0022) { # "
2340 !!!cp (196);
2341 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2342 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2343 !!!next-input-character;
2344 redo A;
2345 } elsif ($self->{next_char} == 0x0027) { # '
2346 !!!cp (197);
2347 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2348 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2349 !!!next-input-character;
2350 redo A;
2351 } elsif ($self->{next_char} == 0x003E) { # >
2352 !!!cp (198);
2353 $self->{state} = DATA_STATE;
2354 !!!next-input-character;
2355
2356 !!!emit ($self->{current_token}); # DOCTYPE
2357
2358 redo A;
2359 } elsif ($self->{next_char} == -1) {
2360 !!!cp (199);
2361 !!!parse-error (type => 'unclosed DOCTYPE');
2362
2363 $self->{state} = DATA_STATE;
2364 ## reconsume
2365
2366 $self->{current_token}->{quirks} = 1;
2367 !!!emit ($self->{current_token}); # DOCTYPE
2368
2369 redo A;
2370 } else {
2371 !!!cp (200);
2372 !!!parse-error (type => 'string after PUBLIC literal');
2373 $self->{current_token}->{quirks} = 1;
2374
2375 $self->{state} = BOGUS_DOCTYPE_STATE;
2376 !!!next-input-character;
2377 redo A;
2378 }
2379 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2380 if ({
2381 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2382 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2383 }->{$self->{next_char}}) {
2384 !!!cp (201);
2385 ## Stay in the state
2386 !!!next-input-character;
2387 redo A;
2388 } elsif ($self->{next_char} == 0x0022) { # "
2389 !!!cp (202);
2390 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2391 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2392 !!!next-input-character;
2393 redo A;
2394 } elsif ($self->{next_char} == 0x0027) { # '
2395 !!!cp (203);
2396 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2397 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2398 !!!next-input-character;
2399 redo A;
2400 } elsif ($self->{next_char} == 0x003E) { # >
2401 !!!cp (204);
2402 !!!parse-error (type => 'no SYSTEM literal');
2403 $self->{state} = DATA_STATE;
2404 !!!next-input-character;
2405
2406 $self->{current_token}->{quirks} = 1;
2407 !!!emit ($self->{current_token}); # DOCTYPE
2408
2409 redo A;
2410 } elsif ($self->{next_char} == -1) {
2411 !!!cp (205);
2412 !!!parse-error (type => 'unclosed DOCTYPE');
2413
2414 $self->{state} = DATA_STATE;
2415 ## reconsume
2416
2417 $self->{current_token}->{quirks} = 1;
2418 !!!emit ($self->{current_token}); # DOCTYPE
2419
2420 redo A;
2421 } else {
2422 !!!cp (206);
2423 !!!parse-error (type => 'string after SYSTEM');
2424 $self->{current_token}->{quirks} = 1;
2425
2426 $self->{state} = BOGUS_DOCTYPE_STATE;
2427 !!!next-input-character;
2428 redo A;
2429 }
2430 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2431 if ($self->{next_char} == 0x0022) { # "
2432 !!!cp (207);
2433 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2434 !!!next-input-character;
2435 redo A;
2436 } elsif ($self->{next_char} == 0x003E) { # >
2437 !!!cp (208);
2438 !!!parse-error (type => 'unclosed PUBLIC literal');
2439
2440 $self->{state} = DATA_STATE;
2441 !!!next-input-character;
2442
2443 $self->{current_token}->{quirks} = 1;
2444 !!!emit ($self->{current_token}); # DOCTYPE
2445
2446 redo A;
2447 } elsif ($self->{next_char} == -1) {
2448 !!!cp (209);
2449 !!!parse-error (type => 'unclosed SYSTEM literal');
2450
2451 $self->{state} = DATA_STATE;
2452 ## reconsume
2453
2454 $self->{current_token}->{quirks} = 1;
2455 !!!emit ($self->{current_token}); # DOCTYPE
2456
2457 redo A;
2458 } else {
2459 !!!cp (210);
2460 $self->{current_token}->{system_identifier} # DOCTYPE
2461 .= chr $self->{next_char};
2462 ## Stay in the state
2463 !!!next-input-character;
2464 redo A;
2465 }
2466 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2467 if ($self->{next_char} == 0x0027) { # '
2468 !!!cp (211);
2469 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2470 !!!next-input-character;
2471 redo A;
2472 } elsif ($self->{next_char} == 0x003E) { # >
2473 !!!cp (212);
2474 !!!parse-error (type => 'unclosed PUBLIC literal');
2475
2476 $self->{state} = DATA_STATE;
2477 !!!next-input-character;
2478
2479 $self->{current_token}->{quirks} = 1;
2480 !!!emit ($self->{current_token}); # DOCTYPE
2481
2482 redo A;
2483 } elsif ($self->{next_char} == -1) {
2484 !!!cp (213);
2485 !!!parse-error (type => 'unclosed SYSTEM literal');
2486
2487 $self->{state} = DATA_STATE;
2488 ## reconsume
2489
2490 $self->{current_token}->{quirks} = 1;
2491 !!!emit ($self->{current_token}); # DOCTYPE
2492
2493 redo A;
2494 } else {
2495 !!!cp (214);
2496 $self->{current_token}->{system_identifier} # DOCTYPE
2497 .= chr $self->{next_char};
2498 ## Stay in the state
2499 !!!next-input-character;
2500 redo A;
2501 }
2502 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2503 if ({
2504 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2505 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2506 }->{$self->{next_char}}) {
2507 !!!cp (215);
2508 ## Stay in the state
2509 !!!next-input-character;
2510 redo A;
2511 } elsif ($self->{next_char} == 0x003E) { # >
2512 !!!cp (216);
2513 $self->{state} = DATA_STATE;
2514 !!!next-input-character;
2515
2516 !!!emit ($self->{current_token}); # DOCTYPE
2517
2518 redo A;
2519 } elsif ($self->{next_char} == -1) {
2520 !!!cp (217);
2521 !!!parse-error (type => 'unclosed DOCTYPE');
2522
2523 $self->{state} = DATA_STATE;
2524 ## reconsume
2525
2526 $self->{current_token}->{quirks} = 1;
2527 !!!emit ($self->{current_token}); # DOCTYPE
2528
2529 redo A;
2530 } else {
2531 !!!cp (218);
2532 !!!parse-error (type => 'string after SYSTEM literal');
2533 #$self->{current_token}->{quirks} = 1;
2534
2535 $self->{state} = BOGUS_DOCTYPE_STATE;
2536 !!!next-input-character;
2537 redo A;
2538 }
2539 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2540 if ($self->{next_char} == 0x003E) { # >
2541 !!!cp (219);
2542 $self->{state} = DATA_STATE;
2543 !!!next-input-character;
2544
2545 !!!emit ($self->{current_token}); # DOCTYPE
2546
2547 redo A;
2548 } elsif ($self->{next_char} == -1) {
2549 !!!cp (220);
2550 !!!parse-error (type => 'unclosed DOCTYPE');
2551 $self->{state} = DATA_STATE;
2552 ## reconsume
2553
2554 !!!emit ($self->{current_token}); # DOCTYPE
2555
2556 redo A;
2557 } else {
2558 !!!cp (221);
2559 ## Stay in the state
2560 !!!next-input-character;
2561 redo A;
2562 }
2563 } elsif ($self->{state} == CDATA_BLOCK_STATE) {
2564 my $s = '';
2565
2566 my ($l, $c) = ($self->{line}, $self->{column});
2567
2568 CS: while ($self->{next_char} != -1) {
2569 if ($self->{next_char} == 0x005D) { # ]
2570 !!!next-input-character;
2571 if ($self->{next_char} == 0x005D) { # ]
2572 !!!next-input-character;
2573 MDC: {
2574 if ($self->{next_char} == 0x003E) { # >
2575 !!!cp (221.1);
2576 !!!next-input-character;
2577 last CS;
2578 } elsif ($self->{next_char} == 0x005D) { # ]
2579 !!!cp (221.2);
2580 $s .= ']';
2581 !!!next-input-character;
2582 redo MDC;
2583 } else {
2584 !!!cp (221.3);
2585 $s .= ']]';
2586 #
2587 }
2588 } # MDC
2589 } else {
2590 !!!cp (221.4);
2591 $s .= ']';
2592 #
2593 }
2594 } else {
2595 !!!cp (221.5);
2596 #
2597 }
2598 $s .= chr $self->{next_char};
2599 !!!next-input-character;
2600 } # CS
2601
2602 $self->{state} = DATA_STATE;
2603 ## next-input-character done or EOF, which is reconsumed.
2604
2605 if (length $s) {
2606 !!!cp (221.6);
2607 !!!emit ({type => CHARACTER_TOKEN, data => $s,
2608 line => $l, column => $c});
2609 } else {
2610 !!!cp (221.7);
2611 }
2612
2613 redo A;
2614
2615 ## ISSUE: "text tokens" in spec.
2616 ## TODO: Streaming support
2617 } else {
2618 die "$0: $self->{state}: Unknown state";
2619 }
2620 } # A
2621
2622 die "$0: _get_next_token: unexpected case";
2623 } # _get_next_token
2624
## Try to consume a character reference (numeric, hexadecimal or named)
## starting at |$self->{next_char}|.
##
## NOTE(review): The returned data and the saved position suggest that the
## caller has already consumed the "&" -- confirm against the call sites.
##
## Arguments:
##   $self       - The tokenizer object.  |$self->{next_char}| is read here
##                 and advanced through the |!!!next-input-character| macro.
##   $in_attr    - If true, a named reference that lacks ";" and is followed
##                 by more name characters is returned as literal text.
##   $additional - One extra code point which, when it is the next input
##                 character, makes this method return without consuming
##                 anything and without reporting an error.
##
## Returns a hash reference describing a CHARACTER_TOKEN (with
## |has_reference| set when a reference was actually resolved), or |undef|
## when the input is not treated as a reference.
2625 sub _tokenize_attempt_to_consume_an_entity ($$$) {
2626 my ($self, $in_attr, $additional) = @_;
2627
## Position attached to every parse error reported and every token
## returned below.
2628 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
2629
## Characters that never start a reference: white space, "<", "&", the
## end-of-input marker (-1) and the caller-supplied |$additional|.
2630 if ({
2631 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
2632 0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
2633 $additional => 1,
2634 }->{$self->{next_char}}) {
2635 !!!cp (1001);
2636 ## Don't consume
2637 ## No error
2638 return undef;
2639 } elsif ($self->{next_char} == 0x0023) { # #
## Numeric character reference: "#" followed by "x"/"X" and hexadecimal
## digits, or by decimal digits.
2640 !!!next-input-character;
2641 if ($self->{next_char} == 0x0078 or # x
2642 $self->{next_char} == 0x0058) { # X
## |$code| stays undefined until the first hexadecimal digit is seen.
2643 my $code;
2644 X: {
## Remember the character being skipped ("x"/"X" on the first pass) so
## that it can be handed back if no digit follows.
2645 my $x_char = $self->{next_char};
2646 !!!next-input-character;
2647 if (0x0030 <= $self->{next_char} and
2648 $self->{next_char} <= 0x0039) { # 0..9
2649 !!!cp (1002);
2650 $code ||= 0;
2651 $code *= 0x10;
2652 $code += $self->{next_char} - 0x0030;
2653 redo X;
2654 } elsif (0x0061 <= $self->{next_char} and
2655 $self->{next_char} <= 0x0066) { # a..f
2656 !!!cp (1003);
2657 $code ||= 0;
2658 $code *= 0x10;
## "a" (0x61) - 0x60 + 9 = 10, ..., "f" (0x66) - 0x60 + 9 = 15.
2659 $code += $self->{next_char} - 0x0060 + 9;
2660 redo X;
2661 } elsif (0x0041 <= $self->{next_char} and
2662 $self->{next_char} <= 0x0046) { # A..F
2663 !!!cp (1004);
2664 $code ||= 0;
2665 $code *= 0x10;
## "A" (0x41) - 0x40 + 9 = 10, ..., "F" (0x46) - 0x40 + 9 = 15.
2666 $code += $self->{next_char} - 0x0040 + 9;
2667 redo X;
2668 } elsif (not defined $code) { # no hexadecimal digit
2669 !!!cp (1005);
2670 !!!parse-error (type => 'bare hcro', line => $l, column => $c);
## NOTE(review): Presumably this pushes the "x" and the current character
## back onto the input so that "#" becomes the next character again;
## the macro expansion is not visible here.
2671 !!!back-next-input-character ($x_char, $self->{next_char});
2672 $self->{next_char} = 0x0023; # #
2673 return undef;
2674 } elsif ($self->{next_char} == 0x003B) { # ;
2675 !!!cp (1006);
2676 !!!next-input-character;
2677 } else {
## Digits were seen but the terminating ";" is missing.
2678 !!!cp (1007);
2679 !!!parse-error (type => 'no refc', line => $l, column => $c);
2680 }
2681
## Replace code points that must not be produced by a reference:
## U+0000 and surrogates, values above U+10FFFF, CR, and 0x80..0x9F.
2682 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2683 !!!cp (1008);
2684 !!!parse-error (type => (sprintf 'invalid character reference:U+%04X', $code), line => $l, column => $c);
2685 $code = 0xFFFD;
2686 } elsif ($code > 0x10FFFF) {
2687 !!!cp (1009);
2688 !!!parse-error (type => (sprintf 'invalid character reference:U-%08X', $code), line => $l, column => $c);
2689 $code = 0xFFFD;
2690 } elsif ($code == 0x000D) {
2691 !!!cp (1010);
2692 !!!parse-error (type => 'CR character reference', line => $l, column => $c);
2693 $code = 0x000A;
2694 } elsif (0x80 <= $code and $code <= 0x9F) {
2695 !!!cp (1011);
2696 !!!parse-error (type => (sprintf 'C1 character reference:U+%04X', $code), line => $l, column => $c);
## NOTE(review): |$c1_entity_char| is defined outside this view; it is
## assumed to have an entry for every code point in 0x80..0x9F.
2697 $code = $c1_entity_char->{$code};
2698 }
2699
2700 return {type => CHARACTER_TOKEN, data => chr $code,
2701 has_reference => 1,
2702 line => $l, column => $c,
2703 };
2704 } # X
2705 } elsif (0x0030 <= $self->{next_char} and
2706 $self->{next_char} <= 0x0039) { # 0..9
## Decimal character reference: accumulate all consecutive digits.
2707 my $code = $self->{next_char} - 0x0030;
2708 !!!next-input-character;
2709
2710 while (0x0030 <= $self->{next_char} and
2711 $self->{next_char} <= 0x0039) { # 0..9
2712 !!!cp (1012);
2713 $code *= 10;
2714 $code += $self->{next_char} - 0x0030;
2715
2716 !!!next-input-character;
2717 }
2718
2719 if ($self->{next_char} == 0x003B) { # ;
2720 !!!cp (1013);
2721 !!!next-input-character;
2722 } else {
2723 !!!cp (1014);
2724 !!!parse-error (type => 'no refc', line => $l, column => $c);
2725 }
2726
## Same replacement rules as in the hexadecimal branch above.
2727 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2728 !!!cp (1015);
2729 !!!parse-error (type => (sprintf 'invalid character reference:U+%04X', $code), line => $l, column => $c);
2730 $code = 0xFFFD;
2731 } elsif ($code > 0x10FFFF) {
2732 !!!cp (1016);
2733 !!!parse-error (type => (sprintf 'invalid character reference:U-%08X', $code), line => $l, column => $c);
2734 $code = 0xFFFD;
2735 } elsif ($code == 0x000D) {
2736 !!!cp (1017);
2737 !!!parse-error (type => 'CR character reference', line => $l, column => $c);
2738 $code = 0x000A;
2739 } elsif (0x80 <= $code and $code <= 0x9F) {
2740 !!!cp (1018);
2741 !!!parse-error (type => (sprintf 'C1 character reference:U+%04X', $code), line => $l, column => $c);
2742 $code = $c1_entity_char->{$code};
2743 }
2744
2745 return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1,
2746 line => $l, column => $c,
2747 };
2748 } else {
## "#" is followed by neither "x"/"X" nor a decimal digit.
2749 !!!cp (1019);
2750 !!!parse-error (type => 'bare nero', line => $l, column => $c);
2751 !!!back-next-input-character ($self->{next_char});
2752 $self->{next_char} = 0x0023; # #
2753 return undef;
2754 }
2755 } elsif ((0x0041 <= $self->{next_char} and
2756 $self->{next_char} <= 0x005A) or
2757 (0x0061 <= $self->{next_char} and
2758 $self->{next_char} <= 0x007A)) {
## Named character reference: the name starts with an ASCII letter.
2759 my $entity_name = chr $self->{next_char};
2760 !!!next-input-character;
2761
## |$value| is the text to return: it starts as the first name character,
## is replaced by the entity's value whenever the name read so far is a
## known entity, and collects any characters read after that.
##
## |$match| records the state of the most recent lookup:
##    0  - no known entity name has been seen;
##    1  - a known name terminated by ";" was seen (the loop stops);
##   -1  - a known name without ";" was the last thing read;
##  < -1 - further characters were read after such a name (the value is
##         doubled for each of them).
2762 my $value = $entity_name;
2763 my $match = 0;
2764 require Whatpm::_NamedEntityList;
2765 our $EntityChar;
2766
2767 while (length $entity_name < 30 and
2768 ## NOTE: Some number greater than the maximum length of entity name
2769 ((0x0041 <= $self->{next_char} and # A
2770 $self->{next_char} <= 0x005A) or # Z
2771 (0x0061 <= $self->{next_char} and # a
2772 $self->{next_char} <= 0x007A) or # z
2773 (0x0030 <= $self->{next_char} and # 0
2774 $self->{next_char} <= 0x0039) or # 9
2775 $self->{next_char} == 0x003B)) { # ;
2776 $entity_name .= chr $self->{next_char};
2777 if (defined $EntityChar->{$entity_name}) {
2778 if ($self->{next_char} == 0x003B) { # ;
2779 !!!cp (1020);
2780 $value = $EntityChar->{$entity_name};
2781 $match = 1;
2782 !!!next-input-character;
2783 last;
2784 } else {
## Known name without ";": remember it, but keep reading in case a
## longer name matches.
2785 !!!cp (1021);
2786 $value = $EntityChar->{$entity_name};
2787 $match = -1;
2788 !!!next-input-character;
2789 }
2790 } else {
2791 !!!cp (1022);
2792 $value .= chr $self->{next_char};
2793 $match *= 2;
2794 !!!next-input-character;
2795 }
2796 }
2797
2798 if ($match > 0) {
2799 !!!cp (1023);
2800 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
2801 line => $l, column => $c,
2802 };
2803 } elsif ($match < 0) {
2804 !!!parse-error (type => 'no refc', line => $l, column => $c);
2805 if ($in_attr and $match < -1) {
## In an attribute value, a semicolon-less name followed by more name
## characters is not expanded; everything read is returned literally.
2806 !!!cp (1024);
2807 return {type => CHARACTER_TOKEN, data => '&'.$entity_name,
2808 line => $l, column => $c,
2809 };
2810 } else {
## Otherwise the entity's value is returned, followed by any characters
## that were read after the matched name.
2811 !!!cp (1025);
2812 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
2813 line => $l, column => $c,
2814 };
2815 }
2816 } else {
## No known entity: the characters have nevertheless been consumed, so
## they are returned as literal text prefixed with "&".
2817 !!!cp (1026);
2818 !!!parse-error (type => 'bare ero', line => $l, column => $c);
2819 ## NOTE: "No characters are consumed" in the spec.
2820 return {type => CHARACTER_TOKEN, data => '&'.$value,
2821 line => $l, column => $c,
2822 };
2823 }
2824 } else {
2825 !!!cp (1027);
2826 ## no characters are consumed
2827 !!!parse-error (type => 'bare ero', line => $l, column => $c);
2828 return undef;
2829 }
2830 } # _tokenize_attempt_to_consume_an_entity
2831
## Prepare the target document for the tree construction stage.
##
## NOTE: |$self->{document}| MUST be set before this method is called.
##
## Strict error checking is switched off on the document and the document
## is flagged as an HTML document.  The result of the final
## |manakai_is_html| call is the return value.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  $doc->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $doc->manakai_is_html (1); # MUST
} # _initialize_tree_constructor
2840
## Finish the tree construction stage: switch strict error checking on the
## target document back on.  The result of that call is the return value.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  ## TODO: Turn mutation events on
  return $self->{document}->strict_error_checking (1);
} # _terminate_tree_constructor
2846
2847 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
2848
2849 { # tree construction stage
2850 my $token;
2851
## Run the tree construction stage: fetch the first token, reset the
## per-parse state and run the insertion-mode handlers in order.
sub _construct_tree ($) {
  my ($self) = @_;

  ## It is not defined when an interactive UA makes |$self->{document}|
  ## available to the user, or when it begins accepting user input.

  ## Appending a character: collect it together with all subsequent
  ## consecutive characters and insert a single Text node whose data
  ## is the concatenation of those characters. # MUST

  !!!next-token;

  ## Reset the tree construction state.
  $self->{form_element} = undef;
  $self->{head_element} = undef;
  $self->{open_elements} = [];
  $self->{inner_html_node} = undef;

  ## NOTE: The "initial" insertion mode.
  $self->_tree_construction_initial; # MUST

  ## NOTE: The "before html" insertion mode.
  $self->_tree_construction_root_element;

  ## NOTE: The "before head" insertion mode and so on.
  $self->{insertion_mode} = BEFORE_HEAD_IM;
  $self->_tree_construction_main;
} # _construct_tree
2880
2881 sub _tree_construction_initial ($) {
2882 my $self = shift;
2883
2884 ## NOTE: "initial" insertion mode
2885
2886 INITIAL: {
2887 if ($token->{type} == DOCTYPE_TOKEN) {
2888 ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
2889 ## error, switch to a conformance checking mode for another
2890 ## language.
2891 my $doctype_name = $token->{name};
2892 $doctype_name = '' unless defined $doctype_name;
2893 $doctype_name =~ tr/a-z/A-Z/;
2894 if (not defined $token->{name} or # <!DOCTYPE>
2895 defined $token->{public_identifier} or
2896 defined $token->{system_identifier}) {
2897 !!!cp ('t1');
2898 !!!parse-error (type => 'not HTML5', token => $token);
2899 } elsif ($doctype_name ne 'HTML') {
2900 !!!cp ('t2');
2901 ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
2902 !!!parse-error (type => 'not HTML5', token => $token);
2903 } else {
2904 !!!cp ('t3');
2905 }
2906
2907 my $doctype = $self->{document}->create_document_type_definition
2908 ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
2909 ## NOTE: Default value for both |public_id| and |system_id| attributes
2910 ## are empty strings, so that we don't set any value in missing cases.
2911 $doctype->public_id ($token->{public_identifier})
2912 if defined $token->{public_identifier};
2913 $doctype->system_id ($token->{system_identifier})
2914 if defined $token->{system_identifier};
2915 ## NOTE: Other DocumentType attributes are null or empty lists.
2916 ## ISSUE: internalSubset = null??
2917 $self->{document}->append_child ($doctype);
2918
2919 if ($token->{quirks} or $doctype_name ne 'HTML') {
2920 !!!cp ('t4');
2921 $self->{document}->manakai_compat_mode ('quirks');
2922 } elsif (defined $token->{public_identifier}) {
2923 my $pubid = $token->{public_identifier};
2924 $pubid =~ tr/a-z/A-z/;
2925 if ({
2926 "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
2927 "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
2928 "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
2929 "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
2930 "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
2931 "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
2932 "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
2933 "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
2934 "-//IETF//DTD HTML 2.0//EN" => 1,
2935 "-//IETF//DTD HTML 2.1E//EN" => 1,
2936 "-//IETF//DTD HTML 3.0//EN" => 1,
2937 "-//IETF//DTD HTML 3.0//EN//" => 1,
2938 "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
2939 "-//IETF//DTD HTML 3.2//EN" => 1,
2940 "-//IETF//DTD HTML 3//EN" => 1,
2941 "-//IETF//DTD HTML LEVEL 0//EN" => 1,
2942 "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
2943 "-//IETF//DTD HTML LEVEL 1//EN" => 1,
2944 "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
2945 "-//IETF//DTD HTML LEVEL 2//EN" => 1,
2946 "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
2947 "-//IETF//DTD HTML LEVEL 3//EN" => 1,
2948 "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
2949 "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
2950 "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
2951 "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
2952 "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
2953 "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
2954 "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
2955 "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
2956 "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
2957 "-//IETF//DTD HTML STRICT//EN" => 1,
2958 "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
2959 "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
2960 "-//IETF//DTD HTML//EN" => 1,
2961 "-//IETF//DTD HTML//EN//2.0" => 1,
2962 "-//IETF//DTD HTML//EN//3.0" => 1,
2963 "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
2964 "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
2965 "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
2966 "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
2967 "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
2968 "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
2969 "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
2970 "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
2971 "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
2972 "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
2973 "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
2974 "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//EN" => 1,
2975 "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//EN" => 1,
2976 "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//EN" => 1,
2977 "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
2978 "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
2979 "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
2980 "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
2981 "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
2982 "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
2983 "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
2984 "-//W3C//DTD HTML 3.2//EN" => 1,
2985 "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
2986 "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
2987 "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
2988 "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
2989 "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
2990 "-//W3C//DTD W3 HTML//EN" => 1,
2991 "-//W3O//DTD W3 HTML 3.0//EN" => 1,
2992 "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
2993 "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
2994 "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
2995 "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
2996 "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
2997 "HTML" => 1,
2998 }->{$pubid}) {
2999 !!!cp ('t5');
3000 $self->{document}->manakai_compat_mode ('quirks');
3001 } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
3002 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
3003 if (defined $token->{system_identifier}) {
3004 !!!cp ('t6');
3005 $self->{document}->manakai_compat_mode ('quirks');
3006 } else {
3007 !!!cp ('t7');
3008 $self->{document}->manakai_compat_mode ('limited quirks');
3009 }
3010 } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
3011 $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
3012 !!!cp ('t8');
3013 $self->{document}->manakai_compat_mode ('limited quirks');
3014 } else {
3015 !!!cp ('t9');
3016 }
3017 } else {
3018 !!!cp ('t10');
3019 }
3020 if (defined $token->{system_identifier}) {
3021 my $sysid = $token->{system_identifier};
3022 $sysid =~ tr/A-Z/a-z/;
3023 if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
3024 ## TODO: Check the spec: PUBLIC "(limited quirks)" "(quirks)"
3025 $self->{document}->manakai_compat_mode ('quirks');
3026 !!!cp ('t11');
3027 } else {
3028 !!!cp ('t12');
3029 }
3030 } else {
3031 !!!cp ('t13');
3032 }
3033
3034 ## Go to the "before html" insertion mode.
3035 !!!next-token;
3036 return;
3037 } elsif ({
3038 START_TAG_TOKEN, 1,
3039 END_TAG_TOKEN, 1,
3040 END_OF_FILE_TOKEN, 1,
3041 }->{$token->{type}}) {
3042 !!!cp ('t14');
3043 !!!parse-error (type => 'no DOCTYPE', token => $token);
3044 $self->{document}->manakai_compat_mode ('quirks');
3045 ## Go to the "before html" insertion mode.
3046 ## reprocess
3047 !!!ack-later;
3048 return;
3049 } elsif ($token->{type} == CHARACTER_TOKEN) {
3050 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
3051 ## Ignore the token
3052
3053 unless (length $token->{data}) {
3054 !!!cp ('t15');
3055 ## Stay in the insertion mode.
3056 !!!next-token;
3057 redo INITIAL;
3058 } else {
3059 !!!cp ('t16');
3060 }
3061 } else {
3062 !!!cp ('t17');
3063 }
3064
3065 !!!parse-error (type => 'no DOCTYPE', token => $token);
3066 $self->{document}->manakai_compat_mode ('quirks');
3067 ## Go to the "before html" insertion mode.
3068 ## reprocess
3069 return;
3070 } elsif ($token->{type} == COMMENT_TOKEN) {
3071 !!!cp ('t18');
3072 my $comment = $self->{document}->create_comment ($token->{data});
3073 $self->{document}->append_child ($comment);
3074
3075 ## Stay in the insertion mode.
3076 !!!next-token;
3077 redo INITIAL;
3078 } else {
3079 die "$0: $token->{type}: Unknown token type";
3080 }
3081 } # INITIAL
3082
3083 die "$0: _tree_construction_initial: This should be never reached";
3084 } # _tree_construction_initial
3085
## _tree_construction_root_element ($self)
##
## Tree construction for the "before html" insertion mode.  Consumes
## tokens (via the file-level |$token| and the |!!!next-token| macro)
## until a root |html| element has been appended to
## |$self->{document}| and pushed onto |$self->{open_elements}|.
##
##   - DOCTYPE tokens raise a parse error and are ignored.
##   - Comment tokens are appended to the document node.
##   - Leading white space in a character token is dropped; a token
##     consisting only of white space is ignored.
##   - An |html| start tag creates the root element from the token and
##     invokes |$self->{application_cache_selection}| with the value of
##     the |manifest| attribute, or with |undef| if there is none.
##   - Any other token causes an |html| element to be implied, the
##     application cache selection callback to be invoked once with
##     |undef|, and the token to be left for reprocessing by the caller.
##
## Returns nothing.  Dies on an unknown token type.
sub _tree_construction_root_element ($) {
  my $self = shift;

  ## NOTE: "before html" insertion mode.

  B: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      !!!cp ('t19');
      !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
      ## Ignore the token
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t20');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token.

        unless (length $token->{data}) {
          !!!cp ('t21');
          ## Stay in the insertion mode.
          !!!next-token;
          redo B;
        } else {
          !!!cp ('t22');
        }
      } else {
        !!!cp ('t23');
      }

      ## NOTE: The application cache selection callback is intentionally
      ## not invoked here.  It is invoked exactly once below, after the
      ## implied |html| element has been created; invoking it here as
      ## well would run it twice for a single document.

      #
    } elsif ($token->{type} == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html') {
        my $root_element;
        !!!create-element ($root_element, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}},
            [$root_element, $el_category->{html}];

        if ($token->{attributes}->{manifest}) {
          !!!cp ('t24');
          $self->{application_cache_selection}
              ->($token->{attributes}->{manifest}->{value});
          ## ISSUE: Spec is unclear on relative references.
          ## According to Hixie (#whatwg 2008-03-19), it should be
          ## resolved against the base URI of the document in HTML
          ## or xml:base of the element in XHTML.
        } else {
          !!!cp ('t25');
          $self->{application_cache_selection}->(undef);
        }

        !!!nack ('t25c');

        !!!next-token;
        return; ## Go to the "before head" insertion mode.
      } else {
        !!!cp ('t25.1');
        #
      }
    } elsif ({
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t26');
      #
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## "Anything else": imply the root |html| element.
    my $root_element;
    !!!create-element ($root_element, $HTML_NS, 'html',, $token);
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, $el_category->{html}];

    ## No |manifest| attribute is available on an implied element.
    $self->{application_cache_selection}->(undef);

    ## NOTE: Reprocess the token.
    !!!ack-later;
    return; ## Go to the "before head" insertion mode.

    ## ISSUE: There is an issue in the spec
  } # B

  die "$0: _tree_construction_root_element: This should never be reached";
} # _tree_construction_root_element
3180
## _reset_insertion_mode ($self)
##
## Recomputes |$self->{insertion_mode}| from the stack of open elements
## (|$self->{open_elements}|, a list of |[node, category-flags]| pairs).
##
## The stack is walked from its last entry toward its first.  For each
## entry:
##
##   - When the entry is the first one in the stack, it is treated as
##     the last candidate; if |$self->{inner_html_node}| is defined and
##     is not a table cell (|TABLE_CELL_EL|), that node is examined
##     instead.
##   - A foreign element (|FOREIGN_EL|) adds |IN_FOREIGN_CONTENT_IM| to
##     the current insertion mode.
##   - Otherwise the node's local name is looked up in a fixed table
##     (|select|, |td|, |th|, |tr|, ...) to choose the mode.
##   - An |html| element (|HTML_EL|) selects |BEFORE_HEAD_IM| when no
##     head element is known, |AFTER_HEAD_IM| otherwise.
##   - If nothing matched and this was the last candidate, the mode
##     becomes |IN_BODY_IM|.
##
## Returns nothing; the result is stored in |$self->{insertion_mode}|.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Local name -> insertion mode.  Built once, outside the |S3| loop.
  my $mode_for_local_name = {
    select => IN_SELECT_IM,
    ## NOTE: |option| and |optgroup| do not set
    ## insertion mode to "in select" by themselves.
    td => IN_CELL_IM,
    th => IN_CELL_IM,
    tr => IN_ROW_IM,
    tbody => IN_TABLE_BODY_IM,
    thead => IN_TABLE_BODY_IM,
    tfoot => IN_TABLE_BODY_IM,
    caption => IN_CAPTION_IM,
    colgroup => IN_COLUMN_GROUP_IM,
    table => IN_TABLE_IM,
    head => IN_BODY_IM, # not in head!
    body => IN_BODY_IM,
    frameset => IN_FRAMESET_IM,
  };

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        if ($self->{inner_html_node}->[1] & TABLE_CELL_EL) {
          !!!cp ('t27');
          #
        } else {
          !!!cp ('t28');
          $node = $self->{inner_html_node};
        }
      }
    }

    ## Step 4..14
    my $new_mode;
    if ($node->[1] & FOREIGN_EL) {
      ## NOTE: Strictly speaking, the line below only applies to MathML and
      ## SVG elements.  Currently the HTML syntax supports only MathML and
      ## SVG elements as foreigners.
      $new_mode = $self->{insertion_mode} | IN_FOREIGN_CONTENT_IM;
      ## ISSUE: What is set as the secondary insertion mode?
    } else {
      $new_mode = $mode_for_local_name->{$node->[0]->manakai_local_name};
    }
    if (defined $new_mode) {
      ## Return regardless of the numeric value of the mode constant
      ## (an |X = Y and return| form would not return for a false value).
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 15
    if ($node->[1] & HTML_EL) {
      unless (defined $self->{head_element}) {
        !!!cp ('t29');
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        ## ISSUE: Can this state be reached?
        !!!cp ('t30');
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    } else {
      !!!cp ('t31');
    }

    ## Step 16
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 17
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 18
    redo S3;
  } # S3

  die "$0: _reset_insertion_mode: This line should never be reached";
} # _reset_insertion_mode
3263
3264 sub _tree_construction_main ($) {
3265 my $self = shift;
3266
3267 my $active_formatting_elements = [];
3268
3269 my $reconstruct_active_formatting_elements = sub { # MUST
3270 my $insert = shift;
3271
3272 ## Step 1
3273 return unless @$active_formatting_elements;
3274
3275 ## Step 3
3276 my $i = -1;
3277 my $entry = $active_formatting_elements->[$i];
3278
3279 ## Step 2
3280 return if $entry->[0] eq '#marker';
3281 for (@{$self->{open_elements}}) {
3282 if ($entry->[0] eq $_->[0]) {
3283 !!!cp ('t32');
3284 return;
3285 }
3286 }
3287
3288 S4: {
3289 ## Step 4
3290 last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
3291
3292 ## Step 5
3293 $i--;
3294 $entry = $active_formatting_elements->[$i];
3295
3296 ## Step 6
3297 if ($entry->[0] eq '#marker') {
3298 !!!cp ('t33_1');
3299 #
3300 } else {
3301 my $in_open_elements;
3302 OE: for (@{$self->{open_elements}}) {
3303 if ($entry->[0] eq $_->[0]) {
3304 !!!cp ('t33');
3305 $in_open_elements = 1;
3306 last OE;
3307 }
3308 }
3309 if ($in_open_elements) {
3310 !!!cp ('t34');
3311 #
3312 } else {
3313 ## NOTE: <!DOCTYPE HTML><p><b><i><u></p> <p>X
3314 !!!cp ('t35');
3315 redo S4;
3316 }
3317 }
3318
3319 ## Step 7
3320 $i++;
3321 $entry = $active_formatting_elements->[$i];
3322 } # S4
3323
3324 S7: {
3325 ## Step 8
3326 my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
3327
3328 ## Step 9
3329 $insert->($clone->[0]);
3330 push @{$self->{open_elements}}, $clone;
3331
3332 ## Step 10
3333 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
3334
3335 ## Step 11
3336 unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
3337 !!!cp ('t36');
3338 ## Step 7'
3339 $i++;
3340 $entry = $active_formatting_elements->[$i];
3341
3342 redo S7;
3343 }
3344
3345 !!!cp ('t37');
3346 } # S7
3347 }; # $reconstruct_active_formatting_elements
3348
3349 my $clear_up_to_marker = sub {
3350 for (reverse 0..$#$active_formatting_elements) {
3351 if ($active_formatting_elements->[$_]->[0] eq '#marker') {
3352 !!!cp ('t38');
3353 splice @$active_formatting_elements, $_;
3354 return;
3355 }
3356 }
3357
3358 !!!cp ('t39');
3359 }; # $clear_up_to_marker
3360
3361 my $insert;
3362
3363 my $parse_rcdata = sub ($) {
3364 my ($content_model_flag) = @_;
3365
3366 ## Step 1
3367 my $start_tag_name = $token->{tag_name};
3368 my $el;
3369 !!!create-element ($el, $HTML_NS, $start_tag_name, $token->{attributes}, $token);
3370
3371 ## Step 2
3372 $insert->($el);
3373
3374 ## Step 3
3375 $self->{content_model} = $content_model_flag; # CDATA or RCDATA
3376 delete $self->{escape}; # MUST
3377
3378 ## Step 4
3379 my $text = '';
3380 !!!nack ('t40.1');
3381 !!!next-token;
3382 while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
3383 !!!cp ('t40');
3384 $text .= $token->{data};
3385 !!!next-token;
3386 }
3387
3388 ## Step 5
3389 if (length $text) {
3390 !!!cp ('t41');
3391 my $text = $self->{document}->create_text_node ($text);
3392 $el->append_child ($text);
3393 }
3394
3395 ## Step 6
3396 $self->{content_model} = PCDATA_CONTENT_MODEL;
3397
3398 ## Step 7
3399 if ($token->{type} == END_TAG_TOKEN and
3400 $token->{tag_name} eq $start_tag_name) {
3401 !!!cp ('t42');
3402 ## Ignore the token
3403 } else {
3404 ## NOTE: An end-of-file token.
3405 if ($content_model_flag == CDATA_CONTENT_MODEL) {
3406 !!!cp ('t43');
3407 !!!parse-error (type => 'in CDATA:#'.$token->{type}, token => $token);
3408 } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
3409 !!!cp ('t44');
3410 !!!parse-error (type => 'in RCDATA:#'.$token->{type}, token => $token);
3411 } else {
3412 die "$0: $content_model_flag in parse_rcdata";
3413 }
3414 }
3415 !!!next-token;
3416 }; # $parse_rcdata
3417
3418 my $script_start_tag = sub () {
3419 my $script_el;
3420 !!!create-element ($script_el, $HTML_NS, 'script', $token->{attributes}, $token);
3421 ## TODO: mark as "parser-inserted"
3422
3423 $self->{content_model} = CDATA_CONTENT_MODEL;
3424 delete $self->{escape}; # MUST
3425
3426 my $text = '';
3427 !!!nack ('t45.1');
3428 !!!next-token;
3429 while ($token->{type} == CHARACTER_TOKEN) {
3430 !!!cp ('t45');
3431 $text .= $token->{data};
3432 !!!next-token;
3433 } # stop if non-character token or tokenizer stops tokenising
3434 if (length $text) {
3435 !!!cp ('t46');
3436 $script_el->manakai_append_text ($text);
3437 }
3438
3439 $self->{content_model} = PCDATA_CONTENT_MODEL;
3440
3441 if ($token->{type} == END_TAG_TOKEN and
3442 $token->{tag_name} eq 'script') {
3443 !!!cp ('t47');
3444 ## Ignore the token
3445 } else {
3446 !!!cp ('t48');
3447 !!!parse-error (type => 'in CDATA:#'.$token->{type}, token => $token);
3448 ## ISSUE: And ignore?
3449 ## TODO: mark as "already executed"
3450 }
3451
3452 if (defined $self->{inner_html_node}) {
3453 !!!cp ('t49');
3454 ## TODO: mark as "already executed"
3455 } else {
3456 !!!cp ('t50');
3457 ## TODO: $old_insertion_point = current insertion point
3458 ## TODO: insertion point = just before the next input character
3459
3460 $insert->($script_el);
3461
3462 ## TODO: insertion point = $old_insertion_point (might be "undefined")
3463
3464 ## TODO: if there is a script that will execute as soon as the parser resume, then...
3465 }
3466
3467 !!!next-token;
3468 }; # $script_start_tag
3469
3470 ## NOTE: $open_tables->[-1]->[0] is the "current table" element node.
3471 ## NOTE: $open_tables->[-1]->[1] is the "tainted" flag.
3472 my $open_tables = [[$self->{open_elements}->[0]->[0]]];
3473
3474 my $formatting_end_tag = sub {
3475 my $end_tag_token = shift;
3476 my $tag_name = $end_tag_token->{tag_name};
3477
3478 ## NOTE: The adoption agency algorithm (AAA).
3479
3480 FET: {
3481 ## Step 1
3482 my $formatting_element;
3483 my $formatting_element_i_in_active;
3484 AFE: for (reverse 0..$#$active_formatting_elements) {
3485 if ($active_formatting_elements->[$_]->[0] eq '#marker') {
3486 !!!cp ('t52');
3487 last AFE;
3488 } elsif ($active_formatting_elements->[$_]->[0]->manakai_local_name
3489 eq $tag_name) {
3490 !!!cp ('t51');
3491 $formatting_element = $active_formatting_elements->[$_];
3492 $formatting_element_i_in_active = $_;
3493 last AFE;
3494 }
3495 } # AFE
3496 unless (defined $formatting_element) {
3497 !!!cp ('t53');
3498 !!!parse-error (type => 'unmatched end tag:'.$tag_name, token => $end_tag_token);
3499 ## Ignore the token
3500 !!!next-token;
3501 return;
3502 }
3503 ## has an element in scope
3504 my $in_scope = 1;
3505 my $formatting_element_i_in_open;
3506 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
3507 my $node = $self->{open_elements}->[$_];
3508 if ($node->[0] eq $formatting_element->[0]) {
3509 if ($in_scope) {
3510 !!!cp ('t54');
3511 $formatting_element_i_in_open = $_;
3512 last INSCOPE;
3513 } else { # in open elements but not in scope
3514 !!!cp ('t55');
3515 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name},
3516 token => $end_tag_token);
3517 ## Ignore the token
3518 !!!next-token;
3519 return;
3520 }
3521 } elsif ($node->[1] & SCOPING_EL) {
3522 !!!cp ('t56');
3523 $in_scope = 0;
3524 }
3525 } # INSCOPE
3526 unless (defined $formatting_element_i_in_open) {
3527 !!!cp ('t57');
3528 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name},
3529 token => $end_tag_token);
3530 pop @$active_formatting_elements; # $formatting_element
3531 !!!next-token; ## TODO: ok?
3532 return;
3533 }
3534 if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
3535 !!!cp ('t58');
3536 !!!parse-error (type => 'not closed',
3537 value => $self->{open_elements}->[-1]->[0]
3538 ->manakai_local_name,
3539 token => $end_tag_token);
3540 }
3541
3542 ## Step 2
3543 my $furthest_block;
3544 my $furthest_block_i_in_open;
3545 OE: for (reverse 0..$#{$self->{open_elements}}) {
3546 my $node = $self->{open_elements}->[$_];
3547 if (not ($node->[1] & FORMATTING_EL) and
3548 #not $phrasing_category->{$node->[1]} and
3549 ($node->[1] & SPECIAL_EL or
3550 $node->[1] & SCOPING_EL)) { ## Scoping is redundant, maybe
3551 !!!cp ('t59');
3552 $furthest_block = $node;
3553 $furthest_block_i_in_open = $_;
3554 } elsif ($node->[0] eq $formatting_element->[0]) {
3555 !!!cp ('t60');
3556 last OE;
3557 }
3558 } # OE
3559
3560 ## Step 3
3561 unless (defined $furthest_block) { # MUST
3562 !!!cp ('t61');
3563 splice @{$self->{open_elements}}, $formatting_element_i_in_open;
3564 splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
3565 !!!next-token;
3566 return;
3567 }
3568
3569 ## Step 4
3570 my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];
3571
3572 ## Step 5
3573 my $furthest_block_parent = $furthest_block->[0]->parent_node;
3574 if (defined $furthest_block_parent) {
3575 !!!cp ('t62');
3576 $furthest_block_parent->remove_child ($furthest_block->[0]);
3577 }
3578
3579 ## Step 6
3580 my $bookmark_prev_el
3581 = $active_formatting_elements->[$formatting_element_i_in_active - 1]
3582 ->[0];
3583
3584 ## Step 7
3585 my $node = $furthest_block;
3586 my $node_i_in_open = $furthest_block_i_in_open;
3587 my $last_node = $furthest_block;
3588 S7: {
3589 ## Step 1
3590 $node_i_in_open--;
3591 $node = $self->{open_elements}->[$node_i_in_open];
3592
3593 ## Step 2
3594 my $node_i_in_active;
3595 S7S2: {
3596 for (reverse 0..$#$active_formatting_elements) {
3597 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
3598 !!!cp ('t63');
3599 $node_i_in_active = $_;
3600 last S7S2;
3601 }
3602 }
3603 splice @{$self->{open_elements}}, $node_i_in_open, 1;
3604 redo S7;
3605 } # S7S2
3606
3607 ## Step 3
3608 last S7 if $node->[0] eq $formatting_element->[0];
3609
3610 ## Step 4
3611 if ($last_node->[0] eq $furthest_block->[0]) {
3612 !!!cp ('t64');
3613 $bookmark_prev_el = $node->[0];
3614 }
3615
3616 ## Step 5
3617 if ($node->[0]->has_child_nodes ()) {
3618 !!!cp ('t65');
3619 my $clone = [$node->[0]->clone_node (0), $node->[1]];
3620 $active_formatting_elements->[$node_i_in_active] = $clone;
3621 $self->{open_elements}->[$node_i_in_open] = $clone;
3622 $node = $clone;
3623 }
3624
3625 ## Step 6
3626 $node->[0]->append_child ($last_node->[0]);
3627
3628 ## Step 7
3629 $last_node = $node;
3630
3631 ## Step 8
3632 redo S7;
3633 } # S7
3634
3635 ## Step 8
3636 if ($common_ancestor_node->[1] & TABLE_ROWS_EL) {
3637 my $foster_parent_element;
3638 my $next_sibling;
3639 OE: for (reverse 0..$#{$self->{open_elements}}) {
3640 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
3641 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3642 if (defined $parent and $parent->node_type == 1) {
3643 !!!cp ('t65.1');
3644 $foster_parent_element = $parent;
3645 $next_sibling = $self->{open_elements}->[$_]->[0];
3646 } else {
3647 !!!cp ('t65.2');
3648 $foster_parent_element
3649 = $self->{open_elements}->[$_ - 1]->[0];
3650 }
3651 last OE;
3652 }
3653 } # OE
3654 $foster_parent_element = $self->{open_elements}->[0]->[0]
3655 unless defined $foster_parent_element;
3656 $foster_parent_element->insert_before ($last_node->[0], $next_sibling);
3657 $open_tables->[-1]->[1] = 1; # tainted
3658 } else {
3659 !!!cp ('t65.3');
3660 $common_ancestor_node->[0]->append_child ($last_node->[0]);
3661 }
3662
3663 ## Step 9
3664 my $clone = [$formatting_element->[0]->clone_node (0),
3665 $formatting_element->[1]];
3666
3667 ## Step 10
3668 my @cn = @{$furthest_block->[0]->child_nodes};
3669 $clone->[0]->append_child ($_) for @cn;
3670
3671 ## Step 11
3672 $furthest_block->[0]->append_child ($clone->[0]);
3673
3674 ## Step 12
3675 my $i;
3676 AFE: for (reverse 0..$#$active_formatting_elements) {
3677 if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
3678 !!!cp ('t66');
3679 splice @$active_formatting_elements, $_, 1;
3680 $i-- and last AFE if defined $i;
3681 } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
3682 !!!cp ('t67');
3683 $i = $_;
3684 }
3685 } # AFE
3686 splice @$active_formatting_elements, $i + 1, 0, $clone;
3687
3688 ## Step 13
3689 undef $i;
3690 OE: for (reverse 0..$#{$self->{open_elements}}) {
3691 if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
3692 !!!cp ('t68');
3693 splice @{$self->{open_elements}}, $_, 1;
3694 $i-- and last OE if defined $i;
3695 } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
3696 !!!cp ('t69');
3697 $i = $_;
3698 }
3699 } # OE
3700 splice @{$self->{open_elements}}, $i + 1, 1, $clone;
3701
3702 ## Step 14
3703 redo FET;
3704 } # FET
3705 }; # $formatting_end_tag
3706
3707 $insert = my $insert_to_current = sub {
3708 $self->{open_elements}->[-1]->[0]->append_child ($_[0]);
3709 }; # $insert_to_current
3710
3711 my $insert_to_foster = sub {
3712 my $child = shift;
3713 if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
3714 # MUST
3715 my $foster_parent_element;
3716 my $next_sibling;
3717 OE: for (reverse 0..$#{$self->{open_elements}}) {
3718 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
3719 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
3720 if (defined $parent and $parent->node_type == 1) {
3721 !!!cp ('t70');
3722 $foster_parent_element = $parent;
3723 $next_sibling = $self->{open_elements}->[$_]->[0];
3724 } else {
3725 !!!cp ('t71');
3726 $foster_parent_element
3727 = $self->{open_elements}->[$_ - 1]->[0];
3728 }
3729 last OE;
3730 }
3731 } # OE
3732 $foster_parent_element = $self->{open_elements}->[0]->[0]
3733 unless defined $foster_parent_element;
3734 $foster_parent_element->insert_before
3735 ($child, $next_sibling);
3736 $open_tables->[-1]->[1] = 1; # tainted
3737 } else {
3738 !!!cp ('t72');
3739 $self->{open_elements}->[-1]->[0]->append_child ($child);
3740 }
3741 }; # $insert_to_foster
3742
3743 B: while (1) {
3744 if ($token->{type} == DOCTYPE_TOKEN) {
3745 !!!cp ('t73');
3746 !!!parse-error (type => 'DOCTYPE in the middle', token => $token);
3747 ## Ignore the token
3748 ## Stay in the phase
3749 !!!next-token;
3750 next B;
3751 } elsif ($token->{type} == START_TAG_TOKEN and
3752 $token->{tag_name} eq 'html') {
3753 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
3754 !!!cp ('t79');
3755 !!!parse-error (type => 'after html:html', token => $token);
3756 $self->{insertion_mode} = AFTER_BODY_IM;
3757 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
3758 !!!cp ('t80');
3759 !!!parse-error (type => 'after html:html', token => $token);
3760 $self->{insertion_mode} = AFTER_FRAMESET_IM;
3761 } else {
3762 !!!cp ('t81');
3763 }
3764
3765 !!!cp ('t82');
3766 !!!parse-error (type => 'not first start tag', token => $token);
3767 my $top_el = $self->{open_elements}->[0]->[0];
3768 for my $attr_name (keys %{$token->{attributes}}) {
3769 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
3770 !!!cp ('t84');
3771 $top_el->set_attribute_ns
3772 (undef, [undef, $attr_name],
3773 $token->{attributes}->{$attr_name}->{value});
3774 }
3775 }
3776 !!!nack ('t84.1');
3777 !!!next-token;
3778 next B;
3779 } elsif ($token->{type} == COMMENT_TOKEN) {
3780 my $comment = $self->{document}->create_comment ($token->{data});
3781 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
3782 !!!cp ('t85');
3783 $self->{document}->append_child ($comment);
3784 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
3785 !!!cp ('t86');
3786 $self->{open_elements}->[0]->[0]->append_child ($comment);
3787 } else {
3788 !!!cp ('t87');
3789 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3790 }
3791 !!!next-token;
3792 next B;
3793 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
3794 if ($token->{type} == CHARACTER_TOKEN) {
3795 !!!cp ('t87.1');
3796 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3797 !!!next-token;
3798 next B;
3799 } elsif ($token->{type} == START_TAG_TOKEN) {
3800 if ((not {mglyph => 1, malignmark => 1}->{$token->{tag_name}} and
3801 $self->{open_elements}->[-1]->[1] & FOREIGN_FLOW_CONTENT_EL) or
3802 not ($self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
3803 ($token->{tag_name} eq 'svg' and
3804 $self->{open_elements}->[-1]->[1] & MML_AXML_EL)) {
3805 ## NOTE: "using the rules for secondary insertion mode"then"continue"
3806 !!!cp ('t87.2');
3807 #
3808 } elsif ({
3809 b => 1, big => 1, blockquote => 1, body => 1, br => 1,
3810 center => 1, code => 1, dd => 1, div => 1, dl => 1, em => 1,
3811 embed => 1, font => 1, h1 => 1, h2 => 1, h3 => 1, ## No h4!
3812 h5 => 1, h6 => 1, head => 1, hr => 1, i => 1, img => 1,
3813 li => 1, menu => 1, meta => 1, nobr => 1, p => 1, pre => 1,
3814 ruby => 1, s => 1, small => 1, span => 1, strong => 1,
3815 sub => 1, sup => 1, table => 1, tt => 1, u => 1, ul => 1,
3816 var => 1,
3817 }->{$token->{tag_name}}) {
3818 !!!cp ('t87.2');
3819 !!!parse-error (type => 'not closed',
3820 value => $self->{open_elements}->[-1]->[0]
3821 ->manakai_local_name,
3822 token => $token);
3823
3824 pop @{$self->{open_elements}}
3825 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
3826
3827 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
3828 ## Reprocess.
3829 next B;
3830 } else {
3831 my $nsuri = $self->{open_elements}->[-1]->[0]->namespace_uri;
3832 my $tag_name = $token->{tag_name};
3833 if ($nsuri eq $SVG_NS) {
3834 $tag_name = {
3835 altglyph => 'altGlyph',
3836 altglyphdef => 'altGlyphDef',
3837 altglyphitem => 'altGlyphItem',
3838 animatecolor => 'animateColor',
3839 animatemotion => 'animateMotion',
3840 animatetransform => 'animateTransform',
3841 clippath => 'clipPath',
3842 feblend => 'feBlend',
3843 fecolormatrix => 'feColorMatrix',
3844 fecomponenttransfer => 'feComponentTransfer',
3845 fecomposite => 'feComposite',
3846 feconvolvematrix => 'feConvolveMatrix',
3847 fediffuselighting => 'feDiffuseLighting',
3848 fedisplacementmap => 'feDisplacementMap',
3849 fedistantlight => 'feDistantLight',
3850 feflood => 'feFlood',
3851 fefunca => 'feFuncA',
3852 fefuncb => 'feFuncB',
3853 fefuncg => 'feFuncG',
3854 fefuncr => 'feFuncR',
3855 fegaussianblur => 'feGaussianBlur',
3856 feimage => 'feImage',
3857 femerge => 'feMerge',
3858 femergenode => 'feMergeNode',
3859 femorphology => 'feMorphology',
3860 feoffset => 'feOffset',
3861 fepointlight => 'fePointLight',
3862 fespecularlighting => 'feSpecularLighting',
3863 fespotlight => 'feSpotLight',
3864 fetile => 'feTile',
3865 feturbulence => 'feTurbulence',
3866 foreignobject => 'foreignObject',
3867 glyphref => 'glyphRef',
3868 lineargradient => 'linearGradient',
3869 radialgradient => 'radialGradient',
3870 #solidcolor => 'solidColor', ## NOTE: Commented in spec (SVG1.2)
3871 textpath => 'textPath',
3872 }->{$tag_name} || $tag_name;
3873 }
3874
3875 ## "adjust SVG attributes" (SVG only) - done in insert-element-f
3876
3877 ## "adjust foreign attributes" - done in insert-element-f
3878
3879 !!!insert-element-f ($nsuri, $tag_name, $token->{attributes}, $token);
3880
3881 if ($self->{self_closing}) {
3882 pop @{$self->{open_elements}};
3883 !!!ack ('t87.3');
3884 } else {
3885 !!!cp ('t87.4');
3886 }
3887
3888 !!!next-token;
3889 next B;
3890 }
3891 } elsif ($token->{type} == END_TAG_TOKEN) {
3892 ## NOTE: "using the rules for secondary insertion mode" then "continue"
3893 !!!cp ('t87.5');
3894 #
3895 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
3896 ## NOTE: "using the rules for secondary insertion mode" then "continue"
3897 !!!cp ('t87.6');
3898 #
3899 ## TODO: ...
3900 } else {
3901 die "$0: $token->{type}: Unknown token type";
3902 }
3903 }
3904
3905 if ($self->{insertion_mode} & HEAD_IMS) {
3906 if ($token->{type} == CHARACTER_TOKEN) {
3907 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3908 unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3909 !!!cp ('t88.2');
3910 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3911 } else {
3912 !!!cp ('t88.1');
3913 ## Ignore the token.
3914 !!!next-token;
3915 next B;
3916 }
3917 unless (length $token->{data}) {
3918 !!!cp ('t88');
3919 !!!next-token;
3920 next B;
3921 }
3922 }
3923
3924 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3925 !!!cp ('t89');
3926 ## As if <head>
3927 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
3928 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3929 push @{$self->{open_elements}},
3930 [$self->{head_element}, $el_category->{head}];
3931
3932 ## Reprocess in the "in head" insertion mode...
3933 pop @{$self->{open_elements}};
3934
3935 ## Reprocess in the "after head" insertion mode...
3936 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3937 !!!cp ('t90');
3938 ## As if </noscript>
3939 pop @{$self->{open_elements}};
3940 !!!parse-error (type => 'in noscript:#character', token => $token);
3941
3942 ## Reprocess in the "in head" insertion mode...
3943 ## As if </head>
3944 pop @{$self->{open_elements}};
3945
3946 ## Reprocess in the "after head" insertion mode...
3947 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3948 !!!cp ('t91');
3949 pop @{$self->{open_elements}};
3950
3951 ## Reprocess in the "after head" insertion mode...
3952 } else {
3953 !!!cp ('t92');
3954 }
3955
3956 ## "after head" insertion mode
3957 ## As if <body>
3958 !!!insert-element ('body',, $token);
3959 $self->{insertion_mode} = IN_BODY_IM;
3960 ## reprocess
3961 next B;
3962 } elsif ($token->{type} == START_TAG_TOKEN) {
3963 if ($token->{tag_name} eq 'head') {
3964 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3965 !!!cp ('t93');
3966 !!!create-element ($self->{head_element}, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
3967 $self->{open_elements}->[-1]->[0]->append_child
3968 ($self->{head_element});
3969 push @{$self->{open_elements}},
3970 [$self->{head_element}, $el_category->{head}];
3971 $self->{insertion_mode} = IN_HEAD_IM;
3972 !!!nack ('t93.1');
3973 !!!next-token;
3974 next B;
3975 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3976 !!!cp ('t94');
3977 #
3978 } else {
3979 !!!cp ('t95');
3980 !!!parse-error (type => 'in head:head', token => $token); # or in head noscript
3981 ## Ignore the token
3982 !!!nack ('t95.1');
3983 !!!next-token;
3984 next B;
3985 }
3986 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3987 !!!cp ('t96');
3988 ## As if <head>
3989 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
3990 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3991 push @{$self->{open_elements}},
3992 [$self->{head_element}, $el_category->{head}];
3993
3994 $self->{insertion_mode} = IN_HEAD_IM;
3995 ## Reprocess in the "in head" insertion mode...
3996 } else {
3997 !!!cp ('t97');
3998 }
3999
4000 if ($token->{tag_name} eq 'base') {
4001 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4002 !!!cp ('t98');
4003 ## As if </noscript>
4004 pop @{$self->{open_elements}};
4005 !!!parse-error (type => 'in noscript:base', token => $token);
4006
4007 $self->{insertion_mode} = IN_HEAD_IM;
4008 ## Reprocess in the "in head" insertion mode...
4009 } else {
4010 !!!cp ('t99');
4011 }
4012
4013 ## NOTE: There is a "as if in head" code clone.
4014 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4015 !!!cp ('t100');
4016 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4017 push @{$self->{open_elements}},
4018 [$self->{head_element}, $el_category->{head}];
4019 } else {
4020 !!!cp ('t101');
4021 }
4022 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4023 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4024 pop @{$self->{open_elements}} # <head>
4025 if $self->{insertion_mode} == AFTER_HEAD_IM;
4026 !!!nack ('t101.1');
4027 !!!next-token;
4028 next B;
4029 } elsif ($token->{tag_name} eq 'link') {
4030 ## NOTE: There is a "as if in head" code clone.
4031 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4032 !!!cp ('t102');
4033 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4034 push @{$self->{open_elements}},
4035 [$self->{head_element}, $el_category->{head}];
4036 } else {
4037 !!!cp ('t103');
4038 }
4039 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4040 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4041 pop @{$self->{open_elements}} # <head>
4042 if $self->{insertion_mode} == AFTER_HEAD_IM;
4043 !!!ack ('t103.1');
4044 !!!next-token;
4045 next B;
4046 } elsif ($token->{tag_name} eq 'meta') {
4047 ## NOTE: There is a "as if in head" code clone.
4048 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4049 !!!cp ('t104');
4050 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4051 push @{$self->{open_elements}},
4052 [$self->{head_element}, $el_category->{head}];
4053 } else {
4054 !!!cp ('t105');
4055 }
4056 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4057 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4058
4059 unless ($self->{confident}) {
4060 if ($token->{attributes}->{charset}) { ## TODO: And if supported
4061 !!!cp ('t106');
4062 $self->{change_encoding}
4063 ->($self, $token->{attributes}->{charset}->{value},
4064 $token);
4065
4066 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4067 ->set_user_data (manakai_has_reference =>
4068 $token->{attributes}->{charset}
4069 ->{has_reference});
4070 } elsif ($token->{attributes}->{content}) {
4071 ## ISSUE: The algorithm name in the spec was incorrect, so it is not linked to its definition.
4072 if ($token->{attributes}->{content}->{value}
4073 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4074 [\x09-\x0D\x20]*=
4075 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4076 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
4077 !!!cp ('t107');
4078 $self->{change_encoding}
4079 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
4080 $token);
4081 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4082 ->set_user_data (manakai_has_reference =>
4083 $token->{attributes}->{content}
4084 ->{has_reference});
4085 } else {
4086 !!!cp ('t108');
4087 }
4088 }
4089 } else {
4090 if ($token->{attributes}->{charset}) {
4091 !!!cp ('t109');
4092 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4093 ->set_user_data (manakai_has_reference =>
4094 $token->{attributes}->{charset}
4095 ->{has_reference});
4096 }
4097 if ($token->{attributes}->{content}) {
4098 !!!cp ('t110');
4099 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4100 ->set_user_data (manakai_has_reference =>
4101 $token->{attributes}->{content}
4102 ->{has_reference});
4103 }
4104 }
4105
4106 pop @{$self->{open_elements}} # <head>
4107 if $self->{insertion_mode} == AFTER_HEAD_IM;
4108 !!!ack ('t110.1');
4109 !!!next-token;
4110 next B;
4111 } elsif ($token->{tag_name} eq 'title') {
4112 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4113 !!!cp ('t111');
4114 ## As if </noscript>
4115 pop @{$self->{open_elements}};
4116 !!!parse-error (type => 'in noscript:title', token => $token);
4117
4118 $self->{insertion_mode} = IN_HEAD_IM;
4119 ## Reprocess in the "in head" insertion mode...
4120 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4121 !!!cp ('t112');
4122 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4123 push @{$self->{open_elements}},
4124 [$self->{head_element}, $el_category->{head}];
4125 } else {
4126 !!!cp ('t113');
4127 }
4128
4129 ## NOTE: There is a "as if in head" code clone.
4130 my $parent = defined $self->{head_element} ? $self->{head_element}
4131 : $self->{open_elements}->[-1]->[0];
4132 $parse_rcdata->(RCDATA_CONTENT_MODEL);
4133 pop @{$self->{open_elements}} # <head>
4134 if $self->{insertion_mode} == AFTER_HEAD_IM;
4135 next B;
4136 } elsif ($token->{tag_name} eq 'style') {
4137 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
4138 ## insertion mode IN_HEAD_IM)
4139 ## NOTE: There is a "as if in head" code clone.
4140 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4141 !!!cp ('t114');
4142 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4143 push @{$self->{open_elements}},
4144 [$self->{head_element}, $el_category->{head}];
4145 } else {
4146 !!!cp ('t115');
4147 }
4148 $parse_rcdata->(CDATA_CONTENT_MODEL);
4149 pop @{$self->{open_elements}} # <head>
4150 if $self->{insertion_mode} == AFTER_HEAD_IM;
4151 next B;
4152 } elsif ($token->{tag_name} eq 'noscript') {
4153 if ($self->{insertion_mode} == IN_HEAD_IM) {
4154 !!!cp ('t116');
4155 ## NOTE: and scripting is disabled
4156 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4157 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
4158 !!!nack ('t116.1');
4159 !!!next-token;
4160 next B;
4161 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4162 !!!cp ('t117');
4163 !!!parse-error (type => 'in noscript:noscript', token => $token);
4164 ## Ignore the token
4165 !!!nack ('t117.1');
4166 !!!next-token;
4167 next B;
4168 } else {
4169 !!!cp ('t118');
4170 #
4171 }
4172 } elsif ($token->{tag_name} eq 'script') {
4173 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4174 !!!cp ('t119');
4175 ## As if </noscript>
4176 pop @{$self->{open_elements}};
4177 !!!parse-error (type => 'in noscript:script', token => $token);
4178
4179 $self->{insertion_mode} = IN_HEAD_IM;
4180 ## Reprocess in the "in head" insertion mode...
4181 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4182 !!!cp ('t120');
4183 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4184 push @{$self->{open_elements}},
4185 [$self->{head_element}, $el_category->{head}];
4186 } else {
4187 !!!cp ('t121');
4188 }
4189
4190 ## NOTE: There is a "as if in head" code clone.
4191 $script_start_tag->();
4192 pop @{$self->{open_elements}} # <head>
4193 if $self->{insertion_mode} == AFTER_HEAD_IM;
4194 next B;
4195 } elsif ($token->{tag_name} eq 'body' or
4196 $token->{tag_name} eq 'frameset') {
4197 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4198 !!!cp ('t122');
4199 ## As if </noscript>
4200 pop @{$self->{open_elements}};
4201 !!!parse-error (type => 'in noscript:'.$token->{tag_name}, token => $token);
4202
4203 ## Reprocess in the "in head" insertion mode...
4204 ## As if </head>
4205 pop @{$self->{open_elements}};
4206
4207 ## Reprocess in the "after head" insertion mode...
4208 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4209 !!!cp ('t124');
4210 pop @{$self->{open_elements}};
4211
4212 ## Reprocess in the "after head" insertion mode...
4213 } else {
4214 !!!cp ('t125');
4215 }
4216
4217 ## "after head" insertion mode
4218 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4219 if ($token->{tag_name} eq 'body') {
4220 !!!cp ('t126');
4221 $self->{insertion_mode} = IN_BODY_IM;
4222 } elsif ($token->{tag_name} eq 'frameset') {
4223 !!!cp ('t127');
4224 $self->{insertion_mode} = IN_FRAMESET_IM;
4225 } else {
4226 die "$0: tag name: $self->{tag_name}";
4227 }
4228 !!!nack ('t127.1');
4229 !!!next-token;
4230 next B;
4231 } else {
4232 !!!cp ('t128');
4233 #
4234 }
4235
4236 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4237 !!!cp ('t129');
4238 ## As if </noscript>
4239 pop @{$self->{open_elements}};
4240 !!!parse-error (type => 'in noscript:/'.$token->{tag_name}, token => $token);
4241
4242 ## Reprocess in the "in head" insertion mode...
4243 ## As if </head>
4244 pop @{$self->{open_elements}};
4245
4246 ## Reprocess in the "after head" insertion mode...
4247 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4248 !!!cp ('t130');
4249 ## As if </head>
4250 pop @{$self->{open_elements}};
4251
4252 ## Reprocess in the "after head" insertion mode...
4253 } else {
4254 !!!cp ('t131');
4255 }
4256
4257 ## "after head" insertion mode
4258 ## As if <body>
4259 !!!insert-element ('body',, $token);
4260 $self->{insertion_mode} = IN_BODY_IM;
4261 ## reprocess
4262 !!!ack-later;
4263 next B;
4264 } elsif ($token->{type} == END_TAG_TOKEN) {
4265 if ($token->{tag_name} eq 'head') {
4266 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4267 !!!cp ('t132');
4268 ## As if <head>
4269 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4270 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4271 push @{$self->{open_elements}},
4272 [$self->{head_element}, $el_category->{head}];
4273
4274 ## Reprocess in the "in head" insertion mode...
4275 pop @{$self->{open_elements}};
4276 $self->{insertion_mode} = AFTER_HEAD_IM;
4277 !!!next-token;
4278 next B;
4279 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4280 !!!cp ('t133');
4281 ## As if </noscript>
4282 pop @{$self->{open_elements}};
4283 !!!parse-error (type => 'in noscript:/head', token => $token);
4284
4285 ## Reprocess in the "in head" insertion mode...
4286 pop @{$self->{open_elements}};
4287 $self->{insertion_mode} = AFTER_HEAD_IM;
4288 !!!next-token;
4289 next B;
4290 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4291 !!!cp ('t134');
4292 pop @{$self->{open_elements}};
4293 $self->{insertion_mode} = AFTER_HEAD_IM;
4294 !!!next-token;
4295 next B;
4296 } else {
4297 !!!cp ('t135');
4298 #
4299 }
4300 } elsif ($token->{tag_name} eq 'noscript') {
4301 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4302 !!!cp ('t136');
4303 pop @{$self->{open_elements}};
4304 $self->{insertion_mode} = IN_HEAD_IM;
4305 !!!next-token;
4306 next B;
4307 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4308 !!!cp ('t137');
4309 !!!parse-error (type => 'unmatched end tag:noscript', token => $token);
4310 ## Ignore the token ## ISSUE: An issue in the spec.
4311 !!!next-token;
4312 next B;
4313 } else {
4314 !!!cp ('t138');
4315 #
4316 }
4317 } elsif ({
4318 body => 1, html => 1,
4319 }->{$token->{tag_name}}) {
4320 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4321 !!!cp ('t139');
4322 ## As if <head>
4323 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4324 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4325 push @{$self->{open_elements}},
4326 [$self->{head_element}, $el_category->{head}];
4327
4328 $self->{insertion_mode} = IN_HEAD_IM;
4329 ## Reprocess in the "in head" insertion mode...
4330 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4331 !!!cp ('t140');
4332 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4333 ## Ignore the token
4334 !!!next-token;
4335 next B;
4336 } else {
4337 !!!cp ('t141');
4338 }
4339
4340 #
4341 } elsif ({
4342 p => 1, br => 1,
4343 }->{$token->{tag_name}}) {
4344 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4345 !!!cp ('t142');
4346 ## As if <head>
4347 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4348 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4349 push @{$self->{open_elements}},
4350 [$self->{head_element}, $el_category->{head}];
4351
4352 $self->{insertion_mode} = IN_HEAD_IM;
4353 ## Reprocess in the "in head" insertion mode...
4354 } else {
4355 !!!cp ('t143');
4356 }
4357
4358 #
4359 } else {
4360 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4361 !!!cp ('t144');
4362 #
4363 } else {
4364 !!!cp ('t145');
4365 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4366 ## Ignore the token
4367 !!!next-token;
4368 next B;
4369 }
4370 }
4371
4372 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4373 !!!cp ('t146');
4374 ## As if </noscript>
4375 pop @{$self->{open_elements}};
4376 !!!parse-error (type => 'in noscript:/'.$token->{tag_name}, token => $token);
4377
4378 ## Reprocess in the "in head" insertion mode...
4379 ## As if </head>
4380 pop @{$self->{open_elements}};
4381
4382 ## Reprocess in the "after head" insertion mode...
4383 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4384 !!!cp ('t147');
4385 ## As if </head>
4386 pop @{$self->{open_elements}};
4387
4388 ## Reprocess in the "after head" insertion mode...
4389 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4390 ## ISSUE: This case cannot be reached?
4391 !!!cp ('t148');
4392 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4393 ## Ignore the token ## ISSUE: An issue in the spec.
4394 !!!next-token;
4395 next B;
4396 } else {
4397 !!!cp ('t149');
4398 }
4399
4400 ## "after head" insertion mode
4401 ## As if <body>
4402 !!!insert-element ('body',, $token);
4403 $self->{insertion_mode} = IN_BODY_IM;
4404 ## reprocess
4405 next B;
4406 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4407 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4408 !!!cp ('t149.1');
4409
4410 ## NOTE: As if <head>
4411 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4412 $self->{open_elements}->[-1]->[0]->append_child
4413 ($self->{head_element});
4414 #push @{$self->{open_elements}},
4415 # [$self->{head_element}, $el_category->{head}];
4416 #$self->{insertion_mode} = IN_HEAD_IM;
4417 ## NOTE: Reprocess.
4418
4419 ## NOTE: As if </head>
4420 #pop @{$self->{open_elements}};
4421 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4422 ## NOTE: Reprocess.
4423
4424 #
4425 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4426 !!!cp ('t149.2');
4427
4428 ## NOTE: As if </head>
4429 pop @{$self->{open_elements}};
4430 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4431 ## NOTE: Reprocess.
4432
4433 #
4434 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4435 !!!cp ('t149.3');
4436
4437 !!!parse-error (type => 'in noscript:#eof', token => $token);
4438
4439 ## As if </noscript>
4440 pop @{$self->{open_elements}};
4441 #$self->{insertion_mode} = IN_HEAD_IM;
4442 ## NOTE: Reprocess.
4443
4444 ## NOTE: As if </head>
4445 pop @{$self->{open_elements}};
4446 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4447 ## NOTE: Reprocess.
4448
4449 #
4450 } else {
4451 !!!cp ('t149.4');
4452 #
4453 }
4454
4455 ## NOTE: As if <body>
4456 !!!insert-element ('body',, $token);
4457 $self->{insertion_mode} = IN_BODY_IM;
4458 ## NOTE: Reprocess.
4459 next B;
4460 } else {
4461 die "$0: $token->{type}: Unknown token type";
4462 }
4463
4464 ## ISSUE: An issue in the spec.
4465 } elsif ($self->{insertion_mode} & BODY_IMS) {
4466 if ($token->{type} == CHARACTER_TOKEN) {
4467 !!!cp ('t150');
4468 ## NOTE: There is a code clone of "character in body".
4469 $reconstruct_active_formatting_elements->($insert_to_current);
4470
4471 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4472
4473 !!!next-token;
4474 next B;
4475 } elsif ($token->{type} == START_TAG_TOKEN) {
4476 if ({
4477 caption => 1, col => 1, colgroup => 1, tbody => 1,
4478 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
4479 }->{$token->{tag_name}}) {
4480 if ($self->{insertion_mode} == IN_CELL_IM) {
4481 ## have an element in table scope
4482 for (reverse 0..$#{$self->{open_elements}}) {
4483 my $node = $self->{open_elements}->[$_];
4484 if ($node->[1] & TABLE_CELL_EL) {
4485 !!!cp ('t151');
4486
4487 ## Close the cell
4488 !!!back-token; # <x>
4489 $token = {type => END_TAG_TOKEN,
4490 tag_name => $node->[0]->manakai_local_name,
4491 line => $token->{line},
4492 column => $token->{column}};
4493 next B;
4494 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4495 !!!cp ('t152');
4496 ## ISSUE: This case can never be reached, maybe.
4497 last;
4498 }
4499 }
4500
4501 !!!cp ('t153');
4502 !!!parse-error (type => 'start tag not allowed',
4503 value => $token->{tag_name}, token => $token);
4504 ## Ignore the token
4505 !!!nack ('t153.1');
4506 !!!next-token;
4507 next B;
4508 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4509 !!!parse-error (type => 'not closed:caption', token => $token);
4510
4511 ## NOTE: As if </caption>.
4512 ## have a table element in table scope
4513 my $i;
4514 INSCOPE: {
4515 for (reverse 0..$#{$self->{open_elements}}) {
4516 my $node = $self->{open_elements}->[$_];
4517 if ($node->[1] & CAPTION_EL) {
4518 !!!cp ('t155');
4519 $i = $_;
4520 last INSCOPE;
4521 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4522 !!!cp ('t156');
4523 last;
4524 }
4525 }
4526
4527 !!!cp ('t157');
4528 !!!parse-error (type => 'start tag not allowed',
4529 value => $token->{tag_name}, token => $token);
4530 ## Ignore the token
4531 !!!nack ('t157.1');
4532 !!!next-token;
4533 next B;
4534 } # INSCOPE
4535
4536 ## generate implied end tags
4537 while ($self->{open_elements}->[-1]->[1]
4538 & END_TAG_OPTIONAL_EL) {
4539 !!!cp ('t158');
4540 pop @{$self->{open_elements}};
4541 }
4542
4543 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4544 !!!cp ('t159');
4545 !!!parse-error (type => 'not closed',
4546 value => $self->{open_elements}->[-1]->[0]
4547 ->manakai_local_name,
4548 token => $token);
4549 } else {
4550 !!!cp ('t160');
4551 }
4552
4553 splice @{$self->{open_elements}}, $i;
4554
4555 $clear_up_to_marker->();
4556
4557 $self->{insertion_mode} = IN_TABLE_IM;
4558
4559 ## reprocess
4560 !!!ack-later;
4561 next B;
4562 } else {
4563 !!!cp ('t161');
4564 #
4565 }
4566 } else {
4567 !!!cp ('t162');
4568 #
4569 }
4570 } elsif ($token->{type} == END_TAG_TOKEN) {
4571 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
4572 if ($self->{insertion_mode} == IN_CELL_IM) {
4573 ## have an element in table scope
4574 my $i;
4575 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4576 my $node = $self->{open_elements}->[$_];
4577 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
4578 !!!cp ('t163');
4579 $i = $_;
4580 last INSCOPE;
4581 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4582 !!!cp ('t164');
4583 last INSCOPE;
4584 }
4585 } # INSCOPE
4586 unless (defined $i) {
4587 !!!cp ('t165');
4588 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4589 ## Ignore the token
4590 !!!next-token;
4591 next B;
4592 }
4593
4594 ## generate implied end tags
4595 while ($self->{open_elements}->[-1]->[1]
4596 & END_TAG_OPTIONAL_EL) {
4597 !!!cp ('t166');
4598 pop @{$self->{open_elements}};
4599 }
4600
4601 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
4602 ne $token->{tag_name}) {
4603 !!!cp ('t167');
4604 !!!parse-error (type => 'not closed',
4605 value => $self->{open_elements}->[-1]->[0]
4606 ->manakai_local_name,
4607 token => $token);
4608 } else {
4609 !!!cp ('t168');
4610 }
4611
4612 splice @{$self->{open_elements}}, $i;
4613
4614 $clear_up_to_marker->();
4615
4616 $self->{insertion_mode} = IN_ROW_IM;
4617
4618 !!!next-token;
4619 next B;
4620 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4621 !!!cp ('t169');
4622 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4623 ## Ignore the token
4624 !!!next-token;
4625 next B;
4626 } else {
4627 !!!cp ('t170');
4628 #
4629 }
4630 } elsif ($token->{tag_name} eq 'caption') {
4631 if ($self->{insertion_mode} == IN_CAPTION_IM) {
4632 ## have a table element in table scope
4633 my $i;
4634 INSCOPE: {
4635 for (reverse 0..$#{$self->{open_elements}}) {
4636 my $node = $self->{open_elements}->[$_];
4637 if ($node->[1] & CAPTION_EL) {
4638 !!!cp ('t171');
4639 $i = $_;
4640 last INSCOPE;
4641 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4642 !!!cp ('t172');
4643 last;
4644 }
4645 }
4646
4647 !!!cp ('t173');
4648 !!!parse-error (type => 'unmatched end tag',
4649 value => $token->{tag_name}, token => $token);
4650 ## Ignore the token
4651 !!!next-token;
4652 next B;
4653 } # INSCOPE
4654
4655 ## generate implied end tags
4656 while ($self->{open_elements}->[-1]->[1]
4657 & END_TAG_OPTIONAL_EL) {
4658 !!!cp ('t174');
4659 pop @{$self->{open_elements}};
4660 }
4661
4662 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4663 !!!cp ('t175');
4664 !!!parse-error (type => 'not closed',
4665 value => $self->{open_elements}->[-1]->[0]
4666 ->manakai_local_name,
4667 token => $token);
4668 } else {
4669 !!!cp ('t176');
4670 }
4671
4672 splice @{$self->{open_elements}}, $i;
4673
4674 $clear_up_to_marker->();
4675
4676 $self->{insertion_mode} = IN_TABLE_IM;
4677
4678 !!!next-token;
4679 next B;
4680 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
4681 !!!cp ('t177');
4682 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4683 ## Ignore the token
4684 !!!next-token;
4685 next B;
4686 } else {
4687 !!!cp ('t178');
4688 #
4689 }
4690 } elsif ({
4691 table => 1, tbody => 1, tfoot => 1,
4692 thead => 1, tr => 1,
4693 }->{$token->{tag_name}} and
4694 $self->{insertion_mode} == IN_CELL_IM) {
4695 ## have an element in table scope
4696 my $i;
4697 my $tn;
4698 INSCOPE: {
4699 for (reverse 0..$#{$self->{open_elements}}) {
4700 my $node = $self->{open_elements}->[$_];
4701 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
4702 !!!cp ('t179');
4703 $i = $_;
4704
4705 ## Close the cell
4706 !!!back-token; # </x>
4707 $token = {type => END_TAG_TOKEN, tag_name => $tn,
4708 line => $token->{line},
4709 column => $token->{column}};
4710 next B;
4711 } elsif ($node->[1] & TABLE_CELL_EL) {
4712 !!!cp ('t180');
4713 $tn = $node->[0]->manakai_local_name;
4714 ## NOTE: There is exactly one |td| or |th| element
4715 ## in scope in the stack of open elements by definition.
4716 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4717 ## ISSUE: Can this be reached?
4718 !!!cp ('t181');
4719 last;
4720 }
4721 }
4722
4723 !!!cp ('t182');
4724 !!!parse-error (type => 'unmatched end tag',
4725 value => $token->{tag_name}, token => $token);
4726 ## Ignore the token
4727 !!!next-token;
4728 next B;
4729 } # INSCOPE
4730 } elsif ($token->{tag_name} eq 'table' and
4731 $self->{insertion_mode} == IN_CAPTION_IM) {
4732 !!!parse-error (type => 'not closed:caption', token => $token);
4733
4734 ## As if </caption>
4735 ## have a table element in table scope
4736 my $i;
4737 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4738 my $node = $self->{open_elements}->[$_];
4739 if ($node->[1] & CAPTION_EL) {
4740 !!!cp ('t184');
4741 $i = $_;
4742 last INSCOPE;
4743 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4744 !!!cp ('t185');
4745 last INSCOPE;
4746 }
4747 } # INSCOPE
4748 unless (defined $i) {
4749 !!!cp ('t186');
4750 !!!parse-error (type => 'unmatched end tag:caption', token => $token);
4751 ## Ignore the token
4752 !!!next-token;
4753 next B;
4754 }
4755
4756 ## generate implied end tags
4757 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
4758 !!!cp ('t187');
4759 pop @{$self->{open_elements}};
4760 }
4761
4762 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4763 !!!cp ('t188');
4764 !!!parse-error (type => 'not closed',
4765 value => $self->{open_elements}->[-1]->[0]
4766 ->manakai_local_name,
4767 token => $token);
4768 } else {
4769 !!!cp ('t189');
4770 }
4771
4772 splice @{$self->{open_elements}}, $i;
4773
4774 $clear_up_to_marker->();
4775
4776 $self->{insertion_mode} = IN_TABLE_IM;
4777
4778 ## reprocess
4779 next B;
4780 } elsif ({
4781 body => 1, col => 1, colgroup => 1, html => 1,
4782 }->{$token->{tag_name}}) {
4783 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
4784 !!!cp ('t190');
4785 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4786 ## Ignore the token
4787 !!!next-token;
4788 next B;
4789 } else {
4790 !!!cp ('t191');
4791 #
4792 }
4793 } elsif ({
4794 tbody => 1, tfoot => 1,
4795 thead => 1, tr => 1,
4796 }->{$token->{tag_name}} and
4797 $self->{insertion_mode} == IN_CAPTION_IM) {
4798 !!!cp ('t192');
4799 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4800 ## Ignore the token
4801 !!!next-token;
4802 next B;
4803 } else {
4804 !!!cp ('t193');
4805 #
4806 }
4807 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4808 for my $entry (@{$self->{open_elements}}) {
4809 unless ($entry->[1] & ALL_END_TAG_OPTIONAL_EL) {
4810 !!!cp ('t75');
4811 !!!parse-error (type => 'in body:#eof', token => $token);
4812 last;
4813 }
4814 }
4815
4816 ## Stop parsing.
4817 last B;
4818 } else {
4819 die "$0: $token->{type}: Unknown token type";
4820 }
4821
4822 $insert = $insert_to_current;
4823 #
4824 } elsif ($self->{insertion_mode} & TABLE_IMS) {
4825 if ($token->{type} == CHARACTER_TOKEN) {
4826 if (not $open_tables->[-1]->[1] and # tainted
4827 $token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4828 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4829
4830 unless (length $token->{data}) {
4831 !!!cp ('t194');
4832 !!!next-token;
4833 next B;
4834 } else {
4835 !!!cp ('t195');
4836 }
4837 }
4838
4839 !!!parse-error (type => 'in table:#character', token => $token);
4840
4841 ## As if in body, but insert into foster parent element
4842 ## ISSUE: Spec says that "whenever a node would be inserted
4843 ## into the current node" while characters might not
4844 ## result in a new Text node.
4845 $reconstruct_active_formatting_elements->($insert_to_foster);
4846
4847 if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
4848 # MUST
4849 my $foster_parent_element;
4850 my $next_sibling;
4851 my $prev_sibling;
4852 OE: for (reverse 0..$#{$self->{open_elements}}) {
4853 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
4854 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4855 if (defined $parent and $parent->node_type == 1) {
4856 !!!cp ('t196');
4857 $foster_parent_element = $parent;
4858 $next_sibling = $self->{open_elements}->[$_]->[0];
4859 $prev_sibling = $next_sibling->previous_sibling;
4860 } else {
4861 !!!cp ('t197');
4862 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
4863 $prev_sibling = $foster_parent_element->last_child;
4864 }
4865 last OE;
4866 }
4867 } # OE
4868 $foster_parent_element = $self->{open_elements}->[0]->[0] and
4869 $prev_sibling = $foster_parent_element->last_child
4870 unless defined $foster_parent_element;
4871 if (defined $prev_sibling and
4872 $prev_sibling->node_type == 3) {
4873 !!!cp ('t198');
4874 $prev_sibling->manakai_append_text ($token->{data});
4875 } else {
4876 !!!cp ('t199');
4877 $foster_parent_element->insert_before
4878 ($self->{document}->create_text_node ($token->{data}),
4879 $next_sibling);
4880 }
4881 $open_tables->[-1]->[1] = 1; # tainted
4882 } else {
4883 !!!cp ('t200');
4884 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4885 }
4886
4887 !!!next-token;
4888 next B;
4889 } elsif ($token->{type} == START_TAG_TOKEN) {
4890 if ({
4891 tr => ($self->{insertion_mode} != IN_ROW_IM),
4892 th => 1, td => 1,
4893 }->{$token->{tag_name}}) {
4894 if ($self->{insertion_mode} == IN_TABLE_IM) {
4895 ## Clear back to table context
4896 while (not ($self->{open_elements}->[-1]->[1]
4897 & TABLE_SCOPING_EL)) {
4898 !!!cp ('t201');
4899 pop @{$self->{open_elements}};
4900 }
4901
4902 !!!insert-element ('tbody',, $token);
4903 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4904 ## reprocess in the "in table body" insertion mode...
4905 }
4906
4907 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
4908 unless ($token->{tag_name} eq 'tr') {
4909 !!!cp ('t202');
4910 !!!parse-error (type => 'missing start tag:tr', token => $token);
4911 }
4912
4913 ## Clear back to table body context
4914 while (not ($self->{open_elements}->[-1]->[1]
4915 & TABLE_ROWS_SCOPING_EL)) {
4916 !!!cp ('t203');
4917 ## ISSUE: Can this case be reached?
4918 pop @{$self->{open_elements}};
4919 }
4920
4921 $self->{insertion_mode} = IN_ROW_IM;
4922 if ($token->{tag_name} eq 'tr') {
4923 !!!cp ('t204');
4924 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4925 !!!nack ('t204');
4926 !!!next-token;
4927 next B;
4928 } else {
4929 !!!cp ('t205');
4930 !!!insert-element ('tr',, $token);
4931 ## reprocess in the "in row" insertion mode
4932 }
4933 } else {
4934 !!!cp ('t206');
4935 }
4936
4937 ## Clear back to table row context
4938 while (not ($self->{open_elements}->[-1]->[1]
4939 & TABLE_ROW_SCOPING_EL)) {
4940 !!!cp ('t207');
4941 pop @{$self->{open_elements}};
4942 }
4943
4944 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4945 $self->{insertion_mode} = IN_CELL_IM;
4946
4947 push @$active_formatting_elements, ['#marker', ''];
4948
4949 !!!nack ('t207.1');
4950 !!!next-token;
4951 next B;
4952 } elsif ({
4953 caption => 1, col => 1, colgroup => 1,
4954 tbody => 1, tfoot => 1, thead => 1,
4955 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
4956 }->{$token->{tag_name}}) {
4957 if ($self->{insertion_mode} == IN_ROW_IM) {
4958 ## As if </tr>
4959 ## have an element in table scope
4960 my $i;
4961 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4962 my $node = $self->{open_elements}->[$_];
4963 if ($node->[1] & TABLE_ROW_EL) {
4964 !!!cp ('t208');
4965 $i = $_;
4966 last INSCOPE;
4967 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4968 !!!cp ('t209');
4969 last INSCOPE;
4970 }
4971 } # INSCOPE
4972 unless (defined $i) {
4973 !!!cp ('t210');
4974 ## TODO: This type is wrong.
4975 !!!parse-error (type => 'unmacthed end tag:'.$token->{tag_name}, token => $token);
4976 ## Ignore the token
4977 !!!nack ('t210.1');
4978 !!!next-token;
4979 next B;
4980 }
4981
4982 ## Clear back to table row context
4983 while (not ($self->{open_elements}->[-1]->[1]
4984 & TABLE_ROW_SCOPING_EL)) {
4985 !!!cp ('t211');
4986 ## ISSUE: Can this case be reached?
4987 pop @{$self->{open_elements}};
4988 }
4989
4990 pop @{$self->{open_elements}}; # tr
4991 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4992 if ($token->{tag_name} eq 'tr') {
4993 !!!cp ('t212');
4994 ## reprocess
4995 !!!ack-later;
4996 next B;
4997 } else {
4998 !!!cp ('t213');
4999 ## reprocess in the "in table body" insertion mode...
5000 }
5001 }
5002
5003 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5004 ## have an element in table scope
5005 my $i;
5006 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5007 my $node = $self->{open_elements}->[$_];
5008 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5009 !!!cp ('t214');
5010 $i = $_;
5011 last INSCOPE;
5012 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5013 !!!cp ('t215');
5014 last INSCOPE;
5015 }
5016 } # INSCOPE
5017 unless (defined $i) {
5018 !!!cp ('t216');
5019 ## TODO: This error type is wrong.
5020 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5021 ## Ignore the token
5022 !!!nack ('t216.1');
5023 !!!next-token;
5024 next B;
5025 }
5026
5027 ## Clear back to table body context
5028 while (not ($self->{open_elements}->[-1]->[1]
5029 & TABLE_ROWS_SCOPING_EL)) {
5030 !!!cp ('t217');
5031 ## ISSUE: Can this state be reached?
5032 pop @{$self->{open_elements}};
5033 }
5034
5035 ## As if <{current node}>
5036 ## have an element in table scope
5037 ## true by definition
5038
5039 ## Clear back to table body context
5040 ## nop by definition
5041
5042 pop @{$self->{open_elements}};
5043 $self->{insertion_mode} = IN_TABLE_IM;
5044 ## reprocess in "in table" insertion mode...
5045 } else {
5046 !!!cp ('t218');
5047 }
5048
5049 if ($token->{tag_name} eq 'col') {
5050 ## Clear back to table context
5051 while (not ($self->{open_elements}->[-1]->[1]
5052 & TABLE_SCOPING_EL)) {
5053 !!!cp ('t219');
5054 ## ISSUE: Can this state be reached?
5055 pop @{$self->{open_elements}};
5056 }
5057
5058 !!!insert-element ('colgroup',, $token);
5059 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
5060 ## reprocess
5061 !!!ack-later;
5062 next B;
5063 } elsif ({
5064 caption => 1,
5065 colgroup => 1,
5066 tbody => 1, tfoot => 1, thead => 1,
5067 }->{$token->{tag_name}}) {
5068 ## Clear back to table context
5069 while (not ($self->{open_elements}->[-1]->[1]
5070 & TABLE_SCOPING_EL)) {
5071 !!!cp ('t220');
5072 ## ISSUE: Can this state be reached?
5073 pop @{$self->{open_elements}};
5074 }
5075
5076 push @$active_formatting_elements, ['#marker', '']
5077 if $token->{tag_name} eq 'caption';
5078
5079 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5080 $self->{insertion_mode} = {
5081 caption => IN_CAPTION_IM,
5082 colgroup => IN_COLUMN_GROUP_IM,
5083 tbody => IN_TABLE_BODY_IM,
5084 tfoot => IN_TABLE_BODY_IM,
5085 thead => IN_TABLE_BODY_IM,
5086 }->{$token->{tag_name}};
5087 !!!next-token;
5088 !!!nack ('t220.1');
5089 next B;
5090 } else {
5091 die "$0: in table: <>: $token->{tag_name}";
5092 }
5093 } elsif ($token->{tag_name} eq 'table') {
5094 !!!parse-error (type => 'not closed',
5095 value => $self->{open_elements}->[-1]->[0]
5096 ->manakai_local_name,
5097 token => $token);
5098
5099 ## As if </table>
5100 ## have a table element in table scope
5101 my $i;
5102 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5103 my $node = $self->{open_elements}->[$_];
5104 if ($node->[1] & TABLE_EL) {
5105 !!!cp ('t221');
5106 $i = $_;
5107 last INSCOPE;
5108 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5109 !!!cp ('t222');
5110 last INSCOPE;
5111 }
5112 } # INSCOPE
5113 unless (defined $i) {
5114 !!!cp ('t223');
5115 ## TODO: The following is wrong, maybe.
5116 !!!parse-error (type => 'unmatched end tag:table', token => $token);
5117 ## Ignore tokens </table><table>
5118 !!!nack ('t223.1');
5119 !!!next-token;
5120 next B;
5121 }
5122
5123 ## TODO: The following steps have been removed from the latest spec.
5124 ## generate implied end tags
5125 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5126 !!!cp ('t224');
5127 pop @{$self->{open_elements}};
5128 }
5129
5130 unless ($self->{open_elements}->[-1]->[1] & TABLE_EL) {
5131 !!!cp ('t225');
5132 ## NOTE: |<table><tr><table>|
5133 !!!parse-error (type => 'not closed',
5134 value => $self->{open_elements}->[-1]->[0]
5135 ->manakai_local_name,
5136 token => $token);
5137 } else {
5138 !!!cp ('t226');
5139 }
5140
5141 splice @{$self->{open_elements}}, $i;
5142 pop @{$open_tables};
5143
5144 $self->_reset_insertion_mode;
5145
5146 ## reprocess
5147 !!!ack-later;
5148 next B;
5149 } elsif ($token->{tag_name} eq 'style') {
5150 if (not $open_tables->[-1]->[1]) { # tainted
5151 !!!cp ('t227.8');
5152 ## NOTE: This is an "as if in head" code clone.
5153 $parse_rcdata->(CDATA_CONTENT_MODEL);
5154 next B;
5155 } else {
5156 !!!cp ('t227.7');
5157 #
5158 }
5159 } elsif ($token->{tag_name} eq 'script') {
5160 if (not $open_tables->[-1]->[1]) { # tainted
5161 !!!cp ('t227.6');
5162 ## NOTE: This is an "as if in head" code clone.
5163 $script_start_tag->();
5164 next B;
5165 } else {
5166 !!!cp ('t227.5');
5167 #
5168 }
5169 } elsif ($token->{tag_name} eq 'input') {
5170 if (not $open_tables->[-1]->[1]) { # tainted
5171 if ($token->{attributes}->{type}) { ## TODO: case
5172 my $type = lc $token->{attributes}->{type}->{value};
5173 if ($type eq 'hidden') {
5174 !!!cp ('t227.3');
5175 !!!parse-error (type => 'in table:'.$token->{tag_name}, token => $token);
5176
5177 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5178
5179 ## TODO: form element pointer
5180
5181 pop @{$self->{open_elements}};
5182
5183 !!!next-token;
5184 !!!ack ('t227.2.1');
5185 next B;
5186 } else {
5187 !!!cp ('t227.2');
5188 #
5189 }
5190 } else {
5191 !!!cp ('t227.1');
5192 #
5193 }
5194 } else {
5195 !!!cp ('t227.4');
5196 #
5197 }
5198 } else {
5199 !!!cp ('t227');
5200 #
5201 }
5202
5203 !!!parse-error (type => 'in table:'.$token->{tag_name}, token => $token);
5204
5205 $insert = $insert_to_foster;
5206 #
5207 } elsif ($token->{type} == END_TAG_TOKEN) {
5208 if ($token->{tag_name} eq 'tr' and
5209 $self->{insertion_mode} == IN_ROW_IM) {
5210 ## have an element in table scope
5211 my $i;
5212 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5213 my $node = $self->{open_elements}->[$_];
5214 if ($node->[1] & TABLE_ROW_EL) {
5215 !!!cp ('t228');
5216 $i = $_;
5217 last INSCOPE;
5218 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5219 !!!cp ('t229');
5220 last INSCOPE;
5221 }
5222 } # INSCOPE
5223 unless (defined $i) {
5224 !!!cp ('t230');
5225 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5226 ## Ignore the token
5227 !!!nack ('t230.1');
5228 !!!next-token;
5229 next B;
5230 } else {
5231 !!!cp ('t232');
5232 }
5233
5234 ## Clear back to table row context
5235 while (not ($self->{open_elements}->[-1]->[1]
5236 & TABLE_ROW_SCOPING_EL)) {
5237 !!!cp ('t231');
5238 ## ISSUE: Can this state be reached?
5239 pop @{$self->{open_elements}};
5240 }
5241
5242 pop @{$self->{open_elements}}; # tr
5243 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5244 !!!next-token;
5245 !!!nack ('t231.1');
5246 next B;
5247 } elsif ($token->{tag_name} eq 'table') {
5248 if ($self->{insertion_mode} == IN_ROW_IM) {
5249 ## As if </tr>
5250 ## have an element in table scope
5251 my $i;
5252 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5253 my $node = $self->{open_elements}->[$_];
5254 if ($node->[1] & TABLE_ROW_EL) {
5255 !!!cp ('t233');
5256 $i = $_;
5257 last INSCOPE;
5258 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5259 !!!cp ('t234');
5260 last INSCOPE;
5261 }
5262 } # INSCOPE
5263 unless (defined $i) {
5264 !!!cp ('t235');
5265 ## TODO: The following is wrong.
5266 !!!parse-error (type => 'unmatched end tag:'.$token->{type}, token => $token);
5267 ## Ignore the token
5268 !!!nack ('t236.1');
5269 !!!next-token;
5270 next B;
5271 }
5272
5273 ## Clear back to table row context
5274 while (not ($self->{open_elements}->[-1]->[1]
5275 & TABLE_ROW_SCOPING_EL)) {
5276 !!!cp ('t236');
5277 ## ISSUE: Can this state be reached?
5278 pop @{$self->{open_elements}};
5279 }
5280
5281 pop @{$self->{open_elements}}; # tr
5282 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5283 ## reprocess in the "in table body" insertion mode...
5284 }
5285
5286 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5287 ## have an element in table scope
5288 my $i;
5289 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5290 my $node = $self->{open_elements}->[$_];
5291 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5292 !!!cp ('t237');
5293 $i = $_;
5294 last INSCOPE;
5295 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5296 !!!cp ('t238');
5297 last INSCOPE;
5298 }
5299 } # INSCOPE
5300 unless (defined $i) {
5301 !!!cp ('t239');
5302 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5303 ## Ignore the token
5304 !!!nack ('t239.1');
5305 !!!next-token;
5306 next B;
5307 }
5308
5309 ## Clear back to table body context
5310 while (not ($self->{open_elements}->[-1]->[1]
5311 & TABLE_ROWS_SCOPING_EL)) {
5312 !!!cp ('t240');
5313 pop @{$self->{open_elements}};
5314 }
5315
5316 ## As if <{current node}>
5317 ## have an element in table scope
5318 ## true by definition
5319
5320 ## Clear back to table body context
5321 ## nop by definition
5322
5323 pop @{$self->{open_elements}};
5324 $self->{insertion_mode} = IN_TABLE_IM;
5325 ## reprocess in the "in table" insertion mode...
5326 }
5327
5328 ## NOTE: </table> in the "in table" insertion mode.
5329 ## When you edit the code fragment below, please ensure that
5330 ## the code for <table> in the "in table" insertion mode
5331 ## is synced with it.
5332
5333 ## have a table element in table scope
5334 my $i;
5335 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5336 my $node = $self->{open_elements}->[$_];
5337 if ($node->[1] & TABLE_EL) {
5338 !!!cp ('t241');
5339 $i = $_;
5340 last INSCOPE;
5341 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5342 !!!cp ('t242');
5343 last INSCOPE;
5344 }
5345 } # INSCOPE
5346 unless (defined $i) {
5347 !!!cp ('t243');
5348 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5349 ## Ignore the token
5350 !!!nack ('t243.1');
5351 !!!next-token;
5352 next B;
5353 }
5354
5355 splice @{$self->{open_elements}}, $i;
5356 pop @{$open_tables};
5357
5358 $self->_reset_insertion_mode;
5359
5360 !!!next-token;
5361 next B;
5362 } elsif ({
5363 tbody => 1, tfoot => 1, thead => 1,
5364 }->{$token->{tag_name}} and
5365 $self->{insertion_mode} & ROW_IMS) {
5366 if ($self->{insertion_mode} == IN_ROW_IM) {
5367 ## have an element in table scope
5368 my $i;
5369 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5370 my $node = $self->{open_elements}->[$_];
5371 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5372 !!!cp ('t247');
5373 $i = $_;
5374 last INSCOPE;
5375 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5376 !!!cp ('t248');
5377 last INSCOPE;
5378 }
5379 } # INSCOPE
5380 unless (defined $i) {
5381 !!!cp ('t249');
5382 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5383 ## Ignore the token
5384 !!!nack ('t249.1');
5385 !!!next-token;
5386 next B;
5387 }
5388
5389 ## As if </tr>
5390 ## have an element in table scope
5391 my $i;
5392 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5393 my $node = $self->{open_elements}->[$_];
5394 if ($node->[1] & TABLE_ROW_EL) {
5395 !!!cp ('t250');
5396 $i = $_;
5397 last INSCOPE;
5398 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5399 !!!cp ('t251');
5400 last INSCOPE;
5401 }
5402 } # INSCOPE
5403 unless (defined $i) {
5404 !!!cp ('t252');
5405 !!!parse-error (type => 'unmatched end tag:tr', token => $token);
5406 ## Ignore the token
5407 !!!nack ('t252.1');
5408 !!!next-token;
5409 next B;
5410 }
5411
5412 ## Clear back to table row context
5413 while (not ($self->{open_elements}->[-1]->[1]
5414 & TABLE_ROW_SCOPING_EL)) {
5415 !!!cp ('t253');
5416 ## ISSUE: Can this case be reached?
5417 pop @{$self->{open_elements}};
5418 }
5419
5420 pop @{$self->{open_elements}}; # tr
5421 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5422 ## reprocess in the "in table body" insertion mode...
5423 }
5424
5425 ## have an element in table scope
5426 my $i;
5427 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5428 my $node = $self->{open_elements}->[$_];
5429 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5430 !!!cp ('t254');
5431 $i = $_;
5432 last INSCOPE;
5433 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5434 !!!cp ('t255');
5435 last INSCOPE;
5436 }
5437 } # INSCOPE
5438 unless (defined $i) {
5439 !!!cp ('t256');
5440 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5441 ## Ignore the token
5442 !!!nack ('t256.1');
5443 !!!next-token;
5444 next B;
5445 }
5446
5447 ## Clear back to table body context
5448 while (not ($self->{open_elements}->[-1]->[1]
5449 & TABLE_ROWS_SCOPING_EL)) {
5450 !!!cp ('t257');
5451 ## ISSUE: Can this case be reached?
5452 pop @{$self->{open_elements}};
5453 }
5454
5455 pop @{$self->{open_elements}};
5456 $self->{insertion_mode} = IN_TABLE_IM;
5457 !!!nack ('t257.1');
5458 !!!next-token;
5459 next B;
5460 } elsif ({
5461 body => 1, caption => 1, col => 1, colgroup => 1,
5462 html => 1, td => 1, th => 1,
5463 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5464 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
5465 }->{$token->{tag_name}}) {
5466 !!!cp ('t258');
5467 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5468 ## Ignore the token
5469 !!!nack ('t258.1');
5470 !!!next-token;
5471 next B;
5472 } else {
5473 !!!cp ('t259');
5474 !!!parse-error (type => 'in table:/'.$token->{tag_name}, token => $token);
5475
5476 $insert = $insert_to_foster;
5477 #
5478 }
5479 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5480 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
5481 @{$self->{open_elements}} == 1) { # redundant, maybe
5482 !!!parse-error (type => 'in body:#eof', token => $token);
5483 !!!cp ('t259.1');
5484 #
5485 } else {
5486 !!!cp ('t259.2');
5487 #
5488 }
5489
5490 ## Stop parsing
5491 last B;
5492 } else {
5493 die "$0: $token->{type}: Unknown token type";
5494 }
5495 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
5496 if ($token->{type} == CHARACTER_TOKEN) {
5497 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5498 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5499 unless (length $token->{data}) {
5500 !!!cp ('t260');
5501 !!!next-token;
5502 next B;
5503 }
5504 }
5505
5506 !!!cp ('t261');
5507 #
5508 } elsif ($token->{type} == START_TAG_TOKEN) {
5509 if ($token->{tag_name} eq 'col') {
5510 !!!cp ('t262');
5511 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5512 pop @{$self->{open_elements}};
5513 !!!ack ('t262.1');
5514 !!!next-token;
5515 next B;
5516 } else {
5517 !!!cp ('t263');
5518 #
5519 }
5520 } elsif ($token->{type} == END_TAG_TOKEN) {
5521 if ($token->{tag_name} eq 'colgroup') {
5522 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
5523 !!!cp ('t264');
5524 !!!parse-error (type => 'unmatched end tag:colgroup', token => $token);
5525 ## Ignore the token
5526 !!!next-token;
5527 next B;
5528 } else {
5529 !!!cp ('t265');
5530 pop @{$self->{open_elements}}; # colgroup
5531 $self->{insertion_mode} = IN_TABLE_IM;
5532 !!!next-token;
5533 next B;
5534 }
5535 } elsif ($token->{tag_name} eq 'col') {
5536 !!!cp ('t266');
5537 !!!parse-error (type => 'unmatched end tag:col', token => $token);
5538 ## Ignore the token
5539 !!!next-token;
5540 next B;
5541 } else {
5542 !!!cp ('t267');
5543 #
5544 }
5545 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5546 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
5547 @{$self->{open_elements}} == 1) { # redundant, maybe
5548 !!!cp ('t270.2');
5549 ## Stop parsing.
5550 last B;
5551 } else {
5552 ## NOTE: As if </colgroup>.
5553 !!!cp ('t270.1');
5554 pop @{$self->{open_elements}}; # colgroup
5555 $self->{insertion_mode} = IN_TABLE_IM;
5556 ## Reprocess.
5557 next B;
5558 }
5559 } else {
5560 die "$0: $token->{type}: Unknown token type";
5561 }
5562
5563 ## As if </colgroup>
5564 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
5565 !!!cp ('t269');
5566 ## TODO: Wrong error type?
5567 !!!parse-error (type => 'unmatched end tag:colgroup', token => $token);
5568 ## Ignore the token
5569 !!!nack ('t269.1');
5570 !!!next-token;
5571 next B;
5572 } else {
5573 !!!cp ('t270');
5574 pop @{$self->{open_elements}}; # colgroup
5575 $self->{insertion_mode} = IN_TABLE_IM;
5576 !!!ack-later;
5577 ## reprocess
5578 next B;
5579 }
5580 } elsif ($self->{insertion_mode} & SELECT_IMS) {
5581 if ($token->{type} == CHARACTER_TOKEN) {
5582 !!!cp ('t271');
5583 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5584 !!!next-token;
5585 next B;
5586 } elsif ($token->{type} == START_TAG_TOKEN) {
5587 if ($token->{tag_name} eq 'option') {
5588 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5589 !!!cp ('t272');
5590 ## As if </option>
5591 pop @{$self->{open_elements}};
5592 } else {
5593 !!!cp ('t273');
5594 }
5595
5596 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5597 !!!nack ('t273.1');
5598 !!!next-token;
5599 next B;
5600 } elsif ($token->{tag_name} eq 'optgroup') {
5601 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5602 !!!cp ('t274');
5603 ## As if </option>
5604 pop @{$self->{open_elements}};
5605 } else {
5606 !!!cp ('t275');
5607 }
5608
5609 if ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
5610 !!!cp ('t276');
5611 ## As if </optgroup>
5612 pop @{$self->{open_elements}};
5613 } else {
5614 !!!cp ('t277');
5615 }
5616
5617 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5618 !!!nack ('t277.1');
5619 !!!next-token;
5620 next B;
5621 } elsif ($token->{tag_name} eq 'select' or
5622 $token->{tag_name} eq 'input' or
5623 ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5624 {
5625 caption => 1, table => 1,
5626 tbody => 1, tfoot => 1, thead => 1,
5627 tr => 1, td => 1, th => 1,
5628 }->{$token->{tag_name}})) {
5629 ## TODO: The type below is not good - <select> is replaced by </select>
5630 !!!parse-error (type => 'not closed:select', token => $token);
5631 ## NOTE: As if the token were </select> (<select> case) or
5632 ## as if there were </select> (otherwise).
5633 ## have an element in table scope
5634 my $i;
5635 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5636 my $node = $self->{open_elements}->[$_];
5637 if ($node->[1] & SELECT_EL) {
5638 !!!cp ('t278');
5639 $i = $_;
5640 last INSCOPE;
5641 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5642 !!!cp ('t279');
5643 last INSCOPE;
5644 }
5645 } # INSCOPE
5646 unless (defined $i) {
5647 !!!cp ('t280');
5648 !!!parse-error (type => 'unmatched end tag:select', token => $token);
5649 ## Ignore the token
5650 !!!nack ('t280.1');
5651 !!!next-token;
5652 next B;
5653 }
5654
5655 !!!cp ('t281');
5656 splice @{$self->{open_elements}}, $i;
5657
5658 $self->_reset_insertion_mode;
5659
5660 if ($token->{tag_name} eq 'select') {
5661 !!!nack ('t281.2');
5662 !!!next-token;
5663 next B;
5664 } else {
5665 !!!cp ('t281.1');
5666 !!!ack-later;
5667 ## Reprocess the token.
5668 next B;
5669 }
5670 } else {
5671 !!!cp ('t282');
5672 !!!parse-error (type => 'in select:'.$token->{tag_name}, token => $token);
5673 ## Ignore the token
5674 !!!nack ('t282.1');
5675 !!!next-token;
5676 next B;
5677 }
5678 } elsif ($token->{type} == END_TAG_TOKEN) {
5679 if ($token->{tag_name} eq 'optgroup') {
5680 if ($self->{open_elements}->[-1]->[1] & OPTION_EL and
5681 $self->{open_elements}->[-2]->[1] & OPTGROUP_EL) {
5682 !!!cp ('t283');
5683 ## As if </option>
5684 splice @{$self->{open_elements}}, -2;
5685 } elsif ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
5686 !!!cp ('t284');
5687 pop @{$self->{open_elements}};
5688 } else {
5689 !!!cp ('t285');
5690 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5691 ## Ignore the token
5692 }
5693 !!!nack ('t285.1');
5694 !!!next-token;
5695 next B;
5696 } elsif ($token->{tag_name} eq 'option') {
5697 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5698 !!!cp ('t286');
5699 pop @{$self->{open_elements}};
5700 } else {
5701 !!!cp ('t287');
5702 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5703 ## Ignore the token
5704 }
5705 !!!nack ('t287.1');
5706 !!!next-token;
5707 next B;
5708 } elsif ($token->{tag_name} eq 'select') {
5709 ## have an element in table scope
5710 my $i;
5711 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5712 my $node = $self->{open_elements}->[$_];
5713 if ($node->[1] & SELECT_EL) {
5714 !!!cp ('t288');
5715 $i = $_;
5716 last INSCOPE;
5717 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5718 !!!cp ('t289');
5719 last INSCOPE;
5720 }
5721 } # INSCOPE
5722 unless (defined $i) {
5723 !!!cp ('t290');
5724 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5725 ## Ignore the token
5726 !!!nack ('t290.1');
5727 !!!next-token;
5728 next B;
5729 }
5730
5731 !!!cp ('t291');
5732 splice @{$self->{open_elements}}, $i;
5733
5734 $self->_reset_insertion_mode;
5735
5736 !!!nack ('t291.1');
5737 !!!next-token;
5738 next B;
5739 } elsif ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5740 {
5741 caption => 1, table => 1, tbody => 1,
5742 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
5743 }->{$token->{tag_name}}) {
5744 ## TODO: The following is wrong?
5745 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5746
5747 ## have an element in table scope
5748 my $i;
5749 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5750 my $node = $self->{open_elements}->[$_];
5751 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5752 !!!cp ('t292');
5753 $i = $_;
5754 last INSCOPE;
5755 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5756 !!!cp ('t293');
5757 last INSCOPE;
5758 }
5759 } # INSCOPE
5760 unless (defined $i) {
5761 !!!cp ('t294');
5762 ## Ignore the token
5763 !!!nack ('t294.1');
5764 !!!next-token;
5765 next B;
5766 }
5767
5768 ## As if </select>
5769 ## have an element in table scope
5770 undef $i;
5771 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5772 my $node = $self->{open_elements}->[$_];
5773 if ($node->[1] & SELECT_EL) {
5774 !!!cp ('t295');
5775 $i = $_;
5776 last INSCOPE;
5777 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5778 ## ISSUE: Can this state be reached?
5779 !!!cp ('t296');
5780 last INSCOPE;
5781 }
5782 } # INSCOPE
5783 unless (defined $i) {
5784 !!!cp ('t297');
5785 ## TODO: Is the following error type correct?
5786 !!!parse-error (type => 'unmatched end tag:select', token => $token);
5787 ## Ignore the </select> token
5788 !!!nack ('t297.1');
5789 !!!next-token; ## TODO: ok?
5790 next B;
5791 }
5792
5793 !!!cp ('t298');
5794 splice @{$self->{open_elements}}, $i;
5795
5796 $self->_reset_insertion_mode;
5797
5798 !!!ack-later;
5799 ## reprocess
5800 next B;
5801 } else {
5802 !!!cp ('t299');
5803 !!!parse-error (type => 'in select:/'.$token->{tag_name}, token => $token);
5804 ## Ignore the token
5805 !!!nack ('t299.3');
5806 !!!next-token;
5807 next B;
5808 }
5809 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5810 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
5811 @{$self->{open_elements}} == 1) { # redundant, maybe
5812 !!!cp ('t299.1');
5813 !!!parse-error (type => 'in body:#eof', token => $token);
5814 } else {
5815 !!!cp ('t299.2');
5816 }
5817
5818 ## Stop parsing.
5819 last B;
5820 } else {
5821 die "$0: $token->{type}: Unknown token type";
5822 }
5823 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
5824 if ($token->{type} == CHARACTER_TOKEN) {
5825 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5826 my $data = $1;
5827 ## As if in body
5828 $reconstruct_active_formatting_elements->($insert_to_current);
5829
5830 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5831
5832 unless (length $token->{data}) {
5833 !!!cp ('t300');
5834 !!!next-token;
5835 next B;
5836 }
5837 }
5838
5839 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5840 !!!cp ('t301');
5841 !!!parse-error (type => 'after html:#character', token => $token);
5842
5843 ## Reprocess in the "after body" insertion mode.
5844 } else {
5845 !!!cp ('t302');
5846 }
5847
5848 ## "after body" insertion mode
5849 !!!parse-error (type => 'after body:#character', token => $token);
5850
5851 $self->{insertion_mode} = IN_BODY_IM;
5852 ## reprocess
5853 next B;
5854 } elsif ($token->{type} == START_TAG_TOKEN) {
5855 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5856 !!!cp ('t303');
5857 !!!parse-error (type => 'after html:'.$token->{tag_name}, token => $token);
5858
5859 ## Reprocess in the "after body" insertion mode.
5860 } else {
5861 !!!cp ('t304');
5862 }
5863
5864 ## "after body" insertion mode
5865 !!!parse-error (type => 'after body:'.$token->{tag_name}, token => $token);
5866
5867 $self->{insertion_mode} = IN_BODY_IM;
5868 !!!ack-later;
5869 ## reprocess
5870 next B;
5871 } elsif ($token->{type} == END_TAG_TOKEN) {
5872 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5873 !!!cp ('t305');
5874 !!!parse-error (type => 'after html:/'.$token->{tag_name}, token => $token);
5875
5876 $self->{insertion_mode} = AFTER_BODY_IM;
5877 ## Reprocess in the "after body" insertion mode.
5878 } else {
5879 !!!cp ('t306');
5880 }
5881
5882 ## "after body" insertion mode
5883 if ($token->{tag_name} eq 'html') {
5884 if (defined $self->{inner_html_node}) {
5885 !!!cp ('t307');
5886 !!!parse-error (type => 'unmatched end tag:html', token => $token);
5887 ## Ignore the token
5888 !!!next-token;
5889 next B;
5890 } else {
5891 !!!cp ('t308');
5892 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
5893 !!!next-token;
5894 next B;
5895 }
5896 } else {
5897 !!!cp ('t309');
5898 !!!parse-error (type => 'after body:/'.$token->{tag_name}, token => $token);
5899
5900 $self->{insertion_mode} = IN_BODY_IM;
5901 ## reprocess
5902 next B;
5903 }
5904 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5905 !!!cp ('t309.2');
5906 ## Stop parsing
5907 last B;
5908 } else {
5909 die "$0: $token->{type}: Unknown token type";
5910 }
5911 } elsif ($self->{insertion_mode} & FRAME_IMS) {
5912 if ($token->{type} == CHARACTER_TOKEN) {
5913 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5914 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5915
5916 unless (length $token->{data}) {
5917 !!!cp ('t310');
5918 !!!next-token;
5919 next B;
5920 }
5921 }
5922
5923 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
5924 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
5925 !!!cp ('t311');
5926 !!!parse-error (type => 'in frameset:#character', token => $token);
5927 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
5928 !!!cp ('t312');
5929 !!!parse-error (type => 'after frameset:#character', token => $token);
5930 } else { # "after html frameset"
5931 !!!cp ('t313');
5932 !!!parse-error (type => 'after html:#character', token => $token);
5933
5934 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5935 ## Reprocess in the "after frameset" insertion mode.
5936 !!!parse-error (type => 'after frameset:#character', token => $token);
5937 }
5938
5939 ## Ignore the token.
5940 if (length $token->{data}) {
5941 !!!cp ('t314');
5942 ## reprocess the rest of characters
5943 } else {
5944 !!!cp ('t315');
5945 !!!next-token;
5946 }
5947 next B;
5948 }
5949
5950 die qq[$0: Character "$token->{data}"];
5951 } elsif ($token->{type} == START_TAG_TOKEN) {
5952 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
5953 !!!cp ('t316');
5954 !!!parse-error (type => 'after html:'.$token->{tag_name}, token => $token);
5955
5956 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5957 ## Process in the "after frameset" insertion mode.
5958 } else {
5959 !!!cp ('t317');
5960 }
5961
5962 if ($token->{tag_name} eq 'frameset' and
5963 $self->{insertion_mode} == IN_FRAMESET_IM) {
5964 !!!cp ('t318');
5965 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5966 !!!nack ('t318.1');
5967 !!!next-token;
5968 next B;
5969 } elsif ($token->{tag_name} eq 'frame' and
5970 $self->{insertion_mode} == IN_FRAMESET_IM) {
5971 !!!cp ('t319');
5972 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5973 pop @{$self->{open_elements}};
5974 !!!ack ('t319.1');
5975 !!!next-token;
5976 next B;
5977 } elsif ($token->{tag_name} eq 'noframes') {
5978 !!!cp ('t320');
5979 ## NOTE: As if in body.
5980 $parse_rcdata->(CDATA_CONTENT_MODEL);
5981 next B;
5982 } else {
5983 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
5984 !!!cp ('t321');
5985 !!!parse-error (type => 'in frameset:'.$token->{tag_name}, token => $token);
5986 } else {
5987 !!!cp ('t322');
5988 !!!parse-error (type => 'after frameset:'.$token->{tag_name}, token => $token);
5989 }
5990 ## Ignore the token
5991 !!!nack ('t322.1');
5992 !!!next-token;
5993 next B;
5994 }
5995 } elsif ($token->{type} == END_TAG_TOKEN) {
5996 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
5997 !!!cp ('t323');
5998 !!!parse-error (type => 'after html:/'.$token->{tag_name}, token => $token);
5999
6000 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6001 ## Process in the "after frameset" insertion mode.
6002 } else {
6003 !!!cp ('t324');
6004 }
6005
6006 if ($token->{tag_name} eq 'frameset' and
6007 $self->{insertion_mode} == IN_FRAMESET_IM) {
6008 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
6009 @{$self->{open_elements}} == 1) {
6010 !!!cp ('t325');
6011 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6012 ## Ignore the token
6013 !!!next-token;
6014 } else {
6015 !!!cp ('t326');
6016 pop @{$self->{open_elements}};
6017 !!!next-token;
6018 }
6019
6020 if (not defined $self->{inner_html_node} and
6021 not ($self->{open_elements}->[-1]->[1] & FRAMESET_EL)) {
6022 !!!cp ('t327');
6023 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6024 } else {
6025 !!!cp ('t328');
6026 }
6027 next B;
6028 } elsif ($token->{tag_name} eq 'html' and
6029 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
6030 !!!cp ('t329');
6031 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
6032 !!!next-token;
6033 next B;
6034 } else {
6035 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6036 !!!cp ('t330');
6037 !!!parse-error (type => 'in frameset:/'.$token->{tag_name}, token => $token);
6038 } else {
6039 !!!cp ('t331');
6040 !!!parse-error (type => 'after frameset:/'.$token->{tag_name}, token => $token);
6041 }
6042 ## Ignore the token
6043 !!!next-token;
6044 next B;
6045 }
6046 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6047 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6048 @{$self->{open_elements}} == 1) { # redundant, maybe
6049 !!!cp ('t331.1');
6050 !!!parse-error (type => 'in body:#eof', token => $token);
6051 } else {
6052 !!!cp ('t331.2');
6053 }
6054
6055 ## Stop parsing
6056 last B;
6057 } else {
6058 die "$0: $token->{type}: Unknown token type";
6059 }
6060
6061 ## ISSUE: An issue in spec here
6062 } else {
6063 die "$0: $self->{insertion_mode}: Unknown insertion mode";
6064 }
6065
6066 ## "in body" insertion mode
6067 if ($token->{type} == START_TAG_TOKEN) {
6068 if ($token->{tag_name} eq 'script') {
6069 !!!cp ('t332');
6070 ## NOTE: This is an "as if in head" code clone
6071 $script_start_tag->();
6072 next B;
6073 } elsif ($token->{tag_name} eq 'style') {
6074 !!!cp ('t333');
6075 ## NOTE: This is an "as if in head" code clone
6076 $parse_rcdata->(CDATA_CONTENT_MODEL);
6077 next B;
6078 } elsif ({
6079 base => 1, link => 1,
6080 }->{$token->{tag_name}}) {
6081 !!!cp ('t334');
6082 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6083 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6084 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6085 !!!ack ('t334.1');
6086 !!!next-token;
6087 next B;
6088 } elsif ($token->{tag_name} eq 'meta') {
6089 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6090 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6091 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6092
6093 unless ($self->{confident}) {
6094 if ($token->{attributes}->{charset}) { ## TODO: And if supported
6095 !!!cp ('t335');
6096 $self->{change_encoding}
6097 ->($self, $token->{attributes}->{charset}->{value}, $token);
6098
6099 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6100 ->set_user_data (manakai_has_reference =>
6101 $token->{attributes}->{charset}
6102 ->{has_reference});
6103 } elsif ($token->{attributes}->{content}) {
6104 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
6105 if ($token->{attributes}->{content}->{value}
6106 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
6107 [\x09-\x0D\x20]*=
6108 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
6109 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
6110 !!!cp ('t336');
6111 $self->{change_encoding}
6112 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
6113 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6114 ->set_user_data (manakai_has_reference =>
6115 $token->{attributes}->{content}
6116 ->{has_reference});
6117 }
6118 }
6119 } else {
6120 if ($token->{attributes}->{charset}) {
6121 !!!cp ('t337');
6122 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6123 ->set_user_data (manakai_has_reference =>
6124 $token->{attributes}->{charset}
6125 ->{has_reference});
6126 }
6127 if ($token->{attributes}->{content}) {
6128 !!!cp ('t338');
6129 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6130 ->set_user_data (manakai_has_reference =>
6131 $token->{attributes}->{content}
6132 ->{has_reference});
6133 }
6134 }
6135
6136 !!!ack ('t338.1');
6137 !!!next-token;
6138 next B;
6139 } elsif ($token->{tag_name} eq 'title') {
6140 !!!cp ('t341');
6141 ## NOTE: This is an "as if in head" code clone
6142 $parse_rcdata->(RCDATA_CONTENT_MODEL);
6143 next B;
6144 } elsif ($token->{tag_name} eq 'body') {
6145 !!!parse-error (type => 'in body:body', token => $token);
6146
6147 if (@{$self->{open_elements}} == 1 or
6148 not ($self->{open_elements}->[1]->[1] & BODY_EL)) {
6149 !!!cp ('t342');
6150 ## Ignore the token
6151 } else {
6152 my $body_el = $self->{open_elements}->[1]->[0];
6153 for my $attr_name (keys %{$token->{attributes}}) {
6154 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
6155 !!!cp ('t343');
6156 $body_el->set_attribute_ns
6157 (undef, [undef, $attr_name],
6158 $token->{attributes}->{$attr_name}->{value});
6159 }
6160 }
6161 }
6162 !!!nack ('t343.1');
6163 !!!next-token;
6164 next B;
6165 } elsif ({
6166 address => 1, blockquote => 1, center => 1, dir => 1,
6167 div => 1, dl => 1, fieldset => 1,
6168 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6169 menu => 1, ol => 1, p => 1, ul => 1,
6170 pre => 1, listing => 1,
6171 form => 1,
6172 table => 1,
6173 hr => 1,
6174 }->{$token->{tag_name}}) {
6175 if ($token->{tag_name} eq 'form' and defined $self->{form_element}) {
6176 !!!cp ('t350');
6177 !!!parse-error (type => 'in form:form', token => $token);
6178 ## Ignore the token
6179 !!!nack ('t350.1');
6180 !!!next-token;
6181 next B;
6182 }
6183
6184 ## has a p element in scope
6185 INSCOPE: for (reverse @{$self->{open_elements}}) {
6186 if ($_->[1] & P_EL) {
6187 !!!cp ('t344');
6188 !!!back-token; # <form>
6189 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6190 line => $token->{line}, column => $token->{column}};
6191 next B;
6192 } elsif ($_->[1] & SCOPING_EL) {
6193 !!!cp ('t345');
6194 last INSCOPE;
6195 }
6196 } # INSCOPE
6197
6198 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6199 if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') {
6200 !!!nack ('t346.1');
6201 !!!next-token;
6202 if ($token->{type} == CHARACTER_TOKEN) {
6203 $token->{data} =~ s/^\x0A//;
6204 unless (length $token->{data}) {
6205 !!!cp ('t346');
6206 !!!next-token;
6207 } else {
6208 !!!cp ('t349');
6209 }
6210 } else {
6211 !!!cp ('t348');
6212 }
6213 } elsif ($token->{tag_name} eq 'form') {
6214 !!!cp ('t347.1');
6215 $self->{form_element} = $self->{open_elements}->[-1]->[0];
6216
6217 !!!nack ('t347.2');
6218 !!!next-token;
6219 } elsif ($token->{tag_name} eq 'table') {
6220 !!!cp ('t382');
6221 push @{$open_tables}, [$self->{open_elements}->[-1]->[0]];
6222
6223 $self->{insertion_mode} = IN_TABLE_IM;
6224
6225 !!!nack ('t382.1');
6226 !!!next-token;
6227 } elsif ($token->{tag_name} eq 'hr') {
6228 !!!cp ('t386');
6229 pop @{$self->{open_elements}};
6230
6231 !!!nack ('t386.1');
6232 !!!next-token;
6233 } else {
6234 !!!nack ('t347.1');
6235 !!!next-token;
6236 }
6237 next B;
6238 } elsif ({li => 1, dt => 1, dd => 1}->{$token->{tag_name}}) {
6239 ## has a p element in scope
6240 INSCOPE: for (reverse @{$self->{open_elements}}) {
6241 if ($_->[1] & P_EL) {
6242 !!!cp ('t353');
6243 !!!back-token; # <x>
6244 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6245 line => $token->{line}, column => $token->{column}};
6246 next B;
6247 } elsif ($_->[1] & SCOPING_EL) {
6248 !!!cp ('t354');
6249 last INSCOPE;
6250 }
6251 } # INSCOPE
6252
6253 ## Step 1
6254 my $i = -1;
6255 my $node = $self->{open_elements}->[$i];
6256 my $li_or_dtdd = {li => {li => 1},
6257 dt => {dt => 1, dd => 1},
6258 dd => {dt => 1, dd => 1}}->{$token->{tag_name}};
6259 LI: {
6260 ## Step 2
6261 if ($li_or_dtdd->{$node->[0]->manakai_local_name}) {
6262 if ($i != -1) {
6263 !!!cp ('t355');
6264 !!!parse-error (type => 'not closed',
6265 value => $self->{open_elements}->[-1]->[0]
6266 ->manakai_local_name,
6267 token => $token);
6268 } else {
6269 !!!cp ('t356');
6270 }
6271 splice @{$self->{open_elements}}, $i;
6272 last LI;
6273 } else {
6274 !!!cp ('t357');
6275 }
6276
6277 ## Step 3
6278 if (not ($node->[1] & FORMATTING_EL) and
6279 #not $phrasing_category->{$node->[1]} and
6280 ($node->[1] & SPECIAL_EL or
6281 $node->[1] & SCOPING_EL) and
6282 not ($node->[1] & ADDRESS_EL) and
6283 not ($node->[1] & DIV_EL)) {
6284 !!!cp ('t358');
6285 last LI;
6286 }
6287
6288 !!!cp ('t359');
6289 ## Step 4
6290 $i--;
6291 $node = $self->{open_elements}->[$i];
6292 redo LI;
6293 } # LI
6294
6295 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6296 !!!nack ('t359.1');
6297 !!!next-token;
6298 next B;
6299 } elsif ($token->{tag_name} eq 'plaintext') {
6300 ## has a p element in scope
6301 INSCOPE: for (reverse @{$self->{open_elements}}) {
6302 if ($_->[1] & P_EL) {
6303 !!!cp ('t367');
6304 !!!back-token; # <plaintext>
6305 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6306 line => $token->{line}, column => $token->{column}};
6307 next B;
6308 } elsif ($_->[1] & SCOPING_EL) {
6309 !!!cp ('t368');
6310 last INSCOPE;
6311 }
6312 } # INSCOPE
6313
6314 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6315
6316 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
6317
6318 !!!nack ('t368.1');
6319 !!!next-token;
6320 next B;
6321 } elsif ($token->{tag_name} eq 'a') {
6322 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
6323 my $node = $active_formatting_elements->[$i];
6324 if ($node->[1] & A_EL) {
6325 !!!cp ('t371');
6326 !!!parse-error (type => 'in a:a', token => $token);
6327
6328 !!!back-token; # <a>
6329 $token = {type => END_TAG_TOKEN, tag_name => 'a',
6330 line => $token->{line}, column => $token->{column}};
6331 $formatting_end_tag->($token);
6332
6333 AFE2: for (reverse 0..$#$active_formatting_elements) {
6334 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
6335 !!!cp ('t372');
6336 splice @$active_formatting_elements, $_, 1;
6337 last AFE2;
6338 }
6339 } # AFE2
6340 OE: for (reverse 0..$#{$self->{open_elements}}) {
6341 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
6342 !!!cp ('t373');
6343 splice @{$self->{open_elements}}, $_, 1;
6344 last OE;
6345 }
6346 } # OE
6347 last AFE;
6348 } elsif ($node->[0] eq '#marker') {
6349 !!!cp ('t374');
6350 last AFE;
6351 }
6352 } # AFE
6353
6354 $reconstruct_active_formatting_elements->($insert_to_current);
6355
6356 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6357 push @$active_formatting_elements, $self->{open_elements}->[-1];
6358
6359 !!!nack ('t374.1');
6360 !!!next-token;
6361 next B;
6362 } elsif ($token->{tag_name} eq 'nobr') {
6363 $reconstruct_active_formatting_elements->($insert_to_current);
6364
6365 ## has a |nobr| element in scope
6366 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6367 my $node = $self->{open_elements}->[$_];
6368 if ($node->[1] & NOBR_EL) {
6369 !!!cp ('t376');
6370 !!!parse-error (type => 'in nobr:nobr', token => $token);
6371 !!!back-token; # <nobr>
6372 $token = {type => END_TAG_TOKEN, tag_name => 'nobr',
6373 line => $token->{line}, column => $token->{column}};
6374 next B;
6375 } elsif ($node->[1] & SCOPING_EL) {
6376 !!!cp ('t377');
6377 last INSCOPE;
6378 }
6379 } # INSCOPE
6380
6381 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6382 push @$active_formatting_elements, $self->{open_elements}->[-1];
6383
6384 !!!nack ('t377.1');
6385 !!!next-token;
6386 next B;
6387 } elsif ($token->{tag_name} eq 'button') {
6388 ## has a button element in scope
6389 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6390 my $node = $self->{open_elements}->[$_];
6391 if ($node->[1] & BUTTON_EL) {
6392 !!!cp ('t378');
6393 !!!parse-error (type => 'in button:button', token => $token);
6394 !!!back-token; # <button>
6395 $token = {type => END_TAG_TOKEN, tag_name => 'button',
6396 line => $token->{line}, column => $token->{column}};
6397 next B;
6398 } elsif ($node->[1] & SCOPING_EL) {
6399 !!!cp ('t379');
6400 last INSCOPE;
6401 }
6402 } # INSCOPE
6403
6404 $reconstruct_active_formatting_elements->($insert_to_current);
6405
6406 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6407
6408 ## TODO: associate with $self->{form_element} if defined
6409
6410 push @$active_formatting_elements, ['#marker', ''];
6411
6412 !!!nack ('t379.1');
6413 !!!next-token;
6414 next B;
6415 } elsif ({
6416 xmp => 1,
6417 iframe => 1,
6418 noembed => 1,
6419 noframes => 1,
6420 noscript => 0, ## TODO: 1 if scripting is enabled
6421 }->{$token->{tag_name}}) {
6422 if ($token->{tag_name} eq 'xmp') {
6423 !!!cp ('t381');
6424 $reconstruct_active_formatting_elements->($insert_to_current);
6425 } else {
6426 !!!cp ('t399');
6427 }
6428 ## NOTE: There is an "as if in body" code clone.
6429 $parse_rcdata->(CDATA_CONTENT_MODEL);
6430 next B;
6431 } elsif ($token->{tag_name} eq 'isindex') {
6432 !!!parse-error (type => 'isindex', token => $token);
6433
6434 if (defined $self->{form_element}) {
6435 !!!cp ('t389');
6436 ## Ignore the token
6437 !!!nack ('t389'); ## NOTE: Not acknowledged.
6438 !!!next-token;
6439 next B;
6440 } else {
6441 my $at = $token->{attributes};
6442 my $form_attrs;
6443 $form_attrs->{action} = $at->{action} if $at->{action};
6444 my $prompt_attr = $at->{prompt};
6445 $at->{name} = {name => 'name', value => 'isindex'};
6446 delete $at->{action};
6447 delete $at->{prompt};
6448 my @tokens = (
6449 {type => START_TAG_TOKEN, tag_name => 'form',
6450 attributes => $form_attrs,
6451 line => $token->{line}, column => $token->{column}},
6452 {type => START_TAG_TOKEN, tag_name => 'hr',
6453 line => $token->{line}, column => $token->{column}},
6454 {type => START_TAG_TOKEN, tag_name => 'p',
6455 line => $token->{line}, column => $token->{column}},
6456 {type => START_TAG_TOKEN, tag_name => 'label',
6457 line => $token->{line}, column => $token->{column}},
6458 );
6459 if ($prompt_attr) {
6460 !!!cp ('t390');
6461 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value},
6462 #line => $token->{line}, column => $token->{column},
6463 };
6464 } else {
6465 !!!cp ('t391');
6466 push @tokens, {type => CHARACTER_TOKEN,
6467 data => 'This is a searchable index. Insert your search keywords here: ',
6468 #line => $token->{line}, column => $token->{column},
6469 }; # SHOULD
6470 ## TODO: make this configurable
6471 }
6472 push @tokens,
6473 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at,
6474 line => $token->{line}, column => $token->{column}},
6475 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
6476 {type => END_TAG_TOKEN, tag_name => 'label',
6477 line => $token->{line}, column => $token->{column}},
6478 {type => END_TAG_TOKEN, tag_name => 'p',
6479 line => $token->{line}, column => $token->{column}},
6480 {type => START_TAG_TOKEN, tag_name => 'hr',
6481 line => $token->{line}, column => $token->{column}},
6482 {type => END_TAG_TOKEN, tag_name => 'form',
6483 line => $token->{line}, column => $token->{column}};
6484 !!!nack ('t391.1'); ## NOTE: Not acknowledged.
6485 !!!back-token (@tokens);
6486 !!!next-token;
6487 next B;
6488 }
6489 } elsif ($token->{tag_name} eq 'textarea') {
6490 my $tag_name = $token->{tag_name};
6491 my $el;
6492 !!!create-element ($el, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
6493
6494 ## TODO: $self->{form_element} if defined
6495 $self->{content_model} = RCDATA_CONTENT_MODEL;
6496 delete $self->{escape}; # MUST
6497
6498 $insert->($el);
6499
6500 my $text = '';
6501 !!!nack ('t392.1');
6502 !!!next-token;
6503 if ($token->{type} == CHARACTER_TOKEN) {
6504 $token->{data} =~ s/^\x0A//;
6505 unless (length $token->{data}) {
6506 !!!cp ('t392');
6507 !!!next-token;
6508 } else {
6509 !!!cp ('t393');
6510 }
6511 } else {
6512 !!!cp ('t394');
6513 }
6514 while ($token->{type} == CHARACTER_TOKEN) {
6515 !!!cp ('t395');
6516 $text .= $token->{data};
6517 !!!next-token;
6518 }
6519 if (length $text) {
6520 !!!cp ('t396');
6521 $el->manakai_append_text ($text);
6522 }
6523
6524 $self->{content_model} = PCDATA_CONTENT_MODEL;
6525
6526 if ($token->{type} == END_TAG_TOKEN and
6527 $token->{tag_name} eq $tag_name) {
6528 !!!cp ('t397');
6529 ## Ignore the token
6530 } else {
6531 !!!cp ('t398');
6532 !!!parse-error (type => 'in RCDATA:#'.$token->{type}, token => $token);
6533 }
6534 !!!next-token;
6535 next B;
6536 } elsif ($token->{tag_name} eq 'math' or
6537 $token->{tag_name} eq 'svg') {
6538 $reconstruct_active_formatting_elements->($insert_to_current);
6539
6540 ## "adjust SVG attributes" ('svg' only) - done in insert-element-f
6541
6542 ## "adjust foreign attributes" - done in insert-element-f
6543
6544 !!!insert-element-f ($token->{tag_name} eq 'math' ? $MML_NS : $SVG_NS, $token->{tag_name}, $token->{attributes}, $token);
6545
6546 if ($self->{self_closing}) {
6547 pop @{$self->{open_elements}};
6548 !!!ack ('t398.1');
6549 } else {
6550 !!!cp ('t398.2');
6551 $self->{insertion_mode} |= IN_FOREIGN_CONTENT_IM;
6552 ## NOTE: |<body><math><mi><svg>| -> "in foreign content" insertion
6553 ## mode, "in body" (not "in foreign content") secondary insertion
6554 ## mode, maybe.
6555 }
6556
6557 !!!next-token;
6558 next B;
6559 } elsif ({
6560 caption => 1, col => 1, colgroup => 1, frame => 1,
6561 frameset => 1, head => 1, option => 1, optgroup => 1,
6562 tbody => 1, td => 1, tfoot => 1, th => 1,
6563 thead => 1, tr => 1,
6564 }->{$token->{tag_name}}) {
6565 !!!cp ('t401');
6566 !!!parse-error (type => 'in body:'.$token->{tag_name}, token => $token);
6567 ## Ignore the token
6568 !!!nack ('t401.1'); ## NOTE: |<col/>| or |<frame/>| here is an error.
6569 !!!next-token;
6570 next B;
6571
6572 ## ISSUE: An issue on HTML5 new elements in the spec.
6573 } else {
6574 if ($token->{tag_name} eq 'image') {
6575 !!!cp ('t384');
6576 !!!parse-error (type => 'image', token => $token);
6577 $token->{tag_name} = 'img';
6578 } else {
6579 !!!cp ('t385');
6580 }
6581
6582 ## NOTE: There is an "as if <br>" code clone.
6583 $reconstruct_active_formatting_elements->($insert_to_current);
6584
6585 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6586
6587 if ({
6588 applet => 1, marquee => 1, object => 1,
6589 }->{$token->{tag_name}}) {
6590 !!!cp ('t380');
6591 push @$active_formatting_elements, ['#marker', ''];
6592 !!!nack ('t380.1');
6593 } elsif ({
6594 b => 1, big => 1, em => 1, font => 1, i => 1,
6595 s => 1, small => 1, strile => 1,
6596 strong => 1, tt => 1, u => 1,
6597 }->{$token->{tag_name}}) {
6598 !!!cp ('t375');
6599 push @$active_formatting_elements, $self->{open_elements}->[-1];
6600 !!!nack ('t375.1');
6601 } elsif ($token->{tag_name} eq 'input') {
6602 !!!cp ('t388');
6603 ## TODO: associate with $self->{form_element} if defined
6604 pop @{$self->{open_elements}};
6605 !!!ack ('t388.2');
6606 } elsif ({
6607 area => 1, basefont => 1, bgsound => 1, br => 1,
6608 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
6609 #image => 1,
6610 }->{$token->{tag_name}}) {
6611 !!!cp ('t388.1');
6612 pop @{$self->{open_elements}};
6613 !!!ack ('t388.3');
6614 } elsif ($token->{tag_name} eq 'select') {
6615 ## TODO: associate with $self->{form_element} if defined
6616
6617 if ($self->{insertion_mode} & TABLE_IMS or
6618 $self->{insertion_mode} & BODY_TABLE_IMS or
6619 $self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
6620 !!!cp ('t400.1');
6621 $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM;
6622 } else {
6623 !!!cp ('t400.2');
6624 $self->{insertion_mode} = IN_SELECT_IM;
6625 }
6626 !!!nack ('t400.3');
6627 } else {
6628 !!!nack ('t402');
6629 }
6630
6631 !!!next-token;
6632 next B;
6633 }
6634 } elsif ($token->{type} == END_TAG_TOKEN) {
6635 if ($token->{tag_name} eq 'body') {
6636 ## has a |body| element in scope
6637 my $i;
6638 INSCOPE: {
6639 for (reverse @{$self->{open_elements}}) {
6640 if ($_->[1] & BODY_EL) {
6641 !!!cp ('t405');
6642 $i = $_;
6643 last INSCOPE;
6644 } elsif ($_->[1] & SCOPING_EL) {
6645 !!!cp ('t405.1');
6646 last;
6647 }
6648 }
6649
6650 !!!parse-error (type => 'start tag not allowed',
6651 value => $token->{tag_name}, token => $token);
6652 ## NOTE: Ignore the token.
6653 !!!next-token;
6654 next B;
6655 } # INSCOPE
6656
6657 for (@{$self->{open_elements}}) {
6658 unless ($_->[1] & ALL_END_TAG_OPTIONAL_EL) {
6659 !!!cp ('t403');
6660 !!!parse-error (type => 'not closed',
6661 value => $_->[0]->manakai_local_name,
6662 token => $token);
6663 last;
6664 } else {
6665 !!!cp ('t404');
6666 }
6667 }
6668
6669 $self->{insertion_mode} = AFTER_BODY_IM;
6670 !!!next-token;
6671 next B;
6672 } elsif ($token->{tag_name} eq 'html') {
6673 ## TODO: Update this code. It seems that the code below is not
6674 ## up-to-date, though it has same effect as speced.
6675 if (@{$self->{open_elements}} > 1 and
6676 $self->{open_elements}->[1]->[1] & BODY_EL) {
6677 ## ISSUE: There is an issue in the spec.
6678 unless ($self->{open_elements}->[-1]->[1] & BODY_EL) {
6679 !!!cp ('t406');
6680 !!!parse-error (type => 'not closed',
6681 value => $self->{open_elements}->[1]->[0]
6682 ->manakai_local_name,
6683 token => $token);
6684 } else {
6685 !!!cp ('t407');
6686 }
6687 $self->{insertion_mode} = AFTER_BODY_IM;
6688 ## reprocess
6689 next B;
6690 } else {
6691 !!!cp ('t408');
6692 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6693 ## Ignore the token
6694 !!!next-token;
6695 next B;
6696 }
6697 } elsif ({
6698 address => 1, blockquote => 1, center => 1, dir => 1,
6699 div => 1, dl => 1, fieldset => 1, listing => 1,
6700 menu => 1, ol => 1, pre => 1, ul => 1,
6701 dd => 1, dt => 1, li => 1,
6702 applet => 1, button => 1, marquee => 1, object => 1,
6703 }->{$token->{tag_name}}) {
6704 ## has an element in scope
6705 my $i;
6706 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6707 my $node = $self->{open_elements}->[$_];
6708 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6709 !!!cp ('t410');
6710 $i = $_;
6711 last INSCOPE;
6712 } elsif ($node->[1] & SCOPING_EL) {
6713 !!!cp ('t411');
6714 last INSCOPE;
6715 }
6716 } # INSCOPE
6717
6718 unless (defined $i) { # has an element in scope
6719 !!!cp ('t413');
6720 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6721 } else {
6722 ## Step 1. generate implied end tags
6723 while ({
6724 dd => ($token->{tag_name} ne 'dd'),
6725 dt => ($token->{tag_name} ne 'dt'),
6726 li => ($token->{tag_name} ne 'li'),
6727 p => 1,
6728 }->{$self->{open_elements}->[-1]->[0]->manakai_local_name}) {
6729 !!!cp ('t409');
6730 pop @{$self->{open_elements}};
6731 }
6732
6733 ## Step 2.
6734 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6735 ne $token->{tag_name}) {
6736 !!!cp ('t412');
6737 !!!parse-error (type => 'not closed',
6738 value => $self->{open_elements}->[-1]->[0]
6739 ->manakai_local_name,
6740 token => $token);
6741 } else {
6742 !!!cp ('t414');
6743 }
6744
6745 ## Step 3.
6746 splice @{$self->{open_elements}}, $i;
6747
6748 ## Step 4.
6749 $clear_up_to_marker->()
6750 if {
6751 applet => 1, button => 1, marquee => 1, object => 1,
6752 }->{$token->{tag_name}};
6753 }
6754 !!!next-token;
6755 next B;
6756 } elsif ($token->{tag_name} eq 'form') {
6757 undef $self->{form_element};
6758
6759 ## has an element in scope
6760 my $i;
6761 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6762 my $node = $self->{open_elements}->[$_];
6763 if ($node->[1] & FORM_EL) {
6764 !!!cp ('t418');
6765 $i = $_;
6766 last INSCOPE;
6767 } elsif ($node->[1] & SCOPING_EL) {
6768 !!!cp ('t419');
6769 last INSCOPE;
6770 }
6771 } # INSCOPE
6772
6773 unless (defined $i) { # has an element in scope
6774 !!!cp ('t421');
6775 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6776 } else {
6777 ## Step 1. generate implied end tags
6778 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
6779 !!!cp ('t417');
6780 pop @{$self->{open_elements}};
6781 }
6782
6783 ## Step 2.
6784 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6785 ne $token->{tag_name}) {
6786 !!!cp ('t417.1');
6787 !!!parse-error (type => 'not closed',
6788 value => $self->{open_elements}->[-1]->[0]
6789 ->manakai_local_name,
6790 token => $token);
6791 } else {
6792 !!!cp ('t420');
6793 }
6794
6795 ## Step 3.
6796 splice @{$self->{open_elements}}, $i;
6797 }
6798
6799 !!!next-token;
6800 next B;
6801 } elsif ({
6802 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6803 }->{$token->{tag_name}}) {
6804 ## has an element in scope
6805 my $i;
6806 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6807 my $node = $self->{open_elements}->[$_];
6808 if ($node->[1] & HEADING_EL) {
6809 !!!cp ('t423');
6810 $i = $_;
6811 last INSCOPE;
6812 } elsif ($node->[1] & SCOPING_EL) {
6813 !!!cp ('t424');
6814 last INSCOPE;
6815 }
6816 } # INSCOPE
6817
6818 unless (defined $i) { # has an element in scope
6819 !!!cp ('t425.1');
6820 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6821 } else {
6822 ## Step 1. generate implied end tags
6823 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
6824 !!!cp ('t422');
6825 pop @{$self->{open_elements}};
6826 }
6827
6828 ## Step 2.
6829 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6830 ne $token->{tag_name}) {
6831 !!!cp ('t425');
6832 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6833 } else {
6834 !!!cp ('t426');
6835 }
6836
6837 ## Step 3.
6838 splice @{$self->{open_elements}}, $i;
6839 }
6840
6841 !!!next-token;
6842 next B;
6843 } elsif ($token->{tag_name} eq 'p') {
6844 ## has an element in scope
6845 my $i;
6846 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6847 my $node = $self->{open_elements}->[$_];
6848 if ($node->[1] & P_EL) {
6849 !!!cp ('t410.1');
6850 $i = $_;
6851 last INSCOPE;
6852 } elsif ($node->[1] & SCOPING_EL) {
6853 !!!cp ('t411.1');
6854 last INSCOPE;
6855 }
6856 } # INSCOPE
6857
6858 if (defined $i) {
6859 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6860 ne $token->{tag_name}) {
6861 !!!cp ('t412.1');
6862 !!!parse-error (type => 'not closed',
6863 value => $self->{open_elements}->[-1]->[0]
6864 ->manakai_local_name,
6865 token => $token);
6866 } else {
6867 !!!cp ('t414.1');
6868 }
6869
6870 splice @{$self->{open_elements}}, $i;
6871 } else {
6872 !!!cp ('t413.1');
6873 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6874
6875 !!!cp ('t415.1');
6876 ## As if <p>, then reprocess the current token
6877 my $el;
6878 !!!create-element ($el, $HTML_NS, 'p',, $token);
6879 $insert->($el);
6880 ## NOTE: Not inserted into |$self->{open_elements}|.
6881 }
6882
6883 !!!next-token;
6884 next B;
6885 } elsif ({
6886 a => 1,
6887 b => 1, big => 1, em => 1, font => 1, i => 1,
6888 nobr => 1, s => 1, small => 1, strile => 1,
6889 strong => 1, tt => 1, u => 1,
6890 }->{$token->{tag_name}}) {
6891 !!!cp ('t427');
6892 $formatting_end_tag->($token);
6893 next B;
6894 } elsif ($token->{tag_name} eq 'br') {
6895 !!!cp ('t428');
6896 !!!parse-error (type => 'unmatched end tag:br', token => $token);
6897
6898 ## As if <br>
6899 $reconstruct_active_formatting_elements->($insert_to_current);
6900
6901 my $el;
6902 !!!create-element ($el, $HTML_NS, 'br',, $token);
6903 $insert->($el);
6904
6905 ## Ignore the token.
6906 !!!next-token;
6907 next B;
6908 } elsif ({
6909 caption => 1, col => 1, colgroup => 1, frame => 1,
6910 frameset => 1, head => 1, option => 1, optgroup => 1,
6911 tbody => 1, td => 1, tfoot => 1, th => 1,
6912 thead => 1, tr => 1,
6913 area => 1, basefont => 1, bgsound => 1,
6914 embed => 1, hr => 1, iframe => 1, image => 1,
6915 img => 1, input => 1, isindex => 1, noembed => 1,
6916 noframes => 1, param => 1, select => 1, spacer => 1,
6917 table => 1, textarea => 1, wbr => 1,
6918 noscript => 0, ## TODO: if scripting is enabled
6919 }->{$token->{tag_name}}) {
6920 !!!cp ('t429');
6921 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6922 ## Ignore the token
6923 !!!next-token;
6924 next B;
6925
6926 ## ISSUE: Issue on HTML5 new elements in spec
6927
6928 } else {
6929 ## Step 1
6930 my $node_i = -1;
6931 my $node = $self->{open_elements}->[$node_i];
6932
6933 ## Step 2
6934 S2: {
6935 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6936 ## Step 1
6937 ## generate implied end tags
6938 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
6939 !!!cp ('t430');
6940 ## ISSUE: Can this case be reached?
6941 pop @{$self->{open_elements}};
6942 }
6943
6944 ## Step 2
6945 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6946 ne $token->{tag_name}) {
6947 !!!cp ('t431');
6948 ## NOTE: <x><y></x>
6949 !!!parse-error (type => 'not closed',
6950 value => $self->{open_elements}->[-1]->[0]
6951 ->manakai_local_name,
6952 token => $token);
6953 } else {
6954 !!!cp ('t432');
6955 }
6956
6957 ## Step 3
6958 splice @{$self->{open_elements}}, $node_i;
6959
6960 !!!next-token;
6961 last S2;
6962 } else {
6963 ## Step 3
6964 if (not ($node->[1] & FORMATTING_EL) and
6965 #not $phrasing_category->{$node->[1]} and
6966 ($node->[1] & SPECIAL_EL or
6967 $node->[1] & SCOPING_EL)) {
6968 !!!cp ('t433');
6969 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6970 ## Ignore the token
6971 !!!next-token;
6972 last S2;
6973 }
6974
6975 !!!cp ('t434');
6976 }
6977
6978 ## Step 4
6979 $node_i--;
6980 $node = $self->{open_elements}->[$node_i];
6981
6982 ## Step 5;
6983 redo S2;
6984 } # S2
6985 next B;
6986 }
6987 }
6988 next B;
6989 } continue { # B
6990 if ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
6991 ## NOTE: The code below is executed in cases where it does not have
6992 ## to be, but it is harmless even in those cases.
6993 ## has an element in scope
6994 INSCOPE: {
6995 for (reverse 0..$#{$self->{open_elements}}) {
6996 my $node = $self->{open_elements}->[$_];
6997 if ($node->[1] & FOREIGN_EL) {
6998 last INSCOPE;
6999 } elsif ($node->[1] & SCOPING_EL) {
7000 last;
7001 }
7002 }
7003
7004 ## NOTE: No foreign element in scope.
7005 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
7006 } # INSCOPE
7007 }
7008 } # B
7009
7010 ## Stop parsing # MUST
7011
7012 ## TODO: script stuffs
7013 } # _tree_construct_main
7014
## Replaces the children of |$node| with the result of parsing an HTML
## string (the |innerHTML| setter).
##
## Arguments:
##   $class   - Parser class; |parse_string| and |new| are invoked on it.
##   $node    - Target node.  Node type 9 (document) and node type 1
##              (element) are supported; any other node type makes this
##              method |die|.
##   $_[0]    - Source string.  It is accessed through a reference so
##              that the caller's string is not copied.
##   $onerror - Optional code reference receiving parse errors as a
##              key/value list (|line|, |column|, |type|, |token|, ...).
##              In the element case, errors are reported by |warn| when
##              it is omitted.
##
## NOTE: The |($$$)| prototype is kept as is for compatibility; it has
## no effect when the sub is invoked as a method.
sub set_inner_html ($$$) {
  my $class = shift;
  my $node = shift;
  my $s = \$_[0];
  my $onerror = $_[1];

  ## ISSUE: Should {confident} be true?

  my $nt = $node->node_type;
  if ($nt == 9) {
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## NOTE: Iterate over a copy of the child list, since the children
    ## are removed from |$node| while looping.
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }

    ## Step 3, 4, 5 # MUST
    $class->parse_string ($$s => $node, $onerror);
  } elsif ($nt == 1) {
    ## TODO: If non-html element

    ## NOTE: Most of this code is copied from |parse_string|

    ## Step 1 # MUST
    ## The fragment is parsed into a fresh, temporary HTML document.
    my $this_doc = $node->owner_document;
    my $doc = $this_doc->implementation->create_document;
    $doc->manakai_is_html (1);
    my $p = $class->new;
    $p->{document} = $doc;

    ## Step 8 # MUST
    my $i = 0; # Offset of the next character to read from |$$s|
    $p->{line_prev} = $p->{line} = 1;
    $p->{column_prev} = $p->{column} = 0;
    ## NOTE: This closure refers to |$p| and is stored in |$p| itself,
    ## i.e. it forms a reference loop, which is broken at the end of
    ## this method.
    $p->{set_next_char} = sub {
      my $self = shift;

      ## Shift the history of previously read characters.
      pop @{$self->{prev_char}};
      unshift @{$self->{prev_char}}, $self->{next_char};

      ## End of the input is represented by |-1|.
      if ($i >= length $$s) {
        $self->{next_char} = -1;
        return;
      }
      $self->{next_char} = ord substr $$s, $i++, 1;

      ($p->{line_prev}, $p->{column_prev}) = ($p->{line}, $p->{column});
      $p->{column}++;

      if ($self->{next_char} == 0x000A) { # LF
        $p->{line}++;
        $p->{column} = 0;
        !!!cp ('i1');
      } elsif ($self->{next_char} == 0x000D) { # CR
        ## A CR LF pair is consumed at once and reported as a single LF.
        $i++ if substr ($$s, $i, 1) eq "\x0A";
        $self->{next_char} = 0x000A; # LF # MUST
        $p->{line}++;
        $p->{column} = 0;
        !!!cp ('i2');
      } elsif ($self->{next_char} > 0x10FFFF) {
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
        !!!cp ('i3');
      } elsif ($self->{next_char} == 0x0000) { # NULL
        !!!cp ('i4');
        !!!parse-error (type => 'NULL');
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
      } elsif ($self->{next_char} <= 0x0008 or
               (0x000E <= $self->{next_char} and
                $self->{next_char} <= 0x001F) or
               (0x007F <= $self->{next_char} and
                $self->{next_char} <= 0x009F) or
               (0xD800 <= $self->{next_char} and
                $self->{next_char} <= 0xDFFF) or
               (0xFDD0 <= $self->{next_char} and
                $self->{next_char} <= 0xFDDF) or
               {
                0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
                0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
                0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
                0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
                0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
                0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
                0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
                0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
                0x10FFFE => 1, 0x10FFFF => 1,
               }->{$self->{next_char}}) {
        ## Disallowed characters are reported but passed through as is.
        !!!cp ('i4.1');
        !!!parse-error (type => 'control char', level => $self->{must_level});
## TODO: error type documentation
      }
    };
    $p->{prev_char} = [-1, -1, -1];
    $p->{next_char} = -1;

    ## Default error handler: report the error by |warn|, preferring
    ## the position of the token (if any) to the current position.
    my $ponerror = $onerror || sub {
      my (%opt) = @_;
      my $line = $opt{line};
      my $column = $opt{column};
      if (defined $opt{token} and defined $opt{token}->{line}) {
        $line = $opt{token}->{line};
        $column = $opt{token}->{column};
      }
      warn "Parse error ($opt{type}) at line $line column $column\n";
    };
    ## NOTE: This closure also refers to |$p| (reference loop).
    $p->{parse_error} = sub {
      $ponerror->(line => $p->{line}, column => $p->{column}, @_);
    };

    $p->_initialize_tokenizer;
    $p->_initialize_tree_constructor;

    ## Step 2
    ## The initial content model depends on the context element.
    my $node_ln = $node->manakai_local_name;
    $p->{content_model} = {
      title => RCDATA_CONTENT_MODEL,
      textarea => RCDATA_CONTENT_MODEL,
      style => CDATA_CONTENT_MODEL,
      script => CDATA_CONTENT_MODEL,
      xmp => CDATA_CONTENT_MODEL,
      iframe => CDATA_CONTENT_MODEL,
      noembed => CDATA_CONTENT_MODEL,
      noframes => CDATA_CONTENT_MODEL,
      noscript => CDATA_CONTENT_MODEL,
      plaintext => PLAINTEXT_CONTENT_MODEL,
    }->{$node_ln};
    $p->{content_model} = PCDATA_CONTENT_MODEL
        unless defined $p->{content_model};
        ## ISSUE: What is "the name of the element"? local name?

    $p->{inner_html_node} = [$node, $el_category->{$node_ln}];
      ## TODO: Foreign element OK?

    ## Step 3
    my $root = $doc->create_element_ns ($HTML_NS, [undef, 'html']);

    ## Step 4 # MUST
    $doc->append_child ($root);

    ## Step 5 # MUST
    push @{$p->{open_elements}}, [$root, $el_category->{html}];

    undef $p->{head_element};

    ## Step 6 # MUST
    $p->_reset_insertion_mode;

    ## Step 7 # MUST
    ## The nearest HTML |form| element among |$node| and its ancestors,
    ## if any, becomes the form element pointer.
    my $anode = $node;
    AN: while (defined $anode) {
      if ($anode->node_type == 1) {
        my $nsuri = $anode->namespace_uri;
        if (defined $nsuri and $nsuri eq $HTML_NS) {
          if ($anode->manakai_local_name eq 'form') {
            !!!cp ('i5');
            $p->{form_element} = $anode;
            last AN;
          }
        }
      }
      $anode = $anode->parent_node;
    } # AN

    ## Step 9 # MUST
    {
      ## NOTE: The |next-token| macro operates on |$self|.
      my $self = $p;
      !!!next-token;
    }
    $p->_tree_construction_main;

    ## Step 10 # MUST
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }
    ## ISSUE: mutation events? read-only?

    ## Step 11 # MUST
    ## Move the parsed nodes from the temporary document to |$node|.
    @cn = @{$root->child_nodes};
    for (@cn) {
      $this_doc->adopt_node ($_);
      $node->append_child ($_);
    }
    ## ISSUE: mutation events?

    $p->_terminate_tree_constructor;

    ## Break the reference loops between |$p| and the closures stored
    ## in it, such that the parser object can be freed.
    delete $p->{parse_error}; # delete loop
    delete $p->{set_next_char}; # delete loop
  } else {
    die "$0: |set_inner_html| is not defined for node of type $nt";
  }
} # set_inner_html
7210
7211 } # tree construction stage
7212
package Whatpm::HTML::RestartParser;
## An exception class derived from |Error|.
## NOTE(review): Presumably thrown to request that parsing be restarted;
## confirm against the code that throws and catches it.
our @ISA;
push @ISA, 'Error';

## Module return value.
1;
7217 # $Date: 2008/04/13 05:54:28 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24