/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.131 - (show annotations) (download) (as text)
Sun Apr 13 05:54:28 2008 UTC (17 years, 9 months ago) by wakaba
Branch: MAIN
Changes since 1.130: +141 -6 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/ChangeLog	13 Apr 2008 05:49:16 -0000
	* HTML.pm.src, mkhtmlparser.pl: Support for element/attribute
	name/namespace fixup (HTML5 revisions 1413, 1415, 1416, and 1417).

2008-04-13  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::HTML;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.130 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 use Error qw(:try);
5
6 ## ISSUE:
7 ## var doc = implementation.createDocument (null, null, null);
8 ## doc.write ('');
9 ## alert (doc.compatMode);
10
11 ## TODO: Control characters and noncharacters are not allowed (HTML5 revision 1263)
12 ## TODO: 1252 parse error (revision 1264)
13 ## TODO: 8859-11 = 874 (revision 1271)
14
15 my $HTML_NS = q<http://www.w3.org/1999/xhtml>;
16 my $MML_NS = q<http://www.w3.org/1998/Math/MathML>;
17 my $SVG_NS = q<http://www.w3.org/2000/svg>;
18 my $XLINK_NS = q<http://www.w3.org/1999/xlink>;
19 my $XML_NS = q<http://www.w3.org/XML/1998/namespace>;
20 my $XMLNS_NS = q<http://www.w3.org/2000/xmlns/>;
21
22 sub A_EL () { 0b1 } # Element category bit flags: each constant through MML_AXML_EL owns one distinct bit so they can be OR-ed into masks (see $el_category). This one: a
23 sub ADDRESS_EL () { 0b10 } # address
24 sub BODY_EL () { 0b100 } # body
25 sub BUTTON_EL () { 0b1000 } # button
26 sub CAPTION_EL () { 0b10000 } # caption
27 sub DD_EL () { 0b100000 } # dd
28 sub DIV_EL () { 0b1000000 } # div
29 sub DT_EL () { 0b10000000 } # dt
30 sub FORM_EL () { 0b100000000 } # form
31 sub FORMATTING_EL () { 0b1000000000 } # a, b, big, em, font, i, nobr, s, small, strike, strong, tt, u
32 sub FRAMESET_EL () { 0b10000000000 } # frameset
33 sub HEADING_EL () { 0b100000000000 } # h1 .. h6
34 sub HTML_EL () { 0b1000000000000 } # html
35 sub LI_EL () { 0b10000000000000 } # li
36 sub NOBR_EL () { 0b100000000000000 } # nobr (combined with FORMATTING_EL in $el_category)
37 sub OPTION_EL () { 0b1000000000000000 } # option
38 sub OPTGROUP_EL () { 0b10000000000000000 } # optgroup
39 sub P_EL () { 0b100000000000000000 } # p
40 sub SELECT_EL () { 0b1000000000000000000 } # select
41 sub TABLE_EL () { 0b10000000000000000000 } # table
42 sub TABLE_CELL_EL () { 0b100000000000000000000 } # td, th
43 sub TABLE_ROW_EL () { 0b1000000000000000000000 } # tr
44 sub TABLE_ROW_GROUP_EL () { 0b10000000000000000000000 } # tbody, tfoot, thead
45 sub MISC_SCOPING_EL () { 0b100000000000000000000000 } # applet, marquee, object
46 sub MISC_SPECIAL_EL () { 0b1000000000000000000000000 } # shared by the many $el_category entries that need no bit of their own
47 sub FOREIGN_EL () { 0b10000000000000000000000000 } # set on non-HTML elements (see the NOTE inside $el_category_f)
48 sub FOREIGN_FLOW_CONTENT_EL () { 0b100000000000000000000000000 } # MathML mi, mo, mn, ms, mtext; SVG foreignObject, desc, title
49 sub MML_AXML_EL () { 0b1000000000000000000000000000 } # MathML annotation-xml
50
51 sub TABLE_ROWS_EL () { # Mask matching table, tr, and the row groups (tbody/tfoot/thead per $el_category).
52 TABLE_EL |
53 TABLE_ROW_EL |
54 TABLE_ROW_GROUP_EL
55 }
56
57 sub END_TAG_OPTIONAL_EL () { # Mask matching dd, dt, li and p; also folded into ALL_END_TAG_OPTIONAL_EL and SPECIAL_EL.
58 DD_EL |
59 DT_EL |
60 LI_EL |
61 P_EL
62 }
63
64 sub ALL_END_TAG_OPTIONAL_EL () { # END_TAG_OPTIONAL_EL widened with body, html, td/th, tr and the row groups.
65 END_TAG_OPTIONAL_EL |
66 BODY_EL |
67 HTML_EL |
68 TABLE_CELL_EL |
69 TABLE_ROW_EL |
70 TABLE_ROW_GROUP_EL
71 }
72
73 sub SCOPING_EL () { # Mask matching button, caption, html, table, td/th and the MISC_SCOPING_EL elements (applet, marquee, object).
74 BUTTON_EL |
75 CAPTION_EL |
76 HTML_EL |
77 TABLE_EL |
78 TABLE_CELL_EL |
79 MISC_SCOPING_EL
80 }
81
82 sub TABLE_SCOPING_EL () { # Mask matching html and table only.
83 HTML_EL |
84 TABLE_EL
85 }
86
87 sub TABLE_ROWS_SCOPING_EL () { # Mask matching html and the row groups (tbody/tfoot/thead).
88 HTML_EL |
89 TABLE_ROW_GROUP_EL
90 }
91
92 sub TABLE_ROW_SCOPING_EL () { # Mask matching html and tr.
93 HTML_EL |
94 TABLE_ROW_EL
95 }
96
97 sub SPECIAL_EL () { # Mask of the categories OR-ed below; note it includes END_TAG_OPTIONAL_EL (dd, dt, li, p) and the catch-all MISC_SPECIAL_EL.
98 ADDRESS_EL |
99 BODY_EL |
100 DIV_EL |
101 END_TAG_OPTIONAL_EL |
102 FORM_EL |
103 FRAMESET_EL |
104 HEADING_EL |
105 OPTION_EL |
106 OPTGROUP_EL |
107 SELECT_EL |
108 TABLE_ROW_EL |
109 TABLE_ROW_GROUP_EL |
110 MISC_SPECIAL_EL
111 }
112
113 my $el_category = { # Maps a lowercase element name to its category bit mask built from the *_EL constants (a and nobr carry two bits).
114 a => A_EL | FORMATTING_EL,
115 address => ADDRESS_EL,
116 applet => MISC_SCOPING_EL,
117 area => MISC_SPECIAL_EL,
118 b => FORMATTING_EL,
119 base => MISC_SPECIAL_EL,
120 basefont => MISC_SPECIAL_EL,
121 bgsound => MISC_SPECIAL_EL,
122 big => FORMATTING_EL,
123 blockquote => MISC_SPECIAL_EL,
124 body => BODY_EL,
125 br => MISC_SPECIAL_EL,
126 button => BUTTON_EL,
127 caption => CAPTION_EL,
128 center => MISC_SPECIAL_EL,
129 col => MISC_SPECIAL_EL,
130 colgroup => MISC_SPECIAL_EL,
131 dd => DD_EL,
132 dir => MISC_SPECIAL_EL,
133 div => DIV_EL,
134 dl => MISC_SPECIAL_EL,
135 dt => DT_EL,
136 em => FORMATTING_EL,
137 embed => MISC_SPECIAL_EL,
138 fieldset => MISC_SPECIAL_EL,
139 font => FORMATTING_EL,
140 form => FORM_EL,
141 frame => MISC_SPECIAL_EL,
142 frameset => FRAMESET_EL,
143 h1 => HEADING_EL,
144 h2 => HEADING_EL,
145 h3 => HEADING_EL,
146 h4 => HEADING_EL,
147 h5 => HEADING_EL,
148 h6 => HEADING_EL,
149 head => MISC_SPECIAL_EL,
150 hr => MISC_SPECIAL_EL,
151 html => HTML_EL,
152 i => FORMATTING_EL,
153 iframe => MISC_SPECIAL_EL,
154 img => MISC_SPECIAL_EL,
155 input => MISC_SPECIAL_EL,
156 isindex => MISC_SPECIAL_EL,
157 li => LI_EL,
158 link => MISC_SPECIAL_EL,
159 listing => MISC_SPECIAL_EL,
160 marquee => MISC_SCOPING_EL,
161 menu => MISC_SPECIAL_EL,
162 meta => MISC_SPECIAL_EL,
163 nobr => NOBR_EL | FORMATTING_EL,
164 noembed => MISC_SPECIAL_EL,
165 noframes => MISC_SPECIAL_EL,
166 noscript => MISC_SPECIAL_EL,
167 object => MISC_SCOPING_EL,
168 ol => MISC_SPECIAL_EL,
169 optgroup => OPTGROUP_EL,
170 option => OPTION_EL,
171 p => P_EL,
172 param => MISC_SPECIAL_EL,
173 plaintext => MISC_SPECIAL_EL,
174 pre => MISC_SPECIAL_EL,
175 s => FORMATTING_EL,
176 script => MISC_SPECIAL_EL,
177 select => SELECT_EL,
178 small => FORMATTING_EL,
179 spacer => MISC_SPECIAL_EL,
180 strike => FORMATTING_EL,
181 strong => FORMATTING_EL,
182 style => MISC_SPECIAL_EL,
183 table => TABLE_EL,
184 tbody => TABLE_ROW_GROUP_EL,
185 td => TABLE_CELL_EL,
186 textarea => MISC_SPECIAL_EL,
187 tfoot => TABLE_ROW_GROUP_EL,
188 th => TABLE_CELL_EL,
189 thead => TABLE_ROW_GROUP_EL,
190 title => MISC_SPECIAL_EL,
191 tr => TABLE_ROW_EL,
192 tt => FORMATTING_EL,
193 u => FORMATTING_EL,
194 ul => MISC_SPECIAL_EL,
195 wbr => MISC_SPECIAL_EL,
196 };
197
198 my $el_category_f = { # Category bits for foreign elements, keyed first by namespace URI and then by element name.
199 $MML_NS => { # MathML
200 'annotation-xml' => MML_AXML_EL,
201 mi => FOREIGN_FLOW_CONTENT_EL,
202 mo => FOREIGN_FLOW_CONTENT_EL,
203 mn => FOREIGN_FLOW_CONTENT_EL,
204 ms => FOREIGN_FLOW_CONTENT_EL,
205 mtext => FOREIGN_FLOW_CONTENT_EL,
206 },
207 $SVG_NS => { # SVG; foreignObject is keyed in its mixed-case spelling
208 foreignObject => FOREIGN_FLOW_CONTENT_EL,
209 desc => FOREIGN_FLOW_CONTENT_EL,
210 title => FOREIGN_FLOW_CONTENT_EL,
211 },
212 ## NOTE: In addition, FOREIGN_EL is set to non-HTML elements. (That OR happens outside this table; the code doing it is not in this part of the file.)
213 };
214
215 my $svg_attr_name = { # Maps an all-lowercase name to its mixed-case spelling. NOTE(review): presumably applied to SVG attribute names lowercased by the tokenizer -- confirm at the use site.
216 attributetype => 'attributeType',
217 basefrequency => 'baseFrequency',
218 baseprofile => 'baseProfile',
219 calcmode => 'calcMode',
220 clippathunits => 'clipPathUnits',
221 contentscripttype => 'contentScriptType',
222 contentstyletype => 'contentStyleType',
223 diffuseconstant => 'diffuseConstant',
224 edgemode => 'edgeMode',
225 externalresourcesrequired => 'externalResourcesRequired',
226 fecolormatrix => 'feColorMatrix',
227 fecomposite => 'feComposite',
228 fegaussianblur => 'feGaussianBlur',
229 femorphology => 'feMorphology',
230 fetile => 'feTile',
231 filterres => 'filterRes',
232 filterunits => 'filterUnits',
233 glyphref => 'glyphRef',
234 gradienttransform => 'gradientTransform',
235 gradientunits => 'gradientUnits',
236 kernelmatrix => 'kernelMatrix',
237 kernelunitlength => 'kernelUnitLength',
238 keypoints => 'keyPoints',
239 keysplines => 'keySplines',
240 keytimes => 'keyTimes',
241 lengthadjust => 'lengthAdjust',
242 limitingconeangle => 'limitingConeAngle',
243 markerheight => 'markerHeight',
244 markerunits => 'markerUnits',
245 markerwidth => 'markerWidth',
246 maskcontentunits => 'maskContentUnits',
247 maskunits => 'maskUnits',
248 numoctaves => 'numOctaves',
249 pathlength => 'pathLength',
250 patterncontentunits => 'patternContentUnits',
251 patterntransform => 'patternTransform',
252 patternunits => 'patternUnits',
253 pointsatx => 'pointsAtX',
254 pointsaty => 'pointsAtY',
255 pointsatz => 'pointsAtZ',
256 preservealpha => 'preserveAlpha',
257 preserveaspectratio => 'preserveAspectRatio',
258 primitiveunits => 'primitiveUnits',
259 refx => 'refX',
260 refy => 'refY',
261 repeatcount => 'repeatCount',
262 repeatdur => 'repeatDur',
263 requiredextensions => 'requiredExtensions',
264 specularconstant => 'specularConstant',
265 specularexponent => 'specularExponent',
266 spreadmethod => 'spreadMethod',
267 startoffset => 'startOffset',
268 stddeviation => 'stdDeviation',
269 stitchtiles => 'stitchTiles',
270 surfacescale => 'surfaceScale',
271 systemlanguage => 'systemLanguage',
272 tablevalues => 'tableValues',
273 targetx => 'targetX',
274 targety => 'targetY',
275 textlength => 'textLength',
276 viewbox => 'viewBox',
277 viewtarget => 'viewTarget',
278 xchannelselector => 'xChannelSelector',
279 ychannelselector => 'yChannelSelector',
280 zoomandpan => 'zoomAndPan',
281 };
282
283 my $foreign_attr_xname = { # Maps a qualified attribute name to [namespace URI, [prefix, local name]].
284 'xlink:actuate' => [$XLINK_NS, ['xlink', 'actuate']],
285 'xlink:arcrole' => [$XLINK_NS, ['xlink', 'arcrole']],
286 'xlink:href' => [$XLINK_NS, ['xlink', 'href']],
287 'xlink:role' => [$XLINK_NS, ['xlink', 'role']],
288 'xlink:show' => [$XLINK_NS, ['xlink', 'show']],
289 'xlink:title' => [$XLINK_NS, ['xlink', 'title']],
290 'xlink:type' => [$XLINK_NS, ['xlink', 'type']],
291 'xml:base' => [$XML_NS, ['xml', 'base']],
292 'xml:lang' => [$XML_NS, ['xml', 'lang']],
293 'xml:space' => [$XML_NS, ['xml', 'space']],
294 'xmlns' => [$XMLNS_NS, [undef, 'xmlns']], # no prefix: the prefix slot is undef
295 'xmlns:xlink' => [$XMLNS_NS, ['xmlns', 'xlink']],
296 };
297
298 ## ISSUE: xmlns:xlink="non-xlink-ns" is not an error.
299
300 my $c1_entity_char = { # Maps code points 0x80..0x9F to replacement code points. NOTE(review): the values look like the Windows-1252 assignments and are presumably applied to numeric character references -- confirm at the use site.
301 0x80 => 0x20AC,
302 0x81 => 0xFFFD, # no replacement: U+FFFD REPLACEMENT CHARACTER
303 0x82 => 0x201A,
304 0x83 => 0x0192,
305 0x84 => 0x201E,
306 0x85 => 0x2026,
307 0x86 => 0x2020,
308 0x87 => 0x2021,
309 0x88 => 0x02C6,
310 0x89 => 0x2030,
311 0x8A => 0x0160,
312 0x8B => 0x2039,
313 0x8C => 0x0152,
314 0x8D => 0xFFFD, # no replacement: U+FFFD
315 0x8E => 0x017D,
316 0x8F => 0xFFFD, # no replacement: U+FFFD
317 0x90 => 0xFFFD, # no replacement: U+FFFD
318 0x91 => 0x2018,
319 0x92 => 0x2019,
320 0x93 => 0x201C,
321 0x94 => 0x201D,
322 0x95 => 0x2022,
323 0x96 => 0x2013,
324 0x97 => 0x2014,
325 0x98 => 0x02DC,
326 0x99 => 0x2122,
327 0x9A => 0x0161,
328 0x9B => 0x203A,
329 0x9C => 0x0153,
330 0x9D => 0xFFFD, # no replacement: U+FFFD
331 0x9E => 0x017E,
332 0x9F => 0x0178,
333 }; # $c1_entity_char
334
335 sub parse_byte_string ($$$$;$) { # Decode a byte string and parse it. Args: parser object or class name, charset label or undef, the bytes (string or reference to one), then the arguments forwarded to parse_char_string. Returns what parse_char_string returns.
336 my $self = ref $_[0] ? shift : shift->new; # called on a class name: build a parser on the fly
337 my $charset = shift;
338 my $bytes_s = ref $_[0] ? $_[0] : \($_[0]); # keep a reference so the bytes can be decoded again on restart
339 my $s;
340 require Encode; # loaded up front: the detection branch and the restart handler below call Encode::decode too
341 if (defined $charset) {
342 ## TODO: decode(utf8) don't delete BOM
343 $s = \ (Encode::decode ($charset, $$bytes_s));
344 $self->{input_encoding} = lc $charset; ## TODO: normalize name
345 $self->{confident} = 1; # caller-supplied charset is trusted
346 } else {
347 ## TODO: Implement HTML5 detection algorithm
348 require Whatpm::Charset::UniversalCharDet;
349 $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string
350 (substr ($$bytes_s, 0, 1024)); # only the first 1024 bytes are sniffed
351 $charset ||= 'windows-1252'; # fallback when detection yields nothing
352 $s = \ (Encode::decode ($charset, $$bytes_s));
353 $self->{input_encoding} = $charset; # NOTE(review): not lowercased here, unlike the branch above; change_encoding compares against a lowercased label -- confirm the detector's output case
354 $self->{confident} = 0; # tentative: a charset declaration may still trigger a restart
355 }
356
357 $self->{change_encoding} = sub { # callback: ($self, charset label, token); either confirms the encoding or throws RestartParser
358 my $self = shift;
359 my $charset = lc shift;
360 my $token = shift;
361 ## TODO: if $charset is supported
362 ## TODO: normalize charset name
363
364 ## "Change the encoding" algorithm:
365
366 ## Step 1
367 if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
368 $charset = 'utf-8';
369 }
370
371 ## Step 2
372 if (defined $self->{input_encoding} and
373 $self->{input_encoding} eq $charset) {
374 $self->{confident} = 1; # declared label matches what we decoded with
375 return;
376 }
377
378 !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
379 ':'.$charset, level => 'w', token => $token);
380
381 ## Step 3
382 # if (can) {
383 ## change the encoding on the fly.
384 #$self->{confident} = 1;
385 #return;
386 # }
387
388 ## Step 4
389 throw Whatpm::HTML::RestartParser (charset => $charset); # caught by the try block below
390 }; # $self->{change_encoding}
391
392 my @args = @_; shift @args; # $s
393 my $return;
394 try {
395 $return = $self->parse_char_string ($s, @args);
396 } catch Whatpm::HTML::RestartParser with { # re-decode the original bytes with the new charset and parse once more
397 my $charset = shift->{charset};
398 $s = \ (Encode::decode ($charset, $$bytes_s));
399 $self->{input_encoding} = $charset; ## TODO: normalize
400 $self->{confident} = 1;
401 $return = $self->parse_char_string ($s, @args);
402 };
403 return $return;
404 } # parse_byte_string
405
406 ## NOTE: The HTML5 spec says that the encoding layer MUST NOT strip the BOM
407 ## and the HTML layer MUST ignore it. However, we do strip the BOM in
408 ## the encoding layer and the HTML layer does not ignore any U+FEFF,
409 ## because the core part of our HTML parser expects a string of characters,
410 ## not a string of bytes or code units or anything which might contain a BOM.
411 ## Therefore, any parser interface that accepts a string of bytes,
412 ## such as |parse_byte_string| in this module, must ensure that it does
413 ## strip the BOM and never strips any ZWNBSP.
414
415 *parse_char_string = \&parse_string; # glob alias: parse_char_string and parse_string are the same subroutine
416
417 sub parse_string ($$$;$) { # Parse a character string into a document. Args: parser object or class name, the string (or a reference to one), the document object to fill, optional error callback. Returns the document.
418 my $self = ref $_[0] ? shift : shift->new; # called on a class name: build a parser on the fly
419 my $s = ref $_[0] ? $_[0] : \($_[0]); # always work through a reference to the input
420 $self->{document} = $_[1];
421 @{$self->{document}->child_nodes} = (); # discard whatever the document contained before
422
423 ## NOTE: |set_inner_html| copies most of this method's code
424
425 $self->{confident} = 1 unless exists $self->{confident}; # parse_byte_string may already have set it
426 $self->{document}->input_encoding ($self->{input_encoding})
427 if defined $self->{input_encoding};
428
429 my $i = 0; # read offset into $$s, shared with the closure below
430 $self->{line_prev} = $self->{line} = 1;
431 $self->{column_prev} = $self->{column} = 0;
432 $self->{set_next_char} = sub { # advance one character; takes the parser as its first argument
433 my $self = shift;
434
435 pop @{$self->{prev_char}}; # prev_char keeps the three most recent characters, newest first
436 unshift @{$self->{prev_char}}, $self->{next_char};
437
438 $self->{next_char} = -1 and return if $i >= length $$s; # -1 marks end of input
439 $self->{next_char} = ord substr $$s, $i++, 1;
440
441 ($self->{line_prev}, $self->{column_prev})
442 = ($self->{line}, $self->{column}); # remember the position before this character
443 $self->{column}++;
444
445 if ($self->{next_char} == 0x000A) { # LF
446 $self->{line}++;
447 $self->{column} = 0;
448 } elsif ($self->{next_char} == 0x000D) { # CR
449 $i++ if substr ($$s, $i, 1) eq "\x0A"; # swallow the LF of a CR LF pair
450 $self->{next_char} = 0x000A; # LF # MUST
451 $self->{line}++;
452 $self->{column} = 0;
453 } elsif ($self->{next_char} > 0x10FFFF) {
454 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
455 } elsif ($self->{next_char} == 0x0000) { # NULL
456 !!!parse-error (type => 'NULL');
457 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
458 }
459 };
460 $self->{prev_char} = [-1, -1, -1];
461 $self->{next_char} = -1;
462
463 my $onerror = $_[2] || sub { # default error handler: warn with type and position
464 my (%opt) = @_;
465 my $line = $opt{token} ? $opt{token}->{line} : $opt{line}; # a token's own position wins over the current one
466 my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
467 warn "Parse error ($opt{type}) at line $line column $column\n";
468 };
469 $self->{parse_error} = sub { # prepend the current position; later keys in @_ override it
470 $onerror->(line => $self->{line}, column => $self->{column}, @_);
471 };
472
473 $self->_initialize_tokenizer;
474 $self->_initialize_tree_constructor;
475 $self->_construct_tree;
476 $self->_terminate_tree_constructor;
477
478 delete $self->{parse_error}; # remove loop (the closure above captures $self)
479
480 return $self->{document};
481 } # parse_string
482
483 sub new ($) { # Constructor. Takes the class name; returns a blessed hash whose callbacks are all harmless defaults.
484 my $class = shift;
485 my $self = bless {}, $class;
486 $self->{set_next_char} = sub { # default input source is empty: always report end of input (-1)
487 $_[0]->{next_char} = -1; # act on the parser passed as first argument (same convention as the closure installed by parse_string) instead of capturing $self, which would form a reference cycle
488 };
489 $self->{parse_error} = sub { # default: ignore parse errors
490 #
491 };
492 $self->{change_encoding} = sub { # default: never change the encoding
493 # if ($_[0] is a supported encoding) {
494 # run "change the encoding" algorithm;
495 # throw Whatpm::HTML::RestartParser (charset => $new_encoding);
496 # }
497 };
498 $self->{application_cache_selection} = sub { # default: do nothing
499 #
500 };
501 return $self;
502 } # new
503
504 sub CM_ENTITY () { 0b001 } # & markup in data (content model feature bits; combined into the *_CONTENT_MODEL values)
505 sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
506 sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
507
508 sub PLAINTEXT_CONTENT_MODEL () { 0 } # no feature bit set: neither & nor < is markup
509 sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited < markup only
510 sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # & markup plus limited < markup
511 sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # & markup plus full < markup
512
513 sub DATA_STATE () { 0 } # Tokenizer state numbers: stored in $self->{state} and compared with == in _get_next_token. Values are distinct but not in listing order (33..35 come last).
514 sub ENTITY_DATA_STATE () { 1 }
515 sub TAG_OPEN_STATE () { 2 }
516 sub CLOSE_TAG_OPEN_STATE () { 3 }
517 sub TAG_NAME_STATE () { 4 }
518 sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
519 sub ATTRIBUTE_NAME_STATE () { 6 }
520 sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
521 sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
522 sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
523 sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
524 sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
525 sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
526 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
527 sub COMMENT_START_STATE () { 14 }
528 sub COMMENT_START_DASH_STATE () { 15 }
529 sub COMMENT_STATE () { 16 }
530 sub COMMENT_END_STATE () { 17 }
531 sub COMMENT_END_DASH_STATE () { 18 }
532 sub BOGUS_COMMENT_STATE () { 19 }
533 sub DOCTYPE_STATE () { 20 }
534 sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
535 sub DOCTYPE_NAME_STATE () { 22 }
536 sub AFTER_DOCTYPE_NAME_STATE () { 23 }
537 sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
538 sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
539 sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
540 sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
541 sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
542 sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
543 sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
544 sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
545 sub BOGUS_DOCTYPE_STATE () { 32 }
546 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
547 sub SELF_CLOSING_START_TAG_STATE () { 34 }
548 sub CDATA_BLOCK_STATE () { 35 }
549
550 sub DOCTYPE_TOKEN () { 1 } # Token type codes: the value of a token's {type} field.
551 sub COMMENT_TOKEN () { 2 }
552 sub START_TAG_TOKEN () { 3 }
553 sub END_TAG_TOKEN () { 4 }
554 sub END_OF_FILE_TOKEN () { 5 }
555 sub CHARACTER_TOKEN () { 6 }
556
557 sub AFTER_HTML_IMS () { 0b100 } # *_IMS constants are group bits; the *_IM constants further down combine group bits with the low two bits (0b01/0b10/0b11) to tell apart modes of one group.
558 sub HEAD_IMS () { 0b1000 }
559 sub BODY_IMS () { 0b10000 }
560 sub BODY_TABLE_IMS () { 0b100000 }
561 sub TABLE_IMS () { 0b1000000 }
562 sub ROW_IMS () { 0b10000000 }
563 sub BODY_AFTER_IMS () { 0b100000000 }
564 sub FRAME_IMS () { 0b1000000000 }
565 sub SELECT_IMS () { 0b10000000000 }
566 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
567 ## NOTE: "in foreign content" insertion mode is special; it is combined
568 ## with the secondary insertion mode. In this parser, they are stored
569 ## together in the bit-or'ed form.
570
571 ## NOTE: "initial" and "before html" insertion modes have no constants.
572
573 ## NOTE: "after after body" insertion mode.
574 sub AFTER_HTML_BODY_IM () { AFTER_HTML_IMS | BODY_AFTER_IMS }
575
576 ## NOTE: "after after frameset" insertion mode.
577 sub AFTER_HTML_FRAMESET_IM () { AFTER_HTML_IMS | FRAME_IMS }
578
579 sub IN_HEAD_IM () { HEAD_IMS | 0b00 } # the four head-group modes differ only in the low two bits
580 sub IN_HEAD_NOSCRIPT_IM () { HEAD_IMS | 0b01 }
581 sub AFTER_HEAD_IM () { HEAD_IMS | 0b10 }
582 sub BEFORE_HEAD_IM () { HEAD_IMS | 0b11 }
583 sub IN_BODY_IM () { BODY_IMS }
584 sub IN_CELL_IM () { BODY_IMS | BODY_TABLE_IMS | 0b01 } # carries both the body and the body-table group bits
585 sub IN_CAPTION_IM () { BODY_IMS | BODY_TABLE_IMS | 0b10 }
586 sub IN_ROW_IM () { TABLE_IMS | ROW_IMS | 0b01 } # carries both the table and the row group bits
587 sub IN_TABLE_BODY_IM () { TABLE_IMS | ROW_IMS | 0b10 }
588 sub IN_TABLE_IM () { TABLE_IMS }
589 sub AFTER_BODY_IM () { BODY_AFTER_IMS }
590 sub IN_FRAMESET_IM () { FRAME_IMS | 0b01 }
591 sub AFTER_FRAMESET_IM () { FRAME_IMS | 0b10 }
592 sub IN_SELECT_IM () { SELECT_IMS | 0b01 }
593 sub IN_SELECT_IN_TABLE_IM () { SELECT_IMS | 0b10 }
594 sub IN_COLUMN_GROUP_IM () { 0b10 } # no group bit at all: only the low bits are set
595
596 ## Implementations MUST act as if they used the state machine described in the spec.
597
598 sub _initialize_tokenizer ($) { # Reset the tokenizer fields on the parser and read the first input character. Takes the parser object only.
599 my $self = shift;
600 $self->{state} = DATA_STATE; # MUST
601 $self->{content_model} = PCDATA_CONTENT_MODEL; # be
602 undef $self->{current_token}; # start tag, end tag, comment, or DOCTYPE
603 undef $self->{current_attribute};
604 undef $self->{last_emitted_start_tag_name};
605 undef $self->{last_attribute_value_state};
606 delete $self->{self_closing}; # clear any self-closing flag left from a previous run
607 $self->{char} = [];
608 # $self->{next_char}
609 !!!next-input-character;
610 $self->{token} = []; # queue of pending tokens; _get_next_token returns from it before tokenizing further
611 # $self->{escape}
612 } # _initialize_tokenizer
613
614 ## A token has:
615 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
616 ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
617 ## ->{name} (DOCTYPE_TOKEN)
618 ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
619 ## ->{public_identifier} (DOCTYPE_TOKEN)
620 ## ->{system_identifier} (DOCTYPE_TOKEN)
621 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
622 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
623 ## ->{name}
624 ## ->{value}
625 ## ->{has_reference} == 1 or 0
626 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
627 ## NOTE: The "self-closing flag" is held in |$self->{self_closing}|.
628 ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
629 ## while the token is pushed back to the stack.
630
631 ## ISSUE: "When a DOCTYPE token is created, its
632 ## <i>self-closing flag</i> must be unset (its other state is that it
633 ## be set), and its attributes list must be empty.": Wrong subject?
634
635 ## Emitted token MUST immediately be handled by the tree construction state.
636
637 ## Before each step, UA MAY check to see if either one of the scripts in
638 ## "list of scripts that will execute as soon as possible" or the first
639 ## script in the "list of scripts that will execute asynchronously",
640 ## has completed loading. If one has, then it MUST be executed
641 ## and removed from the list.
642
643 ## NOTE: HTML5 "Writing HTML documents" section, applied to
644 ## documents and not to user agents and conformance checkers,
645 ## contains some requirements that are not detected by the
646 ## parsing algorithm:
647 ## - Some requirements on character encoding declarations. ## TODO
648 ## - "Elements MUST NOT contain content that their content model disallows."
649 ## ... Some are parse error, some are not (will be reported by c.c.).
650 ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
651 ## - Text (in elements, attributes, and comments) SHOULD NOT contain
652 ## control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL? Unicode control character?)
653
654 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
655 ## be detected by the HTML5 parsing algorithm:
656 ## - Text,
657
658 sub _get_next_token ($) {
659 my $self = shift;
660
661 if ($self->{self_closing}) {
662 !!!parse-error (type => 'nestc', token => $self->{current_token});
663 ## NOTE: The |self_closing| flag is only set by start tag token.
664 ## In addition, when a start tag token is emitted, it is always set to
665 ## |current_token|.
666 delete $self->{self_closing};
667 }
668
669 if (@{$self->{token}}) {
670 $self->{self_closing} = $self->{token}->[0]->{self_closing};
671 return shift @{$self->{token}};
672 }
673
674 A: {
675 if ($self->{state} == DATA_STATE) {
676 if ($self->{next_char} == 0x0026) { # &
677 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
678 not $self->{escape}) {
679 !!!cp (1);
680 $self->{state} = ENTITY_DATA_STATE;
681 !!!next-input-character;
682 redo A;
683 } else {
684 !!!cp (2);
685 #
686 }
687 } elsif ($self->{next_char} == 0x002D) { # -
688 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
689 unless ($self->{escape}) {
690 if ($self->{prev_char}->[0] == 0x002D and # -
691 $self->{prev_char}->[1] == 0x0021 and # !
692 $self->{prev_char}->[2] == 0x003C) { # <
693 !!!cp (3);
694 $self->{escape} = 1;
695 } else {
696 !!!cp (4);
697 }
698 } else {
699 !!!cp (5);
700 }
701 }
702
703 #
704 } elsif ($self->{next_char} == 0x003C) { # <
705 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
706 (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
707 not $self->{escape})) {
708 !!!cp (6);
709 $self->{state} = TAG_OPEN_STATE;
710 !!!next-input-character;
711 redo A;
712 } else {
713 !!!cp (7);
714 #
715 }
716 } elsif ($self->{next_char} == 0x003E) { # >
717 if ($self->{escape} and
718 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
719 if ($self->{prev_char}->[0] == 0x002D and # -
720 $self->{prev_char}->[1] == 0x002D) { # -
721 !!!cp (8);
722 delete $self->{escape};
723 } else {
724 !!!cp (9);
725 }
726 } else {
727 !!!cp (10);
728 }
729
730 #
731 } elsif ($self->{next_char} == -1) {
732 !!!cp (11);
733 !!!emit ({type => END_OF_FILE_TOKEN,
734 line => $self->{line}, column => $self->{column}});
735 last A; ## TODO: ok?
736 } else {
737 !!!cp (12);
738 }
739 # Anything else
740 my $token = {type => CHARACTER_TOKEN,
741 data => chr $self->{next_char},
742 line => $self->{line}, column => $self->{column},
743 };
744 ## Stay in the data state
745 !!!next-input-character;
746
747 !!!emit ($token);
748
749 redo A;
750 } elsif ($self->{state} == ENTITY_DATA_STATE) {
751 ## (cannot happen in CDATA state)
752
753 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
754
755 my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
756
757 $self->{state} = DATA_STATE;
758 # next-input-character is already done
759
760 unless (defined $token) {
761 !!!cp (13);
762 !!!emit ({type => CHARACTER_TOKEN, data => '&',
763 line => $l, column => $c,
764 });
765 } else {
766 !!!cp (14);
767 !!!emit ($token);
768 }
769
770 redo A;
771 } elsif ($self->{state} == TAG_OPEN_STATE) {
772 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
773 if ($self->{next_char} == 0x002F) { # /
774 !!!cp (15);
775 !!!next-input-character;
776 $self->{state} = CLOSE_TAG_OPEN_STATE;
777 redo A;
778 } else {
779 !!!cp (16);
780 ## reconsume
781 $self->{state} = DATA_STATE;
782
783 !!!emit ({type => CHARACTER_TOKEN, data => '<',
784 line => $self->{line_prev},
785 column => $self->{column_prev},
786 });
787
788 redo A;
789 }
790 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
791 if ($self->{next_char} == 0x0021) { # !
792 !!!cp (17);
793 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
794 !!!next-input-character;
795 redo A;
796 } elsif ($self->{next_char} == 0x002F) { # /
797 !!!cp (18);
798 $self->{state} = CLOSE_TAG_OPEN_STATE;
799 !!!next-input-character;
800 redo A;
801 } elsif (0x0041 <= $self->{next_char} and
802 $self->{next_char} <= 0x005A) { # A..Z
803 !!!cp (19);
804 $self->{current_token}
805 = {type => START_TAG_TOKEN,
806 tag_name => chr ($self->{next_char} + 0x0020),
807 line => $self->{line_prev},
808 column => $self->{column_prev}};
809 $self->{state} = TAG_NAME_STATE;
810 !!!next-input-character;
811 redo A;
812 } elsif (0x0061 <= $self->{next_char} and
813 $self->{next_char} <= 0x007A) { # a..z
814 !!!cp (20);
815 $self->{current_token} = {type => START_TAG_TOKEN,
816 tag_name => chr ($self->{next_char}),
817 line => $self->{line_prev},
818 column => $self->{column_prev}};
819 $self->{state} = TAG_NAME_STATE;
820 !!!next-input-character;
821 redo A;
822 } elsif ($self->{next_char} == 0x003E) { # >
823 !!!cp (21);
824 !!!parse-error (type => 'empty start tag',
825 line => $self->{line_prev},
826 column => $self->{column_prev});
827 $self->{state} = DATA_STATE;
828 !!!next-input-character;
829
830 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
831 line => $self->{line_prev},
832 column => $self->{column_prev},
833 });
834
835 redo A;
836 } elsif ($self->{next_char} == 0x003F) { # ?
837 !!!cp (22);
838 !!!parse-error (type => 'pio',
839 line => $self->{line_prev},
840 column => $self->{column_prev});
841 $self->{state} = BOGUS_COMMENT_STATE;
842 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
843 line => $self->{line_prev},
844 column => $self->{column_prev},
845 };
846 ## $self->{next_char} is intentionally left as is
847 redo A;
848 } else {
849 !!!cp (23);
850 !!!parse-error (type => 'bare stago');
851 $self->{state} = DATA_STATE;
852 ## reconsume
853
854 !!!emit ({type => CHARACTER_TOKEN, data => '<',
855 line => $self->{line_prev},
856 column => $self->{column_prev},
857 });
858
859 redo A;
860 }
861 } else {
862 die "$0: $self->{content_model} in tag open";
863 }
864 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
865 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
866 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
867 if (defined $self->{last_emitted_start_tag_name}) {
868
869 ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
870 my @next_char;
871 TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
872 push @next_char, $self->{next_char};
873 my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
874 my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
875 if ($self->{next_char} == $c or $self->{next_char} == $C) {
876 !!!cp (24);
877 !!!next-input-character;
878 next TAGNAME;
879 } else {
880 !!!cp (25);
881 $self->{next_char} = shift @next_char; # reconsume
882 !!!back-next-input-character (@next_char);
883 $self->{state} = DATA_STATE;
884
885 !!!emit ({type => CHARACTER_TOKEN, data => '</',
886 line => $l, column => $c,
887 });
888
889 redo A;
890 }
891 }
892 push @next_char, $self->{next_char};
893
894 unless ($self->{next_char} == 0x0009 or # HT
895 $self->{next_char} == 0x000A or # LF
896 $self->{next_char} == 0x000B or # VT
897 $self->{next_char} == 0x000C or # FF
898 $self->{next_char} == 0x0020 or # SP
899 $self->{next_char} == 0x003E or # >
900 $self->{next_char} == 0x002F or # /
901 $self->{next_char} == -1) {
902 !!!cp (26);
903 $self->{next_char} = shift @next_char; # reconsume
904 !!!back-next-input-character (@next_char);
905 $self->{state} = DATA_STATE;
906 !!!emit ({type => CHARACTER_TOKEN, data => '</',
907 line => $l, column => $c,
908 });
909 redo A;
910 } else {
911 !!!cp (27);
912 $self->{next_char} = shift @next_char;
913 !!!back-next-input-character (@next_char);
914 # and consume...
915 }
916 } else {
917 ## No start tag token has ever been emitted
918 !!!cp (28);
919 # next-input-character is already done
920 $self->{state} = DATA_STATE;
921 !!!emit ({type => CHARACTER_TOKEN, data => '</',
922 line => $l, column => $c,
923 });
924 redo A;
925 }
926 }
927
928 if (0x0041 <= $self->{next_char} and
929 $self->{next_char} <= 0x005A) { # A..Z
930 !!!cp (29);
931 $self->{current_token}
932 = {type => END_TAG_TOKEN,
933 tag_name => chr ($self->{next_char} + 0x0020),
934 line => $l, column => $c};
935 $self->{state} = TAG_NAME_STATE;
936 !!!next-input-character;
937 redo A;
938 } elsif (0x0061 <= $self->{next_char} and
939 $self->{next_char} <= 0x007A) { # a..z
940 !!!cp (30);
941 $self->{current_token} = {type => END_TAG_TOKEN,
942 tag_name => chr ($self->{next_char}),
943 line => $l, column => $c};
944 $self->{state} = TAG_NAME_STATE;
945 !!!next-input-character;
946 redo A;
947 } elsif ($self->{next_char} == 0x003E) { # >
948 !!!cp (31);
949 !!!parse-error (type => 'empty end tag',
950 line => $self->{line_prev}, ## "<" in "</>"
951 column => $self->{column_prev} - 1);
952 $self->{state} = DATA_STATE;
953 !!!next-input-character;
954 redo A;
955 } elsif ($self->{next_char} == -1) {
956 !!!cp (32);
957 !!!parse-error (type => 'bare etago');
958 $self->{state} = DATA_STATE;
959 # reconsume
960
961 !!!emit ({type => CHARACTER_TOKEN, data => '</',
962 line => $l, column => $c,
963 });
964
965 redo A;
966 } else {
967 !!!cp (33);
968 !!!parse-error (type => 'bogus end tag');
969 $self->{state} = BOGUS_COMMENT_STATE;
970 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
971 line => $self->{line_prev}, # "<" of "</"
972 column => $self->{column_prev} - 1,
973 };
974 ## $self->{next_char} is intentionally left as is
975 redo A;
976 }
977 } elsif ($self->{state} == TAG_NAME_STATE) {
978 if ($self->{next_char} == 0x0009 or # HT
979 $self->{next_char} == 0x000A or # LF
980 $self->{next_char} == 0x000B or # VT
981 $self->{next_char} == 0x000C or # FF
982 $self->{next_char} == 0x0020) { # SP
983 !!!cp (34);
984 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
985 !!!next-input-character;
986 redo A;
987 } elsif ($self->{next_char} == 0x003E) { # >
988 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
989 !!!cp (35);
990 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
991 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
992 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
993 #if ($self->{current_token}->{attributes}) {
994 # ## NOTE: This should never be reached.
995 # !!! cp (36);
996 # !!! parse-error (type => 'end tag attribute');
997 #} else {
998 !!!cp (37);
999 #}
1000 } else {
1001 die "$0: $self->{current_token}->{type}: Unknown token type";
1002 }
1003 $self->{state} = DATA_STATE;
1004 !!!next-input-character;
1005
1006 !!!emit ($self->{current_token}); # start tag or end tag
1007
1008 redo A;
1009 } elsif (0x0041 <= $self->{next_char} and
1010 $self->{next_char} <= 0x005A) { # A..Z
1011 !!!cp (38);
1012 $self->{current_token}->{tag_name} .= chr ($self->{next_char} + 0x0020);
1013 # start tag or end tag
1014 ## Stay in this state
1015 !!!next-input-character;
1016 redo A;
1017 } elsif ($self->{next_char} == -1) {
1018 !!!parse-error (type => 'unclosed tag');
1019 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1020 !!!cp (39);
1021 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1022 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1023 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1024 #if ($self->{current_token}->{attributes}) {
1025 # ## NOTE: This state should never be reached.
1026 # !!! cp (40);
1027 # !!! parse-error (type => 'end tag attribute');
1028 #} else {
1029 !!!cp (41);
1030 #}
1031 } else {
1032 die "$0: $self->{current_token}->{type}: Unknown token type";
1033 }
1034 $self->{state} = DATA_STATE;
1035 # reconsume
1036
1037 !!!emit ($self->{current_token}); # start tag or end tag
1038
1039 redo A;
1040 } elsif ($self->{next_char} == 0x002F) { # /
1041 !!!cp (42);
1042 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1043 !!!next-input-character;
1044 redo A;
1045 } else {
1046 !!!cp (44);
1047 $self->{current_token}->{tag_name} .= chr $self->{next_char};
1048 # start tag or end tag
1049 ## Stay in the state
1050 !!!next-input-character;
1051 redo A;
1052 }
1053 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1054 if ($self->{next_char} == 0x0009 or # HT
1055 $self->{next_char} == 0x000A or # LF
1056 $self->{next_char} == 0x000B or # VT
1057 $self->{next_char} == 0x000C or # FF
1058 $self->{next_char} == 0x0020) { # SP
1059 !!!cp (45);
1060 ## Stay in the state
1061 !!!next-input-character;
1062 redo A;
1063 } elsif ($self->{next_char} == 0x003E) { # >
1064 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1065 !!!cp (46);
1066 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1067 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1068 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1069 if ($self->{current_token}->{attributes}) {
1070 !!!cp (47);
1071 !!!parse-error (type => 'end tag attribute');
1072 } else {
1073 !!!cp (48);
1074 }
1075 } else {
1076 die "$0: $self->{current_token}->{type}: Unknown token type";
1077 }
1078 $self->{state} = DATA_STATE;
1079 !!!next-input-character;
1080
1081 !!!emit ($self->{current_token}); # start tag or end tag
1082
1083 redo A;
1084 } elsif (0x0041 <= $self->{next_char} and
1085 $self->{next_char} <= 0x005A) { # A..Z
1086 !!!cp (49);
1087 $self->{current_attribute}
1088 = {name => chr ($self->{next_char} + 0x0020),
1089 value => '',
1090 line => $self->{line}, column => $self->{column}};
1091 $self->{state} = ATTRIBUTE_NAME_STATE;
1092 !!!next-input-character;
1093 redo A;
1094 } elsif ($self->{next_char} == 0x002F) { # /
1095 !!!cp (50);
1096 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1097 !!!next-input-character;
1098 redo A;
1099 } elsif ($self->{next_char} == -1) {
1100 !!!parse-error (type => 'unclosed tag');
1101 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1102 !!!cp (52);
1103 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1104 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1105 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1106 if ($self->{current_token}->{attributes}) {
1107 !!!cp (53);
1108 !!!parse-error (type => 'end tag attribute');
1109 } else {
1110 !!!cp (54);
1111 }
1112 } else {
1113 die "$0: $self->{current_token}->{type}: Unknown token type";
1114 }
1115 $self->{state} = DATA_STATE;
1116 # reconsume
1117
1118 !!!emit ($self->{current_token}); # start tag or end tag
1119
1120 redo A;
1121 } else {
1122 if ({
1123 0x0022 => 1, # "
1124 0x0027 => 1, # '
1125 0x003D => 1, # =
1126 }->{$self->{next_char}}) {
1127 !!!cp (55);
1128 !!!parse-error (type => 'bad attribute name');
1129 } else {
1130 !!!cp (56);
1131 }
1132 $self->{current_attribute}
1133 = {name => chr ($self->{next_char}),
1134 value => '',
1135 line => $self->{line}, column => $self->{column}};
1136 $self->{state} = ATTRIBUTE_NAME_STATE;
1137 !!!next-input-character;
1138 redo A;
1139 }
1140 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1141 my $before_leave = sub {
1142 if (exists $self->{current_token}->{attributes} # start tag or end tag
1143 ->{$self->{current_attribute}->{name}}) { # MUST
1144 !!!cp (57);
1145 !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name}, line => $self->{current_attribute}->{line}, column => $self->{current_attribute}->{column});
1146 ## Discard $self->{current_attribute} # MUST
1147 } else {
1148 !!!cp (58);
1149 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
1150 = $self->{current_attribute};
1151 }
1152 }; # $before_leave
1153
1154 if ($self->{next_char} == 0x0009 or # HT
1155 $self->{next_char} == 0x000A or # LF
1156 $self->{next_char} == 0x000B or # VT
1157 $self->{next_char} == 0x000C or # FF
1158 $self->{next_char} == 0x0020) { # SP
1159 !!!cp (59);
1160 $before_leave->();
1161 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1162 !!!next-input-character;
1163 redo A;
1164 } elsif ($self->{next_char} == 0x003D) { # =
1165 !!!cp (60);
1166 $before_leave->();
1167 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1168 !!!next-input-character;
1169 redo A;
1170 } elsif ($self->{next_char} == 0x003E) { # >
1171 $before_leave->();
1172 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1173 !!!cp (61);
1174 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1175 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1176 !!!cp (62);
1177 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1178 if ($self->{current_token}->{attributes}) {
1179 !!!parse-error (type => 'end tag attribute');
1180 }
1181 } else {
1182 die "$0: $self->{current_token}->{type}: Unknown token type";
1183 }
1184 $self->{state} = DATA_STATE;
1185 !!!next-input-character;
1186
1187 !!!emit ($self->{current_token}); # start tag or end tag
1188
1189 redo A;
1190 } elsif (0x0041 <= $self->{next_char} and
1191 $self->{next_char} <= 0x005A) { # A..Z
1192 !!!cp (63);
1193 $self->{current_attribute}->{name} .= chr ($self->{next_char} + 0x0020);
1194 ## Stay in the state
1195 !!!next-input-character;
1196 redo A;
1197 } elsif ($self->{next_char} == 0x002F) { # /
1198 !!!cp (64);
1199 $before_leave->();
1200 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1201 !!!next-input-character;
1202 redo A;
1203 } elsif ($self->{next_char} == -1) {
1204 !!!parse-error (type => 'unclosed tag');
1205 $before_leave->();
1206 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1207 !!!cp (66);
1208 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1209 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1210 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1211 if ($self->{current_token}->{attributes}) {
1212 !!!cp (67);
1213 !!!parse-error (type => 'end tag attribute');
1214 } else {
1215 ## NOTE: This state should never be reached.
1216 !!!cp (68);
1217 }
1218 } else {
1219 die "$0: $self->{current_token}->{type}: Unknown token type";
1220 }
1221 $self->{state} = DATA_STATE;
1222 # reconsume
1223
1224 !!!emit ($self->{current_token}); # start tag or end tag
1225
1226 redo A;
1227 } else {
1228 if ($self->{next_char} == 0x0022 or # "
1229 $self->{next_char} == 0x0027) { # '
1230 !!!cp (69);
1231 !!!parse-error (type => 'bad attribute name');
1232 } else {
1233 !!!cp (70);
1234 }
1235 $self->{current_attribute}->{name} .= chr ($self->{next_char});
1236 ## Stay in the state
1237 !!!next-input-character;
1238 redo A;
1239 }
1240 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1241 if ($self->{next_char} == 0x0009 or # HT
1242 $self->{next_char} == 0x000A or # LF
1243 $self->{next_char} == 0x000B or # VT
1244 $self->{next_char} == 0x000C or # FF
1245 $self->{next_char} == 0x0020) { # SP
1246 !!!cp (71);
1247 ## Stay in the state
1248 !!!next-input-character;
1249 redo A;
1250 } elsif ($self->{next_char} == 0x003D) { # =
1251 !!!cp (72);
1252 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1253 !!!next-input-character;
1254 redo A;
1255 } elsif ($self->{next_char} == 0x003E) { # >
1256 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1257 !!!cp (73);
1258 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1259 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1260 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1261 if ($self->{current_token}->{attributes}) {
1262 !!!cp (74);
1263 !!!parse-error (type => 'end tag attribute');
1264 } else {
1265 ## NOTE: This state should never be reached.
1266 !!!cp (75);
1267 }
1268 } else {
1269 die "$0: $self->{current_token}->{type}: Unknown token type";
1270 }
1271 $self->{state} = DATA_STATE;
1272 !!!next-input-character;
1273
1274 !!!emit ($self->{current_token}); # start tag or end tag
1275
1276 redo A;
1277 } elsif (0x0041 <= $self->{next_char} and
1278 $self->{next_char} <= 0x005A) { # A..Z
1279 !!!cp (76);
1280 $self->{current_attribute}
1281 = {name => chr ($self->{next_char} + 0x0020),
1282 value => '',
1283 line => $self->{line}, column => $self->{column}};
1284 $self->{state} = ATTRIBUTE_NAME_STATE;
1285 !!!next-input-character;
1286 redo A;
1287 } elsif ($self->{next_char} == 0x002F) { # /
1288 !!!cp (77);
1289 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1290 !!!next-input-character;
1291 redo A;
1292 } elsif ($self->{next_char} == -1) {
1293 !!!parse-error (type => 'unclosed tag');
1294 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1295 !!!cp (79);
1296 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1297 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1298 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1299 if ($self->{current_token}->{attributes}) {
1300 !!!cp (80);
1301 !!!parse-error (type => 'end tag attribute');
1302 } else {
1303 ## NOTE: This state should never be reached.
1304 !!!cp (81);
1305 }
1306 } else {
1307 die "$0: $self->{current_token}->{type}: Unknown token type";
1308 }
1309 $self->{state} = DATA_STATE;
1310 # reconsume
1311
1312 !!!emit ($self->{current_token}); # start tag or end tag
1313
1314 redo A;
1315 } else {
1316 !!!cp (82);
1317 $self->{current_attribute}
1318 = {name => chr ($self->{next_char}),
1319 value => '',
1320 line => $self->{line}, column => $self->{column}};
1321 $self->{state} = ATTRIBUTE_NAME_STATE;
1322 !!!next-input-character;
1323 redo A;
1324 }
1325 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1326 if ($self->{next_char} == 0x0009 or # HT
1327 $self->{next_char} == 0x000A or # LF
1328 $self->{next_char} == 0x000B or # VT
1329 $self->{next_char} == 0x000C or # FF
1330 $self->{next_char} == 0x0020) { # SP
1331 !!!cp (83);
1332 ## Stay in the state
1333 !!!next-input-character;
1334 redo A;
1335 } elsif ($self->{next_char} == 0x0022) { # "
1336 !!!cp (84);
1337 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1338 !!!next-input-character;
1339 redo A;
1340 } elsif ($self->{next_char} == 0x0026) { # &
1341 !!!cp (85);
1342 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1343 ## reconsume
1344 redo A;
1345 } elsif ($self->{next_char} == 0x0027) { # '
1346 !!!cp (86);
1347 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1348 !!!next-input-character;
1349 redo A;
1350 } elsif ($self->{next_char} == 0x003E) { # >
1351 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1352 !!!cp (87);
1353 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1354 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1355 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1356 if ($self->{current_token}->{attributes}) {
1357 !!!cp (88);
1358 !!!parse-error (type => 'end tag attribute');
1359 } else {
1360 ## NOTE: This state should never be reached.
1361 !!!cp (89);
1362 }
1363 } else {
1364 die "$0: $self->{current_token}->{type}: Unknown token type";
1365 }
1366 $self->{state} = DATA_STATE;
1367 !!!next-input-character;
1368
1369 !!!emit ($self->{current_token}); # start tag or end tag
1370
1371 redo A;
1372 } elsif ($self->{next_char} == -1) {
1373 !!!parse-error (type => 'unclosed tag');
1374 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1375 !!!cp (90);
1376 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1377 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1378 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1379 if ($self->{current_token}->{attributes}) {
1380 !!!cp (91);
1381 !!!parse-error (type => 'end tag attribute');
1382 } else {
1383 ## NOTE: This state should never be reached.
1384 !!!cp (92);
1385 }
1386 } else {
1387 die "$0: $self->{current_token}->{type}: Unknown token type";
1388 }
1389 $self->{state} = DATA_STATE;
1390 ## reconsume
1391
1392 !!!emit ($self->{current_token}); # start tag or end tag
1393
1394 redo A;
1395 } else {
1396 if ($self->{next_char} == 0x003D) { # =
1397 !!!cp (93);
1398 !!!parse-error (type => 'bad attribute value');
1399 } else {
1400 !!!cp (94);
1401 }
1402 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1403 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1404 !!!next-input-character;
1405 redo A;
1406 }
1407 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1408 if ($self->{next_char} == 0x0022) { # "
1409 !!!cp (95);
1410 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1411 !!!next-input-character;
1412 redo A;
1413 } elsif ($self->{next_char} == 0x0026) { # &
1414 !!!cp (96);
1415 $self->{last_attribute_value_state} = $self->{state};
1416 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1417 !!!next-input-character;
1418 redo A;
1419 } elsif ($self->{next_char} == -1) {
1420 !!!parse-error (type => 'unclosed attribute value');
1421 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1422 !!!cp (97);
1423 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1424 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1425 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1426 if ($self->{current_token}->{attributes}) {
1427 !!!cp (98);
1428 !!!parse-error (type => 'end tag attribute');
1429 } else {
1430 ## NOTE: This state should never be reached.
1431 !!!cp (99);
1432 }
1433 } else {
1434 die "$0: $self->{current_token}->{type}: Unknown token type";
1435 }
1436 $self->{state} = DATA_STATE;
1437 ## reconsume
1438
1439 !!!emit ($self->{current_token}); # start tag or end tag
1440
1441 redo A;
1442 } else {
1443 !!!cp (100);
1444 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1445 ## Stay in the state
1446 !!!next-input-character;
1447 redo A;
1448 }
1449 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1450 if ($self->{next_char} == 0x0027) { # '
1451 !!!cp (101);
1452 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1453 !!!next-input-character;
1454 redo A;
1455 } elsif ($self->{next_char} == 0x0026) { # &
1456 !!!cp (102);
1457 $self->{last_attribute_value_state} = $self->{state};
1458 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1459 !!!next-input-character;
1460 redo A;
1461 } elsif ($self->{next_char} == -1) {
1462 !!!parse-error (type => 'unclosed attribute value');
1463 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1464 !!!cp (103);
1465 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1466 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1467 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1468 if ($self->{current_token}->{attributes}) {
1469 !!!cp (104);
1470 !!!parse-error (type => 'end tag attribute');
1471 } else {
1472 ## NOTE: This state should never be reached.
1473 !!!cp (105);
1474 }
1475 } else {
1476 die "$0: $self->{current_token}->{type}: Unknown token type";
1477 }
1478 $self->{state} = DATA_STATE;
1479 ## reconsume
1480
1481 !!!emit ($self->{current_token}); # start tag or end tag
1482
1483 redo A;
1484 } else {
1485 !!!cp (106);
1486 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1487 ## Stay in the state
1488 !!!next-input-character;
1489 redo A;
1490 }
1491 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1492 if ($self->{next_char} == 0x0009 or # HT
1493 $self->{next_char} == 0x000A or # LF
1494 $self->{next_char} == 0x000B or # VT
1495 $self->{next_char} == 0x000C or # FF
1496 $self->{next_char} == 0x0020) { # SP
1497 !!!cp (107);
1498 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1499 !!!next-input-character;
1500 redo A;
1501 } elsif ($self->{next_char} == 0x0026) { # &
1502 !!!cp (108);
1503 $self->{last_attribute_value_state} = $self->{state};
1504 $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1505 !!!next-input-character;
1506 redo A;
1507 } elsif ($self->{next_char} == 0x003E) { # >
1508 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1509 !!!cp (109);
1510 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1511 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1512 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1513 if ($self->{current_token}->{attributes}) {
1514 !!!cp (110);
1515 !!!parse-error (type => 'end tag attribute');
1516 } else {
1517 ## NOTE: This state should never be reached.
1518 !!!cp (111);
1519 }
1520 } else {
1521 die "$0: $self->{current_token}->{type}: Unknown token type";
1522 }
1523 $self->{state} = DATA_STATE;
1524 !!!next-input-character;
1525
1526 !!!emit ($self->{current_token}); # start tag or end tag
1527
1528 redo A;
1529 } elsif ($self->{next_char} == -1) {
1530 !!!parse-error (type => 'unclosed tag');
1531 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1532 !!!cp (112);
1533 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1534 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1535 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1536 if ($self->{current_token}->{attributes}) {
1537 !!!cp (113);
1538 !!!parse-error (type => 'end tag attribute');
1539 } else {
1540 ## NOTE: This state should never be reached.
1541 !!!cp (114);
1542 }
1543 } else {
1544 die "$0: $self->{current_token}->{type}: Unknown token type";
1545 }
1546 $self->{state} = DATA_STATE;
1547 ## reconsume
1548
1549 !!!emit ($self->{current_token}); # start tag or end tag
1550
1551 redo A;
1552 } else {
1553 if ({
1554 0x0022 => 1, # "
1555 0x0027 => 1, # '
1556 0x003D => 1, # =
1557 }->{$self->{next_char}}) {
1558 !!!cp (115);
1559 !!!parse-error (type => 'bad attribute value');
1560 } else {
1561 !!!cp (116);
1562 }
1563 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1564 ## Stay in the state
1565 !!!next-input-character;
1566 redo A;
1567 }
1568 } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1569 my $token = $self->_tokenize_attempt_to_consume_an_entity
1570 (1,
1571 $self->{last_attribute_value_state}
1572 == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
1573 $self->{last_attribute_value_state}
1574 == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
1575 -1);
1576
1577 unless (defined $token) {
1578 !!!cp (117);
1579 $self->{current_attribute}->{value} .= '&';
1580 } else {
1581 !!!cp (118);
1582 $self->{current_attribute}->{value} .= $token->{data};
1583 $self->{current_attribute}->{has_reference} = $token->{has_reference};
1584 ## ISSUE: spec says "append the returned character token to the current attribute's value"
1585 }
1586
1587 $self->{state} = $self->{last_attribute_value_state};
1588 # next-input-character is already done
1589 redo A;
1590 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1591 if ($self->{next_char} == 0x0009 or # HT
1592 $self->{next_char} == 0x000A or # LF
1593 $self->{next_char} == 0x000B or # VT
1594 $self->{next_char} == 0x000C or # FF
1595 $self->{next_char} == 0x0020) { # SP
1596 !!!cp (118);
1597 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1598 !!!next-input-character;
1599 redo A;
1600 } elsif ($self->{next_char} == 0x003E) { # >
1601 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1602 !!!cp (119);
1603 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1604 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1605 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1606 if ($self->{current_token}->{attributes}) {
1607 !!!cp (120);
1608 !!!parse-error (type => 'end tag attribute');
1609 } else {
1610 ## NOTE: This state should never be reached.
1611 !!!cp (121);
1612 }
1613 } else {
1614 die "$0: $self->{current_token}->{type}: Unknown token type";
1615 }
1616 $self->{state} = DATA_STATE;
1617 !!!next-input-character;
1618
1619 !!!emit ($self->{current_token}); # start tag or end tag
1620
1621 redo A;
1622 } elsif ($self->{next_char} == 0x002F) { # /
1623 !!!cp (122);
1624 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1625 !!!next-input-character;
1626 redo A;
1627 } else {
1628 !!!cp ('124.1');
1629 !!!parse-error (type => 'no space between attributes');
1630 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1631 ## reconsume
1632 redo A;
1633 }
1634 } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1635 if ($self->{next_char} == 0x003E) { # >
1636 if ($self->{current_token}->{type} == END_TAG_TOKEN) {
1637 !!!cp ('124.2');
1638 !!!parse-error (type => 'nestc', token => $self->{current_token});
1639 ## TODO: Different type than slash in start tag
1640 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1641 if ($self->{current_token}->{attributes}) {
1642 !!!cp ('124.4');
1643 !!!parse-error (type => 'end tag attribute');
1644 } else {
1645 !!!cp ('124.5');
1646 }
1647 ## TODO: Test |<title></title/>|
1648 } else {
1649 !!!cp ('124.3');
1650 $self->{self_closing} = 1;
1651 }
1652
1653 $self->{state} = DATA_STATE;
1654 !!!next-input-character;
1655
1656 !!!emit ($self->{current_token}); # start tag or end tag
1657
1658 redo A;
1659 } else {
1660 !!!cp ('124.4');
1661 !!!parse-error (type => 'nestc');
1662 ## TODO: This error type is wrong.
1663 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1664 ## Reconsume.
1665 redo A;
1666 }
1667 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1668 ## (only happens in the PCDATA state)
1669
1670 ## NOTE: Set by the previous state
1671 #my $token = {type => COMMENT_TOKEN, data => ''};
1672
1673 BC: {
1674 if ($self->{next_char} == 0x003E) { # >
1675 !!!cp (124);
1676 $self->{state} = DATA_STATE;
1677 !!!next-input-character;
1678
1679 !!!emit ($self->{current_token}); # comment
1680
1681 redo A;
1682 } elsif ($self->{next_char} == -1) {
1683 !!!cp (125);
1684 $self->{state} = DATA_STATE;
1685 ## reconsume
1686
1687 !!!emit ($self->{current_token}); # comment
1688
1689 redo A;
1690 } else {
1691 !!!cp (126);
1692 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
1693 !!!next-input-character;
1694 redo BC;
1695 }
1696 } # BC
1697
1698 die "$0: _get_next_token: unexpected case [BC]";
1699 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1700 ## (only happens in the PCDATA state)
1701
1702 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1);
1703
1704 my @next_char;
1705 push @next_char, $self->{next_char};
1706
1707 if ($self->{next_char} == 0x002D) { # -
1708 !!!next-input-character;
1709 push @next_char, $self->{next_char};
1710 if ($self->{next_char} == 0x002D) { # -
1711 !!!cp (127);
1712 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1713 line => $l, column => $c,
1714 };
1715 $self->{state} = COMMENT_START_STATE;
1716 !!!next-input-character;
1717 redo A;
1718 } else {
1719 !!!cp (128);
1720 }
1721 } elsif ($self->{next_char} == 0x0044 or # D
1722 $self->{next_char} == 0x0064) { # d
1723 !!!next-input-character;
1724 push @next_char, $self->{next_char};
1725 if ($self->{next_char} == 0x004F or # O
1726 $self->{next_char} == 0x006F) { # o
1727 !!!next-input-character;
1728 push @next_char, $self->{next_char};
1729 if ($self->{next_char} == 0x0043 or # C
1730 $self->{next_char} == 0x0063) { # c
1731 !!!next-input-character;
1732 push @next_char, $self->{next_char};
1733 if ($self->{next_char} == 0x0054 or # T
1734 $self->{next_char} == 0x0074) { # t
1735 !!!next-input-character;
1736 push @next_char, $self->{next_char};
1737 if ($self->{next_char} == 0x0059 or # Y
1738 $self->{next_char} == 0x0079) { # y
1739 !!!next-input-character;
1740 push @next_char, $self->{next_char};
1741 if ($self->{next_char} == 0x0050 or # P
1742 $self->{next_char} == 0x0070) { # p
1743 !!!next-input-character;
1744 push @next_char, $self->{next_char};
1745 if ($self->{next_char} == 0x0045 or # E
1746 $self->{next_char} == 0x0065) { # e
1747 !!!cp (129);
1748 ## TODO: What a stupid code this is!
1749 $self->{state} = DOCTYPE_STATE;
1750 $self->{current_token} = {type => DOCTYPE_TOKEN,
1751 quirks => 1,
1752 line => $l, column => $c,
1753 };
1754 !!!next-input-character;
1755 redo A;
1756 } else {
1757 !!!cp (130);
1758 }
1759 } else {
1760 !!!cp (131);
1761 }
1762 } else {
1763 !!!cp (132);
1764 }
1765 } else {
1766 !!!cp (133);
1767 }
1768 } else {
1769 !!!cp (134);
1770 }
1771 } else {
1772 !!!cp (135);
1773 }
1774 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1775 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
1776 $self->{next_char} == 0x005B) { # [
1777 !!!next-input-character;
1778 push @next_char, $self->{next_char};
1779 if ($self->{next_char} == 0x0043) { # C
1780 !!!next-input-character;
1781 push @next_char, $self->{next_char};
1782 if ($self->{next_char} == 0x0044) { # D
1783 !!!next-input-character;
1784 push @next_char, $self->{next_char};
1785 if ($self->{next_char} == 0x0041) { # A
1786 !!!next-input-character;
1787 push @next_char, $self->{next_char};
1788 if ($self->{next_char} == 0x0054) { # T
1789 !!!next-input-character;
1790 push @next_char, $self->{next_char};
1791 if ($self->{next_char} == 0x0041) { # A
1792 !!!next-input-character;
1793 push @next_char, $self->{next_char};
1794 if ($self->{next_char} == 0x005B) { # [
1795 !!!cp (135.1);
1796 $self->{state} = CDATA_BLOCK_STATE;
1797 !!!next-input-character;
1798 redo A;
1799 } else {
1800 !!!cp (135.2);
1801 }
1802 } else {
1803 !!!cp (135.3);
1804 }
1805 } else {
1806 !!!cp (135.4);
1807 }
1808 } else {
1809 !!!cp (135.5);
1810 }
1811 } else {
1812 !!!cp (135.6);
1813 }
1814 } else {
1815 !!!cp (135.7);
1816 }
1817 } else {
1818 !!!cp (136);
1819 }
1820
1821 !!!parse-error (type => 'bogus comment');
1822 $self->{next_char} = shift @next_char;
1823 !!!back-next-input-character (@next_char);
1824 $self->{state} = BOGUS_COMMENT_STATE;
1825 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1826 line => $l, column => $c,
1827 };
1828 redo A;
1829
1830 ## ISSUE: typos in spec: chacacters, is is a parse error
1831 ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1832 } elsif ($self->{state} == COMMENT_START_STATE) {
1833 if ($self->{next_char} == 0x002D) { # -
1834 !!!cp (137);
1835 $self->{state} = COMMENT_START_DASH_STATE;
1836 !!!next-input-character;
1837 redo A;
1838 } elsif ($self->{next_char} == 0x003E) { # >
1839 !!!cp (138);
1840 !!!parse-error (type => 'bogus comment');
1841 $self->{state} = DATA_STATE;
1842 !!!next-input-character;
1843
1844 !!!emit ($self->{current_token}); # comment
1845
1846 redo A;
1847 } elsif ($self->{next_char} == -1) {
1848 !!!cp (139);
1849 !!!parse-error (type => 'unclosed comment');
1850 $self->{state} = DATA_STATE;
1851 ## reconsume
1852
1853 !!!emit ($self->{current_token}); # comment
1854
1855 redo A;
1856 } else {
1857 !!!cp (140);
1858 $self->{current_token}->{data} # comment
1859 .= chr ($self->{next_char});
1860 $self->{state} = COMMENT_STATE;
1861 !!!next-input-character;
1862 redo A;
1863 }
1864 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1865 if ($self->{next_char} == 0x002D) { # -
1866 !!!cp (141);
1867 $self->{state} = COMMENT_END_STATE;
1868 !!!next-input-character;
1869 redo A;
1870 } elsif ($self->{next_char} == 0x003E) { # >
1871 !!!cp (142);
1872 !!!parse-error (type => 'bogus comment');
1873 $self->{state} = DATA_STATE;
1874 !!!next-input-character;
1875
1876 !!!emit ($self->{current_token}); # comment
1877
1878 redo A;
1879 } elsif ($self->{next_char} == -1) {
1880 !!!cp (143);
1881 !!!parse-error (type => 'unclosed comment');
1882 $self->{state} = DATA_STATE;
1883 ## reconsume
1884
1885 !!!emit ($self->{current_token}); # comment
1886
1887 redo A;
1888 } else {
1889 !!!cp (144);
1890 $self->{current_token}->{data} # comment
1891 .= '-' . chr ($self->{next_char});
1892 $self->{state} = COMMENT_STATE;
1893 !!!next-input-character;
1894 redo A;
1895 }
1896 } elsif ($self->{state} == COMMENT_STATE) {
1897 if ($self->{next_char} == 0x002D) { # -
1898 !!!cp (145);
1899 $self->{state} = COMMENT_END_DASH_STATE;
1900 !!!next-input-character;
1901 redo A;
1902 } elsif ($self->{next_char} == -1) {
1903 !!!cp (146);
1904 !!!parse-error (type => 'unclosed comment');
1905 $self->{state} = DATA_STATE;
1906 ## reconsume
1907
1908 !!!emit ($self->{current_token}); # comment
1909
1910 redo A;
1911 } else {
1912 !!!cp (147);
1913 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
1914 ## Stay in the state
1915 !!!next-input-character;
1916 redo A;
1917 }
1918 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1919 if ($self->{next_char} == 0x002D) { # -
1920 !!!cp (148);
1921 $self->{state} = COMMENT_END_STATE;
1922 !!!next-input-character;
1923 redo A;
1924 } elsif ($self->{next_char} == -1) {
1925 !!!cp (149);
1926 !!!parse-error (type => 'unclosed comment');
1927 $self->{state} = DATA_STATE;
1928 ## reconsume
1929
1930 !!!emit ($self->{current_token}); # comment
1931
1932 redo A;
1933 } else {
1934 !!!cp (150);
1935 $self->{current_token}->{data} .= '-' . chr ($self->{next_char}); # comment
1936 $self->{state} = COMMENT_STATE;
1937 !!!next-input-character;
1938 redo A;
1939 }
1940 } elsif ($self->{state} == COMMENT_END_STATE) {
1941 if ($self->{next_char} == 0x003E) { # >
1942 !!!cp (151);
1943 $self->{state} = DATA_STATE;
1944 !!!next-input-character;
1945
1946 !!!emit ($self->{current_token}); # comment
1947
1948 redo A;
1949 } elsif ($self->{next_char} == 0x002D) { # -
1950 !!!cp (152);
1951 !!!parse-error (type => 'dash in comment',
1952 line => $self->{line_prev},
1953 column => $self->{column_prev});
1954 $self->{current_token}->{data} .= '-'; # comment
1955 ## Stay in the state
1956 !!!next-input-character;
1957 redo A;
1958 } elsif ($self->{next_char} == -1) {
1959 !!!cp (153);
1960 !!!parse-error (type => 'unclosed comment');
1961 $self->{state} = DATA_STATE;
1962 ## reconsume
1963
1964 !!!emit ($self->{current_token}); # comment
1965
1966 redo A;
1967 } else {
1968 !!!cp (154);
1969 !!!parse-error (type => 'dash in comment',
1970 line => $self->{line_prev},
1971 column => $self->{column_prev});
1972 $self->{current_token}->{data} .= '--' . chr ($self->{next_char}); # comment
1973 $self->{state} = COMMENT_STATE;
1974 !!!next-input-character;
1975 redo A;
1976 }
1977 } elsif ($self->{state} == DOCTYPE_STATE) {
1978 if ($self->{next_char} == 0x0009 or # HT
1979 $self->{next_char} == 0x000A or # LF
1980 $self->{next_char} == 0x000B or # VT
1981 $self->{next_char} == 0x000C or # FF
1982 $self->{next_char} == 0x0020) { # SP
1983 !!!cp (155);
1984 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1985 !!!next-input-character;
1986 redo A;
1987 } else {
1988 !!!cp (156);
1989 !!!parse-error (type => 'no space before DOCTYPE name');
1990 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1991 ## reconsume
1992 redo A;
1993 }
1994 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1995 if ($self->{next_char} == 0x0009 or # HT
1996 $self->{next_char} == 0x000A or # LF
1997 $self->{next_char} == 0x000B or # VT
1998 $self->{next_char} == 0x000C or # FF
1999 $self->{next_char} == 0x0020) { # SP
2000 !!!cp (157);
2001 ## Stay in the state
2002 !!!next-input-character;
2003 redo A;
2004 } elsif ($self->{next_char} == 0x003E) { # >
2005 !!!cp (158);
2006 !!!parse-error (type => 'no DOCTYPE name');
2007 $self->{state} = DATA_STATE;
2008 !!!next-input-character;
2009
2010 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2011
2012 redo A;
2013 } elsif ($self->{next_char} == -1) {
2014 !!!cp (159);
2015 !!!parse-error (type => 'no DOCTYPE name');
2016 $self->{state} = DATA_STATE;
2017 ## reconsume
2018
2019 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2020
2021 redo A;
2022 } else {
2023 !!!cp (160);
2024 $self->{current_token}->{name} = chr $self->{next_char};
2025 delete $self->{current_token}->{quirks};
2026 ## ISSUE: "Set the token's name name to the" in the spec
2027 $self->{state} = DOCTYPE_NAME_STATE;
2028 !!!next-input-character;
2029 redo A;
2030 }
2031 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2032 ## ISSUE: Redundant "First," in the spec.
2033 if ($self->{next_char} == 0x0009 or # HT
2034 $self->{next_char} == 0x000A or # LF
2035 $self->{next_char} == 0x000B or # VT
2036 $self->{next_char} == 0x000C or # FF
2037 $self->{next_char} == 0x0020) { # SP
2038 !!!cp (161);
2039 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2040 !!!next-input-character;
2041 redo A;
2042 } elsif ($self->{next_char} == 0x003E) { # >
2043 !!!cp (162);
2044 $self->{state} = DATA_STATE;
2045 !!!next-input-character;
2046
2047 !!!emit ($self->{current_token}); # DOCTYPE
2048
2049 redo A;
2050 } elsif ($self->{next_char} == -1) {
2051 !!!cp (163);
2052 !!!parse-error (type => 'unclosed DOCTYPE');
2053 $self->{state} = DATA_STATE;
2054 ## reconsume
2055
2056 $self->{current_token}->{quirks} = 1;
2057 !!!emit ($self->{current_token}); # DOCTYPE
2058
2059 redo A;
2060 } else {
2061 !!!cp (164);
2062 $self->{current_token}->{name}
2063 .= chr ($self->{next_char}); # DOCTYPE
2064 ## Stay in the state
2065 !!!next-input-character;
2066 redo A;
2067 }
2068 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2069 if ($self->{next_char} == 0x0009 or # HT
2070 $self->{next_char} == 0x000A or # LF
2071 $self->{next_char} == 0x000B or # VT
2072 $self->{next_char} == 0x000C or # FF
2073 $self->{next_char} == 0x0020) { # SP
2074 !!!cp (165);
2075 ## Stay in the state
2076 !!!next-input-character;
2077 redo A;
2078 } elsif ($self->{next_char} == 0x003E) { # >
2079 !!!cp (166);
2080 $self->{state} = DATA_STATE;
2081 !!!next-input-character;
2082
2083 !!!emit ($self->{current_token}); # DOCTYPE
2084
2085 redo A;
2086 } elsif ($self->{next_char} == -1) {
2087 !!!cp (167);
2088 !!!parse-error (type => 'unclosed DOCTYPE');
2089 $self->{state} = DATA_STATE;
2090 ## reconsume
2091
2092 $self->{current_token}->{quirks} = 1;
2093 !!!emit ($self->{current_token}); # DOCTYPE
2094
2095 redo A;
2096 } elsif ($self->{next_char} == 0x0050 or # P
2097 $self->{next_char} == 0x0070) { # p
2098 !!!next-input-character;
2099 if ($self->{next_char} == 0x0055 or # U
2100 $self->{next_char} == 0x0075) { # u
2101 !!!next-input-character;
2102 if ($self->{next_char} == 0x0042 or # B
2103 $self->{next_char} == 0x0062) { # b
2104 !!!next-input-character;
2105 if ($self->{next_char} == 0x004C or # L
2106 $self->{next_char} == 0x006C) { # l
2107 !!!next-input-character;
2108 if ($self->{next_char} == 0x0049 or # I
2109 $self->{next_char} == 0x0069) { # i
2110 !!!next-input-character;
2111 if ($self->{next_char} == 0x0043 or # C
2112 $self->{next_char} == 0x0063) { # c
2113 !!!cp (168);
2114 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2115 !!!next-input-character;
2116 redo A;
2117 } else {
2118 !!!cp (169);
2119 }
2120 } else {
2121 !!!cp (170);
2122 }
2123 } else {
2124 !!!cp (171);
2125 }
2126 } else {
2127 !!!cp (172);
2128 }
2129 } else {
2130 !!!cp (173);
2131 }
2132
2133 #
2134 } elsif ($self->{next_char} == 0x0053 or # S
2135 $self->{next_char} == 0x0073) { # s
2136 !!!next-input-character;
2137 if ($self->{next_char} == 0x0059 or # Y
2138 $self->{next_char} == 0x0079) { # y
2139 !!!next-input-character;
2140 if ($self->{next_char} == 0x0053 or # S
2141 $self->{next_char} == 0x0073) { # s
2142 !!!next-input-character;
2143 if ($self->{next_char} == 0x0054 or # T
2144 $self->{next_char} == 0x0074) { # t
2145 !!!next-input-character;
2146 if ($self->{next_char} == 0x0045 or # E
2147 $self->{next_char} == 0x0065) { # e
2148 !!!next-input-character;
2149 if ($self->{next_char} == 0x004D or # M
2150 $self->{next_char} == 0x006D) { # m
2151 !!!cp (174);
2152 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2153 !!!next-input-character;
2154 redo A;
2155 } else {
2156 !!!cp (175);
2157 }
2158 } else {
2159 !!!cp (176);
2160 }
2161 } else {
2162 !!!cp (177);
2163 }
2164 } else {
2165 !!!cp (178);
2166 }
2167 } else {
2168 !!!cp (179);
2169 }
2170
2171 #
2172 } else {
2173 !!!cp (180);
2174 !!!next-input-character;
2175 #
2176 }
2177
2178 !!!parse-error (type => 'string after DOCTYPE name');
2179 $self->{current_token}->{quirks} = 1;
2180
2181 $self->{state} = BOGUS_DOCTYPE_STATE;
2182 # next-input-character is already done
2183 redo A;
2184 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2185 if ({
2186 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2187 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2188 }->{$self->{next_char}}) {
2189 !!!cp (181);
2190 ## Stay in the state
2191 !!!next-input-character;
2192 redo A;
2193 } elsif ($self->{next_char} eq 0x0022) { # "
2194 !!!cp (182);
2195 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2196 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2197 !!!next-input-character;
2198 redo A;
2199 } elsif ($self->{next_char} eq 0x0027) { # '
2200 !!!cp (183);
2201 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2202 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2203 !!!next-input-character;
2204 redo A;
2205 } elsif ($self->{next_char} eq 0x003E) { # >
2206 !!!cp (184);
2207 !!!parse-error (type => 'no PUBLIC literal');
2208
2209 $self->{state} = DATA_STATE;
2210 !!!next-input-character;
2211
2212 $self->{current_token}->{quirks} = 1;
2213 !!!emit ($self->{current_token}); # DOCTYPE
2214
2215 redo A;
2216 } elsif ($self->{next_char} == -1) {
2217 !!!cp (185);
2218 !!!parse-error (type => 'unclosed DOCTYPE');
2219
2220 $self->{state} = DATA_STATE;
2221 ## reconsume
2222
2223 $self->{current_token}->{quirks} = 1;
2224 !!!emit ($self->{current_token}); # DOCTYPE
2225
2226 redo A;
2227 } else {
2228 !!!cp (186);
2229 !!!parse-error (type => 'string after PUBLIC');
2230 $self->{current_token}->{quirks} = 1;
2231
2232 $self->{state} = BOGUS_DOCTYPE_STATE;
2233 !!!next-input-character;
2234 redo A;
2235 }
2236 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2237 if ($self->{next_char} == 0x0022) { # "
2238 !!!cp (187);
2239 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2240 !!!next-input-character;
2241 redo A;
2242 } elsif ($self->{next_char} == 0x003E) { # >
2243 !!!cp (188);
2244 !!!parse-error (type => 'unclosed PUBLIC literal');
2245
2246 $self->{state} = DATA_STATE;
2247 !!!next-input-character;
2248
2249 $self->{current_token}->{quirks} = 1;
2250 !!!emit ($self->{current_token}); # DOCTYPE
2251
2252 redo A;
2253 } elsif ($self->{next_char} == -1) {
2254 !!!cp (189);
2255 !!!parse-error (type => 'unclosed PUBLIC literal');
2256
2257 $self->{state} = DATA_STATE;
2258 ## reconsume
2259
2260 $self->{current_token}->{quirks} = 1;
2261 !!!emit ($self->{current_token}); # DOCTYPE
2262
2263 redo A;
2264 } else {
2265 !!!cp (190);
2266 $self->{current_token}->{public_identifier} # DOCTYPE
2267 .= chr $self->{next_char};
2268 ## Stay in the state
2269 !!!next-input-character;
2270 redo A;
2271 }
2272 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2273 if ($self->{next_char} == 0x0027) { # '
2274 !!!cp (191);
2275 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2276 !!!next-input-character;
2277 redo A;
2278 } elsif ($self->{next_char} == 0x003E) { # >
2279 !!!cp (192);
2280 !!!parse-error (type => 'unclosed PUBLIC literal');
2281
2282 $self->{state} = DATA_STATE;
2283 !!!next-input-character;
2284
2285 $self->{current_token}->{quirks} = 1;
2286 !!!emit ($self->{current_token}); # DOCTYPE
2287
2288 redo A;
2289 } elsif ($self->{next_char} == -1) {
2290 !!!cp (193);
2291 !!!parse-error (type => 'unclosed PUBLIC literal');
2292
2293 $self->{state} = DATA_STATE;
2294 ## reconsume
2295
2296 $self->{current_token}->{quirks} = 1;
2297 !!!emit ($self->{current_token}); # DOCTYPE
2298
2299 redo A;
2300 } else {
2301 !!!cp (194);
2302 $self->{current_token}->{public_identifier} # DOCTYPE
2303 .= chr $self->{next_char};
2304 ## Stay in the state
2305 !!!next-input-character;
2306 redo A;
2307 }
2308 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2309 if ({
2310 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2311 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2312 }->{$self->{next_char}}) {
2313 !!!cp (195);
2314 ## Stay in the state
2315 !!!next-input-character;
2316 redo A;
2317 } elsif ($self->{next_char} == 0x0022) { # "
2318 !!!cp (196);
2319 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2320 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2321 !!!next-input-character;
2322 redo A;
2323 } elsif ($self->{next_char} == 0x0027) { # '
2324 !!!cp (197);
2325 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2326 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2327 !!!next-input-character;
2328 redo A;
2329 } elsif ($self->{next_char} == 0x003E) { # >
2330 !!!cp (198);
2331 $self->{state} = DATA_STATE;
2332 !!!next-input-character;
2333
2334 !!!emit ($self->{current_token}); # DOCTYPE
2335
2336 redo A;
2337 } elsif ($self->{next_char} == -1) {
2338 !!!cp (199);
2339 !!!parse-error (type => 'unclosed DOCTYPE');
2340
2341 $self->{state} = DATA_STATE;
2342 ## reconsume
2343
2344 $self->{current_token}->{quirks} = 1;
2345 !!!emit ($self->{current_token}); # DOCTYPE
2346
2347 redo A;
2348 } else {
2349 !!!cp (200);
2350 !!!parse-error (type => 'string after PUBLIC literal');
2351 $self->{current_token}->{quirks} = 1;
2352
2353 $self->{state} = BOGUS_DOCTYPE_STATE;
2354 !!!next-input-character;
2355 redo A;
2356 }
2357 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2358 if ({
2359 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2360 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2361 }->{$self->{next_char}}) {
2362 !!!cp (201);
2363 ## Stay in the state
2364 !!!next-input-character;
2365 redo A;
2366 } elsif ($self->{next_char} == 0x0022) { # "
2367 !!!cp (202);
2368 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2369 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2370 !!!next-input-character;
2371 redo A;
2372 } elsif ($self->{next_char} == 0x0027) { # '
2373 !!!cp (203);
2374 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2375 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2376 !!!next-input-character;
2377 redo A;
2378 } elsif ($self->{next_char} == 0x003E) { # >
2379 !!!cp (204);
2380 !!!parse-error (type => 'no SYSTEM literal');
2381 $self->{state} = DATA_STATE;
2382 !!!next-input-character;
2383
2384 $self->{current_token}->{quirks} = 1;
2385 !!!emit ($self->{current_token}); # DOCTYPE
2386
2387 redo A;
2388 } elsif ($self->{next_char} == -1) {
2389 !!!cp (205);
2390 !!!parse-error (type => 'unclosed DOCTYPE');
2391
2392 $self->{state} = DATA_STATE;
2393 ## reconsume
2394
2395 $self->{current_token}->{quirks} = 1;
2396 !!!emit ($self->{current_token}); # DOCTYPE
2397
2398 redo A;
2399 } else {
2400 !!!cp (206);
2401 !!!parse-error (type => 'string after SYSTEM');
2402 $self->{current_token}->{quirks} = 1;
2403
2404 $self->{state} = BOGUS_DOCTYPE_STATE;
2405 !!!next-input-character;
2406 redo A;
2407 }
2408 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2409 if ($self->{next_char} == 0x0022) { # "
2410 !!!cp (207);
2411 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2412 !!!next-input-character;
2413 redo A;
2414 } elsif ($self->{next_char} == 0x003E) { # >
2415 !!!cp (208);
2416 !!!parse-error (type => 'unclosed PUBLIC literal');
2417
2418 $self->{state} = DATA_STATE;
2419 !!!next-input-character;
2420
2421 $self->{current_token}->{quirks} = 1;
2422 !!!emit ($self->{current_token}); # DOCTYPE
2423
2424 redo A;
2425 } elsif ($self->{next_char} == -1) {
2426 !!!cp (209);
2427 !!!parse-error (type => 'unclosed SYSTEM literal');
2428
2429 $self->{state} = DATA_STATE;
2430 ## reconsume
2431
2432 $self->{current_token}->{quirks} = 1;
2433 !!!emit ($self->{current_token}); # DOCTYPE
2434
2435 redo A;
2436 } else {
2437 !!!cp (210);
2438 $self->{current_token}->{system_identifier} # DOCTYPE
2439 .= chr $self->{next_char};
2440 ## Stay in the state
2441 !!!next-input-character;
2442 redo A;
2443 }
2444 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2445 if ($self->{next_char} == 0x0027) { # '
2446 !!!cp (211);
2447 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2448 !!!next-input-character;
2449 redo A;
2450 } elsif ($self->{next_char} == 0x003E) { # >
2451 !!!cp (212);
2452 !!!parse-error (type => 'unclosed PUBLIC literal');
2453
2454 $self->{state} = DATA_STATE;
2455 !!!next-input-character;
2456
2457 $self->{current_token}->{quirks} = 1;
2458 !!!emit ($self->{current_token}); # DOCTYPE
2459
2460 redo A;
2461 } elsif ($self->{next_char} == -1) {
2462 !!!cp (213);
2463 !!!parse-error (type => 'unclosed SYSTEM literal');
2464
2465 $self->{state} = DATA_STATE;
2466 ## reconsume
2467
2468 $self->{current_token}->{quirks} = 1;
2469 !!!emit ($self->{current_token}); # DOCTYPE
2470
2471 redo A;
2472 } else {
2473 !!!cp (214);
2474 $self->{current_token}->{system_identifier} # DOCTYPE
2475 .= chr $self->{next_char};
2476 ## Stay in the state
2477 !!!next-input-character;
2478 redo A;
2479 }
2480 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2481 if ({
2482 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2483 #0x000D => 1, # HT, LF, VT, FF, SP, CR
2484 }->{$self->{next_char}}) {
2485 !!!cp (215);
2486 ## Stay in the state
2487 !!!next-input-character;
2488 redo A;
2489 } elsif ($self->{next_char} == 0x003E) { # >
2490 !!!cp (216);
2491 $self->{state} = DATA_STATE;
2492 !!!next-input-character;
2493
2494 !!!emit ($self->{current_token}); # DOCTYPE
2495
2496 redo A;
2497 } elsif ($self->{next_char} == -1) {
2498 !!!cp (217);
2499 !!!parse-error (type => 'unclosed DOCTYPE');
2500
2501 $self->{state} = DATA_STATE;
2502 ## reconsume
2503
2504 $self->{current_token}->{quirks} = 1;
2505 !!!emit ($self->{current_token}); # DOCTYPE
2506
2507 redo A;
2508 } else {
2509 !!!cp (218);
2510 !!!parse-error (type => 'string after SYSTEM literal');
2511 #$self->{current_token}->{quirks} = 1;
2512
2513 $self->{state} = BOGUS_DOCTYPE_STATE;
2514 !!!next-input-character;
2515 redo A;
2516 }
2517 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2518 if ($self->{next_char} == 0x003E) { # >
2519 !!!cp (219);
2520 $self->{state} = DATA_STATE;
2521 !!!next-input-character;
2522
2523 !!!emit ($self->{current_token}); # DOCTYPE
2524
2525 redo A;
2526 } elsif ($self->{next_char} == -1) {
2527 !!!cp (220);
2528 !!!parse-error (type => 'unclosed DOCTYPE');
2529 $self->{state} = DATA_STATE;
2530 ## reconsume
2531
2532 !!!emit ($self->{current_token}); # DOCTYPE
2533
2534 redo A;
2535 } else {
2536 !!!cp (221);
2537 ## Stay in the state
2538 !!!next-input-character;
2539 redo A;
2540 }
2541 } elsif ($self->{state} == CDATA_BLOCK_STATE) {
2542 my $s = '';
2543
2544 my ($l, $c) = ($self->{line}, $self->{column});
2545
2546 CS: while ($self->{next_char} != -1) {
2547 if ($self->{next_char} == 0x005D) { # ]
2548 !!!next-input-character;
2549 if ($self->{next_char} == 0x005D) { # ]
2550 !!!next-input-character;
2551 MDC: {
2552 if ($self->{next_char} == 0x003E) { # >
2553 !!!cp (221.1);
2554 !!!next-input-character;
2555 last CS;
2556 } elsif ($self->{next_char} == 0x005D) { # ]
2557 !!!cp (221.2);
2558 $s .= ']';
2559 !!!next-input-character;
2560 redo MDC;
2561 } else {
2562 !!!cp (221.3);
2563 $s .= ']]';
2564 #
2565 }
2566 } # MDC
2567 } else {
2568 !!!cp (221.4);
2569 $s .= ']';
2570 #
2571 }
2572 } else {
2573 !!!cp (221.5);
2574 #
2575 }
2576 $s .= chr $self->{next_char};
2577 !!!next-input-character;
2578 } # CS
2579
2580 $self->{state} = DATA_STATE;
2581 ## next-input-character done or EOF, which is reconsumed.
2582
2583 if (length $s) {
2584 !!!cp (221.6);
2585 !!!emit ({type => CHARACTER_TOKEN, data => $s,
2586 line => $l, column => $c});
2587 } else {
2588 !!!cp (221.7);
2589 }
2590
2591 redo A;
2592
2593 ## ISSUE: "text tokens" in spec.
2594 ## TODO: Streaming support
2595 } else {
2596 die "$0: $self->{state}: Unknown state";
2597 }
2598 } # A
2599
2600 die "$0: _get_next_token: unexpected case";
2601 } # _get_next_token
2602
## Try to consume a character reference (numeric or named) starting at
## the current |$self->{next_char}|.  The caller has presumably already
## consumed the "&" itself -- TODO confirm at the call sites, which are
## outside this method.
##
## Arguments:
##   $self       - the parser object.
##   $in_attr    - consulted only for a named reference that lacks the
##                 trailing ";" and is followed by more name characters:
##                 if true, the text is returned literally, unexpanded.
##   $additional - one extra code point which, like HT, LF, VT, FF, SP,
##                 "<", "&" and EOF (-1), means "this is not a reference".
##
## Returns a hash reference describing a CHARACTER_TOKEN (|data|, |line|,
## |column|, plus |has_reference| when a reference was really expanded),
## or |undef| when no reference could be consumed.
2603 sub _tokenize_attempt_to_consume_an_entity ($$$) {
2604 my ($self, $in_attr, $additional) = @_;
2605
## Position remembered before any further input is read; it is attached
## to every parse error and to every token returned below.
2606 my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
2607
## The anonymous hash is used as a set of "stop" characters.
2608 if ({
2609 0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
2610 0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, &, EOF (-1) # 0x000D # CR
2611 $additional => 1,
2612 }->{$self->{next_char}}) {
2613 !!!cp (1001);
2614 ## Don't consume
2615 ## No error
2616 return undef;
2617 } elsif ($self->{next_char} == 0x0023) { # #
## Numeric character reference: "#" followed by decimal digits, or by
## "x"/"X" and hexadecimal digits.
2618 !!!next-input-character;
2619 if ($self->{next_char} == 0x0078 or # x
2620 $self->{next_char} == 0x0058) { # X
## |$code| stays undefined until the first hexadecimal digit is seen;
## that is how the "no digit at all" case is detected below.
2621 my $code;
2622 X: {
## On the first pass |$x_char| holds the "x"/"X" so that it can be
## pushed back if no digit follows.  On later passes (after |redo X|)
## it holds the digit just processed, but it is no longer used then
## because |$code| is already defined.
2623 my $x_char = $self->{next_char};
2624 !!!next-input-character;
2625 if (0x0030 <= $self->{next_char} and
2626 $self->{next_char} <= 0x0039) { # 0..9
2627 !!!cp (1002);
2628 $code ||= 0;
2629 $code *= 0x10;
2630 $code += $self->{next_char} - 0x0030;
2631 redo X;
2632 } elsif (0x0061 <= $self->{next_char} and
2633 $self->{next_char} <= 0x0066) { # a..f
2634 !!!cp (1003);
2635 $code ||= 0;
2636 $code *= 0x10;
## "a" (0x61) - 0x60 + 9 == 10, ..., "f" (0x66) - 0x60 + 9 == 15.
2637 $code += $self->{next_char} - 0x0060 + 9;
2638 redo X;
2639 } elsif (0x0041 <= $self->{next_char} and
2640 $self->{next_char} <= 0x0046) { # A..F
2641 !!!cp (1004);
2642 $code ||= 0;
2643 $code *= 0x10;
## "A" (0x41) - 0x40 + 9 == 10, ..., "F" (0x46) - 0x40 + 9 == 15.
2644 $code += $self->{next_char} - 0x0040 + 9;
2645 redo X;
2646 } elsif (not defined $code) { # no hexadecimal digit
2647 !!!cp (1005);
2648 !!!parse-error (type => 'bare hcro', line => $l, column => $c);
## Hand the "x" and the offending character to the push-back macro
## and make "#" the next character again.  (Presumably this restores
## the input as if only the "&" had been consumed -- the macro
## expansion is not visible here, TODO confirm.)
2649 !!!back-next-input-character ($x_char, $self->{next_char});
2650 $self->{next_char} = 0x0023; # #
2651 return undef;
2652 } elsif ($self->{next_char} == 0x003B) { # ;
2653 !!!cp (1006);
2654 !!!next-input-character;
2655 } else {
## Digits were seen but the terminating ";" is missing: report it and
## leave the current character unconsumed.
2656 !!!cp (1007);
2657 !!!parse-error (type => 'no refc', line => $l, column => $c);
2658 }
2659
## Map code points that are not emitted as-is: zero and surrogates
## become U+FFFD, values above U+10FFFF become U+FFFD, CR becomes LF,
## and 0x80..0x9F are looked up in |$c1_entity_char|.
## NOTE(review): |$c1_entity_char| is defined outside this method;
## presumably a Windows-1252 style remapping table -- confirm.
2660 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2661 !!!cp (1008);
2662 !!!parse-error (type => (sprintf 'invalid character reference:U+%04X', $code), line => $l, column => $c);
2663 $code = 0xFFFD;
2664 } elsif ($code > 0x10FFFF) {
2665 !!!cp (1009);
2666 !!!parse-error (type => (sprintf 'invalid character reference:U-%08X', $code), line => $l, column => $c);
2667 $code = 0xFFFD;
2668 } elsif ($code == 0x000D) {
2669 !!!cp (1010);
2670 !!!parse-error (type => 'CR character reference', line => $l, column => $c);
2671 $code = 0x000A;
2672 } elsif (0x80 <= $code and $code <= 0x9F) {
2673 !!!cp (1011);
2674 !!!parse-error (type => (sprintf 'C1 character reference:U+%04X', $code), line => $l, column => $c);
2675 $code = $c1_entity_char->{$code};
2676 }
2677
2678 return {type => CHARACTER_TOKEN, data => chr $code,
2679 has_reference => 1,
2680 line => $l, column => $c,
2681 };
2682 } # X
2683 } elsif (0x0030 <= $self->{next_char} and
2684 $self->{next_char} <= 0x0039) { # 0..9
## Decimal reference: the first digit is already known to be present,
## so |$code| is always defined in this branch.
2685 my $code = $self->{next_char} - 0x0030;
2686 !!!next-input-character;
2687
2688 while (0x0030 <= $self->{next_char} and
2689 $self->{next_char} <= 0x0039) { # 0..9
2690 !!!cp (1012);
2691 $code *= 10;
2692 $code += $self->{next_char} - 0x0030;
2693
2694 !!!next-input-character;
2695 }
2696
2697 if ($self->{next_char} == 0x003B) { # ;
2698 !!!cp (1013);
2699 !!!next-input-character;
2700 } else {
2701 !!!cp (1014);
2702 !!!parse-error (type => 'no refc', line => $l, column => $c);
2703 }
2704
## Same code point fix-ups as in the hexadecimal branch (with their
## own checkpoint numbers).
2705 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2706 !!!cp (1015);
2707 !!!parse-error (type => (sprintf 'invalid character reference:U+%04X', $code), line => $l, column => $c);
2708 $code = 0xFFFD;
2709 } elsif ($code > 0x10FFFF) {
2710 !!!cp (1016);
2711 !!!parse-error (type => (sprintf 'invalid character reference:U-%08X', $code), line => $l, column => $c);
2712 $code = 0xFFFD;
2713 } elsif ($code == 0x000D) {
2714 !!!cp (1017);
2715 !!!parse-error (type => 'CR character reference', line => $l, column => $c);
2716 $code = 0x000A;
2717 } elsif (0x80 <= $code and $code <= 0x9F) {
2718 !!!cp (1018);
2719 !!!parse-error (type => (sprintf 'C1 character reference:U+%04X', $code), line => $l, column => $c);
2720 $code = $c1_entity_char->{$code};
2721 }
2722
2723 return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1,
2724 line => $l, column => $c,
2725 };
2726 } else {
## "#" not followed by a digit or "x"/"X": push the character back and
## make "#" the next character again.
2727 !!!cp (1019);
2728 !!!parse-error (type => 'bare nero', line => $l, column => $c);
2729 !!!back-next-input-character ($self->{next_char});
2730 $self->{next_char} = 0x0023; # #
2731 return undef;
2732 }
2733 } elsif ((0x0041 <= $self->{next_char} and
2734 $self->{next_char} <= 0x005A) or
2735 (0x0061 <= $self->{next_char} and
2736 $self->{next_char} <= 0x007A)) {
## Named character reference: the name starts with an ASCII letter.
##
## |$entity_name| - every name character consumed so far.
## |$value|       - text to emit: the replacement of the last name that
##                  matched, followed by any characters consumed after
##                  that match (or simply the consumed text if nothing
##                  has matched yet).
## |$match|       - 0: no match yet; 1: matched including ";";
##                  -1: matched without ";"; below -1: matched without
##                  ";" and more characters were consumed afterwards
##                  (each such character doubles the value).
2737 my $entity_name = chr $self->{next_char};
2738 !!!next-input-character;
2739
2740 my $value = $entity_name;
2741 my $match = 0;
## The table of known names is loaded on first use; |$EntityChar| is a
## package variable which is looked up by entity name below.
2742 require Whatpm::_NamedEntityList;
2743 our $EntityChar;
2744
2745 while (length $entity_name < 30 and
2746 ## NOTE: Some number greater than the maximum length of entity name
2747 ((0x0041 <= $self->{next_char} and # A
2748 $self->{next_char} <= 0x005A) or # Z
2749 (0x0061 <= $self->{next_char} and # a
2750 $self->{next_char} <= 0x007A) or # z
2751 (0x0030 <= $self->{next_char} and # 0
2752 $self->{next_char} <= 0x0039) or # 9
2753 $self->{next_char} == 0x003B)) { # ;
2754 $entity_name .= chr $self->{next_char};
2755 if (defined $EntityChar->{$entity_name}) {
2756 if ($self->{next_char} == 0x003B) { # ;
## Properly terminated name: take its value and stop scanning.
2757 !!!cp (1020);
2758 $value = $EntityChar->{$entity_name};
2759 $match = 1;
2760 !!!next-input-character;
2761 last;
2762 } else {
## Known name without ";": remember it but keep scanning, since a
## longer name may still match.
2763 !!!cp (1021);
2764 $value = $EntityChar->{$entity_name};
2765 $match = -1;
2766 !!!next-input-character;
2767 }
2768 } else {
## Not (yet) a known name: keep the raw character and, if there was an
## earlier ";"-less match, record that extra characters followed it.
2769 !!!cp (1022);
2770 $value .= chr $self->{next_char};
2771 $match *= 2;
2772 !!!next-input-character;
2773 }
2774 }
2775
2776 if ($match > 0) {
2777 !!!cp (1023);
2778 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
2779 line => $l, column => $c,
2780 };
2781 } elsif ($match < 0) {
2782 !!!parse-error (type => 'no refc', line => $l, column => $c);
2783 if ($in_attr and $match < -1) {
## In an attribute value, a ";"-less match followed by further name
## characters is not expanded: the consumed text is returned as-is.
2784 !!!cp (1024);
2785 return {type => CHARACTER_TOKEN, data => '&'.$entity_name,
2786 line => $l, column => $c,
2787 };
2788 } else {
2789 !!!cp (1025);
2790 return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
2791 line => $l, column => $c,
2792 };
2793 }
2794 } else {
## No known name was found: the characters have nevertheless been
## consumed, so they are returned as literal text after the "&".
2795 !!!cp (1026);
2796 !!!parse-error (type => 'bare ero', line => $l, column => $c);
2797 ## NOTE: "No characters are consumed" in the spec.
2798 return {type => CHARACTER_TOKEN, data => '&'.$value,
2799 line => $l, column => $c,
2800 };
2801 }
2802 } else {
2803 !!!cp (1027);
2804 ## no characters are consumed
2805 !!!parse-error (type => 'bare ero', line => $l, column => $c);
2806 return undef;
2807 }
2808 } # _tokenize_attempt_to_consume_an_entity
2809
## Put the target document into the state expected while the tree is
## being constructed.
##
## NOTE: |$self->{document}| MUST be specified before this method is
## called.  The value of the final |manakai_is_html| call is returned.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  ## Switch the document's |strict_error_checking| flag off.
  $document->strict_error_checking (0);

  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on

  ## Mark the document as an HTML document. # MUST
  return $document->manakai_is_html (1);
} # _initialize_tree_constructor
2818
## Undo the document setting changed for tree construction: the
## document's |strict_error_checking| flag is switched back on.
## The value of that call is returned.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $document = $self->{document};

  ## TODO: Turn mutation events on

  return $document->strict_error_checking (1);
} # _terminate_tree_constructor
2824
2825 ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
2826
2827 { # tree construction stage
2828 my $token;
2829
## Entry point of the tree construction stage: reset the per-parse
## state, then run the insertion mode handlers in order.
sub _construct_tree ($) {
  my $self = shift;

  ## NOTE: When an interactive UA renders the |$self->{document}|
  ## available to the user, or when it begins accepting user input,
  ## is not defined.

  ## NOTE: Append a character: collect it and all subsequent consecutive
  ## characters and insert one Text node whose data is the concatenation
  ## of all those characters. # MUST

  ## Presumably loads the first token for the handlers below (the macro
  ## expansion is not visible in this method).
  !!!next-token;

  ## Reset the per-parse state: three slots become undefined and the
  ## stack of open elements becomes empty.
  @{$self}{qw(form_element head_element inner_html_node)} = ();
  $self->{open_elements} = [];

  ## NOTE: The "initial" insertion mode.
  $self->_tree_construction_initial; # MUST

  ## NOTE: The "before html" insertion mode.
  $self->_tree_construction_root_element;

  ## NOTE: The "before head" insertion mode and so on.
  $self->{insertion_mode} = BEFORE_HEAD_IM;
  return $self->_tree_construction_main;
} # _construct_tree
2858
2859 sub _tree_construction_initial ($) {
2860 my $self = shift;
2861
2862 ## NOTE: "initial" insertion mode
2863
2864 INITIAL: {
2865 if ($token->{type} == DOCTYPE_TOKEN) {
2866 ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
2867 ## error, switch to a conformance checking mode for another
2868 ## language.
2869 my $doctype_name = $token->{name};
2870 $doctype_name = '' unless defined $doctype_name;
2871 $doctype_name =~ tr/a-z/A-Z/;
2872 if (not defined $token->{name} or # <!DOCTYPE>
2873 defined $token->{public_identifier} or
2874 defined $token->{system_identifier}) {
2875 !!!cp ('t1');
2876 !!!parse-error (type => 'not HTML5', token => $token);
2877 } elsif ($doctype_name ne 'HTML') {
2878 !!!cp ('t2');
2879 ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
2880 !!!parse-error (type => 'not HTML5', token => $token);
2881 } else {
2882 !!!cp ('t3');
2883 }
2884
2885 my $doctype = $self->{document}->create_document_type_definition
2886 ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
2887 ## NOTE: Default value for both |public_id| and |system_id| attributes
2888 ## are empty strings, so that we don't set any value in missing cases.
2889 $doctype->public_id ($token->{public_identifier})
2890 if defined $token->{public_identifier};
2891 $doctype->system_id ($token->{system_identifier})
2892 if defined $token->{system_identifier};
2893 ## NOTE: Other DocumentType attributes are null or empty lists.
2894 ## ISSUE: internalSubset = null??
2895 $self->{document}->append_child ($doctype);
2896
2897 if ($token->{quirks} or $doctype_name ne 'HTML') {
2898 !!!cp ('t4');
2899 $self->{document}->manakai_compat_mode ('quirks');
2900 } elsif (defined $token->{public_identifier}) {
2901 my $pubid = $token->{public_identifier};
2902 $pubid =~ tr/a-z/A-z/;
2903 if ({
2904 "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
2905 "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
2906 "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
2907 "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
2908 "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
2909 "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
2910 "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
2911 "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
2912 "-//IETF//DTD HTML 2.0//EN" => 1,
2913 "-//IETF//DTD HTML 2.1E//EN" => 1,
2914 "-//IETF//DTD HTML 3.0//EN" => 1,
2915 "-//IETF//DTD HTML 3.0//EN//" => 1,
2916 "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
2917 "-//IETF//DTD HTML 3.2//EN" => 1,
2918 "-//IETF//DTD HTML 3//EN" => 1,
2919 "-//IETF//DTD HTML LEVEL 0//EN" => 1,
2920 "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
2921 "-//IETF//DTD HTML LEVEL 1//EN" => 1,
2922 "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
2923 "-//IETF//DTD HTML LEVEL 2//EN" => 1,
2924 "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
2925 "-//IETF//DTD HTML LEVEL 3//EN" => 1,
2926 "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
2927 "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
2928 "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
2929 "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
2930 "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
2931 "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
2932 "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
2933 "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
2934 "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
2935 "-//IETF//DTD HTML STRICT//EN" => 1,
2936 "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
2937 "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
2938 "-//IETF//DTD HTML//EN" => 1,
2939 "-//IETF//DTD HTML//EN//2.0" => 1,
2940 "-//IETF//DTD HTML//EN//3.0" => 1,
2941 "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
2942 "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
2943 "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
2944 "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
2945 "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
2946 "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
2947 "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
2948 "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
2949 "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
2950 "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
2951 "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
2952 "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//EN" => 1,
2953 "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//EN" => 1,
2954 "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//EN" => 1,
2955 "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
2956 "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
2957 "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
2958 "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
2959 "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
2960 "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
2961 "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
2962 "-//W3C//DTD HTML 3.2//EN" => 1,
2963 "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
2964 "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
2965 "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
2966 "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
2967 "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
2968 "-//W3C//DTD W3 HTML//EN" => 1,
2969 "-//W3O//DTD W3 HTML 3.0//EN" => 1,
2970 "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
2971 "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
2972 "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
2973 "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
2974 "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
2975 "HTML" => 1,
2976 }->{$pubid}) {
2977 !!!cp ('t5');
2978 $self->{document}->manakai_compat_mode ('quirks');
2979 } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
2980 $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
2981 if (defined $token->{system_identifier}) {
2982 !!!cp ('t6');
2983 $self->{document}->manakai_compat_mode ('quirks');
2984 } else {
2985 !!!cp ('t7');
2986 $self->{document}->manakai_compat_mode ('limited quirks');
2987 }
2988 } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
2989 $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
2990 !!!cp ('t8');
2991 $self->{document}->manakai_compat_mode ('limited quirks');
2992 } else {
2993 !!!cp ('t9');
2994 }
2995 } else {
2996 !!!cp ('t10');
2997 }
2998 if (defined $token->{system_identifier}) {
2999 my $sysid = $token->{system_identifier};
3000 $sysid =~ tr/A-Z/a-z/;
3001 if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
3002 ## TODO: Check the spec: PUBLIC "(limited quirks)" "(quirks)"
3003 $self->{document}->manakai_compat_mode ('quirks');
3004 !!!cp ('t11');
3005 } else {
3006 !!!cp ('t12');
3007 }
3008 } else {
3009 !!!cp ('t13');
3010 }
3011
3012 ## Go to the "before html" insertion mode.
3013 !!!next-token;
3014 return;
3015 } elsif ({
3016 START_TAG_TOKEN, 1,
3017 END_TAG_TOKEN, 1,
3018 END_OF_FILE_TOKEN, 1,
3019 }->{$token->{type}}) {
3020 !!!cp ('t14');
3021 !!!parse-error (type => 'no DOCTYPE', token => $token);
3022 $self->{document}->manakai_compat_mode ('quirks');
3023 ## Go to the "before html" insertion mode.
3024 ## reprocess
3025 !!!ack-later;
3026 return;
3027 } elsif ($token->{type} == CHARACTER_TOKEN) {
3028 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
3029 ## Ignore the token
3030
3031 unless (length $token->{data}) {
3032 !!!cp ('t15');
3033 ## Stay in the insertion mode.
3034 !!!next-token;
3035 redo INITIAL;
3036 } else {
3037 !!!cp ('t16');
3038 }
3039 } else {
3040 !!!cp ('t17');
3041 }
3042
3043 !!!parse-error (type => 'no DOCTYPE', token => $token);
3044 $self->{document}->manakai_compat_mode ('quirks');
3045 ## Go to the "before html" insertion mode.
3046 ## reprocess
3047 return;
3048 } elsif ($token->{type} == COMMENT_TOKEN) {
3049 !!!cp ('t18');
3050 my $comment = $self->{document}->create_comment ($token->{data});
3051 $self->{document}->append_child ($comment);
3052
3053 ## Stay in the insertion mode.
3054 !!!next-token;
3055 redo INITIAL;
3056 } else {
3057 die "$0: $token->{type}: Unknown token type";
3058 }
3059 } # INITIAL
3060
3061 die "$0: _tree_construction_initial: This should be never reached";
3062 } # _tree_construction_initial
3063
## Tree construction stage: the "before html" insertion mode.
##
## Processes tokens until the root |html| element has been appended to
## the document and pushed onto the stack of open elements, then returns
## so that processing continues in the "before head" insertion mode.
## The current token is read from |$token| (declared outside this sub)
## and is advanced by the |!!!next-token| macro.
##
## The |application_cache_selection| callback is invoked exactly once
## per call: with the value of the |manifest| attribute when the root
## element is created from an |html| start tag carrying that attribute,
## and with |undef| in every other case.
sub _tree_construction_root_element ($) {
  my $self = shift;

  ## NOTE: "before html" insertion mode.

  B: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      !!!cp ('t19');
      !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
      ## Ignore the token
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t20');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token.

        unless (length $token->{data}) {
          !!!cp ('t21');
          ## Stay in the insertion mode.
          !!!next-token;
          redo B;
        } else {
          !!!cp ('t22');
        }
      } else {
        !!!cp ('t23');
      }

      ## NOTE: The application cache selection callback is NOT invoked
      ## here.  The "anything else" code after this |if| chain invokes
      ## it once the implied root element has been created; invoking it
      ## here as well would run the callback twice for one document.

      #
    } elsif ($token->{type} == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html') {
        my $root_element;
        !!!create-element ($root_element, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}},
            [$root_element, $el_category->{html}];

        if ($token->{attributes}->{manifest}) {
          !!!cp ('t24');
          $self->{application_cache_selection}
              ->($token->{attributes}->{manifest}->{value});
          ## ISSUE: Spec is unclear on relative references.
          ## According to Hixie (#whatwg 2008-03-19), it should be
          ## resolved against the base URI of the document in HTML
          ## or xml:base of the element in XHTML.
        } else {
          !!!cp ('t25');
          $self->{application_cache_selection}->(undef);
        }

        !!!nack ('t25c');

        !!!next-token;
        return; ## Go to the "before head" insertion mode.
      } else {
        !!!cp ('t25.1');
        #
      }
    } elsif ({
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t26');
      #
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## "Anything else": create an implied |html| root element (with no
    ## attributes) and reprocess the current token in the next mode.
    my $root_element;
    !!!create-element ($root_element, $HTML_NS, 'html',, $token);
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, $el_category->{html}];

    $self->{application_cache_selection}->(undef);

    ## NOTE: Reprocess the token.
    !!!ack-later;
    return; ## Go to the "before head" insertion mode.

    ## ISSUE: There is an issue in the spec
  } # B

  die "$0: _tree_construction_root_element: This should never be reached";
} # _tree_construction_root_element
3158
## "Reset the insertion mode appropriately".
##
## Walks the stack of open elements from the current node towards the
## root and sets |$self->{insertion_mode}| according to the first node
## that determines a mode.  When the walk reaches the bottom of the
## stack and |$self->{inner_html_node}| is defined, that node is used in
## place of the root element unless it is a table cell.
##
## Returns nothing; the only effect is the update of
## |$self->{insertion_mode}|.
sub _reset_insertion_mode ($) {
  my $self = shift;

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      if (defined $self->{inner_html_node}) {
        if ($self->{inner_html_node}->[1] & TABLE_CELL_EL) {
          !!!cp ('t27');
          #
        } else {
          !!!cp ('t28');
          $node = $self->{inner_html_node};
        }
      }
    }

    ## Step 4..14
    my $new_mode;
    if ($node->[1] & FOREIGN_EL) {
      ## NOTE: Strictly speaking, the line below only applies to MathML
      ## and SVG elements.  Currently the HTML syntax supports only
      ## MathML and SVG elements as foreigners.
      $new_mode = $self->{insertion_mode} | IN_FOREIGN_CONTENT_IM;
      ## ISSUE: What is set as the secondary insertion mode?
    } else {
      $new_mode = {
        select => IN_SELECT_IM,
        ## NOTE: |option| and |optgroup| do not set
        ## insertion mode to "in select" by themselves.
        td => IN_CELL_IM,
        th => IN_CELL_IM,
        tr => IN_ROW_IM,
        tbody => IN_TABLE_BODY_IM,
        thead => IN_TABLE_BODY_IM,
        tfoot => IN_TABLE_BODY_IM,
        caption => IN_CAPTION_IM,
        colgroup => IN_COLUMN_GROUP_IM,
        table => IN_TABLE_IM,
        head => IN_BODY_IM, # not in head!
        body => IN_BODY_IM,
        frameset => IN_FRAMESET_IM,
      }->{$node->[0]->manakai_local_name};
    }
    ## NOTE: The assignment and the |return| are separate statements so
    ## that returning does not depend on the mode value being true.
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      return;
    }

    ## Step 15
    if ($node->[1] & HTML_EL) {
      unless (defined $self->{head_element}) {
        !!!cp ('t29');
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      } else {
        ## ISSUE: Can this state be reached?
        !!!cp ('t30');
        $self->{insertion_mode} = AFTER_HEAD_IM;
      }
      return;
    } else {
      !!!cp ('t31');
    }

    ## Step 16
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM;
      return;
    }

    ## Step 17
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 18
    redo S3;
  } # S3

  die "$0: _reset_insertion_mode: This line should never be reached";
} # _reset_insertion_mode
3241
3242 sub _tree_construction_main ($) {
3243 my $self = shift;
3244
3245 my $active_formatting_elements = [];
3246
my $reconstruct_active_formatting_elements = sub { # MUST
  ## "Reconstruct the active formatting elements".  The only argument
  ## is a code reference that is called with each newly cloned element
  ## node in order to insert it into the tree.
  my ($insert_node) = @_;

  ## Step 1
  return unless @$active_formatting_elements;

  ## Step 3 (performed first so that Step 2 can examine the entry)
  my $pos = -1;
  my $cur = $active_formatting_elements->[$pos];

  ## Step 2: nothing to do if the last entry is a marker or is still
  ## in the stack of open elements.
  return if $cur->[0] eq '#marker';
  for my $open (@{$self->{open_elements}}) {
    if ($cur->[0] eq $open->[0]) {
      !!!cp ('t32');
      return;
    }
  }

  REWIND: {
    ## Step 4: stop rewinding at the first entry of the list.
    last REWIND if $active_formatting_elements->[0]->[0] eq $cur->[0];

    ## Step 5
    $pos--;
    $cur = $active_formatting_elements->[$pos];

    ## Step 6
    unless ($cur->[0] eq '#marker') {
      my $still_open = 0;
      for my $open (@{$self->{open_elements}}) {
        if ($cur->[0] eq $open->[0]) {
          !!!cp ('t33');
          $still_open = 1;
          last;
        }
      }
      unless ($still_open) {
        ## NOTE: <!DOCTYPE HTML><p><b><i><u></p> <p>X
        !!!cp ('t35');
        redo REWIND;
      }
      !!!cp ('t34');
      #
    } else {
      !!!cp ('t33_1');
      #
    }

    ## Step 7
    $pos++;
    $cur = $active_formatting_elements->[$pos];
  } # REWIND

  RECREATE: {
    ## Step 8
    my $node_copy = $cur->[0]->clone_node (0);
    my $fresh = [$node_copy, $cur->[1]];

    ## Step 9
    $insert_node->($node_copy);
    push @{$self->{open_elements}}, $fresh;

    ## Step 10: the list entry becomes the entry just pushed on the stack.
    $active_formatting_elements->[$pos] = $fresh;

    ## Step 11: continue until the last entry of the list has been
    ## recreated.
    if ($node_copy eq $active_formatting_elements->[-1]->[0]) {
      !!!cp ('t37');
    } else {
      !!!cp ('t36');
      ## Step 7'
      $pos++;
      $cur = $active_formatting_elements->[$pos];

      redo RECREATE;
    }
  } # RECREATE
}; # $reconstruct_active_formatting_elements
3326
my $clear_up_to_marker = sub {
  ## Removes the last marker in the list of active formatting elements
  ## together with every entry after it.  The list is left untouched
  ## when it contains no marker.
  my $idx = $#$active_formatting_elements;
  while ($idx >= 0) {
    if ($active_formatting_elements->[$idx]->[0] eq '#marker') {
      !!!cp ('t38');
      splice @$active_formatting_elements, $idx;
      return;
    }
    $idx--;
  }

  !!!cp ('t39');
}; # $clear_up_to_marker
3338
3339 my $insert;
3340
my $parse_rcdata = sub ($) {
  ## Generic CDATA/RCDATA element parsing.  The argument is the content
  ## model flag (|CDATA_CONTENT_MODEL| or |RCDATA_CONTENT_MODEL|) to use
  ## while the content of the element for the current start tag token
  ## is being tokenized.
  my $content_model_flag = $_[0];

  ## Step 1
  my $start_tag_name = $token->{tag_name};
  my $el;
  !!!create-element ($el, $HTML_NS, $start_tag_name, $token->{attributes}, $token);

  ## Step 2
  $insert->($el);

  ## Step 3
  $self->{content_model} = $content_model_flag; # CDATA or RCDATA
  delete $self->{escape}; # MUST

  ## Step 4: gather the data of all following character tokens.
  my @chunks;
  !!!nack ('t40.1');
  !!!next-token;
  while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
    !!!cp ('t40');
    push @chunks, $token->{data};
    !!!next-token;
  }
  my $content = join '', @chunks;

  ## Step 5
  if (length $content) {
    !!!cp ('t41');
    $el->append_child ($self->{document}->create_text_node ($content));
  }

  ## Step 6
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## Step 7
  my $is_matching_end_tag = ($token->{type} == END_TAG_TOKEN and
                             $token->{tag_name} eq $start_tag_name);
  if ($is_matching_end_tag) {
    !!!cp ('t42');
    ## Ignore the token
  } elsif ($content_model_flag == CDATA_CONTENT_MODEL) {
    ## NOTE: An end-of-file token.
    !!!cp ('t43');
    !!!parse-error (type => 'in CDATA:#'.$token->{type}, token => $token);
  } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
    ## NOTE: An end-of-file token.
    !!!cp ('t44');
    !!!parse-error (type => 'in RCDATA:#'.$token->{type}, token => $token);
  } else {
    die "$0: $content_model_flag in parse_rcdata";
  }
  !!!next-token;
}; # $parse_rcdata
3395
my $script_start_tag = sub () {
  ## Handles a |script| start tag: creates the element from the current
  ## token, collects the following character tokens as its text in the
  ## CDATA content model, and inserts the element unless an
  ## |inner_html_node| is set.
  my $script_el;
  !!!create-element ($script_el, $HTML_NS, 'script', $token->{attributes}, $token);
  ## TODO: mark as "parser-inserted"

  $self->{content_model} = CDATA_CONTENT_MODEL;
  delete $self->{escape}; # MUST

  my @pieces;
  !!!nack ('t45.1');
  !!!next-token;
  while ($token->{type} == CHARACTER_TOKEN) {
    !!!cp ('t45');
    push @pieces, $token->{data};
    !!!next-token;
  } # stop if non-character token or tokenizer stops tokenising
  my $script_text = join '', @pieces;
  if (length $script_text) {
    !!!cp ('t46');
    $script_el->manakai_append_text ($script_text);
  }

  $self->{content_model} = PCDATA_CONTENT_MODEL;

  unless ($token->{type} == END_TAG_TOKEN and
          $token->{tag_name} eq 'script') {
    !!!cp ('t48');
    !!!parse-error (type => 'in CDATA:#'.$token->{type}, token => $token);
    ## ISSUE: And ignore?
    ## TODO: mark as "already executed"
  } else {
    !!!cp ('t47');
    ## Ignore the token
  }

  unless (defined $self->{inner_html_node}) {
    !!!cp ('t50');
    ## TODO: $old_insertion_point = current insertion point
    ## TODO: insertion point = just before the next input character

    $insert->($script_el);

    ## TODO: insertion point = $old_insertion_point (might be "undefined")

    ## TODO: if there is a script that will execute as soon as the parser resume, then...
  } else {
    !!!cp ('t49');
    ## TODO: mark as "already executed"
  }

  !!!next-token;
}; # $script_start_tag
3447
3448 ## NOTE: $open_tables->[-1]->[0] is the "current table" element node.
3449 ## NOTE: $open_tables->[-1]->[1] is the "tainted" flag.
3450 my $open_tables = [[$self->{open_elements}->[0]->[0]]];
3451
my $formatting_end_tag = sub {
  ## Handles an end tag of a formatting element by running the adoption
  ## agency algorithm (AAA).  The only argument is the end tag token.
  ## The algorithm is repeated (|redo FET|) until one of the early exits
  ## is taken; each exit advances to the next token before returning.
  my $end_tag_token = shift;
  my $tag_name = $end_tag_token->{tag_name};

  ## NOTE: The adoption agency algorithm (AAA).

  FET: {
    ## Step 1
    ## Find the last entry after the last marker whose local name is
    ## the tag name of the end tag.
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq '#marker') {
        !!!cp ('t52');
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0]->manakai_local_name
                   eq $tag_name) {
        !!!cp ('t51');
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!cp ('t53');
      !!!parse-error (type => 'unmatched end tag:'.$tag_name, token => $end_tag_token);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          !!!cp ('t54');
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!cp ('t55');
          !!!parse-error (type => 'unmatched end tag:'.$tag_name,
                          token => $end_tag_token);
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ($node->[1] & SCOPING_EL) {
        !!!cp ('t56');
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!cp ('t57');
      !!!parse-error (type => 'unmatched end tag:'.$tag_name,
                      token => $end_tag_token);
      ## Remove the formatting element itself from the list.  It is not
      ## necessarily the last entry, so |pop| would remove the wrong one.
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!cp ('t58');
      !!!parse-error (type => 'not closed',
                      value => $self->{open_elements}->[-1]->[0]
                          ->manakai_local_name,
                      token => $end_tag_token);
    }

    ## Step 2
    ## The loop runs from the current node upwards and keeps overwriting
    ## the candidate, so the block nearest to the formatting element wins.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not ($node->[1] & FORMATTING_EL) and
          #not $phrasing_category->{$node->[1]} and
          ($node->[1] & SPECIAL_EL or
           $node->[1] & SCOPING_EL)) { ## Scoping is redundant, maybe
        !!!cp ('t59');
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        !!!cp ('t60');
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      !!!cp ('t61');
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      !!!cp ('t62');
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## NOTE: The bookmark is kept as an index into the list of active
    ## formatting elements.  Nothing is inserted into or removed from
    ## that list before Step 12 (Step 7 only replaces entries in place),
    ## so the index stays valid.  Remembering the neighbouring element
    ## node instead would pick the last entry when the formatting
    ## element is the first one, and would lose the position when
    ## Step 7 replaces the remembered node by its clone.
    my $bookmark_i_in_active = $formatting_element_i_in_active;

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      ## A node that is not in the list of active formatting elements
      ## is dropped from the stack of open elements and skipped.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            !!!cp ('t63');
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      ## Move the bookmark to immediately after the node in the list.
      if ($last_node->[0] eq $furthest_block->[0]) {
        !!!cp ('t64');
        $bookmark_i_in_active = $node_i_in_active + 1;
      }

      ## Step 5
      if ($node->[0]->has_child_nodes ()) {
        !!!cp ('t65');
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    if ($common_ancestor_node->[1] & TABLE_ROWS_EL) {
      ## Foster parenting: insert before the nearest open |table| if it
      ## has an element parent, otherwise append to the element before
      ## that |table| in the stack of open elements.
      my $foster_parent_element;
      my $next_sibling;
      OE: for (reverse 0..$#{$self->{open_elements}}) {
        if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
          my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
          if (defined $parent and $parent->node_type == 1) {
            !!!cp ('t65.1');
            $foster_parent_element = $parent;
            $next_sibling = $self->{open_elements}->[$_]->[0];
          } else {
            !!!cp ('t65.2');
            $foster_parent_element
                = $self->{open_elements}->[$_ - 1]->[0];
          }
          last OE;
        }
      } # OE
      $foster_parent_element = $self->{open_elements}->[0]->[0]
          unless defined $foster_parent_element;
      $foster_parent_element->insert_before ($last_node->[0], $next_sibling);
      $open_tables->[-1]->[1] = 1; # tainted
    } else {
      !!!cp ('t65.3');
      $common_ancestor_node->[0]->append_child ($last_node->[0]);
    }

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## Remove the formatting element from the list and insert the clone
    ## at the bookmark.  A bookmark located after the removed entry
    ## shifts down by one because of the removal.
    !!!cp ('t66');
    splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
    $bookmark_i_in_active--
        if $bookmark_i_in_active > $formatting_element_i_in_active;
    !!!cp ('t67');
    splice @$active_formatting_elements, $bookmark_i_in_active, 0, $clone;

    ## Step 13
    ## The indices computed earlier may be stale because Step 7 can
    ## remove entries from the stack, so both nodes are searched again.
    my $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t68');
        splice @{$self->{open_elements}}, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        !!!cp ('t69');
        $i = $_;
      }
    } # OE
    ## Insert (length 0) immediately below the furthest block; a length
    ## of 1 would drop the entry that currently follows it in the stack.
    splice @{$self->{open_elements}}, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
3684
## Default insertion routine: appends the given node to the current
## node, i.e. the last entry of the stack of open elements.
$insert = my $insert_to_current = sub {
  my ($child) = @_;
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($child);
}; # $insert_to_current
3688
my $insert_to_foster = sub {
  ## Inserts the given node with foster parenting: when the current
  ## node is flagged |TABLE_ROWS_EL| the node is placed relative to the
  ## nearest open |table| and the current table is marked as tainted;
  ## otherwise the node is simply appended to the current node.
  my ($child) = @_;

  unless ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
    !!!cp ('t72');
    return $self->{open_elements}->[-1]->[0]->append_child ($child);
  }

  # MUST
  my $foster_parent_element;
  my $next_sibling;
  for (my $idx = $#{$self->{open_elements}}; $idx >= 0; $idx--) {
    my $entry = $self->{open_elements}->[$idx];
    next unless $entry->[1] & TABLE_EL;

    ## Nearest open |table| found: insert before it when its parent is
    ## an element, otherwise append to the stack entry preceding it.
    my $parent = $entry->[0]->parent_node;
    if (defined $parent and $parent->node_type == 1) {
      !!!cp ('t70');
      $foster_parent_element = $parent;
      $next_sibling = $entry->[0];
    } else {
      !!!cp ('t71');
      $foster_parent_element = $self->{open_elements}->[$idx - 1]->[0];
    }
    last;
  }

  ## No |table| in the stack: fall back to the first open element.
  unless (defined $foster_parent_element) {
    $foster_parent_element = $self->{open_elements}->[0]->[0];
  }
  $foster_parent_element->insert_before ($child, $next_sibling);
  $open_tables->[-1]->[1] = 1; # tainted
}; # $insert_to_foster
3720
3721 B: while (1) {
3722 if ($token->{type} == DOCTYPE_TOKEN) {
3723 !!!cp ('t73');
3724 !!!parse-error (type => 'DOCTYPE in the middle', token => $token);
3725 ## Ignore the token
3726 ## Stay in the phase
3727 !!!next-token;
3728 next B;
3729 } elsif ($token->{type} == START_TAG_TOKEN and
3730 $token->{tag_name} eq 'html') {
3731 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
3732 !!!cp ('t79');
3733 !!!parse-error (type => 'after html:html', token => $token);
3734 $self->{insertion_mode} = AFTER_BODY_IM;
3735 } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
3736 !!!cp ('t80');
3737 !!!parse-error (type => 'after html:html', token => $token);
3738 $self->{insertion_mode} = AFTER_FRAMESET_IM;
3739 } else {
3740 !!!cp ('t81');
3741 }
3742
3743 !!!cp ('t82');
3744 !!!parse-error (type => 'not first start tag', token => $token);
3745 my $top_el = $self->{open_elements}->[0]->[0];
3746 for my $attr_name (keys %{$token->{attributes}}) {
3747 unless ($top_el->has_attribute_ns (undef, $attr_name)) {
3748 !!!cp ('t84');
3749 $top_el->set_attribute_ns
3750 (undef, [undef, $attr_name],
3751 $token->{attributes}->{$attr_name}->{value});
3752 }
3753 }
3754 !!!nack ('t84.1');
3755 !!!next-token;
3756 next B;
3757 } elsif ($token->{type} == COMMENT_TOKEN) {
3758 my $comment = $self->{document}->create_comment ($token->{data});
3759 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
3760 !!!cp ('t85');
3761 $self->{document}->append_child ($comment);
3762 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
3763 !!!cp ('t86');
3764 $self->{open_elements}->[0]->[0]->append_child ($comment);
3765 } else {
3766 !!!cp ('t87');
3767 $self->{open_elements}->[-1]->[0]->append_child ($comment);
3768 }
3769 !!!next-token;
3770 next B;
3771 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
3772 if ($token->{type} == CHARACTER_TOKEN) {
3773 !!!cp ('t87.1');
3774 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
3775 !!!next-token;
3776 next B;
3777 } elsif ($token->{type} == START_TAG_TOKEN) {
3778 if ((not {mglyph => 1, malignmark => 1}->{$token->{tag_name}} and
3779 $self->{open_elements}->[-1]->[1] & FOREIGN_FLOW_CONTENT_EL) or
3780 not ($self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
3781 ($token->{tag_name} eq 'svg' and
3782 $self->{open_elements}->[-1]->[1] & MML_AXML_EL)) {
3783 ## NOTE: "using the rules for secondary insertion mode"then"continue"
3784 !!!cp ('t87.2');
3785 #
3786 } elsif ({
3787 b => 1, big => 1, blockquote => 1, body => 1, br => 1,
3788 center => 1, code => 1, dd => 1, div => 1, dl => 1, em => 1,
3789 embed => 1, font => 1, h1 => 1, h2 => 1, h3 => 1, ## No h4!
3790 h5 => 1, h6 => 1, head => 1, hr => 1, i => 1, img => 1,
3791 li => 1, menu => 1, meta => 1, nobr => 1, p => 1, pre => 1,
3792 ruby => 1, s => 1, small => 1, span => 1, strong => 1,
3793 sub => 1, sup => 1, table => 1, tt => 1, u => 1, ul => 1,
3794 var => 1,
3795 }->{$token->{tag_name}}) {
3796 !!!cp ('t87.2');
3797 !!!parse-error (type => 'not closed',
3798 value => $self->{open_elements}->[-1]->[0]
3799 ->manakai_local_name,
3800 token => $token);
3801
3802 pop @{$self->{open_elements}}
3803 while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
3804
3805 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
3806 ## Reprocess.
3807 next B;
3808 } else {
3809 my $nsuri = $self->{open_elements}->[-1]->[0]->namespace_uri;
3810 my $tag_name = $token->{tag_name};
3811 if ($nsuri eq $SVG_NS) {
3812 $tag_name = {
3813 altglyph => 'altGlyph',
3814 altglyphdef => 'altGlyphDef',
3815 altglyphitem => 'altGlyphItem',
3816 animatecolor => 'animateColor',
3817 animatemotion => 'animateMotion',
3818 animatetransform => 'animateTransform',
3819 clippath => 'clipPath',
3820 feblend => 'feBlend',
3821 fecolormatrix => 'feColorMatrix',
3822 fecomponenttransfer => 'feComponentTransfer',
3823 fecomposite => 'feComposite',
3824 feconvolvematrix => 'feConvolveMatrix',
3825 fediffuselighting => 'feDiffuseLighting',
3826 fedisplacementmap => 'feDisplacementMap',
3827 fedistantlight => 'feDistantLight',
3828 feflood => 'feFlood',
3829 fefunca => 'feFuncA',
3830 fefuncb => 'feFuncB',
3831 fefuncg => 'feFuncG',
3832 fefuncr => 'feFuncR',
3833 fegaussianblur => 'feGaussianBlur',
3834 feimage => 'feImage',
3835 femerge => 'feMerge',
3836 femergenode => 'feMergeNode',
3837 femorphology => 'feMorphology',
3838 feoffset => 'feOffset',
3839 fepointlight => 'fePointLight',
3840 fespecularlighting => 'feSpecularLighting',
3841 fespotlight => 'feSpotLight',
3842 fetile => 'feTile',
3843 feturbulence => 'feTurbulence',
3844 foreignobject => 'foreignObject',
3845 glyphref => 'glyphRef',
3846 lineargradient => 'linearGradient',
3847 radialgradient => 'radialGradient',
3848 #solidcolor => 'solidColor', ## NOTE: Commented in spec (SVG1.2)
3849 textpath => 'textPath',
3850 }->{$tag_name} || $tag_name;
3851 }
3852
3853 ## "adjust SVG attributes" (SVG only) - done in insert-element-f
3854
3855 ## "adjust foreign attributes" - done in insert-element-f
3856
3857 !!!insert-element-f ($nsuri, $tag_name, $token->{attributes}, $token);
3858
3859 if ($self->{self_closing}) {
3860 pop @{$self->{open_elements}};
3861 !!!ack ('t87.3');
3862 } else {
3863 !!!cp ('t87.4');
3864 }
3865
3866 !!!next-token;
3867 next B;
3868 }
3869 } elsif ($token->{type} == END_TAG_TOKEN) {
3870 ## NOTE: "using the rules for secondary insertion mode" then "continue"
3871 !!!cp ('t87.5');
3872 #
3873 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
3874 ## NOTE: "using the rules for secondary insertion mode" then "continue"
3875 !!!cp ('t87.6');
3876 #
3877 ## TODO: ...
3878 } else {
3879 die "$0: $token->{type}: Unknown token type";
3880 }
3881 }
3882
3883 if ($self->{insertion_mode} & HEAD_IMS) {
3884 if ($token->{type} == CHARACTER_TOKEN) {
3885 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3886 unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3887 !!!cp ('t88.2');
3888 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3889 } else {
3890 !!!cp ('t88.1');
3891 ## Ignore the token.
3892 !!!next-token;
3893 next B;
3894 }
3895 unless (length $token->{data}) {
3896 !!!cp ('t88');
3897 !!!next-token;
3898 next B;
3899 }
3900 }
3901
3902 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3903 !!!cp ('t89');
3904 ## As if <head>
3905 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
3906 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3907 push @{$self->{open_elements}},
3908 [$self->{head_element}, $el_category->{head}];
3909
3910 ## Reprocess in the "in head" insertion mode...
3911 pop @{$self->{open_elements}};
3912
3913 ## Reprocess in the "after head" insertion mode...
3914 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3915 !!!cp ('t90');
3916 ## As if </noscript>
3917 pop @{$self->{open_elements}};
3918 !!!parse-error (type => 'in noscript:#character', token => $token);
3919
3920 ## Reprocess in the "in head" insertion mode...
3921 ## As if </head>
3922 pop @{$self->{open_elements}};
3923
3924 ## Reprocess in the "after head" insertion mode...
3925 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
3926 !!!cp ('t91');
3927 pop @{$self->{open_elements}};
3928
3929 ## Reprocess in the "after head" insertion mode...
3930 } else {
3931 !!!cp ('t92');
3932 }
3933
3934 ## "after head" insertion mode
3935 ## As if <body>
3936 !!!insert-element ('body',, $token);
3937 $self->{insertion_mode} = IN_BODY_IM;
3938 ## reprocess
3939 next B;
3940 } elsif ($token->{type} == START_TAG_TOKEN) {
3941 if ($token->{tag_name} eq 'head') {
3942 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3943 !!!cp ('t93');
3944 !!!create-element ($self->{head_element}, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
3945 $self->{open_elements}->[-1]->[0]->append_child
3946 ($self->{head_element});
3947 push @{$self->{open_elements}},
3948 [$self->{head_element}, $el_category->{head}];
3949 $self->{insertion_mode} = IN_HEAD_IM;
3950 !!!nack ('t93.1');
3951 !!!next-token;
3952 next B;
3953 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
3954 !!!cp ('t94');
3955 #
3956 } else {
3957 !!!cp ('t95');
3958 !!!parse-error (type => 'in head:head', token => $token); # or in head noscript
3959 ## Ignore the token
3960 !!!nack ('t95.1');
3961 !!!next-token;
3962 next B;
3963 }
3964 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
3965 !!!cp ('t96');
3966 ## As if <head>
3967 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
3968 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
3969 push @{$self->{open_elements}},
3970 [$self->{head_element}, $el_category->{head}];
3971
3972 $self->{insertion_mode} = IN_HEAD_IM;
3973 ## Reprocess in the "in head" insertion mode...
3974 } else {
3975 !!!cp ('t97');
3976 }
3977
3978 if ($token->{tag_name} eq 'base') {
3979 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
3980 !!!cp ('t98');
3981 ## As if </noscript>
3982 pop @{$self->{open_elements}};
3983 !!!parse-error (type => 'in noscript:base', token => $token);
3984
3985 $self->{insertion_mode} = IN_HEAD_IM;
3986 ## Reprocess in the "in head" insertion mode...
3987 } else {
3988 !!!cp ('t99');
3989 }
3990
3991 ## NOTE: There is a "as if in head" code clone.
3992 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
3993 !!!cp ('t100');
3994 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
3995 push @{$self->{open_elements}},
3996 [$self->{head_element}, $el_category->{head}];
3997 } else {
3998 !!!cp ('t101');
3999 }
4000 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4001 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4002 pop @{$self->{open_elements}} # <head>
4003 if $self->{insertion_mode} == AFTER_HEAD_IM;
4004 !!!nack ('t101.1');
4005 !!!next-token;
4006 next B;
4007 } elsif ($token->{tag_name} eq 'link') {
4008 ## NOTE: There is a "as if in head" code clone.
4009 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4010 !!!cp ('t102');
4011 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4012 push @{$self->{open_elements}},
4013 [$self->{head_element}, $el_category->{head}];
4014 } else {
4015 !!!cp ('t103');
4016 }
4017 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4018 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4019 pop @{$self->{open_elements}} # <head>
4020 if $self->{insertion_mode} == AFTER_HEAD_IM;
4021 !!!ack ('t103.1');
4022 !!!next-token;
4023 next B;
4024 } elsif ($token->{tag_name} eq 'meta') {
4025 ## NOTE: There is a "as if in head" code clone.
4026 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4027 !!!cp ('t104');
4028 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4029 push @{$self->{open_elements}},
4030 [$self->{head_element}, $el_category->{head}];
4031 } else {
4032 !!!cp ('t105');
4033 }
4034 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4035 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4036
4037 unless ($self->{confident}) {
4038 if ($token->{attributes}->{charset}) { ## TODO: And if supported
4039 !!!cp ('t106');
4040 $self->{change_encoding}
4041 ->($self, $token->{attributes}->{charset}->{value},
4042 $token);
4043
4044 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4045 ->set_user_data (manakai_has_reference =>
4046 $token->{attributes}->{charset}
4047 ->{has_reference});
4048 } elsif ($token->{attributes}->{content}) {
4049 ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
4050 if ($token->{attributes}->{content}->{value}
4051 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4052 [\x09-\x0D\x20]*=
4053 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4054 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
4055 !!!cp ('t107');
4056 $self->{change_encoding}
4057 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
4058 $token);
4059 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4060 ->set_user_data (manakai_has_reference =>
4061 $token->{attributes}->{content}
4062 ->{has_reference});
4063 } else {
4064 !!!cp ('t108');
4065 }
4066 }
4067 } else {
4068 if ($token->{attributes}->{charset}) {
4069 !!!cp ('t109');
4070 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4071 ->set_user_data (manakai_has_reference =>
4072 $token->{attributes}->{charset}
4073 ->{has_reference});
4074 }
4075 if ($token->{attributes}->{content}) {
4076 !!!cp ('t110');
4077 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4078 ->set_user_data (manakai_has_reference =>
4079 $token->{attributes}->{content}
4080 ->{has_reference});
4081 }
4082 }
4083
4084 pop @{$self->{open_elements}} # <head>
4085 if $self->{insertion_mode} == AFTER_HEAD_IM;
4086 !!!ack ('t110.1');
4087 !!!next-token;
4088 next B;
4089 } elsif ($token->{tag_name} eq 'title') {
4090 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4091 !!!cp ('t111');
4092 ## As if </noscript>
4093 pop @{$self->{open_elements}};
4094 !!!parse-error (type => 'in noscript:title', token => $token);
4095
4096 $self->{insertion_mode} = IN_HEAD_IM;
4097 ## Reprocess in the "in head" insertion mode...
4098 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4099 !!!cp ('t112');
4100 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4101 push @{$self->{open_elements}},
4102 [$self->{head_element}, $el_category->{head}];
4103 } else {
4104 !!!cp ('t113');
4105 }
4106
4107 ## NOTE: There is a "as if in head" code clone.
4108 my $parent = defined $self->{head_element} ? $self->{head_element}
4109 : $self->{open_elements}->[-1]->[0];
4110 $parse_rcdata->(RCDATA_CONTENT_MODEL);
4111 pop @{$self->{open_elements}} # <head>
4112 if $self->{insertion_mode} == AFTER_HEAD_IM;
4113 next B;
4114 } elsif ($token->{tag_name} eq 'style') {
4115 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
4116 ## insertion mode IN_HEAD_IM)
4117 ## NOTE: There is a "as if in head" code clone.
4118 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4119 !!!cp ('t114');
4120 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4121 push @{$self->{open_elements}},
4122 [$self->{head_element}, $el_category->{head}];
4123 } else {
4124 !!!cp ('t115');
4125 }
4126 $parse_rcdata->(CDATA_CONTENT_MODEL);
4127 pop @{$self->{open_elements}} # <head>
4128 if $self->{insertion_mode} == AFTER_HEAD_IM;
4129 next B;
4130 } elsif ($token->{tag_name} eq 'noscript') {
4131 if ($self->{insertion_mode} == IN_HEAD_IM) {
4132 !!!cp ('t116');
4133 ## NOTE: and scripting is disabled
4134 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4135 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
4136 !!!nack ('t116.1');
4137 !!!next-token;
4138 next B;
4139 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4140 !!!cp ('t117');
4141 !!!parse-error (type => 'in noscript:noscript', token => $token);
4142 ## Ignore the token
4143 !!!nack ('t117.1');
4144 !!!next-token;
4145 next B;
4146 } else {
4147 !!!cp ('t118');
4148 #
4149 }
4150 } elsif ($token->{tag_name} eq 'script') {
4151 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4152 !!!cp ('t119');
4153 ## As if </noscript>
4154 pop @{$self->{open_elements}};
4155 !!!parse-error (type => 'in noscript:script', token => $token);
4156
4157 $self->{insertion_mode} = IN_HEAD_IM;
4158 ## Reprocess in the "in head" insertion mode...
4159 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4160 !!!cp ('t120');
4161 !!!parse-error (type => 'after head:'.$token->{tag_name}, token => $token);
4162 push @{$self->{open_elements}},
4163 [$self->{head_element}, $el_category->{head}];
4164 } else {
4165 !!!cp ('t121');
4166 }
4167
4168 ## NOTE: There is a "as if in head" code clone.
4169 $script_start_tag->();
4170 pop @{$self->{open_elements}} # <head>
4171 if $self->{insertion_mode} == AFTER_HEAD_IM;
4172 next B;
4173 } elsif ($token->{tag_name} eq 'body' or
4174 $token->{tag_name} eq 'frameset') {
4175 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4176 !!!cp ('t122');
4177 ## As if </noscript>
4178 pop @{$self->{open_elements}};
4179 !!!parse-error (type => 'in noscript:'.$token->{tag_name}, token => $token);
4180
4181 ## Reprocess in the "in head" insertion mode...
4182 ## As if </head>
4183 pop @{$self->{open_elements}};
4184
4185 ## Reprocess in the "after head" insertion mode...
4186 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4187 !!!cp ('t124');
4188 pop @{$self->{open_elements}};
4189
4190 ## Reprocess in the "after head" insertion mode...
4191 } else {
4192 !!!cp ('t125');
4193 }
4194
4195 ## "after head" insertion mode
4196 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4197 if ($token->{tag_name} eq 'body') {
4198 !!!cp ('t126');
4199 $self->{insertion_mode} = IN_BODY_IM;
4200 } elsif ($token->{tag_name} eq 'frameset') {
4201 !!!cp ('t127');
4202 $self->{insertion_mode} = IN_FRAMESET_IM;
4203 } else {
4204 die "$0: tag name: $self->{tag_name}";
4205 }
4206 !!!nack ('t127.1');
4207 !!!next-token;
4208 next B;
4209 } else {
4210 !!!cp ('t128');
4211 #
4212 }
4213
4214 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4215 !!!cp ('t129');
4216 ## As if </noscript>
4217 pop @{$self->{open_elements}};
4218 !!!parse-error (type => 'in noscript:/'.$token->{tag_name}, token => $token);
4219
4220 ## Reprocess in the "in head" insertion mode...
4221 ## As if </head>
4222 pop @{$self->{open_elements}};
4223
4224 ## Reprocess in the "after head" insertion mode...
4225 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4226 !!!cp ('t130');
4227 ## As if </head>
4228 pop @{$self->{open_elements}};
4229
4230 ## Reprocess in the "after head" insertion mode...
4231 } else {
4232 !!!cp ('t131');
4233 }
4234
4235 ## "after head" insertion mode
4236 ## As if <body>
4237 !!!insert-element ('body',, $token);
4238 $self->{insertion_mode} = IN_BODY_IM;
4239 ## reprocess
4240 !!!ack-later;
4241 next B;
4242 } elsif ($token->{type} == END_TAG_TOKEN) {
4243 if ($token->{tag_name} eq 'head') {
4244 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4245 !!!cp ('t132');
4246 ## As if <head>
4247 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4248 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4249 push @{$self->{open_elements}},
4250 [$self->{head_element}, $el_category->{head}];
4251
4252 ## Reprocess in the "in head" insertion mode...
4253 pop @{$self->{open_elements}};
4254 $self->{insertion_mode} = AFTER_HEAD_IM;
4255 !!!next-token;
4256 next B;
4257 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4258 !!!cp ('t133');
4259 ## As if </noscript>
4260 pop @{$self->{open_elements}};
4261 !!!parse-error (type => 'in noscript:/head', token => $token);
4262
4263 ## Reprocess in the "in head" insertion mode...
4264 pop @{$self->{open_elements}};
4265 $self->{insertion_mode} = AFTER_HEAD_IM;
4266 !!!next-token;
4267 next B;
4268 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4269 !!!cp ('t134');
4270 pop @{$self->{open_elements}};
4271 $self->{insertion_mode} = AFTER_HEAD_IM;
4272 !!!next-token;
4273 next B;
4274 } else {
4275 !!!cp ('t135');
4276 #
4277 }
4278 } elsif ($token->{tag_name} eq 'noscript') {
4279 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4280 !!!cp ('t136');
4281 pop @{$self->{open_elements}};
4282 $self->{insertion_mode} = IN_HEAD_IM;
4283 !!!next-token;
4284 next B;
4285 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4286 !!!cp ('t137');
4287 !!!parse-error (type => 'unmatched end tag:noscript', token => $token);
4288 ## Ignore the token ## ISSUE: An issue in the spec.
4289 !!!next-token;
4290 next B;
4291 } else {
4292 !!!cp ('t138');
4293 #
4294 }
4295 } elsif ({
4296 body => 1, html => 1,
4297 }->{$token->{tag_name}}) {
4298 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4299 !!!cp ('t139');
4300 ## As if <head>
4301 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4302 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4303 push @{$self->{open_elements}},
4304 [$self->{head_element}, $el_category->{head}];
4305
4306 $self->{insertion_mode} = IN_HEAD_IM;
4307 ## Reprocess in the "in head" insertion mode...
4308 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4309 !!!cp ('t140');
4310 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4311 ## Ignore the token
4312 !!!next-token;
4313 next B;
4314 } else {
4315 !!!cp ('t141');
4316 }
4317
4318 #
4319 } elsif ({
4320 p => 1, br => 1,
4321 }->{$token->{tag_name}}) {
4322 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4323 !!!cp ('t142');
4324 ## As if <head>
4325 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4326 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4327 push @{$self->{open_elements}},
4328 [$self->{head_element}, $el_category->{head}];
4329
4330 $self->{insertion_mode} = IN_HEAD_IM;
4331 ## Reprocess in the "in head" insertion mode...
4332 } else {
4333 !!!cp ('t143');
4334 }
4335
4336 #
4337 } else {
4338 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4339 !!!cp ('t144');
4340 #
4341 } else {
4342 !!!cp ('t145');
4343 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4344 ## Ignore the token
4345 !!!next-token;
4346 next B;
4347 }
4348 }
4349
4350 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4351 !!!cp ('t146');
4352 ## As if </noscript>
4353 pop @{$self->{open_elements}};
4354 !!!parse-error (type => 'in noscript:/'.$token->{tag_name}, token => $token);
4355
4356 ## Reprocess in the "in head" insertion mode...
4357 ## As if </head>
4358 pop @{$self->{open_elements}};
4359
4360 ## Reprocess in the "after head" insertion mode...
4361 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4362 !!!cp ('t147');
4363 ## As if </head>
4364 pop @{$self->{open_elements}};
4365
4366 ## Reprocess in the "after head" insertion mode...
4367 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4368 ## ISSUE: This case cannot be reached?
4369 !!!cp ('t148');
4370 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4371 ## Ignore the token ## ISSUE: An issue in the spec.
4372 !!!next-token;
4373 next B;
4374 } else {
4375 !!!cp ('t149');
4376 }
4377
4378 ## "after head" insertion mode
4379 ## As if <body>
4380 !!!insert-element ('body',, $token);
4381 $self->{insertion_mode} = IN_BODY_IM;
4382 ## reprocess
4383 next B;
4384 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4385 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4386 !!!cp ('t149.1');
4387
4388 ## NOTE: As if <head>
4389 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4390 $self->{open_elements}->[-1]->[0]->append_child
4391 ($self->{head_element});
4392 #push @{$self->{open_elements}},
4393 # [$self->{head_element}, $el_category->{head}];
4394 #$self->{insertion_mode} = IN_HEAD_IM;
4395 ## NOTE: Reprocess.
4396
4397 ## NOTE: As if </head>
4398 #pop @{$self->{open_elements}};
4399 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4400 ## NOTE: Reprocess.
4401
4402 #
4403 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4404 !!!cp ('t149.2');
4405
4406 ## NOTE: As if </head>
4407 pop @{$self->{open_elements}};
4408 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4409 ## NOTE: Reprocess.
4410
4411 #
4412 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4413 !!!cp ('t149.3');
4414
4415 !!!parse-error (type => 'in noscript:#eof', token => $token);
4416
4417 ## As if </noscript>
4418 pop @{$self->{open_elements}};
4419 #$self->{insertion_mode} = IN_HEAD_IM;
4420 ## NOTE: Reprocess.
4421
4422 ## NOTE: As if </head>
4423 pop @{$self->{open_elements}};
4424 #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
4425 ## NOTE: Reprocess.
4426
4427 #
4428 } else {
4429 !!!cp ('t149.4');
4430 #
4431 }
4432
4433 ## NOTE: As if <body>
4434 !!!insert-element ('body',, $token);
4435 $self->{insertion_mode} = IN_BODY_IM;
4436 ## NOTE: Reprocess.
4437 next B;
4438 } else {
4439 die "$0: $token->{type}: Unknown token type";
4440 }
4441
4442 ## ISSUE: An issue in the spec.
4443 } elsif ($self->{insertion_mode} & BODY_IMS) {
4444 if ($token->{type} == CHARACTER_TOKEN) {
4445 !!!cp ('t150');
4446 ## NOTE: There is a code clone of "character in body".
4447 $reconstruct_active_formatting_elements->($insert_to_current);
4448
4449 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4450
4451 !!!next-token;
4452 next B;
4453 } elsif ($token->{type} == START_TAG_TOKEN) {
4454 if ({
4455 caption => 1, col => 1, colgroup => 1, tbody => 1,
4456 td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
4457 }->{$token->{tag_name}}) {
4458 if ($self->{insertion_mode} == IN_CELL_IM) {
4459 ## have an element in table scope
4460 for (reverse 0..$#{$self->{open_elements}}) {
4461 my $node = $self->{open_elements}->[$_];
4462 if ($node->[1] & TABLE_CELL_EL) {
4463 !!!cp ('t151');
4464
4465 ## Close the cell
4466 !!!back-token; # <x>
4467 $token = {type => END_TAG_TOKEN,
4468 tag_name => $node->[0]->manakai_local_name,
4469 line => $token->{line},
4470 column => $token->{column}};
4471 next B;
4472 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4473 !!!cp ('t152');
4474 ## ISSUE: This case can never be reached, maybe.
4475 last;
4476 }
4477 }
4478
4479 !!!cp ('t153');
4480 !!!parse-error (type => 'start tag not allowed',
4481 value => $token->{tag_name}, token => $token);
4482 ## Ignore the token
4483 !!!nack ('t153.1');
4484 !!!next-token;
4485 next B;
4486 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4487 !!!parse-error (type => 'not closed:caption', token => $token);
4488
4489 ## NOTE: As if </caption>.
4490 ## have a table element in table scope
4491 my $i;
4492 INSCOPE: {
4493 for (reverse 0..$#{$self->{open_elements}}) {
4494 my $node = $self->{open_elements}->[$_];
4495 if ($node->[1] & CAPTION_EL) {
4496 !!!cp ('t155');
4497 $i = $_;
4498 last INSCOPE;
4499 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4500 !!!cp ('t156');
4501 last;
4502 }
4503 }
4504
4505 !!!cp ('t157');
4506 !!!parse-error (type => 'start tag not allowed',
4507 value => $token->{tag_name}, token => $token);
4508 ## Ignore the token
4509 !!!nack ('t157.1');
4510 !!!next-token;
4511 next B;
4512 } # INSCOPE
4513
4514 ## generate implied end tags
4515 while ($self->{open_elements}->[-1]->[1]
4516 & END_TAG_OPTIONAL_EL) {
4517 !!!cp ('t158');
4518 pop @{$self->{open_elements}};
4519 }
4520
4521 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4522 !!!cp ('t159');
4523 !!!parse-error (type => 'not closed',
4524 value => $self->{open_elements}->[-1]->[0]
4525 ->manakai_local_name,
4526 token => $token);
4527 } else {
4528 !!!cp ('t160');
4529 }
4530
4531 splice @{$self->{open_elements}}, $i;
4532
4533 $clear_up_to_marker->();
4534
4535 $self->{insertion_mode} = IN_TABLE_IM;
4536
4537 ## reprocess
4538 !!!ack-later;
4539 next B;
4540 } else {
4541 !!!cp ('t161');
4542 #
4543 }
4544 } else {
4545 !!!cp ('t162');
4546 #
4547 }
4548 } elsif ($token->{type} == END_TAG_TOKEN) {
4549 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
4550 if ($self->{insertion_mode} == IN_CELL_IM) {
4551 ## have an element in table scope
4552 my $i;
4553 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4554 my $node = $self->{open_elements}->[$_];
4555 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
4556 !!!cp ('t163');
4557 $i = $_;
4558 last INSCOPE;
4559 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4560 !!!cp ('t164');
4561 last INSCOPE;
4562 }
4563 } # INSCOPE
4564 unless (defined $i) {
4565 !!!cp ('t165');
4566 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4567 ## Ignore the token
4568 !!!next-token;
4569 next B;
4570 }
4571
4572 ## generate implied end tags
4573 while ($self->{open_elements}->[-1]->[1]
4574 & END_TAG_OPTIONAL_EL) {
4575 !!!cp ('t166');
4576 pop @{$self->{open_elements}};
4577 }
4578
4579 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
4580 ne $token->{tag_name}) {
4581 !!!cp ('t167');
4582 !!!parse-error (type => 'not closed',
4583 value => $self->{open_elements}->[-1]->[0]
4584 ->manakai_local_name,
4585 token => $token);
4586 } else {
4587 !!!cp ('t168');
4588 }
4589
4590 splice @{$self->{open_elements}}, $i;
4591
4592 $clear_up_to_marker->();
4593
4594 $self->{insertion_mode} = IN_ROW_IM;
4595
4596 !!!next-token;
4597 next B;
4598 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
4599 !!!cp ('t169');
4600 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4601 ## Ignore the token
4602 !!!next-token;
4603 next B;
4604 } else {
4605 !!!cp ('t170');
4606 #
4607 }
4608 } elsif ($token->{tag_name} eq 'caption') {
4609 if ($self->{insertion_mode} == IN_CAPTION_IM) {
4610 ## have a table element in table scope
4611 my $i;
4612 INSCOPE: {
4613 for (reverse 0..$#{$self->{open_elements}}) {
4614 my $node = $self->{open_elements}->[$_];
4615 if ($node->[1] & CAPTION_EL) {
4616 !!!cp ('t171');
4617 $i = $_;
4618 last INSCOPE;
4619 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4620 !!!cp ('t172');
4621 last;
4622 }
4623 }
4624
4625 !!!cp ('t173');
4626 !!!parse-error (type => 'unmatched end tag',
4627 value => $token->{tag_name}, token => $token);
4628 ## Ignore the token
4629 !!!next-token;
4630 next B;
4631 } # INSCOPE
4632
4633 ## generate implied end tags
4634 while ($self->{open_elements}->[-1]->[1]
4635 & END_TAG_OPTIONAL_EL) {
4636 !!!cp ('t174');
4637 pop @{$self->{open_elements}};
4638 }
4639
4640 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4641 !!!cp ('t175');
4642 !!!parse-error (type => 'not closed',
4643 value => $self->{open_elements}->[-1]->[0]
4644 ->manakai_local_name,
4645 token => $token);
4646 } else {
4647 !!!cp ('t176');
4648 }
4649
4650 splice @{$self->{open_elements}}, $i;
4651
4652 $clear_up_to_marker->();
4653
4654 $self->{insertion_mode} = IN_TABLE_IM;
4655
4656 !!!next-token;
4657 next B;
4658 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
4659 !!!cp ('t177');
4660 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4661 ## Ignore the token
4662 !!!next-token;
4663 next B;
4664 } else {
4665 !!!cp ('t178');
4666 #
4667 }
4668 } elsif ({
4669 table => 1, tbody => 1, tfoot => 1,
4670 thead => 1, tr => 1,
4671 }->{$token->{tag_name}} and
4672 $self->{insertion_mode} == IN_CELL_IM) {
4673 ## have an element in table scope
4674 my $i;
4675 my $tn;
4676 INSCOPE: {
4677 for (reverse 0..$#{$self->{open_elements}}) {
4678 my $node = $self->{open_elements}->[$_];
4679 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
4680 !!!cp ('t179');
4681 $i = $_;
4682
4683 ## Close the cell
4684 !!!back-token; # </x>
4685 $token = {type => END_TAG_TOKEN, tag_name => $tn,
4686 line => $token->{line},
4687 column => $token->{column}};
4688 next B;
4689 } elsif ($node->[1] & TABLE_CELL_EL) {
4690 !!!cp ('t180');
4691 $tn = $node->[0]->manakai_local_name;
4692 ## NOTE: There is exactly one |td| or |th| element
4693 ## in scope in the stack of open elements by definition.
4694 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4695 ## ISSUE: Can this be reached?
4696 !!!cp ('t181');
4697 last;
4698 }
4699 }
4700
4701 !!!cp ('t182');
4702 !!!parse-error (type => 'unmatched end tag',
4703 value => $token->{tag_name}, token => $token);
4704 ## Ignore the token
4705 !!!next-token;
4706 next B;
4707 } # INSCOPE
4708 } elsif ($token->{tag_name} eq 'table' and
4709 $self->{insertion_mode} == IN_CAPTION_IM) {
4710 !!!parse-error (type => 'not closed:caption', token => $token);
4711
4712 ## As if </caption>
4713 ## have a table element in table scope
4714 my $i;
4715 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4716 my $node = $self->{open_elements}->[$_];
4717 if ($node->[1] & CAPTION_EL) {
4718 !!!cp ('t184');
4719 $i = $_;
4720 last INSCOPE;
4721 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4722 !!!cp ('t185');
4723 last INSCOPE;
4724 }
4725 } # INSCOPE
4726 unless (defined $i) {
4727 !!!cp ('t186');
4728 !!!parse-error (type => 'unmatched end tag:caption', token => $token);
4729 ## Ignore the token
4730 !!!next-token;
4731 next B;
4732 }
4733
4734 ## generate implied end tags
4735 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
4736 !!!cp ('t187');
4737 pop @{$self->{open_elements}};
4738 }
4739
4740 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
4741 !!!cp ('t188');
4742 !!!parse-error (type => 'not closed',
4743 value => $self->{open_elements}->[-1]->[0]
4744 ->manakai_local_name,
4745 token => $token);
4746 } else {
4747 !!!cp ('t189');
4748 }
4749
4750 splice @{$self->{open_elements}}, $i;
4751
4752 $clear_up_to_marker->();
4753
4754 $self->{insertion_mode} = IN_TABLE_IM;
4755
4756 ## reprocess
4757 next B;
4758 } elsif ({
4759 body => 1, col => 1, colgroup => 1, html => 1,
4760 }->{$token->{tag_name}}) {
4761 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
4762 !!!cp ('t190');
4763 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4764 ## Ignore the token
4765 !!!next-token;
4766 next B;
4767 } else {
4768 !!!cp ('t191');
4769 #
4770 }
4771 } elsif ({
4772 tbody => 1, tfoot => 1,
4773 thead => 1, tr => 1,
4774 }->{$token->{tag_name}} and
4775 $self->{insertion_mode} == IN_CAPTION_IM) {
4776 !!!cp ('t192');
4777 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4778 ## Ignore the token
4779 !!!next-token;
4780 next B;
4781 } else {
4782 !!!cp ('t193');
4783 #
4784 }
4785 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4786 for my $entry (@{$self->{open_elements}}) {
4787 unless ($entry->[1] & ALL_END_TAG_OPTIONAL_EL) {
4788 !!!cp ('t75');
4789 !!!parse-error (type => 'in body:#eof', token => $token);
4790 last;
4791 }
4792 }
4793
4794 ## Stop parsing.
4795 last B;
4796 } else {
4797 die "$0: $token->{type}: Unknown token type";
4798 }
4799
4800 $insert = $insert_to_current;
4801 #
4802 } elsif ($self->{insertion_mode} & TABLE_IMS) {
4803 if ($token->{type} == CHARACTER_TOKEN) {
4804 if (not $open_tables->[-1]->[1] and # tainted
4805 $token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4806 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4807
4808 unless (length $token->{data}) {
4809 !!!cp ('t194');
4810 !!!next-token;
4811 next B;
4812 } else {
4813 !!!cp ('t195');
4814 }
4815 }
4816
4817 !!!parse-error (type => 'in table:#character', token => $token);
4818
4819 ## As if in body, but insert into foster parent element
4820 ## ISSUE: Spec says that "whenever a node would be inserted
4821 ## into the current node" while characters might not
4822 ## result in a new Text node.
4823 $reconstruct_active_formatting_elements->($insert_to_foster);
4824
4825 if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
4826 # MUST
4827 my $foster_parent_element;
4828 my $next_sibling;
4829 my $prev_sibling;
4830 OE: for (reverse 0..$#{$self->{open_elements}}) {
4831 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
4832 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4833 if (defined $parent and $parent->node_type == 1) {
4834 !!!cp ('t196');
4835 $foster_parent_element = $parent;
4836 $next_sibling = $self->{open_elements}->[$_]->[0];
4837 $prev_sibling = $next_sibling->previous_sibling;
4838 } else {
4839 !!!cp ('t197');
4840 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
4841 $prev_sibling = $foster_parent_element->last_child;
4842 }
4843 last OE;
4844 }
4845 } # OE
4846 $foster_parent_element = $self->{open_elements}->[0]->[0] and
4847 $prev_sibling = $foster_parent_element->last_child
4848 unless defined $foster_parent_element;
4849 if (defined $prev_sibling and
4850 $prev_sibling->node_type == 3) {
4851 !!!cp ('t198');
4852 $prev_sibling->manakai_append_text ($token->{data});
4853 } else {
4854 !!!cp ('t199');
4855 $foster_parent_element->insert_before
4856 ($self->{document}->create_text_node ($token->{data}),
4857 $next_sibling);
4858 }
4859 $open_tables->[-1]->[1] = 1; # tainted
4860 } else {
4861 !!!cp ('t200');
4862 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4863 }
4864
4865 !!!next-token;
4866 next B;
4867 } elsif ($token->{type} == START_TAG_TOKEN) {
4868 if ({
4869 tr => ($self->{insertion_mode} != IN_ROW_IM),
4870 th => 1, td => 1,
4871 }->{$token->{tag_name}}) {
4872 if ($self->{insertion_mode} == IN_TABLE_IM) {
4873 ## Clear back to table context
4874 while (not ($self->{open_elements}->[-1]->[1]
4875 & TABLE_SCOPING_EL)) {
4876 !!!cp ('t201');
4877 pop @{$self->{open_elements}};
4878 }
4879
4880 !!!insert-element ('tbody',, $token);
4881 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4882 ## reprocess in the "in table body" insertion mode...
4883 }
4884
4885 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
4886 unless ($token->{tag_name} eq 'tr') {
4887 !!!cp ('t202');
4888 !!!parse-error (type => 'missing start tag:tr', token => $token);
4889 }
4890
4891 ## Clear back to table body context
4892 while (not ($self->{open_elements}->[-1]->[1]
4893 & TABLE_ROWS_SCOPING_EL)) {
4894 !!!cp ('t203');
4895 ## ISSUE: Can this case be reached?
4896 pop @{$self->{open_elements}};
4897 }
4898
4899 $self->{insertion_mode} = IN_ROW_IM;
4900 if ($token->{tag_name} eq 'tr') {
4901 !!!cp ('t204');
4902 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4903 !!!nack ('t204');
4904 !!!next-token;
4905 next B;
4906 } else {
4907 !!!cp ('t205');
4908 !!!insert-element ('tr',, $token);
4909 ## reprocess in the "in row" insertion mode
4910 }
4911 } else {
4912 !!!cp ('t206');
4913 }
4914
4915 ## Clear back to table row context
4916 while (not ($self->{open_elements}->[-1]->[1]
4917 & TABLE_ROW_SCOPING_EL)) {
4918 !!!cp ('t207');
4919 pop @{$self->{open_elements}};
4920 }
4921
4922 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4923 $self->{insertion_mode} = IN_CELL_IM;
4924
4925 push @$active_formatting_elements, ['#marker', ''];
4926
4927 !!!nack ('t207.1');
4928 !!!next-token;
4929 next B;
4930 } elsif ({
4931 caption => 1, col => 1, colgroup => 1,
4932 tbody => 1, tfoot => 1, thead => 1,
4933 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
4934 }->{$token->{tag_name}}) {
4935 if ($self->{insertion_mode} == IN_ROW_IM) {
4936 ## As if </tr>
4937 ## have an element in table scope
4938 my $i;
4939 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4940 my $node = $self->{open_elements}->[$_];
4941 if ($node->[1] & TABLE_ROW_EL) {
4942 !!!cp ('t208');
4943 $i = $_;
4944 last INSCOPE;
4945 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4946 !!!cp ('t209');
4947 last INSCOPE;
4948 }
4949 } # INSCOPE
4950 unless (defined $i) {
4951 !!!cp ('t210');
4952 ## TODO: This type is wrong.
4953 !!!parse-error (type => 'unmacthed end tag:'.$token->{tag_name}, token => $token);
4954 ## Ignore the token
4955 !!!nack ('t210.1');
4956 !!!next-token;
4957 next B;
4958 }
4959
4960 ## Clear back to table row context
4961 while (not ($self->{open_elements}->[-1]->[1]
4962 & TABLE_ROW_SCOPING_EL)) {
4963 !!!cp ('t211');
4964 ## ISSUE: Can this case be reached?
4965 pop @{$self->{open_elements}};
4966 }
4967
4968 pop @{$self->{open_elements}}; # tr
4969 $self->{insertion_mode} = IN_TABLE_BODY_IM;
4970 if ($token->{tag_name} eq 'tr') {
4971 !!!cp ('t212');
4972 ## reprocess
4973 !!!ack-later;
4974 next B;
4975 } else {
4976 !!!cp ('t213');
4977 ## reprocess in the "in table body" insertion mode...
4978 }
4979 }
4980
4981 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
4982 ## have an element in table scope
4983 my $i;
4984 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4985 my $node = $self->{open_elements}->[$_];
4986 if ($node->[1] & TABLE_ROW_GROUP_EL) {
4987 !!!cp ('t214');
4988 $i = $_;
4989 last INSCOPE;
4990 } elsif ($node->[1] & TABLE_SCOPING_EL) {
4991 !!!cp ('t215');
4992 last INSCOPE;
4993 }
4994 } # INSCOPE
4995 unless (defined $i) {
4996 !!!cp ('t216');
4997 ## TODO: This error type is wrong.
4998 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
4999 ## Ignore the token
5000 !!!nack ('t216.1');
5001 !!!next-token;
5002 next B;
5003 }
5004
5005 ## Clear back to table body context
5006 while (not ($self->{open_elements}->[-1]->[1]
5007 & TABLE_ROWS_SCOPING_EL)) {
5008 !!!cp ('t217');
5009 ## ISSUE: Can this state be reached?
5010 pop @{$self->{open_elements}};
5011 }
5012
5013 ## As if <{current node}>
5014 ## have an element in table scope
5015 ## true by definition
5016
5017 ## Clear back to table body context
5018 ## nop by definition
5019
5020 pop @{$self->{open_elements}};
5021 $self->{insertion_mode} = IN_TABLE_IM;
5022 ## reprocess in "in table" insertion mode...
5023 } else {
5024 !!!cp ('t218');
5025 }
5026
5027 if ($token->{tag_name} eq 'col') {
5028 ## Clear back to table context
5029 while (not ($self->{open_elements}->[-1]->[1]
5030 & TABLE_SCOPING_EL)) {
5031 !!!cp ('t219');
5032 ## ISSUE: Can this state be reached?
5033 pop @{$self->{open_elements}};
5034 }
5035
5036 !!!insert-element ('colgroup',, $token);
5037 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
5038 ## reprocess
5039 !!!ack-later;
5040 next B;
5041 } elsif ({
5042 caption => 1,
5043 colgroup => 1,
5044 tbody => 1, tfoot => 1, thead => 1,
5045 }->{$token->{tag_name}}) {
5046 ## Clear back to table context
5047 while (not ($self->{open_elements}->[-1]->[1]
5048 & TABLE_SCOPING_EL)) {
5049 !!!cp ('t220');
5050 ## ISSUE: Can this state be reached?
5051 pop @{$self->{open_elements}};
5052 }
5053
5054 push @$active_formatting_elements, ['#marker', '']
5055 if $token->{tag_name} eq 'caption';
5056
5057 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5058 $self->{insertion_mode} = {
5059 caption => IN_CAPTION_IM,
5060 colgroup => IN_COLUMN_GROUP_IM,
5061 tbody => IN_TABLE_BODY_IM,
5062 tfoot => IN_TABLE_BODY_IM,
5063 thead => IN_TABLE_BODY_IM,
5064 }->{$token->{tag_name}};
5065 !!!next-token;
5066 !!!nack ('t220.1');
5067 next B;
5068 } else {
5069 die "$0: in table: <>: $token->{tag_name}";
5070 }
5071 } elsif ($token->{tag_name} eq 'table') {
5072 !!!parse-error (type => 'not closed',
5073 value => $self->{open_elements}->[-1]->[0]
5074 ->manakai_local_name,
5075 token => $token);
5076
5077 ## As if </table>
5078 ## have a table element in table scope
5079 my $i;
5080 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5081 my $node = $self->{open_elements}->[$_];
5082 if ($node->[1] & TABLE_EL) {
5083 !!!cp ('t221');
5084 $i = $_;
5085 last INSCOPE;
5086 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5087 !!!cp ('t222');
5088 last INSCOPE;
5089 }
5090 } # INSCOPE
5091 unless (defined $i) {
5092 !!!cp ('t223');
5093 ## TODO: The following is wrong, maybe.
5094 !!!parse-error (type => 'unmatched end tag:table', token => $token);
5095 ## Ignore tokens </table><table>
5096 !!!nack ('t223.1');
5097 !!!next-token;
5098 next B;
5099 }
5100
5101 ## TODO: The following steps have been removed from the latest spec.
5102 ## generate implied end tags
5103 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5104 !!!cp ('t224');
5105 pop @{$self->{open_elements}};
5106 }
5107
5108 unless ($self->{open_elements}->[-1]->[1] & TABLE_EL) {
5109 !!!cp ('t225');
5110 ## NOTE: |<table><tr><table>|
5111 !!!parse-error (type => 'not closed',
5112 value => $self->{open_elements}->[-1]->[0]
5113 ->manakai_local_name,
5114 token => $token);
5115 } else {
5116 !!!cp ('t226');
5117 }
5118
5119 splice @{$self->{open_elements}}, $i;
5120 pop @{$open_tables};
5121
5122 $self->_reset_insertion_mode;
5123
5124 ## reprocess
5125 !!!ack-later;
5126 next B;
5127 } elsif ($token->{tag_name} eq 'style') {
5128 if (not $open_tables->[-1]->[1]) { # tainted
5129 !!!cp ('t227.8');
5130 ## NOTE: This is an "as if in head" code clone.
5131 $parse_rcdata->(CDATA_CONTENT_MODEL);
5132 next B;
5133 } else {
5134 !!!cp ('t227.7');
5135 #
5136 }
5137 } elsif ($token->{tag_name} eq 'script') {
5138 if (not $open_tables->[-1]->[1]) { # tainted
5139 !!!cp ('t227.6');
5140 ## NOTE: This is an "as if in head" code clone.
5141 $script_start_tag->();
5142 next B;
5143 } else {
5144 !!!cp ('t227.5');
5145 #
5146 }
5147 } elsif ($token->{tag_name} eq 'input') {
5148 if (not $open_tables->[-1]->[1]) { # tainted
5149 if ($token->{attributes}->{type}) { ## TODO: case
5150 my $type = lc $token->{attributes}->{type}->{value};
5151 if ($type eq 'hidden') {
5152 !!!cp ('t227.3');
5153 !!!parse-error (type => 'in table:'.$token->{tag_name}, token => $token);
5154
5155 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5156
5157 ## TODO: form element pointer
5158
5159 pop @{$self->{open_elements}};
5160
5161 !!!next-token;
5162 !!!ack ('t227.2.1');
5163 next B;
5164 } else {
5165 !!!cp ('t227.2');
5166 #
5167 }
5168 } else {
5169 !!!cp ('t227.1');
5170 #
5171 }
5172 } else {
5173 !!!cp ('t227.4');
5174 #
5175 }
5176 } else {
5177 !!!cp ('t227');
5178 #
5179 }
5180
5181 !!!parse-error (type => 'in table:'.$token->{tag_name}, token => $token);
5182
5183 $insert = $insert_to_foster;
5184 #
5185 } elsif ($token->{type} == END_TAG_TOKEN) {
5186 if ($token->{tag_name} eq 'tr' and
5187 $self->{insertion_mode} == IN_ROW_IM) {
5188 ## have an element in table scope
5189 my $i;
5190 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5191 my $node = $self->{open_elements}->[$_];
5192 if ($node->[1] & TABLE_ROW_EL) {
5193 !!!cp ('t228');
5194 $i = $_;
5195 last INSCOPE;
5196 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5197 !!!cp ('t229');
5198 last INSCOPE;
5199 }
5200 } # INSCOPE
5201 unless (defined $i) {
5202 !!!cp ('t230');
5203 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5204 ## Ignore the token
5205 !!!nack ('t230.1');
5206 !!!next-token;
5207 next B;
5208 } else {
5209 !!!cp ('t232');
5210 }
5211
5212 ## Clear back to table row context
5213 while (not ($self->{open_elements}->[-1]->[1]
5214 & TABLE_ROW_SCOPING_EL)) {
5215 !!!cp ('t231');
5216 ## ISSUE: Can this state be reached?
5217 pop @{$self->{open_elements}};
5218 }
5219
5220 pop @{$self->{open_elements}}; # tr
5221 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5222 !!!next-token;
5223 !!!nack ('t231.1');
5224 next B;
5225 } elsif ($token->{tag_name} eq 'table') {
5226 if ($self->{insertion_mode} == IN_ROW_IM) {
5227 ## As if </tr>
5228 ## have an element in table scope
5229 my $i;
5230 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5231 my $node = $self->{open_elements}->[$_];
5232 if ($node->[1] & TABLE_ROW_EL) {
5233 !!!cp ('t233');
5234 $i = $_;
5235 last INSCOPE;
5236 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5237 !!!cp ('t234');
5238 last INSCOPE;
5239 }
5240 } # INSCOPE
5241 unless (defined $i) {
5242 !!!cp ('t235');
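5243 ## TODO: The following is wrong (it appends the numeric |$token->{type}|, whereas the sibling branches append |$token->{tag_name}|).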
5244 !!!parse-error (type => 'unmatched end tag:'.$token->{type}, token => $token);
5245 ## Ignore the token
5246 !!!nack ('t236.1');
5247 !!!next-token;
5248 next B;
5249 }
5250
5251 ## Clear back to table row context
5252 while (not ($self->{open_elements}->[-1]->[1]
5253 & TABLE_ROW_SCOPING_EL)) {
5254 !!!cp ('t236');
5255 ## ISSUE: Can this state be reached?
5256 pop @{$self->{open_elements}};
5257 }
5258
5259 pop @{$self->{open_elements}}; # tr
5260 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5261 ## reprocess in the "in table body" insertion mode...
5262 }
5263
5264 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5265 ## have an element in table scope
5266 my $i;
5267 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5268 my $node = $self->{open_elements}->[$_];
5269 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5270 !!!cp ('t237');
5271 $i = $_;
5272 last INSCOPE;
5273 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5274 !!!cp ('t238');
5275 last INSCOPE;
5276 }
5277 } # INSCOPE
5278 unless (defined $i) {
5279 !!!cp ('t239');
5280 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5281 ## Ignore the token
5282 !!!nack ('t239.1');
5283 !!!next-token;
5284 next B;
5285 }
5286
5287 ## Clear back to table body context
5288 while (not ($self->{open_elements}->[-1]->[1]
5289 & TABLE_ROWS_SCOPING_EL)) {
5290 !!!cp ('t240');
5291 pop @{$self->{open_elements}};
5292 }
5293
5294 ## As if <{current node}>
5295 ## have an element in table scope
5296 ## true by definition
5297
5298 ## Clear back to table body context
5299 ## nop by definition
5300
5301 pop @{$self->{open_elements}};
5302 $self->{insertion_mode} = IN_TABLE_IM;
5303 ## reprocess in the "in table" insertion mode...
5304 }
5305
5306 ## NOTE: </table> in the "in table" insertion mode.
5307 ## When you edit the code fragment below, please ensure that
5308 ## the code for <table> in the "in table" insertion mode
5309 ## is synced with it.
5310
5311 ## have a table element in table scope
5312 my $i;
5313 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5314 my $node = $self->{open_elements}->[$_];
5315 if ($node->[1] & TABLE_EL) {
5316 !!!cp ('t241');
5317 $i = $_;
5318 last INSCOPE;
5319 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5320 !!!cp ('t242');
5321 last INSCOPE;
5322 }
5323 } # INSCOPE
5324 unless (defined $i) {
5325 !!!cp ('t243');
5326 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5327 ## Ignore the token
5328 !!!nack ('t243.1');
5329 !!!next-token;
5330 next B;
5331 }
5332
5333 splice @{$self->{open_elements}}, $i;
5334 pop @{$open_tables};
5335
5336 $self->_reset_insertion_mode;
5337
5338 !!!next-token;
5339 next B;
5340 } elsif ({
5341 tbody => 1, tfoot => 1, thead => 1,
5342 }->{$token->{tag_name}} and
5343 $self->{insertion_mode} & ROW_IMS) {
5344 if ($self->{insertion_mode} == IN_ROW_IM) {
5345 ## have an element in table scope
5346 my $i;
5347 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5348 my $node = $self->{open_elements}->[$_];
5349 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5350 !!!cp ('t247');
5351 $i = $_;
5352 last INSCOPE;
5353 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5354 !!!cp ('t248');
5355 last INSCOPE;
5356 }
5357 } # INSCOPE
5358 unless (defined $i) {
5359 !!!cp ('t249');
5360 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5361 ## Ignore the token
5362 !!!nack ('t249.1');
5363 !!!next-token;
5364 next B;
5365 }
5366
5367 ## As if </tr>
5368 ## have an element in table scope
5369 my $i;
5370 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5371 my $node = $self->{open_elements}->[$_];
5372 if ($node->[1] & TABLE_ROW_EL) {
5373 !!!cp ('t250');
5374 $i = $_;
5375 last INSCOPE;
5376 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5377 !!!cp ('t251');
5378 last INSCOPE;
5379 }
5380 } # INSCOPE
5381 unless (defined $i) {
5382 !!!cp ('t252');
5383 !!!parse-error (type => 'unmatched end tag:tr', token => $token);
5384 ## Ignore the token
5385 !!!nack ('t252.1');
5386 !!!next-token;
5387 next B;
5388 }
5389
5390 ## Clear back to table row context
5391 while (not ($self->{open_elements}->[-1]->[1]
5392 & TABLE_ROW_SCOPING_EL)) {
5393 !!!cp ('t253');
5394 ## ISSUE: Can this case be reached?
5395 pop @{$self->{open_elements}};
5396 }
5397
5398 pop @{$self->{open_elements}}; # tr
5399 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5400 ## reprocess in the "in table body" insertion mode...
5401 }
5402
5403 ## have an element in table scope
5404 my $i;
5405 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5406 my $node = $self->{open_elements}->[$_];
5407 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5408 !!!cp ('t254');
5409 $i = $_;
5410 last INSCOPE;
5411 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5412 !!!cp ('t255');
5413 last INSCOPE;
5414 }
5415 } # INSCOPE
5416 unless (defined $i) {
5417 !!!cp ('t256');
5418 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5419 ## Ignore the token
5420 !!!nack ('t256.1');
5421 !!!next-token;
5422 next B;
5423 }
5424
5425 ## Clear back to table body context
5426 while (not ($self->{open_elements}->[-1]->[1]
5427 & TABLE_ROWS_SCOPING_EL)) {
5428 !!!cp ('t257');
5429 ## ISSUE: Can this case be reached?
5430 pop @{$self->{open_elements}};
5431 }
5432
5433 pop @{$self->{open_elements}};
5434 $self->{insertion_mode} = IN_TABLE_IM;
5435 !!!nack ('t257.1');
5436 !!!next-token;
5437 next B;
5438 } elsif ({
5439 body => 1, caption => 1, col => 1, colgroup => 1,
5440 html => 1, td => 1, th => 1,
5441 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5442 tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
5443 }->{$token->{tag_name}}) {
5444 !!!cp ('t258');
5445 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5446 ## Ignore the token
5447 !!!nack ('t258.1');
5448 !!!next-token;
5449 next B;
5450 } else {
5451 !!!cp ('t259');
5452 !!!parse-error (type => 'in table:/'.$token->{tag_name}, token => $token);
5453
5454 $insert = $insert_to_foster;
5455 #
5456 }
5457 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5458 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
5459 @{$self->{open_elements}} == 1) { # redundant, maybe
5460 !!!parse-error (type => 'in body:#eof', token => $token);
5461 !!!cp ('t259.1');
5462 #
5463 } else {
5464 !!!cp ('t259.2');
5465 #
5466 }
5467
5468 ## Stop parsing
5469 last B;
5470 } else {
5471 die "$0: $token->{type}: Unknown token type";
5472 }
5473 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
5474 if ($token->{type} == CHARACTER_TOKEN) {
5475 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5476 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5477 unless (length $token->{data}) {
5478 !!!cp ('t260');
5479 !!!next-token;
5480 next B;
5481 }
5482 }
5483
5484 !!!cp ('t261');
5485 #
5486 } elsif ($token->{type} == START_TAG_TOKEN) {
5487 if ($token->{tag_name} eq 'col') {
5488 !!!cp ('t262');
5489 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5490 pop @{$self->{open_elements}};
5491 !!!ack ('t262.1');
5492 !!!next-token;
5493 next B;
5494 } else {
5495 !!!cp ('t263');
5496 #
5497 }
5498 } elsif ($token->{type} == END_TAG_TOKEN) {
5499 if ($token->{tag_name} eq 'colgroup') {
5500 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
5501 !!!cp ('t264');
5502 !!!parse-error (type => 'unmatched end tag:colgroup', token => $token);
5503 ## Ignore the token
5504 !!!next-token;
5505 next B;
5506 } else {
5507 !!!cp ('t265');
5508 pop @{$self->{open_elements}}; # colgroup
5509 $self->{insertion_mode} = IN_TABLE_IM;
5510 !!!next-token;
5511 next B;
5512 }
5513 } elsif ($token->{tag_name} eq 'col') {
5514 !!!cp ('t266');
5515 !!!parse-error (type => 'unmatched end tag:col', token => $token);
5516 ## Ignore the token
5517 !!!next-token;
5518 next B;
5519 } else {
5520 !!!cp ('t267');
5521 #
5522 }
5523 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5524 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
5525 @{$self->{open_elements}} == 1) { # redundant, maybe
5526 !!!cp ('t270.2');
5527 ## Stop parsing.
5528 last B;
5529 } else {
5530 ## NOTE: As if </colgroup>.
5531 !!!cp ('t270.1');
5532 pop @{$self->{open_elements}}; # colgroup
5533 $self->{insertion_mode} = IN_TABLE_IM;
5534 ## Reprocess.
5535 next B;
5536 }
5537 } else {
5538 die "$0: $token->{type}: Unknown token type";
5539 }
5540
5541 ## As if </colgroup>
5542 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
5543 !!!cp ('t269');
5544 ## TODO: Wrong error type?
5545 !!!parse-error (type => 'unmatched end tag:colgroup', token => $token);
5546 ## Ignore the token
5547 !!!nack ('t269.1');
5548 !!!next-token;
5549 next B;
5550 } else {
5551 !!!cp ('t270');
5552 pop @{$self->{open_elements}}; # colgroup
5553 $self->{insertion_mode} = IN_TABLE_IM;
5554 !!!ack-later;
5555 ## reprocess
5556 next B;
5557 }
5558 } elsif ($self->{insertion_mode} & SELECT_IMS) {
5559 if ($token->{type} == CHARACTER_TOKEN) {
5560 !!!cp ('t271');
5561 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5562 !!!next-token;
5563 next B;
5564 } elsif ($token->{type} == START_TAG_TOKEN) {
5565 if ($token->{tag_name} eq 'option') {
5566 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5567 !!!cp ('t272');
5568 ## As if </option>
5569 pop @{$self->{open_elements}};
5570 } else {
5571 !!!cp ('t273');
5572 }
5573
5574 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5575 !!!nack ('t273.1');
5576 !!!next-token;
5577 next B;
5578 } elsif ($token->{tag_name} eq 'optgroup') {
5579 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5580 !!!cp ('t274');
5581 ## As if </option>
5582 pop @{$self->{open_elements}};
5583 } else {
5584 !!!cp ('t275');
5585 }
5586
5587 if ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
5588 !!!cp ('t276');
5589 ## As if </optgroup>
5590 pop @{$self->{open_elements}};
5591 } else {
5592 !!!cp ('t277');
5593 }
5594
5595 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5596 !!!nack ('t277.1');
5597 !!!next-token;
5598 next B;
5599 } elsif ($token->{tag_name} eq 'select' or
5600 $token->{tag_name} eq 'input' or
5601 ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5602 {
5603 caption => 1, table => 1,
5604 tbody => 1, tfoot => 1, thead => 1,
5605 tr => 1, td => 1, th => 1,
5606 }->{$token->{tag_name}})) {
5607 ## TODO: The type below is not good - <select> is replaced by </select>
5608 !!!parse-error (type => 'not closed:select', token => $token);
5609 ## NOTE: As if the token were </select> (<select> case) or
5610 ## as if there were </select> (otherwise).
5611 ## have an element in table scope
5612 my $i;
5613 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5614 my $node = $self->{open_elements}->[$_];
5615 if ($node->[1] & SELECT_EL) {
5616 !!!cp ('t278');
5617 $i = $_;
5618 last INSCOPE;
5619 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5620 !!!cp ('t279');
5621 last INSCOPE;
5622 }
5623 } # INSCOPE
5624 unless (defined $i) {
5625 !!!cp ('t280');
5626 !!!parse-error (type => 'unmatched end tag:select', token => $token);
5627 ## Ignore the token
5628 !!!nack ('t280.1');
5629 !!!next-token;
5630 next B;
5631 }
5632
5633 !!!cp ('t281');
5634 splice @{$self->{open_elements}}, $i;
5635
5636 $self->_reset_insertion_mode;
5637
5638 if ($token->{tag_name} eq 'select') {
5639 !!!nack ('t281.2');
5640 !!!next-token;
5641 next B;
5642 } else {
5643 !!!cp ('t281.1');
5644 !!!ack-later;
5645 ## Reprocess the token.
5646 next B;
5647 }
5648 } else {
5649 !!!cp ('t282');
5650 !!!parse-error (type => 'in select:'.$token->{tag_name}, token => $token);
5651 ## Ignore the token
5652 !!!nack ('t282.1');
5653 !!!next-token;
5654 next B;
5655 }
5656 } elsif ($token->{type} == END_TAG_TOKEN) {
5657 if ($token->{tag_name} eq 'optgroup') {
5658 if ($self->{open_elements}->[-1]->[1] & OPTION_EL and
5659 $self->{open_elements}->[-2]->[1] & OPTGROUP_EL) {
5660 !!!cp ('t283');
5661 ## As if </option>
5662 splice @{$self->{open_elements}}, -2;
5663 } elsif ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
5664 !!!cp ('t284');
5665 pop @{$self->{open_elements}};
5666 } else {
5667 !!!cp ('t285');
5668 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5669 ## Ignore the token
5670 }
5671 !!!nack ('t285.1');
5672 !!!next-token;
5673 next B;
5674 } elsif ($token->{tag_name} eq 'option') {
5675 if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
5676 !!!cp ('t286');
5677 pop @{$self->{open_elements}};
5678 } else {
5679 !!!cp ('t287');
5680 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5681 ## Ignore the token
5682 }
5683 !!!nack ('t287.1');
5684 !!!next-token;
5685 next B;
5686 } elsif ($token->{tag_name} eq 'select') {
5687 ## have an element in table scope
5688 my $i;
5689 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5690 my $node = $self->{open_elements}->[$_];
5691 if ($node->[1] & SELECT_EL) {
5692 !!!cp ('t288');
5693 $i = $_;
5694 last INSCOPE;
5695 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5696 !!!cp ('t289');
5697 last INSCOPE;
5698 }
5699 } # INSCOPE
5700 unless (defined $i) {
5701 !!!cp ('t290');
5702 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5703 ## Ignore the token
5704 !!!nack ('t290.1');
5705 !!!next-token;
5706 next B;
5707 }
5708
5709 !!!cp ('t291');
5710 splice @{$self->{open_elements}}, $i;
5711
5712 $self->_reset_insertion_mode;
5713
5714 !!!nack ('t291.1');
5715 !!!next-token;
5716 next B;
5717 } elsif ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
5718 {
5719 caption => 1, table => 1, tbody => 1,
5720 tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
5721 }->{$token->{tag_name}}) {
5722 ## TODO: The following is wrong?
5723 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5724
5725 ## have an element in table scope
5726 my $i;
5727 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5728 my $node = $self->{open_elements}->[$_];
5729 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5730 !!!cp ('t292');
5731 $i = $_;
5732 last INSCOPE;
5733 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5734 !!!cp ('t293');
5735 last INSCOPE;
5736 }
5737 } # INSCOPE
5738 unless (defined $i) {
5739 !!!cp ('t294');
5740 ## Ignore the token
5741 !!!nack ('t294.1');
5742 !!!next-token;
5743 next B;
5744 }
5745
5746 ## As if </select>
5747 ## have an element in table scope
5748 undef $i;
5749 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5750 my $node = $self->{open_elements}->[$_];
5751 if ($node->[1] & SELECT_EL) {
5752 !!!cp ('t295');
5753 $i = $_;
5754 last INSCOPE;
5755 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5756 ## ISSUE: Can this state be reached?
5757 !!!cp ('t296');
5758 last INSCOPE;
5759 }
5760 } # INSCOPE
5761 unless (defined $i) {
5762 !!!cp ('t297');
5763 ## TODO: Is the following error type correct?
5764 !!!parse-error (type => 'unmatched end tag:select', token => $token);
5765 ## Ignore the </select> token
5766 !!!nack ('t297.1');
5767 !!!next-token; ## TODO: ok?
5768 next B;
5769 }
5770
5771 !!!cp ('t298');
5772 splice @{$self->{open_elements}}, $i;
5773
5774 $self->_reset_insertion_mode;
5775
5776 !!!ack-later;
5777 ## reprocess
5778 next B;
5779 } else {
5780 !!!cp ('t299');
5781 !!!parse-error (type => 'in select:/'.$token->{tag_name}, token => $token);
5782 ## Ignore the token
5783 !!!nack ('t299.3');
5784 !!!next-token;
5785 next B;
5786 }
5787 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5788 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
5789 @{$self->{open_elements}} == 1) { # redundant, maybe
5790 !!!cp ('t299.1');
5791 !!!parse-error (type => 'in body:#eof', token => $token);
5792 } else {
5793 !!!cp ('t299.2');
5794 }
5795
5796 ## Stop parsing.
5797 last B;
5798 } else {
5799 die "$0: $token->{type}: Unknown token type";
5800 }
5801 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
5802 if ($token->{type} == CHARACTER_TOKEN) {
5803 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5804 my $data = $1;
5805 ## As if in body
5806 $reconstruct_active_formatting_elements->($insert_to_current);
5807
5808 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5809
5810 unless (length $token->{data}) {
5811 !!!cp ('t300');
5812 !!!next-token;
5813 next B;
5814 }
5815 }
5816
5817 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5818 !!!cp ('t301');
5819 !!!parse-error (type => 'after html:#character', token => $token);
5820
5821 ## Reprocess in the "after body" insertion mode.
5822 } else {
5823 !!!cp ('t302');
5824 }
5825
5826 ## "after body" insertion mode
5827 !!!parse-error (type => 'after body:#character', token => $token);
5828
5829 $self->{insertion_mode} = IN_BODY_IM;
5830 ## reprocess
5831 next B;
5832 } elsif ($token->{type} == START_TAG_TOKEN) {
5833 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5834 !!!cp ('t303');
5835 !!!parse-error (type => 'after html:'.$token->{tag_name}, token => $token);
5836
5837 ## Reprocess in the "after body" insertion mode.
5838 } else {
5839 !!!cp ('t304');
5840 }
5841
5842 ## "after body" insertion mode
5843 !!!parse-error (type => 'after body:'.$token->{tag_name}, token => $token);
5844
5845 $self->{insertion_mode} = IN_BODY_IM;
5846 !!!ack-later;
5847 ## reprocess
5848 next B;
5849 } elsif ($token->{type} == END_TAG_TOKEN) {
5850 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
5851 !!!cp ('t305');
5852 !!!parse-error (type => 'after html:/'.$token->{tag_name}, token => $token);
5853
5854 $self->{insertion_mode} = AFTER_BODY_IM;
5855 ## Reprocess in the "after body" insertion mode.
5856 } else {
5857 !!!cp ('t306');
5858 }
5859
5860 ## "after body" insertion mode
5861 if ($token->{tag_name} eq 'html') {
5862 if (defined $self->{inner_html_node}) {
5863 !!!cp ('t307');
5864 !!!parse-error (type => 'unmatched end tag:html', token => $token);
5865 ## Ignore the token
5866 !!!next-token;
5867 next B;
5868 } else {
5869 !!!cp ('t308');
5870 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
5871 !!!next-token;
5872 next B;
5873 }
5874 } else {
5875 !!!cp ('t309');
5876 !!!parse-error (type => 'after body:/'.$token->{tag_name}, token => $token);
5877
5878 $self->{insertion_mode} = IN_BODY_IM;
5879 ## reprocess
5880 next B;
5881 }
5882 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5883 !!!cp ('t309.2');
5884 ## Stop parsing
5885 last B;
5886 } else {
5887 die "$0: $token->{type}: Unknown token type";
5888 }
5889 } elsif ($self->{insertion_mode} & FRAME_IMS) {
5890 if ($token->{type} == CHARACTER_TOKEN) {
5891 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5892 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5893
5894 unless (length $token->{data}) {
5895 !!!cp ('t310');
5896 !!!next-token;
5897 next B;
5898 }
5899 }
5900
5901 if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
5902 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
5903 !!!cp ('t311');
5904 !!!parse-error (type => 'in frameset:#character', token => $token);
5905 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
5906 !!!cp ('t312');
5907 !!!parse-error (type => 'after frameset:#character', token => $token);
5908 } else { # "after html frameset"
5909 !!!cp ('t313');
5910 !!!parse-error (type => 'after html:#character', token => $token);
5911
5912 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5913 ## Reprocess in the "after frameset" insertion mode.
5914 !!!parse-error (type => 'after frameset:#character', token => $token);
5915 }
5916
5917 ## Ignore the token.
5918 if (length $token->{data}) {
5919 !!!cp ('t314');
5920 ## reprocess the rest of characters
5921 } else {
5922 !!!cp ('t315');
5923 !!!next-token;
5924 }
5925 next B;
5926 }
5927
5928 die qq[$0: Character "$token->{data}"];
5929 } elsif ($token->{type} == START_TAG_TOKEN) {
5930 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
5931 !!!cp ('t316');
5932 !!!parse-error (type => 'after html:'.$token->{tag_name}, token => $token);
5933
5934 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5935 ## Process in the "after frameset" insertion mode.
5936 } else {
5937 !!!cp ('t317');
5938 }
5939
5940 if ($token->{tag_name} eq 'frameset' and
5941 $self->{insertion_mode} == IN_FRAMESET_IM) {
5942 !!!cp ('t318');
5943 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5944 !!!nack ('t318.1');
5945 !!!next-token;
5946 next B;
5947 } elsif ($token->{tag_name} eq 'frame' and
5948 $self->{insertion_mode} == IN_FRAMESET_IM) {
5949 !!!cp ('t319');
5950 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5951 pop @{$self->{open_elements}};
5952 !!!ack ('t319.1');
5953 !!!next-token;
5954 next B;
5955 } elsif ($token->{tag_name} eq 'noframes') {
5956 !!!cp ('t320');
5957 ## NOTE: As if in body.
5958 $parse_rcdata->(CDATA_CONTENT_MODEL);
5959 next B;
5960 } else {
5961 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
5962 !!!cp ('t321');
5963 !!!parse-error (type => 'in frameset:'.$token->{tag_name}, token => $token);
5964 } else {
5965 !!!cp ('t322');
5966 !!!parse-error (type => 'after frameset:'.$token->{tag_name}, token => $token);
5967 }
5968 ## Ignore the token
5969 !!!nack ('t322.1');
5970 !!!next-token;
5971 next B;
5972 }
5973 } elsif ($token->{type} == END_TAG_TOKEN) {
5974 if ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
5975 !!!cp ('t323');
5976 !!!parse-error (type => 'after html:/'.$token->{tag_name}, token => $token);
5977
5978 $self->{insertion_mode} = AFTER_FRAMESET_IM;
5979 ## Process in the "after frameset" insertion mode.
5980 } else {
5981 !!!cp ('t324');
5982 }
5983
5984 if ($token->{tag_name} eq 'frameset' and
5985 $self->{insertion_mode} == IN_FRAMESET_IM) {
5986 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
5987 @{$self->{open_elements}} == 1) {
5988 !!!cp ('t325');
5989 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
5990 ## Ignore the token
5991 !!!next-token;
5992 } else {
5993 !!!cp ('t326');
5994 pop @{$self->{open_elements}};
5995 !!!next-token;
5996 }
5997
5998 if (not defined $self->{inner_html_node} and
5999 not ($self->{open_elements}->[-1]->[1] & FRAMESET_EL)) {
6000 !!!cp ('t327');
6001 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6002 } else {
6003 !!!cp ('t328');
6004 }
6005 next B;
6006 } elsif ($token->{tag_name} eq 'html' and
6007 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
6008 !!!cp ('t329');
6009 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
6010 !!!next-token;
6011 next B;
6012 } else {
6013 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6014 !!!cp ('t330');
6015 !!!parse-error (type => 'in frameset:/'.$token->{tag_name}, token => $token);
6016 } else {
6017 !!!cp ('t331');
6018 !!!parse-error (type => 'after frameset:/'.$token->{tag_name}, token => $token);
6019 }
6020 ## Ignore the token
6021 !!!next-token;
6022 next B;
6023 }
6024 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6025 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6026 @{$self->{open_elements}} == 1) { # redundant, maybe
6027 !!!cp ('t331.1');
6028 !!!parse-error (type => 'in body:#eof', token => $token);
6029 } else {
6030 !!!cp ('t331.2');
6031 }
6032
6033 ## Stop parsing
6034 last B;
6035 } else {
6036 die "$0: $token->{type}: Unknown token type";
6037 }
6038
6039 ## ISSUE: An issue in spec here
6040 } else {
6041 die "$0: $self->{insertion_mode}: Unknown insertion mode";
6042 }
6043
6044 ## "in body" insertion mode
6045 if ($token->{type} == START_TAG_TOKEN) {
6046 if ($token->{tag_name} eq 'script') {
6047 !!!cp ('t332');
6048 ## NOTE: This is an "as if in head" code clone
6049 $script_start_tag->();
6050 next B;
6051 } elsif ($token->{tag_name} eq 'style') {
6052 !!!cp ('t333');
6053 ## NOTE: This is an "as if in head" code clone
6054 $parse_rcdata->(CDATA_CONTENT_MODEL);
6055 next B;
6056 } elsif ({
6057 base => 1, link => 1,
6058 }->{$token->{tag_name}}) {
6059 !!!cp ('t334');
6060 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6061 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6062 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6063 !!!ack ('t334.1');
6064 !!!next-token;
6065 next B;
6066 } elsif ($token->{tag_name} eq 'meta') {
6067 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6068 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6069 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6070
6071 unless ($self->{confident}) {
6072 if ($token->{attributes}->{charset}) { ## TODO: And if supported
6073 !!!cp ('t335');
6074 $self->{change_encoding}
6075 ->($self, $token->{attributes}->{charset}->{value}, $token);
6076
6077 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6078 ->set_user_data (manakai_has_reference =>
6079 $token->{attributes}->{charset}
6080 ->{has_reference});
6081 } elsif ($token->{attributes}->{content}) {
6082 ## ISSUE: The algorithm name in the spec was incorrect, so it was not linked to its definition.
6083 if ($token->{attributes}->{content}->{value}
6084 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
6085 [\x09-\x0D\x20]*=
6086 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
6087 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
6088 !!!cp ('t336');
6089 $self->{change_encoding}
6090 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
6091 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6092 ->set_user_data (manakai_has_reference =>
6093 $token->{attributes}->{content}
6094 ->{has_reference});
6095 }
6096 }
6097 } else {
6098 if ($token->{attributes}->{charset}) {
6099 !!!cp ('t337');
6100 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6101 ->set_user_data (manakai_has_reference =>
6102 $token->{attributes}->{charset}
6103 ->{has_reference});
6104 }
6105 if ($token->{attributes}->{content}) {
6106 !!!cp ('t338');
6107 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6108 ->set_user_data (manakai_has_reference =>
6109 $token->{attributes}->{content}
6110 ->{has_reference});
6111 }
6112 }
6113
6114 !!!ack ('t338.1');
6115 !!!next-token;
6116 next B;
6117 } elsif ($token->{tag_name} eq 'title') {
6118 !!!cp ('t341');
6119 ## NOTE: This is an "as if in head" code clone
6120 $parse_rcdata->(RCDATA_CONTENT_MODEL);
6121 next B;
6122 } elsif ($token->{tag_name} eq 'body') {
6123 !!!parse-error (type => 'in body:body', token => $token);
6124
6125 if (@{$self->{open_elements}} == 1 or
6126 not ($self->{open_elements}->[1]->[1] & BODY_EL)) {
6127 !!!cp ('t342');
6128 ## Ignore the token
6129 } else {
6130 my $body_el = $self->{open_elements}->[1]->[0];
6131 for my $attr_name (keys %{$token->{attributes}}) {
6132 unless ($body_el->has_attribute_ns (undef, $attr_name)) {
6133 !!!cp ('t343');
6134 $body_el->set_attribute_ns
6135 (undef, [undef, $attr_name],
6136 $token->{attributes}->{$attr_name}->{value});
6137 }
6138 }
6139 }
6140 !!!nack ('t343.1');
6141 !!!next-token;
6142 next B;
6143 } elsif ({
6144 address => 1, blockquote => 1, center => 1, dir => 1,
6145 div => 1, dl => 1, fieldset => 1,
6146 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6147 menu => 1, ol => 1, p => 1, ul => 1,
6148 pre => 1, listing => 1,
6149 form => 1,
6150 table => 1,
6151 hr => 1,
6152 }->{$token->{tag_name}}) {
6153 if ($token->{tag_name} eq 'form' and defined $self->{form_element}) {
6154 !!!cp ('t350');
6155 !!!parse-error (type => 'in form:form', token => $token);
6156 ## Ignore the token
6157 !!!nack ('t350.1');
6158 !!!next-token;
6159 next B;
6160 }
6161
6162 ## has a p element in scope
6163 INSCOPE: for (reverse @{$self->{open_elements}}) {
6164 if ($_->[1] & P_EL) {
6165 !!!cp ('t344');
6166 !!!back-token; # <form>
6167 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6168 line => $token->{line}, column => $token->{column}};
6169 next B;
6170 } elsif ($_->[1] & SCOPING_EL) {
6171 !!!cp ('t345');
6172 last INSCOPE;
6173 }
6174 } # INSCOPE
6175
6176 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6177 if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') {
6178 !!!nack ('t346.1');
6179 !!!next-token;
6180 if ($token->{type} == CHARACTER_TOKEN) {
6181 $token->{data} =~ s/^\x0A//;
6182 unless (length $token->{data}) {
6183 !!!cp ('t346');
6184 !!!next-token;
6185 } else {
6186 !!!cp ('t349');
6187 }
6188 } else {
6189 !!!cp ('t348');
6190 }
6191 } elsif ($token->{tag_name} eq 'form') {
6192 !!!cp ('t347.1');
6193 $self->{form_element} = $self->{open_elements}->[-1]->[0];
6194
6195 !!!nack ('t347.2');
6196 !!!next-token;
6197 } elsif ($token->{tag_name} eq 'table') {
6198 !!!cp ('t382');
6199 push @{$open_tables}, [$self->{open_elements}->[-1]->[0]];
6200
6201 $self->{insertion_mode} = IN_TABLE_IM;
6202
6203 !!!nack ('t382.1');
6204 !!!next-token;
6205 } elsif ($token->{tag_name} eq 'hr') {
6206 !!!cp ('t386');
6207 pop @{$self->{open_elements}};
6208
6209 !!!nack ('t386.1');
6210 !!!next-token;
6211 } else {
6212 !!!nack ('t347.1');
6213 !!!next-token;
6214 }
6215 next B;
6216 } elsif ({li => 1, dt => 1, dd => 1}->{$token->{tag_name}}) {
6217 ## has a p element in scope
6218 INSCOPE: for (reverse @{$self->{open_elements}}) {
6219 if ($_->[1] & P_EL) {
6220 !!!cp ('t353');
6221 !!!back-token; # <x>
6222 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6223 line => $token->{line}, column => $token->{column}};
6224 next B;
6225 } elsif ($_->[1] & SCOPING_EL) {
6226 !!!cp ('t354');
6227 last INSCOPE;
6228 }
6229 } # INSCOPE
6230
6231 ## Step 1
6232 my $i = -1;
6233 my $node = $self->{open_elements}->[$i];
6234 my $li_or_dtdd = {li => {li => 1},
6235 dt => {dt => 1, dd => 1},
6236 dd => {dt => 1, dd => 1}}->{$token->{tag_name}};
6237 LI: {
6238 ## Step 2
6239 if ($li_or_dtdd->{$node->[0]->manakai_local_name}) {
6240 if ($i != -1) {
6241 !!!cp ('t355');
6242 !!!parse-error (type => 'not closed',
6243 value => $self->{open_elements}->[-1]->[0]
6244 ->manakai_local_name,
6245 token => $token);
6246 } else {
6247 !!!cp ('t356');
6248 }
6249 splice @{$self->{open_elements}}, $i;
6250 last LI;
6251 } else {
6252 !!!cp ('t357');
6253 }
6254
6255 ## Step 3
6256 if (not ($node->[1] & FORMATTING_EL) and
6257 #not $phrasing_category->{$node->[1]} and
6258 ($node->[1] & SPECIAL_EL or
6259 $node->[1] & SCOPING_EL) and
6260 not ($node->[1] & ADDRESS_EL) and
6261 not ($node->[1] & DIV_EL)) {
6262 !!!cp ('t358');
6263 last LI;
6264 }
6265
6266 !!!cp ('t359');
6267 ## Step 4
6268 $i--;
6269 $node = $self->{open_elements}->[$i];
6270 redo LI;
6271 } # LI
6272
6273 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6274 !!!nack ('t359.1');
6275 !!!next-token;
6276 next B;
6277 } elsif ($token->{tag_name} eq 'plaintext') {
6278 ## has a p element in scope
6279 INSCOPE: for (reverse @{$self->{open_elements}}) {
6280 if ($_->[1] & P_EL) {
6281 !!!cp ('t367');
6282 !!!back-token; # <plaintext>
6283 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6284 line => $token->{line}, column => $token->{column}};
6285 next B;
6286 } elsif ($_->[1] & SCOPING_EL) {
6287 !!!cp ('t368');
6288 last INSCOPE;
6289 }
6290 } # INSCOPE
6291
6292 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6293
6294 $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
6295
6296 !!!nack ('t368.1');
6297 !!!next-token;
6298 next B;
6299 } elsif ($token->{tag_name} eq 'a') {
6300 AFE: for my $i (reverse 0..$#$active_formatting_elements) {
6301 my $node = $active_formatting_elements->[$i];
6302 if ($node->[1] & A_EL) {
6303 !!!cp ('t371');
6304 !!!parse-error (type => 'in a:a', token => $token);
6305
6306 !!!back-token; # <a>
6307 $token = {type => END_TAG_TOKEN, tag_name => 'a',
6308 line => $token->{line}, column => $token->{column}};
6309 $formatting_end_tag->($token);
6310
6311 AFE2: for (reverse 0..$#$active_formatting_elements) {
6312 if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
6313 !!!cp ('t372');
6314 splice @$active_formatting_elements, $_, 1;
6315 last AFE2;
6316 }
6317 } # AFE2
6318 OE: for (reverse 0..$#{$self->{open_elements}}) {
6319 if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
6320 !!!cp ('t373');
6321 splice @{$self->{open_elements}}, $_, 1;
6322 last OE;
6323 }
6324 } # OE
6325 last AFE;
6326 } elsif ($node->[0] eq '#marker') {
6327 !!!cp ('t374');
6328 last AFE;
6329 }
6330 } # AFE
6331
6332 $reconstruct_active_formatting_elements->($insert_to_current);
6333
6334 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6335 push @$active_formatting_elements, $self->{open_elements}->[-1];
6336
6337 !!!nack ('t374.1');
6338 !!!next-token;
6339 next B;
6340 } elsif ($token->{tag_name} eq 'nobr') {
6341 $reconstruct_active_formatting_elements->($insert_to_current);
6342
6343 ## has a |nobr| element in scope
6344 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6345 my $node = $self->{open_elements}->[$_];
6346 if ($node->[1] & NOBR_EL) {
6347 !!!cp ('t376');
6348 !!!parse-error (type => 'in nobr:nobr', token => $token);
6349 !!!back-token; # <nobr>
6350 $token = {type => END_TAG_TOKEN, tag_name => 'nobr',
6351 line => $token->{line}, column => $token->{column}};
6352 next B;
6353 } elsif ($node->[1] & SCOPING_EL) {
6354 !!!cp ('t377');
6355 last INSCOPE;
6356 }
6357 } # INSCOPE
6358
6359 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6360 push @$active_formatting_elements, $self->{open_elements}->[-1];
6361
6362 !!!nack ('t377.1');
6363 !!!next-token;
6364 next B;
6365 } elsif ($token->{tag_name} eq 'button') {
6366 ## has a button element in scope
6367 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6368 my $node = $self->{open_elements}->[$_];
6369 if ($node->[1] & BUTTON_EL) {
6370 !!!cp ('t378');
6371 !!!parse-error (type => 'in button:button', token => $token);
6372 !!!back-token; # <button>
6373 $token = {type => END_TAG_TOKEN, tag_name => 'button',
6374 line => $token->{line}, column => $token->{column}};
6375 next B;
6376 } elsif ($node->[1] & SCOPING_EL) {
6377 !!!cp ('t379');
6378 last INSCOPE;
6379 }
6380 } # INSCOPE
6381
6382 $reconstruct_active_formatting_elements->($insert_to_current);
6383
6384 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6385
6386 ## TODO: associate with $self->{form_element} if defined
6387
6388 push @$active_formatting_elements, ['#marker', ''];
6389
6390 !!!nack ('t379.1');
6391 !!!next-token;
6392 next B;
6393 } elsif ({
6394 xmp => 1,
6395 iframe => 1,
6396 noembed => 1,
6397 noframes => 1,
6398 noscript => 0, ## TODO: 1 if scripting is enabled
6399 }->{$token->{tag_name}}) {
6400 if ($token->{tag_name} eq 'xmp') {
6401 !!!cp ('t381');
6402 $reconstruct_active_formatting_elements->($insert_to_current);
6403 } else {
6404 !!!cp ('t399');
6405 }
6406 ## NOTE: There is an "as if in body" code clone.
6407 $parse_rcdata->(CDATA_CONTENT_MODEL);
6408 next B;
6409 } elsif ($token->{tag_name} eq 'isindex') {
6410 !!!parse-error (type => 'isindex', token => $token);
6411
6412 if (defined $self->{form_element}) {
6413 !!!cp ('t389');
6414 ## Ignore the token
6415 !!!nack ('t389'); ## NOTE: Not acknowledged.
6416 !!!next-token;
6417 next B;
6418 } else {
6419 my $at = $token->{attributes};
6420 my $form_attrs;
6421 $form_attrs->{action} = $at->{action} if $at->{action};
6422 my $prompt_attr = $at->{prompt};
6423 $at->{name} = {name => 'name', value => 'isindex'};
6424 delete $at->{action};
6425 delete $at->{prompt};
6426 my @tokens = (
6427 {type => START_TAG_TOKEN, tag_name => 'form',
6428 attributes => $form_attrs,
6429 line => $token->{line}, column => $token->{column}},
6430 {type => START_TAG_TOKEN, tag_name => 'hr',
6431 line => $token->{line}, column => $token->{column}},
6432 {type => START_TAG_TOKEN, tag_name => 'p',
6433 line => $token->{line}, column => $token->{column}},
6434 {type => START_TAG_TOKEN, tag_name => 'label',
6435 line => $token->{line}, column => $token->{column}},
6436 );
6437 if ($prompt_attr) {
6438 !!!cp ('t390');
6439 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value},
6440 #line => $token->{line}, column => $token->{column},
6441 };
6442 } else {
6443 !!!cp ('t391');
6444 push @tokens, {type => CHARACTER_TOKEN,
6445 data => 'This is a searchable index. Insert your search keywords here: ',
6446 #line => $token->{line}, column => $token->{column},
6447 }; # SHOULD
6448 ## TODO: make this configurable
6449 }
6450 push @tokens,
6451 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at,
6452 line => $token->{line}, column => $token->{column}},
6453 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
6454 {type => END_TAG_TOKEN, tag_name => 'label',
6455 line => $token->{line}, column => $token->{column}},
6456 {type => END_TAG_TOKEN, tag_name => 'p',
6457 line => $token->{line}, column => $token->{column}},
6458 {type => START_TAG_TOKEN, tag_name => 'hr',
6459 line => $token->{line}, column => $token->{column}},
6460 {type => END_TAG_TOKEN, tag_name => 'form',
6461 line => $token->{line}, column => $token->{column}};
6462 !!!nack ('t391.1'); ## NOTE: Not acknowledged.
6463 !!!back-token (@tokens);
6464 !!!next-token;
6465 next B;
6466 }
6467 } elsif ($token->{tag_name} eq 'textarea') {
6468 my $tag_name = $token->{tag_name};
6469 my $el;
6470 !!!create-element ($el, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
6471
6472 ## TODO: $self->{form_element} if defined
6473 $self->{content_model} = RCDATA_CONTENT_MODEL;
6474 delete $self->{escape}; # MUST
6475
6476 $insert->($el);
6477
6478 my $text = '';
6479 !!!nack ('t392.1');
6480 !!!next-token;
6481 if ($token->{type} == CHARACTER_TOKEN) {
6482 $token->{data} =~ s/^\x0A//;
6483 unless (length $token->{data}) {
6484 !!!cp ('t392');
6485 !!!next-token;
6486 } else {
6487 !!!cp ('t393');
6488 }
6489 } else {
6490 !!!cp ('t394');
6491 }
6492 while ($token->{type} == CHARACTER_TOKEN) {
6493 !!!cp ('t395');
6494 $text .= $token->{data};
6495 !!!next-token;
6496 }
6497 if (length $text) {
6498 !!!cp ('t396');
6499 $el->manakai_append_text ($text);
6500 }
6501
6502 $self->{content_model} = PCDATA_CONTENT_MODEL;
6503
6504 if ($token->{type} == END_TAG_TOKEN and
6505 $token->{tag_name} eq $tag_name) {
6506 !!!cp ('t397');
6507 ## Ignore the token
6508 } else {
6509 !!!cp ('t398');
6510 !!!parse-error (type => 'in RCDATA:#'.$token->{type}, token => $token);
6511 }
6512 !!!next-token;
6513 next B;
6514 } elsif ($token->{tag_name} eq 'math' or
6515 $token->{tag_name} eq 'svg') {
6516 $reconstruct_active_formatting_elements->($insert_to_current);
6517
6518 ## "adjust SVG attributes" ('svg' only) - done in insert-element-f
6519
6520 ## "adjust foreign attributes" - done in insert-element-f
6521
6522 !!!insert-element-f ($token->{tag_name} eq 'math' ? $MML_NS : $SVG_NS, $token->{tag_name}, $token->{attributes}, $token);
6523
6524 if ($self->{self_closing}) {
6525 pop @{$self->{open_elements}};
6526 !!!ack ('t398.1');
6527 } else {
6528 !!!cp ('t398.2');
6529 $self->{insertion_mode} |= IN_FOREIGN_CONTENT_IM;
6530 ## NOTE: |<body><math><mi><svg>| -> "in foreign content" insertion
6531 ## mode, "in body" (not "in foreign content") secondary insertion
6532 ## mode, maybe.
6533 }
6534
6535 !!!next-token;
6536 next B;
6537 } elsif ({
6538 caption => 1, col => 1, colgroup => 1, frame => 1,
6539 frameset => 1, head => 1, option => 1, optgroup => 1,
6540 tbody => 1, td => 1, tfoot => 1, th => 1,
6541 thead => 1, tr => 1,
6542 }->{$token->{tag_name}}) {
6543 !!!cp ('t401');
6544 !!!parse-error (type => 'in body:'.$token->{tag_name}, token => $token);
6545 ## Ignore the token
6546 !!!nack ('t401.1'); ## NOTE: |<col/>| or |<frame/>| here is an error.
6547 !!!next-token;
6548 next B;
6549
6550 ## ISSUE: An issue on HTML5 new elements in the spec.
6551 } else {
6552 if ($token->{tag_name} eq 'image') {
6553 !!!cp ('t384');
6554 !!!parse-error (type => 'image', token => $token);
6555 $token->{tag_name} = 'img';
6556 } else {
6557 !!!cp ('t385');
6558 }
6559
6560 ## NOTE: There is an "as if <br>" code clone.
6561 $reconstruct_active_formatting_elements->($insert_to_current);
6562
6563 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6564
6565 if ({
6566 applet => 1, marquee => 1, object => 1,
6567 }->{$token->{tag_name}}) {
6568 !!!cp ('t380');
6569 push @$active_formatting_elements, ['#marker', ''];
6570 !!!nack ('t380.1');
6571 } elsif ({
6572 b => 1, big => 1, em => 1, font => 1, i => 1,
6573 s => 1, small => 1, strile => 1,
6574 strong => 1, tt => 1, u => 1,
6575 }->{$token->{tag_name}}) {
6576 !!!cp ('t375');
6577 push @$active_formatting_elements, $self->{open_elements}->[-1];
6578 !!!nack ('t375.1');
6579 } elsif ($token->{tag_name} eq 'input') {
6580 !!!cp ('t388');
6581 ## TODO: associate with $self->{form_element} if defined
6582 pop @{$self->{open_elements}};
6583 !!!ack ('t388.2');
6584 } elsif ({
6585 area => 1, basefont => 1, bgsound => 1, br => 1,
6586 embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
6587 #image => 1,
6588 }->{$token->{tag_name}}) {
6589 !!!cp ('t388.1');
6590 pop @{$self->{open_elements}};
6591 !!!ack ('t388.3');
6592 } elsif ($token->{tag_name} eq 'select') {
6593 ## TODO: associate with $self->{form_element} if defined
6594
6595 if ($self->{insertion_mode} & TABLE_IMS or
6596 $self->{insertion_mode} & BODY_TABLE_IMS or
6597 $self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
6598 !!!cp ('t400.1');
6599 $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM;
6600 } else {
6601 !!!cp ('t400.2');
6602 $self->{insertion_mode} = IN_SELECT_IM;
6603 }
6604 !!!nack ('t400.3');
6605 } else {
6606 !!!nack ('t402');
6607 }
6608
6609 !!!next-token;
6610 next B;
6611 }
6612 } elsif ($token->{type} == END_TAG_TOKEN) {
6613 if ($token->{tag_name} eq 'body') {
6614 ## has a |body| element in scope
6615 my $i;
6616 INSCOPE: {
6617 for (reverse @{$self->{open_elements}}) {
6618 if ($_->[1] & BODY_EL) {
6619 !!!cp ('t405');
6620 $i = $_;
6621 last INSCOPE;
6622 } elsif ($_->[1] & SCOPING_EL) {
6623 !!!cp ('t405.1');
6624 last;
6625 }
6626 }
6627
6628 !!!parse-error (type => 'start tag not allowed',
6629 value => $token->{tag_name}, token => $token);
6630 ## NOTE: Ignore the token.
6631 !!!next-token;
6632 next B;
6633 } # INSCOPE
6634
6635 for (@{$self->{open_elements}}) {
6636 unless ($_->[1] & ALL_END_TAG_OPTIONAL_EL) {
6637 !!!cp ('t403');
6638 !!!parse-error (type => 'not closed',
6639 value => $_->[0]->manakai_local_name,
6640 token => $token);
6641 last;
6642 } else {
6643 !!!cp ('t404');
6644 }
6645 }
6646
6647 $self->{insertion_mode} = AFTER_BODY_IM;
6648 !!!next-token;
6649 next B;
6650 } elsif ($token->{tag_name} eq 'html') {
6651 ## TODO: Update this code. It seems that the code below is not
6652 ## up-to-date, though it has same effect as speced.
6653 if (@{$self->{open_elements}} > 1 and
6654 $self->{open_elements}->[1]->[1] & BODY_EL) {
6655 ## ISSUE: There is an issue in the spec.
6656 unless ($self->{open_elements}->[-1]->[1] & BODY_EL) {
6657 !!!cp ('t406');
6658 !!!parse-error (type => 'not closed',
6659 value => $self->{open_elements}->[1]->[0]
6660 ->manakai_local_name,
6661 token => $token);
6662 } else {
6663 !!!cp ('t407');
6664 }
6665 $self->{insertion_mode} = AFTER_BODY_IM;
6666 ## reprocess
6667 next B;
6668 } else {
6669 !!!cp ('t408');
6670 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6671 ## Ignore the token
6672 !!!next-token;
6673 next B;
6674 }
6675 } elsif ({
6676 address => 1, blockquote => 1, center => 1, dir => 1,
6677 div => 1, dl => 1, fieldset => 1, listing => 1,
6678 menu => 1, ol => 1, pre => 1, ul => 1,
6679 dd => 1, dt => 1, li => 1,
6680 applet => 1, button => 1, marquee => 1, object => 1,
6681 }->{$token->{tag_name}}) {
6682 ## has an element in scope
6683 my $i;
6684 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6685 my $node = $self->{open_elements}->[$_];
6686 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6687 !!!cp ('t410');
6688 $i = $_;
6689 last INSCOPE;
6690 } elsif ($node->[1] & SCOPING_EL) {
6691 !!!cp ('t411');
6692 last INSCOPE;
6693 }
6694 } # INSCOPE
6695
6696 unless (defined $i) { # has an element in scope
6697 !!!cp ('t413');
6698 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6699 } else {
6700 ## Step 1. generate implied end tags
6701 while ({
6702 dd => ($token->{tag_name} ne 'dd'),
6703 dt => ($token->{tag_name} ne 'dt'),
6704 li => ($token->{tag_name} ne 'li'),
6705 p => 1,
6706 }->{$self->{open_elements}->[-1]->[0]->manakai_local_name}) {
6707 !!!cp ('t409');
6708 pop @{$self->{open_elements}};
6709 }
6710
6711 ## Step 2.
6712 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6713 ne $token->{tag_name}) {
6714 !!!cp ('t412');
6715 !!!parse-error (type => 'not closed',
6716 value => $self->{open_elements}->[-1]->[0]
6717 ->manakai_local_name,
6718 token => $token);
6719 } else {
6720 !!!cp ('t414');
6721 }
6722
6723 ## Step 3.
6724 splice @{$self->{open_elements}}, $i;
6725
6726 ## Step 4.
6727 $clear_up_to_marker->()
6728 if {
6729 applet => 1, button => 1, marquee => 1, object => 1,
6730 }->{$token->{tag_name}};
6731 }
6732 !!!next-token;
6733 next B;
6734 } elsif ($token->{tag_name} eq 'form') {
6735 undef $self->{form_element};
6736
6737 ## has an element in scope
6738 my $i;
6739 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6740 my $node = $self->{open_elements}->[$_];
6741 if ($node->[1] & FORM_EL) {
6742 !!!cp ('t418');
6743 $i = $_;
6744 last INSCOPE;
6745 } elsif ($node->[1] & SCOPING_EL) {
6746 !!!cp ('t419');
6747 last INSCOPE;
6748 }
6749 } # INSCOPE
6750
6751 unless (defined $i) { # has an element in scope
6752 !!!cp ('t421');
6753 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6754 } else {
6755 ## Step 1. generate implied end tags
6756 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
6757 !!!cp ('t417');
6758 pop @{$self->{open_elements}};
6759 }
6760
6761 ## Step 2.
6762 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6763 ne $token->{tag_name}) {
6764 !!!cp ('t417.1');
6765 !!!parse-error (type => 'not closed',
6766 value => $self->{open_elements}->[-1]->[0]
6767 ->manakai_local_name,
6768 token => $token);
6769 } else {
6770 !!!cp ('t420');
6771 }
6772
6773 ## Step 3.
6774 splice @{$self->{open_elements}}, $i;
6775 }
6776
6777 !!!next-token;
6778 next B;
6779 } elsif ({
6780 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6781 }->{$token->{tag_name}}) {
6782 ## has an element in scope
6783 my $i;
6784 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6785 my $node = $self->{open_elements}->[$_];
6786 if ($node->[1] & HEADING_EL) {
6787 !!!cp ('t423');
6788 $i = $_;
6789 last INSCOPE;
6790 } elsif ($node->[1] & SCOPING_EL) {
6791 !!!cp ('t424');
6792 last INSCOPE;
6793 }
6794 } # INSCOPE
6795
6796 unless (defined $i) { # has an element in scope
6797 !!!cp ('t425.1');
6798 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6799 } else {
6800 ## Step 1. generate implied end tags
6801 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
6802 !!!cp ('t422');
6803 pop @{$self->{open_elements}};
6804 }
6805
6806 ## Step 2.
6807 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6808 ne $token->{tag_name}) {
6809 !!!cp ('t425');
6810 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6811 } else {
6812 !!!cp ('t426');
6813 }
6814
6815 ## Step 3.
6816 splice @{$self->{open_elements}}, $i;
6817 }
6818
6819 !!!next-token;
6820 next B;
6821 } elsif ($token->{tag_name} eq 'p') {
6822 ## has an element in scope
6823 my $i;
6824 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6825 my $node = $self->{open_elements}->[$_];
6826 if ($node->[1] & P_EL) {
6827 !!!cp ('t410.1');
6828 $i = $_;
6829 last INSCOPE;
6830 } elsif ($node->[1] & SCOPING_EL) {
6831 !!!cp ('t411.1');
6832 last INSCOPE;
6833 }
6834 } # INSCOPE
6835
6836 if (defined $i) {
6837 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6838 ne $token->{tag_name}) {
6839 !!!cp ('t412.1');
6840 !!!parse-error (type => 'not closed',
6841 value => $self->{open_elements}->[-1]->[0]
6842 ->manakai_local_name,
6843 token => $token);
6844 } else {
6845 !!!cp ('t414.1');
6846 }
6847
6848 splice @{$self->{open_elements}}, $i;
6849 } else {
6850 !!!cp ('t413.1');
6851 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6852
6853 !!!cp ('t415.1');
6854 ## As if <p>, then reprocess the current token
6855 my $el;
6856 !!!create-element ($el, $HTML_NS, 'p',, $token);
6857 $insert->($el);
6858 ## NOTE: Not inserted into |$self->{open_elements}|.
6859 }
6860
6861 !!!next-token;
6862 next B;
6863 } elsif ({
6864 a => 1,
6865 b => 1, big => 1, em => 1, font => 1, i => 1,
6866 nobr => 1, s => 1, small => 1, strile => 1,
6867 strong => 1, tt => 1, u => 1,
6868 }->{$token->{tag_name}}) {
6869 !!!cp ('t427');
6870 $formatting_end_tag->($token);
6871 next B;
6872 } elsif ($token->{tag_name} eq 'br') {
6873 !!!cp ('t428');
6874 !!!parse-error (type => 'unmatched end tag:br', token => $token);
6875
6876 ## As if <br>
6877 $reconstruct_active_formatting_elements->($insert_to_current);
6878
6879 my $el;
6880 !!!create-element ($el, $HTML_NS, 'br',, $token);
6881 $insert->($el);
6882
6883 ## Ignore the token.
6884 !!!next-token;
6885 next B;
6886 } elsif ({
6887 caption => 1, col => 1, colgroup => 1, frame => 1,
6888 frameset => 1, head => 1, option => 1, optgroup => 1,
6889 tbody => 1, td => 1, tfoot => 1, th => 1,
6890 thead => 1, tr => 1,
6891 area => 1, basefont => 1, bgsound => 1,
6892 embed => 1, hr => 1, iframe => 1, image => 1,
6893 img => 1, input => 1, isindex => 1, noembed => 1,
6894 noframes => 1, param => 1, select => 1, spacer => 1,
6895 table => 1, textarea => 1, wbr => 1,
6896 noscript => 0, ## TODO: if scripting is enabled
6897 }->{$token->{tag_name}}) {
6898 !!!cp ('t429');
6899 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6900 ## Ignore the token
6901 !!!next-token;
6902 next B;
6903
6904 ## ISSUE: Issue on HTML5 new elements in spec
6905
6906 } else {
6907 ## Step 1
6908 my $node_i = -1;
6909 my $node = $self->{open_elements}->[$node_i];
6910
6911 ## Step 2
6912 S2: {
6913 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6914 ## Step 1
6915 ## generate implied end tags
6916 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
6917 !!!cp ('t430');
6918 ## ISSUE: Can this case be reached?
6919 pop @{$self->{open_elements}};
6920 }
6921
6922 ## Step 2
6923 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
6924 ne $token->{tag_name}) {
6925 !!!cp ('t431');
6926 ## NOTE: <x><y></x>
6927 !!!parse-error (type => 'not closed',
6928 value => $self->{open_elements}->[-1]->[0]
6929 ->manakai_local_name,
6930 token => $token);
6931 } else {
6932 !!!cp ('t432');
6933 }
6934
6935 ## Step 3
6936 splice @{$self->{open_elements}}, $node_i;
6937
6938 !!!next-token;
6939 last S2;
6940 } else {
6941 ## Step 3
6942 if (not ($node->[1] & FORMATTING_EL) and
6943 #not $phrasing_category->{$node->[1]} and
6944 ($node->[1] & SPECIAL_EL or
6945 $node->[1] & SCOPING_EL)) {
6946 !!!cp ('t433');
6947 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
6948 ## Ignore the token
6949 !!!next-token;
6950 last S2;
6951 }
6952
6953 !!!cp ('t434');
6954 }
6955
6956 ## Step 4
6957 $node_i--;
6958 $node = $self->{open_elements}->[$node_i];
6959
6960 ## Step 5;
6961 redo S2;
6962 } # S2
6963 next B;
6964 }
6965 }
6966 next B;
6967 } continue { # B
6968 if ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
6969 ## NOTE: The code below is executed in cases where it does not have
6970 ## to be, but it is harmless even in those cases.
6971 ## has an element in scope
6972 INSCOPE: {
6973 for (reverse 0..$#{$self->{open_elements}}) {
6974 my $node = $self->{open_elements}->[$_];
6975 if ($node->[1] & FOREIGN_EL) {
6976 last INSCOPE;
6977 } elsif ($node->[1] & SCOPING_EL) {
6978 last;
6979 }
6980 }
6981
6982 ## NOTE: No foreign element in scope.
6983 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
6984 } # INSCOPE
6985 }
6986 } # B
6987
6988 ## Stop parsing # MUST
6989
6990 ## TODO: script stuffs
6991 } # _tree_construct_main
6992
## Replace the content of |$node| by the result of parsing a string
## as HTML.
##
##   $class->set_inner_html ($node, $string, $onerror);
##
## |$node|    - A document node (node type 9) or an element node (node
##              type 1).  For a document, all of its children are
##              removed and the string is parsed into it by
##              |parse_string|.  For an element, the string is parsed
##              in a fresh document with |$node| as the context
##              element and the resulting nodes replace the children
##              of |$node|.
## |$string|  - The string to parse.  It is accessed through a
##              reference to the caller's argument and is not copied.
## |$onerror| - Optional code reference invoked for each parse error
##              with name/value pairs (|type|, |line|, |column|,
##              |token|, ...).  If omitted, errors are reported by
##              |warn|.
##
## Dies if |$node| is of any other node type.
sub set_inner_html ($$$) {
  my $class = shift;
  my $node = shift;
  my $s = \$_[0];
  my $onerror = $_[1];

  ## ISSUE: Should {confident} be true?

  my $nt = $node->node_type;
  if ($nt == 9) {
    # MUST

    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.

    ## Step 2 # MUST
    ## NOTE: Iterate over a copy, since the child list changes while
    ## nodes are being removed.
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }

    ## Step 3, 4, 5 # MUST
    $class->parse_string ($$s => $node, $onerror);
  } elsif ($nt == 1) {
    ## TODO: If non-html element

    ## NOTE: Most of this code is copied from |parse_string|

    ## Step 1 # MUST
    my $this_doc = $node->owner_document;
    my $doc = $this_doc->implementation->create_document;
    $doc->manakai_is_html (1);
    my $p = $class->new;
    $p->{document} = $doc;

    ## Step 8 # MUST
    my $i = 0;
    $p->{line_prev} = $p->{line} = 1;
    $p->{column_prev} = $p->{column} = 0;
    ## NOTE: The closure below only touches the parser through its
    ## own |$self| argument.  It must not capture |$p|: the parser
    ## holds this closure, so capturing |$p| would form a reference
    ## loop that is never broken and the parser would be leaked.
    $p->{set_next_char} = sub {
      my $self = shift;

      pop @{$self->{prev_char}};
      unshift @{$self->{prev_char}}, $self->{next_char};

      ## End of input
      if ($i >= length $$s) {
        $self->{next_char} = -1;
        return;
      }
      $self->{next_char} = ord substr $$s, $i++, 1;

      ($self->{line_prev}, $self->{column_prev})
          = ($self->{line}, $self->{column});
      $self->{column}++;

      if ($self->{next_char} == 0x000A) { # LF
        $self->{line}++;
        $self->{column} = 0;
        !!!cp ('i1');
      } elsif ($self->{next_char} == 0x000D) { # CR
        ## CR LF is treated as a single newline.
        $i++ if substr ($$s, $i, 1) eq "\x0A";
        $self->{next_char} = 0x000A; # LF # MUST
        $self->{line}++;
        $self->{column} = 0;
        !!!cp ('i2');
      } elsif ($self->{next_char} > 0x10FFFF) {
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
        !!!cp ('i3');
      } elsif ($self->{next_char} == 0x0000) { # NULL
        !!!cp ('i4');
        !!!parse-error (type => 'NULL');
        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
      }
    };
    $p->{prev_char} = [-1, -1, -1];
    $p->{next_char} = -1;

    ## Default error handler: report by |warn|, preferring the
    ## position of the token (if any) to the current input position.
    my $ponerror = $onerror || sub {
      my (%opt) = @_;
      my $line = $opt{line};
      my $column = $opt{column};
      if (defined $opt{token} and defined $opt{token}->{line}) {
        $line = $opt{token}->{line};
        $column = $opt{token}->{column};
      }
      warn "Parse error ($opt{type}) at line $line column $column\n";
    };
    ## NOTE: This closure captures |$p| to supply the current
    ## position; the loop is broken at the end of this branch.
    $p->{parse_error} = sub {
      $ponerror->(line => $p->{line}, column => $p->{column}, @_);
    };

    $p->_initialize_tokenizer;
    $p->_initialize_tree_constructor;

    ## Step 2
    ## The initial content model of the tokenizer is chosen by the
    ## local name of the context element.
    my $node_ln = $node->manakai_local_name;
    $p->{content_model} = {
      title => RCDATA_CONTENT_MODEL,
      textarea => RCDATA_CONTENT_MODEL,
      style => CDATA_CONTENT_MODEL,
      script => CDATA_CONTENT_MODEL,
      xmp => CDATA_CONTENT_MODEL,
      iframe => CDATA_CONTENT_MODEL,
      noembed => CDATA_CONTENT_MODEL,
      noframes => CDATA_CONTENT_MODEL,
      noscript => CDATA_CONTENT_MODEL,
      plaintext => PLAINTEXT_CONTENT_MODEL,
    }->{$node_ln};
    $p->{content_model} = PCDATA_CONTENT_MODEL
        unless defined $p->{content_model};
        ## ISSUE: What is "the name of the element"? local name?

    $p->{inner_html_node} = [$node, $el_category->{$node_ln}];
      ## TODO: Foreign element OK?

    ## Step 3
    my $root = $doc->create_element_ns ($HTML_NS, [undef, 'html']);

    ## Step 4 # MUST
    $doc->append_child ($root);

    ## Step 5 # MUST
    push @{$p->{open_elements}}, [$root, $el_category->{html}];

    undef $p->{head_element};

    ## Step 6 # MUST
    $p->_reset_insertion_mode;

    ## Step 7 # MUST
    ## The form element pointer is set to the nearest HTML |form|
    ## element among the context element and its ancestors, if any.
    my $anode = $node;
    AN: while (defined $anode) {
      if ($anode->node_type == 1) {
        my $nsuri = $anode->namespace_uri;
        if (defined $nsuri and $nsuri eq $HTML_NS) {
          if ($anode->manakai_local_name eq 'form') {
            !!!cp ('i5');
            $p->{form_element} = $anode;
            last AN;
          }
        }
      }
      $anode = $anode->parent_node;
    } # AN

    ## Step 9 # MUST
    {
      ## NOTE: The |!!!next-token| macro operates on |$self|.
      my $self = $p;
      !!!next-token;
    }
    $p->_tree_construction_main;

    ## Step 10 # MUST
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }
    ## ISSUE: mutation events? read-only?

    ## Step 11 # MUST
    @cn = @{$root->child_nodes};
    for (@cn) {
      $this_doc->adopt_node ($_);
      $node->append_child ($_);
    }
    ## ISSUE: mutation events?

    $p->_terminate_tree_constructor;

    delete $p->{parse_error}; # delete loop
  } else {
    die "$0: |set_inner_html| is not defined for node of type $nt";
  }
} # set_inner_html
7165
7166 } # tree construction stage
7167
## |Whatpm::HTML::RestartParser| is a subclass of |Error|, the
## exception base class loaded by |use Error| at the top of this file.
package Whatpm::HTML::RestartParser;
our @ISA;
push @ISA, qw(Error);

1;
7172 # $Date: 2008/04/12 15:47:13 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24