/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.171 - (hide annotations) (download) (as text)
Sun Sep 14 01:51:08 2008 UTC (16 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.170: +53 -5 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/ChangeLog	14 Sep 2008 01:47:27 -0000
2008-09-14  Wakaba  <wakaba@suika.fam.cx>

	* HTML.pm.src (parse_char_string): Use the newly created
	|Whatpm::Charset::DecodeHandle::CharString| instead of Perl's
	standard feature to |open| a string as a filehandle,
	since Perl's string filehandles do not seem to support the |ungetc|
	method correctly.
	(parse_char_stream): Define |{getc_until}| method.
	(DATA_STATE): Experimental support for |getc_until| feature.

++ whatpm/Whatpm/Charset/ChangeLog	14 Sep 2008 01:50:52 -0000
2008-09-14  Wakaba  <wakaba@suika.fam.cx>

	* DecodeHandle.pm (CharString): New class.
	(Encode read): Don't remove the read string from |{char_buffer}|,
	to decrease the number of string operations and to enable
	|manakai_getc_until| ungetc'ing without any string operation.
	(manakai_getc_until): New method.

	* UnicodeChecker.pm (getc): Don't |read| more than one
	character, to prevent characters from being buffered,
	so that mixing |getc| and |manakai_getc_until|
	calls does not break the result.

1 wakaba 1.2 package Whatpm::HTML;
2 wakaba 1.1 use strict;
3 wakaba 1.171 our $VERSION=do{my @r=(q$Revision: 1.170 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.63 use Error qw(:try);
5 wakaba 1.1
6 wakaba 1.18 ## ISSUE:
7     ## var doc = implementation.createDocument (null, null, null);
8     ## doc.write ('');
9     ## alert (doc.compatMode);
10 wakaba 1.1
11 wakaba 1.139 require IO::Handle;
12    
## Namespace URIs referenced throughout the parser.
my $HTML_NS  = 'http://www.w3.org/1999/xhtml';
my $MML_NS   = 'http://www.w3.org/1998/Math/MathML';
my $SVG_NS   = 'http://www.w3.org/2000/svg';
my $XLINK_NS = 'http://www.w3.org/1999/xlink';
my $XML_NS   = 'http://www.w3.org/XML/1998/namespace';
my $XMLNS_NS = 'http://www.w3.org/2000/xmlns/';
19    
## Element category flags.  Each constant occupies exactly one bit so
## that categories can be combined with |'|'| and tested with |&|.
sub A_EL ()                    { 1 <<  0 }
sub ADDRESS_EL ()              { 1 <<  1 }
sub BODY_EL ()                 { 1 <<  2 }
sub BUTTON_EL ()               { 1 <<  3 }
sub CAPTION_EL ()              { 1 <<  4 }
sub DD_EL ()                   { 1 <<  5 }
sub DIV_EL ()                  { 1 <<  6 }
sub DT_EL ()                   { 1 <<  7 }
sub FORM_EL ()                 { 1 <<  8 }
sub FORMATTING_EL ()           { 1 <<  9 }
sub FRAMESET_EL ()             { 1 << 10 }
sub HEADING_EL ()              { 1 << 11 }
sub HTML_EL ()                 { 1 << 12 }
sub LI_EL ()                   { 1 << 13 }
sub NOBR_EL ()                 { 1 << 14 }
sub OPTION_EL ()               { 1 << 15 }
sub OPTGROUP_EL ()             { 1 << 16 }
sub P_EL ()                    { 1 << 17 }
sub SELECT_EL ()               { 1 << 18 }
sub TABLE_EL ()                { 1 << 19 }
sub TABLE_CELL_EL ()           { 1 << 20 }
sub TABLE_ROW_EL ()            { 1 << 21 }
sub TABLE_ROW_GROUP_EL ()      { 1 << 22 }
sub MISC_SCOPING_EL ()         { 1 << 23 }
sub MISC_SPECIAL_EL ()         { 1 << 24 }
sub FOREIGN_EL ()              { 1 << 25 }
sub FOREIGN_FLOW_CONTENT_EL () { 1 << 26 }
sub MML_AXML_EL ()             { 1 << 27 }
sub RUBY_EL ()                 { 1 << 28 }
sub RUBY_COMPONENT_EL ()       { 1 << 29 }
50 wakaba 1.123
## Composite category masks, each the bitwise OR of single-bit
## element category flags.

sub TABLE_ROWS_EL () {
  TABLE_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL
}

## NOTE: Used in "generate implied end tags" algorithm.
## NOTE: There is a code where a modified version of END_TAG_OPTIONAL_EL
## is used in "generate implied end tags" implementation (search for the
## function mae).
sub END_TAG_OPTIONAL_EL () {
  DD_EL | DT_EL | LI_EL | P_EL | RUBY_COMPONENT_EL
}

## NOTE: Used in </body> and EOF algorithms.
sub ALL_END_TAG_OPTIONAL_EL () {
  DD_EL | DT_EL | LI_EL | P_EL
      | BODY_EL | HTML_EL
      | TABLE_CELL_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL
}

sub SCOPING_EL () {
  BUTTON_EL | CAPTION_EL | HTML_EL | TABLE_EL | TABLE_CELL_EL
      | MISC_SCOPING_EL
}

sub TABLE_SCOPING_EL () { HTML_EL | TABLE_EL }

sub TABLE_ROWS_SCOPING_EL () { HTML_EL | TABLE_ROW_GROUP_EL }

sub TABLE_ROW_SCOPING_EL () { HTML_EL | TABLE_ROW_EL }

sub SPECIAL_EL () {
  ADDRESS_EL | BODY_EL | DIV_EL
      | DD_EL | DT_EL | LI_EL | P_EL
      | FORM_EL | FRAMESET_EL | HEADING_EL
      | OPTION_EL | OPTGROUP_EL | SELECT_EL
      | TABLE_ROW_EL | TABLE_ROW_GROUP_EL
      | MISC_SPECIAL_EL
}
127    
## Maps the local name of an HTML element to its category bit mask.
## Entries are grouped by category; the resulting table is the same
## name => mask mapping regardless of grouping.
my $el_category = {
  ## Elements that have a category flag of their own.
  address  => ADDRESS_EL,
  body     => BODY_EL,
  button   => BUTTON_EL,
  caption  => CAPTION_EL,
  dd       => DD_EL,
  div      => DIV_EL,
  dt       => DT_EL,
  form     => FORM_EL,
  frameset => FRAMESET_EL,
  html     => HTML_EL,
  li       => LI_EL,
  optgroup => OPTGROUP_EL,
  option   => OPTION_EL,
  p        => P_EL,
  ruby     => RUBY_EL,
  select   => SELECT_EL,
  table    => TABLE_EL,
  tr       => TABLE_ROW_EL,

  ## Small families sharing one flag.
  (map { ($_ => HEADING_EL) } qw(h1 h2 h3 h4 h5 h6)),
  (map { ($_ => RUBY_COMPONENT_EL) } qw(rp rt)),
  (map { ($_ => TABLE_CELL_EL) } qw(td th)),
  (map { ($_ => TABLE_ROW_GROUP_EL) } qw(tbody tfoot thead)),

  ## Formatting elements; |a| and |nobr| carry an extra flag.
  a    => A_EL | FORMATTING_EL,
  nobr => NOBR_EL | FORMATTING_EL,
  (map { ($_ => FORMATTING_EL) } qw(
    b big em font i s small strike strong tt u
  )),

  ## Other scoping elements.
  (map { ($_ => MISC_SCOPING_EL) } qw(applet marquee object)),

  ## Other special elements.
  (map { ($_ => MISC_SPECIAL_EL) } qw(
    area base basefont bgsound blockquote br center col colgroup
    dir dl embed fieldset frame head hr iframe img input isindex
    link listing menu meta noembed noframes noscript ol param
    plaintext pre script spacer style textarea title ul wbr
  )),
};
215    
## Category bit masks for foreign (MathML and SVG) elements, keyed by
## namespace URI and then by element local name.
## NOTE: In addition, FOREIGN_EL is set to non-HTML elements.
my $el_category_f = {
  $MML_NS => {
    'annotation-xml' => MML_AXML_EL,
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(mi mo mn ms mtext)),
  },
  $SVG_NS => {
    (map { ($_ => FOREIGN_FLOW_CONTENT_EL) } qw(foreignObject desc title)),
  },
};
232    
## Maps an all-lowercase SVG attribute name to its mixed-case spelling.
## Every key is the lowercased form of its value, so the table is
## derived from the list of mixed-case names.
my $svg_attr_name = {
  map { (lc ($_) => $_) } qw(
    attributeName attributeType
    baseFrequency baseProfile
    calcMode clipPathUnits contentScriptType contentStyleType
    diffuseConstant
    edgeMode externalResourcesRequired
    filterRes filterUnits
    glyphRef gradientTransform gradientUnits
    kernelMatrix kernelUnitLength keyPoints keySplines keyTimes
    lengthAdjust limitingConeAngle
    markerHeight markerUnits markerWidth maskContentUnits maskUnits
    numOctaves
    pathLength patternContentUnits patternTransform patternUnits
    pointsAtX pointsAtY pointsAtZ
    preserveAlpha preserveAspectRatio primitiveUnits
    refX refY repeatCount repeatDur
    requiredExtensions requiredFeatures
    specularConstant specularExponent spreadMethod startOffset
    stdDeviation stitchTiles surfaceScale systemLanguage
    tableValues targetX targetY textLength
    viewBox viewTarget
    xChannelSelector yChannelSelector
    zoomAndPan
  )
};
297    
## Maps a qualified attribute name found on a foreign element to
## [namespace URI, [prefix, local name]].
my $foreign_attr_xname = {
  (map { ("xlink:$_" => [$XLINK_NS, ['xlink', $_]]) }
       qw(actuate arcrole href role show title type)),
  (map { ("xml:$_" => [$XML_NS, ['xml', $_]]) } qw(base lang space)),
  'xmlns'       => [$XMLNS_NS, [undef, 'xmlns']],
  'xmlns:xlink' => [$XMLNS_NS, ['xmlns', 'xlink']],
};
312    
313     ## ISSUE: xmlns:xlink="non-xlink-ns" is not an error.
314    
## Replacement code points for the code points 0x80-0x9F, keyed by the
## original code point.  0xFFFD (REPLACEMENT CHARACTER) is used for
## 0x81, 0x8D, 0x8F, 0x90, and 0x9D.
my $c1_entity_char = do {
  ## Listed in order, starting at 0x80.
  my @replacement = (
    0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80-0x87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, # 0x88-0x8F
    0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90-0x97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178, # 0x98-0x9F
  );
  +{ map { (0x80 + $_ => $replacement[$_]) } 0 .. $#replacement };
}; # $c1_entity_char
349 wakaba 1.1
## Parses a string of bytes as an HTML document.
##
## Arguments: the parser object (or class name), the charset name
## (may be |undef|), the byte string or a reference to it, followed by
## the arguments that |parse_byte_stream| takes after its byte stream;
## they are passed through unchanged.
##
## Returns whatever |parse_byte_stream| returns.  Dies if the string
## cannot be opened as an in-memory filehandle, instead of silently
## handing a closed handle to the parser.
sub parse_byte_string ($$$$;$) {
  my $self = shift;
  my $charset_name = shift;
  ## Alias the caller's string rather than copying it.
  my $bytes_ref = ref $_[0] ? $_[0] : \($_[0]);
  open my $input, '<', $bytes_ref
      or die "parse_byte_string: cannot open the byte string as a filehandle: $!\n";
  return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]);
} # parse_byte_string
356    
## Parses a stream of bytes as an HTML document.
##
## Arguments (see the commented-out list at the top of the body):
##   $self         - parser object, or a class name (a parser is then
##                   created with |new|)
##   $charset_name - charset name to try first, or |undef| to sniff
##   $byte_stream  - handle providing |getc|
##   $doc          - document, passed on to |parse_char_stream|
##   $onerror      - optional error callback; by default errors are
##                   reported with |warn|
##   $get_wrapper  - optional code reference applied to the decoded
##                   character stream before it is parsed
##
## Returns the value returned by |parse_char_stream|.  If the parser
## requests an encoding change (|Whatpm::HTML::RestartParser|), the
## parse is run a second time with the new decoder.
sub parse_byte_stream ($$$$;$$) {
  # my ($self, $charset_name, $byte_stream, $doc, $onerror, $get_wrapper) = @_;
  my $self = ref $_[0] ? shift : shift->new;
  my $charset_name = shift;
  my $byte_stream = $_[0];

  my $onerror = $_[2] || sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type})\n";
  };
  $self->{parse_error} = $onerror; # replaced later by parse_char_stream

  ## NOTE: Below, the wrapper is applied to the character stream
  ## obtained from the decoder; the default returns it unchanged.
  my $get_wrapper = $_[3] || sub ($) {
    return $_[0];
  };

  ## HTML5 encoding sniffing algorithm
  require Message::Charset::Info;
  my $charset;
  my $buffer;
  my ($char_stream, $e_status);

  ## Records the name of the current |$charset| as the input encoding
  ## and reports the limitations of its decoder, if any.  Used after
  ## the initial sniffing and again when the parser is restarted.
  my $note_decoder_status = sub {
    $self->{input_encoding} = $charset->get_iana_name; ## TODO: Should we set actual charset decoder's encoding name?
    if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
      !!!parse-error (type => 'chardecode:fallback',
                      #text => $self->{input_encoding},
                      level => $self->{level}->{uncertain},
                      line => 1, column => 1,
                      layer => 'encode');
    } elsif (not ($e_status &
                  Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
      !!!parse-error (type => 'chardecode:no error',
                      text => $self->{input_encoding},
                      level => $self->{level}->{uncertain},
                      line => 1, column => 1,
                      layer => 'encode');
    }
  }; # $note_decoder_status

  SNIFFING: {
    ## NOTE: By setting |allow_fallback| option true when the
    ## |get_decode_handle| method is invoked, we ignore what the HTML5
    ## spec requires, i.e. unsupported encoding should be ignored.
    ## TODO: We should not do this unless the parser is invoked
    ## in the conformance checking mode, in which this behavior
    ## would be useful.

    ## Step 1
    if (defined $charset_name) {
      $charset = Message::Charset::Info->get_by_html_name ($charset_name);
      ## TODO: Is this ok?  Transfer protocol's parameter should be
      ## interpreted in its semantics?

      ## ISSUE: Unsupported encoding is not ignored according to the spec.
      ($char_stream, $e_status) = $charset->get_decode_handle
          ($byte_stream, allow_error_reporting => 1,
           allow_fallback => 1);
      if ($char_stream) {
        $self->{confident} = 1;
        last SNIFFING;
      } else {
        ## TODO: unsupported error
      }
    }

    ## Step 2
    my $byte_buffer = '';
    for (1..1024) {
      my $char = $byte_stream->getc;
      last unless defined $char;
      $byte_buffer .= $char;
    } ## TODO: timeout

    ## Step 3: a byte order mark at the very beginning decides the
    ## encoding.
    for my $bom (["\xFE\xFF", 'utf-16be'],
                 ["\xFF\xFE", 'utf-16le'],
                 ["\xEF\xBB\xBF", 'utf-8']) {
      my ($signature, $bom_charset_name) = @$bom;
      next unless index ($byte_buffer, $signature) == 0;
      $charset = Message::Charset::Info->get_by_html_name ($bom_charset_name);
      ($char_stream, $e_status) = $charset->get_decode_handle
          ($byte_stream, allow_error_reporting => 1,
           allow_fallback => 1, byte_buffer => \$byte_buffer);
      $self->{confident} = 1;
      last SNIFFING;
    }

    ## Step 4
    ## TODO: <meta charset>

    ## Step 5
    ## TODO: from history

    ## Step 6
    require Whatpm::Charset::UniversalCharDet;
    $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string
        ($byte_buffer);
    if (defined $charset_name) {
      $charset = Message::Charset::Info->get_by_html_name ($charset_name);

      ## ISSUE: Unsupported encoding is not ignored according to the spec.
      require Whatpm::Charset::DecodeHandle;
      $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
          ($byte_stream);
      ($char_stream, $e_status) = $charset->get_decode_handle
          ($buffer, allow_error_reporting => 1,
           allow_fallback => 1, byte_buffer => \$byte_buffer);
      if ($char_stream) {
        $buffer->{buffer} = $byte_buffer;
        !!!parse-error (type => 'sniffing:chardet',
                        text => $charset_name,
                        level => $self->{level}->{info},
                        layer => 'encode',
                        line => 1, column => 1);
        $self->{confident} = 0;
        last SNIFFING;
      }
    }

    ## Step 7: default
    ## TODO: Make this configurable.
    $charset = Message::Charset::Info->get_by_html_name ('windows-1252');
    ## NOTE: We choose |windows-1252| here, since |utf-8| should be
    ## detectable in the step 6.
    require Whatpm::Charset::DecodeHandle;
    $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
        ($byte_stream);
    ($char_stream, $e_status)
        = $charset->get_decode_handle ($buffer,
                                       allow_error_reporting => 1,
                                       allow_fallback => 1,
                                       byte_buffer => \$byte_buffer);
    $buffer->{buffer} = $byte_buffer;
    !!!parse-error (type => 'sniffing:default',
                    text => 'windows-1252',
                    level => $self->{level}->{info},
                    line => 1, column => 1,
                    layer => 'encode');
    $self->{confident} = 0;
  } # SNIFFING

  $note_decoder_status->();

  $self->{change_encoding} = sub {
    my $self = shift;
    $charset_name = shift;
    my $token = shift;

    $charset = Message::Charset::Info->get_by_html_name ($charset_name);
    ($char_stream, $e_status) = $charset->get_decode_handle
        ($byte_stream, allow_error_reporting => 1, allow_fallback => 1,
         byte_buffer => \ $buffer->{buffer});

    if ($char_stream) { # if supported
      ## "Change the encoding" algorithm:

      ## Step 1
      if ($charset->{category} &
          Message::Charset::Info::CHARSET_CATEGORY_UTF16 ()) {
        $charset = Message::Charset::Info->get_by_html_name ('utf-8');
        ($char_stream, $e_status) = $charset->get_decode_handle
            ($byte_stream,
             byte_buffer => \ $buffer->{buffer});
      }
      $charset_name = $charset->get_iana_name;

      ## Step 2
      if (defined $self->{input_encoding} and
          $self->{input_encoding} eq $charset_name) {
        !!!parse-error (type => 'charset label:matching',
                        text => $charset_name,
                        level => $self->{level}->{info});
        $self->{confident} = 1;
        return;
      }

      !!!parse-error (type => 'charset label detected',
                      text => $self->{input_encoding},
                      value => $charset_name,
                      level => $self->{level}->{warn},
                      token => $token);

      ## Step 3
      # if (can) {
        ## change the encoding on the fly.
        #$self->{confident} = 1;
        #return;
      # }

      ## Step 4
      throw Whatpm::HTML::RestartParser ();
    }
  }; # $self->{change_encoding}

  my $char_onerror = sub {
    my (undef, $type, %opt) = @_;
    !!!parse-error (layer => 'encode',
                    %opt, type => $type,
                    line => $self->{line}, column => $self->{column} + 1);
    if ($opt{octets}) {
      ${$opt{octets}} = "\x{FFFD}"; # replacement character
    }
  };

  my $wrapped_char_stream = $get_wrapper->($char_stream);
  $wrapped_char_stream->onerror ($char_onerror);

  my @args = @_; shift @args; # drop the byte stream
  my $return;
  try {
    $return = $self->parse_char_stream ($wrapped_char_stream, @args);
  } catch Whatpm::HTML::RestartParser with {
    ## NOTE: Invoked after {change_encoding}.

    $note_decoder_status->();
    $self->{confident} = 1;

    $wrapped_char_stream = $get_wrapper->($char_stream);
    $wrapped_char_stream->onerror ($char_onerror);

    $return = $self->parse_char_stream ($wrapped_char_stream, @args);
  };
  return $return;
} # parse_byte_stream
608 wakaba 1.63
609 wakaba 1.71 ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
610     ## and the HTML layer MUST ignore it. However, we do strip the BOM in
611     ## the encoding layer and the HTML layer does not ignore any U+FEFF,
612     ## because the core part of our HTML parser expects a string of characters,
613     ## not a string of bytes or code units or anything which might contain a BOM.
614     ## Therefore, any parser interface that accepts a string of bytes,
615     ## such as |parse_byte_string| in this module, must ensure that it does
616     ## strip the BOM and never strip any ZWNBSP.
617    
## Parses a string of characters as an HTML document.
##
## Arguments: the parser object (or class name), the character string
## or a reference to it, the document, an optional error callback and
## an optional code reference used to wrap the input handle.
##
## Returns whatever |parse_char_stream| returns.
sub parse_char_string ($$$;$$) {
  #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
  my $self = shift;

  ## The string is wrapped in a handle-like |CharString| object.
  require Whatpm::Charset::DecodeHandle;
  my $string_ref = ref $_[0] ? $_[0] : \($_[0]);
  my $handle = Whatpm::Charset::DecodeHandle::CharString->new ($string_ref);

  my $get_wrapper = $_[3];
  $handle = $get_wrapper->($handle) if $get_wrapper;

  return $self->parse_char_stream ($handle, @_[1..$#_]);
} # parse_char_string
*parse_string = \&parse_char_string; ## NOTE: Alias for backward compatibility.
630 wakaba 1.63
## Parses a stream of characters into a document.
##
## Arguments:
##   $self    - parser object, or a class name (a parser is then
##              created with |new|)
##   $input   - character handle; |getc| and |manakai_getc_until| are
##              called on it
##   $doc     - document; its child nodes are removed before parsing
##   $onerror - optional error callback, invoked with |line| and
##              |column| followed by the error's own arguments; by
##              default errors are reported with |warn|
##
## Returns the document.
sub parse_char_stream ($$$;$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $input = $_[0];
  $self->{document} = $_[1];
  @{$self->{document}->child_nodes} = ();

  ## NOTE: |set_inner_html| copies most of this method's code

  $self->{confident} = 1 unless exists $self->{confident};
  $self->{document}->input_encoding ($self->{input_encoding})
      if defined $self->{input_encoding};

  $self->{line_prev} = $self->{line} = 1;
  $self->{column_prev} = $self->{column} = 0;

  ## Reads the next input character into |$self->{next_char}| as a code
  ## point (-1 at the end of the input), normalizing CR and CR LF to
  ## LF, tracking line and column, and reporting NULL and control
  ## characters.
  $self->{set_next_char} = sub {
    my $self = shift;

    pop @{$self->{prev_char}};
    unshift @{$self->{prev_char}}, $self->{next_char};

    my $char;
    if (defined $self->{next_next_char}) {
      ## A character read ahead while handling CR.
      $char = $self->{next_next_char};
      delete $self->{next_next_char};
    } else {
      $char = $input->getc;
    }
    $self->{next_char} = -1 and return unless defined $char;
    $self->{next_char} = ord $char;

    ($self->{line_prev}, $self->{column_prev})
        = ($self->{line}, $self->{column});
    $self->{column}++;

    if ($self->{next_char} == 0x000A) { # LF
      !!!cp ('j1');
      $self->{line}++;
      $self->{column} = 0;
    } elsif ($self->{next_char} == 0x000D) { # CR
      !!!cp ('j2');
      ## TODO: support for abort/streaming
      my $next = $input->getc;
      if (defined $next and $next ne "\x0A") {
        ## Not CR LF: keep the character for the next call.
        $self->{next_next_char} = $next;
      }
      $self->{next_char} = 0x000A; # LF # MUST
      $self->{line}++;
      $self->{column} = 0;
    } elsif ($self->{next_char} > 0x10FFFF) {
      !!!cp ('j3');
      $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($self->{next_char} == 0x0000) { # NULL
      !!!cp ('j4');
      !!!parse-error (type => 'NULL');
      $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    } elsif ($self->{next_char} <= 0x0008 or
             (0x000E <= $self->{next_char} and $self->{next_char} <= 0x001F) or
             (0x007F <= $self->{next_char} and $self->{next_char} <= 0x009F) or
             (0xD800 <= $self->{next_char} and $self->{next_char} <= 0xDFFF) or
             (0xFDD0 <= $self->{next_char} and $self->{next_char} <= 0xFDDF) or
             ## ISSUE: U+FDE0-U+FDEF are not excluded
             ## U+nFFFE and U+nFFFF of every plane.  The code point is
             ## at most 0x10FFFF here, so testing the low 16 bits is
             ## enough.
             ($self->{next_char} & 0xFFFE) == 0xFFFE) {
      !!!cp ('j5');
      if ($self->{next_char} < 0x10000) {
        !!!parse-error (type => 'control char',
                        text => (sprintf 'U+%04X', $self->{next_char}));
      } else {
        !!!parse-error (type => 'control char',
                        text => (sprintf 'U-%08X', $self->{next_char}));
      }
    }
  };
  $self->{prev_char} = [-1, -1, -1];
  $self->{next_char} = -1;

  ## Reads a run of characters in one call.  |$special_range| is
  ## interpolated into a character class listing the characters that
  ## end the run.  Returns |undef| when a read-ahead character is
  ## pending; otherwise returns what |manakai_getc_until| returns (used
  ## below as a reference to the string read, or a false value).
  ## NOTE: A guard on |$input->can ('manakai_getc_until')| with a
  ## |getc|-based fallback used to be sketched here but was disabled;
  ## |$input| therefore has to implement |manakai_getc_until|.
  $self->{getc_until} = sub {
    my $special_range = shift;
    return undef if defined $self->{next_next_char};
    my $s = $input->manakai_getc_until
        (qr/(?![$special_range\x{FDD0}-\x{FDDF}\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}])[\x20-\x7E\xA0-\x{D7FF}\x{E000}-\x{10FFFD}]/);
    if ($s) {
      $self->{column} += length $$s;
      $self->{column_prev} += length $$s;
      $self->{prev_char} = [-1, -1, -1];
      $self->{next_char} = -1;
    }
    return $s;
  }; # $self->{getc_until}

  my $onerror = $_[2] || sub {
    my (%opt) = @_;
    my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
    my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
    warn "Parse error ($opt{type}) at line $line column $column\n";
  };
  $self->{parse_error} = sub {
    $onerror->(line => $self->{line}, column => $self->{column}, @_);
  };

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  ## Both closures above capture |$self|; drop them so that the parser
  ## does not keep itself alive through them.
  delete $self->{parse_error}; # remove loop
  $self->{getc_until} = sub { return undef }; # remove loop

  return $self->{document};
} # parse_char_stream
772 wakaba 1.1
## Creates a new parser object.
##
## The object holds the table of error level codes and default
## callbacks: |set_next_char| only sets |next_char| to -1, and
## |parse_error|, |change_encoding| and |application_cache_selection|
## do nothing.
sub new ($) {
  my $class = shift;
  my $self = bless {
    level => {must => 'm',
              should => 's',
              warn => 'w',
              info => 'i',
              uncertain => 'u'},
  }, $class;

  ## The default |set_next_char| is stored in the object itself, so it
  ## refers to the object through a weak reference; a strong one would
  ## form a cycle and the object would never be freed.
  require Scalar::Util;
  my $weak_self = $self;
  Scalar::Util::weaken ($weak_self);
  $self->{set_next_char} = sub {
    $weak_self->{next_char} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new
799    
## Content model flag bits: which kinds of markup are recognized
## inside character data.
use constant {
  CM_ENTITY         => 0b001, # & markup in data
  CM_LIMITED_MARKUP => 0b010, # < markup in data (limited)
  CM_FULL_MARKUP    => 0b100, # < markup in data (any)
};

## Content models, built from the flag bits above.
use constant {
  PLAINTEXT_CONTENT_MODEL => 0,
  CDATA_CONTENT_MODEL     => CM_LIMITED_MARKUP,
  RCDATA_CONTENT_MODEL    => CM_ENTITY | CM_LIMITED_MARKUP,
  PCDATA_CONTENT_MODEL    => CM_ENTITY | CM_FULL_MARKUP,
};
808    
## Tokenizer state numbers.  Values 1 (formerly |ENTITY_DATA_STATE|)
## and 12 (formerly |ENTITY_IN_ATTRIBUTE_VALUE_STATE|) are no longer
## defined.
use constant {
  DATA_STATE                                    => 0,
  TAG_OPEN_STATE                                => 2,
  CLOSE_TAG_OPEN_STATE                          => 3,
  TAG_NAME_STATE                                => 4,
  BEFORE_ATTRIBUTE_NAME_STATE                   => 5,
  ATTRIBUTE_NAME_STATE                          => 6,
  AFTER_ATTRIBUTE_NAME_STATE                    => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE                  => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE                => 11,
  MARKUP_DECLARATION_OPEN_STATE                 => 13,
  COMMENT_START_STATE                           => 14,
  COMMENT_START_DASH_STATE                      => 15,
  COMMENT_STATE                                 => 16,
  COMMENT_END_STATE                             => 17,
  COMMENT_END_DASH_STATE                        => 18,
  BOGUS_COMMENT_STATE                           => 19,
  DOCTYPE_STATE                                 => 20,
  BEFORE_DOCTYPE_NAME_STATE                     => 21,
  DOCTYPE_NAME_STATE                            => 22,
  AFTER_DOCTYPE_NAME_STATE                      => 23,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         => 31,
  BOGUS_DOCTYPE_STATE                           => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE            => 33,
  SELF_CLOSING_START_TAG_STATE                  => 34,
  CDATA_SECTION_STATE                           => 35,
  MD_HYPHEN_STATE              => 36, # "markup declaration open state" in the spec
  MD_DOCTYPE_STATE             => 37, # "markup declaration open state" in the spec
  MD_CDATA_STATE               => 38, # "markup declaration open state" in the spec
  CDATA_PCDATA_CLOSE_TAG_STATE => 39, # "close tag open state" in the spec
  CDATA_SECTION_MSE1_STATE     => 40, # "CDATA section state" in the spec
  CDATA_SECTION_MSE2_STATE     => 41, # "CDATA section state" in the spec
  PUBLIC_STATE                 => 42, # "after DOCTYPE name state" in the spec
  SYSTEM_STATE                 => 43, # "after DOCTYPE name state" in the spec
  ## NOTE: The "entity data state", the "entity in attribute value
  ## state", and the "consume a character reference" algorithm are
  ## jointly implemented by the following six states:
  ENTITY_STATE                 => 44,
  ENTITY_HASH_STATE            => 45,
  NCR_NUM_STATE                => 46,
  HEXREF_X_STATE               => 47,
  HEXREF_HEX_STATE             => 48,
  ENTITY_NAME_STATE            => 49,
};
862 wakaba 1.57
## Token type codes, stored in the |type| member of a token.
use constant {
  DOCTYPE_TOKEN     => 1,
  COMMENT_TOKEN     => 2,
  START_TAG_TOKEN   => 3,
  END_TAG_TOKEN     => 4,
  END_OF_FILE_TOKEN => 5,
  CHARACTER_TOKEN   => 6,
};
869    
## Insertion mode class bits (|*_IMS|).  The insertion modes defined
## further below combine one or more of these bits with a small
## number in the low two bits.
use constant {
  AFTER_HTML_IMS        => 0b100,
  HEAD_IMS              => 0b1000,
  BODY_IMS              => 0b10000,
  BODY_TABLE_IMS        => 0b100000,
  TABLE_IMS             => 0b1000000,
  ROW_IMS               => 0b10000000,
  BODY_AFTER_IMS        => 0b100000000,
  FRAME_IMS             => 0b1000000000,
  SELECT_IMS            => 0b10000000000,
  ## NOTE: The "in foreign content" insertion mode is special; it is
  ## combined with the secondary insertion mode.  In this parser, they
  ## are stored together in the bit-or'ed form.
  IN_FOREIGN_CONTENT_IM => 0b100000000000,
};

## NOTE: The "initial" and "before html" insertion modes have no
## constants.

use constant {
  ## NOTE: "after after body" insertion mode.
  AFTER_HTML_BODY_IM     => AFTER_HTML_IMS | BODY_AFTER_IMS,
  ## NOTE: "after after frameset" insertion mode.
  AFTER_HTML_FRAMESET_IM => AFTER_HTML_IMS | FRAME_IMS,

  IN_HEAD_IM             => HEAD_IMS | 0b00,
  IN_HEAD_NOSCRIPT_IM    => HEAD_IMS | 0b01,
  AFTER_HEAD_IM          => HEAD_IMS | 0b10,
  BEFORE_HEAD_IM         => HEAD_IMS | 0b11,
  IN_BODY_IM             => BODY_IMS,
  IN_CELL_IM             => BODY_IMS | BODY_TABLE_IMS | 0b01,
  IN_CAPTION_IM          => BODY_IMS | BODY_TABLE_IMS | 0b10,
  IN_ROW_IM              => TABLE_IMS | ROW_IMS | 0b01,
  IN_TABLE_BODY_IM       => TABLE_IMS | ROW_IMS | 0b10,
  IN_TABLE_IM            => TABLE_IMS,
  AFTER_BODY_IM          => BODY_AFTER_IMS,
  IN_FRAMESET_IM         => FRAME_IMS | 0b01,
  AFTER_FRAMESET_IM      => FRAME_IMS | 0b10,
  IN_SELECT_IM           => SELECT_IMS | 0b01,
  IN_SELECT_IN_TABLE_IM  => SELECT_IMS | 0b10,
  IN_COLUMN_GROUP_IM     => 0b10,
};
908    
909 wakaba 1.1 ## Implementations MUST act as if they used the state machine in the spec.
910    
## Puts the tokenizer into its start-of-input condition: data state,
## PCDATA content model, no token or attribute under construction, and
## an empty queue of pending tokens.
sub _initialize_tokenizer ($) {
  my ($self) = @_;

  ## The tokenizer starts in the data state with the PCDATA content model.
  $self->{state} = DATA_STATE; # MUST
  $self->{content_model} = PCDATA_CONTENT_MODEL;

  ## The following members are not set here; they are initialized when
  ## they are first used:
  ##   $self->{state_keyword}, $self->{entity__value},
  ##   $self->{entity__match}, $self->{prev_state}

  ## Drop anything left under construction.
  $self->{$_} = undef
      for qw(current_token current_attribute last_emitted_start_tag_name);
  delete $self->{self_closing};

  ## Preprocessor macro (not plain Perl); presumably it loads the first
  ## input character into $self->{next_char} -- confirm against the
  ## macro definition.
  !!!next-input-character;

  ## Queue of pending tokens that |_get_next_token| returns first.
  $self->{token} = [];
  ## NOTE: $self->{escape} is left untouched.
} # _initialize_tokenizer
928    
929     ## A token has:
930 wakaba 1.55 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
931     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
932     ## ->{name} (DOCTYPE_TOKEN)
933     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
934     ## ->{public_identifier} (DOCTYPE_TOKEN)
935     ## ->{system_identifier} (DOCTYPE_TOKEN)
936 wakaba 1.75 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
937 wakaba 1.55 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
938 wakaba 1.66 ## ->{name}
939     ## ->{value}
940     ## ->{has_reference} == 1 or 0
941 wakaba 1.55 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
942 wakaba 1.125 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
943     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
944     ## while the token is pushed back to the stack.
945    
946 wakaba 1.1 ## Emitted token MUST immediately be handled by the tree construction state.
947    
948     ## Before each step, UA MAY check to see if either one of the scripts in
949     ## "list of scripts that will execute as soon as possible" or the first
950     ## script in the "list of scripts that will execute asynchronously",
951     ## has completed loading. If one has, then it MUST be executed
952     ## and removed from the list.
953    
954 wakaba 1.169 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
955     ## (This requirement was dropped from HTML5 spec, unfortunately.)
956 wakaba 1.59
957 wakaba 1.1 sub _get_next_token ($) {
958     my $self = shift;
959 wakaba 1.125
960     if ($self->{self_closing}) {
961     !!!parse-error (type => 'nestc', token => $self->{current_token});
962     ## NOTE: The |self_closing| flag is only set by start tag token.
963     ## In addition, when a start tag token is emitted, it is always set to
964     ## |current_token|.
965     delete $self->{self_closing};
966     }
967    
968 wakaba 1.1 if (@{$self->{token}}) {
969 wakaba 1.125 $self->{self_closing} = $self->{token}->[0]->{self_closing};
970 wakaba 1.1 return shift @{$self->{token}};
971     }
972    
973     A: {
974 wakaba 1.57 if ($self->{state} == DATA_STATE) {
975 wakaba 1.76 if ($self->{next_char} == 0x0026) { # &
976 wakaba 1.72 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
977     not $self->{escape}) {
978 wakaba 1.77 !!!cp (1);
979 wakaba 1.167 ## NOTE: In the spec, the tokenizer is switched to the
980     ## "entity data state". In this implementation, the tokenizer
981     ## is switched to the |ENTITY_STATE|, which is an implementation
982     ## of the "consume a character reference" algorithm.
983     $self->{entity_additional} = -1;
984 wakaba 1.169 $self->{prev_state} = DATA_STATE;
985 wakaba 1.167 $self->{state} = ENTITY_STATE;
986 wakaba 1.1 !!!next-input-character;
987     redo A;
988     } else {
989 wakaba 1.77 !!!cp (2);
990 wakaba 1.1 #
991     }
992 wakaba 1.76 } elsif ($self->{next_char} == 0x002D) { # -
993 wakaba 1.40 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
994 wakaba 1.13 unless ($self->{escape}) {
995 wakaba 1.76 if ($self->{prev_char}->[0] == 0x002D and # -
996     $self->{prev_char}->[1] == 0x0021 and # !
997     $self->{prev_char}->[2] == 0x003C) { # <
998 wakaba 1.77 !!!cp (3);
999 wakaba 1.13 $self->{escape} = 1;
1000 wakaba 1.77 } else {
1001     !!!cp (4);
1002 wakaba 1.13 }
1003 wakaba 1.77 } else {
1004     !!!cp (5);
1005 wakaba 1.13 }
1006     }
1007    
1008     #
1009 wakaba 1.76 } elsif ($self->{next_char} == 0x003C) { # <
1010 wakaba 1.40 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
1011     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
1012 wakaba 1.13 not $self->{escape})) {
1013 wakaba 1.77 !!!cp (6);
1014 wakaba 1.57 $self->{state} = TAG_OPEN_STATE;
1015 wakaba 1.1 !!!next-input-character;
1016     redo A;
1017     } else {
1018 wakaba 1.77 !!!cp (7);
1019 wakaba 1.1 #
1020     }
1021 wakaba 1.76 } elsif ($self->{next_char} == 0x003E) { # >
1022 wakaba 1.13 if ($self->{escape} and
1023 wakaba 1.40 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
1024 wakaba 1.76 if ($self->{prev_char}->[0] == 0x002D and # -
1025     $self->{prev_char}->[1] == 0x002D) { # -
1026 wakaba 1.77 !!!cp (8);
1027 wakaba 1.13 delete $self->{escape};
1028 wakaba 1.77 } else {
1029     !!!cp (9);
1030 wakaba 1.13 }
1031 wakaba 1.77 } else {
1032     !!!cp (10);
1033 wakaba 1.13 }
1034    
1035     #
1036 wakaba 1.76 } elsif ($self->{next_char} == -1) {
1037 wakaba 1.77 !!!cp (11);
1038 wakaba 1.112 !!!emit ({type => END_OF_FILE_TOKEN,
1039     line => $self->{line}, column => $self->{column}});
1040 wakaba 1.1 last A; ## TODO: ok?
1041 wakaba 1.77 } else {
1042     !!!cp (12);
1043 wakaba 1.1 }
1044     # Anything else
1045 wakaba 1.55 my $token = {type => CHARACTER_TOKEN,
1046 wakaba 1.112 data => chr $self->{next_char},
1047 wakaba 1.120 line => $self->{line}, column => $self->{column},
1048 wakaba 1.118 };
1049 wakaba 1.171
1050     my $s = $self->{getc_until}->(q[-!<>&]);
1051     if ($s) {
1052     $token->{data} .= $$s;
1053     }
1054    
1055 wakaba 1.1 ## Stay in the data state
1056     !!!next-input-character;
1057    
1058     !!!emit ($token);
1059    
1060     redo A;
1061 wakaba 1.57 } elsif ($self->{state} == TAG_OPEN_STATE) {
1062 wakaba 1.40 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1063 wakaba 1.76 if ($self->{next_char} == 0x002F) { # /
1064 wakaba 1.77 !!!cp (15);
1065 wakaba 1.1 !!!next-input-character;
1066 wakaba 1.57 $self->{state} = CLOSE_TAG_OPEN_STATE;
1067 wakaba 1.1 redo A;
1068     } else {
1069 wakaba 1.77 !!!cp (16);
1070 wakaba 1.1 ## reconsume
1071 wakaba 1.57 $self->{state} = DATA_STATE;
1072 wakaba 1.1
1073 wakaba 1.112 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1074 wakaba 1.120 line => $self->{line_prev},
1075     column => $self->{column_prev},
1076 wakaba 1.118 });
1077 wakaba 1.1
1078     redo A;
1079     }
1080 wakaba 1.40 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
1081 wakaba 1.76 if ($self->{next_char} == 0x0021) { # !
1082 wakaba 1.77 !!!cp (17);
1083 wakaba 1.57 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
1084 wakaba 1.1 !!!next-input-character;
1085     redo A;
1086 wakaba 1.76 } elsif ($self->{next_char} == 0x002F) { # /
1087 wakaba 1.77 !!!cp (18);
1088 wakaba 1.57 $self->{state} = CLOSE_TAG_OPEN_STATE;
1089 wakaba 1.1 !!!next-input-character;
1090     redo A;
1091 wakaba 1.76 } elsif (0x0041 <= $self->{next_char} and
1092     $self->{next_char} <= 0x005A) { # A..Z
1093 wakaba 1.77 !!!cp (19);
1094 wakaba 1.1 $self->{current_token}
1095 wakaba 1.55 = {type => START_TAG_TOKEN,
1096 wakaba 1.112 tag_name => chr ($self->{next_char} + 0x0020),
1097     line => $self->{line_prev},
1098     column => $self->{column_prev}};
1099 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
1100 wakaba 1.1 !!!next-input-character;
1101     redo A;
1102 wakaba 1.76 } elsif (0x0061 <= $self->{next_char} and
1103     $self->{next_char} <= 0x007A) { # a..z
1104 wakaba 1.77 !!!cp (20);
1105 wakaba 1.55 $self->{current_token} = {type => START_TAG_TOKEN,
1106 wakaba 1.112 tag_name => chr ($self->{next_char}),
1107     line => $self->{line_prev},
1108     column => $self->{column_prev}};
1109 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
1110 wakaba 1.1 !!!next-input-character;
1111     redo A;
1112 wakaba 1.76 } elsif ($self->{next_char} == 0x003E) { # >
1113 wakaba 1.77 !!!cp (21);
1114 wakaba 1.115 !!!parse-error (type => 'empty start tag',
1115     line => $self->{line_prev},
1116     column => $self->{column_prev});
1117 wakaba 1.57 $self->{state} = DATA_STATE;
1118 wakaba 1.1 !!!next-input-character;
1119    
1120 wakaba 1.112 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
1121 wakaba 1.120 line => $self->{line_prev},
1122     column => $self->{column_prev},
1123 wakaba 1.118 });
1124 wakaba 1.1
1125     redo A;
1126 wakaba 1.76 } elsif ($self->{next_char} == 0x003F) { # ?
1127 wakaba 1.77 !!!cp (22);
1128 wakaba 1.115 !!!parse-error (type => 'pio',
1129     line => $self->{line_prev},
1130     column => $self->{column_prev});
1131 wakaba 1.57 $self->{state} = BOGUS_COMMENT_STATE;
1132 wakaba 1.112 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1133 wakaba 1.120 line => $self->{line_prev},
1134     column => $self->{column_prev},
1135 wakaba 1.118 };
1136 wakaba 1.76 ## $self->{next_char} is intentionally left as is
1137 wakaba 1.1 redo A;
1138     } else {
1139 wakaba 1.77 !!!cp (23);
1140 wakaba 1.136 !!!parse-error (type => 'bare stago',
1141     line => $self->{line_prev},
1142     column => $self->{column_prev});
1143 wakaba 1.57 $self->{state} = DATA_STATE;
1144 wakaba 1.1 ## reconsume
1145    
1146 wakaba 1.112 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1147 wakaba 1.120 line => $self->{line_prev},
1148     column => $self->{column_prev},
1149 wakaba 1.118 });
1150 wakaba 1.1
1151     redo A;
1152     }
1153     } else {
1154 wakaba 1.40 die "$0: $self->{content_model} in tag open";
1155 wakaba 1.1 }
1156 wakaba 1.57 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
1157 wakaba 1.164 ## NOTE: The "close tag open state" in the spec is implemented as
1158     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_PCDATA_CLOSE_TAG_STATE|.
1159    
1160 wakaba 1.113 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
1161 wakaba 1.40 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1162 wakaba 1.23 if (defined $self->{last_emitted_start_tag_name}) {
1163 wakaba 1.164 $self->{state} = CDATA_PCDATA_CLOSE_TAG_STATE;
1164     $self->{state_keyword} = '';
1165     ## Reconsume.
1166     redo A;
1167 wakaba 1.23 } else {
1168     ## No start tag token has ever been emitted
1169 wakaba 1.164 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
1170 wakaba 1.77 !!!cp (28);
1171 wakaba 1.57 $self->{state} = DATA_STATE;
1172 wakaba 1.164 ## Reconsume.
1173 wakaba 1.112 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1174 wakaba 1.120 line => $l, column => $c,
1175 wakaba 1.118 });
1176 wakaba 1.1 redo A;
1177     }
1178     }
1179 wakaba 1.164
1180 wakaba 1.76 if (0x0041 <= $self->{next_char} and
1181     $self->{next_char} <= 0x005A) { # A..Z
1182 wakaba 1.77 !!!cp (29);
1183 wakaba 1.112 $self->{current_token}
1184     = {type => END_TAG_TOKEN,
1185     tag_name => chr ($self->{next_char} + 0x0020),
1186     line => $l, column => $c};
1187 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
1188 wakaba 1.1 !!!next-input-character;
1189     redo A;
1190 wakaba 1.76 } elsif (0x0061 <= $self->{next_char} and
1191     $self->{next_char} <= 0x007A) { # a..z
1192 wakaba 1.77 !!!cp (30);
1193 wakaba 1.55 $self->{current_token} = {type => END_TAG_TOKEN,
1194 wakaba 1.112 tag_name => chr ($self->{next_char}),
1195     line => $l, column => $c};
1196 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
1197 wakaba 1.1 !!!next-input-character;
1198     redo A;
1199 wakaba 1.76 } elsif ($self->{next_char} == 0x003E) { # >
1200 wakaba 1.77 !!!cp (31);
1201 wakaba 1.115 !!!parse-error (type => 'empty end tag',
1202     line => $self->{line_prev}, ## "<" in "</>"
1203     column => $self->{column_prev} - 1);
1204 wakaba 1.57 $self->{state} = DATA_STATE;
1205 wakaba 1.1 !!!next-input-character;
1206     redo A;
1207 wakaba 1.76 } elsif ($self->{next_char} == -1) {
1208 wakaba 1.77 !!!cp (32);
1209 wakaba 1.3 !!!parse-error (type => 'bare etago');
1210 wakaba 1.57 $self->{state} = DATA_STATE;
1211 wakaba 1.1 # reconsume
1212    
1213 wakaba 1.112 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1214 wakaba 1.120 line => $l, column => $c,
1215 wakaba 1.118 });
1216 wakaba 1.1
1217     redo A;
1218     } else {
1219 wakaba 1.77 !!!cp (33);
1220 wakaba 1.3 !!!parse-error (type => 'bogus end tag');
1221 wakaba 1.57 $self->{state} = BOGUS_COMMENT_STATE;
1222 wakaba 1.112 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1223 wakaba 1.120 line => $self->{line_prev}, # "<" of "</"
1224     column => $self->{column_prev} - 1,
1225 wakaba 1.118 };
1226 wakaba 1.164 ## NOTE: $self->{next_char} is intentionally left as is.
1227     ## Although the "anything else" case of the spec not explicitly
1228     ## states that the next input character is to be reconsumed,
1229     ## it will be included to the |data| of the comment token
1230     ## generated from the bogus end tag, as defined in the
1231     ## "bogus comment state" entry.
1232     redo A;
1233     }
1234     } elsif ($self->{state} == CDATA_PCDATA_CLOSE_TAG_STATE) {
1235     my $ch = substr $self->{last_emitted_start_tag_name}, length $self->{state_keyword}, 1;
1236     if (length $ch) {
1237     my $CH = $ch;
1238     $ch =~ tr/a-z/A-Z/;
1239     my $nch = chr $self->{next_char};
1240     if ($nch eq $ch or $nch eq $CH) {
1241     !!!cp (24);
1242     ## Stay in the state.
1243     $self->{state_keyword} .= $nch;
1244     !!!next-input-character;
1245     redo A;
1246     } else {
1247     !!!cp (25);
1248     $self->{state} = DATA_STATE;
1249     ## Reconsume.
1250     !!!emit ({type => CHARACTER_TOKEN,
1251     data => '</' . $self->{state_keyword},
1252     line => $self->{line_prev},
1253     column => $self->{column_prev} - 1 - length $self->{state_keyword},
1254     });
1255     redo A;
1256     }
1257     } else { # after "<{tag-name}"
1258     unless ({
1259     0x0009 => 1, # HT
1260     0x000A => 1, # LF
1261     0x000B => 1, # VT
1262     0x000C => 1, # FF
1263     0x0020 => 1, # SP
1264     0x003E => 1, # >
1265     0x002F => 1, # /
1266     -1 => 1, # EOF
1267     }->{$self->{next_char}}) {
1268     !!!cp (26);
1269     ## Reconsume.
1270     $self->{state} = DATA_STATE;
1271     !!!emit ({type => CHARACTER_TOKEN,
1272     data => '</' . $self->{state_keyword},
1273     line => $self->{line_prev},
1274     column => $self->{column_prev} - 1 - length $self->{state_keyword},
1275     });
1276     redo A;
1277     } else {
1278     !!!cp (27);
1279     $self->{current_token}
1280     = {type => END_TAG_TOKEN,
1281     tag_name => $self->{last_emitted_start_tag_name},
1282     line => $self->{line_prev},
1283     column => $self->{column_prev} - 1 - length $self->{state_keyword}};
1284     $self->{state} = TAG_NAME_STATE;
1285     ## Reconsume.
1286     redo A;
1287     }
1288 wakaba 1.1 }
1289 wakaba 1.57 } elsif ($self->{state} == TAG_NAME_STATE) {
1290 wakaba 1.76 if ($self->{next_char} == 0x0009 or # HT
1291     $self->{next_char} == 0x000A or # LF
1292     $self->{next_char} == 0x000B or # VT
1293     $self->{next_char} == 0x000C or # FF
1294     $self->{next_char} == 0x0020) { # SP
1295 wakaba 1.77 !!!cp (34);
1296 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1297 wakaba 1.1 !!!next-input-character;
1298     redo A;
1299 wakaba 1.76 } elsif ($self->{next_char} == 0x003E) { # >
1300 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1301 wakaba 1.77 !!!cp (35);
1302 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1303 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1304 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1305 wakaba 1.78 #if ($self->{current_token}->{attributes}) {
1306     # ## NOTE: This should never be reached.
1307     # !!! cp (36);
1308     # !!! parse-error (type => 'end tag attribute');
1309     #} else {
1310 wakaba 1.77 !!!cp (37);
1311 wakaba 1.78 #}
1312 wakaba 1.1 } else {
1313     die "$0: $self->{current_token}->{type}: Unknown token type";
1314     }
1315 wakaba 1.57 $self->{state} = DATA_STATE;
1316 wakaba 1.1 !!!next-input-character;
1317    
1318     !!!emit ($self->{current_token}); # start tag or end tag
1319    
1320     redo A;
1321 wakaba 1.76 } elsif (0x0041 <= $self->{next_char} and
1322     $self->{next_char} <= 0x005A) { # A..Z
1323 wakaba 1.77 !!!cp (38);
1324 wakaba 1.76 $self->{current_token}->{tag_name} .= chr ($self->{next_char} + 0x0020);
1325 wakaba 1.1 # start tag or end tag
1326     ## Stay in this state
1327     !!!next-input-character;
1328     redo A;
1329 wakaba 1.76 } elsif ($self->{next_char} == -1) {
1330 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
1331 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1332 wakaba 1.77 !!!cp (39);
1333 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1334 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1335 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1336 wakaba 1.78 #if ($self->{current_token}->{attributes}) {
1337     # ## NOTE: This state should never be reached.
1338     # !!! cp (40);
1339     # !!! parse-error (type => 'end tag attribute');
1340     #} else {
1341 wakaba 1.77 !!!cp (41);
1342 wakaba 1.78 #}
1343 wakaba 1.1 } else {
1344     die "$0: $self->{current_token}->{type}: Unknown token type";
1345     }
1346 wakaba 1.57 $self->{state} = DATA_STATE;
1347 wakaba 1.1 # reconsume
1348    
1349     !!!emit ($self->{current_token}); # start tag or end tag
1350    
1351     redo A;
1352 wakaba 1.76 } elsif ($self->{next_char} == 0x002F) { # /
1353 wakaba 1.125 !!!cp (42);
1354     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1355 wakaba 1.1 !!!next-input-character;
1356     redo A;
1357     } else {
1358 wakaba 1.77 !!!cp (44);
1359 wakaba 1.76 $self->{current_token}->{tag_name} .= chr $self->{next_char};
1360 wakaba 1.1 # start tag or end tag
1361     ## Stay in the state
1362     !!!next-input-character;
1363     redo A;
1364     }
1365 wakaba 1.57 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1366 wakaba 1.76 if ($self->{next_char} == 0x0009 or # HT
1367     $self->{next_char} == 0x000A or # LF
1368     $self->{next_char} == 0x000B or # VT
1369     $self->{next_char} == 0x000C or # FF
1370     $self->{next_char} == 0x0020) { # SP
1371 wakaba 1.77 !!!cp (45);
1372 wakaba 1.1 ## Stay in the state
1373     !!!next-input-character;
1374     redo A;
1375 wakaba 1.76 } elsif ($self->{next_char} == 0x003E) { # >
1376 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1377 wakaba 1.77 !!!cp (46);
1378 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1379 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1380 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1381 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1382 wakaba 1.77 !!!cp (47);
1383 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1384 wakaba 1.77 } else {
1385     !!!cp (48);
1386 wakaba 1.1 }
1387     } else {
1388     die "$0: $self->{current_token}->{type}: Unknown token type";
1389     }
1390 wakaba 1.57 $self->{state} = DATA_STATE;
1391 wakaba 1.1 !!!next-input-character;
1392    
1393     !!!emit ($self->{current_token}); # start tag or end tag
1394    
1395     redo A;
1396 wakaba 1.76 } elsif (0x0041 <= $self->{next_char} and
1397     $self->{next_char} <= 0x005A) { # A..Z
1398 wakaba 1.77 !!!cp (49);
1399 wakaba 1.119 $self->{current_attribute}
1400     = {name => chr ($self->{next_char} + 0x0020),
1401     value => '',
1402     line => $self->{line}, column => $self->{column}};
1403 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
1404 wakaba 1.1 !!!next-input-character;
1405     redo A;
1406 wakaba 1.76 } elsif ($self->{next_char} == 0x002F) { # /
1407 wakaba 1.125 !!!cp (50);
1408     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1409 wakaba 1.1 !!!next-input-character;
1410     redo A;
1411 wakaba 1.76 } elsif ($self->{next_char} == -1) {
1412 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
1413 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1414 wakaba 1.77 !!!cp (52);
1415 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1416 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1417 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1418 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1419 wakaba 1.77 !!!cp (53);
1420 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1421 wakaba 1.77 } else {
1422     !!!cp (54);
1423 wakaba 1.1 }
1424     } else {
1425     die "$0: $self->{current_token}->{type}: Unknown token type";
1426     }
1427 wakaba 1.57 $self->{state} = DATA_STATE;
1428 wakaba 1.1 # reconsume
1429    
1430     !!!emit ($self->{current_token}); # start tag or end tag
1431    
1432     redo A;
1433     } else {
1434 wakaba 1.72 if ({
1435     0x0022 => 1, # "
1436     0x0027 => 1, # '
1437     0x003D => 1, # =
1438 wakaba 1.76 }->{$self->{next_char}}) {
1439 wakaba 1.77 !!!cp (55);
1440 wakaba 1.72 !!!parse-error (type => 'bad attribute name');
1441 wakaba 1.77 } else {
1442     !!!cp (56);
1443 wakaba 1.72 }
1444 wakaba 1.119 $self->{current_attribute}
1445     = {name => chr ($self->{next_char}),
1446     value => '',
1447     line => $self->{line}, column => $self->{column}};
1448 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
1449 wakaba 1.1 !!!next-input-character;
1450     redo A;
1451     }
1452 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1453 wakaba 1.1 my $before_leave = sub {
1454     if (exists $self->{current_token}->{attributes} # start tag or end tag
1455     ->{$self->{current_attribute}->{name}}) { # MUST
1456 wakaba 1.77 !!!cp (57);
1457 wakaba 1.153 !!!parse-error (type => 'duplicate attribute', text => $self->{current_attribute}->{name}, line => $self->{current_attribute}->{line}, column => $self->{current_attribute}->{column});
1458 wakaba 1.1 ## Discard $self->{current_attribute} # MUST
1459     } else {
1460 wakaba 1.77 !!!cp (58);
1461 wakaba 1.1 $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
1462     = $self->{current_attribute};
1463     }
1464     }; # $before_leave
1465    
1466 wakaba 1.76 if ($self->{next_char} == 0x0009 or # HT
1467     $self->{next_char} == 0x000A or # LF
1468     $self->{next_char} == 0x000B or # VT
1469     $self->{next_char} == 0x000C or # FF
1470     $self->{next_char} == 0x0020) { # SP
1471 wakaba 1.77 !!!cp (59);
1472 wakaba 1.1 $before_leave->();
1473 wakaba 1.57 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1474 wakaba 1.1 !!!next-input-character;
1475     redo A;
1476 wakaba 1.76 } elsif ($self->{next_char} == 0x003D) { # =
1477 wakaba 1.77 !!!cp (60);
1478 wakaba 1.1 $before_leave->();
1479 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1480 wakaba 1.1 !!!next-input-character;
1481     redo A;
1482 wakaba 1.76 } elsif ($self->{next_char} == 0x003E) { # >
1483 wakaba 1.1 $before_leave->();
1484 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1485 wakaba 1.77 !!!cp (61);
1486 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1487 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1488 wakaba 1.77 !!!cp (62);
1489 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1490 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1491 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1492 wakaba 1.1 }
1493     } else {
1494     die "$0: $self->{current_token}->{type}: Unknown token type";
1495     }
1496 wakaba 1.57 $self->{state} = DATA_STATE;
1497 wakaba 1.1 !!!next-input-character;
1498    
1499     !!!emit ($self->{current_token}); # start tag or end tag
1500    
1501     redo A;
1502 wakaba 1.76 } elsif (0x0041 <= $self->{next_char} and
1503     $self->{next_char} <= 0x005A) { # A..Z
1504 wakaba 1.77 !!!cp (63);
1505 wakaba 1.76 $self->{current_attribute}->{name} .= chr ($self->{next_char} + 0x0020);
1506 wakaba 1.1 ## Stay in the state
1507     !!!next-input-character;
1508     redo A;
1509 wakaba 1.76 } elsif ($self->{next_char} == 0x002F) { # /
1510 wakaba 1.125 !!!cp (64);
1511 wakaba 1.1 $before_leave->();
1512 wakaba 1.125 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1513 wakaba 1.1 !!!next-input-character;
1514     redo A;
1515 wakaba 1.76 } elsif ($self->{next_char} == -1) {
1516 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
1517 wakaba 1.1 $before_leave->();
1518 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1519 wakaba 1.77 !!!cp (66);
1520 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1521 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1522 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1523 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1524 wakaba 1.77 !!!cp (67);
1525 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1526 wakaba 1.77 } else {
1527 wakaba 1.78 ## NOTE: This state should never be reached.
1528 wakaba 1.77 !!!cp (68);
1529 wakaba 1.1 }
1530     } else {
1531     die "$0: $self->{current_token}->{type}: Unknown token type";
1532     }
1533 wakaba 1.57 $self->{state} = DATA_STATE;
1534 wakaba 1.1 # reconsume
1535    
1536     !!!emit ($self->{current_token}); # start tag or end tag
1537    
1538     redo A;
1539     } else {
1540 wakaba 1.76 if ($self->{next_char} == 0x0022 or # "
1541     $self->{next_char} == 0x0027) { # '
1542 wakaba 1.77 !!!cp (69);
1543 wakaba 1.72 !!!parse-error (type => 'bad attribute name');
1544 wakaba 1.77 } else {
1545     !!!cp (70);
1546 wakaba 1.72 }
1547 wakaba 1.76 $self->{current_attribute}->{name} .= chr ($self->{next_char});
1548 wakaba 1.1 ## Stay in the state
1549     !!!next-input-character;
1550     redo A;
1551     }
1552 wakaba 1.57 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1553 wakaba 1.76 if ($self->{next_char} == 0x0009 or # HT
1554     $self->{next_char} == 0x000A or # LF
1555     $self->{next_char} == 0x000B or # VT
1556     $self->{next_char} == 0x000C or # FF
1557     $self->{next_char} == 0x0020) { # SP
1558 wakaba 1.77 !!!cp (71);
1559 wakaba 1.1 ## Stay in the state
1560     !!!next-input-character;
1561     redo A;
1562 wakaba 1.76 } elsif ($self->{next_char} == 0x003D) { # =
1563 wakaba 1.77 !!!cp (72);
1564 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1565 wakaba 1.1 !!!next-input-character;
1566     redo A;
1567 wakaba 1.76 } elsif ($self->{next_char} == 0x003E) { # >
1568 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1569 wakaba 1.77 !!!cp (73);
1570 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1571 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1572 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1573 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1574 wakaba 1.77 !!!cp (74);
1575 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1576 wakaba 1.77 } else {
1577 wakaba 1.78 ## NOTE: This state should never be reached.
1578 wakaba 1.77 !!!cp (75);
1579 wakaba 1.1 }
1580     } else {
1581     die "$0: $self->{current_token}->{type}: Unknown token type";
1582     }
1583 wakaba 1.57 $self->{state} = DATA_STATE;
1584 wakaba 1.1 !!!next-input-character;
1585    
1586     !!!emit ($self->{current_token}); # start tag or end tag
1587    
1588     redo A;
1589 wakaba 1.76 } elsif (0x0041 <= $self->{next_char} and
1590     $self->{next_char} <= 0x005A) { # A..Z
1591 wakaba 1.77 !!!cp (76);
1592 wakaba 1.119 $self->{current_attribute}
1593     = {name => chr ($self->{next_char} + 0x0020),
1594     value => '',
1595     line => $self->{line}, column => $self->{column}};
1596 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
1597 wakaba 1.1 !!!next-input-character;
1598     redo A;
1599 wakaba 1.76 } elsif ($self->{next_char} == 0x002F) { # /
1600 wakaba 1.125 !!!cp (77);
1601     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1602 wakaba 1.1 !!!next-input-character;
1603     redo A;
1604 wakaba 1.76 } elsif ($self->{next_char} == -1) {
1605 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
1606 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1607 wakaba 1.77 !!!cp (79);
1608 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1609 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1610 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1611 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1612 wakaba 1.77 !!!cp (80);
1613 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1614 wakaba 1.77 } else {
1615 wakaba 1.78 ## NOTE: This state should never be reached.
1616 wakaba 1.77 !!!cp (81);
1617 wakaba 1.1 }
1618     } else {
1619     die "$0: $self->{current_token}->{type}: Unknown token type";
1620     }
1621 wakaba 1.57 $self->{state} = DATA_STATE;
1622 wakaba 1.1 # reconsume
1623    
1624     !!!emit ($self->{current_token}); # start tag or end tag
1625    
1626     redo A;
1627     } else {
1628 wakaba 1.156 if ($self->{next_char} == 0x0022 or # "
1629     $self->{next_char} == 0x0027) { # '
1630     !!!cp (78);
1631     !!!parse-error (type => 'bad attribute name');
1632     } else {
1633     !!!cp (82);
1634     }
1635 wakaba 1.119 $self->{current_attribute}
1636     = {name => chr ($self->{next_char}),
1637     value => '',
1638     line => $self->{line}, column => $self->{column}};
1639 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
1640 wakaba 1.1 !!!next-input-character;
1641     redo A;
1642     }
1643 wakaba 1.57 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1644 wakaba 1.76 if ($self->{next_char} == 0x0009 or # HT
1645     $self->{next_char} == 0x000A or # LF
1646     $self->{next_char} == 0x000B or # VT
1647     $self->{next_char} == 0x000C or # FF
1648     $self->{next_char} == 0x0020) { # SP
1649 wakaba 1.77 !!!cp (83);
1650 wakaba 1.1 ## Stay in the state
1651     !!!next-input-character;
1652     redo A;
1653 wakaba 1.76 } elsif ($self->{next_char} == 0x0022) { # "
1654 wakaba 1.77 !!!cp (84);
1655 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1656 wakaba 1.1 !!!next-input-character;
1657     redo A;
1658 wakaba 1.76 } elsif ($self->{next_char} == 0x0026) { # &
1659 wakaba 1.77 !!!cp (85);
1660 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1661 wakaba 1.1 ## reconsume
1662     redo A;
1663 wakaba 1.76 } elsif ($self->{next_char} == 0x0027) { # '
1664 wakaba 1.77 !!!cp (86);
1665 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1666 wakaba 1.1 !!!next-input-character;
1667     redo A;
1668 wakaba 1.76 } elsif ($self->{next_char} == 0x003E) { # >
1669 wakaba 1.156 !!!parse-error (type => 'empty unquoted attribute value');
1670 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1671 wakaba 1.77 !!!cp (87);
1672 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1673 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1674 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1675 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1676 wakaba 1.77 !!!cp (88);
1677 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1678 wakaba 1.77 } else {
1679 wakaba 1.78 ## NOTE: This state should never be reached.
1680 wakaba 1.77 !!!cp (89);
1681 wakaba 1.1 }
1682     } else {
1683     die "$0: $self->{current_token}->{type}: Unknown token type";
1684     }
1685 wakaba 1.57 $self->{state} = DATA_STATE;
1686 wakaba 1.1 !!!next-input-character;
1687    
1688     !!!emit ($self->{current_token}); # start tag or end tag
1689    
1690     redo A;
1691 wakaba 1.76 } elsif ($self->{next_char} == -1) {
1692 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
1693 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1694 wakaba 1.77 !!!cp (90);
1695 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1696 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1697 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1698 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1699 wakaba 1.77 !!!cp (91);
1700 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1701 wakaba 1.77 } else {
1702 wakaba 1.78 ## NOTE: This state should never be reached.
1703 wakaba 1.77 !!!cp (92);
1704 wakaba 1.1 }
1705     } else {
1706     die "$0: $self->{current_token}->{type}: Unknown token type";
1707     }
1708 wakaba 1.57 $self->{state} = DATA_STATE;
1709 wakaba 1.1 ## reconsume
1710    
1711     !!!emit ($self->{current_token}); # start tag or end tag
1712    
1713     redo A;
1714     } else {
1715 wakaba 1.76 if ($self->{next_char} == 0x003D) { # =
1716 wakaba 1.77 !!!cp (93);
1717 wakaba 1.72 !!!parse-error (type => 'bad attribute value');
1718 wakaba 1.77 } else {
1719     !!!cp (94);
1720 wakaba 1.72 }
1721 wakaba 1.76 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1722 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1723 wakaba 1.1 !!!next-input-character;
1724     redo A;
1725     }
1726 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1727 wakaba 1.76 if ($self->{next_char} == 0x0022) { # "
1728 wakaba 1.77 !!!cp (95);
1729 wakaba 1.72 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1730 wakaba 1.1 !!!next-input-character;
1731     redo A;
1732 wakaba 1.76 } elsif ($self->{next_char} == 0x0026) { # &
1733 wakaba 1.77 !!!cp (96);
1734 wakaba 1.167 ## NOTE: In the spec, the tokenizer is switched to the
1735     ## "entity in attribute value state". In this implementation, the
1736     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1737     ## implementation of the "consume a character reference" algorithm.
1738 wakaba 1.169 $self->{prev_state} = $self->{state};
1739 wakaba 1.167 $self->{entity_additional} = 0x0022; # "
1740     $self->{state} = ENTITY_STATE;
1741 wakaba 1.1 !!!next-input-character;
1742     redo A;
1743 wakaba 1.76 } elsif ($self->{next_char} == -1) {
1744 wakaba 1.3 !!!parse-error (type => 'unclosed attribute value');
1745 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1746 wakaba 1.77 !!!cp (97);
1747 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1748 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1749 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1750 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1751 wakaba 1.77 !!!cp (98);
1752 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1753 wakaba 1.77 } else {
1754 wakaba 1.78 ## NOTE: This state should never be reached.
1755 wakaba 1.77 !!!cp (99);
1756 wakaba 1.1 }
1757     } else {
1758     die "$0: $self->{current_token}->{type}: Unknown token type";
1759     }
1760 wakaba 1.57 $self->{state} = DATA_STATE;
1761 wakaba 1.1 ## reconsume
1762    
1763     !!!emit ($self->{current_token}); # start tag or end tag
1764    
1765     redo A;
1766     } else {
1767 wakaba 1.77 !!!cp (100);
1768 wakaba 1.76 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1769 wakaba 1.1 ## Stay in the state
1770     !!!next-input-character;
1771     redo A;
1772     }
1773 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1774 wakaba 1.76 if ($self->{next_char} == 0x0027) { # '
1775 wakaba 1.77 !!!cp (101);
1776 wakaba 1.72 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1777 wakaba 1.1 !!!next-input-character;
1778     redo A;
1779 wakaba 1.76 } elsif ($self->{next_char} == 0x0026) { # &
1780 wakaba 1.77 !!!cp (102);
1781 wakaba 1.167 ## NOTE: In the spec, the tokenizer is switched to the
1782     ## "entity in attribute value state". In this implementation, the
1783     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1784     ## implementation of the "consume a character reference" algorithm.
1785     $self->{entity_additional} = 0x0027; # '
1786 wakaba 1.169 $self->{prev_state} = $self->{state};
1787 wakaba 1.167 $self->{state} = ENTITY_STATE;
1788 wakaba 1.1 !!!next-input-character;
1789     redo A;
1790 wakaba 1.76 } elsif ($self->{next_char} == -1) {
1791 wakaba 1.3 !!!parse-error (type => 'unclosed attribute value');
1792 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1793 wakaba 1.77 !!!cp (103);
1794 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1795 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1796 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1797 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1798 wakaba 1.77 !!!cp (104);
1799 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1800 wakaba 1.77 } else {
1801 wakaba 1.78 ## NOTE: This state should never be reached.
1802 wakaba 1.77 !!!cp (105);
1803 wakaba 1.1 }
1804     } else {
1805     die "$0: $self->{current_token}->{type}: Unknown token type";
1806     }
1807 wakaba 1.57 $self->{state} = DATA_STATE;
1808 wakaba 1.1 ## reconsume
1809    
1810     !!!emit ($self->{current_token}); # start tag or end tag
1811    
1812     redo A;
1813     } else {
1814 wakaba 1.77 !!!cp (106);
1815 wakaba 1.76 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1816 wakaba 1.1 ## Stay in the state
1817     !!!next-input-character;
1818     redo A;
1819     }
1820 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1821 wakaba 1.76 if ($self->{next_char} == 0x0009 or # HT
1822     $self->{next_char} == 0x000A or # LF
1823     $self->{next_char} == 0x000B or # VT
1824     $self->{next_char} == 0x000C or # FF
1825     $self->{next_char} == 0x0020) { # SP
1826 wakaba 1.77 !!!cp (107);
1827 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1828 wakaba 1.1 !!!next-input-character;
1829     redo A;
1830 wakaba 1.76 } elsif ($self->{next_char} == 0x0026) { # &
1831 wakaba 1.77 !!!cp (108);
1832 wakaba 1.167 ## NOTE: In the spec, the tokenizer is switched to the
1833     ## "entity in attribute value state". In this implementation, the
1834     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1835     ## implementation of the "consume a character reference" algorithm.
1836     $self->{entity_additional} = -1;
1837 wakaba 1.169 $self->{prev_state} = $self->{state};
1838 wakaba 1.167 $self->{state} = ENTITY_STATE;
1839 wakaba 1.1 !!!next-input-character;
1840     redo A;
1841 wakaba 1.76 } elsif ($self->{next_char} == 0x003E) { # >
1842 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1843 wakaba 1.77 !!!cp (109);
1844 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1845 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1846 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1847 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1848 wakaba 1.77 !!!cp (110);
1849 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1850 wakaba 1.77 } else {
1851 wakaba 1.78 ## NOTE: This state should never be reached.
1852 wakaba 1.77 !!!cp (111);
1853 wakaba 1.1 }
1854     } else {
1855     die "$0: $self->{current_token}->{type}: Unknown token type";
1856     }
1857 wakaba 1.57 $self->{state} = DATA_STATE;
1858 wakaba 1.1 !!!next-input-character;
1859    
1860     !!!emit ($self->{current_token}); # start tag or end tag
1861    
1862     redo A;
1863 wakaba 1.76 } elsif ($self->{next_char} == -1) {
1864 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
1865 wakaba 1.55 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1866 wakaba 1.77 !!!cp (112);
1867 wakaba 1.1 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1868 wakaba 1.55 } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1869 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1870 wakaba 1.1 if ($self->{current_token}->{attributes}) {
1871 wakaba 1.77 !!!cp (113);
1872 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1873 wakaba 1.77 } else {
1874 wakaba 1.78 ## NOTE: This state should never be reached.
1875 wakaba 1.77 !!!cp (114);
1876 wakaba 1.1 }
1877     } else {
1878     die "$0: $self->{current_token}->{type}: Unknown token type";
1879     }
1880 wakaba 1.57 $self->{state} = DATA_STATE;
1881 wakaba 1.1 ## reconsume
1882    
1883     !!!emit ($self->{current_token}); # start tag or end tag
1884    
1885     redo A;
1886     } else {
1887 wakaba 1.72 if ({
1888     0x0022 => 1, # "
1889     0x0027 => 1, # '
1890     0x003D => 1, # =
1891 wakaba 1.76 }->{$self->{next_char}}) {
1892 wakaba 1.77 !!!cp (115);
1893 wakaba 1.72 !!!parse-error (type => 'bad attribute value');
1894 wakaba 1.77 } else {
1895     !!!cp (116);
1896 wakaba 1.72 }
1897 wakaba 1.76 $self->{current_attribute}->{value} .= chr ($self->{next_char});
1898 wakaba 1.1 ## Stay in the state
1899     !!!next-input-character;
1900     redo A;
1901     }
1902 wakaba 1.72 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1903 wakaba 1.76 if ($self->{next_char} == 0x0009 or # HT
1904     $self->{next_char} == 0x000A or # LF
1905     $self->{next_char} == 0x000B or # VT
1906     $self->{next_char} == 0x000C or # FF
1907     $self->{next_char} == 0x0020) { # SP
1908 wakaba 1.77 !!!cp (118);
1909 wakaba 1.72 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1910     !!!next-input-character;
1911     redo A;
1912 wakaba 1.76 } elsif ($self->{next_char} == 0x003E) { # >
1913 wakaba 1.72 if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1914 wakaba 1.77 !!!cp (119);
1915 wakaba 1.72 $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1916     } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1917     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1918     if ($self->{current_token}->{attributes}) {
1919 wakaba 1.77 !!!cp (120);
1920 wakaba 1.72 !!!parse-error (type => 'end tag attribute');
1921 wakaba 1.77 } else {
1922 wakaba 1.78 ## NOTE: This state should never be reached.
1923 wakaba 1.77 !!!cp (121);
1924 wakaba 1.72 }
1925     } else {
1926     die "$0: $self->{current_token}->{type}: Unknown token type";
1927     }
1928     $self->{state} = DATA_STATE;
1929     !!!next-input-character;
1930    
1931     !!!emit ($self->{current_token}); # start tag or end tag
1932    
1933     redo A;
1934 wakaba 1.76 } elsif ($self->{next_char} == 0x002F) { # /
1935 wakaba 1.125 !!!cp (122);
1936     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1937 wakaba 1.72 !!!next-input-character;
1938 wakaba 1.125 redo A;
1939 wakaba 1.141 } elsif ($self->{next_char} == -1) {
1940     !!!parse-error (type => 'unclosed tag');
1941     if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1942     !!!cp (122.3);
1943     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1944     } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1945     if ($self->{current_token}->{attributes}) {
1946     !!!cp (122.1);
1947     !!!parse-error (type => 'end tag attribute');
1948     } else {
1949     ## NOTE: This state should never be reached.
1950     !!!cp (122.2);
1951     }
1952     } else {
1953     die "$0: $self->{current_token}->{type}: Unknown token type";
1954     }
1955     $self->{state} = DATA_STATE;
1956     ## Reconsume.
1957     !!!emit ($self->{current_token}); # start tag or end tag
1958     redo A;
1959 wakaba 1.125 } else {
1960     !!!cp ('124.1');
1961     !!!parse-error (type => 'no space between attributes');
1962     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1963     ## reconsume
1964     redo A;
1965     }
1966     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1967     if ($self->{next_char} == 0x003E) { # >
1968     if ($self->{current_token}->{type} == END_TAG_TOKEN) {
1969     !!!cp ('124.2');
1970     !!!parse-error (type => 'nestc', token => $self->{current_token});
1971     ## TODO: Different type than slash in start tag
1972     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1973     if ($self->{current_token}->{attributes}) {
1974     !!!cp ('124.4');
1975     !!!parse-error (type => 'end tag attribute');
1976     } else {
1977     !!!cp ('124.5');
1978     }
1979     ## TODO: Test |<title></title/>|
1980 wakaba 1.72 } else {
1981 wakaba 1.125 !!!cp ('124.3');
1982     $self->{self_closing} = 1;
1983 wakaba 1.72 }
1984 wakaba 1.125
1985     $self->{state} = DATA_STATE;
1986     !!!next-input-character;
1987    
1988     !!!emit ($self->{current_token}); # start tag or end tag
1989    
1990 wakaba 1.72 redo A;
1991 wakaba 1.141 } elsif ($self->{next_char} == -1) {
1992     !!!parse-error (type => 'unclosed tag');
1993     if ($self->{current_token}->{type} == START_TAG_TOKEN) {
1994     !!!cp (124.7);
1995     $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
1996     } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
1997     if ($self->{current_token}->{attributes}) {
1998     !!!cp (124.5);
1999     !!!parse-error (type => 'end tag attribute');
2000     } else {
2001     ## NOTE: This state should never be reached.
2002     !!!cp (124.6);
2003     }
2004     } else {
2005     die "$0: $self->{current_token}->{type}: Unknown token type";
2006     }
2007     $self->{state} = DATA_STATE;
2008     ## Reconsume.
2009     !!!emit ($self->{current_token}); # start tag or end tag
2010     redo A;
2011 wakaba 1.72 } else {
2012 wakaba 1.125 !!!cp ('124.4');
2013     !!!parse-error (type => 'nestc');
2014     ## TODO: This error type is wrong.
2015 wakaba 1.72 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2016 wakaba 1.125 ## Reconsume.
2017 wakaba 1.72 redo A;
2018     }
2019 wakaba 1.57 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2020 wakaba 1.1 ## (only happen if PCDATA state)
2021 wakaba 1.167
2022     ## NOTE: Unlike the spec's "bogus comment state", this implementation
2023     ## consumes characters one at a time.
2024 wakaba 1.1
2025 wakaba 1.167 if ($self->{next_char} == 0x003E) { # >
2026     !!!cp (124);
2027     $self->{state} = DATA_STATE;
2028     !!!next-input-character;
2029 wakaba 1.1
2030 wakaba 1.167 !!!emit ($self->{current_token}); # comment
2031     redo A;
2032     } elsif ($self->{next_char} == -1) {
2033     !!!cp (125);
2034     $self->{state} = DATA_STATE;
2035     ## reconsume
2036 wakaba 1.1
2037 wakaba 1.167 !!!emit ($self->{current_token}); # comment
2038     redo A;
2039     } else {
2040     !!!cp (126);
2041     $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
2042     ## Stay in the state.
2043     !!!next-input-character;
2044     redo A;
2045     }
2046 wakaba 1.57 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2047 wakaba 1.1 ## (only happen if PCDATA state)
2048    
2049 wakaba 1.76 if ($self->{next_char} == 0x002D) { # -
2050 wakaba 1.163 !!!cp (133);
2051     $self->{state} = MD_HYPHEN_STATE;
2052 wakaba 1.1 !!!next-input-character;
2053 wakaba 1.163 redo A;
2054 wakaba 1.76 } elsif ($self->{next_char} == 0x0044 or # D
2055     $self->{next_char} == 0x0064) { # d
2056 wakaba 1.163 ## ASCII case-insensitive.
2057     !!!cp (130);
2058     $self->{state} = MD_DOCTYPE_STATE;
2059     $self->{state_keyword} = chr $self->{next_char};
2060 wakaba 1.1 !!!next-input-character;
2061 wakaba 1.163 redo A;
2062 wakaba 1.127 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2063     $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
2064     $self->{next_char} == 0x005B) { # [
2065 wakaba 1.163 !!!cp (135.4);
2066     $self->{state} = MD_CDATA_STATE;
2067     $self->{state_keyword} = '[';
2068 wakaba 1.127 !!!next-input-character;
2069 wakaba 1.163 redo A;
2070 wakaba 1.77 } else {
2071     !!!cp (136);
2072 wakaba 1.1 }
2073    
2074 wakaba 1.163 !!!parse-error (type => 'bogus comment',
2075     line => $self->{line_prev},
2076     column => $self->{column_prev} - 1);
2077     ## Reconsume.
2078 wakaba 1.57 $self->{state} = BOGUS_COMMENT_STATE;
2079 wakaba 1.112 $self->{current_token} = {type => COMMENT_TOKEN, data => '',
2080 wakaba 1.163 line => $self->{line_prev},
2081     column => $self->{column_prev} - 1,
2082 wakaba 1.118 };
2083 wakaba 1.1 redo A;
2084 wakaba 1.163 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2085     if ($self->{next_char} == 0x002D) { # -
2086     !!!cp (127);
2087     $self->{current_token} = {type => COMMENT_TOKEN, data => '',
2088     line => $self->{line_prev},
2089     column => $self->{column_prev} - 2,
2090     };
2091     $self->{state} = COMMENT_START_STATE;
2092     !!!next-input-character;
2093     redo A;
2094     } else {
2095     !!!cp (128);
2096     !!!parse-error (type => 'bogus comment',
2097     line => $self->{line_prev},
2098     column => $self->{column_prev} - 2);
2099     $self->{state} = BOGUS_COMMENT_STATE;
2100     ## Reconsume.
2101     $self->{current_token} = {type => COMMENT_TOKEN,
2102     data => '-',
2103     line => $self->{line_prev},
2104     column => $self->{column_prev} - 2,
2105     };
2106     redo A;
2107     }
2108     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2109     ## ASCII case-insensitive.
2110     if ($self->{next_char} == [
2111     undef,
2112     0x004F, # O
2113     0x0043, # C
2114     0x0054, # T
2115     0x0059, # Y
2116     0x0050, # P
2117     ]->[length $self->{state_keyword}] or
2118     $self->{next_char} == [
2119     undef,
2120     0x006F, # o
2121     0x0063, # c
2122     0x0074, # t
2123     0x0079, # y
2124     0x0070, # p
2125     ]->[length $self->{state_keyword}]) {
2126     !!!cp (131);
2127     ## Stay in the state.
2128     $self->{state_keyword} .= chr $self->{next_char};
2129     !!!next-input-character;
2130     redo A;
2131     } elsif ((length $self->{state_keyword}) == 6 and
2132     ($self->{next_char} == 0x0045 or # E
2133     $self->{next_char} == 0x0065)) { # e
2134     !!!cp (129);
2135     $self->{state} = DOCTYPE_STATE;
2136     $self->{current_token} = {type => DOCTYPE_TOKEN,
2137     quirks => 1,
2138     line => $self->{line_prev},
2139     column => $self->{column_prev} - 7,
2140     };
2141     !!!next-input-character;
2142     redo A;
2143     } else {
2144     !!!cp (132);
2145     !!!parse-error (type => 'bogus comment',
2146     line => $self->{line_prev},
2147     column => $self->{column_prev} - 1 - length $self->{state_keyword});
2148     $self->{state} = BOGUS_COMMENT_STATE;
2149     ## Reconsume.
2150     $self->{current_token} = {type => COMMENT_TOKEN,
2151     data => $self->{state_keyword},
2152     line => $self->{line_prev},
2153     column => $self->{column_prev} - 1 - length $self->{state_keyword},
2154     };
2155     redo A;
2156     }
2157     } elsif ($self->{state} == MD_CDATA_STATE) {
2158     if ($self->{next_char} == {
2159     '[' => 0x0043, # C
2160     '[C' => 0x0044, # D
2161     '[CD' => 0x0041, # A
2162     '[CDA' => 0x0054, # T
2163     '[CDAT' => 0x0041, # A
2164     }->{$self->{state_keyword}}) {
2165     !!!cp (135.1);
2166     ## Stay in the state.
2167     $self->{state_keyword} .= chr $self->{next_char};
2168     !!!next-input-character;
2169     redo A;
2170     } elsif ($self->{state_keyword} eq '[CDATA' and
2171     $self->{next_char} == 0x005B) { # [
2172     !!!cp (135.2);
2173 wakaba 1.165 $self->{current_token} = {type => CHARACTER_TOKEN,
2174     data => '',
2175     line => $self->{line_prev},
2176     column => $self->{column_prev} - 7};
2177     $self->{state} = CDATA_SECTION_STATE;
2178 wakaba 1.163 !!!next-input-character;
2179     redo A;
2180     } else {
2181     !!!cp (135.3);
2182     !!!parse-error (type => 'bogus comment',
2183     line => $self->{line_prev},
2184     column => $self->{column_prev} - 1 - length $self->{state_keyword});
2185     $self->{state} = BOGUS_COMMENT_STATE;
2186     ## Reconsume.
2187     $self->{current_token} = {type => COMMENT_TOKEN,
2188     data => $self->{state_keyword},
2189     line => $self->{line_prev},
2190     column => $self->{column_prev} - 1 - length $self->{state_keyword},
2191     };
2192     redo A;
2193     }
2194 wakaba 1.57 } elsif ($self->{state} == COMMENT_START_STATE) {
2195 wakaba 1.76 if ($self->{next_char} == 0x002D) { # -
2196 wakaba 1.77 !!!cp (137);
2197 wakaba 1.57 $self->{state} = COMMENT_START_DASH_STATE;
2198 wakaba 1.23 !!!next-input-character;
2199     redo A;
2200 wakaba 1.76 } elsif ($self->{next_char} == 0x003E) { # >
2201 wakaba 1.77 !!!cp (138);
2202 wakaba 1.23 !!!parse-error (type => 'bogus comment');
2203 wakaba 1.57 $self->{state} = DATA_STATE;
2204 wakaba 1.23 !!!next-input-character;
2205    
2206     !!!emit ($self->{current_token}); # comment
2207    
2208     redo A;
2209 wakaba 1.76 } elsif ($self->{next_char} == -1) {
2210 wakaba 1.77 !!!cp (139);
2211 wakaba 1.23 !!!parse-error (type => 'unclosed comment');
2212 wakaba 1.57 $self->{state} = DATA_STATE;
2213 wakaba 1.23 ## reconsume
2214    
2215     !!!emit ($self->{current_token}); # comment
2216    
2217     redo A;
2218     } else {
2219 wakaba 1.77 !!!cp (140);
2220 wakaba 1.23 $self->{current_token}->{data} # comment
2221 wakaba 1.76 .= chr ($self->{next_char});
2222 wakaba 1.57 $self->{state} = COMMENT_STATE;
2223 wakaba 1.23 !!!next-input-character;
2224     redo A;
2225     }
2226 wakaba 1.57 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2227 wakaba 1.76 if ($self->{next_char} == 0x002D) { # -
2228 wakaba 1.77 !!!cp (141);
2229 wakaba 1.57 $self->{state} = COMMENT_END_STATE;
2230 wakaba 1.23 !!!next-input-character;
2231     redo A;
2232 wakaba 1.76 } elsif ($self->{next_char} == 0x003E) { # >
2233 wakaba 1.77 !!!cp (142);
2234 wakaba 1.23 !!!parse-error (type => 'bogus comment');
2235 wakaba 1.57 $self->{state} = DATA_STATE;
2236 wakaba 1.23 !!!next-input-character;
2237    
2238     !!!emit ($self->{current_token}); # comment
2239    
2240     redo A;
2241 wakaba 1.76 } elsif ($self->{next_char} == -1) {
2242 wakaba 1.77 !!!cp (143);
2243 wakaba 1.23 !!!parse-error (type => 'unclosed comment');
2244 wakaba 1.57 $self->{state} = DATA_STATE;
2245 wakaba 1.23 ## reconsume
2246    
2247     !!!emit ($self->{current_token}); # comment
2248    
2249     redo A;
2250     } else {
2251 wakaba 1.77 !!!cp (144);
2252 wakaba 1.23 $self->{current_token}->{data} # comment
2253 wakaba 1.76 .= '-' . chr ($self->{next_char});
2254 wakaba 1.57 $self->{state} = COMMENT_STATE;
2255 wakaba 1.23 !!!next-input-character;
2256     redo A;
2257     }
2258 wakaba 1.57 } elsif ($self->{state} == COMMENT_STATE) {
2259 wakaba 1.76 if ($self->{next_char} == 0x002D) { # -
2260 wakaba 1.77 !!!cp (145);
2261 wakaba 1.57 $self->{state} = COMMENT_END_DASH_STATE;
2262 wakaba 1.1 !!!next-input-character;
2263     redo A;
2264 wakaba 1.76 } elsif ($self->{next_char} == -1) {
2265 wakaba 1.77 !!!cp (146);
2266 wakaba 1.3 !!!parse-error (type => 'unclosed comment');
2267 wakaba 1.57 $self->{state} = DATA_STATE;
2268 wakaba 1.1 ## reconsume
2269    
2270     !!!emit ($self->{current_token}); # comment
2271    
2272     redo A;
2273     } else {
2274 wakaba 1.77 !!!cp (147);
2275 wakaba 1.76 $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
2276 wakaba 1.1 ## Stay in the state
2277     !!!next-input-character;
2278     redo A;
2279     }
2280 wakaba 1.57 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2281 wakaba 1.76 if ($self->{next_char} == 0x002D) { # -
2282 wakaba 1.77 !!!cp (148);
2283 wakaba 1.57 $self->{state} = COMMENT_END_STATE;
2284 wakaba 1.1 !!!next-input-character;
2285     redo A;
2286 wakaba 1.76 } elsif ($self->{next_char} == -1) {
2287 wakaba 1.77 !!!cp (149);
2288 wakaba 1.3 !!!parse-error (type => 'unclosed comment');
2289 wakaba 1.57 $self->{state} = DATA_STATE;
2290 wakaba 1.1 ## reconsume
2291    
2292     !!!emit ($self->{current_token}); # comment
2293    
2294     redo A;
2295     } else {
2296 wakaba 1.77 !!!cp (150);
2297 wakaba 1.76 $self->{current_token}->{data} .= '-' . chr ($self->{next_char}); # comment
2298 wakaba 1.57 $self->{state} = COMMENT_STATE;
2299 wakaba 1.1 !!!next-input-character;
2300     redo A;
2301     }
2302 wakaba 1.57 } elsif ($self->{state} == COMMENT_END_STATE) {
2303 wakaba 1.76 if ($self->{next_char} == 0x003E) { # >
2304 wakaba 1.77 !!!cp (151);
2305 wakaba 1.57 $self->{state} = DATA_STATE;
2306 wakaba 1.1 !!!next-input-character;
2307    
2308     !!!emit ($self->{current_token}); # comment
2309    
2310     redo A;
2311 wakaba 1.76 } elsif ($self->{next_char} == 0x002D) { # -
2312 wakaba 1.77 !!!cp (152);
2313 wakaba 1.114 !!!parse-error (type => 'dash in comment',
2314     line => $self->{line_prev},
2315     column => $self->{column_prev});
2316 wakaba 1.1 $self->{current_token}->{data} .= '-'; # comment
2317     ## Stay in the state
2318     !!!next-input-character;
2319     redo A;
2320 wakaba 1.76 } elsif ($self->{next_char} == -1) {
2321 wakaba 1.77 !!!cp (153);
2322 wakaba 1.3 !!!parse-error (type => 'unclosed comment');
2323 wakaba 1.57 $self->{state} = DATA_STATE;
2324 wakaba 1.1 ## reconsume
2325    
2326     !!!emit ($self->{current_token}); # comment
2327    
2328     redo A;
2329     } else {
2330 wakaba 1.77 !!!cp (154);
2331 wakaba 1.114 !!!parse-error (type => 'dash in comment',
2332     line => $self->{line_prev},
2333     column => $self->{column_prev});
2334 wakaba 1.76 $self->{current_token}->{data} .= '--' . chr ($self->{next_char}); # comment
2335 wakaba 1.57 $self->{state} = COMMENT_STATE;
2336 wakaba 1.1 !!!next-input-character;
2337     redo A;
2338     }
2339 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_STATE) {
2340 wakaba 1.76 if ($self->{next_char} == 0x0009 or # HT
2341     $self->{next_char} == 0x000A or # LF
2342     $self->{next_char} == 0x000B or # VT
2343     $self->{next_char} == 0x000C or # FF
2344     $self->{next_char} == 0x0020) { # SP
2345 wakaba 1.77 !!!cp (155);
2346 wakaba 1.57 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2347 wakaba 1.1 !!!next-input-character;
2348     redo A;
2349     } else {
2350 wakaba 1.77 !!!cp (156);
2351 wakaba 1.3 !!!parse-error (type => 'no space before DOCTYPE name');
2352 wakaba 1.57 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2353 wakaba 1.1 ## reconsume
2354     redo A;
2355     }
2356 wakaba 1.57 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2357 wakaba 1.76 if ($self->{next_char} == 0x0009 or # HT
2358     $self->{next_char} == 0x000A or # LF
2359     $self->{next_char} == 0x000B or # VT
2360     $self->{next_char} == 0x000C or # FF
2361     $self->{next_char} == 0x0020) { # SP
2362 wakaba 1.77 !!!cp (157);
2363 wakaba 1.1 ## Stay in the state
2364     !!!next-input-character;
2365     redo A;
2366 wakaba 1.76 } elsif ($self->{next_char} == 0x003E) { # >
2367 wakaba 1.77 !!!cp (158);
2368 wakaba 1.3 !!!parse-error (type => 'no DOCTYPE name');
2369 wakaba 1.57 $self->{state} = DATA_STATE;
2370 wakaba 1.1 !!!next-input-character;
2371    
2372 wakaba 1.112 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2373 wakaba 1.1
2374     redo A;
2375 wakaba 1.77 } elsif ($self->{next_char} == -1) {
2376     !!!cp (159);
2377 wakaba 1.3 !!!parse-error (type => 'no DOCTYPE name');
2378 wakaba 1.57 $self->{state} = DATA_STATE;
2379 wakaba 1.1 ## reconsume
2380    
2381 wakaba 1.112 !!!emit ($self->{current_token}); # DOCTYPE (quirks)
2382 wakaba 1.1
2383     redo A;
2384     } else {
2385 wakaba 1.77 !!!cp (160);
2386 wakaba 1.112 $self->{current_token}->{name} = chr $self->{next_char};
2387     delete $self->{current_token}->{quirks};
2388 wakaba 1.4 ## ISSUE: "Set the token's name name to the" in the spec
2389 wakaba 1.57 $self->{state} = DOCTYPE_NAME_STATE;
2390 wakaba 1.1 !!!next-input-character;
2391     redo A;
2392     }
2393 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2394 wakaba 1.18 ## ISSUE: Redundant "First," in the spec.
2395 wakaba 1.76 if ($self->{next_char} == 0x0009 or # HT
2396     $self->{next_char} == 0x000A or # LF
2397     $self->{next_char} == 0x000B or # VT
2398     $self->{next_char} == 0x000C or # FF
2399     $self->{next_char} == 0x0020) { # SP
2400 wakaba 1.77 !!!cp (161);
2401 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2402 wakaba 1.1 !!!next-input-character;
2403     redo A;
2404 wakaba 1.76 } elsif ($self->{next_char} == 0x003E) { # >
2405 wakaba 1.77 !!!cp (162);
2406 wakaba 1.57 $self->{state} = DATA_STATE;
2407 wakaba 1.1 !!!next-input-character;
2408    
2409     !!!emit ($self->{current_token}); # DOCTYPE
2410    
2411     redo A;
2412 wakaba 1.76 } elsif ($self->{next_char} == -1) {
2413 wakaba 1.77 !!!cp (163);
2414 wakaba 1.3 !!!parse-error (type => 'unclosed DOCTYPE');
2415 wakaba 1.57 $self->{state} = DATA_STATE;
2416 wakaba 1.1 ## reconsume
2417    
2418 wakaba 1.75 $self->{current_token}->{quirks} = 1;
2419 wakaba 1.18 !!!emit ($self->{current_token}); # DOCTYPE
2420 wakaba 1.1
2421     redo A;
2422     } else {
2423 wakaba 1.77 !!!cp (164);
2424 wakaba 1.1 $self->{current_token}->{name}
2425 wakaba 1.76 .= chr ($self->{next_char}); # DOCTYPE
2426 wakaba 1.1 ## Stay in the state
2427     !!!next-input-character;
2428     redo A;
2429     }
2430 wakaba 1.57 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2431 wakaba 1.76 if ($self->{next_char} == 0x0009 or # HT
2432     $self->{next_char} == 0x000A or # LF
2433     $self->{next_char} == 0x000B or # VT
2434     $self->{next_char} == 0x000C or # FF
2435     $self->{next_char} == 0x0020) { # SP
2436 wakaba 1.77 !!!cp (165);
2437 wakaba 1.1 ## Stay in the state
2438     !!!next-input-character;
2439     redo A;
2440 wakaba 1.76 } elsif ($self->{next_char} == 0x003E) { # >
2441 wakaba 1.77 !!!cp (166);
2442 wakaba 1.57 $self->{state} = DATA_STATE;
2443 wakaba 1.1 !!!next-input-character;
2444    
2445     !!!emit ($self->{current_token}); # DOCTYPE
2446    
2447     redo A;
2448 wakaba 1.76 } elsif ($self->{next_char} == -1) {
2449 wakaba 1.77 !!!cp (167);
2450 wakaba 1.3 !!!parse-error (type => 'unclosed DOCTYPE');
2451 wakaba 1.57 $self->{state} = DATA_STATE;
2452 wakaba 1.1 ## reconsume
2453    
2454 wakaba 1.75 $self->{current_token}->{quirks} = 1;
2455 wakaba 1.18 !!!emit ($self->{current_token}); # DOCTYPE
2456    
2457     redo A;
2458 wakaba 1.76 } elsif ($self->{next_char} == 0x0050 or # P
2459     $self->{next_char} == 0x0070) { # p
2460 wakaba 1.166 $self->{state} = PUBLIC_STATE;
2461     $self->{state_keyword} = chr $self->{next_char};
2462 wakaba 1.18 !!!next-input-character;
2463 wakaba 1.166 redo A;
2464 wakaba 1.76 } elsif ($self->{next_char} == 0x0053 or # S
2465     $self->{next_char} == 0x0073) { # s
2466 wakaba 1.166 $self->{state} = SYSTEM_STATE;
2467     $self->{state_keyword} = chr $self->{next_char};
2468 wakaba 1.18 !!!next-input-character;
2469 wakaba 1.166 redo A;
2470 wakaba 1.18 } else {
2471 wakaba 1.77 !!!cp (180);
2472 wakaba 1.166 !!!parse-error (type => 'string after DOCTYPE name');
2473     $self->{current_token}->{quirks} = 1;
2474    
2475     $self->{state} = BOGUS_DOCTYPE_STATE;
2476 wakaba 1.18 !!!next-input-character;
2477 wakaba 1.166 redo A;
2478 wakaba 1.18 }
2479 wakaba 1.166 } elsif ($self->{state} == PUBLIC_STATE) {
2480     ## ASCII case-insensitive
2481     if ($self->{next_char} == [
2482     undef,
2483     0x0055, # U
2484     0x0042, # B
2485     0x004C, # L
2486     0x0049, # I
2487     ]->[length $self->{state_keyword}] or
2488     $self->{next_char} == [
2489     undef,
2490     0x0075, # u
2491     0x0062, # b
2492     0x006C, # l
2493     0x0069, # i
2494     ]->[length $self->{state_keyword}]) {
2495     !!!cp (175);
2496     ## Stay in the state.
2497     $self->{state_keyword} .= chr $self->{next_char};
2498     !!!next-input-character;
2499     redo A;
2500     } elsif ((length $self->{state_keyword}) == 5 and
2501     ($self->{next_char} == 0x0043 or # C
2502     $self->{next_char} == 0x0063)) { # c
2503     !!!cp (168);
2504     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2505     !!!next-input-character;
2506     redo A;
2507     } else {
2508     !!!cp (169);
2509     !!!parse-error (type => 'string after DOCTYPE name',
2510     line => $self->{line_prev},
2511     column => $self->{column_prev} + 1 - length $self->{state_keyword});
2512     $self->{current_token}->{quirks} = 1;
2513 wakaba 1.18
2514 wakaba 1.166 $self->{state} = BOGUS_DOCTYPE_STATE;
2515     ## Reconsume.
2516     redo A;
2517     }
2518     } elsif ($self->{state} == SYSTEM_STATE) {
2519     ## ASCII case-insensitive
2520     if ($self->{next_char} == [
2521     undef,
2522     0x0059, # Y
2523     0x0053, # S
2524     0x0054, # T
2525     0x0045, # E
2526     ]->[length $self->{state_keyword}] or
2527     $self->{next_char} == [
2528     undef,
2529     0x0079, # y
2530     0x0073, # s
2531     0x0074, # t
2532     0x0065, # e
2533     ]->[length $self->{state_keyword}]) {
2534     !!!cp (170);
2535     ## Stay in the state.
2536     $self->{state_keyword} .= chr $self->{next_char};
2537     !!!next-input-character;
2538     redo A;
2539     } elsif ((length $self->{state_keyword}) == 5 and
2540     ($self->{next_char} == 0x004D or # M
2541     $self->{next_char} == 0x006D)) { # m
2542     !!!cp (171);
2543     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2544     !!!next-input-character;
2545     redo A;
2546     } else {
2547     !!!cp (172);
2548     !!!parse-error (type => 'string after DOCTYPE name',
2549     line => $self->{line_prev},
2550     column => $self->{column_prev} + 1 - length $self->{state_keyword});
2551     $self->{current_token}->{quirks} = 1;
2552 wakaba 1.73
2553 wakaba 1.166 $self->{state} = BOGUS_DOCTYPE_STATE;
2554     ## Reconsume.
2555     redo A;
2556     }
2557 wakaba 1.57 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2558 wakaba 1.18 if ({
2559     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2560     #0x000D => 1, # HT, LF, VT, FF, SP, CR
2561 wakaba 1.76 }->{$self->{next_char}}) {
2562 wakaba 1.77 !!!cp (181);
2563 wakaba 1.18 ## Stay in the state
2564     !!!next-input-character;
2565     redo A;
2566 wakaba 1.76 } elsif ($self->{next_char} eq 0x0022) { # "
2567 wakaba 1.77 !!!cp (182);
2568 wakaba 1.18 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2569 wakaba 1.57 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2570 wakaba 1.18 !!!next-input-character;
2571     redo A;
2572 wakaba 1.76 } elsif ($self->{next_char} eq 0x0027) { # '
2573 wakaba 1.77 !!!cp (183);
2574 wakaba 1.18 $self->{current_token}->{public_identifier} = ''; # DOCTYPE
2575 wakaba 1.57 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2576 wakaba 1.18 !!!next-input-character;
2577     redo A;
2578 wakaba 1.76 } elsif ($self->{next_char} eq 0x003E) { # >
2579 wakaba 1.77 !!!cp (184);
2580 wakaba 1.18 !!!parse-error (type => 'no PUBLIC literal');
2581    
2582 wakaba 1.57 $self->{state} = DATA_STATE;
2583 wakaba 1.18 !!!next-input-character;
2584    
2585 wakaba 1.75 $self->{current_token}->{quirks} = 1;
2586 wakaba 1.18 !!!emit ($self->{current_token}); # DOCTYPE
2587    
2588     redo A;
2589 wakaba 1.76 } elsif ($self->{next_char} == -1) {
2590 wakaba 1.77 !!!cp (185);
2591 wakaba 1.18 !!!parse-error (type => 'unclosed DOCTYPE');
2592    
2593 wakaba 1.57 $self->{state} = DATA_STATE;
2594 wakaba 1.18 ## reconsume
2595    
2596 wakaba 1.75 $self->{current_token}->{quirks} = 1;
2597 wakaba 1.18 !!!emit ($self->{current_token}); # DOCTYPE
2598    
2599     redo A;
2600     } else {
2601 wakaba 1.77 !!!cp (186);
2602 wakaba 1.18 !!!parse-error (type => 'string after PUBLIC');
2603 wakaba 1.75 $self->{current_token}->{quirks} = 1;
2604 wakaba 1.73
2605 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
2606 wakaba 1.18 !!!next-input-character;
2607     redo A;
2608     }
2609 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2610 wakaba 1.76 if ($self->{next_char} == 0x0022) { # "
2611 wakaba 1.77 !!!cp (187);
2612 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2613 wakaba 1.18 !!!next-input-character;
2614     redo A;
2615 wakaba 1.76 } elsif ($self->{next_char} == 0x003E) { # >
2616 wakaba 1.77 !!!cp (188);
2617 wakaba 1.69 !!!parse-error (type => 'unclosed PUBLIC literal');
2618    
2619     $self->{state} = DATA_STATE;
2620     !!!next-input-character;
2621    
2622 wakaba 1.75 $self->{current_token}->{quirks} = 1;
2623 wakaba 1.69 !!!emit ($self->{current_token}); # DOCTYPE
2624    
2625     redo A;
2626 wakaba 1.76 } elsif ($self->{next_char} == -1) {
2627 wakaba 1.77 !!!cp (189);
2628 wakaba 1.18 !!!parse-error (type => 'unclosed PUBLIC literal');
2629    
2630 wakaba 1.57 $self->{state} = DATA_STATE;
2631 wakaba 1.18 ## reconsume
2632    
2633 wakaba 1.75 $self->{current_token}->{quirks} = 1;
2634 wakaba 1.18 !!!emit ($self->{current_token}); # DOCTYPE
2635    
2636     redo A;
2637     } else {
2638 wakaba 1.77 !!!cp (190);
2639 wakaba 1.18 $self->{current_token}->{public_identifier} # DOCTYPE
2640 wakaba 1.76 .= chr $self->{next_char};
2641 wakaba 1.18 ## Stay in the state
2642     !!!next-input-character;
2643     redo A;
2644     }
2645 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2646 wakaba 1.76 if ($self->{next_char} == 0x0027) { # '
2647 wakaba 1.77 !!!cp (191);
2648 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2649 wakaba 1.18 !!!next-input-character;
2650     redo A;
2651 wakaba 1.76 } elsif ($self->{next_char} == 0x003E) { # >
2652 wakaba 1.77 !!!cp (192);
2653 wakaba 1.69 !!!parse-error (type => 'unclosed PUBLIC literal');
2654    
2655     $self->{state} = DATA_STATE;
2656     !!!next-input-character;
2657    
2658 wakaba 1.75 $self->{current_token}->{quirks} = 1;
2659 wakaba 1.69 !!!emit ($self->{current_token}); # DOCTYPE
2660    
2661     redo A;
2662 wakaba 1.76 } elsif ($self->{next_char} == -1) {
2663 wakaba 1.77 !!!cp (193);
2664 wakaba 1.18 !!!parse-error (type => 'unclosed PUBLIC literal');
2665    
2666 wakaba 1.57 $self->{state} = DATA_STATE;
2667 wakaba 1.18 ## reconsume
2668    
2669 wakaba 1.75 $self->{current_token}->{quirks} = 1;
2670 wakaba 1.18 !!!emit ($self->{current_token}); # DOCTYPE
2671    
2672     redo A;
2673     } else {
2674 wakaba 1.77 !!!cp (194);
2675 wakaba 1.18 $self->{current_token}->{public_identifier} # DOCTYPE
2676 wakaba 1.76 .= chr $self->{next_char};
2677 wakaba 1.18 ## Stay in the state
2678     !!!next-input-character;
2679     redo A;
2680     }
2681 wakaba 1.57 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2682 wakaba 1.18 if ({
2683     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2684     #0x000D => 1, # HT, LF, VT, FF, SP, CR
2685 wakaba 1.76 }->{$self->{next_char}}) {
2686 wakaba 1.77 !!!cp (195);
2687 wakaba 1.18 ## Stay in the state
2688     !!!next-input-character;
2689     redo A;
2690 wakaba 1.76 } elsif ($self->{next_char} == 0x0022) { # "
2691 wakaba 1.77 !!!cp (196);
2692 wakaba 1.18 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2693 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2694 wakaba 1.18 !!!next-input-character;
2695     redo A;
2696 wakaba 1.76 } elsif ($self->{next_char} == 0x0027) { # '
2697 wakaba 1.77 !!!cp (197);
2698 wakaba 1.18 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2699 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2700 wakaba 1.18 !!!next-input-character;
2701     redo A;
2702 wakaba 1.76 } elsif ($self->{next_char} == 0x003E) { # >
2703 wakaba 1.77 !!!cp (198);
2704 wakaba 1.57 $self->{state} = DATA_STATE;
2705 wakaba 1.18 !!!next-input-character;
2706    
2707     !!!emit ($self->{current_token}); # DOCTYPE
2708    
2709     redo A;
2710 wakaba 1.76 } elsif ($self->{next_char} == -1) {
2711 wakaba 1.77 !!!cp (199);
2712 wakaba 1.18 !!!parse-error (type => 'unclosed DOCTYPE');
2713    
2714 wakaba 1.57 $self->{state} = DATA_STATE;
2715 wakaba 1.26 ## reconsume
2716 wakaba 1.18
2717 wakaba 1.75 $self->{current_token}->{quirks} = 1;
2718 wakaba 1.18 !!!emit ($self->{current_token}); # DOCTYPE
2719    
2720     redo A;
2721     } else {
2722 wakaba 1.77 !!!cp (200);
2723 wakaba 1.18 !!!parse-error (type => 'string after PUBLIC literal');
2724 wakaba 1.75 $self->{current_token}->{quirks} = 1;
2725 wakaba 1.73
2726 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
2727 wakaba 1.18 !!!next-input-character;
2728     redo A;
2729     }
2730 wakaba 1.57 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2731 wakaba 1.18 if ({
2732     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2733     #0x000D => 1, # HT, LF, VT, FF, SP, CR
2734 wakaba 1.76 }->{$self->{next_char}}) {
2735 wakaba 1.77 !!!cp (201);
2736 wakaba 1.18 ## Stay in the state
2737     !!!next-input-character;
2738     redo A;
2739 wakaba 1.76 } elsif ($self->{next_char} == 0x0022) { # "
2740 wakaba 1.77 !!!cp (202);
2741 wakaba 1.18 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2742 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2743 wakaba 1.18 !!!next-input-character;
2744     redo A;
2745 wakaba 1.76 } elsif ($self->{next_char} == 0x0027) { # '
2746 wakaba 1.77 !!!cp (203);
2747 wakaba 1.18 $self->{current_token}->{system_identifier} = ''; # DOCTYPE
2748 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2749 wakaba 1.18 !!!next-input-character;
2750     redo A;
2751 wakaba 1.76 } elsif ($self->{next_char} == 0x003E) { # >
2752 wakaba 1.77 !!!cp (204);
2753 wakaba 1.18 !!!parse-error (type => 'no SYSTEM literal');
2754 wakaba 1.57 $self->{state} = DATA_STATE;
2755 wakaba 1.18 !!!next-input-character;
2756    
2757 wakaba 1.75 $self->{current_token}->{quirks} = 1;
2758 wakaba 1.18 !!!emit ($self->{current_token}); # DOCTYPE
2759    
2760     redo A;
2761 wakaba 1.76 } elsif ($self->{next_char} == -1) {
2762 wakaba 1.77 !!!cp (205);
2763 wakaba 1.18 !!!parse-error (type => 'unclosed DOCTYPE');
2764    
2765 wakaba 1.57 $self->{state} = DATA_STATE;
2766 wakaba 1.26 ## reconsume
2767 wakaba 1.18
2768 wakaba 1.75 $self->{current_token}->{quirks} = 1;
2769 wakaba 1.18 !!!emit ($self->{current_token}); # DOCTYPE
2770    
2771     redo A;
2772     } else {
2773 wakaba 1.77 !!!cp (206);
2774 wakaba 1.30 !!!parse-error (type => 'string after SYSTEM');
2775 wakaba 1.75 $self->{current_token}->{quirks} = 1;
2776 wakaba 1.73
2777 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
2778 wakaba 1.18 !!!next-input-character;
2779     redo A;
2780     }
2781 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2782 wakaba 1.76 if ($self->{next_char} == 0x0022) { # "
2783 wakaba 1.77 !!!cp (207);
2784 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2785 wakaba 1.18 !!!next-input-character;
2786     redo A;
2787 wakaba 1.76 } elsif ($self->{next_char} == 0x003E) { # >
2788 wakaba 1.77 !!!cp (208);
2789 wakaba 1.153 !!!parse-error (type => 'unclosed SYSTEM literal');
2790 wakaba 1.69
2791     $self->{state} = DATA_STATE;
2792     !!!next-input-character;
2793    
2794 wakaba 1.75 $self->{current_token}->{quirks} = 1;
2795 wakaba 1.69 !!!emit ($self->{current_token}); # DOCTYPE
2796    
2797     redo A;
2798 wakaba 1.76 } elsif ($self->{next_char} == -1) {
2799 wakaba 1.77 !!!cp (209);
2800 wakaba 1.18 !!!parse-error (type => 'unclosed SYSTEM literal');
2801    
2802 wakaba 1.57 $self->{state} = DATA_STATE;
2803 wakaba 1.18 ## reconsume
2804    
2805 wakaba 1.75 $self->{current_token}->{quirks} = 1;
2806 wakaba 1.18 !!!emit ($self->{current_token}); # DOCTYPE
2807    
2808     redo A;
2809     } else {
2810 wakaba 1.77 !!!cp (210);
2811 wakaba 1.18 $self->{current_token}->{system_identifier} # DOCTYPE
2812 wakaba 1.76 .= chr $self->{next_char};
2813 wakaba 1.18 ## Stay in the state
2814     !!!next-input-character;
2815     redo A;
2816     }
2817 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2818 wakaba 1.76 if ($self->{next_char} == 0x0027) { # '
2819 wakaba 1.77 !!!cp (211);
2820 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2821 wakaba 1.18 !!!next-input-character;
2822     redo A;
2823 wakaba 1.76 } elsif ($self->{next_char} == 0x003E) { # >
2824 wakaba 1.77 !!!cp (212);
2825 wakaba 1.153 !!!parse-error (type => 'unclosed SYSTEM literal');
2826 wakaba 1.69
2827     $self->{state} = DATA_STATE;
2828     !!!next-input-character;
2829    
2830 wakaba 1.75 $self->{current_token}->{quirks} = 1;
2831 wakaba 1.69 !!!emit ($self->{current_token}); # DOCTYPE
2832    
2833     redo A;
2834 wakaba 1.76 } elsif ($self->{next_char} == -1) {
2835 wakaba 1.77 !!!cp (213);
2836 wakaba 1.18 !!!parse-error (type => 'unclosed SYSTEM literal');
2837    
2838 wakaba 1.57 $self->{state} = DATA_STATE;
2839 wakaba 1.18 ## reconsume
2840    
2841 wakaba 1.75 $self->{current_token}->{quirks} = 1;
2842 wakaba 1.1 !!!emit ($self->{current_token}); # DOCTYPE
2843    
2844     redo A;
2845     } else {
2846 wakaba 1.77 !!!cp (214);
2847 wakaba 1.18 $self->{current_token}->{system_identifier} # DOCTYPE
2848 wakaba 1.76 .= chr $self->{next_char};
2849 wakaba 1.18 ## Stay in the state
2850     !!!next-input-character;
2851     redo A;
2852     }
2853 wakaba 1.57 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2854 wakaba 1.18 if ({
2855     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
2856     #0x000D => 1, # HT, LF, VT, FF, SP, CR
2857 wakaba 1.76 }->{$self->{next_char}}) {
2858 wakaba 1.77 !!!cp (215);
2859 wakaba 1.18 ## Stay in the state
2860     !!!next-input-character;
2861     redo A;
2862 wakaba 1.76 } elsif ($self->{next_char} == 0x003E) { # >
2863 wakaba 1.77 !!!cp (216);
2864 wakaba 1.57 $self->{state} = DATA_STATE;
2865 wakaba 1.18 !!!next-input-character;
2866    
2867     !!!emit ($self->{current_token}); # DOCTYPE
2868    
2869     redo A;
2870 wakaba 1.76 } elsif ($self->{next_char} == -1) {
2871 wakaba 1.77 !!!cp (217);
2872 wakaba 1.150 !!!parse-error (type => 'unclosed DOCTYPE');
2873 wakaba 1.57 $self->{state} = DATA_STATE;
2874 wakaba 1.26 ## reconsume
2875 wakaba 1.18
2876 wakaba 1.75 $self->{current_token}->{quirks} = 1;
2877 wakaba 1.18 !!!emit ($self->{current_token}); # DOCTYPE
2878    
2879     redo A;
2880     } else {
2881 wakaba 1.77 !!!cp (218);
2882 wakaba 1.18 !!!parse-error (type => 'string after SYSTEM literal');
2883 wakaba 1.75 #$self->{current_token}->{quirks} = 1;
2884 wakaba 1.73
2885 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
2886 wakaba 1.1 !!!next-input-character;
2887     redo A;
2888     }
2889 wakaba 1.57 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2890 wakaba 1.76 if ($self->{next_char} == 0x003E) { # >
2891 wakaba 1.77 !!!cp (219);
2892 wakaba 1.57 $self->{state} = DATA_STATE;
2893 wakaba 1.1 !!!next-input-character;
2894    
2895     !!!emit ($self->{current_token}); # DOCTYPE
2896    
2897     redo A;
2898 wakaba 1.76 } elsif ($self->{next_char} == -1) {
2899 wakaba 1.77 !!!cp (220);
2900 wakaba 1.3 !!!parse-error (type => 'unclosed DOCTYPE');
2901 wakaba 1.57 $self->{state} = DATA_STATE;
2902 wakaba 1.1 ## reconsume
2903    
2904     !!!emit ($self->{current_token}); # DOCTYPE
2905    
2906     redo A;
2907     } else {
2908 wakaba 1.77 !!!cp (221);
2909 wakaba 1.1 ## Stay in the state
2910     !!!next-input-character;
2911     redo A;
2912     }
2913 wakaba 1.165 } elsif ($self->{state} == CDATA_SECTION_STATE) {
2914     ## NOTE: "CDATA section state" in the spec is jointly implemented
2915     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2916     ## and |CDATA_SECTION_MSE2_STATE|.
2917 wakaba 1.127
2918 wakaba 1.165 if ($self->{next_char} == 0x005D) { # ]
2919     !!!cp (221.1);
2920     $self->{state} = CDATA_SECTION_MSE1_STATE;
2921     !!!next-input-character;
2922     redo A;
2923     } elsif ($self->{next_char} == -1) {
2924     $self->{state} = DATA_STATE;
2925     !!!next-input-character;
2926     if (length $self->{current_token}->{data}) { # character
2927     !!!cp (221.2);
2928     !!!emit ($self->{current_token}); # character
2929     } else {
2930     !!!cp (221.3);
2931     ## No token to emit. $self->{current_token} is discarded.
2932     }
2933     redo A;
2934     } else {
2935     !!!cp (221.4);
2936     $self->{current_token}->{data} .= chr $self->{next_char};
2937     ## Stay in the state.
2938     !!!next-input-character;
2939     redo A;
2940     }
2941 wakaba 1.127
2942 wakaba 1.165 ## ISSUE: "text tokens" in spec.
2943     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2944     if ($self->{next_char} == 0x005D) { # ]
2945     !!!cp (221.5);
2946     $self->{state} = CDATA_SECTION_MSE2_STATE;
2947     !!!next-input-character;
2948     redo A;
2949     } else {
2950     !!!cp (221.6);
2951     $self->{current_token}->{data} .= ']';
2952     $self->{state} = CDATA_SECTION_STATE;
2953     ## Reconsume.
2954     redo A;
2955     }
2956     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2957     if ($self->{next_char} == 0x003E) { # >
2958     $self->{state} = DATA_STATE;
2959     !!!next-input-character;
2960     if (length $self->{current_token}->{data}) { # character
2961     !!!cp (221.7);
2962     !!!emit ($self->{current_token}); # character
2963 wakaba 1.127 } else {
2964 wakaba 1.165 !!!cp (221.8);
2965     ## No token to emit. $self->{current_token} is discarded.
2966 wakaba 1.127 }
2967 wakaba 1.165 redo A;
2968     } elsif ($self->{next_char} == 0x005D) { # ]
2969     !!!cp (221.9); # character
2970     $self->{current_token}->{data} .= ']'; ## Add first "]" of "]]]".
2971     ## Stay in the state.
2972 wakaba 1.127 !!!next-input-character;
2973 wakaba 1.165 redo A;
2974 wakaba 1.127 } else {
2975 wakaba 1.165 !!!cp (221.11);
2976     $self->{current_token}->{data} .= ']]'; # character
2977     $self->{state} = CDATA_SECTION_STATE;
2978     ## Reconsume.
2979     redo A;
2980 wakaba 1.127 }
2981 wakaba 1.167 } elsif ($self->{state} == ENTITY_STATE) {
2982 wakaba 1.168 if ({
2983     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
2984     0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, &
2985     $self->{entity_additional} => 1,
2986     }->{$self->{next_char}}) {
2987     !!!cp (1001);
2988     ## Don't consume
2989     ## No error
2990     ## Return nothing.
2991     #
2992     } elsif ($self->{next_char} == 0x0023) { # #
2993 wakaba 1.170 !!!cp (999);
2994 wakaba 1.168 $self->{state} = ENTITY_HASH_STATE;
2995     $self->{state_keyword} = '#';
2996     !!!next-input-character;
2997     redo A;
2998     } elsif ((0x0041 <= $self->{next_char} and
2999     $self->{next_char} <= 0x005A) or # A..Z
3000     (0x0061 <= $self->{next_char} and
3001     $self->{next_char} <= 0x007A)) { # a..z
3002 wakaba 1.170 !!!cp (998);
3003 wakaba 1.168 require Whatpm::_NamedEntityList;
3004     $self->{state} = ENTITY_NAME_STATE;
3005     $self->{state_keyword} = chr $self->{next_char};
3006     $self->{entity__value} = $self->{state_keyword};
3007     $self->{entity__match} = 0;
3008     !!!next-input-character;
3009     redo A;
3010     } else {
3011     !!!cp (1027);
3012     !!!parse-error (type => 'bare ero');
3013     ## Return nothing.
3014     #
3015     }
3016 wakaba 1.20
3017 wakaba 1.168 ## NOTE: No character is consumed by the "consume a character
3018     ## reference" algorithm. In other words, there is an "&" character
3019     ## that does not introduce a character reference, which would be
3020     ## appended to the parent element or the attribute value in later
3021     ## process of the tokenizer.
3022 wakaba 1.112
3023 wakaba 1.169 if ($self->{prev_state} == DATA_STATE) {
3024 wakaba 1.170 !!!cp (997);
3025 wakaba 1.169 $self->{state} = $self->{prev_state};
3026 wakaba 1.168 ## Reconsume.
3027     !!!emit ({type => CHARACTER_TOKEN, data => '&',
3028     line => $self->{line_prev},
3029     column => $self->{column_prev},
3030     });
3031     redo A;
3032 wakaba 1.169 } else {
3033 wakaba 1.170 !!!cp (996);
3034 wakaba 1.169 $self->{current_attribute}->{value} .= '&';
3035     $self->{state} = $self->{prev_state};
3036     ## Reconsume.
3037     redo A;
3038 wakaba 1.168 }
3039     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3040     if ($self->{next_char} == 0x0078 or # x
3041     $self->{next_char} == 0x0058) { # X
3042 wakaba 1.170 !!!cp (995);
3043 wakaba 1.168 $self->{state} = HEXREF_X_STATE;
3044     $self->{state_keyword} .= chr $self->{next_char};
3045     !!!next-input-character;
3046     redo A;
3047     } elsif (0x0030 <= $self->{next_char} and
3048     $self->{next_char} <= 0x0039) { # 0..9
3049 wakaba 1.170 !!!cp (994);
3050 wakaba 1.168 $self->{state} = NCR_NUM_STATE;
3051     $self->{state_keyword} = $self->{next_char} - 0x0030;
3052     !!!next-input-character;
3053     redo A;
3054     } else {
3055     !!!parse-error (type => 'bare nero',
3056     line => $self->{line_prev},
3057     column => $self->{column_prev} - 1);
3058    
3059     ## NOTE: According to the spec algorithm, nothing is returned,
3060     ## and then "&#" is appended to the parent element or the attribute
3061     ## value in the later processing.
3062    
3063 wakaba 1.169 if ($self->{prev_state} == DATA_STATE) {
3064 wakaba 1.170 !!!cp (1019);
3065 wakaba 1.169 $self->{state} = $self->{prev_state};
3066 wakaba 1.168 ## Reconsume.
3067     !!!emit ({type => CHARACTER_TOKEN,
3068     data => '&#',
3069     line => $self->{line_prev},
3070     column => $self->{column_prev} - 1,
3071     });
3072     redo A;
3073 wakaba 1.169 } else {
3074 wakaba 1.170 !!!cp (993);
3075 wakaba 1.169 $self->{current_attribute}->{value} .= '&#';
3076     $self->{state} = $self->{prev_state};
3077     ## Reconsume.
3078     redo A;
3079 wakaba 1.1 }
3080 wakaba 1.168 }
3081     } elsif ($self->{state} == NCR_NUM_STATE) {
3082     if (0x0030 <= $self->{next_char} and
3083     $self->{next_char} <= 0x0039) { # 0..9
3084 wakaba 1.78 !!!cp (1012);
3085 wakaba 1.168 $self->{state_keyword} *= 10;
3086     $self->{state_keyword} += $self->{next_char} - 0x0030;
3087 wakaba 1.1
3088 wakaba 1.168 ## Stay in the state.
3089 wakaba 1.1 !!!next-input-character;
3090 wakaba 1.168 redo A;
3091     } elsif ($self->{next_char} == 0x003B) { # ;
3092 wakaba 1.78 !!!cp (1013);
3093 wakaba 1.1 !!!next-input-character;
3094 wakaba 1.168 #
3095 wakaba 1.1 } else {
3096 wakaba 1.78 !!!cp (1014);
3097 wakaba 1.168 !!!parse-error (type => 'no refc');
3098     ## Reconsume.
3099     #
3100 wakaba 1.1 }
3101    
3102 wakaba 1.168 my $code = $self->{state_keyword};
3103     my $l = $self->{line_prev};
3104     my $c = $self->{column_prev};
3105 wakaba 1.26 if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
3106 wakaba 1.78 !!!cp (1015);
3107 wakaba 1.153 !!!parse-error (type => 'invalid character reference',
3108     text => (sprintf 'U+%04X', $code),
3109     line => $l, column => $c);
3110 wakaba 1.26 $code = 0xFFFD;
3111     } elsif ($code > 0x10FFFF) {
3112 wakaba 1.78 !!!cp (1016);
3113 wakaba 1.153 !!!parse-error (type => 'invalid character reference',
3114     text => (sprintf 'U-%08X', $code),
3115     line => $l, column => $c);
3116 wakaba 1.26 $code = 0xFFFD;
3117     } elsif ($code == 0x000D) {
3118 wakaba 1.78 !!!cp (1017);
3119 wakaba 1.153 !!!parse-error (type => 'CR character reference',
3120     line => $l, column => $c);
3121 wakaba 1.26 $code = 0x000A;
3122 wakaba 1.4 } elsif (0x80 <= $code and $code <= 0x9F) {
3123 wakaba 1.78 !!!cp (1018);
3124 wakaba 1.153 !!!parse-error (type => 'C1 character reference',
3125     text => (sprintf 'U+%04X', $code),
3126     line => $l, column => $c);
3127 wakaba 1.4 $code = $c1_entity_char->{$code};
3128 wakaba 1.1 }
3129 wakaba 1.168
3130 wakaba 1.169 if ($self->{prev_state} == DATA_STATE) {
3131 wakaba 1.170 !!!cp (992);
3132 wakaba 1.169 $self->{state} = $self->{prev_state};
3133 wakaba 1.168 ## Reconsume.
3134 wakaba 1.169 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3135     line => $l, column => $c,
3136     });
3137 wakaba 1.168 redo A;
3138     } else {
3139 wakaba 1.170 !!!cp (991);
3140 wakaba 1.169 $self->{current_attribute}->{value} .= chr $code;
3141     $self->{current_attribute}->{has_reference} = 1;
3142     $self->{state} = $self->{prev_state};
3143 wakaba 1.168 ## Reconsume.
3144     redo A;
3145     }
3146     } elsif ($self->{state} == HEXREF_X_STATE) {
3147     if ((0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) or
3148     (0x0041 <= $self->{next_char} and $self->{next_char} <= 0x0046) or
3149     (0x0061 <= $self->{next_char} and $self->{next_char} <= 0x0066)) {
3150     # 0..9, A..F, a..f
3151 wakaba 1.170 !!!cp (990);
3152 wakaba 1.168 $self->{state} = HEXREF_HEX_STATE;
3153     $self->{state_keyword} = 0;
3154     ## Reconsume.
3155     redo A;
3156     } else {
3157     !!!parse-error (type => 'bare hcro',
3158     line => $self->{line_prev},
3159     column => $self->{column_prev} - 2);
3160    
3161     ## NOTE: According to the spec algorithm, nothing is returned,
3162     ## and then "&#" followed by "X" or "x" is appended to the parent
3163     ## element or the attribute value in the later processing.
3164    
3165 wakaba 1.169 if ($self->{prev_state} == DATA_STATE) {
3166 wakaba 1.170 !!!cp (1005);
3167 wakaba 1.169 $self->{state} = $self->{prev_state};
3168 wakaba 1.168 ## Reconsume.
3169     !!!emit ({type => CHARACTER_TOKEN,
3170     data => '&' . $self->{state_keyword},
3171     line => $self->{line_prev},
3172     column => $self->{column_prev} - length $self->{state_keyword},
3173     });
3174     redo A;
3175 wakaba 1.169 } else {
3176 wakaba 1.170 !!!cp (989);
3177 wakaba 1.169 $self->{current_attribute}->{value} .= '&' . $self->{state_keyword};
3178     $self->{state} = $self->{prev_state};
3179     ## Reconsume.
3180     redo A;
3181 wakaba 1.168 }
3182     }
3183     } elsif ($self->{state} == HEXREF_HEX_STATE) {
3184     if (0x0030 <= $self->{next_char} and $self->{next_char} <= 0x0039) {
3185     # 0..9
3186     !!!cp (1002);
3187     $self->{state_keyword} *= 0x10;
3188     $self->{state_keyword} += $self->{next_char} - 0x0030;
3189     ## Stay in the state.
3190     !!!next-input-character;
3191     redo A;
3192     } elsif (0x0061 <= $self->{next_char} and
3193     $self->{next_char} <= 0x0066) { # a..f
3194     !!!cp (1003);
3195     $self->{state_keyword} *= 0x10;
3196     $self->{state_keyword} += $self->{next_char} - 0x0060 + 9;
3197     ## Stay in the state.
3198     !!!next-input-character;
3199     redo A;
3200     } elsif (0x0041 <= $self->{next_char} and
3201     $self->{next_char} <= 0x0046) { # A..F
3202     !!!cp (1004);
3203     $self->{state_keyword} *= 0x10;
3204     $self->{state_keyword} += $self->{next_char} - 0x0040 + 9;
3205     ## Stay in the state.
3206     !!!next-input-character;
3207     redo A;
3208     } elsif ($self->{next_char} == 0x003B) { # ;
3209     !!!cp (1006);
3210     !!!next-input-character;
3211     #
3212     } else {
3213     !!!cp (1007);
3214     !!!parse-error (type => 'no refc',
3215     line => $self->{line},
3216     column => $self->{column});
3217     ## Reconsume.
3218     #
3219     }
3220    
3221     my $code = $self->{state_keyword};
3222     my $l = $self->{line_prev};
3223     my $c = $self->{column_prev};
3224     if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
3225     !!!cp (1008);
3226     !!!parse-error (type => 'invalid character reference',
3227     text => (sprintf 'U+%04X', $code),
3228     line => $l, column => $c);
3229     $code = 0xFFFD;
3230     } elsif ($code > 0x10FFFF) {
3231     !!!cp (1009);
3232     !!!parse-error (type => 'invalid character reference',
3233     text => (sprintf 'U-%08X', $code),
3234     line => $l, column => $c);
3235     $code = 0xFFFD;
3236     } elsif ($code == 0x000D) {
3237     !!!cp (1010);
3238     !!!parse-error (type => 'CR character reference', line => $l, column => $c);
3239     $code = 0x000A;
3240     } elsif (0x80 <= $code and $code <= 0x9F) {
3241     !!!cp (1011);
3242     !!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c);
3243     $code = $c1_entity_char->{$code};
3244     }
3245    
3246 wakaba 1.169 if ($self->{prev_state} == DATA_STATE) {
3247 wakaba 1.170 !!!cp (988);
3248 wakaba 1.169 $self->{state} = $self->{prev_state};
3249 wakaba 1.168 ## Reconsume.
3250 wakaba 1.169 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3251     line => $l, column => $c,
3252     });
3253 wakaba 1.168 redo A;
3254     } else {
3255 wakaba 1.170 !!!cp (987);
3256 wakaba 1.169 $self->{current_attribute}->{value} .= chr $code;
3257     $self->{current_attribute}->{has_reference} = 1;
3258     $self->{state} = $self->{prev_state};
3259 wakaba 1.168 ## Reconsume.
3260     redo A;
3261     }
3262     } elsif ($self->{state} == ENTITY_NAME_STATE) {
3263     if (length $self->{state_keyword} < 30 and
3264     ## NOTE: Some number greater than the maximum length of entity name
3265     ((0x0041 <= $self->{next_char} and # a
3266     $self->{next_char} <= 0x005A) or # x
3267     (0x0061 <= $self->{next_char} and # a
3268     $self->{next_char} <= 0x007A) or # z
3269     (0x0030 <= $self->{next_char} and # 0
3270     $self->{next_char} <= 0x0039) or # 9
3271     $self->{next_char} == 0x003B)) { # ;
3272     our $EntityChar;
3273     $self->{state_keyword} .= chr $self->{next_char};
3274     if (defined $EntityChar->{$self->{state_keyword}}) {
3275     if ($self->{next_char} == 0x003B) { # ;
3276     !!!cp (1020);
3277     $self->{entity__value} = $EntityChar->{$self->{state_keyword}};
3278     $self->{entity__match} = 1;
3279     !!!next-input-character;
3280     #
3281     } else {
3282     !!!cp (1021);
3283     $self->{entity__value} = $EntityChar->{$self->{state_keyword}};
3284     $self->{entity__match} = -1;
3285     ## Stay in the state.
3286     !!!next-input-character;
3287     redo A;
3288     }
3289     } else {
3290     !!!cp (1022);
3291     $self->{entity__value} .= chr $self->{next_char};
3292     $self->{entity__match} *= 2;
3293     ## Stay in the state.
3294 wakaba 1.16 !!!next-input-character;
3295 wakaba 1.168 redo A;
3296     }
3297     }
3298    
3299     my $data;
3300     my $has_ref;
3301     if ($self->{entity__match} > 0) {
3302     !!!cp (1023);
3303     $data = $self->{entity__value};
3304     $has_ref = 1;
3305     #
3306     } elsif ($self->{entity__match} < 0) {
3307     !!!parse-error (type => 'no refc');
3308 wakaba 1.169 if ($self->{prev_state} != DATA_STATE and # in attribute
3309     $self->{entity__match} < -1) {
3310 wakaba 1.168 !!!cp (1024);
3311     $data = '&' . $self->{state_keyword};
3312     #
3313 wakaba 1.37 } else {
3314 wakaba 1.168 !!!cp (1025);
3315     $data = $self->{entity__value};
3316     $has_ref = 1;
3317     #
3318 wakaba 1.16 }
3319 wakaba 1.1 } else {
3320 wakaba 1.168 !!!cp (1026);
3321     !!!parse-error (type => 'bare ero',
3322     line => $self->{line_prev},
3323     column => $self->{column_prev});
3324     $data = '&' . $self->{state_keyword};
3325     #
3326 wakaba 1.1 }
3327 wakaba 1.168
3328     ## NOTE: In these cases, when a character reference is found,
3329     ## it is consumed and a character token is returned, or, otherwise,
3330     ## nothing is consumed and returned, according to the spec algorithm.
3331     ## In this implementation, anything that has been examined by the
3332     ## tokenizer is appended to the parent element or the attribute value
3333     ## as string, either literal string when no character reference or
3334     ## entity-replaced string otherwise, in this stage, since any characters
3335     ## that would not be consumed are appended in the data state or in an
3336     ## appropriate attribute value state anyway.
3337    
3338 wakaba 1.169 if ($self->{prev_state} == DATA_STATE) {
3339 wakaba 1.170 !!!cp (986);
3340 wakaba 1.169 $self->{state} = $self->{prev_state};
3341 wakaba 1.168 ## Reconsume.
3342     !!!emit ({type => CHARACTER_TOKEN,
3343 wakaba 1.169 data => $data,
3344 wakaba 1.168 line => $self->{line_prev},
3345     column => $self->{column_prev} + 1 - length $self->{state_keyword},
3346     });
3347 wakaba 1.167 redo A;
3348 wakaba 1.169 } else {
3349 wakaba 1.170 !!!cp (985);
3350 wakaba 1.169 $self->{current_attribute}->{value} .= $data;
3351     $self->{current_attribute}->{has_reference} = 1 if $has_ref;
3352     $self->{state} = $self->{prev_state};
3353     ## Reconsume.
3354     redo A;
3355 wakaba 1.37 }
3356 wakaba 1.1 } else {
3357 wakaba 1.167 die "$0: $self->{state}: Unknown state";
3358     }
3359     } # A
3360    
3361     die "$0: _get_next_token: unexpected case";
3362     } # _get_next_token
3363 wakaba 1.1
## Prepares |$self->{document}| for the tree construction stage.
##
## NOTE: |$self->{document}| MUST be set before this method is called.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  ## Disable strict error checking on the document while the parser
  ## is building the tree.
  $doc->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on

  ## Flag the document as an HTML document.
  $doc->manakai_is_html (1); # MUST

  ## Record the source position (line 1, column 1) as user data of the
  ## document node.
  $doc->set_user_data (manakai_source_line => 1);
  $doc->set_user_data (manakai_source_column => 1);
} # _initialize_tree_constructor
3374    
## Counterpart of |_initialize_tree_constructor|: turns strict error
## checking of |$self->{document}| back on.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};
  $doc->strict_error_checking (1);
  ## TODO: Turn mutation events on
} # _terminate_tree_constructor
3380    
3381     ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
3382    
3383 wakaba 1.3 { # tree construction stage
3384     my $token;
3385    
## Entry point of the tree construction stage.  Fetches the first token,
## resets the per-parse tree construction state and then runs the
## insertion mode handlers in order: "initial", "before html", and
## finally the main handler, which starts in the "before head" mode.
sub _construct_tree ($) {
  my $self = shift;

  ## NOTE: When an interactive UA renders the $self->{document} and
  ## makes it available to the user, or when it begins accepting user
  ## input, is not defined.

  ## NOTE: Append a character: collect it and all subsequent consecutive
  ## characters and insert one Text node whose data is the concatenation
  ## of all those characters. # MUST

  !!!next-token;

  ## Reset the parser state used by the tree construction stage.
  $self->{open_elements} = [];
  undef $self->{$_} for qw(form_element head_element inner_html_node);

  ## NOTE: The "initial" insertion mode.
  $self->_tree_construction_initial; # MUST

  ## NOTE: The "before html" insertion mode.
  $self->_tree_construction_root_element;
  $self->{insertion_mode} = BEFORE_HEAD_IM;

  ## NOTE: The "before head" insertion mode and so on.
  $self->_tree_construction_main;
} # _construct_tree
3414    
## Handles the "initial" insertion mode.
##
## Consumes tokens (through the shared lexical |$token|) until the
## DOCTYPE has been processed or a token that implies a missing DOCTYPE
## is seen.  As side effects, a document type node and any leading
## comments are appended to |$self->{document}|, and the document's
## compatibility mode is set to 'quirks' or 'limited quirks' where
## applicable.  Returns nothing; on return, |$token| is the token to
## be handled by the "before html" insertion mode.
sub _tree_construction_initial ($) {
  my $self = shift;

  ## NOTE: "initial" insertion mode

  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/; # ASCII case-insensitive
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{system_identifier}) {
        !!!cp ('t1');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif ($doctype_name ne 'HTML') {
        !!!cp ('t2');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif (defined $token->{public_identifier}) {
        if ($token->{public_identifier} eq 'XSLT-compat') {
          !!!cp ('t1.2');
          !!!parse-error (type => 'XSLT-compat', token => $token,
                          level => $self->{level}->{should});
        } else {
          !!!parse-error (type => 'not HTML5', token => $token);
        }
      } else {
        !!!cp ('t3');
        #
      }

      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      ## NOTE: Default value for both |public_id| and |system_id| attributes
      ## are empty strings, so that we don't set any value in missing cases.
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if ($token->{quirks} or $doctype_name ne 'HTML') {
        !!!cp ('t4');
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{public_identifier}) {
        ## Public identifiers are compared ASCII case-insensitively, so
        ## the identifier is uppercased and all literals below are
        ## written in uppercase.
        my $pubid = $token->{public_identifier};
        $pubid =~ tr/a-z/A-Z/;
        my $quirks_prefixes = [
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//",
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//IETF//DTD HTML 2.0 LEVEL 1//",
          "-//IETF//DTD HTML 2.0 LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT//",
          "-//IETF//DTD HTML 2.0//",
          "-//IETF//DTD HTML 2.1E//",
          "-//IETF//DTD HTML 3.0//",
          "-//IETF//DTD HTML 3.2 FINAL//",
          "-//IETF//DTD HTML 3.2//",
          "-//IETF//DTD HTML 3//",
          "-//IETF//DTD HTML LEVEL 0//",
          "-//IETF//DTD HTML LEVEL 1//",
          "-//IETF//DTD HTML LEVEL 2//",
          "-//IETF//DTD HTML LEVEL 3//",
          "-//IETF//DTD HTML STRICT LEVEL 0//",
          "-//IETF//DTD HTML STRICT LEVEL 1//",
          "-//IETF//DTD HTML STRICT LEVEL 2//",
          "-//IETF//DTD HTML STRICT LEVEL 3//",
          "-//IETF//DTD HTML STRICT//",
          "-//IETF//DTD HTML//",
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//",
          "-//NETSCAPE COMM. CORP.//DTD HTML//",
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//",
          "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//",
          "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//",
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//",
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//",
          "-//W3C//DTD HTML 3 1995-03-24//",
          "-//W3C//DTD HTML 3.2 DRAFT//",
          "-//W3C//DTD HTML 3.2 FINAL//",
          "-//W3C//DTD HTML 3.2//",
          "-//W3C//DTD HTML 3.2S DRAFT//",
          "-//W3C//DTD HTML 4.0 FRAMESET//",
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//",
          "-//W3C//DTD HTML EXPERIMENTAL 19960712//",
          "-//W3C//DTD HTML EXPERIMENTAL 970421//",
          "-//W3C//DTD W3 HTML//",
          "-//W3O//DTD W3 HTML 3.0//",
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//",
          "-//WEBTECHS//DTD MOZILLA HTML//",
        ]; # $quirks_prefixes
        my $match;
        for (@$quirks_prefixes) {
          ## The public identifier itself (not the prefix list) has to be
          ## tested against each prefix.
          if (substr ($pubid, 0, length $_) eq $_) {
            $match = 1;
            last;
          }
        }
        if ($match or
            $pubid eq "-//W3O//DTD W3 HTML STRICT 3.0//EN//" or
            $pubid eq "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" or
            $pubid eq "HTML") {
          !!!cp ('t5');
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid =~ m[^-//W3C//DTD HTML 4\.01 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD HTML 4\.01 TRANSITIONAL//]) {
          ## NOTE: Per the HTML spec, these public identifiers select the
          ## limited quirks mode when the system identifier is present
          ## and the quirks mode when it is missing.
          if (defined $token->{system_identifier}) {
            !!!cp ('t6');
            $self->{document}->manakai_compat_mode ('limited quirks');
          } else {
            !!!cp ('t7');
            $self->{document}->manakai_compat_mode ('quirks');
          }
        } elsif ($pubid =~ m[^-//W3C//DTD XHTML 1\.0 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD XHTML 1\.0 TRANSITIONAL//]) {
          !!!cp ('t8');
          $self->{document}->manakai_compat_mode ('limited quirks');
        } else {
          !!!cp ('t9');
        }
      } else {
        !!!cp ('t10');
      }
      if (defined $token->{system_identifier}) {
        my $sysid = $token->{system_identifier};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          ## NOTE: Ensure that |PUBLIC "(limited quirks)" "(quirks)"| is
          ## marked as quirks.
          $self->{document}->manakai_compat_mode ('quirks');
          !!!cp ('t11');
        } else {
          !!!cp ('t12');
        }
      } else {
        !!!cp ('t13');
      }

      ## Go to the "before html" insertion mode.
      !!!next-token;
      return;
    } elsif ({
              START_TAG_TOKEN, 1,
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t14');
      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      !!!ack-later;
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Ignore the token

        unless (length $token->{data}) {
          !!!cp ('t15');
          ## Stay in the insertion mode.
          !!!next-token;
          redo INITIAL;
        } else {
          !!!cp ('t16');
        }
      } else {
        !!!cp ('t17');
      }

      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t18');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the insertion mode.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL

  die "$0: _tree_construction_initial: This should be never reached";
} # _tree_construction_initial
3621    
## Handles the "before html" insertion mode.
##
## Skips DOCTYPE tokens (with a parse error), appends comments to the
## document and drops leading white space.  For any other input an
## |html| root element is created - from the |html| start tag when
## there is one, otherwise implied - appended to |$self->{document}|
## and pushed onto |$self->{open_elements}|.  The
## |$self->{application_cache_selection}| callback is invoked with the
## |manifest| attribute value, or with |undef|.  Returns nothing.
sub _tree_construction_root_element ($) {
  my ($self) = @_;

  ## NOTE: "before html" insertion mode.

  B: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      !!!cp ('t19');
      !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
      ## The token is ignored and the insertion mode is unchanged.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t20');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## The insertion mode is unchanged.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
        ## Leading white space characters are dropped.

        if (length $token->{data}) {
          !!!cp ('t22');
        } else {
          !!!cp ('t21');
          ## Nothing but white space: stay in the insertion mode.
          !!!next-token;
          redo B;
        }
      } else {
        !!!cp ('t23');
      }

      $self->{application_cache_selection}->(undef);

      ## Fall through to the implied root element below.
    } elsif ($token->{type} == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html') {
        my $root_element;
        !!!create-element ($root_element, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}},
            [$root_element, $el_category->{html}];

        my $manifest = $token->{attributes}->{manifest};
        if ($manifest) {
          !!!cp ('t24');
          $self->{application_cache_selection}->($manifest->{value});
          ## ISSUE: Spec is unclear on relative references.
          ## According to Hixie (#whatwg 2008-03-19), it should be
          ## resolved against the base URI of the document in HTML
          ## or xml:base of the element in XHTML.
        } else {
          !!!cp ('t25');
          $self->{application_cache_selection}->(undef);
        }

        !!!nack ('t25c');

        !!!next-token;
        return; ## Go to the "before head" insertion mode.
      } else {
        !!!cp ('t25.1');
        ## Fall through to the implied root element below.
      }
    } elsif ($token->{type} == END_TAG_TOKEN or
             $token->{type} == END_OF_FILE_TOKEN) {
      !!!cp ('t26');
      ## Fall through to the implied root element below.
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## No |html| start tag was seen: create an |html| element without
    ## attributes.
    my $root_element;
    !!!create-element ($root_element, $HTML_NS, 'html',, $token);
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, $el_category->{html}];

    $self->{application_cache_selection}->(undef);

    ## NOTE: Reprocess the token.
    !!!ack-later;
    return; ## Go to the "before head" insertion mode.

    ## ISSUE: There is an issue in the spec
  } # B

  die "$0: _tree_construction_root_element: This should never be reached";
} # _tree_construction_root_element
3716    
## Resets |$self->{insertion_mode}|.
##
## Walks |$self->{open_elements}| from its last entry towards the first
## one and selects the insertion mode from the first node that
## determines one.  Each entry is a pair |[node, category flags]|; the
## flags are tested with the |*_EL| bit masks.  When the first entry of
## the stack is reached, |$self->{inner_html_node}| is examined in its
## place; the method dies if that is not defined.  Returns nothing.
sub _reset_insertion_mode ($) {
  my ($self) = @_;

  ## Step 1
  my $last;

  ## Step 2
  my $i = -1;
  my $node = $self->{open_elements}->[$i];

  ## Step 3
  S3: while (1) {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      die "_reset_insertion_mode: t27"
          unless defined $self->{inner_html_node};
      !!!cp ('t28');
      $node = $self->{inner_html_node};
    }

    ## Step 4..14
    my $new_mode;
    if ($node->[1] & FOREIGN_EL) {
      !!!cp ('t28.1');
      ## NOTE: Strictly speaking, the line below only applies to MathML and
      ## SVG elements.  Currently the HTML syntax supports only MathML and
      ## SVG elements as foreigners.
      $new_mode = IN_BODY_IM | IN_FOREIGN_CONTENT_IM;
    } elsif ($node->[1] & TABLE_CELL_EL) {
      if ($last) {
        !!!cp ('t28.2');
        ## No mode is selected here for the last node.
      } else {
        !!!cp ('t28.3');
        $new_mode = IN_CELL_IM;
      }
    } else {
      !!!cp ('t28.4');
      ## Look the mode up by the local name of the element; the result
      ## is |undef| for any name not listed.
      my $mode_by_local_name = {
        select => IN_SELECT_IM,
        ## NOTE: |option| and |optgroup| do not set
        ## insertion mode to "in select" by themselves.
        tr => IN_ROW_IM,
        tbody => IN_TABLE_BODY_IM,
        thead => IN_TABLE_BODY_IM,
        tfoot => IN_TABLE_BODY_IM,
        caption => IN_CAPTION_IM,
        colgroup => IN_COLUMN_GROUP_IM,
        table => IN_TABLE_IM,
        head => IN_BODY_IM, # not in head!
        body => IN_BODY_IM,
        frameset => IN_FRAMESET_IM,
      };
      $new_mode = $mode_by_local_name->{$node->[0]->manakai_local_name};
    }
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode and return;
    }

    ## Step 15
    if ($node->[1] & HTML_EL) {
      if (defined $self->{head_element}) {
        ## ISSUE: Can this state be reached?
        !!!cp ('t30');
        $self->{insertion_mode} = AFTER_HEAD_IM;
      } else {
        !!!cp ('t29');
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      }
      return;
    } else {
      !!!cp ('t31');
    }

    ## Step 16
    if ($last) {
      $self->{insertion_mode} = IN_BODY_IM and return;
    }

    ## Step 17
    $i--;
    $node = $self->{open_elements}->[$i];

    ## Step 18: continue with step 3 for the new node.
  } # S3

  die "$0: _reset_insertion_mode: This line should never be reached";
} # _reset_insertion_mode
3803    
3804     sub _tree_construction_main ($) {
3805     my $self = shift;
3806    
3807 wakaba 1.1 my $active_formatting_elements = [];
3808    
3809     my $reconstruct_active_formatting_elements = sub { # MUST
3810     my $insert = shift;
3811    
3812     ## Step 1
3813     return unless @$active_formatting_elements;
3814    
3815     ## Step 3
3816     my $i = -1;
3817     my $entry = $active_formatting_elements->[$i];
3818    
3819     ## Step 2
3820     return if $entry->[0] eq '#marker';
3821 wakaba 1.3 for (@{$self->{open_elements}}) {
3822 wakaba 1.1 if ($entry->[0] eq $_->[0]) {
3823 wakaba 1.79 !!!cp ('t32');
3824 wakaba 1.1 return;
3825     }
3826     }
3827    
3828     S4: {
3829     ## Step 4
3830     last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
3831    
3832     ## Step 5
3833     $i--;
3834     $entry = $active_formatting_elements->[$i];
3835    
3836     ## Step 6
3837     if ($entry->[0] eq '#marker') {
3838 wakaba 1.81 !!!cp ('t33_1');
3839 wakaba 1.1 #
3840     } else {
3841     my $in_open_elements;
3842 wakaba 1.3 OE: for (@{$self->{open_elements}}) {
3843 wakaba 1.1 if ($entry->[0] eq $_->[0]) {
3844 wakaba 1.79 !!!cp ('t33');
3845 wakaba 1.1 $in_open_elements = 1;
3846     last OE;
3847     }
3848     }
3849     if ($in_open_elements) {
3850 wakaba 1.79 !!!cp ('t34');
3851 wakaba 1.1 #
3852     } else {
3853 wakaba 1.81 ## NOTE: <!DOCTYPE HTML><p><b><i><u></p> <p>X
3854 wakaba 1.79 !!!cp ('t35');
3855 wakaba 1.1 redo S4;
3856     }
3857     }
3858    
3859     ## Step 7
3860     $i++;
3861     $entry = $active_formatting_elements->[$i];
3862     } # S4
3863    
3864     S7: {
3865     ## Step 8
3866     my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
3867    
3868     ## Step 9
3869     $insert->($clone->[0]);
3870 wakaba 1.3 push @{$self->{open_elements}}, $clone;
3871 wakaba 1.1
3872     ## Step 10
3873 wakaba 1.3 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
3874 wakaba 1.1
3875     ## Step 11
3876     unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
3877 wakaba 1.79 !!!cp ('t36');
3878 wakaba 1.1 ## Step 7'
3879     $i++;
3880     $entry = $active_formatting_elements->[$i];
3881    
3882     redo S7;
3883     }
3884 wakaba 1.79
3885     !!!cp ('t37');
3886 wakaba 1.1 } # S7
3887     }; # $reconstruct_active_formatting_elements
3888    
3889     my $clear_up_to_marker = sub {
3890     for (reverse 0..$#$active_formatting_elements) {
3891     if ($active_formatting_elements->[$_]->[0] eq '#marker') {
3892 wakaba 1.79 !!!cp ('t38');
3893 wakaba 1.1 splice @$active_formatting_elements, $_;
3894     return;
3895     }
3896     }
3897 wakaba 1.79
3898     !!!cp ('t39');
3899 wakaba 1.1 }; # $clear_up_to_marker
3900    
3901 wakaba 1.96 my $insert;
3902    
3903     my $parse_rcdata = sub ($) {
3904     my ($content_model_flag) = @_;
3905 wakaba 1.25
3906     ## Step 1
3907     my $start_tag_name = $token->{tag_name};
3908     my $el;
3909 wakaba 1.126 !!!create-element ($el, $HTML_NS, $start_tag_name, $token->{attributes}, $token);
3910 wakaba 1.25
3911     ## Step 2
3912 wakaba 1.96 $insert->($el);
3913 wakaba 1.25
3914     ## Step 3
3915 wakaba 1.40 $self->{content_model} = $content_model_flag; # CDATA or RCDATA
3916 wakaba 1.13 delete $self->{escape}; # MUST
3917 wakaba 1.25
3918     ## Step 4
3919 wakaba 1.1 my $text = '';
3920 wakaba 1.125 !!!nack ('t40.1');
3921 wakaba 1.1 !!!next-token;
3922 wakaba 1.55 while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
3923 wakaba 1.79 !!!cp ('t40');
3924 wakaba 1.1 $text .= $token->{data};
3925     !!!next-token;
3926 wakaba 1.25 }
3927    
3928     ## Step 5
3929 wakaba 1.1 if (length $text) {
3930 wakaba 1.79 !!!cp ('t41');
3931 wakaba 1.25 my $text = $self->{document}->create_text_node ($text);
3932     $el->append_child ($text);
3933 wakaba 1.1 }
3934 wakaba 1.25
3935     ## Step 6
3936 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL;
3937 wakaba 1.25
3938     ## Step 7
3939 wakaba 1.79 if ($token->{type} == END_TAG_TOKEN and
3940     $token->{tag_name} eq $start_tag_name) {
3941     !!!cp ('t42');
3942 wakaba 1.1 ## Ignore the token
3943     } else {
3944 wakaba 1.96 ## NOTE: An end-of-file token.
3945     if ($content_model_flag == CDATA_CONTENT_MODEL) {
3946     !!!cp ('t43');
3947 wakaba 1.153 !!!parse-error (type => 'in CDATA:#eof', token => $token);
3948 wakaba 1.96 } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
3949     !!!cp ('t44');
3950 wakaba 1.153 !!!parse-error (type => 'in RCDATA:#eof', token => $token);
3951 wakaba 1.96 } else {
3952     die "$0: $content_model_flag in parse_rcdata";
3953     }
3954 wakaba 1.1 }
3955     !!!next-token;
3956 wakaba 1.25 }; # $parse_rcdata
3957 wakaba 1.1
3958 wakaba 1.96 my $script_start_tag = sub () {
3959 wakaba 1.1 my $script_el;
3960 wakaba 1.126 !!!create-element ($script_el, $HTML_NS, 'script', $token->{attributes}, $token);
3961 wakaba 1.1 ## TODO: mark as "parser-inserted"
3962    
3963 wakaba 1.40 $self->{content_model} = CDATA_CONTENT_MODEL;
3964 wakaba 1.13 delete $self->{escape}; # MUST
3965 wakaba 1.1
3966     my $text = '';
3967 wakaba 1.125 !!!nack ('t45.1');
3968 wakaba 1.1 !!!next-token;
3969 wakaba 1.55 while ($token->{type} == CHARACTER_TOKEN) {
3970 wakaba 1.79 !!!cp ('t45');
3971 wakaba 1.1 $text .= $token->{data};
3972     !!!next-token;
3973     } # stop if non-character token or tokenizer stops tokenising
3974     if (length $text) {
3975 wakaba 1.79 !!!cp ('t46');
3976 wakaba 1.1 $script_el->manakai_append_text ($text);
3977     }
3978    
3979 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL;
3980 wakaba 1.1
3981 wakaba 1.55 if ($token->{type} == END_TAG_TOKEN and
3982 wakaba 1.1 $token->{tag_name} eq 'script') {
3983 wakaba 1.79 !!!cp ('t47');
3984 wakaba 1.1 ## Ignore the token
3985     } else {
3986 wakaba 1.79 !!!cp ('t48');
3987 wakaba 1.153 !!!parse-error (type => 'in CDATA:#eof', token => $token);
3988 wakaba 1.1 ## ISSUE: And ignore?
3989     ## TODO: mark as "already executed"
3990     }
3991    
3992 wakaba 1.3 if (defined $self->{inner_html_node}) {
3993 wakaba 1.79 !!!cp ('t49');
3994 wakaba 1.3 ## TODO: mark as "already executed"
3995     } else {
3996 wakaba 1.79 !!!cp ('t50');
3997 wakaba 1.1 ## TODO: $old_insertion_point = current insertion point
3998     ## TODO: insertion point = just before the next input character
3999 wakaba 1.25
4000     $insert->($script_el);
4001 wakaba 1.1
4002     ## TODO: insertion point = $old_insertion_point (might be "undefined")
4003    
4004     ## TODO: if there is a script that will execute as soon as the parser resume, then...
4005     }
4006    
4007     !!!next-token;
4008     }; # $script_start_tag
4009    
4010 wakaba 1.102 ## NOTE: $open_tables->[-1]->[0] is the "current table" element node.
4011     ## NOTE: $open_tables->[-1]->[1] is the "tainted" flag.
4012     my $open_tables = [[$self->{open_elements}->[0]->[0]]];
4013    
## Processes an end tag token of a formatting element by running the
## adoption agency algorithm (AAA).
##
## Argument:
##   $end_tag_token - The end tag token (hash reference).  Its |tag_name|
##                    selects the formatting element, and the token is
##                    attached to every parse error reported here.
##
## Side effects: may modify |$self->{open_elements}|,
## |$active_formatting_elements| and the DOM tree, and may set the
## "tainted" flag of the last record of |$open_tables|.  Every path that
## leaves this code advances to the next token first.
##
## Returns nothing (bare |return|).  The |FET| block is repeated via
## |redo| until one of the returning branches is taken.
my $formatting_end_tag = sub {
  my $end_tag_token = shift;
  my $tag_name = $end_tag_token->{tag_name};

  ## NOTE: The adoption agency algorithm (AAA).

  FET: {
    ## Step 1
    ## Search the list of active formatting elements backwards, stopping
    ## at the last marker, for an element whose local name is $tag_name.
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq '#marker') {
        !!!cp ('t52');
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0]->manakai_local_name
                   eq $tag_name) {
        !!!cp ('t51');
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!cp ('t53');
      !!!parse-error (type => 'unmatched end tag', text => $tag_name, token => $end_tag_token);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          !!!cp ('t54');
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!cp ('t55');
          ## The tag name is taken from the token passed to this code
          ## (as in step 1), not from the enclosing scope's |$token|.
          !!!parse-error (type => 'unmatched end tag',
                          text => $tag_name,
                          token => $end_tag_token);
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ($node->[1] & SCOPING_EL) {
        !!!cp ('t56');
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!cp ('t57');
      !!!parse-error (type => 'unmatched end tag',
                      text => $tag_name,
                      token => $end_tag_token);
      ## Remove the formatting element itself from the list.  It is not
      ## necessarily the last entry, so a plain |pop| could remove an
      ## unrelated element.
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!cp ('t58');
      !!!parse-error (type => 'not closed',
                      text => $self->{open_elements}->[-1]->[0]
                          ->manakai_local_name,
                      token => $end_tag_token);
    }

    ## Step 2
    ## Walk the stack of open elements backwards down to the formatting
    ## element.  Each matching node overwrites the previous one, so the
    ## node finally kept is the matching one nearest to the formatting
    ## element.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not ($node->[1] & FORMATTING_EL) and
          #not $phrasing_category->{$node->[1]} and
          ($node->[1] & SPECIAL_EL or
           $node->[1] & SCOPING_EL)) { ## Scoping is redundant, maybe
        !!!cp ('t59');
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        !!!cp ('t60');
        last OE;
      }
    } # OE

    ## Step 3
    ## No furthest block: drop the formatting element and everything
    ## after it from the stack, remove it from the list, and finish.
    unless (defined $furthest_block) { # MUST
      !!!cp ('t61');
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      !!!cp ('t62');
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## The bookmark is remembered as the element that precedes the
    ## formatting element in the list of active formatting elements.
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
            ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      ## A node that is not in the list of active formatting elements is
      ## removed from the stack and the sub-steps restart.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            !!!cp ('t63');
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        !!!cp ('t64');
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      ## A node that already has children is replaced by a shallow clone
      ## in both the list and the stack.
      if ($node->[0]->has_child_nodes ()) {
        !!!cp ('t65');
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    ## If the common ancestor is a table-rows-category element, the last
    ## node is foster-parented; otherwise it is appended to the common
    ## ancestor.
    if ($common_ancestor_node->[1] & TABLE_ROWS_EL) {
      my $foster_parent_element;
      my $next_sibling;
      OE: for (reverse 0..$#{$self->{open_elements}}) {
        if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
          my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
          if (defined $parent and $parent->node_type == 1) {
            !!!cp ('t65.1');
            $foster_parent_element = $parent;
            $next_sibling = $self->{open_elements}->[$_]->[0];
          } else {
            !!!cp ('t65.2');
            $foster_parent_element
                = $self->{open_elements}->[$_ - 1]->[0];
          }
          last OE;
        }
      } # OE
      $foster_parent_element = $self->{open_elements}->[0]->[0]
          unless defined $foster_parent_element;
      $foster_parent_element->insert_before ($last_node->[0], $next_sibling);
      $open_tables->[-1]->[1] = 1; # tainted
    } else {
      !!!cp ('t65.3');
      $common_ancestor_node->[0]->append_child ($last_node->[0]);
    }

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## Copy the child list first; the children are moved while iterating.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## Remove the formatting element from the list and insert the clone
    ## just after the bookmark element.  $i is decremented when the
    ## removed entry precedes the bookmark, to keep the index valid.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t66');
        splice @$active_formatting_elements, $_, 1;
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        !!!cp ('t67');
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    ## Remove the formatting element from the stack of open elements and
    ## insert the clone immediately after the furthest block.
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t68');
        splice @{$self->{open_elements}}, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        !!!cp ('t69');
        $i = $_;
      }
    } # OE
    ## Length 0: the clone is inserted, as in step 12.  A length of 1
    ## would overwrite the entry that follows the furthest block (for
    ## example the |i| of |<b><p><i>x</b>|) instead of keeping it.
    splice @{$self->{open_elements}}, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
4248    
## Appends the given node as the last child of the node held by the last
## entry of the stack of open elements.  The same code reference is also
## stored in |$insert|.
my $insert_to_current = sub {
  my ($child) = @_;
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($child);
}; # $insert_to_current
$insert = $insert_to_current;
4252    
## Inserts a node, foster-parenting it when the last entry of the stack of
## open elements is a table-rows-category element (TABLE_ROWS_EL).
##
## Argument: the node to insert.
##
## Otherwise the node is simply appended to the node of the last entry of
## the stack.  When foster-parenting, the "tainted" flag of the last
## record of |$open_tables| is set.
my $insert_to_foster = sub {
  my $new_node = shift;
  my $stack = $self->{open_elements};

  unless ($stack->[-1]->[1] & TABLE_ROWS_EL) {
    !!!cp ('t72');
    return $stack->[-1]->[0]->append_child ($new_node);
  }

  # MUST
  ## Find the last table element on the stack.  If it has a parent whose
  ## node type is 1, insert before the table in that parent; otherwise
  ## use the node of the stack entry just before the table.
  my $foster_parent;
  my $reference_node;
  TABLE: for my $i (reverse 0..$#$stack) {
    next TABLE unless $stack->[$i]->[1] & TABLE_EL;
    my $table = $stack->[$i]->[0];
    my $table_parent = $table->parent_node;
    if (defined $table_parent and $table_parent->node_type == 1) {
      !!!cp ('t70');
      $foster_parent = $table_parent;
      $reference_node = $table;
    } else {
      !!!cp ('t71');
      $foster_parent = $stack->[$i - 1]->[0];
    }
    last TABLE;
  } # TABLE

  ## No table on the stack: fall back to the node of the first entry.
  $foster_parent = $stack->[0]->[0] unless defined $foster_parent;
  $foster_parent->insert_before ($new_node, $reference_node);
  $open_tables->[-1]->[1] = 1; # tainted
}; # $insert_to_foster
4284    
4285 wakaba 1.126 B: while (1) {
4286 wakaba 1.55 if ($token->{type} == DOCTYPE_TOKEN) {
4287 wakaba 1.79 !!!cp ('t73');
4288 wakaba 1.153 !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
4289 wakaba 1.52 ## Ignore the token
4290     ## Stay in the phase
4291     !!!next-token;
4292 wakaba 1.126 next B;
4293 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN and
4294 wakaba 1.52 $token->{tag_name} eq 'html') {
4295 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4296 wakaba 1.79 !!!cp ('t79');
4297 wakaba 1.153 !!!parse-error (type => 'after html', text => 'html', token => $token);
4298 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
4299     } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4300 wakaba 1.79 !!!cp ('t80');
4301 wakaba 1.153 !!!parse-error (type => 'after html', text => 'html', token => $token);
4302 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4303 wakaba 1.79 } else {
4304     !!!cp ('t81');
4305 wakaba 1.52 }
4306    
4307 wakaba 1.84 !!!cp ('t82');
4308 wakaba 1.113 !!!parse-error (type => 'not first start tag', token => $token);
4309 wakaba 1.52 my $top_el = $self->{open_elements}->[0]->[0];
4310     for my $attr_name (keys %{$token->{attributes}}) {
4311     unless ($top_el->has_attribute_ns (undef, $attr_name)) {
4312 wakaba 1.79 !!!cp ('t84');
4313 wakaba 1.52 $top_el->set_attribute_ns
4314     (undef, [undef, $attr_name],
4315     $token->{attributes}->{$attr_name}->{value});
4316     }
4317     }
4318 wakaba 1.125 !!!nack ('t84.1');
4319 wakaba 1.52 !!!next-token;
4320 wakaba 1.126 next B;
4321 wakaba 1.55 } elsif ($token->{type} == COMMENT_TOKEN) {
4322 wakaba 1.52 my $comment = $self->{document}->create_comment ($token->{data});
4323 wakaba 1.56 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
4324 wakaba 1.79 !!!cp ('t85');
4325 wakaba 1.52 $self->{document}->append_child ($comment);
4326 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
4327 wakaba 1.79 !!!cp ('t86');
4328 wakaba 1.52 $self->{open_elements}->[0]->[0]->append_child ($comment);
4329     } else {
4330 wakaba 1.79 !!!cp ('t87');
4331 wakaba 1.52 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4332     }
4333     !!!next-token;
4334 wakaba 1.126 next B;
4335     } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
4336     if ($token->{type} == CHARACTER_TOKEN) {
4337     !!!cp ('t87.1');
4338     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4339     !!!next-token;
4340     next B;
4341     } elsif ($token->{type} == START_TAG_TOKEN) {
4342 wakaba 1.129 if ((not {mglyph => 1, malignmark => 1}->{$token->{tag_name}} and
4343     $self->{open_elements}->[-1]->[1] & FOREIGN_FLOW_CONTENT_EL) or
4344 wakaba 1.126 not ($self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
4345     ($token->{tag_name} eq 'svg' and
4346     $self->{open_elements}->[-1]->[1] & MML_AXML_EL)) {
4347     ## NOTE: "using the rules for secondary insertion mode"then"continue"
4348     !!!cp ('t87.2');
4349     #
4350     } elsif ({
4351 wakaba 1.130 b => 1, big => 1, blockquote => 1, body => 1, br => 1,
4352 wakaba 1.146 center => 1, code => 1, dd => 1, div => 1, dl => 1, dt => 1,
4353     em => 1, embed => 1, font => 1, h1 => 1, h2 => 1, h3 => 1,
4354     h4 => 1, h5 => 1, h6 => 1, head => 1, hr => 1, i => 1,
4355     img => 1, li => 1, listing => 1, menu => 1, meta => 1,
4356     nobr => 1, ol => 1, p => 1, pre => 1, ruby => 1, s => 1,
4357     small => 1, span => 1, strong => 1, strike => 1, sub => 1,
4358     sup => 1, table => 1, tt => 1, u => 1, ul => 1, var => 1,
4359 wakaba 1.126 }->{$token->{tag_name}}) {
4360     !!!cp ('t87.2');
4361     !!!parse-error (type => 'not closed',
4362 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
4363 wakaba 1.126 ->manakai_local_name,
4364     token => $token);
4365    
4366     pop @{$self->{open_elements}}
4367     while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4368    
4369 wakaba 1.130 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4370 wakaba 1.126 ## Reprocess.
4371     next B;
4372     } else {
4373 wakaba 1.131 my $nsuri = $self->{open_elements}->[-1]->[0]->namespace_uri;
4374     my $tag_name = $token->{tag_name};
4375     if ($nsuri eq $SVG_NS) {
4376     $tag_name = {
4377     altglyph => 'altGlyph',
4378     altglyphdef => 'altGlyphDef',
4379     altglyphitem => 'altGlyphItem',
4380     animatecolor => 'animateColor',
4381     animatemotion => 'animateMotion',
4382     animatetransform => 'animateTransform',
4383     clippath => 'clipPath',
4384     feblend => 'feBlend',
4385     fecolormatrix => 'feColorMatrix',
4386     fecomponenttransfer => 'feComponentTransfer',
4387     fecomposite => 'feComposite',
4388     feconvolvematrix => 'feConvolveMatrix',
4389     fediffuselighting => 'feDiffuseLighting',
4390     fedisplacementmap => 'feDisplacementMap',
4391     fedistantlight => 'feDistantLight',
4392     feflood => 'feFlood',
4393     fefunca => 'feFuncA',
4394     fefuncb => 'feFuncB',
4395     fefuncg => 'feFuncG',
4396     fefuncr => 'feFuncR',
4397     fegaussianblur => 'feGaussianBlur',
4398     feimage => 'feImage',
4399     femerge => 'feMerge',
4400     femergenode => 'feMergeNode',
4401     femorphology => 'feMorphology',
4402     feoffset => 'feOffset',
4403     fepointlight => 'fePointLight',
4404     fespecularlighting => 'feSpecularLighting',
4405     fespotlight => 'feSpotLight',
4406     fetile => 'feTile',
4407     feturbulence => 'feTurbulence',
4408     foreignobject => 'foreignObject',
4409     glyphref => 'glyphRef',
4410     lineargradient => 'linearGradient',
4411     radialgradient => 'radialGradient',
4412     #solidcolor => 'solidColor', ## NOTE: Commented in spec (SVG1.2)
4413     textpath => 'textPath',
4414     }->{$tag_name} || $tag_name;
4415     }
4416    
4417     ## "adjust SVG attributes" (SVG only) - done in insert-element-f
4418    
4419     ## "adjust foreign attributes" - done in insert-element-f
4420 wakaba 1.126
4421 wakaba 1.131 !!!insert-element-f ($nsuri, $tag_name, $token->{attributes}, $token);
4422 wakaba 1.126
4423     if ($self->{self_closing}) {
4424     pop @{$self->{open_elements}};
4425     !!!ack ('t87.3');
4426     } else {
4427     !!!cp ('t87.4');
4428     }
4429    
4430     !!!next-token;
4431     next B;
4432     }
4433     } elsif ($token->{type} == END_TAG_TOKEN) {
4434     ## NOTE: "using the rules for secondary insertion mode" then "continue"
4435     !!!cp ('t87.5');
4436     #
4437     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4438     !!!cp ('t87.6');
4439 wakaba 1.146 !!!parse-error (type => 'not closed',
4440 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
4441 wakaba 1.146 ->manakai_local_name,
4442     token => $token);
4443    
4444     pop @{$self->{open_elements}}
4445     while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4446    
4447     $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4448     ## Reprocess.
4449     next B;
4450 wakaba 1.126 } else {
4451     die "$0: $token->{type}: Unknown token type";
4452     }
4453     }
4454    
4455     if ($self->{insertion_mode} & HEAD_IMS) {
4456 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
4457 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
4458 wakaba 1.99 unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4459     !!!cp ('t88.2');
4460     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4461     } else {
4462     !!!cp ('t88.1');
4463     ## Ignore the token.
4464     !!!next-token;
4465 wakaba 1.126 next B;
4466 wakaba 1.99 }
4467 wakaba 1.52 unless (length $token->{data}) {
4468 wakaba 1.79 !!!cp ('t88');
4469 wakaba 1.52 !!!next-token;
4470 wakaba 1.126 next B;
4471 wakaba 1.1 }
4472     }
4473 wakaba 1.52
4474 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4475 wakaba 1.79 !!!cp ('t89');
4476 wakaba 1.52 ## As if <head>
4477 wakaba 1.126 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4478 wakaba 1.52 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4479 wakaba 1.123 push @{$self->{open_elements}},
4480     [$self->{head_element}, $el_category->{head}];
4481 wakaba 1.52
4482     ## Reprocess in the "in head" insertion mode...
4483     pop @{$self->{open_elements}};
4484    
4485     ## Reprocess in the "after head" insertion mode...
4486 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4487 wakaba 1.79 !!!cp ('t90');
4488 wakaba 1.52 ## As if </noscript>
4489     pop @{$self->{open_elements}};
4490 wakaba 1.153 !!!parse-error (type => 'in noscript:#text', token => $token);
4491 wakaba 1.1
4492 wakaba 1.52 ## Reprocess in the "in head" insertion mode...
4493     ## As if </head>
4494     pop @{$self->{open_elements}};
4495    
4496     ## Reprocess in the "after head" insertion mode...
4497 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4498 wakaba 1.79 !!!cp ('t91');
4499 wakaba 1.52 pop @{$self->{open_elements}};
4500    
4501     ## Reprocess in the "after head" insertion mode...
4502 wakaba 1.79 } else {
4503     !!!cp ('t92');
4504 wakaba 1.1 }
4505 wakaba 1.52
4506 wakaba 1.123 ## "after head" insertion mode
4507     ## As if <body>
4508     !!!insert-element ('body',, $token);
4509     $self->{insertion_mode} = IN_BODY_IM;
4510     ## reprocess
4511 wakaba 1.126 next B;
4512 wakaba 1.123 } elsif ($token->{type} == START_TAG_TOKEN) {
4513     if ($token->{tag_name} eq 'head') {
4514     if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4515     !!!cp ('t93');
4516 wakaba 1.126 !!!create-element ($self->{head_element}, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
4517 wakaba 1.123 $self->{open_elements}->[-1]->[0]->append_child
4518     ($self->{head_element});
4519     push @{$self->{open_elements}},
4520     [$self->{head_element}, $el_category->{head}];
4521     $self->{insertion_mode} = IN_HEAD_IM;
4522 wakaba 1.125 !!!nack ('t93.1');
4523 wakaba 1.123 !!!next-token;
4524 wakaba 1.126 next B;
4525 wakaba 1.125 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4526 wakaba 1.139 !!!cp ('t93.2');
4527 wakaba 1.153 !!!parse-error (type => 'after head', text => 'head',
4528     token => $token);
4529 wakaba 1.139 ## Ignore the token
4530     !!!nack ('t93.3');
4531     !!!next-token;
4532     next B;
4533 wakaba 1.125 } else {
4534     !!!cp ('t95');
4535 wakaba 1.153 !!!parse-error (type => 'in head:head',
4536     token => $token); # or in head noscript
4537 wakaba 1.125 ## Ignore the token
4538     !!!nack ('t95.1');
4539     !!!next-token;
4540 wakaba 1.126 next B;
4541 wakaba 1.125 }
4542     } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4543 wakaba 1.126 !!!cp ('t96');
4544     ## As if <head>
4545     !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4546     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4547     push @{$self->{open_elements}},
4548     [$self->{head_element}, $el_category->{head}];
4549 wakaba 1.52
4550 wakaba 1.126 $self->{insertion_mode} = IN_HEAD_IM;
4551     ## Reprocess in the "in head" insertion mode...
4552     } else {
4553     !!!cp ('t97');
4554     }
4555 wakaba 1.52
4556 wakaba 1.49 if ($token->{tag_name} eq 'base') {
4557 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4558 wakaba 1.79 !!!cp ('t98');
4559 wakaba 1.49 ## As if </noscript>
4560     pop @{$self->{open_elements}};
4561 wakaba 1.153 !!!parse-error (type => 'in noscript', text => 'base',
4562     token => $token);
4563 wakaba 1.49
4564 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
4565 wakaba 1.49 ## Reprocess in the "in head" insertion mode...
4566 wakaba 1.79 } else {
4567     !!!cp ('t99');
4568 wakaba 1.49 }
4569    
4570     ## NOTE: There is a "as if in head" code clone.
4571 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4572 wakaba 1.79 !!!cp ('t100');
4573 wakaba 1.153 !!!parse-error (type => 'after head',
4574     text => $token->{tag_name}, token => $token);
4575 wakaba 1.123 push @{$self->{open_elements}},
4576     [$self->{head_element}, $el_category->{head}];
4577 wakaba 1.79 } else {
4578     !!!cp ('t101');
4579 wakaba 1.49 }
4580 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4581 wakaba 1.49 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4582 wakaba 1.100 pop @{$self->{open_elements}} # <head>
4583 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
4584 wakaba 1.125 !!!nack ('t101.1');
4585 wakaba 1.49 !!!next-token;
4586 wakaba 1.126 next B;
4587 wakaba 1.49 } elsif ($token->{tag_name} eq 'link') {
4588 wakaba 1.25 ## NOTE: There is a "as if in head" code clone.
4589 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4590 wakaba 1.79 !!!cp ('t102');
4591 wakaba 1.153 !!!parse-error (type => 'after head',
4592     text => $token->{tag_name}, token => $token);
4593 wakaba 1.123 push @{$self->{open_elements}},
4594     [$self->{head_element}, $el_category->{head}];
4595 wakaba 1.79 } else {
4596     !!!cp ('t103');
4597 wakaba 1.25 }
4598 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4599 wakaba 1.25 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4600 wakaba 1.100 pop @{$self->{open_elements}} # <head>
4601 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
4602 wakaba 1.125 !!!ack ('t103.1');
4603 wakaba 1.1 !!!next-token;
4604 wakaba 1.126 next B;
4605 wakaba 1.34 } elsif ($token->{tag_name} eq 'meta') {
4606     ## NOTE: There is a "as if in head" code clone.
4607 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4608 wakaba 1.79 !!!cp ('t104');
4609 wakaba 1.153 !!!parse-error (type => 'after head',
4610     text => $token->{tag_name}, token => $token);
4611 wakaba 1.123 push @{$self->{open_elements}},
4612     [$self->{head_element}, $el_category->{head}];
4613 wakaba 1.79 } else {
4614     !!!cp ('t105');
4615 wakaba 1.34 }
4616 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4617 wakaba 1.66 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4618 wakaba 1.34
4619     unless ($self->{confident}) {
4620 wakaba 1.134 if ($token->{attributes}->{charset}) {
4621 wakaba 1.79 !!!cp ('t106');
4622 wakaba 1.134 ## NOTE: Whether the encoding is supported or not is handled
4623     ## in the {change_encoding} callback.
4624 wakaba 1.63 $self->{change_encoding}
4625 wakaba 1.114 ->($self, $token->{attributes}->{charset}->{value},
4626     $token);
4627 wakaba 1.66
4628     $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4629     ->set_user_data (manakai_has_reference =>
4630     $token->{attributes}->{charset}
4631     ->{has_reference});
4632 wakaba 1.63 } elsif ($token->{attributes}->{content}) {
4633     if ($token->{attributes}->{content}->{value}
4634 wakaba 1.144 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4635 wakaba 1.70 [\x09-\x0D\x20]*=
4636 wakaba 1.34 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4637 wakaba 1.145 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
4638 wakaba 1.79 !!!cp ('t107');
4639 wakaba 1.134 ## NOTE: Whether the encoding is supported or not is handled
4640     ## in the {change_encoding} callback.
4641 wakaba 1.63 $self->{change_encoding}
4642 wakaba 1.114 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
4643     $token);
4644 wakaba 1.68 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4645     ->set_user_data (manakai_has_reference =>
4646     $token->{attributes}->{content}
4647     ->{has_reference});
4648 wakaba 1.79 } else {
4649     !!!cp ('t108');
4650 wakaba 1.63 }
4651 wakaba 1.34 }
4652 wakaba 1.66 } else {
4653     if ($token->{attributes}->{charset}) {
4654 wakaba 1.79 !!!cp ('t109');
4655 wakaba 1.66 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4656     ->set_user_data (manakai_has_reference =>
4657     $token->{attributes}->{charset}
4658     ->{has_reference});
4659     }
4660 wakaba 1.68 if ($token->{attributes}->{content}) {
4661 wakaba 1.79 !!!cp ('t110');
4662 wakaba 1.68 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4663     ->set_user_data (manakai_has_reference =>
4664     $token->{attributes}->{content}
4665     ->{has_reference});
4666     }
4667 wakaba 1.34 }
4668    
4669 wakaba 1.100 pop @{$self->{open_elements}} # <head>
4670 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
4671 wakaba 1.125 !!!ack ('t110.1');
4672 wakaba 1.34 !!!next-token;
4673 wakaba 1.126 next B;
4674 wakaba 1.49 } elsif ($token->{tag_name} eq 'title') {
4675 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4676 wakaba 1.79 !!!cp ('t111');
4677 wakaba 1.49 ## As if </noscript>
4678     pop @{$self->{open_elements}};
4679 wakaba 1.153 !!!parse-error (type => 'in noscript', text => 'title',
4680     token => $token);
4681 wakaba 1.49
4682 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
4683 wakaba 1.49 ## Reprocess in the "in head" insertion mode...
4684 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4685 wakaba 1.79 !!!cp ('t112');
4686 wakaba 1.153 !!!parse-error (type => 'after head',
4687     text => $token->{tag_name}, token => $token);
4688 wakaba 1.123 push @{$self->{open_elements}},
4689     [$self->{head_element}, $el_category->{head}];
4690 wakaba 1.79 } else {
4691     !!!cp ('t113');
4692 wakaba 1.25 }
4693 wakaba 1.49
4694     ## NOTE: There is a "as if in head" code clone.
4695 wakaba 1.31 my $parent = defined $self->{head_element} ? $self->{head_element}
4696     : $self->{open_elements}->[-1]->[0];
4697 wakaba 1.96 $parse_rcdata->(RCDATA_CONTENT_MODEL);
4698 wakaba 1.100 pop @{$self->{open_elements}} # <head>
4699 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
4700 wakaba 1.126 next B;
4701 wakaba 1.148 } elsif ($token->{tag_name} eq 'style' or
4702     $token->{tag_name} eq 'noframes') {
4703 wakaba 1.25 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
4704 wakaba 1.54 ## insertion mode IN_HEAD_IM)
4705 wakaba 1.25 ## NOTE: There is a "as if in head" code clone.
4706 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4707 wakaba 1.79 !!!cp ('t114');
4708 wakaba 1.153 !!!parse-error (type => 'after head',
4709     text => $token->{tag_name}, token => $token);
4710 wakaba 1.123 push @{$self->{open_elements}},
4711     [$self->{head_element}, $el_category->{head}];
4712 wakaba 1.79 } else {
4713     !!!cp ('t115');
4714 wakaba 1.25 }
4715 wakaba 1.96 $parse_rcdata->(CDATA_CONTENT_MODEL);
4716 wakaba 1.100 pop @{$self->{open_elements}} # <head>
4717 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
4718 wakaba 1.126 next B;
4719 wakaba 1.25 } elsif ($token->{tag_name} eq 'noscript') {
4720 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_IM) {
4721 wakaba 1.79 !!!cp ('t116');
4722 wakaba 1.25 ## NOTE: and scripting is disabled
4723 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4724 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
4725 wakaba 1.125 !!!nack ('t116.1');
4726 wakaba 1.1 !!!next-token;
4727 wakaba 1.126 next B;
4728 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4729 wakaba 1.79 !!!cp ('t117');
4730 wakaba 1.153 !!!parse-error (type => 'in noscript', text => 'noscript',
4731     token => $token);
4732 wakaba 1.1 ## Ignore the token
4733 wakaba 1.125 !!!nack ('t117.1');
4734 wakaba 1.41 !!!next-token;
4735 wakaba 1.126 next B;
4736 wakaba 1.1 } else {
4737 wakaba 1.79 !!!cp ('t118');
4738 wakaba 1.25 #
4739 wakaba 1.1 }
4740 wakaba 1.49 } elsif ($token->{tag_name} eq 'script') {
4741 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4742 wakaba 1.79 !!!cp ('t119');
4743 wakaba 1.49 ## As if </noscript>
4744     pop @{$self->{open_elements}};
4745 wakaba 1.153 !!!parse-error (type => 'in noscript', text => 'script',
4746     token => $token);
4747 wakaba 1.49
4748 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
4749 wakaba 1.49 ## Reprocess in the "in head" insertion mode...
4750 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4751 wakaba 1.79 !!!cp ('t120');
4752 wakaba 1.153 !!!parse-error (type => 'after head',
4753     text => $token->{tag_name}, token => $token);
4754 wakaba 1.123 push @{$self->{open_elements}},
4755     [$self->{head_element}, $el_category->{head}];
4756 wakaba 1.79 } else {
4757     !!!cp ('t121');
4758 wakaba 1.25 }
4759 wakaba 1.49
4760 wakaba 1.25 ## NOTE: There is a "as if in head" code clone.
4761 wakaba 1.100 $script_start_tag->();
4762     pop @{$self->{open_elements}} # <head>
4763 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
4764 wakaba 1.126 next B;
4765 wakaba 1.49 } elsif ($token->{tag_name} eq 'body' or
4766 wakaba 1.25 $token->{tag_name} eq 'frameset') {
4767 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4768 wakaba 1.79 !!!cp ('t122');
4769 wakaba 1.49 ## As if </noscript>
4770     pop @{$self->{open_elements}};
4771 wakaba 1.153 !!!parse-error (type => 'in noscript',
4772     text => $token->{tag_name}, token => $token);
4773 wakaba 1.49
4774     ## Reprocess in the "in head" insertion mode...
4775     ## As if </head>
4776     pop @{$self->{open_elements}};
4777    
4778     ## Reprocess in the "after head" insertion mode...
4779 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4780 wakaba 1.79 !!!cp ('t124');
4781 wakaba 1.49 pop @{$self->{open_elements}};
4782    
4783     ## Reprocess in the "after head" insertion mode...
4784 wakaba 1.79 } else {
4785     !!!cp ('t125');
4786 wakaba 1.49 }
4787    
4788     ## "after head" insertion mode
4789 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4790 wakaba 1.54 if ($token->{tag_name} eq 'body') {
4791 wakaba 1.79 !!!cp ('t126');
4792 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
4793     } elsif ($token->{tag_name} eq 'frameset') {
4794 wakaba 1.79 !!!cp ('t127');
4795 wakaba 1.54 $self->{insertion_mode} = IN_FRAMESET_IM;
4796     } else {
4797     die "$0: tag name: $self->{tag_name}";
4798     }
4799 wakaba 1.125 !!!nack ('t127.1');
4800 wakaba 1.1 !!!next-token;
4801 wakaba 1.126 next B;
4802 wakaba 1.1 } else {
4803 wakaba 1.79 !!!cp ('t128');
4804 wakaba 1.1 #
4805     }
4806 wakaba 1.49
4807 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4808 wakaba 1.79 !!!cp ('t129');
4809 wakaba 1.49 ## As if </noscript>
4810     pop @{$self->{open_elements}};
4811 wakaba 1.153 !!!parse-error (type => 'in noscript:/',
4812     text => $token->{tag_name}, token => $token);
4813 wakaba 1.49
4814     ## Reprocess in the "in head" insertion mode...
4815     ## As if </head>
4816 wakaba 1.25 pop @{$self->{open_elements}};
4817 wakaba 1.49
4818     ## Reprocess in the "after head" insertion mode...
4819 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4820 wakaba 1.79 !!!cp ('t130');
4821 wakaba 1.49 ## As if </head>
4822 wakaba 1.25 pop @{$self->{open_elements}};
4823 wakaba 1.49
4824     ## Reprocess in the "after head" insertion mode...
4825 wakaba 1.79 } else {
4826     !!!cp ('t131');
4827 wakaba 1.49 }
4828    
4829     ## "after head" insertion mode
4830     ## As if <body>
4831 wakaba 1.116 !!!insert-element ('body',, $token);
4832 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
4833 wakaba 1.49 ## reprocess
4834 wakaba 1.125 !!!ack-later;
4835 wakaba 1.126 next B;
4836 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
4837 wakaba 1.49 if ($token->{tag_name} eq 'head') {
4838 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4839 wakaba 1.79 !!!cp ('t132');
4840 wakaba 1.50 ## As if <head>
4841 wakaba 1.126 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4842 wakaba 1.50 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4843 wakaba 1.123 push @{$self->{open_elements}},
4844     [$self->{head_element}, $el_category->{head}];
4845 wakaba 1.50
4846     ## Reprocess in the "in head" insertion mode...
4847     pop @{$self->{open_elements}};
4848 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
4849 wakaba 1.50 !!!next-token;
4850 wakaba 1.126 next B;
4851 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4852 wakaba 1.79 !!!cp ('t133');
4853 wakaba 1.49 ## As if </noscript>
4854     pop @{$self->{open_elements}};
4855 wakaba 1.153 !!!parse-error (type => 'in noscript:/',
4856     text => 'head', token => $token);
4857 wakaba 1.49
4858     ## Reprocess in the "in head" insertion mode...
4859 wakaba 1.50 pop @{$self->{open_elements}};
4860 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
4861 wakaba 1.50 !!!next-token;
4862 wakaba 1.126 next B;
4863 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4864 wakaba 1.79 !!!cp ('t134');
4865 wakaba 1.49 pop @{$self->{open_elements}};
4866 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
4867 wakaba 1.49 !!!next-token;
4868 wakaba 1.126 next B;
4869 wakaba 1.139 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4870     !!!cp ('t134.1');
4871 wakaba 1.153 !!!parse-error (type => 'unmatched end tag', text => 'head',
4872     token => $token);
4873 wakaba 1.139 ## Ignore the token
4874     !!!next-token;
4875     next B;
4876 wakaba 1.49 } else {
4877 wakaba 1.139 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4878 wakaba 1.49 }
4879     } elsif ($token->{tag_name} eq 'noscript') {
4880 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4881 wakaba 1.79 !!!cp ('t136');
4882 wakaba 1.49 pop @{$self->{open_elements}};
4883 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
4884 wakaba 1.49 !!!next-token;
4885 wakaba 1.126 next B;
4886 wakaba 1.139 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM or
4887     $self->{insertion_mode} == AFTER_HEAD_IM) {
4888 wakaba 1.79 !!!cp ('t137');
4889 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
4890     text => 'noscript', token => $token);
4891 wakaba 1.50 ## Ignore the token ## ISSUE: An issue in the spec.
4892     !!!next-token;
4893 wakaba 1.126 next B;
4894 wakaba 1.49 } else {
4895 wakaba 1.79 !!!cp ('t138');
4896 wakaba 1.49 #
4897     }
4898     } elsif ({
4899 wakaba 1.31 body => 1, html => 1,
4900     }->{$token->{tag_name}}) {
4901 wakaba 1.139 if ($self->{insertion_mode} == BEFORE_HEAD_IM or
4902     $self->{insertion_mode} == IN_HEAD_IM or
4903     $self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4904 wakaba 1.79 !!!cp ('t140');
4905 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
4906     text => $token->{tag_name}, token => $token);
4907 wakaba 1.49 ## Ignore the token
4908     !!!next-token;
4909 wakaba 1.126 next B;
4910 wakaba 1.139 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4911     !!!cp ('t140.1');
4912 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
4913     text => $token->{tag_name}, token => $token);
4914 wakaba 1.139 ## Ignore the token
4915     !!!next-token;
4916     next B;
4917 wakaba 1.79 } else {
4918 wakaba 1.139 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4919 wakaba 1.49 }
4920 wakaba 1.139 } elsif ($token->{tag_name} eq 'p') {
4921     !!!cp ('t142');
4922 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
4923     text => $token->{tag_name}, token => $token);
4924 wakaba 1.139 ## Ignore the token
4925     !!!next-token;
4926     next B;
4927     } elsif ($token->{tag_name} eq 'br') {
4928 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4929 wakaba 1.139 !!!cp ('t142.2');
4930     ## (before head) as if <head>, (in head) as if </head>
4931 wakaba 1.126 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4932 wakaba 1.50 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4933 wakaba 1.139 $self->{insertion_mode} = AFTER_HEAD_IM;
4934    
4935     ## Reprocess in the "after head" insertion mode...
4936     } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4937     !!!cp ('t143.2');
4938     ## As if </head>
4939     pop @{$self->{open_elements}};
4940     $self->{insertion_mode} = AFTER_HEAD_IM;
4941    
4942     ## Reprocess in the "after head" insertion mode...
4943     } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4944     !!!cp ('t143.3');
4945     ## ISSUE: Two parse errors for <head><noscript></br>
4946 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
4947     text => 'br', token => $token);
4948 wakaba 1.139 ## As if </noscript>
4949     pop @{$self->{open_elements}};
4950     $self->{insertion_mode} = IN_HEAD_IM;
4951 wakaba 1.50
4952     ## Reprocess in the "in head" insertion mode...
4953 wakaba 1.139 ## As if </head>
4954     pop @{$self->{open_elements}};
4955     $self->{insertion_mode} = AFTER_HEAD_IM;
4956    
4957     ## Reprocess in the "after head" insertion mode...
4958     } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4959     !!!cp ('t143.4');
4960     #
4961 wakaba 1.79 } else {
4962 wakaba 1.139 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4963 wakaba 1.50 }
4964    
4965 wakaba 1.139 ## ISSUE: does not agree with IE7 - it doesn't ignore </br>.
4966 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
4967     text => 'br', token => $token);
4968 wakaba 1.139 ## Ignore the token
4969     !!!next-token;
4970     next B;
4971 wakaba 1.25 } else {
4972 wakaba 1.139 !!!cp ('t145');
4973 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
4974     text => $token->{tag_name}, token => $token);
4975 wakaba 1.139 ## Ignore the token
4976     !!!next-token;
4977     next B;
4978 wakaba 1.49 }
4979    
4980 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4981 wakaba 1.79 !!!cp ('t146');
4982 wakaba 1.49 ## As if </noscript>
4983     pop @{$self->{open_elements}};
4984 wakaba 1.153 !!!parse-error (type => 'in noscript:/',
4985     text => $token->{tag_name}, token => $token);
4986 wakaba 1.49
4987     ## Reprocess in the "in head" insertion mode...
4988     ## As if </head>
4989     pop @{$self->{open_elements}};
4990    
4991     ## Reprocess in the "after head" insertion mode...
4992 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4993 wakaba 1.79 !!!cp ('t147');
4994 wakaba 1.49 ## As if </head>
4995     pop @{$self->{open_elements}};
4996    
4997     ## Reprocess in the "after head" insertion mode...
4998 wakaba 1.54 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4999 wakaba 1.82 ## ISSUE: This case cannot be reached?
5000 wakaba 1.79 !!!cp ('t148');
5001 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5002     text => $token->{tag_name}, token => $token);
5003 wakaba 1.50 ## Ignore the token ## ISSUE: An issue in the spec.
5004     !!!next-token;
5005 wakaba 1.126 next B;
5006 wakaba 1.79 } else {
5007     !!!cp ('t149');
5008 wakaba 1.1 }
5009    
5010 wakaba 1.49 ## "after head" insertion mode
5011     ## As if <body>
5012 wakaba 1.116 !!!insert-element ('body',, $token);
5013 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
5014 wakaba 1.52 ## reprocess
5015 wakaba 1.126 next B;
5016 wakaba 1.104 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5017     if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
5018     !!!cp ('t149.1');
5019    
5020     ## NOTE: As if <head>
5021 wakaba 1.126 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
5022 wakaba 1.104 $self->{open_elements}->[-1]->[0]->append_child
5023     ($self->{head_element});
5024 wakaba 1.123 #push @{$self->{open_elements}},
5025     # [$self->{head_element}, $el_category->{head}];
5026 wakaba 1.104 #$self->{insertion_mode} = IN_HEAD_IM;
5027     ## NOTE: Reprocess.
5028    
5029     ## NOTE: As if </head>
5030     #pop @{$self->{open_elements}};
5031     #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
5032     ## NOTE: Reprocess.
5033    
5034     #
5035     } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
5036     !!!cp ('t149.2');
5037    
5038     ## NOTE: As if </head>
5039     pop @{$self->{open_elements}};
5040     #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
5041     ## NOTE: Reprocess.
5042    
5043     #
5044     } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
5045     !!!cp ('t149.3');
5046    
5047 wakaba 1.113 !!!parse-error (type => 'in noscript:#eof', token => $token);
5048 wakaba 1.104
5049     ## As if </noscript>
5050     pop @{$self->{open_elements}};
5051     #$self->{insertion_mode} = IN_HEAD_IM;
5052     ## NOTE: Reprocess.
5053    
5054     ## NOTE: As if </head>
5055     pop @{$self->{open_elements}};
5056     #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
5057     ## NOTE: Reprocess.
5058    
5059     #
5060     } else {
5061     !!!cp ('t149.4');
5062     #
5063     }
5064    
5065     ## NOTE: As if <body>
5066 wakaba 1.116 !!!insert-element ('body',, $token);
5067 wakaba 1.104 $self->{insertion_mode} = IN_BODY_IM;
5068     ## NOTE: Reprocess.
5069 wakaba 1.126 next B;
5070 wakaba 1.104 } else {
5071     die "$0: $token->{type}: Unknown token type";
5072     }
5073 wakaba 1.52
5074     ## ISSUE: An issue in the spec.
5075 wakaba 1.56 } elsif ($self->{insertion_mode} & BODY_IMS) {
5076 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
5077 wakaba 1.79 !!!cp ('t150');
5078 wakaba 1.52 ## NOTE: There is a code clone of "character in body".
5079     $reconstruct_active_formatting_elements->($insert_to_current);
5080    
5081     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5082    
5083     !!!next-token;
5084 wakaba 1.126 next B;
5085 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
5086 wakaba 1.52 if ({
5087     caption => 1, col => 1, colgroup => 1, tbody => 1,
5088     td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
5089     }->{$token->{tag_name}}) {
5090 wakaba 1.54 if ($self->{insertion_mode} == IN_CELL_IM) {
5091 wakaba 1.52 ## have an element in table scope
5092 wakaba 1.108 for (reverse 0..$#{$self->{open_elements}}) {
5093 wakaba 1.52 my $node = $self->{open_elements}->[$_];
5094 wakaba 1.123 if ($node->[1] & TABLE_CELL_EL) {
5095 wakaba 1.79 !!!cp ('t151');
5096 wakaba 1.108
5097     ## Close the cell
5098 wakaba 1.125 !!!back-token; # <x>
5099 wakaba 1.122 $token = {type => END_TAG_TOKEN,
5100     tag_name => $node->[0]->manakai_local_name,
5101 wakaba 1.114 line => $token->{line},
5102     column => $token->{column}};
5103 wakaba 1.126 next B;
5104 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5105 wakaba 1.79 !!!cp ('t152');
5106 wakaba 1.108 ## ISSUE: This case can never be reached, maybe.
5107     last;
5108 wakaba 1.52 }
5109 wakaba 1.108 }
5110    
5111     !!!cp ('t153');
5112     !!!parse-error (type => 'start tag not allowed',
5113 wakaba 1.153 text => $token->{tag_name}, token => $token);
5114 wakaba 1.108 ## Ignore the token
5115 wakaba 1.125 !!!nack ('t153.1');
5116 wakaba 1.108 !!!next-token;
5117 wakaba 1.126 next B;
5118 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
5119 wakaba 1.153 !!!parse-error (type => 'not closed', text => 'caption',
5120     token => $token);
5121 wakaba 1.52
5122 wakaba 1.108 ## NOTE: As if </caption>.
5123 wakaba 1.52 ## have a table element in table scope
5124     my $i;
5125 wakaba 1.108 INSCOPE: {
5126     for (reverse 0..$#{$self->{open_elements}}) {
5127     my $node = $self->{open_elements}->[$_];
5128 wakaba 1.123 if ($node->[1] & CAPTION_EL) {
5129 wakaba 1.108 !!!cp ('t155');
5130     $i = $_;
5131     last INSCOPE;
5132 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5133 wakaba 1.108 !!!cp ('t156');
5134     last;
5135     }
5136 wakaba 1.52 }
5137 wakaba 1.108
5138     !!!cp ('t157');
5139     !!!parse-error (type => 'start tag not allowed',
5140 wakaba 1.153 text => $token->{tag_name}, token => $token);
5141 wakaba 1.108 ## Ignore the token
5142 wakaba 1.125 !!!nack ('t157.1');
5143 wakaba 1.108 !!!next-token;
5144 wakaba 1.126 next B;
5145 wakaba 1.52 } # INSCOPE
5146    
5147     ## generate implied end tags
5148 wakaba 1.123 while ($self->{open_elements}->[-1]->[1]
5149     & END_TAG_OPTIONAL_EL) {
5150 wakaba 1.79 !!!cp ('t158');
5151 wakaba 1.86 pop @{$self->{open_elements}};
5152 wakaba 1.52 }
5153    
5154 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5155 wakaba 1.79 !!!cp ('t159');
5156 wakaba 1.122 !!!parse-error (type => 'not closed',
5157 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
5158 wakaba 1.122 ->manakai_local_name,
5159     token => $token);
5160 wakaba 1.79 } else {
5161     !!!cp ('t160');
5162 wakaba 1.52 }
5163    
5164     splice @{$self->{open_elements}}, $i;
5165    
5166     $clear_up_to_marker->();
5167    
5168 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
5169 wakaba 1.52
5170     ## reprocess
5171 wakaba 1.125 !!!ack-later;
5172 wakaba 1.126 next B;
5173 wakaba 1.52 } else {
5174 wakaba 1.79 !!!cp ('t161');
5175 wakaba 1.52 #
5176     }
5177     } else {
5178 wakaba 1.79 !!!cp ('t162');
5179 wakaba 1.52 #
5180     }
5181 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
5182 wakaba 1.52 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
5183 wakaba 1.54 if ($self->{insertion_mode} == IN_CELL_IM) {
5184 wakaba 1.43 ## have an element in table scope
5185 wakaba 1.52 my $i;
5186 wakaba 1.43 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5187     my $node = $self->{open_elements}->[$_];
5188 wakaba 1.123 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5189 wakaba 1.79 !!!cp ('t163');
5190 wakaba 1.52 $i = $_;
5191 wakaba 1.43 last INSCOPE;
5192 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5193 wakaba 1.79 !!!cp ('t164');
5194 wakaba 1.43 last INSCOPE;
5195     }
5196     } # INSCOPE
5197 wakaba 1.52 unless (defined $i) {
5198 wakaba 1.79 !!!cp ('t165');
5199 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5200     text => $token->{tag_name},
5201     token => $token);
5202 wakaba 1.43 ## Ignore the token
5203     !!!next-token;
5204 wakaba 1.126 next B;
5205 wakaba 1.43 }
5206    
5207 wakaba 1.52 ## generate implied end tags
5208 wakaba 1.123 while ($self->{open_elements}->[-1]->[1]
5209     & END_TAG_OPTIONAL_EL) {
5210 wakaba 1.79 !!!cp ('t166');
5211 wakaba 1.86 pop @{$self->{open_elements}};
5212 wakaba 1.52 }
5213 wakaba 1.86
5214 wakaba 1.123 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
5215     ne $token->{tag_name}) {
5216 wakaba 1.79 !!!cp ('t167');
5217 wakaba 1.122 !!!parse-error (type => 'not closed',
5218 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
5219 wakaba 1.122 ->manakai_local_name,
5220     token => $token);
5221 wakaba 1.79 } else {
5222     !!!cp ('t168');
5223 wakaba 1.52 }
5224    
5225     splice @{$self->{open_elements}}, $i;
5226    
5227     $clear_up_to_marker->();
5228    
5229 wakaba 1.54 $self->{insertion_mode} = IN_ROW_IM;
5230 wakaba 1.52
5231     !!!next-token;
5232 wakaba 1.126 next B;
5233 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
5234 wakaba 1.79 !!!cp ('t169');
5235 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5236     text => $token->{tag_name}, token => $token);
5237 wakaba 1.52 ## Ignore the token
5238     !!!next-token;
5239 wakaba 1.126 next B;
5240 wakaba 1.52 } else {
5241 wakaba 1.79 !!!cp ('t170');
5242 wakaba 1.52 #
5243     }
5244     } elsif ($token->{tag_name} eq 'caption') {
5245 wakaba 1.54 if ($self->{insertion_mode} == IN_CAPTION_IM) {
5246 wakaba 1.43 ## have a table element in table scope
5247     my $i;
5248 wakaba 1.108 INSCOPE: {
5249     for (reverse 0..$#{$self->{open_elements}}) {
5250     my $node = $self->{open_elements}->[$_];
5251 wakaba 1.123 if ($node->[1] & CAPTION_EL) {
5252 wakaba 1.108 !!!cp ('t171');
5253     $i = $_;
5254     last INSCOPE;
5255 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5256 wakaba 1.108 !!!cp ('t172');
5257     last;
5258     }
5259 wakaba 1.43 }
5260 wakaba 1.108
5261     !!!cp ('t173');
5262     !!!parse-error (type => 'unmatched end tag',
5263 wakaba 1.153 text => $token->{tag_name}, token => $token);
5264 wakaba 1.108 ## Ignore the token
5265     !!!next-token;
5266 wakaba 1.126 next B;
5267 wakaba 1.43 } # INSCOPE
5268    
5269     ## generate implied end tags
5270 wakaba 1.123 while ($self->{open_elements}->[-1]->[1]
5271     & END_TAG_OPTIONAL_EL) {
5272 wakaba 1.79 !!!cp ('t174');
5273 wakaba 1.86 pop @{$self->{open_elements}};
5274 wakaba 1.43 }
5275 wakaba 1.52
5276 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5277 wakaba 1.79 !!!cp ('t175');
5278 wakaba 1.122 !!!parse-error (type => 'not closed',
5279 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
5280 wakaba 1.122 ->manakai_local_name,
5281     token => $token);
5282 wakaba 1.79 } else {
5283     !!!cp ('t176');
5284 wakaba 1.52 }
5285    
5286     splice @{$self->{open_elements}}, $i;
5287    
5288     $clear_up_to_marker->();
5289    
5290 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
5291 wakaba 1.52
5292     !!!next-token;
5293 wakaba 1.126 next B;
5294 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
5295 wakaba 1.79 !!!cp ('t177');
5296 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5297     text => $token->{tag_name}, token => $token);
5298 wakaba 1.52 ## Ignore the token
5299     !!!next-token;
5300 wakaba 1.126 next B;
5301 wakaba 1.52 } else {
5302 wakaba 1.79 !!!cp ('t178');
5303 wakaba 1.52 #
5304     }
5305     } elsif ({
5306     table => 1, tbody => 1, tfoot => 1,
5307     thead => 1, tr => 1,
5308     }->{$token->{tag_name}} and
5309 wakaba 1.54 $self->{insertion_mode} == IN_CELL_IM) {
5310 wakaba 1.52 ## have an element in table scope
5311     my $i;
5312     my $tn;
5313 wakaba 1.108 INSCOPE: {
5314     for (reverse 0..$#{$self->{open_elements}}) {
5315     my $node = $self->{open_elements}->[$_];
5316 wakaba 1.123 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5317 wakaba 1.108 !!!cp ('t179');
5318     $i = $_;
5319    
5320     ## Close the cell
5321 wakaba 1.125 !!!back-token; # </x>
5322 wakaba 1.114 $token = {type => END_TAG_TOKEN, tag_name => $tn,
5323     line => $token->{line},
5324     column => $token->{column}};
5325 wakaba 1.126 next B;
5326 wakaba 1.123 } elsif ($node->[1] & TABLE_CELL_EL) {
5327 wakaba 1.108 !!!cp ('t180');
5328 wakaba 1.123 $tn = $node->[0]->manakai_local_name;
5329 wakaba 1.108 ## NOTE: There is exactly one |td| or |th| element
5330     ## in scope in the stack of open elements by definition.
5331 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5332 wakaba 1.108 ## ISSUE: Can this be reached?
5333     !!!cp ('t181');
5334     last;
5335     }
5336 wakaba 1.52 }
5337 wakaba 1.108
5338 wakaba 1.79 !!!cp ('t182');
5339 wakaba 1.108 !!!parse-error (type => 'unmatched end tag',
5340 wakaba 1.153 text => $token->{tag_name}, token => $token);
5341 wakaba 1.52 ## Ignore the token
5342     !!!next-token;
5343 wakaba 1.126 next B;
5344 wakaba 1.108 } # INSCOPE
5345 wakaba 1.52 } elsif ($token->{tag_name} eq 'table' and
5346 wakaba 1.54 $self->{insertion_mode} == IN_CAPTION_IM) {
5347 wakaba 1.153 !!!parse-error (type => 'not closed', text => 'caption',
5348     token => $token);
5349 wakaba 1.52
5350     ## As if </caption>
5351     ## have a table element in table scope
5352     my $i;
5353     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5354     my $node = $self->{open_elements}->[$_];
5355 wakaba 1.123 if ($node->[1] & CAPTION_EL) {
5356 wakaba 1.79 !!!cp ('t184');
5357 wakaba 1.52 $i = $_;
5358     last INSCOPE;
5359 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5360 wakaba 1.79 !!!cp ('t185');
5361 wakaba 1.52 last INSCOPE;
5362     }
5363     } # INSCOPE
5364     unless (defined $i) {
5365 wakaba 1.79 !!!cp ('t186');
5366 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5367     text => 'caption', token => $token);
5368 wakaba 1.52 ## Ignore the token
5369     !!!next-token;
5370 wakaba 1.126 next B;
5371 wakaba 1.52 }
5372    
5373     ## generate implied end tags
5374 wakaba 1.123 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5375 wakaba 1.79 !!!cp ('t187');
5376 wakaba 1.86 pop @{$self->{open_elements}};
5377 wakaba 1.52 }
5378    
5379 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5380 wakaba 1.79 !!!cp ('t188');
5381 wakaba 1.122 !!!parse-error (type => 'not closed',
5382 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
5383 wakaba 1.122 ->manakai_local_name,
5384     token => $token);
5385 wakaba 1.79 } else {
5386     !!!cp ('t189');
5387 wakaba 1.52 }
5388    
5389     splice @{$self->{open_elements}}, $i;
5390    
5391     $clear_up_to_marker->();
5392    
5393 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
5394 wakaba 1.52
5395     ## reprocess
5396 wakaba 1.126 next B;
5397 wakaba 1.52 } elsif ({
5398     body => 1, col => 1, colgroup => 1, html => 1,
5399     }->{$token->{tag_name}}) {
5400 wakaba 1.56 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
5401 wakaba 1.79 !!!cp ('t190');
5402 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5403     text => $token->{tag_name}, token => $token);
5404 wakaba 1.52 ## Ignore the token
5405     !!!next-token;
5406 wakaba 1.126 next B;
5407 wakaba 1.52 } else {
5408 wakaba 1.79 !!!cp ('t191');
5409 wakaba 1.52 #
5410     }
5411     } elsif ({
5412     tbody => 1, tfoot => 1,
5413     thead => 1, tr => 1,
5414     }->{$token->{tag_name}} and
5415 wakaba 1.54 $self->{insertion_mode} == IN_CAPTION_IM) {
5416 wakaba 1.79 !!!cp ('t192');
5417 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5418     text => $token->{tag_name}, token => $token);
5419 wakaba 1.52 ## Ignore the token
5420     !!!next-token;
5421 wakaba 1.126 next B;
5422 wakaba 1.52 } else {
5423 wakaba 1.79 !!!cp ('t193');
5424 wakaba 1.52 #
5425     }
5426 wakaba 1.104 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5427     for my $entry (@{$self->{open_elements}}) {
5428 wakaba 1.123 unless ($entry->[1] & ALL_END_TAG_OPTIONAL_EL) {
5429 wakaba 1.104 !!!cp ('t75');
5430 wakaba 1.113 !!!parse-error (type => 'in body:#eof', token => $token);
5431 wakaba 1.104 last;
5432     }
5433     }
5434    
5435     ## Stop parsing.
5436     last B;
5437 wakaba 1.52 } else {
5438     die "$0: $token->{type}: Unknown token type";
5439     }
5440    
5441     $insert = $insert_to_current;
5442     #
5443 wakaba 1.56 } elsif ($self->{insertion_mode} & TABLE_IMS) {
5444 wakaba 1.58 if ($token->{type} == CHARACTER_TOKEN) {
5445 wakaba 1.95 if (not $open_tables->[-1]->[1] and # tainted
5446     $token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
5447     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5448 wakaba 1.52
5449 wakaba 1.95 unless (length $token->{data}) {
5450     !!!cp ('t194');
5451     !!!next-token;
5452 wakaba 1.126 next B;
5453 wakaba 1.95 } else {
5454     !!!cp ('t195');
5455     }
5456     }
5457 wakaba 1.52
5458 wakaba 1.153 !!!parse-error (type => 'in table:#text', token => $token);
5459 wakaba 1.52
5460     ## As if in body, but insert into foster parent element
5461     ## ISSUE: Spec says that "whenever a node would be inserted
5462     ## into the current node" while characters might not
5463     ## result in a new Text node.
5464     $reconstruct_active_formatting_elements->($insert_to_foster);
5465    
5466 wakaba 1.123 if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
5467 wakaba 1.52 # MUST
5468     my $foster_parent_element;
5469     my $next_sibling;
5470     my $prev_sibling;
5471     OE: for (reverse 0..$#{$self->{open_elements}}) {
5472 wakaba 1.123 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
5473 wakaba 1.52 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
5474     if (defined $parent and $parent->node_type == 1) {
5475 wakaba 1.79 !!!cp ('t196');
5476 wakaba 1.52 $foster_parent_element = $parent;
5477     $next_sibling = $self->{open_elements}->[$_]->[0];
5478     $prev_sibling = $next_sibling->previous_sibling;
5479     } else {
5480 wakaba 1.79 !!!cp ('t197');
5481 wakaba 1.52 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
5482     $prev_sibling = $foster_parent_element->last_child;
5483     }
5484     last OE;
5485     }
5486     } # OE
5487     $foster_parent_element = $self->{open_elements}->[0]->[0] and
5488     $prev_sibling = $foster_parent_element->last_child
5489     unless defined $foster_parent_element;
5490     if (defined $prev_sibling and
5491     $prev_sibling->node_type == 3) {
5492 wakaba 1.79 !!!cp ('t198');
5493 wakaba 1.52 $prev_sibling->manakai_append_text ($token->{data});
5494     } else {
5495 wakaba 1.79 !!!cp ('t199');
5496 wakaba 1.52 $foster_parent_element->insert_before
5497     ($self->{document}->create_text_node ($token->{data}),
5498     $next_sibling);
5499     }
5500 wakaba 1.95 $open_tables->[-1]->[1] = 1; # tainted
5501     } else {
5502     !!!cp ('t200');
5503     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5504     }
5505 wakaba 1.52
5506 wakaba 1.95 !!!next-token;
5507 wakaba 1.126 next B;
5508 wakaba 1.58 } elsif ($token->{type} == START_TAG_TOKEN) {
5509 wakaba 1.153 if ({
5510     tr => ($self->{insertion_mode} != IN_ROW_IM),
5511     th => 1, td => 1,
5512     }->{$token->{tag_name}}) {
5513     if ($self->{insertion_mode} == IN_TABLE_IM) {
5514     ## Clear back to table context
5515     while (not ($self->{open_elements}->[-1]->[1]
5516     & TABLE_SCOPING_EL)) {
5517     !!!cp ('t201');
5518     pop @{$self->{open_elements}};
5519     }
5520    
5521     !!!insert-element ('tbody',, $token);
5522     $self->{insertion_mode} = IN_TABLE_BODY_IM;
5523     ## reprocess in the "in table body" insertion mode...
5524     }
5525    
5526     if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5527     unless ($token->{tag_name} eq 'tr') {
5528     !!!cp ('t202');
5529     !!!parse-error (type => 'missing start tag:tr', token => $token);
5530     }
5531 wakaba 1.43
5532 wakaba 1.153 ## Clear back to table body context
5533     while (not ($self->{open_elements}->[-1]->[1]
5534     & TABLE_ROWS_SCOPING_EL)) {
5535     !!!cp ('t203');
5536     ## ISSUE: Can this case be reached?
5537     pop @{$self->{open_elements}};
5538     }
5539 wakaba 1.43
5540 wakaba 1.54 $self->{insertion_mode} = IN_ROW_IM;
5541 wakaba 1.52 if ($token->{tag_name} eq 'tr') {
5542 wakaba 1.79 !!!cp ('t204');
5543 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5544 wakaba 1.125 !!!nack ('t204');
5545 wakaba 1.52 !!!next-token;
5546 wakaba 1.126 next B;
5547 wakaba 1.52 } else {
5548 wakaba 1.79 !!!cp ('t205');
5549 wakaba 1.116 !!!insert-element ('tr',, $token);
5550 wakaba 1.52 ## reprocess in the "in row" insertion mode
5551     }
5552 wakaba 1.79 } else {
5553     !!!cp ('t206');
5554 wakaba 1.52 }
5555    
5556     ## Clear back to table row context
5557 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
5558     & TABLE_ROW_SCOPING_EL)) {
5559 wakaba 1.79 !!!cp ('t207');
5560 wakaba 1.52 pop @{$self->{open_elements}};
5561 wakaba 1.43 }
5562 wakaba 1.52
5563 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5564 wakaba 1.54 $self->{insertion_mode} = IN_CELL_IM;
5565 wakaba 1.52
5566     push @$active_formatting_elements, ['#marker', ''];
5567    
5568 wakaba 1.125 !!!nack ('t207.1');
5569 wakaba 1.52 !!!next-token;
5570 wakaba 1.126 next B;
5571 wakaba 1.52 } elsif ({
5572     caption => 1, col => 1, colgroup => 1,
5573     tbody => 1, tfoot => 1, thead => 1,
5574 wakaba 1.54 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5575 wakaba 1.52 }->{$token->{tag_name}}) {
5576 wakaba 1.54 if ($self->{insertion_mode} == IN_ROW_IM) {
5577 wakaba 1.52 ## As if </tr>
5578 wakaba 1.43 ## have an element in table scope
5579     my $i;
5580     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5581     my $node = $self->{open_elements}->[$_];
5582 wakaba 1.123 if ($node->[1] & TABLE_ROW_EL) {
5583 wakaba 1.79 !!!cp ('t208');
5584 wakaba 1.43 $i = $_;
5585     last INSCOPE;
5586 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5587 wakaba 1.79 !!!cp ('t209');
5588 wakaba 1.43 last INSCOPE;
5589     }
5590     } # INSCOPE
5591 wakaba 1.79 unless (defined $i) {
5592 wakaba 1.125 !!!cp ('t210');
5593 wakaba 1.83 ## TODO: This type is wrong.
5594 wakaba 1.153 !!!parse-error (type => 'unmacthed end tag',
5595     text => $token->{tag_name}, token => $token);
5596 wakaba 1.52 ## Ignore the token
5597 wakaba 1.125 !!!nack ('t210.1');
5598 wakaba 1.52 !!!next-token;
5599 wakaba 1.126 next B;
5600 wakaba 1.43 }
5601    
5602 wakaba 1.52 ## Clear back to table row context
5603 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
5604     & TABLE_ROW_SCOPING_EL)) {
5605 wakaba 1.79 !!!cp ('t211');
5606 wakaba 1.83 ## ISSUE: Can this case be reached?
5607 wakaba 1.52 pop @{$self->{open_elements}};
5608 wakaba 1.1 }
5609 wakaba 1.43
5610 wakaba 1.52 pop @{$self->{open_elements}}; # tr
5611 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5612 wakaba 1.52 if ($token->{tag_name} eq 'tr') {
5613 wakaba 1.79 !!!cp ('t212');
5614 wakaba 1.52 ## reprocess
5615 wakaba 1.125 !!!ack-later;
5616 wakaba 1.126 next B;
5617 wakaba 1.52 } else {
5618 wakaba 1.79 !!!cp ('t213');
5619 wakaba 1.52 ## reprocess in the "in table body" insertion mode...
5620     }
5621 wakaba 1.1 }
5622 wakaba 1.52
5623 wakaba 1.54 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5624 wakaba 1.52 ## have an element in table scope
5625 wakaba 1.43 my $i;
5626     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5627     my $node = $self->{open_elements}->[$_];
5628 wakaba 1.123 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5629 wakaba 1.79 !!!cp ('t214');
5630 wakaba 1.43 $i = $_;
5631     last INSCOPE;
5632 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5633 wakaba 1.79 !!!cp ('t215');
5634 wakaba 1.43 last INSCOPE;
5635     }
5636     } # INSCOPE
5637 wakaba 1.52 unless (defined $i) {
5638 wakaba 1.79 !!!cp ('t216');
5639 wakaba 1.153 ## TODO: This error type is wrong.
5640     !!!parse-error (type => 'unmatched end tag',
5641     text => $token->{tag_name}, token => $token);
5642 wakaba 1.52 ## Ignore the token
5643 wakaba 1.125 !!!nack ('t216.1');
5644 wakaba 1.52 !!!next-token;
5645 wakaba 1.126 next B;
5646 wakaba 1.43 }
5647 wakaba 1.52
5648     ## Clear back to table body context
5649 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
5650     & TABLE_ROWS_SCOPING_EL)) {
5651 wakaba 1.79 !!!cp ('t217');
5652 wakaba 1.83 ## ISSUE: Can this state be reached?
5653 wakaba 1.52 pop @{$self->{open_elements}};
5654 wakaba 1.43 }
5655    
5656 wakaba 1.52 ## As if <{current node}>
5657     ## have an element in table scope
5658     ## true by definition
5659 wakaba 1.43
5660 wakaba 1.52 ## Clear back to table body context
5661     ## nop by definition
5662 wakaba 1.43
5663 wakaba 1.52 pop @{$self->{open_elements}};
5664 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
5665 wakaba 1.52 ## reprocess in "in table" insertion mode...
5666 wakaba 1.79 } else {
5667     !!!cp ('t218');
5668 wakaba 1.52 }
5669    
5670     if ($token->{tag_name} eq 'col') {
5671     ## Clear back to table context
5672 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
5673     & TABLE_SCOPING_EL)) {
5674 wakaba 1.79 !!!cp ('t219');
5675 wakaba 1.83 ## ISSUE: Can this state be reached?
5676 wakaba 1.52 pop @{$self->{open_elements}};
5677     }
5678 wakaba 1.43
5679 wakaba 1.116 !!!insert-element ('colgroup',, $token);
5680 wakaba 1.54 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
5681 wakaba 1.52 ## reprocess
5682 wakaba 1.125 !!!ack-later;
5683 wakaba 1.126 next B;
5684 wakaba 1.52 } elsif ({
5685     caption => 1,
5686     colgroup => 1,
5687     tbody => 1, tfoot => 1, thead => 1,
5688     }->{$token->{tag_name}}) {
5689     ## Clear back to table context
5690 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
5691     & TABLE_SCOPING_EL)) {
5692 wakaba 1.79 !!!cp ('t220');
5693 wakaba 1.83 ## ISSUE: Can this state be reached?
5694 wakaba 1.52 pop @{$self->{open_elements}};
5695 wakaba 1.1 }
5696 wakaba 1.52
5697     push @$active_formatting_elements, ['#marker', '']
5698     if $token->{tag_name} eq 'caption';
5699    
5700 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5701 wakaba 1.52 $self->{insertion_mode} = {
5702 wakaba 1.54 caption => IN_CAPTION_IM,
5703     colgroup => IN_COLUMN_GROUP_IM,
5704     tbody => IN_TABLE_BODY_IM,
5705     tfoot => IN_TABLE_BODY_IM,
5706     thead => IN_TABLE_BODY_IM,
5707 wakaba 1.52 }->{$token->{tag_name}};
5708 wakaba 1.1 !!!next-token;
5709 wakaba 1.125 !!!nack ('t220.1');
5710 wakaba 1.126 next B;
5711 wakaba 1.52 } else {
5712     die "$0: in table: <>: $token->{tag_name}";
5713 wakaba 1.1 }
5714 wakaba 1.52 } elsif ($token->{tag_name} eq 'table') {
5715 wakaba 1.122 !!!parse-error (type => 'not closed',
5716 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
5717 wakaba 1.122 ->manakai_local_name,
5718     token => $token);
5719 wakaba 1.1
5720 wakaba 1.52 ## As if </table>
5721 wakaba 1.1 ## have a table element in table scope
5722     my $i;
5723 wakaba 1.3 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5724     my $node = $self->{open_elements}->[$_];
5725 wakaba 1.123 if ($node->[1] & TABLE_EL) {
5726 wakaba 1.79 !!!cp ('t221');
5727 wakaba 1.1 $i = $_;
5728     last INSCOPE;
5729 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5730 wakaba 1.79 !!!cp ('t222');
5731 wakaba 1.1 last INSCOPE;
5732     }
5733     } # INSCOPE
5734     unless (defined $i) {
5735 wakaba 1.79 !!!cp ('t223');
5736 wakaba 1.83 ## TODO: The following is wrong, maybe.
5737 wakaba 1.153 !!!parse-error (type => 'unmatched end tag', text => 'table',
5738     token => $token);
5739 wakaba 1.52 ## Ignore tokens </table><table>
5740 wakaba 1.125 !!!nack ('t223.1');
5741 wakaba 1.1 !!!next-token;
5742 wakaba 1.126 next B;
5743 wakaba 1.1 }
5744    
5745 wakaba 1.151 ## TODO: The following steps have been removed from the latest spec.
5746 wakaba 1.1 ## generate implied end tags
5747 wakaba 1.123 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5748 wakaba 1.79 !!!cp ('t224');
5749 wakaba 1.86 pop @{$self->{open_elements}};
5750 wakaba 1.1 }
5751    
5752 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & TABLE_EL) {
5753 wakaba 1.79 !!!cp ('t225');
5754 wakaba 1.122 ## NOTE: |<table><tr><table>|
5755     !!!parse-error (type => 'not closed',
5756 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
5757 wakaba 1.122 ->manakai_local_name,
5758     token => $token);
5759 wakaba 1.79 } else {
5760     !!!cp ('t226');
5761 wakaba 1.1 }
5762    
5763 wakaba 1.3 splice @{$self->{open_elements}}, $i;
5764 wakaba 1.95 pop @{$open_tables};
5765 wakaba 1.1
5766 wakaba 1.52 $self->_reset_insertion_mode;
5767 wakaba 1.1
5768 wakaba 1.125 ## reprocess
5769     !!!ack-later;
5770 wakaba 1.126 next B;
5771 wakaba 1.100 } elsif ($token->{tag_name} eq 'style') {
5772     if (not $open_tables->[-1]->[1]) { # tainted
5773     !!!cp ('t227.8');
5774     ## NOTE: This is an "as if in head" code clone.
5775     $parse_rcdata->(CDATA_CONTENT_MODEL);
5776 wakaba 1.126 next B;
5777 wakaba 1.100 } else {
5778     !!!cp ('t227.7');
5779     #
5780     }
5781     } elsif ($token->{tag_name} eq 'script') {
5782     if (not $open_tables->[-1]->[1]) { # tainted
5783     !!!cp ('t227.6');
5784     ## NOTE: This is an "as if in head" code clone.
5785     $script_start_tag->();
5786 wakaba 1.126 next B;
5787 wakaba 1.100 } else {
5788     !!!cp ('t227.5');
5789     #
5790     }
5791 wakaba 1.98 } elsif ($token->{tag_name} eq 'input') {
5792     if (not $open_tables->[-1]->[1]) { # tainted
5793     if ($token->{attributes}->{type}) { ## TODO: case
5794     my $type = lc $token->{attributes}->{type}->{value};
5795     if ($type eq 'hidden') {
5796     !!!cp ('t227.3');
5797 wakaba 1.153 !!!parse-error (type => 'in table',
5798     text => $token->{tag_name}, token => $token);
5799 wakaba 1.98
5800 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5801 wakaba 1.98
5802     ## TODO: form element pointer
5803    
5804     pop @{$self->{open_elements}};
5805    
5806     !!!next-token;
5807 wakaba 1.125 !!!ack ('t227.2.1');
5808 wakaba 1.126 next B;
5809 wakaba 1.98 } else {
5810     !!!cp ('t227.2');
5811     #
5812     }
5813     } else {
5814     !!!cp ('t227.1');
5815     #
5816     }
5817     } else {
5818     !!!cp ('t227.4');
5819     #
5820     }
5821 wakaba 1.58 } else {
5822 wakaba 1.79 !!!cp ('t227');
5823 wakaba 1.58 #
5824     }
5825 wakaba 1.98
5826 wakaba 1.153 !!!parse-error (type => 'in table', text => $token->{tag_name},
5827     token => $token);
5828 wakaba 1.98
5829     $insert = $insert_to_foster;
5830     #
5831 wakaba 1.58 } elsif ($token->{type} == END_TAG_TOKEN) {
5832 wakaba 1.52 if ($token->{tag_name} eq 'tr' and
5833 wakaba 1.54 $self->{insertion_mode} == IN_ROW_IM) {
5834 wakaba 1.52 ## have an element in table scope
5835     my $i;
5836     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5837     my $node = $self->{open_elements}->[$_];
5838 wakaba 1.123 if ($node->[1] & TABLE_ROW_EL) {
5839 wakaba 1.79 !!!cp ('t228');
5840 wakaba 1.52 $i = $_;
5841     last INSCOPE;
5842 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5843 wakaba 1.79 !!!cp ('t229');
5844 wakaba 1.52 last INSCOPE;
5845     }
5846     } # INSCOPE
5847     unless (defined $i) {
5848 wakaba 1.79 !!!cp ('t230');
5849 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5850     text => $token->{tag_name}, token => $token);
5851 wakaba 1.52 ## Ignore the token
5852 wakaba 1.125 !!!nack ('t230.1');
5853 wakaba 1.42 !!!next-token;
5854 wakaba 1.126 next B;
5855 wakaba 1.79 } else {
5856     !!!cp ('t232');
5857 wakaba 1.42 }
5858    
5859 wakaba 1.52 ## Clear back to table row context
5860 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
5861     & TABLE_ROW_SCOPING_EL)) {
5862 wakaba 1.79 !!!cp ('t231');
5863 wakaba 1.83 ## ISSUE: Can this state be reached?
5864 wakaba 1.52 pop @{$self->{open_elements}};
5865     }
5866 wakaba 1.42
5867 wakaba 1.52 pop @{$self->{open_elements}}; # tr
5868 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5869 wakaba 1.52 !!!next-token;
5870 wakaba 1.125 !!!nack ('t231.1');
5871 wakaba 1.126 next B;
5872 wakaba 1.52 } elsif ($token->{tag_name} eq 'table') {
5873 wakaba 1.54 if ($self->{insertion_mode} == IN_ROW_IM) {
5874 wakaba 1.52 ## As if </tr>
5875     ## have an element in table scope
5876     my $i;
5877     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5878     my $node = $self->{open_elements}->[$_];
5879 wakaba 1.123 if ($node->[1] & TABLE_ROW_EL) {
5880 wakaba 1.79 !!!cp ('t233');
5881 wakaba 1.52 $i = $_;
5882     last INSCOPE;
5883 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5884 wakaba 1.79 !!!cp ('t234');
5885 wakaba 1.52 last INSCOPE;
5886 wakaba 1.42 }
5887 wakaba 1.52 } # INSCOPE
5888     unless (defined $i) {
5889 wakaba 1.79 !!!cp ('t235');
5890 wakaba 1.83 ## TODO: The following error type is wrong (this is the implied </tr>, not the token itself).
5891 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5892     text => $token->{tag_name}, token => $token);
5893 wakaba 1.52 ## Ignore the token (the scope search above left $i undefined)
5894 wakaba 1.125 !!!nack ('t236.1');
5895 wakaba 1.52 !!!next-token;
5896 wakaba 1.126 next B;
5897 wakaba 1.42 }
5898 wakaba 1.52
5899     ## Clear back to table row context
5900 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
5901     & TABLE_ROW_SCOPING_EL)) {
5902 wakaba 1.79 !!!cp ('t236');
5903 wakaba 1.83 ## ISSUE: Can this state be reached?
5904 wakaba 1.46 pop @{$self->{open_elements}};
5905 wakaba 1.1 }
5906 wakaba 1.46
5907 wakaba 1.52 pop @{$self->{open_elements}}; # tr
5908 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5909 wakaba 1.46 ## reprocess in the "in table body" insertion mode...
5910 wakaba 1.1 }
5911    
5912 wakaba 1.54 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5913 wakaba 1.52 ## have an element in table scope
5914     my $i;
5915     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5916     my $node = $self->{open_elements}->[$_];
5917 wakaba 1.123 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5918 wakaba 1.79 !!!cp ('t237');
5919 wakaba 1.52 $i = $_;
5920     last INSCOPE;
5921 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5922 wakaba 1.79 !!!cp ('t238');
5923 wakaba 1.52 last INSCOPE;
5924     }
5925     } # INSCOPE
5926     unless (defined $i) {
5927 wakaba 1.79 !!!cp ('t239');
5928 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5929     text => $token->{tag_name}, token => $token);
5930 wakaba 1.52 ## Ignore the token
5931 wakaba 1.125 !!!nack ('t239.1');
5932 wakaba 1.52 !!!next-token;
5933 wakaba 1.126 next B;
5934 wakaba 1.47 }
5935    
5936     ## Clear back to table body context
5937 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
5938     & TABLE_ROWS_SCOPING_EL)) {
5939 wakaba 1.79 !!!cp ('t240');
5940 wakaba 1.47 pop @{$self->{open_elements}};
5941     }
5942    
5943 wakaba 1.52 ## As if <{current node}>
5944     ## have an element in table scope
5945     ## true by definition
5946    
5947     ## Clear back to table body context
5948     ## nop by definition
5949    
5950     pop @{$self->{open_elements}};
5951 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
5952 wakaba 1.52 ## reprocess in the "in table" insertion mode...
5953     }
5954    
5955 wakaba 1.94 ## NOTE: </table> in the "in table" insertion mode.
5956     ## When you edit the code fragment below, please ensure that
5957     ## the code for <table> in the "in table" insertion mode
5958     ## is synced with it.
5959    
5960 wakaba 1.52 ## have a table element in table scope
5961     my $i;
5962     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5963     my $node = $self->{open_elements}->[$_];
5964 wakaba 1.123 if ($node->[1] & TABLE_EL) {
5965 wakaba 1.79 !!!cp ('t241');
5966 wakaba 1.52 $i = $_;
5967     last INSCOPE;
5968 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5969 wakaba 1.79 !!!cp ('t242');
5970 wakaba 1.52 last INSCOPE;
5971 wakaba 1.47 }
5972 wakaba 1.52 } # INSCOPE
5973     unless (defined $i) {
5974 wakaba 1.79 !!!cp ('t243');
5975 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5976     text => $token->{tag_name}, token => $token);
5977 wakaba 1.52 ## Ignore the token
5978 wakaba 1.125 !!!nack ('t243.1');
5979 wakaba 1.52 !!!next-token;
5980 wakaba 1.126 next B;
5981 wakaba 1.3 }
5982 wakaba 1.52
5983     splice @{$self->{open_elements}}, $i;
5984 wakaba 1.95 pop @{$open_tables};
5985 wakaba 1.1
5986 wakaba 1.52 $self->_reset_insertion_mode;
5987 wakaba 1.47
5988     !!!next-token;
5989 wakaba 1.126 next B;
5990 wakaba 1.47 } elsif ({
5991 wakaba 1.48 tbody => 1, tfoot => 1, thead => 1,
5992 wakaba 1.52 }->{$token->{tag_name}} and
5993 wakaba 1.56 $self->{insertion_mode} & ROW_IMS) {
5994 wakaba 1.54 if ($self->{insertion_mode} == IN_ROW_IM) {
5995 wakaba 1.52 ## have an element in table scope
5996     my $i;
5997     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5998     my $node = $self->{open_elements}->[$_];
5999 wakaba 1.123 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6000 wakaba 1.79 !!!cp ('t247');
6001 wakaba 1.52 $i = $_;
6002     last INSCOPE;
6003 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6004 wakaba 1.79 !!!cp ('t248');
6005 wakaba 1.52 last INSCOPE;
6006     }
6007     } # INSCOPE
6008     unless (defined $i) {
6009 wakaba 1.79 !!!cp ('t249');
6010 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6011     text => $token->{tag_name}, token => $token);
6012 wakaba 1.52 ## Ignore the token
6013 wakaba 1.125 !!!nack ('t249.1');
6014 wakaba 1.52 !!!next-token;
6015 wakaba 1.126 next B;
6016 wakaba 1.52 }
6017    
6018 wakaba 1.48 ## As if </tr>
6019     ## have an element in table scope
6020     my $i;
6021     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6022     my $node = $self->{open_elements}->[$_];
6023 wakaba 1.123 if ($node->[1] & TABLE_ROW_EL) {
6024 wakaba 1.79 !!!cp ('t250');
6025 wakaba 1.48 $i = $_;
6026     last INSCOPE;
6027 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6028 wakaba 1.79 !!!cp ('t251');
6029 wakaba 1.48 last INSCOPE;
6030     }
6031     } # INSCOPE
6032 wakaba 1.52 unless (defined $i) {
6033 wakaba 1.79 !!!cp ('t252');
6034 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6035     text => 'tr', token => $token);
6036 wakaba 1.52 ## Ignore the token
6037 wakaba 1.125 !!!nack ('t252.1');
6038 wakaba 1.52 !!!next-token;
6039 wakaba 1.126 next B;
6040 wakaba 1.52 }
6041 wakaba 1.48
6042     ## Clear back to table row context
6043 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
6044     & TABLE_ROW_SCOPING_EL)) {
6045 wakaba 1.79 !!!cp ('t253');
6046 wakaba 1.83 ## ISSUE: Can this case be reached?
6047 wakaba 1.48 pop @{$self->{open_elements}};
6048     }
6049    
6050     pop @{$self->{open_elements}}; # tr
6051 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
6052 wakaba 1.52 ## reprocess in the "in table body" insertion mode...
6053     }
6054    
6055     ## have an element in table scope
6056     my $i;
6057     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6058     my $node = $self->{open_elements}->[$_];
6059 wakaba 1.123 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6060 wakaba 1.79 !!!cp ('t254');
6061 wakaba 1.52 $i = $_;
6062     last INSCOPE;
6063 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6064 wakaba 1.79 !!!cp ('t255');
6065 wakaba 1.52 last INSCOPE;
6066     }
6067     } # INSCOPE
6068     unless (defined $i) {
6069 wakaba 1.79 !!!cp ('t256');
6070 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6071     text => $token->{tag_name}, token => $token);
6072 wakaba 1.52 ## Ignore the token
6073 wakaba 1.125 !!!nack ('t256.1');
6074 wakaba 1.52 !!!next-token;
6075 wakaba 1.126 next B;
6076 wakaba 1.52 }
6077    
6078     ## Clear back to table body context
6079 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
6080     & TABLE_ROWS_SCOPING_EL)) {
6081 wakaba 1.79 !!!cp ('t257');
6082 wakaba 1.83 ## ISSUE: Can this case be reached?
6083 wakaba 1.52 pop @{$self->{open_elements}};
6084     }
6085    
6086     pop @{$self->{open_elements}};
6087 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
6088 wakaba 1.125 !!!nack ('t257.1');
6089 wakaba 1.52 !!!next-token;
6090 wakaba 1.126 next B;
6091 wakaba 1.52 } elsif ({
6092     body => 1, caption => 1, col => 1, colgroup => 1,
6093     html => 1, td => 1, th => 1,
6094 wakaba 1.54 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
6095     tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
6096 wakaba 1.52 }->{$token->{tag_name}}) {
6097 wakaba 1.125 !!!cp ('t258');
6098 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6099     text => $token->{tag_name}, token => $token);
6100 wakaba 1.125 ## Ignore the token
6101     !!!nack ('t258.1');
6102     !!!next-token;
6103 wakaba 1.126 next B;
6104 wakaba 1.58 } else {
6105 wakaba 1.79 !!!cp ('t259');
6106 wakaba 1.153 !!!parse-error (type => 'in table:/',
6107     text => $token->{tag_name}, token => $token);
6108 wakaba 1.52
6109 wakaba 1.58 $insert = $insert_to_foster;
6110     #
6111     }
6112 wakaba 1.104 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6113 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6114 wakaba 1.104 @{$self->{open_elements}} == 1) { # redundant, maybe
6115 wakaba 1.113 !!!parse-error (type => 'in body:#eof', token => $token);
6116 wakaba 1.104 !!!cp ('t259.1');
6117 wakaba 1.105 #
6118 wakaba 1.104 } else {
6119     !!!cp ('t259.2');
6120 wakaba 1.105 #
6121 wakaba 1.104 }
6122    
6123     ## Stop parsing
6124     last B;
6125 wakaba 1.58 } else {
6126     die "$0: $token->{type}: Unknown token type";
6127     }
6128 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
6129 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
6130 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6131     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6132     unless (length $token->{data}) {
6133 wakaba 1.79 !!!cp ('t260');
6134 wakaba 1.52 !!!next-token;
6135 wakaba 1.126 next B;
6136 wakaba 1.52 }
6137     }
6138    
6139 wakaba 1.79 !!!cp ('t261');
6140 wakaba 1.52 #
6141 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
6142 wakaba 1.52 if ($token->{tag_name} eq 'col') {
6143 wakaba 1.79 !!!cp ('t262');
6144 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6145 wakaba 1.52 pop @{$self->{open_elements}};
6146 wakaba 1.125 !!!ack ('t262.1');
6147 wakaba 1.52 !!!next-token;
6148 wakaba 1.126 next B;
6149 wakaba 1.52 } else {
6150 wakaba 1.79 !!!cp ('t263');
6151 wakaba 1.52 #
6152     }
6153 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
6154 wakaba 1.52 if ($token->{tag_name} eq 'colgroup') {
6155 wakaba 1.123 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
6156 wakaba 1.79 !!!cp ('t264');
6157 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6158     text => 'colgroup', token => $token);
6159 wakaba 1.52 ## Ignore the token
6160     !!!next-token;
6161 wakaba 1.126 next B;
6162 wakaba 1.52 } else {
6163 wakaba 1.79 !!!cp ('t265');
6164 wakaba 1.52 pop @{$self->{open_elements}}; # colgroup
6165 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
6166 wakaba 1.52 !!!next-token;
6167 wakaba 1.126 next B;
6168 wakaba 1.52 }
6169     } elsif ($token->{tag_name} eq 'col') {
6170 wakaba 1.79 !!!cp ('t266');
6171 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6172     text => 'col', token => $token);
6173 wakaba 1.52 ## Ignore the token
6174     !!!next-token;
6175 wakaba 1.126 next B;
6176 wakaba 1.52 } else {
6177 wakaba 1.79 !!!cp ('t267');
6178 wakaba 1.52 #
6179     }
6180 wakaba 1.104 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6181 wakaba 1.123 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
6182 wakaba 1.104 @{$self->{open_elements}} == 1) { # redundant, maybe
6183     !!!cp ('t270.2');
6184     ## Stop parsing.
6185     last B;
6186     } else {
6187     ## NOTE: As if </colgroup>.
6188     !!!cp ('t270.1');
6189     pop @{$self->{open_elements}}; # colgroup
6190     $self->{insertion_mode} = IN_TABLE_IM;
6191     ## Reprocess.
6192 wakaba 1.126 next B;
6193 wakaba 1.104 }
6194     } else {
6195     die "$0: $token->{type}: Unknown token type";
6196     }
6197 wakaba 1.52
6198     ## As if </colgroup>
6199 wakaba 1.123 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
6200 wakaba 1.79 !!!cp ('t269');
6201 wakaba 1.104 ## TODO: Wrong error type?
6202 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6203     text => 'colgroup', token => $token);
6204 wakaba 1.52 ## Ignore the token
6205 wakaba 1.125 !!!nack ('t269.1');
6206 wakaba 1.52 !!!next-token;
6207 wakaba 1.126 next B;
6208 wakaba 1.52 } else {
6209 wakaba 1.79 !!!cp ('t270');
6210 wakaba 1.52 pop @{$self->{open_elements}}; # colgroup
6211 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
6212 wakaba 1.125 !!!ack-later;
6213 wakaba 1.52 ## reprocess
6214 wakaba 1.126 next B;
6215 wakaba 1.52 }
6216 wakaba 1.101 } elsif ($self->{insertion_mode} & SELECT_IMS) {
6217 wakaba 1.58 if ($token->{type} == CHARACTER_TOKEN) {
6218 wakaba 1.79 !!!cp ('t271');
6219 wakaba 1.58 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
6220     !!!next-token;
6221 wakaba 1.126 next B;
6222 wakaba 1.58 } elsif ($token->{type} == START_TAG_TOKEN) {
6223 wakaba 1.123 if ($token->{tag_name} eq 'option') {
6224     if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6225     !!!cp ('t272');
6226     ## As if </option>
6227     pop @{$self->{open_elements}};
6228     } else {
6229     !!!cp ('t273');
6230     }
6231 wakaba 1.52
6232 wakaba 1.123 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6233 wakaba 1.125 !!!nack ('t273.1');
6234 wakaba 1.123 !!!next-token;
6235 wakaba 1.126 next B;
6236 wakaba 1.123 } elsif ($token->{tag_name} eq 'optgroup') {
6237     if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6238     !!!cp ('t274');
6239     ## As if </option>
6240     pop @{$self->{open_elements}};
6241     } else {
6242     !!!cp ('t275');
6243     }
6244 wakaba 1.52
6245 wakaba 1.123 if ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
6246     !!!cp ('t276');
6247     ## As if </optgroup>
6248     pop @{$self->{open_elements}};
6249     } else {
6250     !!!cp ('t277');
6251     }
6252 wakaba 1.52
6253 wakaba 1.123 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6254 wakaba 1.125 !!!nack ('t277.1');
6255 wakaba 1.123 !!!next-token;
6256 wakaba 1.126 next B;
6257 wakaba 1.146 } elsif ({
6258     select => 1, input => 1, textarea => 1,
6259     }->{$token->{tag_name}} or
6260 wakaba 1.101 ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
6261     {
6262     caption => 1, table => 1,
6263     tbody => 1, tfoot => 1, thead => 1,
6264     tr => 1, td => 1, th => 1,
6265     }->{$token->{tag_name}})) {
6266     ## TODO: The type below is not good - <select> is replaced by </select>
6267 wakaba 1.153 !!!parse-error (type => 'not closed', text => 'select',
6268     token => $token);
6269 wakaba 1.101 ## NOTE: As if the token were </select> (<select> case) or
6270     ## as if there were </select> (otherwise).
6271 wakaba 1.123 ## have an element in table scope
6272     my $i;
6273     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6274     my $node = $self->{open_elements}->[$_];
6275     if ($node->[1] & SELECT_EL) {
6276     !!!cp ('t278');
6277     $i = $_;
6278     last INSCOPE;
6279     } elsif ($node->[1] & TABLE_SCOPING_EL) {
6280     !!!cp ('t279');
6281     last INSCOPE;
6282     }
6283     } # INSCOPE
6284     unless (defined $i) {
6285     !!!cp ('t280');
6286 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6287     text => 'select', token => $token);
6288 wakaba 1.123 ## Ignore the token
6289 wakaba 1.125 !!!nack ('t280.1');
6290 wakaba 1.123 !!!next-token;
6291 wakaba 1.126 next B;
6292 wakaba 1.123 }
6293 wakaba 1.52
6294 wakaba 1.123 !!!cp ('t281');
6295     splice @{$self->{open_elements}}, $i;
6296 wakaba 1.52
6297 wakaba 1.123 $self->_reset_insertion_mode;
6298 wakaba 1.47
6299 wakaba 1.101 if ($token->{tag_name} eq 'select') {
6300 wakaba 1.125 !!!nack ('t281.2');
6301 wakaba 1.101 !!!next-token;
6302 wakaba 1.126 next B;
6303 wakaba 1.101 } else {
6304     !!!cp ('t281.1');
6305 wakaba 1.125 !!!ack-later;
6306 wakaba 1.101 ## Reprocess the token.
6307 wakaba 1.126 next B;
6308 wakaba 1.101 }
6309 wakaba 1.58 } else {
6310 wakaba 1.79 !!!cp ('t282');
6311 wakaba 1.153 !!!parse-error (type => 'in select',
6312     text => $token->{tag_name}, token => $token);
6313 wakaba 1.58 ## Ignore the token
6314 wakaba 1.125 !!!nack ('t282.1');
6315 wakaba 1.58 !!!next-token;
6316 wakaba 1.126 next B;
6317 wakaba 1.58 }
6318     } elsif ($token->{type} == END_TAG_TOKEN) {
6319 wakaba 1.123 if ($token->{tag_name} eq 'optgroup') {
6320     if ($self->{open_elements}->[-1]->[1] & OPTION_EL and
6321     $self->{open_elements}->[-2]->[1] & OPTGROUP_EL) {
6322     !!!cp ('t283');
6323     ## As if </option>
6324     splice @{$self->{open_elements}}, -2;
6325     } elsif ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
6326     !!!cp ('t284');
6327     pop @{$self->{open_elements}};
6328     } else {
6329     !!!cp ('t285');
6330 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6331     text => $token->{tag_name}, token => $token);
6332 wakaba 1.123 ## Ignore the token
6333     }
6334 wakaba 1.125 !!!nack ('t285.1');
6335 wakaba 1.123 !!!next-token;
6336 wakaba 1.126 next B;
6337 wakaba 1.123 } elsif ($token->{tag_name} eq 'option') {
6338     if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6339     !!!cp ('t286');
6340     pop @{$self->{open_elements}};
6341     } else {
6342     !!!cp ('t287');
6343 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6344     text => $token->{tag_name}, token => $token);
6345 wakaba 1.123 ## Ignore the token
6346     }
6347 wakaba 1.125 !!!nack ('t287.1');
6348 wakaba 1.123 !!!next-token;
6349 wakaba 1.126 next B;
6350 wakaba 1.123 } elsif ($token->{tag_name} eq 'select') {
6351     ## have an element in table scope
6352     my $i;
6353     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6354     my $node = $self->{open_elements}->[$_];
6355     if ($node->[1] & SELECT_EL) {
6356     !!!cp ('t288');
6357     $i = $_;
6358     last INSCOPE;
6359     } elsif ($node->[1] & TABLE_SCOPING_EL) {
6360     !!!cp ('t289');
6361     last INSCOPE;
6362     }
6363     } # INSCOPE
6364     unless (defined $i) {
6365     !!!cp ('t290');
6366 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6367     text => $token->{tag_name}, token => $token);
6368 wakaba 1.123 ## Ignore the token
6369 wakaba 1.125 !!!nack ('t290.1');
6370 wakaba 1.123 !!!next-token;
6371 wakaba 1.126 next B;
6372 wakaba 1.123 }
6373 wakaba 1.52
6374 wakaba 1.123 !!!cp ('t291');
6375     splice @{$self->{open_elements}}, $i;
6376 wakaba 1.52
6377 wakaba 1.123 $self->_reset_insertion_mode;
6378 wakaba 1.52
6379 wakaba 1.125 !!!nack ('t291.1');
6380 wakaba 1.123 !!!next-token;
6381 wakaba 1.126 next B;
6382 wakaba 1.101 } elsif ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
6383     {
6384     caption => 1, table => 1, tbody => 1,
6385     tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
6386     }->{$token->{tag_name}}) {
6387 wakaba 1.83 ## TODO: The following is wrong?
6388 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6389     text => $token->{tag_name}, token => $token);
6390 wakaba 1.52
6391 wakaba 1.123 ## have an element in table scope
6392     my $i;
6393     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6394     my $node = $self->{open_elements}->[$_];
6395     if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6396     !!!cp ('t292');
6397     $i = $_;
6398     last INSCOPE;
6399     } elsif ($node->[1] & TABLE_SCOPING_EL) {
6400     !!!cp ('t293');
6401     last INSCOPE;
6402     }
6403     } # INSCOPE
6404     unless (defined $i) {
6405     !!!cp ('t294');
6406     ## Ignore the token
6407 wakaba 1.125 !!!nack ('t294.1');
6408 wakaba 1.123 !!!next-token;
6409 wakaba 1.126 next B;
6410 wakaba 1.123 }
6411 wakaba 1.52
6412 wakaba 1.123 ## As if </select>
6413     ## have an element in table scope
6414     undef $i;
6415     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6416     my $node = $self->{open_elements}->[$_];
6417     if ($node->[1] & SELECT_EL) {
6418     !!!cp ('t295');
6419     $i = $_;
6420     last INSCOPE;
6421     } elsif ($node->[1] & TABLE_SCOPING_EL) {
6422 wakaba 1.83 ## ISSUE: Can this state be reached?
6423 wakaba 1.123 !!!cp ('t296');
6424     last INSCOPE;
6425     }
6426     } # INSCOPE
6427     unless (defined $i) {
6428     !!!cp ('t297');
6429 wakaba 1.83 ## TODO: Is the following error type correct?
6430 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6431     text => 'select', token => $token);
6432 wakaba 1.123 ## Ignore the </select> token
6433 wakaba 1.125 !!!nack ('t297.1');
6434 wakaba 1.123 !!!next-token; ## TODO: ok?
6435 wakaba 1.126 next B;
6436 wakaba 1.123 }
6437 wakaba 1.52
6438 wakaba 1.123 !!!cp ('t298');
6439     splice @{$self->{open_elements}}, $i;
6440 wakaba 1.52
6441 wakaba 1.123 $self->_reset_insertion_mode;
6442 wakaba 1.52
6443 wakaba 1.125 !!!ack-later;
6444 wakaba 1.123 ## reprocess
6445 wakaba 1.126 next B;
6446 wakaba 1.58 } else {
6447 wakaba 1.79 !!!cp ('t299');
6448 wakaba 1.153 !!!parse-error (type => 'in select:/',
6449     text => $token->{tag_name}, token => $token);
6450 wakaba 1.52 ## Ignore the token
6451 wakaba 1.125 !!!nack ('t299.3');
6452 wakaba 1.52 !!!next-token;
6453 wakaba 1.126 next B;
6454 wakaba 1.58 }
6455 wakaba 1.104 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6456 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6457 wakaba 1.104 @{$self->{open_elements}} == 1) { # redundant, maybe
6458     !!!cp ('t299.1');
6459 wakaba 1.113 !!!parse-error (type => 'in body:#eof', token => $token);
6460 wakaba 1.104 } else {
6461     !!!cp ('t299.2');
6462     }
6463    
6464     ## Stop parsing.
6465     last B;
6466 wakaba 1.58 } else {
6467     die "$0: $token->{type}: Unknown token type";
6468     }
6469 wakaba 1.56 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
6470 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
6471 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6472     my $data = $1; ## Save the stripped leading white space before calling anything else
6473     ## As if in body
6474     $reconstruct_active_formatting_elements->($insert_to_current);
6475    
6476     $self->{open_elements}->[-1]->[0]->manakai_append_text ($data);
6477    
6478     unless (length $token->{data}) {
6479 wakaba 1.79 !!!cp ('t300');
6480 wakaba 1.52 !!!next-token;
6481 wakaba 1.126 next B;
6482 wakaba 1.52 }
6483     }
6484    
6485 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6486 wakaba 1.79 !!!cp ('t301');
6487 wakaba 1.153 !!!parse-error (type => 'after html:#text', token => $token);
6488 wakaba 1.52
6489 wakaba 1.84 ## Reprocess in the "after body" insertion mode.
6490 wakaba 1.79 } else {
6491     !!!cp ('t302');
6492 wakaba 1.52 }
6493    
6494     ## "after body" insertion mode
6495 wakaba 1.153 !!!parse-error (type => 'after body:#text', token => $token);
6496 wakaba 1.52
6497 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
6498 wakaba 1.52 ## reprocess
6499 wakaba 1.126 next B;
6500 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
6501 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6502 wakaba 1.79 !!!cp ('t303');
6503 wakaba 1.153 !!!parse-error (type => 'after html',
6504     text => $token->{tag_name}, token => $token);
6505 wakaba 1.52
6506 wakaba 1.84 ## Reprocess in the "after body" insertion mode.
6507 wakaba 1.79 } else {
6508     !!!cp ('t304');
6509 wakaba 1.52 }
6510    
6511     ## "after body" insertion mode
6512 wakaba 1.153 !!!parse-error (type => 'after body',
6513     text => $token->{tag_name}, token => $token);
6514 wakaba 1.52
6515 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
6516 wakaba 1.125 !!!ack-later;
6517 wakaba 1.52 ## reprocess
6518 wakaba 1.126 next B;
6519 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
6520 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6521 wakaba 1.79 !!!cp ('t305');
6522 wakaba 1.153 !!!parse-error (type => 'after html:/',
6523     text => $token->{tag_name}, token => $token);
6524 wakaba 1.52
6525 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
6526 wakaba 1.84 ## Reprocess in the "after body" insertion mode.
6527 wakaba 1.79 } else {
6528     !!!cp ('t306');
6529 wakaba 1.52 }
6530    
6531     ## "after body" insertion mode
6532     if ($token->{tag_name} eq 'html') {
6533     if (defined $self->{inner_html_node}) {
6534 wakaba 1.79 !!!cp ('t307');
6535 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6536     text => 'html', token => $token);
6537 wakaba 1.52 ## Ignore the token
6538     !!!next-token;
6539 wakaba 1.126 next B;
6540 wakaba 1.52 } else {
6541 wakaba 1.79 !!!cp ('t308');
6542 wakaba 1.54 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
6543 wakaba 1.52 !!!next-token;
6544 wakaba 1.126 next B;
6545 wakaba 1.52 }
6546     } else {
6547 wakaba 1.79 !!!cp ('t309');
6548 wakaba 1.153 !!!parse-error (type => 'after body:/',
6549     text => $token->{tag_name}, token => $token);
6550 wakaba 1.52
6551 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
6552 wakaba 1.52 ## reprocess
6553 wakaba 1.126 next B;
6554 wakaba 1.52 }
6555 wakaba 1.104 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6556     !!!cp ('t309.2');
6557     ## Stop parsing
6558     last B;
6559 wakaba 1.52 } else {
6560     die "$0: $token->{type}: Unknown token type";
6561     }
6562 wakaba 1.56 } elsif ($self->{insertion_mode} & FRAME_IMS) {
6563 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
6564 wakaba 1.52 if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
6565     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6566    
6567     unless (length $token->{data}) {
6568 wakaba 1.79 !!!cp ('t310');
6569 wakaba 1.52 !!!next-token;
6570 wakaba 1.126 next B;
6571 wakaba 1.52 }
6572     }
6573    
6574     if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
6575 wakaba 1.54 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6576 wakaba 1.79 !!!cp ('t311');
6577 wakaba 1.153 !!!parse-error (type => 'in frameset:#text', token => $token);
6578 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6579 wakaba 1.79 !!!cp ('t312');
6580 wakaba 1.153 !!!parse-error (type => 'after frameset:#text', token => $token);
6581 wakaba 1.158 } else { # "after after frameset"
6582 wakaba 1.79 !!!cp ('t313');
6583 wakaba 1.153 !!!parse-error (type => 'after html:#text', token => $token);
6584 wakaba 1.52 }
6585    
6586     ## Ignore the token.
6587     if (length $token->{data}) {
6588 wakaba 1.79 !!!cp ('t314');
6589 wakaba 1.52 ## reprocess the rest of characters
6590     } else {
6591 wakaba 1.79 !!!cp ('t315');
6592 wakaba 1.52 !!!next-token;
6593     }
6594 wakaba 1.126 next B;
6595 wakaba 1.52 }
6596    
6597     die qq[$0: Character "$token->{data}"];
6598 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
6599 wakaba 1.52 if ($token->{tag_name} eq 'frameset' and
6600 wakaba 1.54 $self->{insertion_mode} == IN_FRAMESET_IM) {
6601 wakaba 1.79 !!!cp ('t318');
6602 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6603 wakaba 1.125 !!!nack ('t318.1');
6604 wakaba 1.52 !!!next-token;
6605 wakaba 1.126 next B;
6606 wakaba 1.52 } elsif ($token->{tag_name} eq 'frame' and
6607 wakaba 1.54 $self->{insertion_mode} == IN_FRAMESET_IM) {
6608 wakaba 1.79 !!!cp ('t319');
6609 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6610 wakaba 1.52 pop @{$self->{open_elements}};
6611 wakaba 1.125 !!!ack ('t319.1');
6612 wakaba 1.52 !!!next-token;
6613 wakaba 1.126 next B;
6614 wakaba 1.52 } elsif ($token->{tag_name} eq 'noframes') {
6615 wakaba 1.79 !!!cp ('t320');
6616 wakaba 1.148 ## NOTE: As if in head.
6617 wakaba 1.96 $parse_rcdata->(CDATA_CONTENT_MODEL);
6618 wakaba 1.126 next B;
6619 wakaba 1.158
6620     ## NOTE: |<!DOCTYPE HTML><frameset></frameset></html><noframes></noframes>|
6621     ## has no parse error.
6622 wakaba 1.52 } else {
6623 wakaba 1.54 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6624 wakaba 1.79 !!!cp ('t321');
6625 wakaba 1.153 !!!parse-error (type => 'in frameset',
6626     text => $token->{tag_name}, token => $token);
6627 wakaba 1.158 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6628 wakaba 1.79 !!!cp ('t322');
6629 wakaba 1.153 !!!parse-error (type => 'after frameset',
6630     text => $token->{tag_name}, token => $token);
6631 wakaba 1.158 } else { # "after after frameset"
6632     !!!cp ('t322.2');
6633     !!!parse-error (type => 'after after frameset',
6634     text => $token->{tag_name}, token => $token);
6635 wakaba 1.52 }
6636     ## Ignore the token
6637 wakaba 1.125 !!!nack ('t322.1');
6638 wakaba 1.52 !!!next-token;
6639 wakaba 1.126 next B;
6640 wakaba 1.52 }
6641 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
6642 wakaba 1.52 if ($token->{tag_name} eq 'frameset' and
6643 wakaba 1.54 $self->{insertion_mode} == IN_FRAMESET_IM) {
6644 wakaba 1.123 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
6645 wakaba 1.52 @{$self->{open_elements}} == 1) {
6646 wakaba 1.79 !!!cp ('t325');
6647 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6648     text => $token->{tag_name}, token => $token);
6649 wakaba 1.52 ## Ignore the token
6650     !!!next-token;
6651     } else {
6652 wakaba 1.79 !!!cp ('t326');
6653 wakaba 1.52 pop @{$self->{open_elements}};
6654     !!!next-token;
6655     }
6656 wakaba 1.47
6657 wakaba 1.52 if (not defined $self->{inner_html_node} and
6658 wakaba 1.123 not ($self->{open_elements}->[-1]->[1] & FRAMESET_EL)) {
6659 wakaba 1.79 !!!cp ('t327');
6660 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6661 wakaba 1.79 } else {
6662     !!!cp ('t328');
6663 wakaba 1.52 }
6664 wakaba 1.126 next B;
6665 wakaba 1.52 } elsif ($token->{tag_name} eq 'html' and
6666 wakaba 1.54 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
6667 wakaba 1.79 !!!cp ('t329');
6668 wakaba 1.54 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
6669 wakaba 1.52 !!!next-token;
6670 wakaba 1.126 next B;
6671 wakaba 1.52 } else {
6672 wakaba 1.54 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6673 wakaba 1.79 !!!cp ('t330');
6674 wakaba 1.153 !!!parse-error (type => 'in frameset:/',
6675     text => $token->{tag_name}, token => $token);
6676 wakaba 1.158 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6677     !!!cp ('t330.1');
6678     !!!parse-error (type => 'after frameset:/',
6679     text => $token->{tag_name}, token => $token);
6680     } else { # "after after frameset"
6681 wakaba 1.79 !!!cp ('t331');
6682 wakaba 1.158 !!!parse-error (type => 'after after frameset:/',
6683 wakaba 1.153 text => $token->{tag_name}, token => $token);
6684 wakaba 1.52 }
6685     ## Ignore the token
6686     !!!next-token;
6687 wakaba 1.126 next B;
6688 wakaba 1.52 }
6689 wakaba 1.104 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6690 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6691 wakaba 1.104 @{$self->{open_elements}} == 1) { # redundant, maybe
6692     !!!cp ('t331.1');
6693 wakaba 1.113 !!!parse-error (type => 'in body:#eof', token => $token);
6694 wakaba 1.104 } else {
6695     !!!cp ('t331.2');
6696     }
6697    
6698     ## Stop parsing
6699     last B;
6700 wakaba 1.52 } else {
6701     die "$0: $token->{type}: Unknown token type";
6702     }
6703 wakaba 1.47
6704 wakaba 1.52 ## ISSUE: An issue in spec here
6705     } else {
6706     die "$0: $self->{insertion_mode}: Unknown insertion mode";
6707     }
6708 wakaba 1.47
6709 wakaba 1.52 ## "in body" insertion mode
6710 wakaba 1.55 if ($token->{type} == START_TAG_TOKEN) {
6711 wakaba 1.52 if ($token->{tag_name} eq 'script') {
6712 wakaba 1.79 !!!cp ('t332');
6713 wakaba 1.52 ## NOTE: This is an "as if in head" code clone
6714 wakaba 1.100 $script_start_tag->();
6715 wakaba 1.126 next B;
6716 wakaba 1.52 } elsif ($token->{tag_name} eq 'style') {
6717 wakaba 1.79 !!!cp ('t333');
6718 wakaba 1.52 ## NOTE: This is an "as if in head" code clone
6719 wakaba 1.96 $parse_rcdata->(CDATA_CONTENT_MODEL);
6720 wakaba 1.126 next B;
6721 wakaba 1.52 } elsif ({
6722     base => 1, link => 1,
6723     }->{$token->{tag_name}}) {
6724 wakaba 1.79 !!!cp ('t334');
6725 wakaba 1.52 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6726 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6727 wakaba 1.52 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6728 wakaba 1.125 !!!ack ('t334.1');
6729 wakaba 1.52 !!!next-token;
6730 wakaba 1.126 next B;
6731 wakaba 1.52 } elsif ($token->{tag_name} eq 'meta') {
6732     ## NOTE: This is an "as if in head" code clone, only "-t" differs
6733 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6734 wakaba 1.66 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6735 wakaba 1.46
6736 wakaba 1.52 unless ($self->{confident}) {
6737 wakaba 1.134 if ($token->{attributes}->{charset}) {
6738 wakaba 1.79 !!!cp ('t335');
6739 wakaba 1.134 ## NOTE: Whether the encoding is supported or not is handled
6740     ## in the {change_encoding} callback.
6741 wakaba 1.63 $self->{change_encoding}
6742 wakaba 1.114 ->($self, $token->{attributes}->{charset}->{value}, $token);
6743 wakaba 1.66
6744     $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6745     ->set_user_data (manakai_has_reference =>
6746     $token->{attributes}->{charset}
6747     ->{has_reference});
6748 wakaba 1.63 } elsif ($token->{attributes}->{content}) {
6749     if ($token->{attributes}->{content}->{value}
6750 wakaba 1.144 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
6751 wakaba 1.70 [\x09-\x0D\x20]*=
6752 wakaba 1.52 [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
6753 wakaba 1.145 ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
6754 wakaba 1.79 !!!cp ('t336');
6755 wakaba 1.134 ## NOTE: Whether the encoding is supported or not is handled
6756     ## in the {change_encoding} callback.
6757 wakaba 1.63 $self->{change_encoding}
6758 wakaba 1.114 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
6759 wakaba 1.68 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6760     ->set_user_data (manakai_has_reference =>
6761     $token->{attributes}->{content}
6762     ->{has_reference});
6763 wakaba 1.63 }
6764 wakaba 1.52 }
6765 wakaba 1.66 } else {
6766     if ($token->{attributes}->{charset}) {
6767 wakaba 1.79 !!!cp ('t337');
6768 wakaba 1.66 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6769     ->set_user_data (manakai_has_reference =>
6770     $token->{attributes}->{charset}
6771     ->{has_reference});
6772     }
6773 wakaba 1.68 if ($token->{attributes}->{content}) {
6774 wakaba 1.79 !!!cp ('t338');
6775 wakaba 1.68 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6776     ->set_user_data (manakai_has_reference =>
6777     $token->{attributes}->{content}
6778     ->{has_reference});
6779     }
6780 wakaba 1.52 }
6781 wakaba 1.1
6782 wakaba 1.125 !!!ack ('t338.1');
6783 wakaba 1.52 !!!next-token;
6784 wakaba 1.126 next B;
6785 wakaba 1.52 } elsif ($token->{tag_name} eq 'title') {
6786 wakaba 1.79 !!!cp ('t341');
6787 wakaba 1.52 ## NOTE: This is an "as if in head" code clone
6788 wakaba 1.96 $parse_rcdata->(RCDATA_CONTENT_MODEL);
6789 wakaba 1.126 next B;
6790 wakaba 1.52 } elsif ($token->{tag_name} eq 'body') {
6791 wakaba 1.153 !!!parse-error (type => 'in body', text => 'body', token => $token);
6792 wakaba 1.46
6793 wakaba 1.52 if (@{$self->{open_elements}} == 1 or
6794 wakaba 1.123 not ($self->{open_elements}->[1]->[1] & BODY_EL)) {
6795 wakaba 1.79 !!!cp ('t342');
6796 wakaba 1.52 ## Ignore the token
6797     } else {
6798     my $body_el = $self->{open_elements}->[1]->[0];
6799     for my $attr_name (keys %{$token->{attributes}}) {
6800     unless ($body_el->has_attribute_ns (undef, $attr_name)) {
6801 wakaba 1.79 !!!cp ('t343');
6802 wakaba 1.52 $body_el->set_attribute_ns
6803     (undef, [undef, $attr_name],
6804     $token->{attributes}->{$attr_name}->{value});
6805     }
6806     }
6807     }
6808 wakaba 1.125 !!!nack ('t343.1');
6809 wakaba 1.52 !!!next-token;
6810 wakaba 1.126 next B;
6811 wakaba 1.52 } elsif ({
6812     address => 1, blockquote => 1, center => 1, dir => 1,
6813 wakaba 1.85 div => 1, dl => 1, fieldset => 1,
6814     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6815 wakaba 1.97 menu => 1, ol => 1, p => 1, ul => 1,
6816     pre => 1, listing => 1,
6817 wakaba 1.109 form => 1,
6818     table => 1,
6819     hr => 1,
6820 wakaba 1.52 }->{$token->{tag_name}}) {
6821 wakaba 1.109 if ($token->{tag_name} eq 'form' and defined $self->{form_element}) {
6822     !!!cp ('t350');
6823 wakaba 1.113 !!!parse-error (type => 'in form:form', token => $token);
6824 wakaba 1.109 ## Ignore the token
6825 wakaba 1.125 !!!nack ('t350.1');
6826 wakaba 1.109 !!!next-token;
6827 wakaba 1.126 next B;
6828 wakaba 1.109 }
6829    
6830 wakaba 1.52 ## has a p element in scope
6831     INSCOPE: for (reverse @{$self->{open_elements}}) {
6832 wakaba 1.123 if ($_->[1] & P_EL) {
6833 wakaba 1.79 !!!cp ('t344');
6834 wakaba 1.125 !!!back-token; # <form>
6835 wakaba 1.114 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6836     line => $token->{line}, column => $token->{column}};
6837 wakaba 1.126 next B;
6838 wakaba 1.123 } elsif ($_->[1] & SCOPING_EL) {
6839 wakaba 1.79 !!!cp ('t345');
6840 wakaba 1.52 last INSCOPE;
6841     }
6842     } # INSCOPE
6843    
6844 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6845 wakaba 1.97 if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') {
6846 wakaba 1.125 !!!nack ('t346.1');
6847 wakaba 1.52 !!!next-token;
6848 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
6849 wakaba 1.52 $token->{data} =~ s/^\x0A//;
6850     unless (length $token->{data}) {
6851 wakaba 1.79 !!!cp ('t346');
6852 wakaba 1.1 !!!next-token;
6853 wakaba 1.79 } else {
6854     !!!cp ('t349');
6855 wakaba 1.52 }
6856 wakaba 1.79 } else {
6857     !!!cp ('t348');
6858 wakaba 1.52 }
6859 wakaba 1.109 } elsif ($token->{tag_name} eq 'form') {
6860     !!!cp ('t347.1');
6861     $self->{form_element} = $self->{open_elements}->[-1]->[0];
6862    
6863 wakaba 1.125 !!!nack ('t347.2');
6864 wakaba 1.109 !!!next-token;
6865     } elsif ($token->{tag_name} eq 'table') {
6866     !!!cp ('t382');
6867     push @{$open_tables}, [$self->{open_elements}->[-1]->[0]];
6868    
6869     $self->{insertion_mode} = IN_TABLE_IM;
6870    
6871 wakaba 1.125 !!!nack ('t382.1');
6872 wakaba 1.109 !!!next-token;
6873     } elsif ($token->{tag_name} eq 'hr') {
6874     !!!cp ('t386');
6875     pop @{$self->{open_elements}};
6876    
6877 wakaba 1.125 !!!nack ('t386.1');
6878 wakaba 1.109 !!!next-token;
6879 wakaba 1.52 } else {
6880 wakaba 1.125 !!!nack ('t347.1');
6881 wakaba 1.52 !!!next-token;
6882     }
6883 wakaba 1.126 next B;
6884 wakaba 1.109 } elsif ({li => 1, dt => 1, dd => 1}->{$token->{tag_name}}) {
6885 wakaba 1.52 ## has a p element in scope
6886     INSCOPE: for (reverse @{$self->{open_elements}}) {
6887 wakaba 1.123 if ($_->[1] & P_EL) {
6888 wakaba 1.79 !!!cp ('t353');
6889 wakaba 1.125 !!!back-token; # <x>
6890 wakaba 1.114 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6891     line => $token->{line}, column => $token->{column}};
6892 wakaba 1.126 next B;
6893 wakaba 1.123 } elsif ($_->[1] & SCOPING_EL) {
6894 wakaba 1.79 !!!cp ('t354');
6895 wakaba 1.52 last INSCOPE;
6896     }
6897     } # INSCOPE
6898    
6899     ## Step 1
6900     my $i = -1;
6901     my $node = $self->{open_elements}->[$i];
6902 wakaba 1.109 my $li_or_dtdd = {li => {li => 1},
6903     dt => {dt => 1, dd => 1},
6904     dd => {dt => 1, dd => 1}}->{$token->{tag_name}};
6905 wakaba 1.52 LI: {
6906     ## Step 2
6907 wakaba 1.123 if ($li_or_dtdd->{$node->[0]->manakai_local_name}) {
6908 wakaba 1.52 if ($i != -1) {
6909 wakaba 1.79 !!!cp ('t355');
6910 wakaba 1.122 !!!parse-error (type => 'not closed',
6911 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
6912 wakaba 1.122 ->manakai_local_name,
6913     token => $token);
6914 wakaba 1.79 } else {
6915     !!!cp ('t356');
6916 wakaba 1.52 }
6917     splice @{$self->{open_elements}}, $i;
6918     last LI;
6919 wakaba 1.79 } else {
6920     !!!cp ('t357');
6921 wakaba 1.52 }
6922    
6923     ## Step 3
6924 wakaba 1.123 if (not ($node->[1] & FORMATTING_EL) and
6925 wakaba 1.52 #not $phrasing_category->{$node->[1]} and
6926 wakaba 1.123 ($node->[1] & SPECIAL_EL or
6927     $node->[1] & SCOPING_EL) and
6928     not ($node->[1] & ADDRESS_EL) and
6929     not ($node->[1] & DIV_EL)) {
6930 wakaba 1.79 !!!cp ('t358');
6931 wakaba 1.52 last LI;
6932     }
6933    
6934 wakaba 1.79 !!!cp ('t359');
6935 wakaba 1.52 ## Step 4
6936     $i--;
6937     $node = $self->{open_elements}->[$i];
6938     redo LI;
6939     } # LI
6940    
6941 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6942 wakaba 1.125 !!!nack ('t359.1');
6943 wakaba 1.52 !!!next-token;
6944 wakaba 1.126 next B;
6945 wakaba 1.52 } elsif ($token->{tag_name} eq 'plaintext') {
6946     ## has a p element in scope
6947     INSCOPE: for (reverse @{$self->{open_elements}}) {
6948 wakaba 1.123 if ($_->[1] & P_EL) {
6949 wakaba 1.79 !!!cp ('t367');
6950 wakaba 1.125 !!!back-token; # <plaintext>
6951 wakaba 1.114 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6952     line => $token->{line}, column => $token->{column}};
6953 wakaba 1.126 next B;
6954 wakaba 1.123 } elsif ($_->[1] & SCOPING_EL) {
6955 wakaba 1.79 !!!cp ('t368');
6956 wakaba 1.52 last INSCOPE;
6957 wakaba 1.46 }
6958 wakaba 1.52 } # INSCOPE
6959    
6960 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6961 wakaba 1.52
6962     $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
6963    
6964 wakaba 1.125 !!!nack ('t368.1');
6965 wakaba 1.52 !!!next-token;
6966 wakaba 1.126 next B;
6967 wakaba 1.52 } elsif ($token->{tag_name} eq 'a') {
6968     AFE: for my $i (reverse 0..$#$active_formatting_elements) {
6969     my $node = $active_formatting_elements->[$i];
6970 wakaba 1.123 if ($node->[1] & A_EL) {
6971 wakaba 1.79 !!!cp ('t371');
6972 wakaba 1.113 !!!parse-error (type => 'in a:a', token => $token);
6973 wakaba 1.52
6974 wakaba 1.125 !!!back-token; # <a>
6975 wakaba 1.114 $token = {type => END_TAG_TOKEN, tag_name => 'a',
6976     line => $token->{line}, column => $token->{column}};
6977 wakaba 1.113 $formatting_end_tag->($token);
6978 wakaba 1.52
6979     AFE2: for (reverse 0..$#$active_formatting_elements) {
6980     if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
6981 wakaba 1.79 !!!cp ('t372');
6982 wakaba 1.52 splice @$active_formatting_elements, $_, 1;
6983     last AFE2;
6984 wakaba 1.1 }
6985 wakaba 1.52 } # AFE2
6986     OE: for (reverse 0..$#{$self->{open_elements}}) {
6987     if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
6988 wakaba 1.79 !!!cp ('t373');
6989 wakaba 1.52 splice @{$self->{open_elements}}, $_, 1;
6990     last OE;
6991 wakaba 1.1 }
6992 wakaba 1.52 } # OE
6993     last AFE;
6994     } elsif ($node->[0] eq '#marker') {
6995 wakaba 1.79 !!!cp ('t374');
6996 wakaba 1.52 last AFE;
6997     }
6998     } # AFE
6999    
7000     $reconstruct_active_formatting_elements->($insert_to_current);
7001 wakaba 1.1
7002 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7003 wakaba 1.52 push @$active_formatting_elements, $self->{open_elements}->[-1];
7004 wakaba 1.1
7005 wakaba 1.125 !!!nack ('t374.1');
7006 wakaba 1.52 !!!next-token;
7007 wakaba 1.126 next B;
7008 wakaba 1.52 } elsif ($token->{tag_name} eq 'nobr') {
7009     $reconstruct_active_formatting_elements->($insert_to_current);
7010 wakaba 1.1
7011 wakaba 1.52 ## has a |nobr| element in scope
7012     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7013     my $node = $self->{open_elements}->[$_];
7014 wakaba 1.123 if ($node->[1] & NOBR_EL) {
7015 wakaba 1.79 !!!cp ('t376');
7016 wakaba 1.113 !!!parse-error (type => 'in nobr:nobr', token => $token);
7017 wakaba 1.125 !!!back-token; # <nobr>
7018 wakaba 1.114 $token = {type => END_TAG_TOKEN, tag_name => 'nobr',
7019     line => $token->{line}, column => $token->{column}};
7020 wakaba 1.126 next B;
7021 wakaba 1.123 } elsif ($node->[1] & SCOPING_EL) {
7022 wakaba 1.79 !!!cp ('t377');
7023 wakaba 1.52 last INSCOPE;
7024     }
7025     } # INSCOPE
7026    
7027 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7028 wakaba 1.52 push @$active_formatting_elements, $self->{open_elements}->[-1];
7029    
7030 wakaba 1.125 !!!nack ('t377.1');
7031 wakaba 1.52 !!!next-token;
7032 wakaba 1.126 next B;
7033 wakaba 1.52 } elsif ($token->{tag_name} eq 'button') {
7034     ## has a button element in scope
7035     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7036     my $node = $self->{open_elements}->[$_];
7037 wakaba 1.123 if ($node->[1] & BUTTON_EL) {
7038 wakaba 1.79 !!!cp ('t378');
7039 wakaba 1.113 !!!parse-error (type => 'in button:button', token => $token);
7040 wakaba 1.125 !!!back-token; # <button>
7041 wakaba 1.114 $token = {type => END_TAG_TOKEN, tag_name => 'button',
7042     line => $token->{line}, column => $token->{column}};
7043 wakaba 1.126 next B;
7044 wakaba 1.123 } elsif ($node->[1] & SCOPING_EL) {
7045 wakaba 1.79 !!!cp ('t379');
7046 wakaba 1.52 last INSCOPE;
7047     }
7048     } # INSCOPE
7049    
7050     $reconstruct_active_formatting_elements->($insert_to_current);
7051    
7052 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7053 wakaba 1.85
7054     ## TODO: associate with $self->{form_element} if defined
7055    
7056 wakaba 1.52 push @$active_formatting_elements, ['#marker', ''];
7057 wakaba 1.1
7058 wakaba 1.125 !!!nack ('t379.1');
7059 wakaba 1.52 !!!next-token;
7060 wakaba 1.126 next B;
7061 wakaba 1.103 } elsif ({
7062 wakaba 1.109 xmp => 1,
7063     iframe => 1,
7064     noembed => 1,
7065 wakaba 1.148 noframes => 1, ## NOTE: This is an "as if in head" code clone.
7066 wakaba 1.109 noscript => 0, ## TODO: 1 if scripting is enabled
7067 wakaba 1.103 }->{$token->{tag_name}}) {
7068 wakaba 1.109 if ($token->{tag_name} eq 'xmp') {
7069     !!!cp ('t381');
7070     $reconstruct_active_formatting_elements->($insert_to_current);
7071     } else {
7072     !!!cp ('t399');
7073     }
7074     ## NOTE: There is an "as if in body" code clone.
7075 wakaba 1.96 $parse_rcdata->(CDATA_CONTENT_MODEL);
7076 wakaba 1.126 next B;
7077 wakaba 1.52 } elsif ($token->{tag_name} eq 'isindex') {
7078 wakaba 1.113 !!!parse-error (type => 'isindex', token => $token);
7079 wakaba 1.52
7080     if (defined $self->{form_element}) {
7081 wakaba 1.79 !!!cp ('t389');
7082 wakaba 1.52 ## Ignore the token
7083 wakaba 1.125 !!!nack ('t389'); ## NOTE: Not acknowledged.
7084 wakaba 1.52 !!!next-token;
7085 wakaba 1.126 next B;
7086 wakaba 1.52 } else {
7087 wakaba 1.147 !!!ack ('t391.1');
7088    
7089 wakaba 1.52 my $at = $token->{attributes};
7090     my $form_attrs;
7091     $form_attrs->{action} = $at->{action} if $at->{action};
7092     my $prompt_attr = $at->{prompt};
7093     $at->{name} = {name => 'name', value => 'isindex'};
7094     delete $at->{action};
7095     delete $at->{prompt};
7096     my @tokens = (
7097 wakaba 1.55 {type => START_TAG_TOKEN, tag_name => 'form',
7098 wakaba 1.114 attributes => $form_attrs,
7099     line => $token->{line}, column => $token->{column}},
7100     {type => START_TAG_TOKEN, tag_name => 'hr',
7101     line => $token->{line}, column => $token->{column}},
7102     {type => START_TAG_TOKEN, tag_name => 'p',
7103     line => $token->{line}, column => $token->{column}},
7104     {type => START_TAG_TOKEN, tag_name => 'label',
7105     line => $token->{line}, column => $token->{column}},
7106 wakaba 1.52 );
7107     if ($prompt_attr) {
7108 wakaba 1.79 !!!cp ('t390');
7109 wakaba 1.114 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value},
7110 wakaba 1.118 #line => $token->{line}, column => $token->{column},
7111     };
7112 wakaba 1.1 } else {
7113 wakaba 1.79 !!!cp ('t391');
7114 wakaba 1.55 push @tokens, {type => CHARACTER_TOKEN,
7115 wakaba 1.114 data => 'This is a searchable index. Insert your search keywords here: ',
7116 wakaba 1.118 #line => $token->{line}, column => $token->{column},
7117     }; # SHOULD
7118 wakaba 1.52 ## TODO: make this configurable
7119 wakaba 1.1 }
7120 wakaba 1.52 push @tokens,
7121 wakaba 1.114 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at,
7122     line => $token->{line}, column => $token->{column}},
7123 wakaba 1.55 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
7124 wakaba 1.114 {type => END_TAG_TOKEN, tag_name => 'label',
7125     line => $token->{line}, column => $token->{column}},
7126     {type => END_TAG_TOKEN, tag_name => 'p',
7127     line => $token->{line}, column => $token->{column}},
7128     {type => START_TAG_TOKEN, tag_name => 'hr',
7129     line => $token->{line}, column => $token->{column}},
7130     {type => END_TAG_TOKEN, tag_name => 'form',
7131     line => $token->{line}, column => $token->{column}};
7132 wakaba 1.52 !!!back-token (@tokens);
7133 wakaba 1.125 !!!next-token;
7134 wakaba 1.126 next B;
7135 wakaba 1.52 }
7136     } elsif ($token->{tag_name} eq 'textarea') {
7137     my $tag_name = $token->{tag_name};
7138     my $el;
7139 wakaba 1.126 !!!create-element ($el, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
7140 wakaba 1.52
7141     ## TODO: $self->{form_element} if defined
7142     $self->{content_model} = RCDATA_CONTENT_MODEL;
7143     delete $self->{escape}; # MUST
7144    
7145     $insert->($el);
7146    
7147     my $text = '';
7148 wakaba 1.125 !!!nack ('t392.1');
7149 wakaba 1.52 !!!next-token;
7150 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
7151 wakaba 1.52 $token->{data} =~ s/^\x0A//;
7152 wakaba 1.51 unless (length $token->{data}) {
7153 wakaba 1.79 !!!cp ('t392');
7154 wakaba 1.51 !!!next-token;
7155 wakaba 1.79 } else {
7156     !!!cp ('t393');
7157 wakaba 1.51 }
7158 wakaba 1.79 } else {
7159     !!!cp ('t394');
7160 wakaba 1.51 }
7161 wakaba 1.55 while ($token->{type} == CHARACTER_TOKEN) {
7162 wakaba 1.79 !!!cp ('t395');
7163 wakaba 1.52 $text .= $token->{data};
7164     !!!next-token;
7165     }
7166     if (length $text) {
7167 wakaba 1.79 !!!cp ('t396');
7168 wakaba 1.52 $el->manakai_append_text ($text);
7169     }
7170    
7171     $self->{content_model} = PCDATA_CONTENT_MODEL;
7172 wakaba 1.51
7173 wakaba 1.55 if ($token->{type} == END_TAG_TOKEN and
7174 wakaba 1.52 $token->{tag_name} eq $tag_name) {
7175 wakaba 1.79 !!!cp ('t397');
7176 wakaba 1.52 ## Ignore the token
7177     } else {
7178 wakaba 1.79 !!!cp ('t398');
7179 wakaba 1.153 !!!parse-error (type => 'in RCDATA:#eof', token => $token);
7180 wakaba 1.51 }
7181 wakaba 1.52 !!!next-token;
7182 wakaba 1.126 next B;
7183 wakaba 1.151 } elsif ($token->{tag_name} eq 'rt' or
7184     $token->{tag_name} eq 'rp') {
7185     ## has a |ruby| element in scope
7186     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7187     my $node = $self->{open_elements}->[$_];
7188     if ($node->[1] & RUBY_EL) {
7189     !!!cp ('t398.1');
7190     ## generate implied end tags
7191     while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7192     !!!cp ('t398.2');
7193     pop @{$self->{open_elements}};
7194     }
7195     unless ($self->{open_elements}->[-1]->[1] & RUBY_EL) {
7196     !!!cp ('t398.3');
7197     !!!parse-error (type => 'not closed',
7198 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
7199 wakaba 1.151 ->manakai_local_name,
7200     token => $token);
7201     pop @{$self->{open_elements}}
7202     while not $self->{open_elements}->[-1]->[1] & RUBY_EL;
7203     }
7204     last INSCOPE;
7205     } elsif ($node->[1] & SCOPING_EL) {
7206     !!!cp ('t398.4');
7207     last INSCOPE;
7208     }
7209     } # INSCOPE
7210    
7211     !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7212    
7213     !!!nack ('t398.5');
7214     !!!next-token;
7215     redo B;
7216 wakaba 1.126 } elsif ($token->{tag_name} eq 'math' or
7217     $token->{tag_name} eq 'svg') {
7218     $reconstruct_active_formatting_elements->($insert_to_current);
7219 wakaba 1.131
7220 wakaba 1.155 ## "Adjust MathML attributes" ('math' only) - done in insert-element-f
7221    
7222 wakaba 1.131 ## "adjust SVG attributes" ('svg' only) - done in insert-element-f
7223    
7224     ## "adjust foreign attributes" - done in insert-element-f
7225 wakaba 1.126
7226 wakaba 1.131 !!!insert-element-f ($token->{tag_name} eq 'math' ? $MML_NS : $SVG_NS, $token->{tag_name}, $token->{attributes}, $token);
7227 wakaba 1.126
7228     if ($self->{self_closing}) {
7229     pop @{$self->{open_elements}};
7230     !!!ack ('t398.1');
7231     } else {
7232     !!!cp ('t398.2');
7233     $self->{insertion_mode} |= IN_FOREIGN_CONTENT_IM;
7234     ## NOTE: |<body><math><mi><svg>| -> "in foreign content" insertion
7235     ## mode, "in body" (not "in foreign content") secondary insertion
7236     ## mode, maybe.
7237     }
7238    
7239     !!!next-token;
7240     next B;
7241 wakaba 1.52 } elsif ({
7242     caption => 1, col => 1, colgroup => 1, frame => 1,
7243     frameset => 1, head => 1, option => 1, optgroup => 1,
7244     tbody => 1, td => 1, tfoot => 1, th => 1,
7245     thead => 1, tr => 1,
7246     }->{$token->{tag_name}}) {
7247 wakaba 1.79 !!!cp ('t401');
7248 wakaba 1.153 !!!parse-error (type => 'in body',
7249     text => $token->{tag_name}, token => $token);
7250 wakaba 1.52 ## Ignore the token
7251 wakaba 1.125 !!!nack ('t401.1'); ## NOTE: |<col/>| or |<frame/>| here is an error.
7252 wakaba 1.52 !!!next-token;
7253 wakaba 1.126 next B;
7254 wakaba 1.52
7255     ## ISSUE: An issue on HTML5 new elements in the spec.
7256     } else {
7257 wakaba 1.110 if ($token->{tag_name} eq 'image') {
7258     !!!cp ('t384');
7259 wakaba 1.113 !!!parse-error (type => 'image', token => $token);
7260 wakaba 1.110 $token->{tag_name} = 'img';
7261     } else {
7262     !!!cp ('t385');
7263     }
7264    
7265     ## NOTE: There is an "as if <br>" code clone.
7266 wakaba 1.52 $reconstruct_active_formatting_elements->($insert_to_current);
7267    
7268 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7269 wakaba 1.109
## Per-element handling that follows the insertion of an element
## for a start tag not handled by an earlier branch.  Exactly one
## branch runs, selected by the tag name of the current token.
if ({
     applet => 1, marquee => 1, object => 1,
    }->{$token->{tag_name}}) {
  !!!cp ('t380');
  ## These elements add a marker entry to the list of active
  ## formatting elements.
  push @$active_formatting_elements, ['#marker', ''];
  !!!nack ('t380.1');
} elsif ({
          b => 1, big => 1, em => 1, font => 1, i => 1,
          ## NOTE: The element name is |strike|.  It was formerly
          ## misspelled here, so that |<strike>| was never added to
          ## the list of active formatting elements.
          s => 1, small => 1, strike => 1,
          strong => 1, tt => 1, u => 1,
         }->{$token->{tag_name}}) {
  !!!cp ('t375');
  ## Formatting elements: record the current node (the last entry
  ## of the stack of open elements) in the list of active formatting
  ## elements.
  push @$active_formatting_elements, $self->{open_elements}->[-1];
  !!!nack ('t375.1');
} elsif ($token->{tag_name} eq 'input') {
  !!!cp ('t388');
  ## TODO: associate with $self->{form_element} if defined
  ## The element is removed from the stack of open elements at once.
  pop @{$self->{open_elements}};
  !!!ack ('t388.2');
} elsif ({
          area => 1, basefont => 1, bgsound => 1, br => 1,
          embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
          #image => 1,
         }->{$token->{tag_name}}) {
  !!!cp ('t388.1');
  ## The element is removed from the stack of open elements at once.
  pop @{$self->{open_elements}};
  !!!ack ('t388.3');
} elsif ($token->{tag_name} eq 'select') {
  ## TODO: associate with $self->{form_element} if defined

  ## Choose the |select| insertion mode depending on whether the
  ## current insertion mode is one of the table-related modes.
  if ($self->{insertion_mode} & TABLE_IMS or
      $self->{insertion_mode} & BODY_TABLE_IMS or
      $self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
    !!!cp ('t400.1');
    $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM;
  } else {
    !!!cp ('t400.2');
    $self->{insertion_mode} = IN_SELECT_IM;
  }
  !!!nack ('t400.3');
} else {
  ## Any other element: no additional processing.
  !!!nack ('t402');
}
7313 wakaba 1.51
7314 wakaba 1.52 !!!next-token;
7315 wakaba 1.126 next B;
7316 wakaba 1.52 }
7317 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
7318 wakaba 1.52 if ($token->{tag_name} eq 'body') {
7319 wakaba 1.107 ## has a |body| element in scope
7320     my $i;
7321 wakaba 1.111 INSCOPE: {
7322     for (reverse @{$self->{open_elements}}) {
7323 wakaba 1.123 if ($_->[1] & BODY_EL) {
7324 wakaba 1.111 !!!cp ('t405');
7325     $i = $_;
7326     last INSCOPE;
7327 wakaba 1.123 } elsif ($_->[1] & SCOPING_EL) {
7328 wakaba 1.111 !!!cp ('t405.1');
7329     last;
7330     }
7331 wakaba 1.52 }
7332 wakaba 1.111
7333     !!!parse-error (type => 'start tag not allowed',
7334 wakaba 1.153 text => $token->{tag_name}, token => $token);
7335 wakaba 1.107 ## NOTE: Ignore the token.
7336 wakaba 1.52 !!!next-token;
7337 wakaba 1.126 next B;
7338 wakaba 1.111 } # INSCOPE
7339 wakaba 1.107
7340     for (@{$self->{open_elements}}) {
7341 wakaba 1.123 unless ($_->[1] & ALL_END_TAG_OPTIONAL_EL) {
7342 wakaba 1.107 !!!cp ('t403');
7343 wakaba 1.122 !!!parse-error (type => 'not closed',
7344 wakaba 1.153 text => $_->[0]->manakai_local_name,
7345 wakaba 1.122 token => $token);
7346 wakaba 1.107 last;
7347     } else {
7348     !!!cp ('t404');
7349     }
7350     }
7351    
7352     $self->{insertion_mode} = AFTER_BODY_IM;
7353     !!!next-token;
7354 wakaba 1.126 next B;
7355 wakaba 1.52 } elsif ($token->{tag_name} eq 'html') {
7356 wakaba 1.122 ## TODO: Update this code. It seems that the code below is not
7357     ## up-to-date, though it has same effect as speced.
7358 wakaba 1.123 if (@{$self->{open_elements}} > 1 and
7359     $self->{open_elements}->[1]->[1] & BODY_EL) {
7360 wakaba 1.52 ## ISSUE: There is an issue in the spec.
7361 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & BODY_EL) {
7362 wakaba 1.79 !!!cp ('t406');
7363 wakaba 1.122 !!!parse-error (type => 'not closed',
7364 wakaba 1.153 text => $self->{open_elements}->[1]->[0]
7365 wakaba 1.122 ->manakai_local_name,
7366     token => $token);
7367 wakaba 1.79 } else {
7368     !!!cp ('t407');
7369 wakaba 1.1 }
7370 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
7371 wakaba 1.52 ## reprocess
7372 wakaba 1.126 next B;
7373 wakaba 1.51 } else {
7374 wakaba 1.79 !!!cp ('t408');
7375 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7376     text => $token->{tag_name}, token => $token);
7377 wakaba 1.52 ## Ignore the token
7378     !!!next-token;
7379 wakaba 1.126 next B;
7380 wakaba 1.51 }
7381 wakaba 1.52 } elsif ({
7382     address => 1, blockquote => 1, center => 1, dir => 1,
7383     div => 1, dl => 1, fieldset => 1, listing => 1,
7384     menu => 1, ol => 1, pre => 1, ul => 1,
7385     dd => 1, dt => 1, li => 1,
7386 wakaba 1.103 applet => 1, button => 1, marquee => 1, object => 1,
7387 wakaba 1.52 }->{$token->{tag_name}}) {
7388     ## has an element in scope
7389     my $i;
7390     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7391     my $node = $self->{open_elements}->[$_];
7392 wakaba 1.123 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7393 wakaba 1.79 !!!cp ('t410');
7394 wakaba 1.52 $i = $_;
7395 wakaba 1.87 last INSCOPE;
7396 wakaba 1.123 } elsif ($node->[1] & SCOPING_EL) {
7397 wakaba 1.79 !!!cp ('t411');
7398 wakaba 1.52 last INSCOPE;
7399 wakaba 1.51 }
7400 wakaba 1.52 } # INSCOPE
7401 wakaba 1.89
7402     unless (defined $i) { # has an element in scope
7403     !!!cp ('t413');
7404 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7405     text => $token->{tag_name}, token => $token);
7406 wakaba 1.157 ## NOTE: Ignore the token.
7407 wakaba 1.89 } else {
7408     ## Step 1. generate implied end tags
7409     while ({
7410 wakaba 1.151 ## END_TAG_OPTIONAL_EL
7411 wakaba 1.89 dd => ($token->{tag_name} ne 'dd'),
7412     dt => ($token->{tag_name} ne 'dt'),
7413     li => ($token->{tag_name} ne 'li'),
7414     p => 1,
7415 wakaba 1.151 rt => 1,
7416     rp => 1,
7417 wakaba 1.123 }->{$self->{open_elements}->[-1]->[0]->manakai_local_name}) {
7418 wakaba 1.89 !!!cp ('t409');
7419     pop @{$self->{open_elements}};
7420     }
7421    
7422     ## Step 2.
7423 wakaba 1.123 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7424     ne $token->{tag_name}) {
7425 wakaba 1.79 !!!cp ('t412');
7426 wakaba 1.122 !!!parse-error (type => 'not closed',
7427 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
7428 wakaba 1.122 ->manakai_local_name,
7429     token => $token);
7430 wakaba 1.51 } else {
7431 wakaba 1.89 !!!cp ('t414');
7432 wakaba 1.51 }
7433 wakaba 1.89
7434     ## Step 3.
7435 wakaba 1.52 splice @{$self->{open_elements}}, $i;
7436 wakaba 1.89
7437     ## Step 4.
7438     $clear_up_to_marker->()
7439     if {
7440 wakaba 1.103 applet => 1, button => 1, marquee => 1, object => 1,
7441 wakaba 1.89 }->{$token->{tag_name}};
7442 wakaba 1.51 }
7443 wakaba 1.52 !!!next-token;
7444 wakaba 1.126 next B;
7445 wakaba 1.52 } elsif ($token->{tag_name} eq 'form') {
7446 wakaba 1.92 undef $self->{form_element};
7447    
7448 wakaba 1.52 ## has an element in scope
7449 wakaba 1.92 my $i;
7450 wakaba 1.52 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7451     my $node = $self->{open_elements}->[$_];
7452 wakaba 1.123 if ($node->[1] & FORM_EL) {
7453 wakaba 1.79 !!!cp ('t418');
7454 wakaba 1.92 $i = $_;
7455 wakaba 1.52 last INSCOPE;
7456 wakaba 1.123 } elsif ($node->[1] & SCOPING_EL) {
7457 wakaba 1.79 !!!cp ('t419');
7458 wakaba 1.52 last INSCOPE;
7459     }
7460     } # INSCOPE
7461 wakaba 1.92
7462     unless (defined $i) { # has an element in scope
7463 wakaba 1.79 !!!cp ('t421');
7464 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7465     text => $token->{tag_name}, token => $token);
7466 wakaba 1.157 ## NOTE: Ignore the token.
7467 wakaba 1.92 } else {
7468     ## Step 1. generate implied end tags
7469 wakaba 1.123 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7470 wakaba 1.92 !!!cp ('t417');
7471     pop @{$self->{open_elements}};
7472     }
7473    
7474     ## Step 2.
7475 wakaba 1.123 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7476     ne $token->{tag_name}) {
7477 wakaba 1.92 !!!cp ('t417.1');
7478 wakaba 1.122 !!!parse-error (type => 'not closed',
7479 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
7480 wakaba 1.122 ->manakai_local_name,
7481     token => $token);
7482 wakaba 1.92 } else {
7483     !!!cp ('t420');
7484     }
7485    
7486     ## Step 3.
7487     splice @{$self->{open_elements}}, $i;
7488 wakaba 1.52 }
7489    
7490     !!!next-token;
7491 wakaba 1.126 next B;
7492 wakaba 1.52 } elsif ({
7493     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
7494     }->{$token->{tag_name}}) {
7495     ## has an element in scope
7496     my $i;
7497     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7498     my $node = $self->{open_elements}->[$_];
7499 wakaba 1.123 if ($node->[1] & HEADING_EL) {
7500 wakaba 1.79 !!!cp ('t423');
7501 wakaba 1.52 $i = $_;
7502     last INSCOPE;
7503 wakaba 1.123 } elsif ($node->[1] & SCOPING_EL) {
7504 wakaba 1.79 !!!cp ('t424');
7505 wakaba 1.52 last INSCOPE;
7506 wakaba 1.51 }
7507 wakaba 1.52 } # INSCOPE
7508 wakaba 1.93
7509     unless (defined $i) { # has an element in scope
7510     !!!cp ('t425.1');
7511 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7512     text => $token->{tag_name}, token => $token);
7513 wakaba 1.157 ## NOTE: Ignore the token.
7514 wakaba 1.79 } else {
7515 wakaba 1.93 ## Step 1. generate implied end tags
7516 wakaba 1.123 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7517 wakaba 1.93 !!!cp ('t422');
7518     pop @{$self->{open_elements}};
7519     }
7520    
7521     ## Step 2.
7522 wakaba 1.123 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7523     ne $token->{tag_name}) {
7524 wakaba 1.93 !!!cp ('t425');
7525 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7526     text => $token->{tag_name}, token => $token);
7527 wakaba 1.93 } else {
7528     !!!cp ('t426');
7529     }
7530    
7531     ## Step 3.
7532     splice @{$self->{open_elements}}, $i;
7533 wakaba 1.36 }
7534 wakaba 1.52
7535     !!!next-token;
7536 wakaba 1.126 next B;
7537 wakaba 1.87 } elsif ($token->{tag_name} eq 'p') {
7538     ## has an element in scope
7539     my $i;
7540     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7541     my $node = $self->{open_elements}->[$_];
7542 wakaba 1.123 if ($node->[1] & P_EL) {
7543 wakaba 1.87 !!!cp ('t410.1');
7544     $i = $_;
7545 wakaba 1.88 last INSCOPE;
7546 wakaba 1.123 } elsif ($node->[1] & SCOPING_EL) {
7547 wakaba 1.87 !!!cp ('t411.1');
7548     last INSCOPE;
7549     }
7550     } # INSCOPE
7551 wakaba 1.91
7552     if (defined $i) {
7553 wakaba 1.123 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7554     ne $token->{tag_name}) {
7555 wakaba 1.87 !!!cp ('t412.1');
7556 wakaba 1.122 !!!parse-error (type => 'not closed',
7557 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
7558 wakaba 1.122 ->manakai_local_name,
7559     token => $token);
7560 wakaba 1.87 } else {
7561 wakaba 1.91 !!!cp ('t414.1');
7562 wakaba 1.87 }
7563 wakaba 1.91
7564 wakaba 1.87 splice @{$self->{open_elements}}, $i;
7565     } else {
7566 wakaba 1.91 !!!cp ('t413.1');
7567 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7568     text => $token->{tag_name}, token => $token);
7569 wakaba 1.91
7570 wakaba 1.87 !!!cp ('t415.1');
7571     ## As if <p>, then reprocess the current token
7572     my $el;
7573 wakaba 1.126 !!!create-element ($el, $HTML_NS, 'p',, $token);
7574 wakaba 1.87 $insert->($el);
7575 wakaba 1.91 ## NOTE: Not inserted into |$self->{open_elements}|.
7576 wakaba 1.87 }
7577 wakaba 1.91
7578 wakaba 1.87 !!!next-token;
7579 wakaba 1.126 next B;
7580 wakaba 1.52 } elsif ({
7581     a => 1,
7582     b => 1, big => 1, em => 1, font => 1, i => 1,
7583     nobr => 1, s => 1, small => 1, strile => 1,
7584     strong => 1, tt => 1, u => 1,
7585     }->{$token->{tag_name}}) {
7586 wakaba 1.79 !!!cp ('t427');
7587 wakaba 1.113 $formatting_end_tag->($token);
7588 wakaba 1.126 next B;
7589 wakaba 1.52 } elsif ($token->{tag_name} eq 'br') {
7590 wakaba 1.79 !!!cp ('t428');
7591 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7592     text => 'br', token => $token);
7593 wakaba 1.52
7594     ## As if <br>
7595     $reconstruct_active_formatting_elements->($insert_to_current);
7596    
7597     my $el;
7598 wakaba 1.126 !!!create-element ($el, $HTML_NS, 'br',, $token);
7599 wakaba 1.52 $insert->($el);
7600    
7601     ## Ignore the token.
7602     !!!next-token;
7603 wakaba 1.126 next B;
7604 wakaba 1.52 } elsif ({
7605     caption => 1, col => 1, colgroup => 1, frame => 1,
7606     frameset => 1, head => 1, option => 1, optgroup => 1,
7607     tbody => 1, td => 1, tfoot => 1, th => 1,
7608     thead => 1, tr => 1,
7609     area => 1, basefont => 1, bgsound => 1,
7610     embed => 1, hr => 1, iframe => 1, image => 1,
7611     img => 1, input => 1, isindex => 1, noembed => 1,
7612     noframes => 1, param => 1, select => 1, spacer => 1,
7613     table => 1, textarea => 1, wbr => 1,
7614     noscript => 0, ## TODO: if scripting is enabled
7615     }->{$token->{tag_name}}) {
7616 wakaba 1.79 !!!cp ('t429');
7617 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7618     text => $token->{tag_name}, token => $token);
7619 wakaba 1.52 ## Ignore the token
7620     !!!next-token;
7621 wakaba 1.126 next B;
7622 wakaba 1.52
7623     ## ISSUE: Issue on HTML5 new elements in spec
7624    
7625     } else {
7626     ## Step 1
7627     my $node_i = -1;
7628     my $node = $self->{open_elements}->[$node_i];
7629 wakaba 1.51
7630 wakaba 1.52 ## Step 2
7631     S2: {
7632 wakaba 1.123 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7633 wakaba 1.52 ## Step 1
7634     ## generate implied end tags
7635 wakaba 1.123 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7636 wakaba 1.79 !!!cp ('t430');
7637 wakaba 1.151 ## NOTE: |<ruby><rt></ruby>|.
7638     ## ISSUE: <ruby><rt></rt> will also take this code path,
7639     ## which seems wrong.
7640 wakaba 1.86 pop @{$self->{open_elements}};
7641 wakaba 1.151 $node_i++;
7642 wakaba 1.52 }
7643    
7644     ## Step 2
7645 wakaba 1.123 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7646     ne $token->{tag_name}) {
7647 wakaba 1.79 !!!cp ('t431');
7648 wakaba 1.58 ## NOTE: <x><y></x>
7649 wakaba 1.122 !!!parse-error (type => 'not closed',
7650 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
7651 wakaba 1.122 ->manakai_local_name,
7652     token => $token);
7653 wakaba 1.79 } else {
7654     !!!cp ('t432');
7655 wakaba 1.52 }
7656    
7657     ## Step 3
7658 wakaba 1.151 splice @{$self->{open_elements}}, $node_i if $node_i < 0;
7659 wakaba 1.51
7660 wakaba 1.1 !!!next-token;
7661 wakaba 1.52 last S2;
7662 wakaba 1.1 } else {
7663 wakaba 1.52 ## Step 3
7664 wakaba 1.123 if (not ($node->[1] & FORMATTING_EL) and
7665 wakaba 1.52 #not $phrasing_category->{$node->[1]} and
7666 wakaba 1.123 ($node->[1] & SPECIAL_EL or
7667     $node->[1] & SCOPING_EL)) {
7668 wakaba 1.79 !!!cp ('t433');
7669 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7670     text => $token->{tag_name}, token => $token);
7671 wakaba 1.52 ## Ignore the token
7672     !!!next-token;
7673     last S2;
7674     }
7675 wakaba 1.79
7676     !!!cp ('t434');
7677 wakaba 1.1 }
7678 wakaba 1.52
7679     ## Step 4
7680     $node_i--;
7681     $node = $self->{open_elements}->[$node_i];
7682    
7683     ## Step 5;
7684     redo S2;
7685     } # S2
7686 wakaba 1.126 next B;
7687 wakaba 1.1 }
7688     }
7689 wakaba 1.126 next B;
7690     } continue { # B
7691     if ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
7692     ## NOTE: The code below is executed in cases where it does not have
7693     ## to be, but it is harmless even in those cases.
7694     ## has an element in scope
7695     INSCOPE: {
7696     for (reverse 0..$#{$self->{open_elements}}) {
7697     my $node = $self->{open_elements}->[$_];
7698     if ($node->[1] & FOREIGN_EL) {
7699     last INSCOPE;
7700     } elsif ($node->[1] & SCOPING_EL) {
7701     last;
7702     }
7703     }
7704    
7705     ## NOTE: No foreign element in scope.
7706     $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
7707     } # INSCOPE
7708     }
7709 wakaba 1.1 } # B
7710    
7711     ## Stop parsing # MUST
7712    
7713     ## TODO: script stuffs
7714 wakaba 1.3 } # _tree_construct_main
7715    
## set_inner_html ($class, $node, $s, $onerror, $get_wrapper)
##
## Class method.  Replaces the children of |$node| with the result of
## parsing the character string |$s| as HTML.
##
##   $node        - A DOM node.  Only node types 9 (document) and
##                  1 (element) are supported; any other type |die|s.
##   $s           - The string to parse.  It is accessed through a
##                  reference to the caller's argument, not copied.
##   $onerror     - Optional error callback, invoked with a list of
##                  name/value pairs (|line|, |column|, |type|, |token|,
##                  ...).  If omitted, errors are reported by |warn|.
##   $get_wrapper - Optional code reference; defaults to the identity
##                  function.  It is only forwarded in the document case
##                  (see the TODO in the element case).
##
## Document case: all children are removed and the string is handed to
## |parse_char_string| with |$node| as the target document.
##
## Element case: a temporary document and a fresh parser |$p| are
## created, the string is parsed with |$node| as the context element
## (|$p->{inner_html_node}|), and the children of the temporary root
## |html| element are then adopted into |$node|'s document and appended
## to |$node| after its old children have been removed.
7716 wakaba 1.162 sub set_inner_html ($$$;$) {
7717 wakaba 1.3 my $class = shift;
7718     my $node = shift;
7719     my $s = \$_[0];
7720     my $onerror = $_[1];
7721 wakaba 1.162 my $get_wrapper = $_[2] || sub ($) { return $_[0] };
7722 wakaba 1.3
7723 wakaba 1.63 ## ISSUE: Should {confident} be true?
7724    
7725 wakaba 1.3 my $nt = $node->node_type;
7726     if ($nt == 9) {
7727     # MUST
7728    
7729     ## Step 1 # MUST
7730     ## TODO: If the document has an active parser, ...
7731     ## ISSUE: There is an issue in the spec.
7732    
7733     ## Step 2 # MUST
     ## NOTE: The child list is copied into |@cn| before the removal loop
     ## runs, so the loop does not iterate over the list it modifies.
7734     my @cn = @{$node->child_nodes};
7735     for (@cn) {
7736     $node->remove_child ($_);
7737     }
7738    
7739     ## Step 3, 4, 5 # MUST
7740 wakaba 1.162 $class->parse_char_string ($$s => $node, $onerror, $get_wrapper);
7741 wakaba 1.3 } elsif ($nt == 1) {
7742     ## TODO: If non-html element
7743    
7744     ## NOTE: Most of this code is copied from |parse_string|
7745    
7746 wakaba 1.162 ## TODO: Support for $get_wrapper
7747    
7748 wakaba 1.3 ## Step 1 # MUST
7749 wakaba 1.14 my $this_doc = $node->owner_document;
7750     my $doc = $this_doc->implementation->create_document;
7751 wakaba 1.18 $doc->manakai_is_html (1);
7752 wakaba 1.3 my $p = $class->new;
7753     $p->{document} = $doc;
7754    
7755 wakaba 1.84 ## Step 8 # MUST
     ## NOTE: |$i| is the index of the next character of |$$s| to be read.
     ## The closure below stores the next input character (as a code
     ## point, or -1 at the end of the input) in |{next_char}|, keeps the
     ## previously read characters in |{prev_char}| and maintains the
     ## line/column counters of |$p|.
7756 wakaba 1.3 my $i = 0;
7757 wakaba 1.121 $p->{line_prev} = $p->{line} = 1;
7758     $p->{column_prev} = $p->{column} = 0;
7759 wakaba 1.76 $p->{set_next_char} = sub {
7760 wakaba 1.3 my $self = shift;
7761 wakaba 1.14
7762 wakaba 1.76 pop @{$self->{prev_char}};
7763     unshift @{$self->{prev_char}}, $self->{next_char};
7764 wakaba 1.14
     ## NOTE: At the end of the input.  The assigned value -1 is true,
     ## so the |return| on the right-hand side of |and| is executed.
7765 wakaba 1.76 $self->{next_char} = -1 and return if $i >= length $$s;
7766     $self->{next_char} = ord substr $$s, $i++, 1;
7767 wakaba 1.121
7768     ($p->{line_prev}, $p->{column_prev}) = ($p->{line}, $p->{column});
7769     $p->{column}++;
7770 wakaba 1.4
7771 wakaba 1.76 if ($self->{next_char} == 0x000A) { # LF
7772 wakaba 1.121 $p->{line}++;
7773     $p->{column} = 0;
7774 wakaba 1.79 !!!cp ('i1');
7775 wakaba 1.76 } elsif ($self->{next_char} == 0x000D) { # CR
     ## NOTE: CR and CR LF are both reported as a single LF.
7776 wakaba 1.15 $i++ if substr ($$s, $i, 1) eq "\x0A";
7777 wakaba 1.76 $self->{next_char} = 0x000A; # LF # MUST
7778 wakaba 1.121 $p->{line}++;
7779     $p->{column} = 0;
7780 wakaba 1.79 !!!cp ('i2');
7781 wakaba 1.76 } elsif ($self->{next_char} > 0x10FFFF) {
7782     $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
7783 wakaba 1.79 !!!cp ('i3');
7784 wakaba 1.76 } elsif ($self->{next_char} == 0x0000) { # NULL
7785 wakaba 1.79 !!!cp ('i4');
7786 wakaba 1.14 !!!parse-error (type => 'NULL');
7787 wakaba 1.76 $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
     ## NOTE: The remaining listed code points (some C0/C1 controls,
     ## DELETE, surrogates, U+FDD0..U+FDDF and U+xFFFE/U+xFFFF of each
     ## plane) are reported as 'control char' errors but are left
     ## unchanged in |{next_char}|.
7788 wakaba 1.132 } elsif ($self->{next_char} <= 0x0008 or
7789     (0x000E <= $self->{next_char} and
7790     $self->{next_char} <= 0x001F) or
7791     (0x007F <= $self->{next_char} and
7792     $self->{next_char} <= 0x009F) or
7793     (0xD800 <= $self->{next_char} and
7794     $self->{next_char} <= 0xDFFF) or
7795     (0xFDD0 <= $self->{next_char} and
7796     $self->{next_char} <= 0xFDDF) or
7797     {
7798     0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
7799     0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
7800     0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
7801     0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
7802     0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
7803     0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
7804     0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
7805     0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
7806     0x10FFFE => 1, 0x10FFFF => 1,
7807     }->{$self->{next_char}}) {
7808     !!!cp ('i4.1');
7809 wakaba 1.153 if ($self->{next_char} < 0x10000) {
7810     !!!parse-error (type => 'control char',
7811     text => (sprintf 'U+%04X', $self->{next_char}));
7812     } else {
7813     !!!parse-error (type => 'control char',
7814     text => (sprintf 'U-%08X', $self->{next_char}));
7815     }
7816 wakaba 1.3 }
7817     };
7818 wakaba 1.76 $p->{prev_char} = [-1, -1, -1];
7819     $p->{next_char} = -1;
7820 wakaba 1.171
     ## NOTE: Placeholder implementation; it always returns |undef|.
7821     $p->{getc_until} = sub {
7822     ## TODO: ...
7823     return undef;
7824     }; # $p->{getc_until};
7825    
     ## NOTE: The default error handler prefers the position recorded
     ## in |$opt{token}|, if any, over the |line|/|column| arguments.
7826 wakaba 1.3 my $ponerror = $onerror || sub {
7827     my (%opt) = @_;
7828 wakaba 1.121 my $line = $opt{line};
7829     my $column = $opt{column};
7830     if (defined $opt{token} and defined $opt{token}->{line}) {
7831     $line = $opt{token}->{line};
7832     $column = $opt{token}->{column};
7833     }
7834     warn "Parse error ($opt{type}) at line $line column $column\n";
7835 wakaba 1.3 };
7836     $p->{parse_error} = sub {
7837 wakaba 1.121 $ponerror->(line => $p->{line}, column => $p->{column}, @_);
7838 wakaba 1.3 };
7839    
7840     $p->_initialize_tokenizer;
7841     $p->_initialize_tree_constructor;
7842    
7843     ## Step 2
     ## NOTE: The initial content model is chosen by the local name of
     ## the context element; names not listed fall back to PCDATA.
7844 wakaba 1.71 my $node_ln = $node->manakai_local_name;
7845 wakaba 1.40 $p->{content_model} = {
7846     title => RCDATA_CONTENT_MODEL,
7847     textarea => RCDATA_CONTENT_MODEL,
7848     style => CDATA_CONTENT_MODEL,
7849     script => CDATA_CONTENT_MODEL,
7850     xmp => CDATA_CONTENT_MODEL,
7851     iframe => CDATA_CONTENT_MODEL,
7852     noembed => CDATA_CONTENT_MODEL,
7853     noframes => CDATA_CONTENT_MODEL,
7854     noscript => CDATA_CONTENT_MODEL,
7855     plaintext => PLAINTEXT_CONTENT_MODEL,
7856     }->{$node_ln};
7857     $p->{content_model} = PCDATA_CONTENT_MODEL
7858     unless defined $p->{content_model};
7859     ## ISSUE: What is "the name of the element"? local name?
7860 wakaba 1.3
7861 wakaba 1.123 $p->{inner_html_node} = [$node, $el_category->{$node_ln}];
7862     ## TODO: Foreign element OK?
7863 wakaba 1.3
7864 wakaba 1.84 ## Step 3
7865 wakaba 1.3 my $root = $doc->create_element_ns
7866     ('http://www.w3.org/1999/xhtml', [undef, 'html']);
7867    
7868 wakaba 1.84 ## Step 4 # MUST
7869 wakaba 1.3 $doc->append_child ($root);
7870    
7871 wakaba 1.84 ## Step 5 # MUST
7872 wakaba 1.123 push @{$p->{open_elements}}, [$root, $el_category->{html}];
7873 wakaba 1.3
7874     undef $p->{head_element};
7875    
7876 wakaba 1.84 ## Step 6 # MUST
7877 wakaba 1.3 $p->_reset_insertion_mode;
7878    
7879 wakaba 1.84 ## Step 7 # MUST
     ## NOTE: Walks from |$node| itself up through its ancestors and
     ## records the first XHTML-namespace |form| element found, if any.
7880 wakaba 1.3 my $anode = $node;
7881     AN: while (defined $anode) {
7882     if ($anode->node_type == 1) {
7883     my $nsuri = $anode->namespace_uri;
7884     if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
7885 wakaba 1.71 if ($anode->manakai_local_name eq 'form') {
7886 wakaba 1.79 !!!cp ('i5');
7887 wakaba 1.3 $p->{form_element} = $anode;
7888     last AN;
7889     }
7890     }
7891     }
7892     $anode = $anode->parent_node;
7893     } # AN
7894    
7895 wakaba 1.84 ## Step 9 # MUST
     ## NOTE(review): |$self| is presumably what the |!!!next-token|
     ## macro operates on, hence the local alias for |$p| -- confirm
     ## against the macro definition.
7896 wakaba 1.3 {
7897     my $self = $p;
7898     !!!next-token;
7899     }
7900     $p->_tree_construction_main;
7901    
7902 wakaba 1.84 ## Step 10 # MUST
7903 wakaba 1.3 my @cn = @{$node->child_nodes};
7904     for (@cn) {
7905     $node->remove_child ($_);
7906     }
7907     ## ISSUE: mutation events? read-only?
7908    
7909 wakaba 1.84 ## Step 11 # MUST
7910 wakaba 1.3 @cn = @{$root->child_nodes};
7911     for (@cn) {
7912 wakaba 1.14 $this_doc->adopt_node ($_);
7913 wakaba 1.3 $node->append_child ($_);
7914     }
7915 wakaba 1.14 ## ISSUE: mutation events?
7916 wakaba 1.3
7917     $p->_terminate_tree_constructor;
7918 wakaba 1.121
7919     delete $p->{parse_error}; # delete loop
     ## NOTE: The |{set_next_char}| closure also refers to |$p| (for the
     ## line/column counters) while being stored in |$p|, so it forms a
     ## second reference loop that has to be broken as well; otherwise
     ## the parser object is never freed.
     delete $p->{set_next_char}; # delete loop
7920 wakaba 1.3 } else {
7921     die "$0: |set_inner_html| is not defined for node of type $nt";
7922     }
7923     } # set_inner_html
7924    
7925     } # tree construction stage
7926 wakaba 1.1
## |Whatpm::HTML::RestartParser| is declared as a subclass of |Error|
## (the module loaded by |use Error qw(:try)| at the top of this file).
## NOTE(review): Judging from its name, it is presumably thrown to
## request that parsing be restarted -- confirm against the code that
## throws and catches it.
7927 wakaba 1.63 package Whatpm::HTML::RestartParser;
7928     push our @ISA, 'Error';
7929    
## The module file must end with a true value.
7930 wakaba 1.1 1;
7931 wakaba 1.171 # $Date: 2008/09/13 12:25:44 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24