/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.193 - (hide annotations) (download) (as text)
Sat Oct 4 04:06:33 2008 UTC (16 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.192: +36 -5 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	4 Oct 2008 03:53:07 -0000
2008-10-04  Wakaba  <wakaba@suika.fam.cx>

	* tree-test-1.dat: Test related to "special", "formatting",
	"scoping", and "phrasing" categories are added (cf. HTML5 revision
	1778).

++ whatpm/Whatpm/ChangeLog	4 Oct 2008 04:06:08 -0000
2008-10-04  Wakaba  <wakaba@suika.fam.cx>

	* HTML.pm.src: New "special" elements added to the list (HTML5
	revision 1778).  "strile" -> "strike".

1 wakaba 1.2 package Whatpm::HTML;
2 wakaba 1.1 use strict;
3 wakaba 1.193 our $VERSION=do{my @r=(q$Revision: 1.192 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.63 use Error qw(:try);
5 wakaba 1.1
6 wakaba 1.182 ## NOTE: This module doesn't check all HTML5 parse errors; character
7     ## encoding related parse errors are expected to be handled by relevant
8     ## modules.
9     ## Parse errors for control characters that are not allowed in HTML5
10     ## documents, for surrogate code points, and for noncharacter code
11     ## points, as well as U+FFFD substitutions for characters whose code point
12     ## is higher than U+10FFFF may be detected by combining the parser with
13     ## the checker implemented by Whatpm::Charset::UnicodeChecker (for its
14     ## usage example, see |t/HTML-tree.t| in the Whatpm package or the
15     ## WebHACC::Language::HTML module in the WebHACC package).
16    
17 wakaba 1.18 ## ISSUE:
18     ## var doc = implementation.createDocument (null, null, null);
19     ## doc.write ('');
20     ## alert (doc.compatMode);
21 wakaba 1.1
22 wakaba 1.139 require IO::Handle;
23    
## Namespace URIs of the vocabularies the parser knows about.
my $HTML_NS  = 'http://www.w3.org/1999/xhtml';
my $MML_NS   = 'http://www.w3.org/1998/Math/MathML';
my $SVG_NS   = 'http://www.w3.org/2000/svg';
my $XLINK_NS = 'http://www.w3.org/1999/xlink';
my $XML_NS   = 'http://www.w3.org/XML/1998/namespace';
my $XMLNS_NS = 'http://www.w3.org/2000/xmlns/';
30    
## Element category flags.  Each constant occupies exactly one bit, so
## categories can be combined with |or| and tested with |and|.
sub A_EL ()                    { 1 << 0 }
sub ADDRESS_EL ()              { 1 << 1 }
sub BODY_EL ()                 { 1 << 2 }
sub BUTTON_EL ()               { 1 << 3 }
sub CAPTION_EL ()              { 1 << 4 }
sub DD_EL ()                   { 1 << 5 }
sub DIV_EL ()                  { 1 << 6 }
sub DT_EL ()                   { 1 << 7 }
sub FORM_EL ()                 { 1 << 8 }
sub FORMATTING_EL ()           { 1 << 9 }
sub FRAMESET_EL ()             { 1 << 10 }
sub HEADING_EL ()              { 1 << 11 }
sub HTML_EL ()                 { 1 << 12 }
sub LI_EL ()                   { 1 << 13 }
sub NOBR_EL ()                 { 1 << 14 }
sub OPTION_EL ()               { 1 << 15 }
sub OPTGROUP_EL ()             { 1 << 16 }
sub P_EL ()                    { 1 << 17 }
sub SELECT_EL ()               { 1 << 18 }
sub TABLE_EL ()                { 1 << 19 }
sub TABLE_CELL_EL ()           { 1 << 20 }
sub TABLE_ROW_EL ()            { 1 << 21 }
sub TABLE_ROW_GROUP_EL ()      { 1 << 22 }
sub MISC_SCOPING_EL ()         { 1 << 23 }
sub MISC_SPECIAL_EL ()         { 1 << 24 }
sub FOREIGN_EL ()              { 1 << 25 }
sub FOREIGN_FLOW_CONTENT_EL () { 1 << 26 }
sub MML_AXML_EL ()             { 1 << 27 }
sub RUBY_EL ()                 { 1 << 28 }
sub RUBY_COMPONENT_EL ()       { 1 << 29 }
61 wakaba 1.123
## Mask matching |table|, table row group and table row elements.
sub TABLE_ROWS_EL () {
  TABLE_ROW_GROUP_EL
  | TABLE_ROW_EL
  | TABLE_EL
}
67    
## Mask used by the "generate implied end tags" algorithm.
## NOTE: One implementation of "generate implied end tags" uses a
## modified copy of this mask rather than the constant itself (search
## for the function mae).
sub END_TAG_OPTIONAL_EL () {
  RUBY_COMPONENT_EL
  | P_EL
  | LI_EL
  | DT_EL
  | DD_EL
}
79    
## Mask used by the </body> and EOF algorithms.
sub ALL_END_TAG_OPTIONAL_EL () {
  ## Categories whose end tags are omissible within body content...
  P_EL | LI_EL | DT_EL | DD_EL
  ## ... plus the document and table structure categories.
  | HTML_EL | BODY_EL
  | TABLE_ROW_GROUP_EL | TABLE_ROW_EL | TABLE_CELL_EL
}
93    
## Mask of the "scoping" element categories.
sub SCOPING_EL () {
  MISC_SCOPING_EL
  | TABLE_CELL_EL
  | TABLE_EL
  | HTML_EL
  | CAPTION_EL
  | BUTTON_EL
}
102    
## Mask combining the |html| and |table| categories.
sub TABLE_SCOPING_EL () {
  TABLE_EL | HTML_EL
}
107    
## Mask combining the |html| and table row group categories.
sub TABLE_ROWS_SCOPING_EL () {
  TABLE_ROW_GROUP_EL | HTML_EL
}
112    
## Mask combining the |html| and table row categories.
sub TABLE_ROW_SCOPING_EL () {
  TABLE_ROW_EL | HTML_EL
}
117    
## Mask of the "special" element categories.
sub SPECIAL_EL () {
  ## Block containers
  ADDRESS_EL | BODY_EL | DIV_EL
  ## Categories that are also part of END_TAG_OPTIONAL_EL
  | DD_EL | DT_EL | LI_EL | P_EL
  ## Forms, frames and headings
  | FORM_EL | FRAMESET_EL | HEADING_EL
  | OPTION_EL | OPTGROUP_EL | SELECT_EL
  ## Table rows and row groups
  | TABLE_ROW_EL | TABLE_ROW_GROUP_EL
  ## Everything else listed as "special"
  | MISC_SPECIAL_EL
}
138    
## Maps an HTML element local name to its category flag(s).  Names that
## share a category are listed together; the resulting table holds one
## entry per element name.
my $el_category = {
  ## Elements with a category flag of their own.
  'address'  => ADDRESS_EL,
  'body'     => BODY_EL,
  'button'   => BUTTON_EL,
  'caption'  => CAPTION_EL,
  'dd'       => DD_EL,
  'div'      => DIV_EL,
  'dt'       => DT_EL,
  'form'     => FORM_EL,
  'frameset' => FRAMESET_EL,
  'html'     => HTML_EL,
  'li'       => LI_EL,
  'optgroup' => OPTGROUP_EL,
  'option'   => OPTION_EL,
  'p'        => P_EL,
  'ruby'     => RUBY_EL,
  'select'   => SELECT_EL,
  'table'    => TABLE_EL,
  'tr'       => TABLE_ROW_EL,

  ## Formatting elements; |a| and |nobr| carry an extra flag.
  'a'    => A_EL | FORMATTING_EL,
  'nobr' => NOBR_EL | FORMATTING_EL,
  (map { ($_ => FORMATTING_EL) } qw(
    b big em font i s small strike strong tt u
  )),

  (map { ($_ => HEADING_EL) } qw(h1 h2 h3 h4 h5 h6)),
  (map { ($_ => RUBY_COMPONENT_EL) } qw(rp rt)),
  (map { ($_ => TABLE_CELL_EL) } qw(td th)),
  (map { ($_ => TABLE_ROW_GROUP_EL) } qw(tbody tfoot thead)),

  ## Other "scoping" elements.
  (map { ($_ => MISC_SCOPING_EL) } qw(applet marquee object)),

  ## Other "special" elements.
  ## NOTE: |image| is deliberately absent (commented out in the spec).
  (map { ($_ => MISC_SPECIAL_EL) } qw(
    area article aside base basefont bgsound blockquote br center col
    colgroup command datagrid details dialog dir dl embed eventsource
    fieldset figure footer frame head header hr iframe img input isindex
    link listing menu meta nav noembed noframes noscript ol param
    plaintext pre script section spacer style textarea title ul wbr
  )),
};
239    
## Category flags for foreign (MathML and SVG) elements, keyed by
## namespace URI and then by local name.
## NOTE: In addition, FOREIGN_EL is set to non-HTML elements.
my $el_category_f = {};
$el_category_f->{$MML_NS}->{'annotation-xml'} = MML_AXML_EL;
$el_category_f->{$MML_NS}->{$_} = FOREIGN_FLOW_CONTENT_EL
    for qw(mi mo mn ms mtext);
$el_category_f->{$SVG_NS}->{$_} = FOREIGN_FLOW_CONTENT_EL
    for qw(foreignObject desc title);
256    
## Maps a lowercased SVG attribute name to its mixed-case spelling.
## Each key is simply the lowercase form of the value, so the table is
## derived from the list of mixed-case names.
my $svg_attr_name = {
  map { (lc ($_) => $_) } qw(
    attributeName attributeType
    baseFrequency baseProfile
    calcMode clipPathUnits contentScriptType contentStyleType
    diffuseConstant
    edgeMode externalResourcesRequired
    filterRes filterUnits
    glyphRef gradientTransform gradientUnits
    kernelMatrix kernelUnitLength keyPoints keySplines keyTimes
    lengthAdjust limitingConeAngle
    markerHeight markerUnits markerWidth maskContentUnits maskUnits
    numOctaves
    pathLength patternContentUnits patternTransform patternUnits
    pointsAtX pointsAtY pointsAtZ preserveAlpha preserveAspectRatio
    primitiveUnits
    refX refY repeatCount repeatDur requiredExtensions requiredFeatures
    specularConstant specularExponent spreadMethod startOffset
    stdDeviation stitchTiles surfaceScale systemLanguage
    tableValues targetX targetY textLength
    viewBox viewTarget
    xChannelSelector
    yChannelSelector
    zoomAndPan
  )
};
321    
## Maps a qualified attribute name found on a foreign element to
## [namespace URI, [prefix, local name]].
my $foreign_attr_xname = {};
for my $local_name (qw(actuate arcrole href role show title type)) {
  $foreign_attr_xname->{'xlink:' . $local_name}
      = [$XLINK_NS, ['xlink', $local_name]];
}
for my $local_name (qw(base lang space)) {
  $foreign_attr_xname->{'xml:' . $local_name}
      = [$XML_NS, ['xml', $local_name]];
}
## |xmlns| itself has no prefix.
$foreign_attr_xname->{'xmlns'} = [$XMLNS_NS, [undef, 'xmlns']];
$foreign_attr_xname->{'xmlns:xlink'} = [$XMLNS_NS, ['xmlns', 'xlink']];
336    
337     ## ISSUE: xmlns:xlink="non-xlink-ns" is not an error.
338    
## Replacement table for numeric character references: maps the code
## point written in the reference to the code point actually emitted.
my $charref_map = {
  ## CR is emitted as LF.
  0x0D => 0x000A,
  ## 0x80..0x9F are mapped to the listed code points; entries mapped to
  ## 0xFFFD become U+FFFD REPLACEMENT CHARACTER.
  0x80 => 0x20AC,
  0x81 => 0xFFFD,
  0x82 => 0x201A,
  0x83 => 0x0192,
  0x84 => 0x201E,
  0x85 => 0x2026,
  0x86 => 0x2020,
  0x87 => 0x2021,
  0x88 => 0x02C6,
  0x89 => 0x2030,
  0x8A => 0x0160,
  0x8B => 0x2039,
  0x8C => 0x0152,
  0x8D => 0xFFFD,
  0x8E => 0x017D,
  0x8F => 0xFFFD,
  0x90 => 0xFFFD,
  0x91 => 0x2018,
  0x92 => 0x2019,
  0x93 => 0x201C,
  0x94 => 0x201D,
  0x95 => 0x2022,
  0x96 => 0x2013,
  0x97 => 0x2014,
  0x98 => 0x02DC,
  0x99 => 0x2122,
  0x9A => 0x0161,
  0x9B => 0x203A,
  0x9C => 0x0153,
  0x9D => 0xFFFD,
  0x9E => 0x017E,
  0x9F => 0x0178,
}; # $charref_map

## Control characters, surrogates and noncharacters are all replaced by
## U+FFFD.
$charref_map->{$_} = 0xFFFD
    for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
        0xD800..0xDFFF,
        ## The noncharacter block is U+FDD0..U+FDEF (32 code points).
        ## The range previously ended at 0xFDDF, leaving
        ## U+FDE0..U+FDEF unmapped.
        0xFDD0..0xFDEF,
        ## U+nFFFE and U+nFFFF of every plane, 0 through 0x10.
        (map { (($_ << 16) | 0xFFFE, ($_ << 16) | 0xFFFF) } 0x00..0x10);
382 wakaba 1.1
383 wakaba 1.192 ## TODO: Invoke the reset algorithm when a resettable element is
384     ## created (cf. HTML5 revision 2259).
385    
## parse_byte_string ($charset_name, $byte_string, $doc, $onerror)
##
## Parses a string of bytes by opening it as an in-memory filehandle
## and delegating to |parse_byte_stream|.
##
##   $charset_name - passed through to |parse_byte_stream| unchanged.
##   $byte_string  - the bytes, or a reference to a scalar holding them.
##   remaining arguments - passed through to |parse_byte_stream|.
##
## Returns whatever |parse_byte_stream| returns.  Dies if the in-memory
## handle cannot be opened, instead of failing later with a method call
## on an undefined handle.
sub parse_byte_string ($$$$;$) {
  my $self = shift;
  my $charset_name = shift;

  ## Take a reference to the caller's scalar rather than copying it.
  my $source = ref $_[0] ? $_[0] : \($_[0]);
  open my $input, '<', $source
      or die "parse_byte_string: can't open the byte string as an in-memory handle: $!";

  return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]);
} # parse_byte_string
392    
## parse_byte_stream ($charset_name, $byte_stream, $doc, $onerror, $get_wrapper)
##
## Determines the character encoding of |$byte_stream|, wraps it in a
## decoding handle and hands the result to |parse_char_stream|.  May be
## invoked on an instance or on the class (a new parser is then created).
##
##   $charset_name - encoding label to try first; if undefined, or if no
##                   decode handle can be obtained for it, the encoding
##                   is sniffed (BOM, then UniversalCharDet, then
##                   windows-1252).
##   $byte_stream  - handle read with |getc| and passed to the decoder.
##   $doc          - passed to |parse_char_stream|.
##   $onerror      - error callback; defaults to one that |warn|s.
##   $get_wrapper  - optional callback applied to the decoding handle
##                   before it is parsed; defaults to the identity.
##
## Sets |$self->{confident}|, |$self->{input_encoding}| and
## |$self->{change_encoding}|.  If parsing throws
## Whatpm::HTML::RestartParser, parsing is run a second time.
## Returns the value returned by |parse_char_stream|.
393 wakaba 1.162 sub parse_byte_stream ($$$$;$$) {
394     # my ($self, $charset_name, $byte_stream, $doc, $onerror, $get_wrapper) = @_;
395 wakaba 1.63 my $self = ref $_[0] ? shift : shift->new;
396 wakaba 1.133 my $charset_name = shift;
## From here on @_ is ($byte_stream, $doc, $onerror, $get_wrapper).
397 wakaba 1.138 my $byte_stream = $_[0];
398 wakaba 1.133
399 wakaba 1.134 my $onerror = $_[2] || sub {
400     my (%opt) = @_;
401     warn "Parse error ($opt{type})\n";
402     };
403     $self->{parse_error} = $onerror; # updated later by parse_char_string
404    
405 wakaba 1.162 my $get_wrapper = $_[3] || sub ($) {
406     return $_[0]; # $_[0] = byte stream handle, returned = arg to char handle
407     };
408    
409 wakaba 1.133 ## HTML5 encoding sniffing algorithm
410     require Message::Charset::Info;
411     my $charset;
412 wakaba 1.136 my $buffer;
413     my ($char_stream, $e_status);
414 wakaba 1.133
415     SNIFFING: {
416 wakaba 1.160 ## NOTE: By setting |allow_fallback| option true when the
417     ## |get_decode_handle| method is invoked, we ignore what the HTML5
418     ## spec requires, i.e. unsupported encoding should be ignored.
419     ## TODO: We should not do this unless the parser is invoked
420     ## in the conformance checking mode, in which this behavior
421     ## would be useful.
422 wakaba 1.133
423     ## Step 1
424     if (defined $charset_name) {
425 wakaba 1.161 $charset = Message::Charset::Info->get_by_html_name ($charset_name);
426     ## TODO: Is this ok? Transfer protocol's parameter should be
427     ## interpreted in its semantics?
428 wakaba 1.133
429 wakaba 1.136 ($char_stream, $e_status) = $charset->get_decode_handle
430     ($byte_stream, allow_error_reporting => 1,
431 wakaba 1.133 allow_fallback => 1);
432 wakaba 1.136 if ($char_stream) {
433 wakaba 1.133 $self->{confident} = 1;
434     last SNIFFING;
435 wakaba 1.136 } else {
436 wakaba 1.190 !!!parse-error (type => 'charset:not supported',
437     layer => 'encode',
438     line => 1, column => 1,
439     value => $charset_name,
440     level => $self->{level}->{uncertain});
441 wakaba 1.133 }
442     }
443    
444     ## Step 2
## Read up to 1024 bytes ahead for the BOM check and the detector.
445 wakaba 1.136 my $byte_buffer = '';
446     for (1..1024) {
447     my $char = $byte_stream->getc;
448     last unless defined $char;
449     $byte_buffer .= $char;
450     } ## TODO: timeout
451 wakaba 1.133
452     ## Step 3
## A byte order mark selects UTF-16BE, UTF-16LE or UTF-8 confidently.
453 wakaba 1.136 if ($byte_buffer =~ /^\xFE\xFF/) {
454 wakaba 1.161 $charset = Message::Charset::Info->get_by_html_name ('utf-16be');
455 wakaba 1.136 ($char_stream, $e_status) = $charset->get_decode_handle
456     ($byte_stream, allow_error_reporting => 1,
457     allow_fallback => 1, byte_buffer => \$byte_buffer);
458 wakaba 1.133 $self->{confident} = 1;
459     last SNIFFING;
460 wakaba 1.136 } elsif ($byte_buffer =~ /^\xFF\xFE/) {
461 wakaba 1.161 $charset = Message::Charset::Info->get_by_html_name ('utf-16le');
462 wakaba 1.136 ($char_stream, $e_status) = $charset->get_decode_handle
463     ($byte_stream, allow_error_reporting => 1,
464     allow_fallback => 1, byte_buffer => \$byte_buffer);
465 wakaba 1.133 $self->{confident} = 1;
466     last SNIFFING;
467 wakaba 1.136 } elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) {
468 wakaba 1.161 $charset = Message::Charset::Info->get_by_html_name ('utf-8');
469 wakaba 1.136 ($char_stream, $e_status) = $charset->get_decode_handle
470     ($byte_stream, allow_error_reporting => 1,
471     allow_fallback => 1, byte_buffer => \$byte_buffer);
472 wakaba 1.133 $self->{confident} = 1;
473     last SNIFFING;
474     }
475    
476     ## Step 4
477     ## TODO: <meta charset>
478    
479     ## Step 5
480     ## TODO: from history
481    
482     ## Step 6
483 wakaba 1.65 require Whatpm::Charset::UniversalCharDet;
484 wakaba 1.133 $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string
485 wakaba 1.136 ($byte_buffer);
486 wakaba 1.133 if (defined $charset_name) {
487 wakaba 1.161 $charset = Message::Charset::Info->get_by_html_name ($charset_name);
488 wakaba 1.133
489     ## ISSUE: Unsupported encoding is not ignored according to the spec.
490 wakaba 1.136 require Whatpm::Charset::DecodeHandle;
491     $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
492     ($byte_stream);
493     ($char_stream, $e_status) = $charset->get_decode_handle
494     ($buffer, allow_error_reporting => 1,
495     allow_fallback => 1, byte_buffer => \$byte_buffer);
496     if ($char_stream) {
497     $buffer->{buffer} = $byte_buffer;
498 wakaba 1.153 !!!parse-error (type => 'sniffing:chardet',
499     text => $charset_name,
500     level => $self->{level}->{info},
501     layer => 'encode',
502 wakaba 1.134 line => 1, column => 1);
503 wakaba 1.133 $self->{confident} = 0;
504     last SNIFFING;
505     }
506     }
507    
508     ## Step 7: default
509     ## TODO: Make this configurable.
510 wakaba 1.161 $charset = Message::Charset::Info->get_by_html_name ('windows-1252');
511 wakaba 1.133 ## NOTE: We choose |windows-1252| here, since |utf-8| should be
512     ## detectable in the step 6.
513 wakaba 1.136 require Whatpm::Charset::DecodeHandle;
514     $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
515     ($byte_stream);
516     ($char_stream, $e_status)
517     = $charset->get_decode_handle ($buffer,
518     allow_error_reporting => 1,
519     allow_fallback => 1,
520     byte_buffer => \$byte_buffer);
521     $buffer->{buffer} = $byte_buffer;
522 wakaba 1.153 !!!parse-error (type => 'sniffing:default',
523     text => 'windows-1252',
524     level => $self->{level}->{info},
525     line => 1, column => 1,
526     layer => 'encode');
527 wakaba 1.63 $self->{confident} = 0;
528 wakaba 1.133 } # SNIFFING
529    
## Record the encoding name and report when the decoder is a fallback
## implementation or one without error reporting.
530     if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
531 wakaba 1.160 $self->{input_encoding} = $charset->get_iana_name; ## TODO: Should we set actual charset decoder's encoding name?
532 wakaba 1.153 !!!parse-error (type => 'chardecode:fallback',
533 wakaba 1.160 #text => $self->{input_encoding},
534 wakaba 1.153 level => $self->{level}->{uncertain},
535     line => 1, column => 1,
536     layer => 'encode');
537 wakaba 1.133 } elsif (not ($e_status &
538 wakaba 1.178 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL ())) {
539 wakaba 1.160 $self->{input_encoding} = $charset->get_iana_name;
540 wakaba 1.153 !!!parse-error (type => 'chardecode:no error',
541     text => $self->{input_encoding},
542     level => $self->{level}->{uncertain},
543     line => 1, column => 1,
544     layer => 'encode');
545 wakaba 1.160 } else {
546     $self->{input_encoding} = $charset->get_iana_name;
547 wakaba 1.63 }
548    
## Callback taking ($self, $charset_name, $token).  It obtains a decode
## handle for the new label and, when one is available, either marks
## the current encoding as confident (same name) or throws
## Whatpm::HTML::RestartParser.  It overwrites the enclosing
## |$charset_name|, |$charset|, |$char_stream| and |$e_status|, which
## the restart handler below reads.
## NOTE(review): |$buffer| is only assigned in steps 6 and 7 above; in
## the other paths |$buffer->{buffer}| here autovivifies it - confirm
## that this callback cannot be reached from those paths.
549     $self->{change_encoding} = sub {
550     my $self = shift;
551 wakaba 1.134 $charset_name = shift;
552 wakaba 1.114 my $token = shift;
553 wakaba 1.63
554 wakaba 1.161 $charset = Message::Charset::Info->get_by_html_name ($charset_name);
555 wakaba 1.136 ($char_stream, $e_status) = $charset->get_decode_handle
556     ($byte_stream, allow_error_reporting => 1, allow_fallback => 1,
557     byte_buffer => \ $buffer->{buffer});
558 wakaba 1.134
559 wakaba 1.136 if ($char_stream) { # if supported
560 wakaba 1.134 ## "Change the encoding" algorithm:
561 wakaba 1.63
562 wakaba 1.134 ## Step 1
563 wakaba 1.149 if ($charset->{category} &
564     Message::Charset::Info::CHARSET_CATEGORY_UTF16 ()) {
565 wakaba 1.161 $charset = Message::Charset::Info->get_by_html_name ('utf-8');
566 wakaba 1.136 ($char_stream, $e_status) = $charset->get_decode_handle
567     ($byte_stream,
568     byte_buffer => \ $buffer->{buffer});
569 wakaba 1.134 }
570     $charset_name = $charset->get_iana_name;
571    
572     ## Step 2
573     if (defined $self->{input_encoding} and
574     $self->{input_encoding} eq $charset_name) {
575 wakaba 1.153 !!!parse-error (type => 'charset label:matching',
576     text => $charset_name,
577     level => $self->{level}->{info});
578 wakaba 1.134 $self->{confident} = 1;
579     return;
580     }
581 wakaba 1.63
582 wakaba 1.153 !!!parse-error (type => 'charset label detected',
583     text => $self->{input_encoding},
584     value => $charset_name,
585     level => $self->{level}->{warn},
586     token => $token);
587 wakaba 1.134
588     ## Step 3
589     # if (can) {
590     ## change the encoding on the fly.
591     #$self->{confident} = 1;
592     #return;
593     # }
594    
595     ## Step 4
596     throw Whatpm::HTML::RestartParser ();
597 wakaba 1.63 }
598     }; # $self->{change_encoding}
599    
## Error callback installed on the (wrapped) character stream: reports
## the error and, when |$opt{octets}| is given, overwrites the
## referenced scalar with U+FFFD.
600 wakaba 1.136 my $char_onerror = sub {
601     my (undef, $type, %opt) = @_;
602 wakaba 1.153 !!!parse-error (layer => 'encode',
603 wakaba 1.174 line => $self->{line}, column => $self->{column} + 1,
604     %opt, type => $type);
605 wakaba 1.136 if ($opt{octets}) {
606     ${$opt{octets}} = "\x{FFFD}"; # replacement character
607     }
608     };
609 wakaba 1.162
610     my $wrapped_char_stream = $get_wrapper->($char_stream);
611     $wrapped_char_stream->onerror ($char_onerror);
612 wakaba 1.136
613 wakaba 1.182 my @args = ($_[1], $_[2]); # $doc, $onerror - $get_wrapper = undef;
614 wakaba 1.63 my $return;
615     try {
616 wakaba 1.162 $return = $self->parse_char_stream ($wrapped_char_stream, @args);
617 wakaba 1.63 } catch Whatpm::HTML::RestartParser with {
618 wakaba 1.134 ## NOTE: Invoked after {change_encoding}.
619    
620     if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
621 wakaba 1.160 $self->{input_encoding} = $charset->get_iana_name; ## TODO: Should we set actual charset decoder's encoding name?
622 wakaba 1.153 !!!parse-error (type => 'chardecode:fallback',
623     level => $self->{level}->{uncertain},
624 wakaba 1.160 #text => $self->{input_encoding},
625 wakaba 1.153 line => 1, column => 1,
626     layer => 'encode');
627 wakaba 1.134 } elsif (not ($e_status &
628 wakaba 1.178 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL ())) {
629 wakaba 1.160 $self->{input_encoding} = $charset->get_iana_name;
630 wakaba 1.153 !!!parse-error (type => 'chardecode:no error',
631     text => $self->{input_encoding},
632     level => $self->{level}->{uncertain},
633     line => 1, column => 1,
634     layer => 'encode');
635 wakaba 1.160 } else {
636     $self->{input_encoding} = $charset->get_iana_name;
637 wakaba 1.134 }
638 wakaba 1.63 $self->{confident} = 1;
639 wakaba 1.162
## Wrap the handle chosen by {change_encoding} and parse again.
640     $wrapped_char_stream = $get_wrapper->($char_stream);
641     $wrapped_char_stream->onerror ($char_onerror);
642    
643     $return = $self->parse_char_stream ($wrapped_char_stream, @args);
644 wakaba 1.63 };
645     return $return;
646 wakaba 1.138 } # parse_byte_stream
647 wakaba 1.63
648 wakaba 1.71 ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
649     ## and the HTML layer MUST ignore it. However, we do strip the BOM in
650     ## the encoding layer and the HTML layer does not ignore any U+FEFF,
651     ## because the core part of our HTML parser expects a string of characters,
652     ## not a string of bytes or code units or anything which might contain a BOM.
653     ## Therefore, any parser interface that accepts a string of bytes,
654     ## such as |parse_byte_string| in this module, must ensure that it does
655     ## strip the BOM and never strip any ZWNBSP.
656    
## parse_char_string ($s, $doc, $onerror, $get_wrapper)
##
## Parses a string of characters.  |$s| is either the string itself or
## a reference to it.  The string is wrapped in a
## Whatpm::Charset::DecodeHandle::CharString handle and parsed by
## |parse_char_stream|, which also receives the remaining arguments and
## supplies the return value.
sub parse_char_string ($$$;$$) {
  my $self = shift;

  ## Refer to the caller's scalar instead of copying the string.
  my $string_ref;
  if (ref $_[0]) {
    $string_ref = $_[0];
  } else {
    $string_ref = \($_[0]);
  }

  require Whatpm::Charset::DecodeHandle;
  my $char_handle
      = Whatpm::Charset::DecodeHandle::CharString->new ($string_ref);

  return $self->parse_char_stream ($char_handle, @_[1..$#_]);
} # parse_char_string

## NOTE: |parse_string| is kept as an alias for backward compatibility.
*parse_string = \&parse_char_string;
666 wakaba 1.63
## parse_char_stream ($input, $doc, $onerror, $get_wrapper)
##
## Parses the characters read from |$input| into the document |$doc|.
## May be invoked on an instance or on the class (a new parser is then
## created).
##
##   $input       - character stream handle; read with |read| and
##                  |manakai_read_until|.
##   $doc         - document node; its child node list is emptied first.
##   $onerror     - error callback; defaults to one that |warn|s with
##                  the line and column.
##   $get_wrapper - optional callback; when given, |$input| is replaced
##                  by its return value.
##
## Installs the |set_nc|, |read_until| and |parse_error| callbacks on
## |$self|, runs the tokenizer and tree constructor, and returns
## |$self->{document}|.
667 wakaba 1.182 sub parse_char_stream ($$$;$$) {
668 wakaba 1.63 my $self = ref $_[0] ? shift : shift->new;
669 wakaba 1.135 my $input = $_[0];
670 wakaba 1.1 $self->{document} = $_[1];
671 wakaba 1.63 @{$self->{document}->child_nodes} = ();
672 wakaba 1.1
673 wakaba 1.3 ## NOTE: |set_inner_html| copies most of this method's code
674    
675 wakaba 1.63 $self->{confident} = 1 unless exists $self->{confident};
676 wakaba 1.64 $self->{document}->input_encoding ($self->{input_encoding})
677     if defined $self->{input_encoding};
678 wakaba 1.178 ## TODO: |{input_encoding}| is needless?
679 wakaba 1.63
680 wakaba 1.112 $self->{line_prev} = $self->{line} = 1;
681 wakaba 1.179 $self->{column_prev} = -1;
682     $self->{column} = 0;
## Callback taking the parser as its argument.  Stores the code point
## of the next input character in {nc} (-1 when nothing can be read)
## and maintains {line}, {column} and their |_prev| counterparts.
## CR and CR LF become LF; NULL is reported and becomes U+FFFD.
683 wakaba 1.183 $self->{set_nc} = sub {
684 wakaba 1.1 my $self = shift;
685 wakaba 1.13
686 wakaba 1.178 my $char = '';
687 wakaba 1.183 if (defined $self->{next_nc}) {
## A character read ahead while handling CR is consumed first.
688     $char = $self->{next_nc};
689     delete $self->{next_nc};
690     $self->{nc} = ord $char;
691 wakaba 1.139 } else {
692 wakaba 1.179 $self->{char_buffer} = '';
693     $self->{char_buffer_pos} = 0;
694    
## Refill {char_buffer} with a run of characters other than NULL, LF
## and CR; those three are read one at a time below.
695     my $count = $input->manakai_read_until
696 wakaba 1.182 ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
697 wakaba 1.179 if ($count) {
698     $self->{line_prev} = $self->{line};
699     $self->{column_prev} = $self->{column};
700     $self->{column}++;
701 wakaba 1.183 $self->{nc}
702 wakaba 1.179 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
703     return;
704     }
705    
706 wakaba 1.178 if ($input->read ($char, 1)) {
707 wakaba 1.183 $self->{nc} = ord $char;
708 wakaba 1.178 } else {
709 wakaba 1.183 $self->{nc} = -1;
710 wakaba 1.178 return;
711     }
712 wakaba 1.139 }
713 wakaba 1.112
714     ($self->{line_prev}, $self->{column_prev})
715     = ($self->{line}, $self->{column});
716     $self->{column}++;
717 wakaba 1.1
718 wakaba 1.183 if ($self->{nc} == 0x000A) { # LF
719 wakaba 1.132 !!!cp ('j1');
720 wakaba 1.112 $self->{line}++;
721     $self->{column} = 0;
722 wakaba 1.183 } elsif ($self->{nc} == 0x000D) { # CR
723 wakaba 1.132 !!!cp ('j2');
724 wakaba 1.170 ## TODO: support for abort/streaming
## Read one character ahead: an LF after the CR is dropped, anything
## else is kept in {next_nc} for the next call.
725 wakaba 1.178 my $next = '';
726     if ($input->read ($next, 1) and $next ne "\x0A") {
727 wakaba 1.183 $self->{next_nc} = $next;
728 wakaba 1.135 }
729 wakaba 1.183 $self->{nc} = 0x000A; # LF # MUST
730 wakaba 1.112 $self->{line}++;
731     $self->{column} = 0;
732 wakaba 1.183 } elsif ($self->{nc} == 0x0000) { # NULL
733 wakaba 1.132 !!!cp ('j4');
734 wakaba 1.8 !!!parse-error (type => 'NULL');
735 wakaba 1.183 $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
736 wakaba 1.1 }
737     };
738    
## Callback taking ($scalar, $specials_range, $offset).  Reads a run of
## characters that are neither in |$specials_range| nor NULL, LF or CR
## into |$scalar|, taking them from {char_buffer} while it still holds
## unread characters and from |$input| otherwise.  Returns the number
## of characters read and, when that is non-zero, sets {nc} to -1.
## Returns 0 at once while a read-ahead character is pending.
## NOTE(review): the |$input| branch passes |$_[2]| where the buffer
## branch uses |$offset| (|$_[2] || 0|) - confirm that
## |manakai_read_until| accepts an undefined offset.
## NOTE(review): this closure refers to the enclosing |$self|; only
## {parse_error} is deleted at the end of this method - confirm whether
## the remaining reference loop is intended.
739 wakaba 1.172 $self->{read_until} = sub {
740     #my ($scalar, $specials_range, $offset) = @_;
741 wakaba 1.183 return 0 if defined $self->{next_nc};
742 wakaba 1.180
743 wakaba 1.182 my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
744 wakaba 1.180 my $offset = $_[2] || 0;
745    
746     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
747     pos ($self->{char_buffer}) = $self->{char_buffer_pos};
748     if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
749     substr ($_[0], $offset)
750     = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
751     my $count = $+[0] - $-[0];
752     if ($count) {
753     $self->{column} += $count;
754     $self->{char_buffer_pos} += $count;
755     $self->{line_prev} = $self->{line};
756     $self->{column_prev} = $self->{column} - 1;
757 wakaba 1.183 $self->{nc} = -1;
758 wakaba 1.180 }
759     return $count;
760     } else {
761     return 0;
762     }
763     } else {
764     my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
765     if ($count) {
766     $self->{column} += $count;
767     $self->{line_prev} = $self->{line};
768     $self->{column_prev} = $self->{column} - 1;
769 wakaba 1.183 $self->{nc} = -1;
770 wakaba 1.180 }
771     return $count;
772 wakaba 1.172 }
773     }; # $self->{read_until}
774 wakaba 1.171
775 wakaba 1.3 my $onerror = $_[2] || sub {
776     my (%opt) = @_;
777 wakaba 1.112 my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
778     my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
779     warn "Parse error ($opt{type}) at line $line column $column\n";
780 wakaba 1.3 };
## The current line and column come first, so that values given by the
## caller override them.
781     $self->{parse_error} = sub {
782 wakaba 1.112 $onerror->(line => $self->{line}, column => $self->{column}, @_);
783 wakaba 1.1 };
784    
785 wakaba 1.182 my $char_onerror = sub {
786     my (undef, $type, %opt) = @_;
787     !!!parse-error (layer => 'encode',
788     line => $self->{line}, column => $self->{column} + 1,
789     %opt, type => $type);
790     }; # $char_onerror
791    
## With a wrapper the error callback is always installed; without one
## it is installed only if the handle has none yet.
792     if ($_[3]) {
793     $input = $_[3]->($input);
794     $input->onerror ($char_onerror);
795     } else {
796     $input->onerror ($char_onerror) unless defined $input->onerror;
797     }
798    
799 wakaba 1.1 $self->_initialize_tokenizer;
800     $self->_initialize_tree_constructor;
801     $self->_construct_tree;
802     $self->_terminate_tree_constructor;
803    
804 wakaba 1.112 delete $self->{parse_error}; # remove loop
805    
806 wakaba 1.1 return $self->{document};
807 wakaba 1.135 } # parse_char_stream
808 wakaba 1.1
## new ()
##
## Creates a parser object with the default error level table and
## placeholder callbacks.  Returns the new object.
##
## The default |set_nc| callback operates on the parser passed to it as
## its first argument instead of closing over the new object.  A closure
## capturing |$self| would be stored inside |$self| itself, forming a
## reference loop that keeps every parser alive forever.
sub new ($) {
  my $class = shift;
  my $self = bless {
    level => {must => 'm',
              should => 's',
              warn => 'w',
              info => 'i',
              uncertain => 'u'},
  }, $class;
  $self->{set_nc} = sub {
    ## Same calling convention as the |set_nc| installed by
    ## |parse_char_stream|, which also starts with |my $self = shift|.
    my $parser = shift;
    $parser->{nc} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new
835    
836 wakaba 1.40 sub CM_ENTITY () { 0b001 } # & markup in data
837     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
838     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
839    
840     sub PLAINTEXT_CONTENT_MODEL () { 0 } # no CM_* flag set
841     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited < markup only
842     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # & markup and limited < markup
843     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # & markup and full < markup
844    
845 wakaba 1.57 sub DATA_STATE () { 0 } # first of the tokenizer state numbers compared against $self->{state}
846 wakaba 1.168 #sub ENTITY_DATA_STATE () { 1 } # disabled; see the NOTE before ENTITY_STATE
847 wakaba 1.57 sub TAG_OPEN_STATE () { 2 }
848     sub CLOSE_TAG_OPEN_STATE () { 3 }
849     sub TAG_NAME_STATE () { 4 }
850     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
851     sub ATTRIBUTE_NAME_STATE () { 6 }
852     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
853     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
854     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
855     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
856     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
857 wakaba 1.168 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 } # disabled; see the NOTE before ENTITY_STATE
858 wakaba 1.57 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
859     sub COMMENT_START_STATE () { 14 }
860     sub COMMENT_START_DASH_STATE () { 15 }
861     sub COMMENT_STATE () { 16 }
862     sub COMMENT_END_STATE () { 17 }
863     sub COMMENT_END_DASH_STATE () { 18 }
864     sub BOGUS_COMMENT_STATE () { 19 }
865     sub DOCTYPE_STATE () { 20 }
866     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
867     sub DOCTYPE_NAME_STATE () { 22 }
868     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
869     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
870     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
871     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
872     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
873     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
874     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
875     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
876     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
877     sub BOGUS_DOCTYPE_STATE () { 32 }
878 wakaba 1.72 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
879 wakaba 1.125 sub SELF_CLOSING_START_TAG_STATE () { 34 }
880 wakaba 1.165 sub CDATA_SECTION_STATE () { 35 }
881 wakaba 1.164 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
882     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
883     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
884 wakaba 1.185 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
885 wakaba 1.165 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
886     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
887 wakaba 1.166 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
888     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
889 wakaba 1.168 ## NOTE: "Entity data state", "entity in attribute value state", and
890     ## "consume a character reference" algorithm are jointly implemented
891     ## using the following six states:
892     sub ENTITY_STATE () { 44 }
893     sub ENTITY_HASH_STATE () { 45 }
894     sub NCR_NUM_STATE () { 46 }
895     sub HEXREF_X_STATE () { 47 }
896     sub HEXREF_HEX_STATE () { 48 }
897     sub ENTITY_NAME_STATE () { 49 }
898 wakaba 1.185 sub PCDATA_STATE () { 50 } # "data state" in the spec; used only for the PCDATA content model
899 wakaba 1.57
900 wakaba 1.55 sub DOCTYPE_TOKEN () { 1 } # token type codes, stored in a token's ->{type}
901     sub COMMENT_TOKEN () { 2 }
902     sub START_TAG_TOKEN () { 3 }
903     sub END_TAG_TOKEN () { 4 }
904     sub END_OF_FILE_TOKEN () { 5 }
905     sub CHARACTER_TOKEN () { 6 }
906    
907 wakaba 1.54 sub AFTER_HTML_IMS () { 0b100 } # the *_IMS constants are group bits OR'ed into the *_IM values below
908     sub HEAD_IMS () { 0b1000 }
909     sub BODY_IMS () { 0b10000 }
910 wakaba 1.56 sub BODY_TABLE_IMS () { 0b100000 }
911 wakaba 1.54 sub TABLE_IMS () { 0b1000000 }
912 wakaba 1.56 sub ROW_IMS () { 0b10000000 }
913 wakaba 1.54 sub BODY_AFTER_IMS () { 0b100000000 }
914     sub FRAME_IMS () { 0b1000000000 }
915 wakaba 1.101 sub SELECT_IMS () { 0b10000000000 }
916 wakaba 1.126 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
917     ## NOTE: "in foreign content" insertion mode is special; it is combined
918     ## with the secondary insertion mode. In this parser, they are stored
919     ## together in the bit-or'ed form.
920 wakaba 1.54
921 wakaba 1.84 ## NOTE: "initial" and "before html" insertion modes have no constants.
922    
923     ## NOTE: "after after body" insertion mode.
924 wakaba 1.54 sub AFTER_HTML_BODY_IM () { AFTER_HTML_IMS | BODY_AFTER_IMS }
925 wakaba 1.84
926     ## NOTE: "after after frameset" insertion mode.
927 wakaba 1.54 sub AFTER_HTML_FRAMESET_IM () { AFTER_HTML_IMS | FRAME_IMS }
928 wakaba 1.84
929 wakaba 1.54 sub IN_HEAD_IM () { HEAD_IMS | 0b00 } # low two bits distinguish the modes within one group
930     sub IN_HEAD_NOSCRIPT_IM () { HEAD_IMS | 0b01 }
931     sub AFTER_HEAD_IM () { HEAD_IMS | 0b10 }
932     sub BEFORE_HEAD_IM () { HEAD_IMS | 0b11 }
933     sub IN_BODY_IM () { BODY_IMS }
934 wakaba 1.56 sub IN_CELL_IM () { BODY_IMS | BODY_TABLE_IMS | 0b01 }
935     sub IN_CAPTION_IM () { BODY_IMS | BODY_TABLE_IMS | 0b10 }
936     sub IN_ROW_IM () { TABLE_IMS | ROW_IMS | 0b01 }
937     sub IN_TABLE_BODY_IM () { TABLE_IMS | ROW_IMS | 0b10 }
938 wakaba 1.54 sub IN_TABLE_IM () { TABLE_IMS }
939     sub AFTER_BODY_IM () { BODY_AFTER_IMS }
940     sub IN_FRAMESET_IM () { FRAME_IMS | 0b01 }
941     sub AFTER_FRAMESET_IM () { FRAME_IMS | 0b10 }
942 wakaba 1.101 sub IN_SELECT_IM () { SELECT_IMS | 0b01 }
943     sub IN_SELECT_IN_TABLE_IM () { SELECT_IMS | 0b10 }
944 wakaba 1.54 sub IN_COLUMN_GROUP_IM () { 0b10 } # carries no *_IMS group bit
945    
946 wakaba 1.1 ## Implementations MUST act as if they used the state machine in the spec.
947    
948     sub _initialize_tokenizer ($) { # Resets the tokenizer fields on $self, then invokes the next-input-character macro.
949     my $self = shift;
950 wakaba 1.57 $self->{state} = DATA_STATE; # MUST
951 wakaba 1.183 #$self->{s_kwd}; # state keyword - initialized when used
952 wakaba 1.169 #$self->{entity__value}; # initialized when used
953     #$self->{entity__match}; # initialized when used
954 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
955 wakaba 1.183 undef $self->{ct}; # current token
956     undef $self->{ca}; # current attribute
957     undef $self->{last_stag_name}; # last emitted start tag name
958 wakaba 1.169 #$self->{prev_state}; # initialized when used
959 wakaba 1.125 delete $self->{self_closing}; # clear the "self-closing flag"
960 wakaba 1.179 $self->{char_buffer} = ''; # NOTE(review): presumably buffered input text read by next-input-character -- confirm
961     $self->{char_buffer_pos} = 0; # NOTE(review): presumably the read offset into char_buffer -- confirm
962 wakaba 1.183 $self->{nc} = -1; # next input character
963     #$self->{next_nc}
964 wakaba 1.1 !!!next-input-character;
965     $self->{token} = []; # queue of pushed-back tokens; _get_next_token returns these first
966 wakaba 1.18 # $self->{escape}
967 wakaba 1.1 } # _initialize_tokenizer
968    
969     ## A token has:
970 wakaba 1.55 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
971     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
972     ## ->{name} (DOCTYPE_TOKEN)
973     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
974 wakaba 1.183 ## ->{pubid} (DOCTYPE_TOKEN)
975     ## ->{sysid} (DOCTYPE_TOKEN)
976 wakaba 1.75 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
977 wakaba 1.55 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
978 wakaba 1.66 ## ->{name}
979     ## ->{value}
980     ## ->{has_reference} == 1 or 0
981 wakaba 1.55 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
982 wakaba 1.125 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
983     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
984     ## while the token is pushed back to the stack.
985    
986 wakaba 1.1 ## Emitted token MUST immediately be handled by the tree construction state.
987    
988     ## Before each step, UA MAY check to see if either one of the scripts in
989     ## "list of scripts that will execute as soon as possible" or the first
990     ## script in the "list of scripts that will execute asynchronously",
991     ## has completed loading. If one has, then it MUST be executed
992     ## and removed from the list.
993    
994 wakaba 1.169 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
995     ## (This requirement was dropped from HTML5 spec, unfortunately.)
996 wakaba 1.59
997 wakaba 1.187 my $is_space = { # lookup table keyed by code point (as held in $self->{nc}); true for characters the tokenizer treats as spaces
998     0x0009 => 1, # CHARACTER TABULATION (HT)
999     0x000A => 1, # LINE FEED (LF)
1000     #0x000B => 0, # LINE TABULATION (VT)
1001     0x000C => 1, # FORM FEED (FF)
1002     #0x000D => 1, # CARRIAGE RETURN (CR) -- disabled; NOTE(review): presumably CR never reaches the tokenizer -- confirm
1003     0x0020 => 1, # SPACE (SP)
1004     };
1005    
1006 wakaba 1.1 sub _get_next_token ($) {
1007     my $self = shift;
1008 wakaba 1.125
1009     if ($self->{self_closing}) {
1010 wakaba 1.183 !!!parse-error (type => 'nestc', token => $self->{ct});
1011 wakaba 1.125 ## NOTE: The |self_closing| flag is only set by start tag token.
1012     ## In addition, when a start tag token is emitted, it is always set to
1013 wakaba 1.183 ## |ct|.
1014 wakaba 1.125 delete $self->{self_closing};
1015     }
1016    
1017 wakaba 1.1 if (@{$self->{token}}) {
1018 wakaba 1.125 $self->{self_closing} = $self->{token}->[0]->{self_closing};
1019 wakaba 1.1 return shift @{$self->{token}};
1020     }
1021    
1022     A: {
1023 wakaba 1.185 if ($self->{state} == PCDATA_STATE) {
1024     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
1025    
1026 wakaba 1.183 if ($self->{nc} == 0x0026) { # &
1027 wakaba 1.185 !!!cp (0.1);
1028     ## NOTE: In the spec, the tokenizer is switched to the
1029     ## "entity data state". In this implementation, the tokenizer
1030     ## is switched to the |ENTITY_STATE|, which is an implementation
1031     ## of the "consume a character reference" algorithm.
1032     $self->{entity_add} = -1;
1033     $self->{prev_state} = DATA_STATE;
1034     $self->{state} = ENTITY_STATE;
1035     !!!next-input-character;
1036     redo A;
1037     } elsif ($self->{nc} == 0x003C) { # <
1038     !!!cp (0.2);
1039     $self->{state} = TAG_OPEN_STATE;
1040     !!!next-input-character;
1041     redo A;
1042     } elsif ($self->{nc} == -1) {
1043     !!!cp (0.3);
1044     !!!emit ({type => END_OF_FILE_TOKEN,
1045     line => $self->{line}, column => $self->{column}});
1046     last A; ## TODO: ok?
1047     } else {
1048     !!!cp (0.4);
1049     #
1050     }
1051    
1052     # Anything else
1053     my $token = {type => CHARACTER_TOKEN,
1054     data => chr $self->{nc},
1055     line => $self->{line}, column => $self->{column},
1056     };
1057     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
1058    
1059     ## Stay in the state.
1060     !!!next-input-character;
1061     !!!emit ($token);
1062     redo A;
1063     } elsif ($self->{state} == DATA_STATE) {
1064     $self->{s_kwd} = '' unless defined $self->{s_kwd};
1065     if ($self->{nc} == 0x0026) { # &
1066     $self->{s_kwd} = '';
1067 wakaba 1.72 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
1068     not $self->{escape}) {
1069 wakaba 1.77 !!!cp (1);
1070 wakaba 1.167 ## NOTE: In the spec, the tokenizer is switched to the
1071     ## "entity data state". In this implementation, the tokenizer
1072     ## is switched to the |ENTITY_STATE|, which is an implementation
1073     ## of the "consume a character reference" algorithm.
1074 wakaba 1.183 $self->{entity_add} = -1;
1075 wakaba 1.169 $self->{prev_state} = DATA_STATE;
1076 wakaba 1.167 $self->{state} = ENTITY_STATE;
1077 wakaba 1.1 !!!next-input-character;
1078     redo A;
1079     } else {
1080 wakaba 1.77 !!!cp (2);
1081 wakaba 1.1 #
1082     }
1083 wakaba 1.183 } elsif ($self->{nc} == 0x002D) { # -
1084 wakaba 1.40 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1085 wakaba 1.185 $self->{s_kwd} .= '-';
1086    
1087 wakaba 1.184 if ($self->{s_kwd} eq '<!--') {
1088     !!!cp (3);
1089     $self->{escape} = 1; # unless $self->{escape};
1090     $self->{s_kwd} = '--';
1091     #
1092     } elsif ($self->{s_kwd} eq '---') {
1093     !!!cp (4);
1094     $self->{s_kwd} = '--';
1095     #
1096 wakaba 1.77 } else {
1097     !!!cp (5);
1098 wakaba 1.184 #
1099 wakaba 1.13 }
1100     }
1101    
1102     #
1103 wakaba 1.184 } elsif ($self->{nc} == 0x0021) { # !
1104 wakaba 1.185 if (length $self->{s_kwd}) {
1105 wakaba 1.184 !!!cp (5.1);
1106     $self->{s_kwd} .= '!';
1107     #
1108     } else {
1109     !!!cp (5.2);
1110 wakaba 1.185 #$self->{s_kwd} = '';
1111 wakaba 1.184 #
1112     }
1113     #
1114 wakaba 1.183 } elsif ($self->{nc} == 0x003C) { # <
1115 wakaba 1.40 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
1116     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
1117 wakaba 1.13 not $self->{escape})) {
1118 wakaba 1.77 !!!cp (6);
1119 wakaba 1.57 $self->{state} = TAG_OPEN_STATE;
1120 wakaba 1.1 !!!next-input-character;
1121     redo A;
1122     } else {
1123 wakaba 1.77 !!!cp (7);
1124 wakaba 1.185 $self->{s_kwd} = '';
1125 wakaba 1.1 #
1126     }
1127 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
1128 wakaba 1.13 if ($self->{escape} and
1129 wakaba 1.40 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
1130 wakaba 1.185 if ($self->{s_kwd} eq '--') {
1131 wakaba 1.77 !!!cp (8);
1132 wakaba 1.13 delete $self->{escape};
1133 wakaba 1.77 } else {
1134     !!!cp (9);
1135 wakaba 1.13 }
1136 wakaba 1.77 } else {
1137     !!!cp (10);
1138 wakaba 1.13 }
1139    
1140 wakaba 1.185 $self->{s_kwd} = '';
1141 wakaba 1.13 #
1142 wakaba 1.183 } elsif ($self->{nc} == -1) {
1143 wakaba 1.77 !!!cp (11);
1144 wakaba 1.185 $self->{s_kwd} = '';
1145 wakaba 1.112 !!!emit ({type => END_OF_FILE_TOKEN,
1146     line => $self->{line}, column => $self->{column}});
1147 wakaba 1.1 last A; ## TODO: ok?
1148 wakaba 1.77 } else {
1149     !!!cp (12);
1150 wakaba 1.185 $self->{s_kwd} = '';
1151 wakaba 1.184 #
1152 wakaba 1.1 }
1153 wakaba 1.184
1154 wakaba 1.1 # Anything else
1155 wakaba 1.55 my $token = {type => CHARACTER_TOKEN,
1156 wakaba 1.183 data => chr $self->{nc},
1157 wakaba 1.120 line => $self->{line}, column => $self->{column},
1158 wakaba 1.118 };
1159 wakaba 1.184 if ($self->{read_until}->($token->{data}, q[-!<>&],
1160     length $token->{data})) {
1161 wakaba 1.185 $self->{s_kwd} = '';
1162 wakaba 1.184 }
1163 wakaba 1.171
1164 wakaba 1.185 ## Stay in the data state.
1165     if ($self->{content_model} == PCDATA_CONTENT_MODEL) {
1166     !!!cp (13);
1167     $self->{state} = PCDATA_STATE;
1168     } else {
1169     !!!cp (14);
1170     ## Stay in the state.
1171     }
1172 wakaba 1.1 !!!next-input-character;
1173     !!!emit ($token);
1174     redo A;
1175 wakaba 1.57 } elsif ($self->{state} == TAG_OPEN_STATE) {
1176 wakaba 1.40 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1177 wakaba 1.183 if ($self->{nc} == 0x002F) { # /
1178 wakaba 1.77 !!!cp (15);
1179 wakaba 1.1 !!!next-input-character;
1180 wakaba 1.57 $self->{state} = CLOSE_TAG_OPEN_STATE;
1181 wakaba 1.1 redo A;
1182 wakaba 1.184 } elsif ($self->{nc} == 0x0021) { # !
1183     !!!cp (15.1);
1184     $self->{s_kwd} = '<' unless $self->{escape};
1185     #
1186 wakaba 1.1 } else {
1187 wakaba 1.77 !!!cp (16);
1188 wakaba 1.184 #
1189     }
1190 wakaba 1.1
1191 wakaba 1.184 ## reconsume
1192     $self->{state} = DATA_STATE;
1193     !!!emit ({type => CHARACTER_TOKEN, data => '<',
1194     line => $self->{line_prev},
1195     column => $self->{column_prev},
1196     });
1197     redo A;
1198 wakaba 1.40 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
1199 wakaba 1.183 if ($self->{nc} == 0x0021) { # !
1200 wakaba 1.77 !!!cp (17);
1201 wakaba 1.57 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
1202 wakaba 1.1 !!!next-input-character;
1203     redo A;
1204 wakaba 1.183 } elsif ($self->{nc} == 0x002F) { # /
1205 wakaba 1.77 !!!cp (18);
1206 wakaba 1.57 $self->{state} = CLOSE_TAG_OPEN_STATE;
1207 wakaba 1.1 !!!next-input-character;
1208     redo A;
1209 wakaba 1.183 } elsif (0x0041 <= $self->{nc} and
1210     $self->{nc} <= 0x005A) { # A..Z
1211 wakaba 1.77 !!!cp (19);
1212 wakaba 1.183 $self->{ct}
1213 wakaba 1.55 = {type => START_TAG_TOKEN,
1214 wakaba 1.183 tag_name => chr ($self->{nc} + 0x0020),
1215 wakaba 1.112 line => $self->{line_prev},
1216     column => $self->{column_prev}};
1217 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
1218 wakaba 1.1 !!!next-input-character;
1219     redo A;
1220 wakaba 1.183 } elsif (0x0061 <= $self->{nc} and
1221     $self->{nc} <= 0x007A) { # a..z
1222 wakaba 1.77 !!!cp (20);
1223 wakaba 1.183 $self->{ct} = {type => START_TAG_TOKEN,
1224     tag_name => chr ($self->{nc}),
1225 wakaba 1.112 line => $self->{line_prev},
1226     column => $self->{column_prev}};
1227 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
1228 wakaba 1.1 !!!next-input-character;
1229     redo A;
1230 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
1231 wakaba 1.77 !!!cp (21);
1232 wakaba 1.115 !!!parse-error (type => 'empty start tag',
1233     line => $self->{line_prev},
1234     column => $self->{column_prev});
1235 wakaba 1.57 $self->{state} = DATA_STATE;
1236 wakaba 1.1 !!!next-input-character;
1237    
1238 wakaba 1.112 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
1239 wakaba 1.120 line => $self->{line_prev},
1240     column => $self->{column_prev},
1241 wakaba 1.118 });
1242 wakaba 1.1
1243     redo A;
1244 wakaba 1.183 } elsif ($self->{nc} == 0x003F) { # ?
1245 wakaba 1.77 !!!cp (22);
1246 wakaba 1.115 !!!parse-error (type => 'pio',
1247     line => $self->{line_prev},
1248     column => $self->{column_prev});
1249 wakaba 1.57 $self->{state} = BOGUS_COMMENT_STATE;
1250 wakaba 1.183 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1251 wakaba 1.120 line => $self->{line_prev},
1252     column => $self->{column_prev},
1253 wakaba 1.118 };
1254 wakaba 1.183 ## $self->{nc} is intentionally left as is
1255 wakaba 1.1 redo A;
1256     } else {
1257 wakaba 1.77 !!!cp (23);
1258 wakaba 1.136 !!!parse-error (type => 'bare stago',
1259     line => $self->{line_prev},
1260     column => $self->{column_prev});
1261 wakaba 1.57 $self->{state} = DATA_STATE;
1262 wakaba 1.1 ## reconsume
1263    
1264 wakaba 1.112 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1265 wakaba 1.120 line => $self->{line_prev},
1266     column => $self->{column_prev},
1267 wakaba 1.118 });
1268 wakaba 1.1
1269     redo A;
1270     }
1271     } else {
1272 wakaba 1.40 die "$0: $self->{content_model} in tag open";
1273 wakaba 1.1 }
1274 wakaba 1.57 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
1275 wakaba 1.164 ## NOTE: The "close tag open state" in the spec is implemented as
1276 wakaba 1.185 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
1277 wakaba 1.164
1278 wakaba 1.113 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
1279 wakaba 1.40 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1280 wakaba 1.183 if (defined $self->{last_stag_name}) {
1281 wakaba 1.185 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
1282 wakaba 1.183 $self->{s_kwd} = '';
1283 wakaba 1.164 ## Reconsume.
1284     redo A;
1285 wakaba 1.23 } else {
1286     ## No start tag token has ever been emitted
1287 wakaba 1.164 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
1288 wakaba 1.77 !!!cp (28);
1289 wakaba 1.57 $self->{state} = DATA_STATE;
1290 wakaba 1.164 ## Reconsume.
1291 wakaba 1.112 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1292 wakaba 1.120 line => $l, column => $c,
1293 wakaba 1.118 });
1294 wakaba 1.1 redo A;
1295     }
1296     }
1297 wakaba 1.164
1298 wakaba 1.183 if (0x0041 <= $self->{nc} and
1299     $self->{nc} <= 0x005A) { # A..Z
1300 wakaba 1.77 !!!cp (29);
1301 wakaba 1.183 $self->{ct}
1302 wakaba 1.112 = {type => END_TAG_TOKEN,
1303 wakaba 1.183 tag_name => chr ($self->{nc} + 0x0020),
1304 wakaba 1.112 line => $l, column => $c};
1305 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
1306 wakaba 1.1 !!!next-input-character;
1307     redo A;
1308 wakaba 1.183 } elsif (0x0061 <= $self->{nc} and
1309     $self->{nc} <= 0x007A) { # a..z
1310 wakaba 1.77 !!!cp (30);
1311 wakaba 1.183 $self->{ct} = {type => END_TAG_TOKEN,
1312     tag_name => chr ($self->{nc}),
1313 wakaba 1.112 line => $l, column => $c};
1314 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
1315 wakaba 1.1 !!!next-input-character;
1316     redo A;
1317 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
1318 wakaba 1.77 !!!cp (31);
1319 wakaba 1.115 !!!parse-error (type => 'empty end tag',
1320     line => $self->{line_prev}, ## "<" in "</>"
1321     column => $self->{column_prev} - 1);
1322 wakaba 1.57 $self->{state} = DATA_STATE;
1323 wakaba 1.1 !!!next-input-character;
1324     redo A;
1325 wakaba 1.183 } elsif ($self->{nc} == -1) {
1326 wakaba 1.77 !!!cp (32);
1327 wakaba 1.3 !!!parse-error (type => 'bare etago');
1328 wakaba 1.57 $self->{state} = DATA_STATE;
1329 wakaba 1.1 # reconsume
1330    
1331 wakaba 1.112 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1332 wakaba 1.120 line => $l, column => $c,
1333 wakaba 1.118 });
1334 wakaba 1.1
1335     redo A;
1336     } else {
1337 wakaba 1.77 !!!cp (33);
1338 wakaba 1.3 !!!parse-error (type => 'bogus end tag');
1339 wakaba 1.57 $self->{state} = BOGUS_COMMENT_STATE;
1340 wakaba 1.183 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1341 wakaba 1.120 line => $self->{line_prev}, # "<" of "</"
1342     column => $self->{column_prev} - 1,
1343 wakaba 1.118 };
1344 wakaba 1.183 ## NOTE: $self->{nc} is intentionally left as is.
1345 wakaba 1.164 ## Although the "anything else" case of the spec not explicitly
1346     ## states that the next input character is to be reconsumed,
1347     ## it will be included to the |data| of the comment token
1348     ## generated from the bogus end tag, as defined in the
1349     ## "bogus comment state" entry.
1350     redo A;
1351     }
1352 wakaba 1.185 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
1353 wakaba 1.183 my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
1354 wakaba 1.164 if (length $ch) {
1355     my $CH = $ch;
1356     $ch =~ tr/a-z/A-Z/;
1357 wakaba 1.183 my $nch = chr $self->{nc};
1358 wakaba 1.164 if ($nch eq $ch or $nch eq $CH) {
1359     !!!cp (24);
1360     ## Stay in the state.
1361 wakaba 1.183 $self->{s_kwd} .= $nch;
1362 wakaba 1.164 !!!next-input-character;
1363     redo A;
1364     } else {
1365     !!!cp (25);
1366     $self->{state} = DATA_STATE;
1367     ## Reconsume.
1368     !!!emit ({type => CHARACTER_TOKEN,
1369 wakaba 1.183 data => '</' . $self->{s_kwd},
1370 wakaba 1.164 line => $self->{line_prev},
1371 wakaba 1.183 column => $self->{column_prev} - 1 - length $self->{s_kwd},
1372 wakaba 1.164 });
1373     redo A;
1374     }
1375     } else { # after "<{tag-name}"
1376 wakaba 1.187 unless ($is_space->{$self->{nc}} or
1377     {
1378 wakaba 1.164 0x003E => 1, # >
1379     0x002F => 1, # /
1380     -1 => 1, # EOF
1381 wakaba 1.183 }->{$self->{nc}}) {
1382 wakaba 1.164 !!!cp (26);
1383     ## Reconsume.
1384     $self->{state} = DATA_STATE;
1385     !!!emit ({type => CHARACTER_TOKEN,
1386 wakaba 1.183 data => '</' . $self->{s_kwd},
1387 wakaba 1.164 line => $self->{line_prev},
1388 wakaba 1.183 column => $self->{column_prev} - 1 - length $self->{s_kwd},
1389 wakaba 1.164 });
1390     redo A;
1391     } else {
1392     !!!cp (27);
1393 wakaba 1.183 $self->{ct}
1394 wakaba 1.164 = {type => END_TAG_TOKEN,
1395 wakaba 1.183 tag_name => $self->{last_stag_name},
1396 wakaba 1.164 line => $self->{line_prev},
1397 wakaba 1.183 column => $self->{column_prev} - 1 - length $self->{s_kwd}};
1398 wakaba 1.164 $self->{state} = TAG_NAME_STATE;
1399     ## Reconsume.
1400     redo A;
1401     }
1402 wakaba 1.1 }
1403 wakaba 1.57 } elsif ($self->{state} == TAG_NAME_STATE) {
1404 wakaba 1.187 if ($is_space->{$self->{nc}}) {
1405 wakaba 1.77 !!!cp (34);
1406 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1407 wakaba 1.1 !!!next-input-character;
1408     redo A;
1409 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
1410     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1411 wakaba 1.77 !!!cp (35);
1412 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1413     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1414 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1415 wakaba 1.183 #if ($self->{ct}->{attributes}) {
1416 wakaba 1.78 # ## NOTE: This should never be reached.
1417     # !!! cp (36);
1418     # !!! parse-error (type => 'end tag attribute');
1419     #} else {
1420 wakaba 1.77 !!!cp (37);
1421 wakaba 1.78 #}
1422 wakaba 1.1 } else {
1423 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1424 wakaba 1.1 }
1425 wakaba 1.57 $self->{state} = DATA_STATE;
1426 wakaba 1.1 !!!next-input-character;
1427    
1428 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1429 wakaba 1.1
1430     redo A;
1431 wakaba 1.183 } elsif (0x0041 <= $self->{nc} and
1432     $self->{nc} <= 0x005A) { # A..Z
1433 wakaba 1.77 !!!cp (38);
1434 wakaba 1.183 $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);
1435 wakaba 1.1 # start tag or end tag
1436     ## Stay in this state
1437     !!!next-input-character;
1438     redo A;
1439 wakaba 1.183 } elsif ($self->{nc} == -1) {
1440 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
1441 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1442 wakaba 1.77 !!!cp (39);
1443 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1444     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1445 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1446 wakaba 1.183 #if ($self->{ct}->{attributes}) {
1447 wakaba 1.78 # ## NOTE: This state should never be reached.
1448     # !!! cp (40);
1449     # !!! parse-error (type => 'end tag attribute');
1450     #} else {
1451 wakaba 1.77 !!!cp (41);
1452 wakaba 1.78 #}
1453 wakaba 1.1 } else {
1454 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1455 wakaba 1.1 }
1456 wakaba 1.57 $self->{state} = DATA_STATE;
1457 wakaba 1.1 # reconsume
1458    
1459 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1460 wakaba 1.1
1461     redo A;
1462 wakaba 1.183 } elsif ($self->{nc} == 0x002F) { # /
1463 wakaba 1.125 !!!cp (42);
1464     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1465 wakaba 1.1 !!!next-input-character;
1466     redo A;
1467     } else {
1468 wakaba 1.77 !!!cp (44);
1469 wakaba 1.183 $self->{ct}->{tag_name} .= chr $self->{nc};
1470 wakaba 1.1 # start tag or end tag
1471     ## Stay in the state
1472     !!!next-input-character;
1473     redo A;
1474     }
1475 wakaba 1.57 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1476 wakaba 1.187 if ($is_space->{$self->{nc}}) {
1477 wakaba 1.77 !!!cp (45);
1478 wakaba 1.1 ## Stay in the state
1479     !!!next-input-character;
1480     redo A;
1481 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
1482     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1483 wakaba 1.77 !!!cp (46);
1484 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1485     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1486 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1487 wakaba 1.183 if ($self->{ct}->{attributes}) {
1488 wakaba 1.77 !!!cp (47);
1489 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1490 wakaba 1.77 } else {
1491     !!!cp (48);
1492 wakaba 1.1 }
1493     } else {
1494 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1495 wakaba 1.1 }
1496 wakaba 1.57 $self->{state} = DATA_STATE;
1497 wakaba 1.1 !!!next-input-character;
1498    
1499 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1500 wakaba 1.1
1501     redo A;
1502 wakaba 1.183 } elsif (0x0041 <= $self->{nc} and
1503     $self->{nc} <= 0x005A) { # A..Z
1504 wakaba 1.77 !!!cp (49);
1505 wakaba 1.183 $self->{ca}
1506     = {name => chr ($self->{nc} + 0x0020),
1507 wakaba 1.119 value => '',
1508     line => $self->{line}, column => $self->{column}};
1509 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
1510 wakaba 1.1 !!!next-input-character;
1511     redo A;
1512 wakaba 1.183 } elsif ($self->{nc} == 0x002F) { # /
1513 wakaba 1.125 !!!cp (50);
1514     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1515 wakaba 1.1 !!!next-input-character;
1516     redo A;
1517 wakaba 1.183 } elsif ($self->{nc} == -1) {
1518 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
1519 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1520 wakaba 1.77 !!!cp (52);
1521 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1522     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1523 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1524 wakaba 1.183 if ($self->{ct}->{attributes}) {
1525 wakaba 1.77 !!!cp (53);
1526 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1527 wakaba 1.77 } else {
1528     !!!cp (54);
1529 wakaba 1.1 }
1530     } else {
1531 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1532 wakaba 1.1 }
1533 wakaba 1.57 $self->{state} = DATA_STATE;
1534 wakaba 1.1 # reconsume
1535    
1536 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1537 wakaba 1.1
1538     redo A;
1539     } else {
1540 wakaba 1.72 if ({
1541     0x0022 => 1, # "
1542     0x0027 => 1, # '
1543     0x003D => 1, # =
1544 wakaba 1.183 }->{$self->{nc}}) {
1545 wakaba 1.77 !!!cp (55);
1546 wakaba 1.72 !!!parse-error (type => 'bad attribute name');
1547 wakaba 1.77 } else {
1548     !!!cp (56);
1549 wakaba 1.72 }
1550 wakaba 1.183 $self->{ca}
1551     = {name => chr ($self->{nc}),
1552 wakaba 1.119 value => '',
1553     line => $self->{line}, column => $self->{column}};
1554 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
1555 wakaba 1.1 !!!next-input-character;
1556     redo A;
1557     }
1558 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1559 wakaba 1.1 my $before_leave = sub {
1560 wakaba 1.183 if (exists $self->{ct}->{attributes} # start tag or end tag
1561     ->{$self->{ca}->{name}}) { # MUST
1562 wakaba 1.77 !!!cp (57);
1563 wakaba 1.183 !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1564     ## Discard $self->{ca} # MUST
1565 wakaba 1.1 } else {
1566 wakaba 1.77 !!!cp (58);
1567 wakaba 1.183 $self->{ct}->{attributes}->{$self->{ca}->{name}}
1568     = $self->{ca};
1569 wakaba 1.1 }
1570     }; # $before_leave
1571    
1572 wakaba 1.187 if ($is_space->{$self->{nc}}) {
1573 wakaba 1.77 !!!cp (59);
1574 wakaba 1.1 $before_leave->();
1575 wakaba 1.57 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1576 wakaba 1.1 !!!next-input-character;
1577     redo A;
1578 wakaba 1.183 } elsif ($self->{nc} == 0x003D) { # =
1579 wakaba 1.77 !!!cp (60);
1580 wakaba 1.1 $before_leave->();
1581 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1582 wakaba 1.1 !!!next-input-character;
1583     redo A;
1584 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
1585 wakaba 1.1 $before_leave->();
1586 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1587 wakaba 1.77 !!!cp (61);
1588 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1589     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1590 wakaba 1.77 !!!cp (62);
1591 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1592 wakaba 1.183 if ($self->{ct}->{attributes}) {
1593 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1594 wakaba 1.1 }
1595     } else {
1596 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1597 wakaba 1.1 }
1598 wakaba 1.57 $self->{state} = DATA_STATE;
1599 wakaba 1.1 !!!next-input-character;
1600    
1601 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1602 wakaba 1.1
1603     redo A;
1604 wakaba 1.183 } elsif (0x0041 <= $self->{nc} and
1605     $self->{nc} <= 0x005A) { # A..Z
1606 wakaba 1.77 !!!cp (63);
1607 wakaba 1.183 $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);
1608 wakaba 1.1 ## Stay in the state
1609     !!!next-input-character;
1610     redo A;
1611 wakaba 1.183 } elsif ($self->{nc} == 0x002F) { # /
1612 wakaba 1.125 !!!cp (64);
1613 wakaba 1.1 $before_leave->();
1614 wakaba 1.125 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1615 wakaba 1.1 !!!next-input-character;
1616     redo A;
1617 wakaba 1.183 } elsif ($self->{nc} == -1) {
1618 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
1619 wakaba 1.1 $before_leave->();
1620 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1621 wakaba 1.77 !!!cp (66);
1622 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1623     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1624 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1625 wakaba 1.183 if ($self->{ct}->{attributes}) {
1626 wakaba 1.77 !!!cp (67);
1627 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1628 wakaba 1.77 } else {
1629 wakaba 1.78 ## NOTE: This state should never be reached.
1630 wakaba 1.77 !!!cp (68);
1631 wakaba 1.1 }
1632     } else {
1633 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1634 wakaba 1.1 }
1635 wakaba 1.57 $self->{state} = DATA_STATE;
1636 wakaba 1.1 # reconsume
1637    
1638 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1639 wakaba 1.1
1640     redo A;
1641     } else {
1642 wakaba 1.183 if ($self->{nc} == 0x0022 or # "
1643     $self->{nc} == 0x0027) { # '
1644 wakaba 1.77 !!!cp (69);
1645 wakaba 1.72 !!!parse-error (type => 'bad attribute name');
1646 wakaba 1.77 } else {
1647     !!!cp (70);
1648 wakaba 1.72 }
1649 wakaba 1.183 $self->{ca}->{name} .= chr ($self->{nc});
1650 wakaba 1.1 ## Stay in the state
1651     !!!next-input-character;
1652     redo A;
1653     }
1654 wakaba 1.57 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1655 wakaba 1.187 if ($is_space->{$self->{nc}}) {
1656 wakaba 1.77 !!!cp (71);
1657 wakaba 1.1 ## Stay in the state
1658     !!!next-input-character;
1659     redo A;
1660 wakaba 1.183 } elsif ($self->{nc} == 0x003D) { # =
1661 wakaba 1.77 !!!cp (72);
1662 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1663 wakaba 1.1 !!!next-input-character;
1664     redo A;
1665 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
1666     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1667 wakaba 1.77 !!!cp (73);
1668 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1669     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1670 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1671 wakaba 1.183 if ($self->{ct}->{attributes}) {
1672 wakaba 1.77 !!!cp (74);
1673 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1674 wakaba 1.77 } else {
1675 wakaba 1.78 ## NOTE: This state should never be reached.
1676 wakaba 1.77 !!!cp (75);
1677 wakaba 1.1 }
1678     } else {
1679 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1680 wakaba 1.1 }
1681 wakaba 1.57 $self->{state} = DATA_STATE;
1682 wakaba 1.1 !!!next-input-character;
1683    
1684 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1685 wakaba 1.1
1686     redo A;
1687 wakaba 1.183 } elsif (0x0041 <= $self->{nc} and
1688     $self->{nc} <= 0x005A) { # A..Z
1689 wakaba 1.77 !!!cp (76);
1690 wakaba 1.183 $self->{ca}
1691     = {name => chr ($self->{nc} + 0x0020),
1692 wakaba 1.119 value => '',
1693     line => $self->{line}, column => $self->{column}};
1694 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
1695 wakaba 1.1 !!!next-input-character;
1696     redo A;
1697 wakaba 1.183 } elsif ($self->{nc} == 0x002F) { # /
1698 wakaba 1.125 !!!cp (77);
1699     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1700 wakaba 1.1 !!!next-input-character;
1701     redo A;
1702 wakaba 1.183 } elsif ($self->{nc} == -1) {
1703 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
1704 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1705 wakaba 1.77 !!!cp (79);
1706 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1707     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1708 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1709 wakaba 1.183 if ($self->{ct}->{attributes}) {
1710 wakaba 1.77 !!!cp (80);
1711 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1712 wakaba 1.77 } else {
1713 wakaba 1.78 ## NOTE: This state should never be reached.
1714 wakaba 1.77 !!!cp (81);
1715 wakaba 1.1 }
1716     } else {
1717 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1718 wakaba 1.1 }
1719 wakaba 1.57 $self->{state} = DATA_STATE;
1720 wakaba 1.1 # reconsume
1721    
1722 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1723 wakaba 1.1
1724     redo A;
1725     } else {
1726 wakaba 1.183 if ($self->{nc} == 0x0022 or # "
1727     $self->{nc} == 0x0027) { # '
1728 wakaba 1.156 !!!cp (78);
1729     !!!parse-error (type => 'bad attribute name');
1730     } else {
1731     !!!cp (82);
1732     }
1733 wakaba 1.183 $self->{ca}
1734     = {name => chr ($self->{nc}),
1735 wakaba 1.119 value => '',
1736     line => $self->{line}, column => $self->{column}};
1737 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
1738 wakaba 1.1 !!!next-input-character;
1739     redo A;
1740     }
1741 wakaba 1.57 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1742 wakaba 1.187 if ($is_space->{$self->{nc}}) {
1743 wakaba 1.77 !!!cp (83);
1744 wakaba 1.1 ## Stay in the state
1745     !!!next-input-character;
1746     redo A;
1747 wakaba 1.183 } elsif ($self->{nc} == 0x0022) { # "
1748 wakaba 1.77 !!!cp (84);
1749 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1750 wakaba 1.1 !!!next-input-character;
1751     redo A;
1752 wakaba 1.183 } elsif ($self->{nc} == 0x0026) { # &
1753 wakaba 1.77 !!!cp (85);
1754 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1755 wakaba 1.1 ## reconsume
1756     redo A;
1757 wakaba 1.183 } elsif ($self->{nc} == 0x0027) { # '
1758 wakaba 1.77 !!!cp (86);
1759 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1760 wakaba 1.1 !!!next-input-character;
1761     redo A;
1762 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
1763 wakaba 1.156 !!!parse-error (type => 'empty unquoted attribute value');
1764 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1765 wakaba 1.77 !!!cp (87);
1766 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1767     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1768 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1769 wakaba 1.183 if ($self->{ct}->{attributes}) {
1770 wakaba 1.77 !!!cp (88);
1771 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1772 wakaba 1.77 } else {
1773 wakaba 1.78 ## NOTE: This state should never be reached.
1774 wakaba 1.77 !!!cp (89);
1775 wakaba 1.1 }
1776     } else {
1777 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1778 wakaba 1.1 }
1779 wakaba 1.57 $self->{state} = DATA_STATE;
1780 wakaba 1.1 !!!next-input-character;
1781    
1782 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1783 wakaba 1.1
1784     redo A;
1785 wakaba 1.183 } elsif ($self->{nc} == -1) {
1786 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
1787 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1788 wakaba 1.77 !!!cp (90);
1789 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1790     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1791 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1792 wakaba 1.183 if ($self->{ct}->{attributes}) {
1793 wakaba 1.77 !!!cp (91);
1794 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1795 wakaba 1.77 } else {
1796 wakaba 1.78 ## NOTE: This state should never be reached.
1797 wakaba 1.77 !!!cp (92);
1798 wakaba 1.1 }
1799     } else {
1800 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1801 wakaba 1.1 }
1802 wakaba 1.57 $self->{state} = DATA_STATE;
1803 wakaba 1.1 ## reconsume
1804    
1805 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1806 wakaba 1.1
1807     redo A;
1808     } else {
1809 wakaba 1.183 if ($self->{nc} == 0x003D) { # =
1810 wakaba 1.77 !!!cp (93);
1811 wakaba 1.72 !!!parse-error (type => 'bad attribute value');
1812 wakaba 1.77 } else {
1813     !!!cp (94);
1814 wakaba 1.72 }
1815 wakaba 1.183 $self->{ca}->{value} .= chr ($self->{nc});
1816 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1817 wakaba 1.1 !!!next-input-character;
1818     redo A;
1819     }
1820 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1821 wakaba 1.183 if ($self->{nc} == 0x0022) { # "
1822 wakaba 1.77 !!!cp (95);
1823 wakaba 1.72 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1824 wakaba 1.1 !!!next-input-character;
1825     redo A;
1826 wakaba 1.183 } elsif ($self->{nc} == 0x0026) { # &
1827 wakaba 1.77 !!!cp (96);
1828 wakaba 1.167 ## NOTE: In the spec, the tokenizer is switched to the
1829     ## "entity in attribute value state". In this implementation, the
1830     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1831     ## implementation of the "consume a character reference" algorithm.
1832 wakaba 1.169 $self->{prev_state} = $self->{state};
1833 wakaba 1.183 $self->{entity_add} = 0x0022; # "
1834 wakaba 1.167 $self->{state} = ENTITY_STATE;
1835 wakaba 1.1 !!!next-input-character;
1836     redo A;
1837 wakaba 1.183 } elsif ($self->{nc} == -1) {
1838 wakaba 1.3 !!!parse-error (type => 'unclosed attribute value');
1839 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1840 wakaba 1.77 !!!cp (97);
1841 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1842     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1843 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1844 wakaba 1.183 if ($self->{ct}->{attributes}) {
1845 wakaba 1.77 !!!cp (98);
1846 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1847 wakaba 1.77 } else {
1848 wakaba 1.78 ## NOTE: This state should never be reached.
1849 wakaba 1.77 !!!cp (99);
1850 wakaba 1.1 }
1851     } else {
1852 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1853 wakaba 1.1 }
1854 wakaba 1.57 $self->{state} = DATA_STATE;
1855 wakaba 1.1 ## reconsume
1856    
1857 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1858 wakaba 1.1
1859     redo A;
1860     } else {
1861 wakaba 1.77 !!!cp (100);
1862 wakaba 1.183 $self->{ca}->{value} .= chr ($self->{nc});
1863     $self->{read_until}->($self->{ca}->{value},
1864 wakaba 1.173 q["&],
1865 wakaba 1.183 length $self->{ca}->{value});
1866 wakaba 1.173
1867 wakaba 1.1 ## Stay in the state
1868     !!!next-input-character;
1869     redo A;
1870     }
1871 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1872 wakaba 1.183 if ($self->{nc} == 0x0027) { # '
1873 wakaba 1.77 !!!cp (101);
1874 wakaba 1.72 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1875 wakaba 1.1 !!!next-input-character;
1876     redo A;
1877 wakaba 1.183 } elsif ($self->{nc} == 0x0026) { # &
1878 wakaba 1.77 !!!cp (102);
1879 wakaba 1.167 ## NOTE: In the spec, the tokenizer is switched to the
1880     ## "entity in attribute value state". In this implementation, the
1881     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1882     ## implementation of the "consume a character reference" algorithm.
1883 wakaba 1.183 $self->{entity_add} = 0x0027; # '
1884 wakaba 1.169 $self->{prev_state} = $self->{state};
1885 wakaba 1.167 $self->{state} = ENTITY_STATE;
1886 wakaba 1.1 !!!next-input-character;
1887     redo A;
1888 wakaba 1.183 } elsif ($self->{nc} == -1) {
1889 wakaba 1.3 !!!parse-error (type => 'unclosed attribute value');
1890 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1891 wakaba 1.77 !!!cp (103);
1892 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1893     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1894 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1895 wakaba 1.183 if ($self->{ct}->{attributes}) {
1896 wakaba 1.77 !!!cp (104);
1897 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1898 wakaba 1.77 } else {
1899 wakaba 1.78 ## NOTE: This state should never be reached.
1900 wakaba 1.77 !!!cp (105);
1901 wakaba 1.1 }
1902     } else {
1903 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1904 wakaba 1.1 }
1905 wakaba 1.57 $self->{state} = DATA_STATE;
1906 wakaba 1.1 ## reconsume
1907    
1908 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1909 wakaba 1.1
1910     redo A;
1911     } else {
1912 wakaba 1.77 !!!cp (106);
1913 wakaba 1.183 $self->{ca}->{value} .= chr ($self->{nc});
1914     $self->{read_until}->($self->{ca}->{value},
1915 wakaba 1.173 q['&],
1916 wakaba 1.183 length $self->{ca}->{value});
1917 wakaba 1.173
1918 wakaba 1.1 ## Stay in the state
1919     !!!next-input-character;
1920     redo A;
1921     }
1922 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1923 wakaba 1.187 if ($is_space->{$self->{nc}}) {
1924 wakaba 1.77 !!!cp (107);
1925 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1926 wakaba 1.1 !!!next-input-character;
1927     redo A;
1928 wakaba 1.183 } elsif ($self->{nc} == 0x0026) { # &
1929 wakaba 1.77 !!!cp (108);
1930 wakaba 1.167 ## NOTE: In the spec, the tokenizer is switched to the
1931     ## "entity in attribute value state". In this implementation, the
1932     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1933     ## implementation of the "consume a character reference" algorithm.
1934 wakaba 1.183 $self->{entity_add} = -1;
1935 wakaba 1.169 $self->{prev_state} = $self->{state};
1936 wakaba 1.167 $self->{state} = ENTITY_STATE;
1937 wakaba 1.1 !!!next-input-character;
1938     redo A;
1939 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
1940     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1941 wakaba 1.77 !!!cp (109);
1942 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1943     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1944 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1945 wakaba 1.183 if ($self->{ct}->{attributes}) {
1946 wakaba 1.77 !!!cp (110);
1947 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1948 wakaba 1.77 } else {
1949 wakaba 1.78 ## NOTE: This state should never be reached.
1950 wakaba 1.77 !!!cp (111);
1951 wakaba 1.1 }
1952     } else {
1953 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1954 wakaba 1.1 }
1955 wakaba 1.57 $self->{state} = DATA_STATE;
1956 wakaba 1.1 !!!next-input-character;
1957    
1958 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1959 wakaba 1.1
1960     redo A;
1961 wakaba 1.183 } elsif ($self->{nc} == -1) {
1962 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
1963 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1964 wakaba 1.77 !!!cp (112);
1965 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1966     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1967 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1968 wakaba 1.183 if ($self->{ct}->{attributes}) {
1969 wakaba 1.77 !!!cp (113);
1970 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1971 wakaba 1.77 } else {
1972 wakaba 1.78 ## NOTE: This state should never be reached.
1973 wakaba 1.77 !!!cp (114);
1974 wakaba 1.1 }
1975     } else {
1976 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1977 wakaba 1.1 }
1978 wakaba 1.57 $self->{state} = DATA_STATE;
1979 wakaba 1.1 ## reconsume
1980    
1981 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1982 wakaba 1.1
1983     redo A;
1984     } else {
1985 wakaba 1.72 if ({
1986     0x0022 => 1, # "
1987     0x0027 => 1, # '
1988     0x003D => 1, # =
1989 wakaba 1.183 }->{$self->{nc}}) {
1990 wakaba 1.77 !!!cp (115);
1991 wakaba 1.72 !!!parse-error (type => 'bad attribute value');
1992 wakaba 1.77 } else {
1993     !!!cp (116);
1994 wakaba 1.72 }
1995 wakaba 1.183 $self->{ca}->{value} .= chr ($self->{nc});
1996     $self->{read_until}->($self->{ca}->{value},
1997 wakaba 1.173 q["'=& >],
1998 wakaba 1.183 length $self->{ca}->{value});
1999 wakaba 1.173
2000 wakaba 1.1 ## Stay in the state
2001     !!!next-input-character;
2002     redo A;
2003     }
2004 wakaba 1.72 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2005 wakaba 1.187 if ($is_space->{$self->{nc}}) {
2006 wakaba 1.77 !!!cp (118);
2007 wakaba 1.72 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2008     !!!next-input-character;
2009     redo A;
2010 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2011     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2012 wakaba 1.77 !!!cp (119);
2013 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
2014     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2015 wakaba 1.72 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2016 wakaba 1.183 if ($self->{ct}->{attributes}) {
2017 wakaba 1.77 !!!cp (120);
2018 wakaba 1.72 !!!parse-error (type => 'end tag attribute');
2019 wakaba 1.77 } else {
2020 wakaba 1.78 ## NOTE: This state should never be reached.
2021 wakaba 1.77 !!!cp (121);
2022 wakaba 1.72 }
2023     } else {
2024 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
2025 wakaba 1.72 }
2026     $self->{state} = DATA_STATE;
2027     !!!next-input-character;
2028    
2029 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
2030 wakaba 1.72
2031     redo A;
2032 wakaba 1.183 } elsif ($self->{nc} == 0x002F) { # /
2033 wakaba 1.125 !!!cp (122);
2034     $self->{state} = SELF_CLOSING_START_TAG_STATE;
2035 wakaba 1.72 !!!next-input-character;
2036 wakaba 1.125 redo A;
2037 wakaba 1.183 } elsif ($self->{nc} == -1) {
2038 wakaba 1.141 !!!parse-error (type => 'unclosed tag');
2039 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2040 wakaba 1.141 !!!cp (122.3);
2041 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
2042     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2043     if ($self->{ct}->{attributes}) {
2044 wakaba 1.141 !!!cp (122.1);
2045     !!!parse-error (type => 'end tag attribute');
2046     } else {
2047     ## NOTE: This state should never be reached.
2048     !!!cp (122.2);
2049     }
2050     } else {
2051 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
2052 wakaba 1.141 }
2053     $self->{state} = DATA_STATE;
2054     ## Reconsume.
2055 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
2056 wakaba 1.141 redo A;
2057 wakaba 1.125 } else {
2058     !!!cp ('124.1');
2059     !!!parse-error (type => 'no space between attributes');
2060     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2061     ## reconsume
2062     redo A;
2063     }
2064     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2065 wakaba 1.183 if ($self->{nc} == 0x003E) { # >
2066     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2067 wakaba 1.125 !!!cp ('124.2');
2068 wakaba 1.183 !!!parse-error (type => 'nestc', token => $self->{ct});
2069 wakaba 1.125 ## TODO: Different type than slash in start tag
2070     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2071 wakaba 1.183 if ($self->{ct}->{attributes}) {
2072 wakaba 1.125 !!!cp ('124.4');
2073     !!!parse-error (type => 'end tag attribute');
2074     } else {
2075     !!!cp ('124.5');
2076     }
2077     ## TODO: Test |<title></title/>|
2078 wakaba 1.72 } else {
2079 wakaba 1.125 !!!cp ('124.3');
2080     $self->{self_closing} = 1;
2081 wakaba 1.72 }
2082 wakaba 1.125
2083     $self->{state} = DATA_STATE;
2084     !!!next-input-character;
2085    
2086 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
2087 wakaba 1.125
2088 wakaba 1.72 redo A;
2089 wakaba 1.183 } elsif ($self->{nc} == -1) {
2090 wakaba 1.141 !!!parse-error (type => 'unclosed tag');
2091 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2092 wakaba 1.141 !!!cp (124.7);
2093 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
2094     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2095     if ($self->{ct}->{attributes}) {
2096 wakaba 1.141 !!!cp (124.5);
2097     !!!parse-error (type => 'end tag attribute');
2098     } else {
2099     ## NOTE: This state should never be reached.
2100     !!!cp (124.6);
2101     }
2102     } else {
2103 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
2104 wakaba 1.141 }
2105     $self->{state} = DATA_STATE;
2106     ## Reconsume.
2107 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
2108 wakaba 1.141 redo A;
2109 wakaba 1.72 } else {
2110 wakaba 1.125 !!!cp ('124.4');
2111     !!!parse-error (type => 'nestc');
2112     ## TODO: This error type is wrong.
2113 wakaba 1.72 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2114 wakaba 1.125 ## Reconsume.
2115 wakaba 1.72 redo A;
2116     }
2117 wakaba 1.57 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2118 wakaba 1.1 ## (this only happens in the PCDATA state)
2119 wakaba 1.167
2120     ## NOTE: Unlike the spec's "bogus comment state", this implementation
2121     ## consumes characters on a one-by-one basis.
2122 wakaba 1.1
2123 wakaba 1.183 if ($self->{nc} == 0x003E) { # >
2124 wakaba 1.167 !!!cp (124);
2125     $self->{state} = DATA_STATE;
2126     !!!next-input-character;
2127 wakaba 1.1
2128 wakaba 1.183 !!!emit ($self->{ct}); # comment
2129 wakaba 1.167 redo A;
2130 wakaba 1.183 } elsif ($self->{nc} == -1) {
2131 wakaba 1.167 !!!cp (125);
2132     $self->{state} = DATA_STATE;
2133     ## reconsume
2134 wakaba 1.1
2135 wakaba 1.183 !!!emit ($self->{ct}); # comment
2136 wakaba 1.167 redo A;
2137     } else {
2138     !!!cp (126);
2139 wakaba 1.183 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2140     $self->{read_until}->($self->{ct}->{data},
2141 wakaba 1.173 q[>],
2142 wakaba 1.183 length $self->{ct}->{data});
2143 wakaba 1.173
2144 wakaba 1.167 ## Stay in the state.
2145     !!!next-input-character;
2146     redo A;
2147     }
2148 wakaba 1.57 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2149 wakaba 1.1 ## (this only happens in the PCDATA state)
2150    
2151 wakaba 1.183 if ($self->{nc} == 0x002D) { # -
2152 wakaba 1.163 !!!cp (133);
2153     $self->{state} = MD_HYPHEN_STATE;
2154 wakaba 1.1 !!!next-input-character;
2155 wakaba 1.163 redo A;
2156 wakaba 1.183 } elsif ($self->{nc} == 0x0044 or # D
2157     $self->{nc} == 0x0064) { # d
2158 wakaba 1.163 ## ASCII case-insensitive.
2159     !!!cp (130);
2160     $self->{state} = MD_DOCTYPE_STATE;
2161 wakaba 1.183 $self->{s_kwd} = chr $self->{nc};
2162 wakaba 1.1 !!!next-input-character;
2163 wakaba 1.163 redo A;
2164 wakaba 1.127 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2165     $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
2166 wakaba 1.183 $self->{nc} == 0x005B) { # [
2167 wakaba 1.163 !!!cp (135.4);
2168     $self->{state} = MD_CDATA_STATE;
2169 wakaba 1.183 $self->{s_kwd} = '[';
2170 wakaba 1.127 !!!next-input-character;
2171 wakaba 1.163 redo A;
2172 wakaba 1.77 } else {
2173     !!!cp (136);
2174 wakaba 1.1 }
2175    
2176 wakaba 1.163 !!!parse-error (type => 'bogus comment',
2177     line => $self->{line_prev},
2178     column => $self->{column_prev} - 1);
2179     ## Reconsume.
2180 wakaba 1.57 $self->{state} = BOGUS_COMMENT_STATE;
2181 wakaba 1.183 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2182 wakaba 1.163 line => $self->{line_prev},
2183     column => $self->{column_prev} - 1,
2184 wakaba 1.118 };
2185 wakaba 1.1 redo A;
2186 wakaba 1.163 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2187 wakaba 1.183 if ($self->{nc} == 0x002D) { # -
2188 wakaba 1.163 !!!cp (127);
2189 wakaba 1.183 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2190 wakaba 1.163 line => $self->{line_prev},
2191     column => $self->{column_prev} - 2,
2192     };
2193     $self->{state} = COMMENT_START_STATE;
2194     !!!next-input-character;
2195     redo A;
2196     } else {
2197     !!!cp (128);
2198     !!!parse-error (type => 'bogus comment',
2199     line => $self->{line_prev},
2200     column => $self->{column_prev} - 2);
2201     $self->{state} = BOGUS_COMMENT_STATE;
2202     ## Reconsume.
2203 wakaba 1.183 $self->{ct} = {type => COMMENT_TOKEN,
2204 wakaba 1.163 data => '-',
2205     line => $self->{line_prev},
2206     column => $self->{column_prev} - 2,
2207     };
2208     redo A;
2209     }
2210     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2211     ## ASCII case-insensitive.
2212 wakaba 1.183 if ($self->{nc} == [
2213 wakaba 1.163 undef,
2214     0x004F, # O
2215     0x0043, # C
2216     0x0054, # T
2217     0x0059, # Y
2218     0x0050, # P
2219 wakaba 1.183 ]->[length $self->{s_kwd}] or
2220     $self->{nc} == [
2221 wakaba 1.163 undef,
2222     0x006F, # o
2223     0x0063, # c
2224     0x0074, # t
2225     0x0079, # y
2226     0x0070, # p
2227 wakaba 1.183 ]->[length $self->{s_kwd}]) {
2228 wakaba 1.163 !!!cp (131);
2229     ## Stay in the state.
2230 wakaba 1.183 $self->{s_kwd} .= chr $self->{nc};
2231 wakaba 1.163 !!!next-input-character;
2232     redo A;
2233 wakaba 1.183 } elsif ((length $self->{s_kwd}) == 6 and
2234     ($self->{nc} == 0x0045 or # E
2235     $self->{nc} == 0x0065)) { # e
2236 wakaba 1.163 !!!cp (129);
2237     $self->{state} = DOCTYPE_STATE;
2238 wakaba 1.183 $self->{ct} = {type => DOCTYPE_TOKEN,
2239 wakaba 1.163 quirks => 1,
2240     line => $self->{line_prev},
2241     column => $self->{column_prev} - 7,
2242     };
2243     !!!next-input-character;
2244     redo A;
2245     } else {
2246     !!!cp (132);
2247     !!!parse-error (type => 'bogus comment',
2248     line => $self->{line_prev},
2249 wakaba 1.183 column => $self->{column_prev} - 1 - length $self->{s_kwd});
2250 wakaba 1.163 $self->{state} = BOGUS_COMMENT_STATE;
2251     ## Reconsume.
2252 wakaba 1.183 $self->{ct} = {type => COMMENT_TOKEN,
2253     data => $self->{s_kwd},
2254 wakaba 1.163 line => $self->{line_prev},
2255 wakaba 1.183 column => $self->{column_prev} - 1 - length $self->{s_kwd},
2256 wakaba 1.163 };
2257     redo A;
2258     }
2259     } elsif ($self->{state} == MD_CDATA_STATE) {
2260 wakaba 1.183 if ($self->{nc} == {
2261 wakaba 1.163 '[' => 0x0043, # C
2262     '[C' => 0x0044, # D
2263     '[CD' => 0x0041, # A
2264     '[CDA' => 0x0054, # T
2265     '[CDAT' => 0x0041, # A
2266 wakaba 1.183 }->{$self->{s_kwd}}) {
2267 wakaba 1.163 !!!cp (135.1);
2268     ## Stay in the state.
2269 wakaba 1.183 $self->{s_kwd} .= chr $self->{nc};
2270 wakaba 1.163 !!!next-input-character;
2271     redo A;
2272 wakaba 1.183 } elsif ($self->{s_kwd} eq '[CDATA' and
2273     $self->{nc} == 0x005B) { # [
2274 wakaba 1.163 !!!cp (135.2);
2275 wakaba 1.183 $self->{ct} = {type => CHARACTER_TOKEN,
2276 wakaba 1.165 data => '',
2277     line => $self->{line_prev},
2278     column => $self->{column_prev} - 7};
2279     $self->{state} = CDATA_SECTION_STATE;
2280 wakaba 1.163 !!!next-input-character;
2281     redo A;
2282     } else {
2283     !!!cp (135.3);
2284     !!!parse-error (type => 'bogus comment',
2285     line => $self->{line_prev},
2286 wakaba 1.183 column => $self->{column_prev} - 1 - length $self->{s_kwd});
2287 wakaba 1.163 $self->{state} = BOGUS_COMMENT_STATE;
2288     ## Reconsume.
2289 wakaba 1.183 $self->{ct} = {type => COMMENT_TOKEN,
2290     data => $self->{s_kwd},
2291 wakaba 1.163 line => $self->{line_prev},
2292 wakaba 1.183 column => $self->{column_prev} - 1 - length $self->{s_kwd},
2293 wakaba 1.163 };
2294     redo A;
2295     }
2296 wakaba 1.57 } elsif ($self->{state} == COMMENT_START_STATE) {
2297 wakaba 1.183 if ($self->{nc} == 0x002D) { # -
2298 wakaba 1.77 !!!cp (137);
2299 wakaba 1.57 $self->{state} = COMMENT_START_DASH_STATE;
2300 wakaba 1.23 !!!next-input-character;
2301     redo A;
2302 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2303 wakaba 1.77 !!!cp (138);
2304 wakaba 1.23 !!!parse-error (type => 'bogus comment');
2305 wakaba 1.57 $self->{state} = DATA_STATE;
2306 wakaba 1.23 !!!next-input-character;
2307    
2308 wakaba 1.183 !!!emit ($self->{ct}); # comment
2309 wakaba 1.23
2310     redo A;
2311 wakaba 1.183 } elsif ($self->{nc} == -1) {
2312 wakaba 1.77 !!!cp (139);
2313 wakaba 1.23 !!!parse-error (type => 'unclosed comment');
2314 wakaba 1.57 $self->{state} = DATA_STATE;
2315 wakaba 1.23 ## reconsume
2316    
2317 wakaba 1.183 !!!emit ($self->{ct}); # comment
2318 wakaba 1.23
2319     redo A;
2320     } else {
2321 wakaba 1.77 !!!cp (140);
2322 wakaba 1.183 $self->{ct}->{data} # comment
2323     .= chr ($self->{nc});
2324 wakaba 1.57 $self->{state} = COMMENT_STATE;
2325 wakaba 1.23 !!!next-input-character;
2326     redo A;
2327     }
2328 wakaba 1.57 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2329 wakaba 1.183 if ($self->{nc} == 0x002D) { # -
2330 wakaba 1.77 !!!cp (141);
2331 wakaba 1.57 $self->{state} = COMMENT_END_STATE;
2332 wakaba 1.23 !!!next-input-character;
2333     redo A;
2334 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2335 wakaba 1.77 !!!cp (142);
2336 wakaba 1.23 !!!parse-error (type => 'bogus comment');
2337 wakaba 1.57 $self->{state} = DATA_STATE;
2338 wakaba 1.23 !!!next-input-character;
2339    
2340 wakaba 1.183 !!!emit ($self->{ct}); # comment
2341 wakaba 1.23
2342     redo A;
2343 wakaba 1.183 } elsif ($self->{nc} == -1) {
2344 wakaba 1.77 !!!cp (143);
2345 wakaba 1.23 !!!parse-error (type => 'unclosed comment');
2346 wakaba 1.57 $self->{state} = DATA_STATE;
2347 wakaba 1.23 ## reconsume
2348    
2349 wakaba 1.183 !!!emit ($self->{ct}); # comment
2350 wakaba 1.23
2351     redo A;
2352     } else {
2353 wakaba 1.77 !!!cp (144);
2354 wakaba 1.183 $self->{ct}->{data} # comment
2355     .= '-' . chr ($self->{nc});
2356 wakaba 1.57 $self->{state} = COMMENT_STATE;
2357 wakaba 1.23 !!!next-input-character;
2358     redo A;
2359     }
2360 wakaba 1.57 } elsif ($self->{state} == COMMENT_STATE) {
2361 wakaba 1.183 if ($self->{nc} == 0x002D) { # -
2362 wakaba 1.77 !!!cp (145);
2363 wakaba 1.57 $self->{state} = COMMENT_END_DASH_STATE;
2364 wakaba 1.1 !!!next-input-character;
2365     redo A;
2366 wakaba 1.183 } elsif ($self->{nc} == -1) {
2367 wakaba 1.77 !!!cp (146);
2368 wakaba 1.3 !!!parse-error (type => 'unclosed comment');
2369 wakaba 1.57 $self->{state} = DATA_STATE;
2370 wakaba 1.1 ## reconsume
2371    
2372 wakaba 1.183 !!!emit ($self->{ct}); # comment
2373 wakaba 1.1
2374     redo A;
2375     } else {
2376 wakaba 1.77 !!!cp (147);
2377 wakaba 1.183 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2378     $self->{read_until}->($self->{ct}->{data},
2379 wakaba 1.173 q[-],
2380 wakaba 1.183 length $self->{ct}->{data});
2381 wakaba 1.173
2382 wakaba 1.1 ## Stay in the state
2383     !!!next-input-character;
2384     redo A;
2385     }
2386 wakaba 1.57 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2387 wakaba 1.183 if ($self->{nc} == 0x002D) { # -
2388 wakaba 1.77 !!!cp (148);
2389 wakaba 1.57 $self->{state} = COMMENT_END_STATE;
2390 wakaba 1.1 !!!next-input-character;
2391     redo A;
2392 wakaba 1.183 } elsif ($self->{nc} == -1) {
2393 wakaba 1.77 !!!cp (149);
2394 wakaba 1.3 !!!parse-error (type => 'unclosed comment');
2395 wakaba 1.57 $self->{state} = DATA_STATE;
2396 wakaba 1.1 ## reconsume
2397    
2398 wakaba 1.183 !!!emit ($self->{ct}); # comment
2399 wakaba 1.1
2400     redo A;
2401     } else {
2402 wakaba 1.77 !!!cp (150);
2403 wakaba 1.183 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2404 wakaba 1.57 $self->{state} = COMMENT_STATE;
2405 wakaba 1.1 !!!next-input-character;
2406     redo A;
2407     }
2408 wakaba 1.57 } elsif ($self->{state} == COMMENT_END_STATE) {
2409 wakaba 1.183 if ($self->{nc} == 0x003E) { # >
2410 wakaba 1.77 !!!cp (151);
2411 wakaba 1.57 $self->{state} = DATA_STATE;
2412 wakaba 1.1 !!!next-input-character;
2413    
2414 wakaba 1.183 !!!emit ($self->{ct}); # comment
2415 wakaba 1.1
2416     redo A;
2417 wakaba 1.183 } elsif ($self->{nc} == 0x002D) { # -
2418 wakaba 1.77 !!!cp (152);
2419 wakaba 1.114 !!!parse-error (type => 'dash in comment',
2420     line => $self->{line_prev},
2421     column => $self->{column_prev});
2422 wakaba 1.183 $self->{ct}->{data} .= '-'; # comment
2423 wakaba 1.1 ## Stay in the state
2424     !!!next-input-character;
2425     redo A;
2426 wakaba 1.183 } elsif ($self->{nc} == -1) {
2427 wakaba 1.77 !!!cp (153);
2428 wakaba 1.3 !!!parse-error (type => 'unclosed comment');
2429 wakaba 1.57 $self->{state} = DATA_STATE;
2430 wakaba 1.1 ## reconsume
2431    
2432 wakaba 1.183 !!!emit ($self->{ct}); # comment
2433 wakaba 1.1
2434     redo A;
2435     } else {
2436 wakaba 1.77 !!!cp (154);
2437 wakaba 1.114 !!!parse-error (type => 'dash in comment',
2438     line => $self->{line_prev},
2439     column => $self->{column_prev});
2440 wakaba 1.183 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2441 wakaba 1.57 $self->{state} = COMMENT_STATE;
2442 wakaba 1.1 !!!next-input-character;
2443     redo A;
2444     }
2445 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_STATE) {
2446 wakaba 1.187 if ($is_space->{$self->{nc}}) {
2447 wakaba 1.77 !!!cp (155);
2448 wakaba 1.57 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2449 wakaba 1.1 !!!next-input-character;
2450     redo A;
2451     } else {
2452 wakaba 1.77 !!!cp (156);
2453 wakaba 1.3 !!!parse-error (type => 'no space before DOCTYPE name');
2454 wakaba 1.57 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2455 wakaba 1.1 ## reconsume
2456     redo A;
2457     }
2458 wakaba 1.57 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2459 wakaba 1.187 if ($is_space->{$self->{nc}}) {
2460 wakaba 1.77 !!!cp (157);
2461 wakaba 1.1 ## Stay in the state
2462     !!!next-input-character;
2463     redo A;
2464 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2465 wakaba 1.77 !!!cp (158);
2466 wakaba 1.3 !!!parse-error (type => 'no DOCTYPE name');
2467 wakaba 1.57 $self->{state} = DATA_STATE;
2468 wakaba 1.1 !!!next-input-character;
2469    
2470 wakaba 1.183 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2471 wakaba 1.1
2472     redo A;
2473 wakaba 1.183 } elsif ($self->{nc} == -1) {
2474 wakaba 1.77 !!!cp (159);
2475 wakaba 1.3 !!!parse-error (type => 'no DOCTYPE name');
2476 wakaba 1.57 $self->{state} = DATA_STATE;
2477 wakaba 1.1 ## reconsume
2478    
2479 wakaba 1.183 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2480 wakaba 1.1
2481     redo A;
2482     } else {
2483 wakaba 1.77 !!!cp (160);
2484 wakaba 1.183 $self->{ct}->{name} = chr $self->{nc};
2485     delete $self->{ct}->{quirks};
2486 wakaba 1.4 ## ISSUE: "Set the token's name name to the" in the spec
2487 wakaba 1.57 $self->{state} = DOCTYPE_NAME_STATE;
2488 wakaba 1.1 !!!next-input-character;
2489     redo A;
2490     }
2491 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2492 wakaba 1.18 ## ISSUE: Redundant "First," in the spec.
2493 wakaba 1.187 if ($is_space->{$self->{nc}}) {
2494 wakaba 1.77 !!!cp (161);
2495 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2496 wakaba 1.1 !!!next-input-character;
2497     redo A;
2498 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2499 wakaba 1.77 !!!cp (162);
2500 wakaba 1.57 $self->{state} = DATA_STATE;
2501 wakaba 1.1 !!!next-input-character;
2502    
2503 wakaba 1.183 !!!emit ($self->{ct}); # DOCTYPE
2504 wakaba 1.1
2505     redo A;
2506 wakaba 1.183 } elsif ($self->{nc} == -1) {
2507 wakaba 1.77 !!!cp (163);
2508 wakaba 1.3 !!!parse-error (type => 'unclosed DOCTYPE');
2509 wakaba 1.57 $self->{state} = DATA_STATE;
2510 wakaba 1.1 ## reconsume
2511    
2512 wakaba 1.183 $self->{ct}->{quirks} = 1;
2513     !!!emit ($self->{ct}); # DOCTYPE
2514 wakaba 1.1
2515     redo A;
2516     } else {
2517 wakaba 1.77 !!!cp (164);
2518 wakaba 1.183 $self->{ct}->{name}
2519     .= chr ($self->{nc}); # DOCTYPE
2520 wakaba 1.1 ## Stay in the state
2521     !!!next-input-character;
2522     redo A;
2523     }
2524 wakaba 1.57 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2525 wakaba 1.187 if ($is_space->{$self->{nc}}) {
2526 wakaba 1.77 !!!cp (165);
2527 wakaba 1.1 ## Stay in the state
2528     !!!next-input-character;
2529     redo A;
2530 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2531 wakaba 1.77 !!!cp (166);
2532 wakaba 1.57 $self->{state} = DATA_STATE;
2533 wakaba 1.1 !!!next-input-character;
2534    
2535 wakaba 1.183 !!!emit ($self->{ct}); # DOCTYPE
2536 wakaba 1.1
2537     redo A;
2538 wakaba 1.183 } elsif ($self->{nc} == -1) {
2539 wakaba 1.77 !!!cp (167);
2540 wakaba 1.3 !!!parse-error (type => 'unclosed DOCTYPE');
2541 wakaba 1.57 $self->{state} = DATA_STATE;
2542 wakaba 1.1 ## reconsume
2543    
2544 wakaba 1.183 $self->{ct}->{quirks} = 1;
2545     !!!emit ($self->{ct}); # DOCTYPE
2546 wakaba 1.18
2547     redo A;
2548 wakaba 1.183 } elsif ($self->{nc} == 0x0050 or # P
2549     $self->{nc} == 0x0070) { # p
2550 wakaba 1.166 $self->{state} = PUBLIC_STATE;
2551 wakaba 1.183 $self->{s_kwd} = chr $self->{nc};
2552 wakaba 1.18 !!!next-input-character;
2553 wakaba 1.166 redo A;
2554 wakaba 1.183 } elsif ($self->{nc} == 0x0053 or # S
2555     $self->{nc} == 0x0073) { # s
2556 wakaba 1.166 $self->{state} = SYSTEM_STATE;
2557 wakaba 1.183 $self->{s_kwd} = chr $self->{nc};
2558 wakaba 1.18 !!!next-input-character;
2559 wakaba 1.166 redo A;
2560 wakaba 1.18 } else {
2561 wakaba 1.77 !!!cp (180);
2562 wakaba 1.166 !!!parse-error (type => 'string after DOCTYPE name');
2563 wakaba 1.183 $self->{ct}->{quirks} = 1;
2564 wakaba 1.166
2565     $self->{state} = BOGUS_DOCTYPE_STATE;
2566 wakaba 1.18 !!!next-input-character;
2567 wakaba 1.166 redo A;
2568 wakaba 1.18 }
2569 wakaba 1.166 } elsif ($self->{state} == PUBLIC_STATE) {
2570     ## ASCII case-insensitive
2571 wakaba 1.183 if ($self->{nc} == [
2572 wakaba 1.166 undef,
2573     0x0055, # U
2574     0x0042, # B
2575     0x004C, # L
2576     0x0049, # I
2577 wakaba 1.183 ]->[length $self->{s_kwd}] or
2578     $self->{nc} == [
2579 wakaba 1.166 undef,
2580     0x0075, # u
2581     0x0062, # b
2582     0x006C, # l
2583     0x0069, # i
2584 wakaba 1.183 ]->[length $self->{s_kwd}]) {
2585 wakaba 1.166 !!!cp (175);
2586     ## Stay in the state.
2587 wakaba 1.183 $self->{s_kwd} .= chr $self->{nc};
2588 wakaba 1.166 !!!next-input-character;
2589     redo A;
2590 wakaba 1.183 } elsif ((length $self->{s_kwd}) == 5 and
2591     ($self->{nc} == 0x0043 or # C
2592     $self->{nc} == 0x0063)) { # c
2593 wakaba 1.166 !!!cp (168);
2594     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2595     !!!next-input-character;
2596     redo A;
2597     } else {
2598     !!!cp (169);
2599     !!!parse-error (type => 'string after DOCTYPE name',
2600     line => $self->{line_prev},
2601 wakaba 1.183 column => $self->{column_prev} + 1 - length $self->{s_kwd});
2602     $self->{ct}->{quirks} = 1;
2603 wakaba 1.18
2604 wakaba 1.166 $self->{state} = BOGUS_DOCTYPE_STATE;
2605     ## Reconsume.
2606     redo A;
2607     }
2608     } elsif ($self->{state} == SYSTEM_STATE) {
2609     ## ASCII case-insensitive
2610 wakaba 1.183 if ($self->{nc} == [
2611 wakaba 1.166 undef,
2612     0x0059, # Y
2613     0x0053, # S
2614     0x0054, # T
2615     0x0045, # E
2616 wakaba 1.183 ]->[length $self->{s_kwd}] or
2617     $self->{nc} == [
2618 wakaba 1.166 undef,
2619     0x0079, # y
2620     0x0073, # s
2621     0x0074, # t
2622     0x0065, # e
2623 wakaba 1.183 ]->[length $self->{s_kwd}]) {
2624 wakaba 1.166 !!!cp (170);
2625     ## Stay in the state.
2626 wakaba 1.183 $self->{s_kwd} .= chr $self->{nc};
2627 wakaba 1.166 !!!next-input-character;
2628     redo A;
2629 wakaba 1.183 } elsif ((length $self->{s_kwd}) == 5 and
2630     ($self->{nc} == 0x004D or # M
2631     $self->{nc} == 0x006D)) { # m
2632 wakaba 1.166 !!!cp (171);
2633     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2634     !!!next-input-character;
2635     redo A;
2636     } else {
2637     !!!cp (172);
2638     !!!parse-error (type => 'string after DOCTYPE name',
2639     line => $self->{line_prev},
2640 wakaba 1.183 column => $self->{column_prev} + 1 - length $self->{s_kwd});
2641     $self->{ct}->{quirks} = 1;
2642 wakaba 1.73
2643 wakaba 1.166 $self->{state} = BOGUS_DOCTYPE_STATE;
2644     ## Reconsume.
2645     redo A;
2646     }
2647 wakaba 1.57 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2648 wakaba 1.187 if ($is_space->{$self->{nc}}) {
2649 wakaba 1.77 !!!cp (181);
2650 wakaba 1.18 ## Stay in the state
2651     !!!next-input-character;
2652     redo A;
2653 wakaba 1.183 } elsif ($self->{nc} eq 0x0022) { # "
2654 wakaba 1.77 !!!cp (182);
2655 wakaba 1.183 $self->{ct}->{pubid} = ''; # DOCTYPE
2656 wakaba 1.57 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2657 wakaba 1.18 !!!next-input-character;
2658     redo A;
2659 wakaba 1.183 } elsif ($self->{nc} eq 0x0027) { # '
2660 wakaba 1.77 !!!cp (183);
2661 wakaba 1.183 $self->{ct}->{pubid} = ''; # DOCTYPE
2662 wakaba 1.57 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2663 wakaba 1.18 !!!next-input-character;
2664     redo A;
2665 wakaba 1.183 } elsif ($self->{nc} eq 0x003E) { # >
2666 wakaba 1.77 !!!cp (184);
2667 wakaba 1.18 !!!parse-error (type => 'no PUBLIC literal');
2668    
2669 wakaba 1.57 $self->{state} = DATA_STATE;
2670 wakaba 1.18 !!!next-input-character;
2671    
2672 wakaba 1.183 $self->{ct}->{quirks} = 1;
2673     !!!emit ($self->{ct}); # DOCTYPE
2674 wakaba 1.18
2675     redo A;
2676 wakaba 1.183 } elsif ($self->{nc} == -1) {
2677 wakaba 1.77 !!!cp (185);
2678 wakaba 1.18 !!!parse-error (type => 'unclosed DOCTYPE');
2679    
2680 wakaba 1.57 $self->{state} = DATA_STATE;
2681 wakaba 1.18 ## reconsume
2682    
2683 wakaba 1.183 $self->{ct}->{quirks} = 1;
2684     !!!emit ($self->{ct}); # DOCTYPE
2685 wakaba 1.18
2686     redo A;
2687     } else {
2688 wakaba 1.77 !!!cp (186);
2689 wakaba 1.18 !!!parse-error (type => 'string after PUBLIC');
2690 wakaba 1.183 $self->{ct}->{quirks} = 1;
2691 wakaba 1.73
2692 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
2693 wakaba 1.18 !!!next-input-character;
2694     redo A;
2695     }
2696 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2697 wakaba 1.183 if ($self->{nc} == 0x0022) { # "
2698 wakaba 1.77 !!!cp (187);
2699 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2700 wakaba 1.18 !!!next-input-character;
2701     redo A;
2702 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2703 wakaba 1.77 !!!cp (188);
2704 wakaba 1.69 !!!parse-error (type => 'unclosed PUBLIC literal');
2705    
2706     $self->{state} = DATA_STATE;
2707     !!!next-input-character;
2708    
2709 wakaba 1.183 $self->{ct}->{quirks} = 1;
2710     !!!emit ($self->{ct}); # DOCTYPE
2711 wakaba 1.69
2712     redo A;
2713 wakaba 1.183 } elsif ($self->{nc} == -1) {
2714 wakaba 1.77 !!!cp (189);
2715 wakaba 1.18 !!!parse-error (type => 'unclosed PUBLIC literal');
2716    
2717 wakaba 1.57 $self->{state} = DATA_STATE;
2718 wakaba 1.18 ## reconsume
2719    
2720 wakaba 1.183 $self->{ct}->{quirks} = 1;
2721     !!!emit ($self->{ct}); # DOCTYPE
2722 wakaba 1.18
2723     redo A;
2724     } else {
2725 wakaba 1.77 !!!cp (190);
2726 wakaba 1.183 $self->{ct}->{pubid} # DOCTYPE
2727     .= chr $self->{nc};
2728     $self->{read_until}->($self->{ct}->{pubid}, q[">],
2729     length $self->{ct}->{pubid});
2730 wakaba 1.173
2731 wakaba 1.18 ## Stay in the state
2732     !!!next-input-character;
2733     redo A;
2734     }
2735 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2736 wakaba 1.183 if ($self->{nc} == 0x0027) { # '
2737 wakaba 1.77 !!!cp (191);
2738 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2739 wakaba 1.18 !!!next-input-character;
2740     redo A;
2741 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2742 wakaba 1.77 !!!cp (192);
2743 wakaba 1.69 !!!parse-error (type => 'unclosed PUBLIC literal');
2744    
2745     $self->{state} = DATA_STATE;
2746     !!!next-input-character;
2747    
2748 wakaba 1.183 $self->{ct}->{quirks} = 1;
2749     !!!emit ($self->{ct}); # DOCTYPE
2750 wakaba 1.69
2751     redo A;
2752 wakaba 1.183 } elsif ($self->{nc} == -1) {
2753 wakaba 1.77 !!!cp (193);
2754 wakaba 1.18 !!!parse-error (type => 'unclosed PUBLIC literal');
2755    
2756 wakaba 1.57 $self->{state} = DATA_STATE;
2757 wakaba 1.18 ## reconsume
2758    
2759 wakaba 1.183 $self->{ct}->{quirks} = 1;
2760     !!!emit ($self->{ct}); # DOCTYPE
2761 wakaba 1.18
2762     redo A;
2763     } else {
2764 wakaba 1.77 !!!cp (194);
2765 wakaba 1.183 $self->{ct}->{pubid} # DOCTYPE
2766     .= chr $self->{nc};
2767     $self->{read_until}->($self->{ct}->{pubid}, q['>],
2768     length $self->{ct}->{pubid});
2769 wakaba 1.173
2770 wakaba 1.18 ## Stay in the state
2771     !!!next-input-character;
2772     redo A;
2773     }
2774 wakaba 1.57 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2775 wakaba 1.187 if ($is_space->{$self->{nc}}) {
2776 wakaba 1.77 !!!cp (195);
2777 wakaba 1.18 ## Stay in the state
2778     !!!next-input-character;
2779     redo A;
2780 wakaba 1.183 } elsif ($self->{nc} == 0x0022) { # "
2781 wakaba 1.77 !!!cp (196);
2782 wakaba 1.183 $self->{ct}->{sysid} = ''; # DOCTYPE
2783 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2784 wakaba 1.18 !!!next-input-character;
2785     redo A;
2786 wakaba 1.183 } elsif ($self->{nc} == 0x0027) { # '
2787 wakaba 1.77 !!!cp (197);
2788 wakaba 1.183 $self->{ct}->{sysid} = ''; # DOCTYPE
2789 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2790 wakaba 1.18 !!!next-input-character;
2791     redo A;
2792 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2793 wakaba 1.77 !!!cp (198);
2794 wakaba 1.57 $self->{state} = DATA_STATE;
2795 wakaba 1.18 !!!next-input-character;
2796    
2797 wakaba 1.183 !!!emit ($self->{ct}); # DOCTYPE
2798 wakaba 1.18
2799     redo A;
2800 wakaba 1.183 } elsif ($self->{nc} == -1) {
2801 wakaba 1.77 !!!cp (199);
2802 wakaba 1.18 !!!parse-error (type => 'unclosed DOCTYPE');
2803    
2804 wakaba 1.57 $self->{state} = DATA_STATE;
2805 wakaba 1.26 ## reconsume
2806 wakaba 1.18
2807 wakaba 1.183 $self->{ct}->{quirks} = 1;
2808     !!!emit ($self->{ct}); # DOCTYPE
2809 wakaba 1.18
2810     redo A;
2811     } else {
2812 wakaba 1.77 !!!cp (200);
2813 wakaba 1.18 !!!parse-error (type => 'string after PUBLIC literal');
2814 wakaba 1.183 $self->{ct}->{quirks} = 1;
2815 wakaba 1.73
2816 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
2817 wakaba 1.18 !!!next-input-character;
2818     redo A;
2819     }
2820 wakaba 1.57 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2821 wakaba 1.187 if ($is_space->{$self->{nc}}) {
2822 wakaba 1.77 !!!cp (201);
2823 wakaba 1.18 ## Stay in the state
2824     !!!next-input-character;
2825     redo A;
2826 wakaba 1.183 } elsif ($self->{nc} == 0x0022) { # "
2827 wakaba 1.77 !!!cp (202);
2828 wakaba 1.183 $self->{ct}->{sysid} = ''; # DOCTYPE
2829 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2830 wakaba 1.18 !!!next-input-character;
2831     redo A;
2832 wakaba 1.183 } elsif ($self->{nc} == 0x0027) { # '
2833 wakaba 1.77 !!!cp (203);
2834 wakaba 1.183 $self->{ct}->{sysid} = ''; # DOCTYPE
2835 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2836 wakaba 1.18 !!!next-input-character;
2837     redo A;
2838 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2839 wakaba 1.77 !!!cp (204);
2840 wakaba 1.18 !!!parse-error (type => 'no SYSTEM literal');
2841 wakaba 1.57 $self->{state} = DATA_STATE;
2842 wakaba 1.18 !!!next-input-character;
2843    
2844 wakaba 1.183 $self->{ct}->{quirks} = 1;
2845     !!!emit ($self->{ct}); # DOCTYPE
2846 wakaba 1.18
2847     redo A;
2848 wakaba 1.183 } elsif ($self->{nc} == -1) {
2849 wakaba 1.77 !!!cp (205);
2850 wakaba 1.18 !!!parse-error (type => 'unclosed DOCTYPE');
2851    
2852 wakaba 1.57 $self->{state} = DATA_STATE;
2853 wakaba 1.26 ## reconsume
2854 wakaba 1.18
2855 wakaba 1.183 $self->{ct}->{quirks} = 1;
2856     !!!emit ($self->{ct}); # DOCTYPE
2857 wakaba 1.18
2858     redo A;
2859     } else {
2860 wakaba 1.77 !!!cp (206);
2861 wakaba 1.30 !!!parse-error (type => 'string after SYSTEM');
2862 wakaba 1.183 $self->{ct}->{quirks} = 1;
2863 wakaba 1.73
2864 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
2865 wakaba 1.18 !!!next-input-character;
2866     redo A;
2867     }
2868 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2869 wakaba 1.183 if ($self->{nc} == 0x0022) { # "
2870 wakaba 1.77 !!!cp (207);
2871 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2872 wakaba 1.18 !!!next-input-character;
2873     redo A;
2874 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2875 wakaba 1.77 !!!cp (208);
2876 wakaba 1.153 !!!parse-error (type => 'unclosed SYSTEM literal');
2877 wakaba 1.69
2878     $self->{state} = DATA_STATE;
2879     !!!next-input-character;
2880    
2881 wakaba 1.183 $self->{ct}->{quirks} = 1;
2882     !!!emit ($self->{ct}); # DOCTYPE
2883 wakaba 1.69
2884     redo A;
2885 wakaba 1.183 } elsif ($self->{nc} == -1) {
2886 wakaba 1.77 !!!cp (209);
2887 wakaba 1.18 !!!parse-error (type => 'unclosed SYSTEM literal');
2888    
2889 wakaba 1.57 $self->{state} = DATA_STATE;
2890 wakaba 1.18 ## reconsume
2891    
2892 wakaba 1.183 $self->{ct}->{quirks} = 1;
2893     !!!emit ($self->{ct}); # DOCTYPE
2894 wakaba 1.18
2895     redo A;
2896     } else {
2897 wakaba 1.77 !!!cp (210);
2898 wakaba 1.183 $self->{ct}->{sysid} # DOCTYPE
2899     .= chr $self->{nc};
2900     $self->{read_until}->($self->{ct}->{sysid}, q[">],
2901     length $self->{ct}->{sysid});
2902 wakaba 1.173
2903 wakaba 1.18 ## Stay in the state
2904     !!!next-input-character;
2905     redo A;
2906     }
2907 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2908 wakaba 1.183 if ($self->{nc} == 0x0027) { # '
2909 wakaba 1.77 !!!cp (211);
2910 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2911 wakaba 1.18 !!!next-input-character;
2912     redo A;
2913 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2914 wakaba 1.77 !!!cp (212);
2915 wakaba 1.153 !!!parse-error (type => 'unclosed SYSTEM literal');
2916 wakaba 1.69
2917     $self->{state} = DATA_STATE;
2918     !!!next-input-character;
2919    
2920 wakaba 1.183 $self->{ct}->{quirks} = 1;
2921     !!!emit ($self->{ct}); # DOCTYPE
2922 wakaba 1.69
2923     redo A;
2924 wakaba 1.183 } elsif ($self->{nc} == -1) {
2925 wakaba 1.77 !!!cp (213);
2926 wakaba 1.18 !!!parse-error (type => 'unclosed SYSTEM literal');
2927    
2928 wakaba 1.57 $self->{state} = DATA_STATE;
2929 wakaba 1.18 ## reconsume
2930    
2931 wakaba 1.183 $self->{ct}->{quirks} = 1;
2932     !!!emit ($self->{ct}); # DOCTYPE
2933 wakaba 1.1
2934     redo A;
2935     } else {
2936 wakaba 1.77 !!!cp (214);
2937 wakaba 1.183 $self->{ct}->{sysid} # DOCTYPE
2938     .= chr $self->{nc};
2939     $self->{read_until}->($self->{ct}->{sysid}, q['>],
2940     length $self->{ct}->{sysid});
2941 wakaba 1.173
2942 wakaba 1.18 ## Stay in the state
2943     !!!next-input-character;
2944     redo A;
2945     }
2946 wakaba 1.57 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2947 wakaba 1.187 if ($is_space->{$self->{nc}}) {
2948 wakaba 1.77 !!!cp (215);
2949 wakaba 1.18 ## Stay in the state
2950     !!!next-input-character;
2951     redo A;
2952 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2953 wakaba 1.77 !!!cp (216);
2954 wakaba 1.57 $self->{state} = DATA_STATE;
2955 wakaba 1.18 !!!next-input-character;
2956    
2957 wakaba 1.183 !!!emit ($self->{ct}); # DOCTYPE
2958 wakaba 1.18
2959     redo A;
2960 wakaba 1.183 } elsif ($self->{nc} == -1) {
2961 wakaba 1.77 !!!cp (217);
2962 wakaba 1.150 !!!parse-error (type => 'unclosed DOCTYPE');
2963 wakaba 1.57 $self->{state} = DATA_STATE;
2964 wakaba 1.26 ## reconsume
2965 wakaba 1.18
2966 wakaba 1.183 $self->{ct}->{quirks} = 1;
2967     !!!emit ($self->{ct}); # DOCTYPE
2968 wakaba 1.18
2969     redo A;
2970     } else {
2971 wakaba 1.77 !!!cp (218);
2972 wakaba 1.18 !!!parse-error (type => 'string after SYSTEM literal');
2973 wakaba 1.183 #$self->{ct}->{quirks} = 1;
2974 wakaba 1.73
2975 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
2976 wakaba 1.1 !!!next-input-character;
2977     redo A;
2978     }
2979 wakaba 1.57 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2980 wakaba 1.183 if ($self->{nc} == 0x003E) { # >
2981 wakaba 1.77 !!!cp (219);
2982 wakaba 1.57 $self->{state} = DATA_STATE;
2983 wakaba 1.1 !!!next-input-character;
2984    
2985 wakaba 1.183 !!!emit ($self->{ct}); # DOCTYPE
2986 wakaba 1.1
2987     redo A;
2988 wakaba 1.183 } elsif ($self->{nc} == -1) {
2989 wakaba 1.77 !!!cp (220);
2990 wakaba 1.3 !!!parse-error (type => 'unclosed DOCTYPE');
2991 wakaba 1.57 $self->{state} = DATA_STATE;
2992 wakaba 1.1 ## reconsume
2993    
2994 wakaba 1.183 !!!emit ($self->{ct}); # DOCTYPE
2995 wakaba 1.1
2996     redo A;
2997     } else {
2998 wakaba 1.77 !!!cp (221);
2999 wakaba 1.173 my $s = '';
3000     $self->{read_until}->($s, q[>], 0);
3001    
3002 wakaba 1.1 ## Stay in the state
3003     !!!next-input-character;
3004     redo A;
3005     }
3006 wakaba 1.165 } elsif ($self->{state} == CDATA_SECTION_STATE) {
3007     ## NOTE: "CDATA section state" in the spec is jointly implemented
3008     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3009     ## and |CDATA_SECTION_MSE2_STATE|.
3010 wakaba 1.127
3011 wakaba 1.183 if ($self->{nc} == 0x005D) { # ]
3012 wakaba 1.165 !!!cp (221.1);
3013     $self->{state} = CDATA_SECTION_MSE1_STATE;
3014     !!!next-input-character;
3015     redo A;
3016 wakaba 1.183 } elsif ($self->{nc} == -1) {
3017 wakaba 1.165 $self->{state} = DATA_STATE;
3018     !!!next-input-character;
3019 wakaba 1.183 if (length $self->{ct}->{data}) { # character
3020 wakaba 1.165 !!!cp (221.2);
3021 wakaba 1.183 !!!emit ($self->{ct}); # character
3022 wakaba 1.165 } else {
3023     !!!cp (221.3);
3024 wakaba 1.183 ## No token to emit. $self->{ct} is discarded.
3025 wakaba 1.165 }
3026     redo A;
3027     } else {
3028     !!!cp (221.4);
3029 wakaba 1.183 $self->{ct}->{data} .= chr $self->{nc};
3030     $self->{read_until}->($self->{ct}->{data},
3031 wakaba 1.173 q<]>,
3032 wakaba 1.183 length $self->{ct}->{data});
3033 wakaba 1.173
3034 wakaba 1.165 ## Stay in the state.
3035     !!!next-input-character;
3036     redo A;
3037     }
3038 wakaba 1.127
3039 wakaba 1.165 ## ISSUE: "text tokens" in spec.
3040     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3041 wakaba 1.183 if ($self->{nc} == 0x005D) { # ]
3042 wakaba 1.165 !!!cp (221.5);
3043     $self->{state} = CDATA_SECTION_MSE2_STATE;
3044     !!!next-input-character;
3045     redo A;
3046     } else {
3047     !!!cp (221.6);
3048 wakaba 1.183 $self->{ct}->{data} .= ']';
3049 wakaba 1.165 $self->{state} = CDATA_SECTION_STATE;
3050     ## Reconsume.
3051     redo A;
3052     }
3053     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3054 wakaba 1.183 if ($self->{nc} == 0x003E) { # >
3055 wakaba 1.165 $self->{state} = DATA_STATE;
3056     !!!next-input-character;
3057 wakaba 1.183 if (length $self->{ct}->{data}) { # character
3058 wakaba 1.165 !!!cp (221.7);
3059 wakaba 1.183 !!!emit ($self->{ct}); # character
3060 wakaba 1.127 } else {
3061 wakaba 1.165 !!!cp (221.8);
3062 wakaba 1.183 ## No token to emit. $self->{ct} is discarded.
3063 wakaba 1.127 }
3064 wakaba 1.165 redo A;
3065 wakaba 1.183 } elsif ($self->{nc} == 0x005D) { # ]
3066 wakaba 1.165 !!!cp (221.9); # character
3067 wakaba 1.183 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3068 wakaba 1.165 ## Stay in the state.
3069 wakaba 1.127 !!!next-input-character;
3070 wakaba 1.165 redo A;
3071 wakaba 1.127 } else {
3072 wakaba 1.165 !!!cp (221.11);
3073 wakaba 1.183 $self->{ct}->{data} .= ']]'; # character
3074 wakaba 1.165 $self->{state} = CDATA_SECTION_STATE;
3075     ## Reconsume.
3076     redo A;
3077 wakaba 1.127 }
3078 wakaba 1.167 } elsif ($self->{state} == ENTITY_STATE) {
3079 wakaba 1.187 if ($is_space->{$self->{nc}} or
3080     {
3081     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3082     $self->{entity_add} => 1,
3083     }->{$self->{nc}}) {
3084 wakaba 1.168 !!!cp (1001);
3085     ## Don't consume
3086     ## No error
3087     ## Return nothing.
3088     #
3089 wakaba 1.183 } elsif ($self->{nc} == 0x0023) { # #
3090 wakaba 1.170 !!!cp (999);
3091 wakaba 1.168 $self->{state} = ENTITY_HASH_STATE;
3092 wakaba 1.183 $self->{s_kwd} = '#';
3093 wakaba 1.168 !!!next-input-character;
3094     redo A;
3095 wakaba 1.183 } elsif ((0x0041 <= $self->{nc} and
3096     $self->{nc} <= 0x005A) or # A..Z
3097     (0x0061 <= $self->{nc} and
3098     $self->{nc} <= 0x007A)) { # a..z
3099 wakaba 1.170 !!!cp (998);
3100 wakaba 1.168 require Whatpm::_NamedEntityList;
3101     $self->{state} = ENTITY_NAME_STATE;
3102 wakaba 1.183 $self->{s_kwd} = chr $self->{nc};
3103     $self->{entity__value} = $self->{s_kwd};
3104 wakaba 1.168 $self->{entity__match} = 0;
3105     !!!next-input-character;
3106     redo A;
3107     } else {
3108     !!!cp (1027);
3109     !!!parse-error (type => 'bare ero');
3110     ## Return nothing.
3111     #
3112     }
3113 wakaba 1.20
3114 wakaba 1.168 ## NOTE: No character is consumed by the "consume a character
3115     ## reference" algorithm. In other words, there is an "&" character
3116     ## that does not introduce a character reference, which would be
3117     ## appended to the parent element or the attribute value in the later
3118     ## processing of the tokenizer.
3119 wakaba 1.112
3120 wakaba 1.169 if ($self->{prev_state} == DATA_STATE) {
3121 wakaba 1.170 !!!cp (997);
3122 wakaba 1.169 $self->{state} = $self->{prev_state};
3123 wakaba 1.168 ## Reconsume.
3124     !!!emit ({type => CHARACTER_TOKEN, data => '&',
3125     line => $self->{line_prev},
3126     column => $self->{column_prev},
3127     });
3128     redo A;
3129 wakaba 1.169 } else {
3130 wakaba 1.170 !!!cp (996);
3131 wakaba 1.183 $self->{ca}->{value} .= '&';
3132 wakaba 1.169 $self->{state} = $self->{prev_state};
3133     ## Reconsume.
3134     redo A;
3135 wakaba 1.168 }
3136     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3137 wakaba 1.183 if ($self->{nc} == 0x0078 or # x
3138     $self->{nc} == 0x0058) { # X
3139 wakaba 1.170 !!!cp (995);
3140 wakaba 1.168 $self->{state} = HEXREF_X_STATE;
3141 wakaba 1.183 $self->{s_kwd} .= chr $self->{nc};
3142 wakaba 1.168 !!!next-input-character;
3143     redo A;
3144 wakaba 1.183 } elsif (0x0030 <= $self->{nc} and
3145     $self->{nc} <= 0x0039) { # 0..9
3146 wakaba 1.170 !!!cp (994);
3147 wakaba 1.168 $self->{state} = NCR_NUM_STATE;
3148 wakaba 1.183 $self->{s_kwd} = $self->{nc} - 0x0030;
3149 wakaba 1.168 !!!next-input-character;
3150     redo A;
3151     } else {
3152     !!!parse-error (type => 'bare nero',
3153     line => $self->{line_prev},
3154     column => $self->{column_prev} - 1);
3155    
3156     ## NOTE: According to the spec algorithm, nothing is returned,
3157     ## and then "&#" is appended to the parent element or the attribute
3158     ## value in the later processing.
3159    
3160 wakaba 1.169 if ($self->{prev_state} == DATA_STATE) {
3161 wakaba 1.170 !!!cp (1019);
3162 wakaba 1.169 $self->{state} = $self->{prev_state};
3163 wakaba 1.168 ## Reconsume.
3164     !!!emit ({type => CHARACTER_TOKEN,
3165     data => '&#',
3166     line => $self->{line_prev},
3167     column => $self->{column_prev} - 1,
3168     });
3169     redo A;
3170 wakaba 1.169 } else {
3171 wakaba 1.170 !!!cp (993);
3172 wakaba 1.183 $self->{ca}->{value} .= '&#';
3173 wakaba 1.169 $self->{state} = $self->{prev_state};
3174     ## Reconsume.
3175     redo A;
3176 wakaba 1.1 }
3177 wakaba 1.168 }
3178     } elsif ($self->{state} == NCR_NUM_STATE) {
3179 wakaba 1.183 if (0x0030 <= $self->{nc} and
3180     $self->{nc} <= 0x0039) { # 0..9
3181 wakaba 1.78 !!!cp (1012);
3182 wakaba 1.183 $self->{s_kwd} *= 10;
3183     $self->{s_kwd} += $self->{nc} - 0x0030;
3184 wakaba 1.1
3185 wakaba 1.168 ## Stay in the state.
3186 wakaba 1.1 !!!next-input-character;
3187 wakaba 1.168 redo A;
3188 wakaba 1.183 } elsif ($self->{nc} == 0x003B) { # ;
3189 wakaba 1.78 !!!cp (1013);
3190 wakaba 1.1 !!!next-input-character;
3191 wakaba 1.168 #
3192 wakaba 1.1 } else {
3193 wakaba 1.78 !!!cp (1014);
3194 wakaba 1.168 !!!parse-error (type => 'no refc');
3195     ## Reconsume.
3196     #
3197 wakaba 1.1 }
3198    
3199 wakaba 1.183 my $code = $self->{s_kwd};
3200 wakaba 1.168 my $l = $self->{line_prev};
3201     my $c = $self->{column_prev};
3202 wakaba 1.191 if ($charref_map->{$code}) {
3203 wakaba 1.78 !!!cp (1015);
3204 wakaba 1.153 !!!parse-error (type => 'invalid character reference',
3205     text => (sprintf 'U+%04X', $code),
3206     line => $l, column => $c);
3207 wakaba 1.191 $code = $charref_map->{$code};
3208 wakaba 1.26 } elsif ($code > 0x10FFFF) {
3209 wakaba 1.78 !!!cp (1016);
3210 wakaba 1.153 !!!parse-error (type => 'invalid character reference',
3211     text => (sprintf 'U-%08X', $code),
3212     line => $l, column => $c);
3213 wakaba 1.26 $code = 0xFFFD;
3214 wakaba 1.1 }
3215 wakaba 1.168
3216 wakaba 1.169 if ($self->{prev_state} == DATA_STATE) {
3217 wakaba 1.170 !!!cp (992);
3218 wakaba 1.169 $self->{state} = $self->{prev_state};
3219 wakaba 1.168 ## Reconsume.
3220 wakaba 1.169 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3221     line => $l, column => $c,
3222     });
3223 wakaba 1.168 redo A;
3224     } else {
3225 wakaba 1.170 !!!cp (991);
3226 wakaba 1.183 $self->{ca}->{value} .= chr $code;
3227     $self->{ca}->{has_reference} = 1;
3228 wakaba 1.169 $self->{state} = $self->{prev_state};
3229 wakaba 1.168 ## Reconsume.
3230     redo A;
3231     }
3232     } elsif ($self->{state} == HEXREF_X_STATE) {
3233 wakaba 1.183 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3234     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3235     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3236 wakaba 1.168 # 0..9, A..F, a..f
3237 wakaba 1.170 !!!cp (990);
3238 wakaba 1.168 $self->{state} = HEXREF_HEX_STATE;
3239 wakaba 1.183 $self->{s_kwd} = 0;
3240 wakaba 1.168 ## Reconsume.
3241     redo A;
3242     } else {
3243     !!!parse-error (type => 'bare hcro',
3244     line => $self->{line_prev},
3245     column => $self->{column_prev} - 2);
3246    
3247     ## NOTE: According to the spec algorithm, nothing is returned,
3248     ## and then "&#" followed by "X" or "x" is appended to the parent
3249     ## element or the attribute value in the later processing.
3250    
3251 wakaba 1.169 if ($self->{prev_state} == DATA_STATE) {
3252 wakaba 1.170 !!!cp (1005);
3253 wakaba 1.169 $self->{state} = $self->{prev_state};
3254 wakaba 1.168 ## Reconsume.
3255     !!!emit ({type => CHARACTER_TOKEN,
3256 wakaba 1.183 data => '&' . $self->{s_kwd},
3257 wakaba 1.168 line => $self->{line_prev},
3258 wakaba 1.183 column => $self->{column_prev} - length $self->{s_kwd},
3259 wakaba 1.168 });
3260     redo A;
3261 wakaba 1.169 } else {
3262 wakaba 1.170 !!!cp (989);
3263 wakaba 1.183 $self->{ca}->{value} .= '&' . $self->{s_kwd};
3264 wakaba 1.169 $self->{state} = $self->{prev_state};
3265     ## Reconsume.
3266     redo A;
3267 wakaba 1.168 }
3268     }
3269     } elsif ($self->{state} == HEXREF_HEX_STATE) {
3270 wakaba 1.183 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3271 wakaba 1.168 # 0..9
3272     !!!cp (1002);
3273 wakaba 1.183 $self->{s_kwd} *= 0x10;
3274     $self->{s_kwd} += $self->{nc} - 0x0030;
3275 wakaba 1.168 ## Stay in the state.
3276     !!!next-input-character;
3277     redo A;
3278 wakaba 1.183 } elsif (0x0061 <= $self->{nc} and
3279     $self->{nc} <= 0x0066) { # a..f
3280 wakaba 1.168 !!!cp (1003);
3281 wakaba 1.183 $self->{s_kwd} *= 0x10;
3282     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
3283 wakaba 1.168 ## Stay in the state.
3284     !!!next-input-character;
3285     redo A;
3286 wakaba 1.183 } elsif (0x0041 <= $self->{nc} and
3287     $self->{nc} <= 0x0046) { # A..F
3288 wakaba 1.168 !!!cp (1004);
3289 wakaba 1.183 $self->{s_kwd} *= 0x10;
3290     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
3291 wakaba 1.168 ## Stay in the state.
3292     !!!next-input-character;
3293     redo A;
3294 wakaba 1.183 } elsif ($self->{nc} == 0x003B) { # ;
3295 wakaba 1.168 !!!cp (1006);
3296     !!!next-input-character;
3297     #
3298     } else {
3299     !!!cp (1007);
3300     !!!parse-error (type => 'no refc',
3301     line => $self->{line},
3302     column => $self->{column});
3303     ## Reconsume.
3304     #
3305     }
3306    
3307 wakaba 1.183 my $code = $self->{s_kwd};
3308 wakaba 1.168 my $l = $self->{line_prev};
3309     my $c = $self->{column_prev};
3310 wakaba 1.191 if ($charref_map->{$code}) {
3311 wakaba 1.168 !!!cp (1008);
3312     !!!parse-error (type => 'invalid character reference',
3313     text => (sprintf 'U+%04X', $code),
3314     line => $l, column => $c);
3315 wakaba 1.191 $code = $charref_map->{$code};
3316 wakaba 1.168 } elsif ($code > 0x10FFFF) {
3317     !!!cp (1009);
3318     !!!parse-error (type => 'invalid character reference',
3319     text => (sprintf 'U-%08X', $code),
3320     line => $l, column => $c);
3321     $code = 0xFFFD;
3322     }
3323    
3324 wakaba 1.169 if ($self->{prev_state} == DATA_STATE) {
3325 wakaba 1.170 !!!cp (988);
3326 wakaba 1.169 $self->{state} = $self->{prev_state};
3327 wakaba 1.168 ## Reconsume.
3328 wakaba 1.169 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3329     line => $l, column => $c,
3330     });
3331 wakaba 1.168 redo A;
3332     } else {
3333 wakaba 1.170 !!!cp (987);
3334 wakaba 1.183 $self->{ca}->{value} .= chr $code;
3335     $self->{ca}->{has_reference} = 1;
3336 wakaba 1.169 $self->{state} = $self->{prev_state};
3337 wakaba 1.168 ## Reconsume.
3338     redo A;
3339     }
3340     } elsif ($self->{state} == ENTITY_NAME_STATE) {
3341 wakaba 1.183 if (length $self->{s_kwd} < 30 and
3342 wakaba 1.168 ## NOTE: Some number greater than the maximum length of entity name
3343 wakaba 1.183 ((0x0041 <= $self->{nc} and # a
3344     $self->{nc} <= 0x005A) or # x
3345     (0x0061 <= $self->{nc} and # a
3346     $self->{nc} <= 0x007A) or # z
3347     (0x0030 <= $self->{nc} and # 0
3348     $self->{nc} <= 0x0039) or # 9
3349     $self->{nc} == 0x003B)) { # ;
3350 wakaba 1.168 our $EntityChar;
3351 wakaba 1.183 $self->{s_kwd} .= chr $self->{nc};
3352     if (defined $EntityChar->{$self->{s_kwd}}) {
3353     if ($self->{nc} == 0x003B) { # ;
3354 wakaba 1.168 !!!cp (1020);
3355 wakaba 1.183 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
3356 wakaba 1.168 $self->{entity__match} = 1;
3357     !!!next-input-character;
3358     #
3359     } else {
3360     !!!cp (1021);
3361 wakaba 1.183 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
3362 wakaba 1.168 $self->{entity__match} = -1;
3363     ## Stay in the state.
3364     !!!next-input-character;
3365     redo A;
3366     }
3367     } else {
3368     !!!cp (1022);
3369 wakaba 1.183 $self->{entity__value} .= chr $self->{nc};
3370 wakaba 1.168 $self->{entity__match} *= 2;
3371     ## Stay in the state.
3372 wakaba 1.16 !!!next-input-character;
3373 wakaba 1.168 redo A;
3374     }
3375     }
3376    
3377     my $data;
3378     my $has_ref;
3379     if ($self->{entity__match} > 0) {
3380     !!!cp (1023);
3381     $data = $self->{entity__value};
3382     $has_ref = 1;
3383     #
3384     } elsif ($self->{entity__match} < 0) {
3385     !!!parse-error (type => 'no refc');
3386 wakaba 1.169 if ($self->{prev_state} != DATA_STATE and # in attribute
3387     $self->{entity__match} < -1) {
3388 wakaba 1.168 !!!cp (1024);
3389 wakaba 1.183 $data = '&' . $self->{s_kwd};
3390 wakaba 1.168 #
3391 wakaba 1.37 } else {
3392 wakaba 1.168 !!!cp (1025);
3393     $data = $self->{entity__value};
3394     $has_ref = 1;
3395     #
3396 wakaba 1.16 }
3397 wakaba 1.1 } else {
3398 wakaba 1.168 !!!cp (1026);
3399     !!!parse-error (type => 'bare ero',
3400     line => $self->{line_prev},
3401 wakaba 1.183 column => $self->{column_prev} - length $self->{s_kwd});
3402     $data = '&' . $self->{s_kwd};
3403 wakaba 1.168 #
3404 wakaba 1.1 }
3405 wakaba 1.168
3406     ## NOTE: In these cases, when a character reference is found,
3407     ## it is consumed and a character token is returned, or, otherwise,
3408     ## nothing is consumed and returned, according to the spec algorithm.
3409     ## In this implementation, anything that has been examined by the
3410     ## tokenizer is appended to the parent element or the attribute value
3411     ## as string, either literal string when no character reference or
3412     ## entity-replaced string otherwise, in this stage, since any characters
3413     ## that would not be consumed are appended in the data state or in an
3414     ## appropriate attribute value state anyway.
3415    
3416 wakaba 1.169 if ($self->{prev_state} == DATA_STATE) {
3417 wakaba 1.170 !!!cp (986);
3418 wakaba 1.169 $self->{state} = $self->{prev_state};
3419 wakaba 1.168 ## Reconsume.
3420     !!!emit ({type => CHARACTER_TOKEN,
3421 wakaba 1.169 data => $data,
3422 wakaba 1.168 line => $self->{line_prev},
3423 wakaba 1.183 column => $self->{column_prev} + 1 - length $self->{s_kwd},
3424 wakaba 1.168 });
3425 wakaba 1.167 redo A;
3426 wakaba 1.169 } else {
3427 wakaba 1.170 !!!cp (985);
3428 wakaba 1.183 $self->{ca}->{value} .= $data;
3429     $self->{ca}->{has_reference} = 1 if $has_ref;
3430 wakaba 1.169 $self->{state} = $self->{prev_state};
3431     ## Reconsume.
3432     redo A;
3433 wakaba 1.37 }
3434 wakaba 1.1 } else {
3435 wakaba 1.167 die "$0: $self->{state}: Unknown state";
3436     }
3437     } # A
3438    
3439     die "$0: _get_next_token: unexpected case";
3440     } # _get_next_token
3441 wakaba 1.1
3442     sub _initialize_tree_constructor ($) {
         ## Prepares |$self->{document}| for the tree construction stage:
         ## turns strict error checking off, flags the document as an HTML
         ## document, and sets the |manakai_source_line| and
         ## |manakai_source_column| user data to 1.  Takes only the invocant.
3443     my $self = shift;
3444     ## NOTE: $self->{document} MUST be specified before this method is called
3445     $self->{document}->strict_error_checking (0);
3446     ## TODO: Turn mutation events off # MUST
3447     ## TODO: Turn loose Document option (manakai extension) on
3448 wakaba 1.18 $self->{document}->manakai_is_html (1); # MUST
3449 wakaba 1.154 $self->{document}->set_user_data (manakai_source_line => 1);
3450     $self->{document}->set_user_data (manakai_source_column => 1);
3451 wakaba 1.1 } # _initialize_tree_constructor
3452    
3453     sub _terminate_tree_constructor ($) {
         ## Turns strict error checking on |$self->{document}| back on once
         ## tree construction is finished.  Takes only the invocant.
3454     my $self = shift;
3455     $self->{document}->strict_error_checking (1);
3456     ## TODO: Turn mutation events on
3457     } # _terminate_tree_constructor
3458    
3459     ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
3460    
3461 wakaba 1.3 { # tree construction stage
3462     my $token;
3463    
3464 wakaba 1.1 sub _construct_tree ($) {
         ## Entry point of the tree construction stage.  Requests the first
         ## token, clears the per-parse state kept on |$self|
         ## (|form_element|, |head_element|, |open_elements|,
         ## |inner_html_node|), and then runs the handlers for the "initial"
         ## insertion mode, the "before html" insertion mode, and the
         ## remaining insertion modes, in that order.
         ## NOTE(review): |!!!next-token| is a macro of the .src preprocessor;
         ## presumably it loads the next token into the file-scoped |$token|
         ## -- its expansion is not visible here.
3465     my ($self) = @_;
3466    
3467     ## When an interactive UA renders the $self->{document} available
3468     ## to the user, or when it begins accepting user input, is
3469     ## not defined.
3470    
3471     ## Append a character: collect it and all subsequent consecutive
3472     ## characters and insert one Text node whose data is concatenation
3473     ## of all those characters. # MUST
3474    
3475     !!!next-token;
3476    
3477 wakaba 1.3 undef $self->{form_element};
3478     undef $self->{head_element};
3479     $self->{open_elements} = [];
3480     undef $self->{inner_html_node};
3481    
3482 wakaba 1.84 ## NOTE: The "initial" insertion mode.
3483 wakaba 1.3 $self->_tree_construction_initial; # MUST
3484 wakaba 1.84 
3485     ## NOTE: The "before html" insertion mode.
3486 wakaba 1.3 $self->_tree_construction_root_element;
3487 wakaba 1.84 $self->{insertion_mode} = BEFORE_HEAD_IM;
3488    
3489     ## NOTE: The "before head" insertion mode and so on.
3490 wakaba 1.3 $self->_tree_construction_main;
3491     } # _construct_tree
3492    
3493     sub _tree_construction_initial ($) {
         ## Handles tokens in the "initial" insertion mode.
         ##
         ## - DOCTYPE token: reports 'not HTML5' / 'XSLT-compat' errors as
         ##   appropriate, appends a document type node to the document,
         ##   chooses the compat mode ('quirks', 'limited quirks', or left
         ##   unchanged) from the name, public identifier, and system
         ##   identifier, advances to the next token, and returns.
         ## - Comment token: appended to the document; the loop continues.
         ## - Character token: leading white space is dropped; if nothing
         ##   is left the loop continues, otherwise it is handled as below.
         ## - Anything else: reports 'no DOCTYPE', switches the document to
         ##   quirks mode, and returns with the token left in |$token| so
         ##   that it is reprocessed in the "before html" insertion mode.
         ##
         ## NOTE: |!!!cp|, |!!!parse-error|, |!!!next-token|, and
         ## |!!!ack-later| are macros of the .src preprocessor.
3494     my $self = shift;
3495 wakaba 1.84 
3496     ## NOTE: "initial" insertion mode
3497    
3498 wakaba 1.18 INITIAL: {
3499 wakaba 1.55 if ($token->{type} == DOCTYPE_TOKEN) {
3500 wakaba 1.18 ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
3501     ## error, switch to a conformance checking mode for another
3502     ## language.
3503     my $doctype_name = $token->{name};
3504     $doctype_name = '' unless defined $doctype_name;
3505 wakaba 1.159 $doctype_name =~ tr/a-z/A-Z/; # ASCII case-insensitive
3506 wakaba 1.18 if (not defined $token->{name} or # <!DOCTYPE>
3507 wakaba 1.183 defined $token->{sysid}) {
3508 wakaba 1.79 !!!cp ('t1');
3509 wakaba 1.113 !!!parse-error (type => 'not HTML5', token => $token);
3510 wakaba 1.18 } elsif ($doctype_name ne 'HTML') {
3511 wakaba 1.79 !!!cp ('t2');
3512 wakaba 1.113 !!!parse-error (type => 'not HTML5', token => $token);
3513 wakaba 1.183 } elsif (defined $token->{pubid}) {
3514     if ($token->{pubid} eq 'XSLT-compat') {
3515 wakaba 1.159 !!!cp ('t1.2');
3516     !!!parse-error (type => 'XSLT-compat', token => $token,
3517     level => $self->{level}->{should});
3518     } else {
3519     !!!parse-error (type => 'not HTML5', token => $token);
3520     }
3521 wakaba 1.79 } else {
3522     !!!cp ('t3');
3523 wakaba 1.159 #
3524 wakaba 1.18 }
3525    
3526     my $doctype = $self->{document}->create_document_type_definition
3527     ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
3528 wakaba 1.122 ## NOTE: Default value for both |public_id| and |system_id| attributes
3529     ## are empty strings, so that we don't set any value in missing cases.
3530 wakaba 1.183 $doctype->public_id ($token->{pubid}) if defined $token->{pubid};
3531     $doctype->system_id ($token->{sysid}) if defined $token->{sysid};
3532 wakaba 1.18 ## NOTE: Other DocumentType attributes are null or empty lists.
3533     ## ISSUE: internalSubset = null??
3534     $self->{document}->append_child ($doctype);
3535    
3536 wakaba 1.75 if ($token->{quirks} or $doctype_name ne 'HTML') {
3537 wakaba 1.79 !!!cp ('t4');
3538 wakaba 1.18 $self->{document}->manakai_compat_mode ('quirks');
3539 wakaba 1.183 } elsif (defined $token->{pubid}) {
3540     my $pubid = $token->{pubid};
         ## Public identifiers are compared ASCII case-insensitively, so
         ## fold to upper case; the literals below are all upper case.
3541 wakaba 1.18 $pubid =~ tr/a-z/A-Z/;
3542 wakaba 1.143 my $prefix = [
3543     "+//SILMARIL//DTD HTML PRO V0R11 19970101//",
3544     "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
3545     "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
3546     "-//IETF//DTD HTML 2.0 LEVEL 1//",
3547     "-//IETF//DTD HTML 2.0 LEVEL 2//",
3548     "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//",
3549     "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//",
3550     "-//IETF//DTD HTML 2.0 STRICT//",
3551     "-//IETF//DTD HTML 2.0//",
3552     "-//IETF//DTD HTML 2.1E//",
3553     "-//IETF//DTD HTML 3.0//",
3554     "-//IETF//DTD HTML 3.2 FINAL//",
3555     "-//IETF//DTD HTML 3.2//",
3556     "-//IETF//DTD HTML 3//",
3557     "-//IETF//DTD HTML LEVEL 0//",
3558     "-//IETF//DTD HTML LEVEL 1//",
3559     "-//IETF//DTD HTML LEVEL 2//",
3560     "-//IETF//DTD HTML LEVEL 3//",
3561     "-//IETF//DTD HTML STRICT LEVEL 0//",
3562     "-//IETF//DTD HTML STRICT LEVEL 1//",
3563     "-//IETF//DTD HTML STRICT LEVEL 2//",
3564     "-//IETF//DTD HTML STRICT LEVEL 3//",
3565     "-//IETF//DTD HTML STRICT//",
3566     "-//IETF//DTD HTML//",
3567     "-//METRIUS//DTD METRIUS PRESENTATIONAL//",
3568     "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//",
3569     "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//",
3570     "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//",
3571     "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//",
3572     "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//",
3573     "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//",
3574     "-//NETSCAPE COMM. CORP.//DTD HTML//",
3575     "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//",
3576     "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//",
3577     "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//",
3578     "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//",
3579     "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//",
3580     "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//",
3581     "-//SPYGLASS//DTD HTML 2.0 EXTENDED//",
3582     "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//",
3583     "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//",
3584     "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//",
3585     "-//W3C//DTD HTML 3 1995-03-24//",
3586     "-//W3C//DTD HTML 3.2 DRAFT//",
3587     "-//W3C//DTD HTML 3.2 FINAL//",
3588     "-//W3C//DTD HTML 3.2//",
3589     "-//W3C//DTD HTML 3.2S DRAFT//",
3590     "-//W3C//DTD HTML 4.0 FRAMESET//",
3591     "-//W3C//DTD HTML 4.0 TRANSITIONAL//",
3592     "-//W3C//DTD HTML EXPERIMENTAL 19960712//",
3593     "-//W3C//DTD HTML EXPERIMENTAL 970421//",
3594     "-//W3C//DTD W3 HTML//",
3595     "-//W3O//DTD W3 HTML 3.0//",
3596     "-//WEBTECHS//DTD MOZILLA HTML 2.0//",
3597     "-//WEBTECHS//DTD MOZILLA HTML//",
3598     ]; # $prefix
3599     my $match;
3600     for (@$prefix) {
         ## Does the (upper-cased) public identifier start with this
         ## quirks-mode prefix?
3601     if (substr ($pubid, 0, length $_) eq $_) {
3602     $match = 1;
3603     last;
3604     }
3605     }
3606     if ($match or
3607     $pubid eq "-//W3O//DTD W3 HTML STRICT 3.0//EN//" or
3608     $pubid eq "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" or
3609     $pubid eq "HTML") {
3610 wakaba 1.79 !!!cp ('t5');
3611 wakaba 1.18 $self->{document}->manakai_compat_mode ('quirks');
3612 wakaba 1.143 } elsif ($pubid =~ m[^-//W3C//DTD HTML 4.01 FRAMESET//] or
3613     $pubid =~ m[^-//W3C//DTD HTML 4.01 TRANSITIONAL//]) {
         ## HTML 4.01 Frameset/Transitional: quirks when the system
         ## identifier is missing, limited quirks when it is present.
3614 wakaba 1.183 if (not defined $token->{sysid}) {
3615 wakaba 1.79 !!!cp ('t6');
3616 wakaba 1.18 $self->{document}->manakai_compat_mode ('quirks');
3617     } else {
3618 wakaba 1.79 !!!cp ('t7');
3619 wakaba 1.18 $self->{document}->manakai_compat_mode ('limited quirks');
3620 wakaba 1.3 }
3621 wakaba 1.143 } elsif ($pubid =~ m[^-//W3C//DTD XHTML 1.0 FRAMESET//] or
3622     $pubid =~ m[^-//W3C//DTD XHTML 1.0 TRANSITIONAL//]) {
3623 wakaba 1.79 !!!cp ('t8');
3624 wakaba 1.18 $self->{document}->manakai_compat_mode ('limited quirks');
3625 wakaba 1.79 } else {
3626     !!!cp ('t9');
3627 wakaba 1.18 }
3628 wakaba 1.79 } else {
3629     !!!cp ('t10');
3630 wakaba 1.18 }
3631 wakaba 1.183 if (defined $token->{sysid}) {
3632     my $sysid = $token->{sysid};
3633 wakaba 1.18 $sysid =~ tr/A-Z/a-z/;
3634     if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
3635 wakaba 1.143 ## NOTE: Ensure that |PUBLIC "(limited quirks)" "(quirks)"| is
3636     ## marked as quirks.
3637 wakaba 1.18 $self->{document}->manakai_compat_mode ('quirks');
3638 wakaba 1.79 !!!cp ('t11');
3639     } else {
3640     !!!cp ('t12');
3641 wakaba 1.18 }
3642 wakaba 1.79 } else {
3643     !!!cp ('t13');
3644 wakaba 1.18 }
3645    
3646 wakaba 1.84 ## Go to the "before html" insertion mode.
3647 wakaba 1.18 !!!next-token;
3648     return;
3649     } elsif ({
3650 wakaba 1.55 START_TAG_TOKEN, 1,
3651     END_TAG_TOKEN, 1,
3652     END_OF_FILE_TOKEN, 1,
3653 wakaba 1.18 }->{$token->{type}}) {
3654 wakaba 1.79 !!!cp ('t14');
3655 wakaba 1.113 !!!parse-error (type => 'no DOCTYPE', token => $token);
3656 wakaba 1.18 $self->{document}->manakai_compat_mode ('quirks');
3657 wakaba 1.84 ## Go to the "before html" insertion mode.
3658 wakaba 1.18 ## reprocess
3659 wakaba 1.125 !!!ack-later;
3660 wakaba 1.18 return;
3661 wakaba 1.55 } elsif ($token->{type} == CHARACTER_TOKEN) {
3662 wakaba 1.188 if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
3663 wakaba 1.18 ## Ignore the token
3664 wakaba 1.26 
3665 wakaba 1.18 unless (length $token->{data}) {
3666 wakaba 1.79 !!!cp ('t15');
3667 wakaba 1.84 ## Stay in the insertion mode.
3668 wakaba 1.18 !!!next-token;
3669     redo INITIAL;
3670 wakaba 1.79 } else {
3671     !!!cp ('t16');
3672 wakaba 1.3 }
3673 wakaba 1.79 } else {
3674     !!!cp ('t17');
3675 wakaba 1.3 }
3676 wakaba 1.18 
3677 wakaba 1.113 !!!parse-error (type => 'no DOCTYPE', token => $token);
3678 wakaba 1.18 $self->{document}->manakai_compat_mode ('quirks');
3679 wakaba 1.84 ## Go to the "before html" insertion mode.
3680 wakaba 1.18 ## reprocess
3681     return;
3682 wakaba 1.55 } elsif ($token->{type} == COMMENT_TOKEN) {
3683 wakaba 1.79 !!!cp ('t18');
3684 wakaba 1.18 my $comment = $self->{document}->create_comment ($token->{data});
3685     $self->{document}->append_child ($comment);
3686    
3687 wakaba 1.84 ## Stay in the insertion mode.
3688 wakaba 1.18 !!!next-token;
3689     redo INITIAL;
3690     } else {
3691 wakaba 1.55 die "$0: $token->{type}: Unknown token type";
3692 wakaba 1.18 }
3693     } # INITIAL
3694 wakaba 1.79 
3695     die "$0: _tree_construction_initial: This should be never reached";
3696 wakaba 1.3 } # _tree_construction_initial
3697    
3698     sub _tree_construction_root_element ($) {
         ## Handles tokens in the "before html" insertion mode.
         ##
         ## - DOCTYPE token: reported as an error and ignored.
         ## - Comment token: appended to the document.
         ## - Character token: leading white space is dropped; a token that
         ##   becomes empty is skipped.
         ## - |html| start tag: the root element is created from the token,
         ##   appended to the document and pushed onto |open_elements|; the
         ##   |application_cache_selection| callback receives the value of
         ##   the |manifest| attribute, or |undef| if there is none.
         ## - Anything else: an |html| element without attributes is
         ##   created, appended and pushed; the callback is invoked with
         ##   |undef| exactly once, and the token is left in |$token| to be
         ##   reprocessed in the "before head" insertion mode.
         ##
         ## NOTE: |!!!cp|, |!!!parse-error|, |!!!next-token|,
         ## |!!!create-element|, |!!!nack|, and |!!!ack-later| are macros of
         ## the .src preprocessor.
3699     my $self = shift;
3700 wakaba 1.84 
3701     ## NOTE: "before html" insertion mode.
3702 wakaba 1.3 
3703     B: {
3704 wakaba 1.55 if ($token->{type} == DOCTYPE_TOKEN) {
3705 wakaba 1.79 !!!cp ('t19');
3706 wakaba 1.153 !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
3707 wakaba 1.3 ## Ignore the token
3708 wakaba 1.84 ## Stay in the insertion mode.
3709 wakaba 1.3 !!!next-token;
3710     redo B;
3711 wakaba 1.55 } elsif ($token->{type} == COMMENT_TOKEN) {
3712 wakaba 1.79 !!!cp ('t20');
3713 wakaba 1.3 my $comment = $self->{document}->create_comment ($token->{data});
3714     $self->{document}->append_child ($comment);
3715 wakaba 1.84 ## Stay in the insertion mode.
3716 wakaba 1.3 !!!next-token;
3717     redo B;
3718 wakaba 1.55 } elsif ($token->{type} == CHARACTER_TOKEN) {
3719 wakaba 1.188 if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
3720 wakaba 1.26 ## Ignore the token.
3721    
3722 wakaba 1.3 unless (length $token->{data}) {
3723 wakaba 1.79 !!!cp ('t21');
3724 wakaba 1.84 ## Stay in the insertion mode.
3725 wakaba 1.3 !!!next-token;
3726     redo B;
3727 wakaba 1.79 } else {
3728     !!!cp ('t22');
3729 wakaba 1.3 }
3730 wakaba 1.79 } else {
3731     !!!cp ('t23');
3732 wakaba 1.3 }
3733 wakaba 1.61 
3734     ## NOTE: The application cache selection callback is invoked once,
         ## after the implied |html| element is created below; it must not
         ## also be invoked here.
3735    
3736     #
3737     } elsif ($token->{type} == START_TAG_TOKEN) {
3738 wakaba 1.84 if ($token->{tag_name} eq 'html') {
3739     my $root_element;
3740 wakaba 1.126 !!!create-element ($root_element, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
3741 wakaba 1.84 $self->{document}->append_child ($root_element);
3742 wakaba 1.123 push @{$self->{open_elements}},
3743     [$root_element, $el_category->{html}];
3744 wakaba 1.84 
3745     if ($token->{attributes}->{manifest}) {
3746     !!!cp ('t24');
3747     $self->{application_cache_selection}
3748     ->($token->{attributes}->{manifest}->{value});
3749 wakaba 1.118 ## ISSUE: Spec is unclear on relative references.
3750     ## According to Hixie (#whatwg 2008-03-19), it should be
3751     ## resolved against the base URI of the document in HTML
3752     ## or xml:base of the element in XHTML.
3753 wakaba 1.84 } else {
3754     !!!cp ('t25');
3755     $self->{application_cache_selection}->(undef);
3756     }
3757    
3758 wakaba 1.125 !!!nack ('t25c');
3759    
3760 wakaba 1.84 !!!next-token;
3761     return; ## Go to the "before head" insertion mode.
3762 wakaba 1.61 } else {
3763 wakaba 1.84 !!!cp ('t25.1');
3764     #
3765 wakaba 1.61 }
3766 wakaba 1.3 } elsif ({
3767 wakaba 1.55 END_TAG_TOKEN, 1,
3768     END_OF_FILE_TOKEN, 1,
3769 wakaba 1.3 }->{$token->{type}}) {
3770 wakaba 1.79 !!!cp ('t26');
3771 wakaba 1.3 #
3772     } else {
3773 wakaba 1.55 die "$0: $token->{type}: Unknown token type";
3774 wakaba 1.3 }
3775 wakaba 1.61 
         ## "Anything else": create the implied |html| root element.
3776 wakaba 1.126 my $root_element;
3777     !!!create-element ($root_element, $HTML_NS, 'html',, $token);
3778 wakaba 1.84 $self->{document}->append_child ($root_element);
3779 wakaba 1.123 push @{$self->{open_elements}}, [$root_element, $el_category->{html}];
3780 wakaba 1.84 
3781     $self->{application_cache_selection}->(undef);
3782    
3783     ## NOTE: Reprocess the token.
3784 wakaba 1.125 !!!ack-later;
3785 wakaba 1.84 return; ## Go to the "before head" insertion mode.
3786    
3787     ## ISSUE: There is an issue in the spec
3788 wakaba 1.3 } # B
3789 wakaba 1.79 
3790     die "$0: _tree_construction_root_element: This should never be reached";
3791 wakaba 1.3 } # _tree_construction_root_element
3792    
3793     sub _reset_insertion_mode ($) {
         ## Resets |$self->{insertion_mode}| by walking |open_elements| from
         ## the last entry towards the first and choosing the mode that
         ## corresponds to the first element with a known mapping.  Entries
         ## are used as |[node, category flags]| pairs.
         ##
         ## Dies with "_reset_insertion_mode: t27" when the first entry of
         ## |open_elements| is reached while |$self->{inner_html_node}| is
         ## not defined.
3794     my $self = shift;
3795    
3796     ## Step 1
3797     my $last;
3798    
3799     ## Step 2
3800     my $i = -1;
3801     my $node = $self->{open_elements}->[$i];
3802    
3803     ## Step 3
3804     S3: {
3805 wakaba 1.29 if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
         ## Reached the bottom of the stack: continue with
         ## |inner_html_node| in place of the stack entry.
3806     $last = 1;
3807     if (defined $self->{inner_html_node}) {
3808 wakaba 1.140 !!!cp ('t28');
3809     $node = $self->{inner_html_node};
3810     } else {
3811     die "_reset_insertion_mode: t27";
3812 wakaba 1.3 }
3813     }
3814 wakaba 1.140 
3815     ## Step 4..14
3816     my $new_mode;
3817     if ($node->[1] & FOREIGN_EL) {
3818     !!!cp ('t28.1');
3819     ## NOTE: Strictly speaking, the line below only applies to MathML and
3820     ## SVG elements. Currently the HTML syntax supports only MathML and
3821     ## SVG elements as foreigners.
3822 wakaba 1.148 $new_mode = IN_BODY_IM | IN_FOREIGN_CONTENT_IM;
3823 wakaba 1.140 } elsif ($node->[1] & TABLE_CELL_EL) {
3824     if ($last) {
3825     !!!cp ('t28.2');
         ## |$new_mode| stays undefined; Step 16 below then selects
         ## |IN_BODY_IM|.
3826     #
3827     } else {
3828     !!!cp ('t28.3');
3829     $new_mode = IN_CELL_IM;
3830     }
3831     } else {
3832     !!!cp ('t28.4');
         ## Look the mode up by local name; undefined for any other name.
3833     $new_mode = {
3834 wakaba 1.54 select => IN_SELECT_IM,
3835 wakaba 1.83 ## NOTE: |option| and |optgroup| do not set
3836     ## insertion mode to "in select" by themselves.
3837 wakaba 1.54 tr => IN_ROW_IM,
3838     tbody => IN_TABLE_BODY_IM,
3839     thead => IN_TABLE_BODY_IM,
3840     tfoot => IN_TABLE_BODY_IM,
3841     caption => IN_CAPTION_IM,
3842     colgroup => IN_COLUMN_GROUP_IM,
3843     table => IN_TABLE_IM,
3844     head => IN_BODY_IM, # not in head!
3845     body => IN_BODY_IM,
3846     frameset => IN_FRAMESET_IM,
3847 wakaba 1.123 }->{$node->[0]->manakai_local_name};
3848 wakaba 1.140 }
         ## NOTE(review): the |and return| only fires when |$new_mode| is a
         ## true value; this assumes no insertion mode constant is 0 --
         ## confirm against the constant definitions.
3849     $self->{insertion_mode} = $new_mode and return if defined $new_mode;
3850 wakaba 1.3 
3851 wakaba 1.126 ## Step 15
3852 wakaba 1.123 if ($node->[1] & HTML_EL) {
3853 wakaba 1.3 unless (defined $self->{head_element}) {
3854 wakaba 1.79 !!!cp ('t29');
3855 wakaba 1.54 $self->{insertion_mode} = BEFORE_HEAD_IM;
3856 wakaba 1.3 } else {
3857 wakaba 1.81 ## ISSUE: Can this state be reached?
3858 wakaba 1.79 !!!cp ('t30');
3859 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
3860 wakaba 1.3 }
3861     return;
3862 wakaba 1.79 } else {
3863     !!!cp ('t31');
3864 wakaba 1.3 }
3865    
3866 wakaba 1.126 ## Step 16
3867 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM and return if $last;
3868 wakaba 1.3 
3869 wakaba 1.126 ## Step 17
3870 wakaba 1.3 $i--;
3871     $node = $self->{open_elements}->[$i];
3872    
3873 wakaba 1.126 ## Step 18
3874 wakaba 1.3 redo S3;
3875     } # S3
3876 wakaba 1.79 
3877     die "$0: _reset_insertion_mode: This line should never be reached";
3878 wakaba 1.3 } # _reset_insertion_mode
3879    
3880     sub _tree_construction_main ($) {
3881     my $self = shift;
3882    
3883 wakaba 1.1 my $active_formatting_elements = [];
3884    
3885     my $reconstruct_active_formatting_elements = sub { # MUST
         ## Re-opens the trailing entries of |$active_formatting_elements|
         ## that are not in |open_elements|: each such entry is shallow
         ## cloned, the clone is passed to the |$insert| callback given as
         ## the only argument, pushed onto |open_elements|, and stored in
         ## the list in place of the original entry.  Does nothing when the
         ## list is empty, or when its last entry is a '#marker' or is
         ## already in |open_elements|.
3886     my $insert = shift;
3887    
3888     ## Step 1
3889     return unless @$active_formatting_elements;
3890    
3891     ## Step 3
3892     my $i = -1;
3893     my $entry = $active_formatting_elements->[$i];
3894    
3895     ## Step 2
3896     return if $entry->[0] eq '#marker';
3897 wakaba 1.3 for (@{$self->{open_elements}}) {
3898 wakaba 1.1 if ($entry->[0] eq $_->[0]) {
3899 wakaba 1.79 !!!cp ('t32');
3900 wakaba 1.1 return;
3901     }
3902     }
3903    
         ## Steps 4-7: walk backwards to find the earliest entry that has to
         ## be re-opened.
3904     S4: {
3905     ## Step 4
3906     last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
3907    
3908     ## Step 5
3909     $i--;
3910     $entry = $active_formatting_elements->[$i];
3911    
3912     ## Step 6
3913     if ($entry->[0] eq '#marker') {
3914 wakaba 1.81 !!!cp ('t33_1');
3915 wakaba 1.1 #
3916     } else {
3917     my $in_open_elements;
3918 wakaba 1.3 OE: for (@{$self->{open_elements}}) {
3919 wakaba 1.1 if ($entry->[0] eq $_->[0]) {
3920 wakaba 1.79 !!!cp ('t33');
3921 wakaba 1.1 $in_open_elements = 1;
3922     last OE;
3923     }
3924     }
3925     if ($in_open_elements) {
3926 wakaba 1.79 !!!cp ('t34');
3927 wakaba 1.1 #
3928     } else {
3929 wakaba 1.81 ## NOTE: <!DOCTYPE HTML><p><b><i><u></p> <p>X
3930 wakaba 1.79 !!!cp ('t35');
3931 wakaba 1.1 redo S4;
3932     }
3933     }
3934    
3935     ## Step 7
3936     $i++;
3937     $entry = $active_formatting_elements->[$i];
3938     } # S4
3939    
         ## Steps 8-11: clone and re-insert entries up to the end of the list.
3940     S7: {
3941     ## Step 8
3942     my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
3943    
3944     ## Step 9
3945     $insert->($clone->[0]);
3946 wakaba 1.3 push @{$self->{open_elements}}, $clone;
3947 wakaba 1.1 
3948     ## Step 10
3949 wakaba 1.3 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
3950 wakaba 1.1 
3951     ## Step 11
3952     unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
3953 wakaba 1.79 !!!cp ('t36');
3954 wakaba 1.1 ## Step 7'
3955     $i++;
3956     $entry = $active_formatting_elements->[$i];
3957    
3958     redo S7;
3959     }
3960 wakaba 1.79 
3961     !!!cp ('t37');
3962 wakaba 1.1 } # S7
3963     }; # $reconstruct_active_formatting_elements
3964    
3965     my $clear_up_to_marker = sub {
         ## Removes the last '#marker' entry of |$active_formatting_elements|
         ## together with every entry after it.  When the list contains no
         ## marker, it is left unchanged.
3966     for (reverse 0..$#$active_formatting_elements) {
3967     if ($active_formatting_elements->[$_]->[0] eq '#marker') {
3968 wakaba 1.79 !!!cp ('t38');
         ## Two-argument |splice| drops everything from index |$_| onwards.
3969 wakaba 1.1 splice @$active_formatting_elements, $_;
3970     return;
3971     }
3972     }
3973 wakaba 1.79 
3974     !!!cp ('t39');
3975 wakaba 1.1 }; # $clear_up_to_marker
3976    
3977 wakaba 1.96 my $insert;
3978    
3979     my $parse_rcdata = sub ($) {
         ## Processes an element whose content is raw text, starting from
         ## the start tag token currently in |$token|.  The only argument is
         ## the content model flag to tokenize the content with
         ## (|CDATA_CONTENT_MODEL| or |RCDATA_CONTENT_MODEL|; any other value
         ## dies when the matching end tag is missing).  The element is
         ## created and passed to |$insert|, the following character tokens
         ## are concatenated into a single text node child, the content
         ## model is restored to |PCDATA_CONTENT_MODEL|, and a parse error is
         ## reported unless the next token is the matching end tag.
3980     my ($content_model_flag) = @_;
3981 wakaba 1.25 
3982     ## Step 1
3983     my $start_tag_name = $token->{tag_name};
3984     my $el;
3985 wakaba 1.126 !!!create-element ($el, $HTML_NS, $start_tag_name, $token->{attributes}, $token);
3986 wakaba 1.25 
3987     ## Step 2
3988 wakaba 1.96 $insert->($el);
3989 wakaba 1.25 
3990     ## Step 3
3991 wakaba 1.40 $self->{content_model} = $content_model_flag; # CDATA or RCDATA
3992 wakaba 1.13 delete $self->{escape}; # MUST
3993 wakaba 1.25 
3994     ## Step 4
3995 wakaba 1.1 my $text = '';
3996 wakaba 1.125 !!!nack ('t40.1');
3997 wakaba 1.1 !!!next-token;
3998 wakaba 1.55 while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
3999 wakaba 1.79 !!!cp ('t40');
4000 wakaba 1.1 $text .= $token->{data};
4001     !!!next-token;
4002 wakaba 1.25 }
4003    
4004     ## Step 5
4005 wakaba 1.1 if (length $text) {
4006 wakaba 1.79 !!!cp ('t41');
         ## The inner |my $text| (a text node) shadows the outer string;
         ## the argument on the right-hand side is still the outer string.
4007 wakaba 1.25 my $text = $self->{document}->create_text_node ($text);
4008     $el->append_child ($text);
4009 wakaba 1.1 }
4010 wakaba 1.25 
4011     ## Step 6
4012 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL;
4013 wakaba 1.25 
4014     ## Step 7
4015 wakaba 1.79 if ($token->{type} == END_TAG_TOKEN and
4016     $token->{tag_name} eq $start_tag_name) {
4017     !!!cp ('t42');
4018 wakaba 1.1 ## Ignore the token
4019     } else {
4020 wakaba 1.96 ## NOTE: An end-of-file token.
4021     if ($content_model_flag == CDATA_CONTENT_MODEL) {
4022     !!!cp ('t43');
4023 wakaba 1.153 !!!parse-error (type => 'in CDATA:#eof', token => $token);
4024 wakaba 1.96 } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
4025     !!!cp ('t44');
4026 wakaba 1.153 !!!parse-error (type => 'in RCDATA:#eof', token => $token);
4027 wakaba 1.96 } else {
4028     die "$0: $content_model_flag in parse_rcdata";
4029     }
4030 wakaba 1.1 }
4031     !!!next-token;
4032 wakaba 1.25 }; # $parse_rcdata
4033 wakaba 1.1
4034 wakaba 1.96 my $script_start_tag = sub () {
         ## Processes a |script| element, starting from the start tag token
         ## currently in |$token|.  The element is created, the following
         ## character tokens (tokenized with |CDATA_CONTENT_MODEL|) are
         ## appended to it as text, the content model is restored to
         ## |PCDATA_CONTENT_MODEL|, and a parse error is reported unless the
         ## next token is a |script| end tag.  The element is passed to
         ## |$insert| only when |$self->{inner_html_node}| is not defined.
4035 wakaba 1.1 my $script_el;
4036 wakaba 1.126 !!!create-element ($script_el, $HTML_NS, 'script', $token->{attributes}, $token);
4037 wakaba 1.1 ## TODO: mark as "parser-inserted"
4038    
4039 wakaba 1.40 $self->{content_model} = CDATA_CONTENT_MODEL;
4040 wakaba 1.13 delete $self->{escape}; # MUST
4041 wakaba 1.1 
4042     my $text = '';
4043 wakaba 1.125 !!!nack ('t45.1');
4044 wakaba 1.1 !!!next-token;
4045 wakaba 1.55 while ($token->{type} == CHARACTER_TOKEN) {
4046 wakaba 1.79 !!!cp ('t45');
4047 wakaba 1.1 $text .= $token->{data};
4048     !!!next-token;
4049     } # stop if non-character token or tokenizer stops tokenising
4050     if (length $text) {
4051 wakaba 1.79 !!!cp ('t46');
4052 wakaba 1.1 $script_el->manakai_append_text ($text);
4053     }
4054    
4055 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL;
4056 wakaba 1.1 
4057 wakaba 1.55 if ($token->{type} == END_TAG_TOKEN and
4058 wakaba 1.1 $token->{tag_name} eq 'script') {
4059 wakaba 1.79 !!!cp ('t47');
4060 wakaba 1.1 ## Ignore the token
4061     } else {
4062 wakaba 1.79 !!!cp ('t48');
4063 wakaba 1.153 !!!parse-error (type => 'in CDATA:#eof', token => $token);
4064 wakaba 1.1 ## ISSUE: And ignore?
4065     ## TODO: mark as "already executed"
4066     }
4067    
4068 wakaba 1.3 if (defined $self->{inner_html_node}) {
4069 wakaba 1.79 !!!cp ('t49');
4070 wakaba 1.3 ## TODO: mark as "already executed"
4071     } else {
4072 wakaba 1.79 !!!cp ('t50');
4073 wakaba 1.1 ## TODO: $old_insertion_point = current insertion point
4074     ## TODO: insertion point = just before the next input character
4075 wakaba 1.25 
4076     $insert->($script_el);
4077 wakaba 1.1 
4078     ## TODO: insertion point = $old_insertion_point (might be "undefined")
4079    
4080     ## TODO: if there is a script that will execute as soon as the parser resume, then...
4081     }
4082    
4083     !!!next-token;
4084     }; # $script_start_tag
4085    
4086 wakaba 1.102 ## NOTE: $open_tables->[-1]->[0] is the "current table" element node.
4087     ## NOTE: $open_tables->[-1]->[1] is the "tainted" flag.
4088     my $open_tables = [[$self->{open_elements}->[0]->[0]]];
4089    
## $formatting_end_tag->($end_tag_token)
##
## Processes an end tag token for a formatting element using the
## adoption agency algorithm (AAA).  The only argument is the end tag
## token; its |tag_name| selects the formatting element to close.
## The closure reads and modifies |$self->{open_elements}|,
## |$active_formatting_elements| and |$open_tables| of the enclosing
## scope, and also reads the enclosing |$token|.
##
## The body is the block |FET|, which is restarted by |redo FET|
## (Step 14) until one of the |return| paths is taken: no matching
## entry in the list of active formatting elements, the element is not
## in the stack of open elements or not in scope, or there is no
## furthest block (Step 3).  Every |return| path is preceded by
## |!!!next-token|.
##
## NOTE: Lines beginning with |!!!| are source-level macros; their
## expansion is not visible in this part of the file.
4090 wakaba 1.1 my $formatting_end_tag = sub {
4091 wakaba 1.113 my $end_tag_token = shift;
4092     my $tag_name = $end_tag_token->{tag_name};
4093 wakaba 1.1
4094 wakaba 1.103 ## NOTE: The adoption agency algorithm (AAA).
4095 wakaba 1.102
4096 wakaba 1.1 FET: {
4097     ## Step 1
## Search the list of active formatting elements from its end, stopping
## at the last '#marker' entry, for an element whose local name equals
## the tag name of the end tag.
4098     my $formatting_element;
4099     my $formatting_element_i_in_active;
4100     AFE: for (reverse 0..$#$active_formatting_elements) {
4101 wakaba 1.123 if ($active_formatting_elements->[$_]->[0] eq '#marker') {
4102     !!!cp ('t52');
4103     last AFE;
4104     } elsif ($active_formatting_elements->[$_]->[0]->manakai_local_name
4105     eq $tag_name) {
4106 wakaba 1.79 !!!cp ('t51');
4107 wakaba 1.1 $formatting_element = $active_formatting_elements->[$_];
4108     $formatting_element_i_in_active = $_;
4109     last AFE;
4110     }
4111     } # AFE
## No such entry: report the end tag as unmatched and drop the token.
4112     unless (defined $formatting_element) {
4113 wakaba 1.79 !!!cp ('t53');
4114 wakaba 1.153 !!!parse-error (type => 'unmatched end tag', text => $tag_name, token => $end_tag_token);
4115 wakaba 1.1 ## Ignore the token
4116     !!!next-token;
4117     return;
4118     }
4119     ## has an element in scope
## Walk the stack of open elements from the current node upward.
## |$in_scope| is cleared as soon as a node flagged |SCOPING_EL| is
## passed, so finding the formatting element after that point means it
## is in the stack but not in scope.
4120     my $in_scope = 1;
4121     my $formatting_element_i_in_open;
4122 wakaba 1.3 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4123     my $node = $self->{open_elements}->[$_];
4124 wakaba 1.1 if ($node->[0] eq $formatting_element->[0]) {
4125     if ($in_scope) {
4126 wakaba 1.79 !!!cp ('t54');
4127 wakaba 1.1 $formatting_element_i_in_open = $_;
4128     last INSCOPE;
4129     } else { # in open elements but not in scope
4130 wakaba 1.79 !!!cp ('t55');
## NOTE(review): This error (and the one at 't57' below) takes its text
## from the enclosing |$token|, not from |$tag_name| as at 't53'.  The
## two agree only if the caller passes the current token -- confirm.
4131 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
4132     text => $token->{tag_name},
4133 wakaba 1.113 token => $end_tag_token);
4134 wakaba 1.1 ## Ignore the token
4135     !!!next-token;
4136     return;
4137     }
4138 wakaba 1.123 } elsif ($node->[1] & SCOPING_EL) {
4139 wakaba 1.79 !!!cp ('t56');
4140 wakaba 1.1 $in_scope = 0;
4141     }
4142     } # INSCOPE
## The formatting element is not in the stack of open elements at all.
4143     unless (defined $formatting_element_i_in_open) {
4144 wakaba 1.79 !!!cp ('t57');
4145 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
4146     text => $token->{tag_name},
4147 wakaba 1.113 token => $end_tag_token);
## NOTE(review): |pop| removes the last entry of the list.  That is the
## matched formatting element only when it is the last entry; its index
## is |$formatting_element_i_in_active| -- confirm this is intended.
4148 wakaba 1.1 pop @$active_formatting_elements; # $formatting_element
4149     !!!next-token; ## TODO: ok?
4150     return;
4151     }
## The formatting element is in scope but is not the current node:
## report the current node as not closed and continue.
4152 wakaba 1.3 if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
4153 wakaba 1.79 !!!cp ('t58');
4154 wakaba 1.122 !!!parse-error (type => 'not closed',
4155 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
4156 wakaba 1.122 ->manakai_local_name,
4157 wakaba 1.113 token => $end_tag_token);
4158 wakaba 1.1 }
4159    
4160     ## Step 2
## Find the furthest block.  The scan runs from the current node up to
## the formatting element and overwrites |$furthest_block| on every
## match, so the final value is the matching node nearest to the
## formatting element.  A node matches if it is not flagged
## |FORMATTING_EL| and is flagged |SPECIAL_EL| or |SCOPING_EL|.
4161     my $furthest_block;
4162     my $furthest_block_i_in_open;
4163 wakaba 1.3 OE: for (reverse 0..$#{$self->{open_elements}}) {
4164     my $node = $self->{open_elements}->[$_];
4165 wakaba 1.123 if (not ($node->[1] & FORMATTING_EL) and
4166     #not $phrasing_category->{$node->[1]} and
4167 wakaba 1.123 ($node->[1] & SPECIAL_EL or
4168     $node->[1] & SCOPING_EL)) { ## Scoping is redundant, maybe
4169 wakaba 1.79 !!!cp ('t59');
4170 wakaba 1.1 $furthest_block = $node;
4171     $furthest_block_i_in_open = $_;
4172     } elsif ($node->[0] eq $formatting_element->[0]) {
4173 wakaba 1.79 !!!cp ('t60');
4174 wakaba 1.1 last OE;
4175     }
4176     } # OE
4177    
4178     ## Step 3
## No furthest block: remove the formatting element and everything
## after it from the stack of open elements (|splice| without a length
## removes through the end), drop its entry from the list of active
## formatting elements, and finish.
4179     unless (defined $furthest_block) { # MUST
4180 wakaba 1.79 !!!cp ('t61');
4181 wakaba 1.3 splice @{$self->{open_elements}}, $formatting_element_i_in_open;
4182 wakaba 1.1 splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
4183     !!!next-token;
4184     return;
4185     }
4186    
4187     ## Step 4
## The common ancestor is the entry just before the formatting element
## in the stack of open elements.
4188 wakaba 1.3 my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];
4189 wakaba 1.1
4190     ## Step 5
## Detach the furthest block from its parent node, if it has one.
4191     my $furthest_block_parent = $furthest_block->[0]->parent_node;
4192     if (defined $furthest_block_parent) {
4193 wakaba 1.79 !!!cp ('t62');
4194 wakaba 1.1 $furthest_block_parent->remove_child ($furthest_block->[0]);
4195     }
4196    
4197     ## Step 6
## The bookmark is kept as the element of the entry that precedes the
## formatting element in the list of active formatting elements.
## NOTE(review): If |$formatting_element_i_in_active| is 0, index -1
## selects the last entry of the list -- confirm this is intended.
4198     my $bookmark_prev_el
4199     = $active_formatting_elements->[$formatting_element_i_in_active - 1]
4200     ->[0];
4201    
4202     ## Step 7
## Walk up the stack of open elements from the furthest block toward
## the formatting element, chaining each visited node (or its clone)
## above the previously visited one.
4203     my $node = $furthest_block;
4204     my $node_i_in_open = $furthest_block_i_in_open;
4205     my $last_node = $furthest_block;
4206     S7: {
4207     ## Step 1
4208     $node_i_in_open--;
4209 wakaba 1.3 $node = $self->{open_elements}->[$node_i_in_open];
4210 wakaba 1.1
4211     ## Step 2
## If |$node| has an entry in the list of active formatting elements,
## remember its index and leave |S7S2|.  Otherwise remove |$node| from
## the stack of open elements and restart |S7|; the decrement at the
## top of |S7| then moves on to the entry before the removed one.
4212     my $node_i_in_active;
4213     S7S2: {
4214     for (reverse 0..$#$active_formatting_elements) {
4215     if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
4216 wakaba 1.79 !!!cp ('t63');
4217 wakaba 1.1 $node_i_in_active = $_;
4218     last S7S2;
4219     }
4220     }
4221 wakaba 1.3 splice @{$self->{open_elements}}, $node_i_in_open, 1;
4222 wakaba 1.1 redo S7;
4223     } # S7S2
4224    
4225     ## Step 3
4226     last S7 if $node->[0] eq $formatting_element->[0];
4227    
4228     ## Step 4
## While the furthest block is still the last node, move the bookmark
## to the current node.
4229     if ($last_node->[0] eq $furthest_block->[0]) {
4230 wakaba 1.79 !!!cp ('t64');
4231 wakaba 1.1 $bookmark_prev_el = $node->[0];
4232     }
4233    
4234     ## Step 5
## A node that already has children is replaced, in both the list of
## active formatting elements and the stack of open elements, by a
## clone carrying the same category flags.  (The argument 0 to
## |clone_node| presumably requests a clone without children --
## confirm against the DOM implementation in use.)
4235     if ($node->[0]->has_child_nodes ()) {
4236 wakaba 1.79 !!!cp ('t65');
4237 wakaba 1.1 my $clone = [$node->[0]->clone_node (0), $node->[1]];
4238     $active_formatting_elements->[$node_i_in_active] = $clone;
4239 wakaba 1.3 $self->{open_elements}->[$node_i_in_open] = $clone;
4240 wakaba 1.1 $node = $clone;
4241     }
4242    
4243     ## Step 6
4244     $node->[0]->append_child ($last_node->[0]);
4245    
4246     ## Step 7
4247     $last_node = $node;
4248    
4249     ## Step 8
4250     redo S7;
4251     } # S7
4252    
4253     ## Step 8
## Insert the last node.  If the common ancestor is flagged
## |TABLE_ROWS_EL|, the node is foster parented using the same search
## as in |$insert_to_foster| and the current table is marked as
## tainted; otherwise it is appended to the common ancestor.
4254 wakaba 1.123 if ($common_ancestor_node->[1] & TABLE_ROWS_EL) {
4255 wakaba 1.102 my $foster_parent_element;
4256     my $next_sibling;
4257 wakaba 1.123 OE: for (reverse 0..$#{$self->{open_elements}}) {
4258     if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
4259 wakaba 1.102 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4260     if (defined $parent and $parent->node_type == 1) {
4261     !!!cp ('t65.1');
4262     $foster_parent_element = $parent;
4263     $next_sibling = $self->{open_elements}->[$_]->[0];
4264     } else {
4265     !!!cp ('t65.2');
4266     $foster_parent_element
4267     = $self->{open_elements}->[$_ - 1]->[0];
4268     }
4269     last OE;
4270     }
4271     } # OE
4272     $foster_parent_element = $self->{open_elements}->[0]->[0]
4273     unless defined $foster_parent_element;
4274     $foster_parent_element->insert_before ($last_node->[0], $next_sibling);
4275     $open_tables->[-1]->[1] = 1; # tainted
4276     } else {
4277     !!!cp ('t65.3');
4278     $common_ancestor_node->[0]->append_child ($last_node->[0]);
4279     }
4280 wakaba 1.1
4281     ## Step 9
## Clone the formatting element, keeping its category flags.
4282     my $clone = [$formatting_element->[0]->clone_node (0),
4283     $formatting_element->[1]];
4284    
4285     ## Step 10
## Move the children of the furthest block into the clone.  The child
## list is copied into |@cn| before any child is moved.
4286     my @cn = @{$furthest_block->[0]->child_nodes};
4287     $clone->[0]->append_child ($_) for @cn;
4288    
4289     ## Step 11
4290     $furthest_block->[0]->append_child ($clone->[0]);
4291    
4292     ## Step 12
## Remove the formatting element from the list of active formatting
## elements and insert the clone after the bookmark entry.  |$i| is the
## index of the bookmark entry; it is decremented when the formatting
## element is removed after the bookmark has been seen, because that
## removal shifts the bookmark entry down by one.  |$i--| yields the
## value before the decrement, so |last| runs only if that value is
## true.
4293     my $i;
4294     AFE: for (reverse 0..$#$active_formatting_elements) {
4295     if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
4296 wakaba 1.79 !!!cp ('t66');
4297 wakaba 1.1 splice @$active_formatting_elements, $_, 1;
4298     $i-- and last AFE if defined $i;
4299     } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
4300 wakaba 1.79 !!!cp ('t67');
4301 wakaba 1.1 $i = $_;
4302     }
4303     } # AFE
## NOTE(review): If no entry matched |$bookmark_prev_el|, |$i| is still
## undefined here and |$i + 1| evaluates to 1 -- confirm this case
## cannot occur.
4304     splice @$active_formatting_elements, $i + 1, 0, $clone;
4305    
4306     ## Step 13
## Remove the formatting element from the stack of open elements and
## place the clone right after the furthest block, with the same index
## bookkeeping as in Step 12.
4307     undef $i;
4308 wakaba 1.3 OE: for (reverse 0..$#{$self->{open_elements}}) {
4309     if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
4310 wakaba 1.79 !!!cp ('t68');
4311 wakaba 1.3 splice @{$self->{open_elements}}, $_, 1;
4312 wakaba 1.1 $i-- and last OE if defined $i;
4313 wakaba 1.3 } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
4314 wakaba 1.79 !!!cp ('t69');
4315 wakaba 1.1 $i = $_;
4316     }
4317     } # OE
## NOTE(review): The length argument here is 1, whereas Step 12 uses 0.
## If an entry exists at index |$i + 1| it is replaced by the clone
## rather than shifted -- confirm this is intended.
4318 wakaba 1.3 splice @{$self->{open_elements}}, $i + 1, 1, $clone;
4319 wakaba 1.1
4320     ## Step 14
## Run the whole algorithm again for the same end tag token.
4321     redo FET;
4322     } # FET
4323     }; # $formatting_end_tag
4324    
## $insert_to_current->($child)
##
## Appends the node given as the first argument to the current node,
## i.e. the node of the last entry in the stack of open elements.  The
## same closure is also stored in |$insert|.
4325 wakaba 1.96 $insert = my $insert_to_current = sub {
4326 wakaba 1.25 $self->{open_elements}->[-1]->[0]->append_child ($_[0]);
4327 wakaba 1.1 }; # $insert_to_current
4328    
## $insert_to_foster->($child)
##
## Inserts |$child| with foster parenting when the current node is
## flagged |TABLE_ROWS_EL|; otherwise appends it to the current node.
##
## For foster parenting, the last entry in the stack of open elements
## flagged |TABLE_EL| is located.  If its node has a parent whose
## |node_type| is 1, |$child| is inserted into that parent before the
## table node.  Otherwise the foster parent is the node of the entry
## just before the table in the stack.  If no table entry is found, the
## node of the first entry in the stack is used.  In the latter two
## cases |$next_sibling| is undefined when |insert_before| is called
## (presumably this appends |$child| -- confirm against the DOM
## implementation in use).  The "tainted" flag of the current table
## entry in |$open_tables| is then set.
4329     my $insert_to_foster = sub {
4330 wakaba 1.95 my $child = shift;
4331 wakaba 1.123 if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
4332 wakaba 1.95 # MUST
4333     my $foster_parent_element;
4334     my $next_sibling;
## Search from the current node upward for the nearest table entry.
4335 wakaba 1.123 OE: for (reverse 0..$#{$self->{open_elements}}) {
4336     if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
4337 wakaba 1.3 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
4338 wakaba 1.1 if (defined $parent and $parent->node_type == 1) {
4339 wakaba 1.79 !!!cp ('t70');
4340 wakaba 1.1 $foster_parent_element = $parent;
4341 wakaba 1.3 $next_sibling = $self->{open_elements}->[$_]->[0];
4342 wakaba 1.1 } else {
4343 wakaba 1.79 !!!cp ('t71');
4344 wakaba 1.1 $foster_parent_element
4345 wakaba 1.3 = $self->{open_elements}->[$_ - 1]->[0];
4346 wakaba 1.1 }
4347     last OE;
4348     }
4349     } # OE
## Fall back to the first entry in the stack of open elements.
4350 wakaba 1.3 $foster_parent_element = $self->{open_elements}->[0]->[0]
4351 wakaba 1.1 unless defined $foster_parent_element;
4352     $foster_parent_element->insert_before
4353     ($child, $next_sibling);
4354 wakaba 1.95 $open_tables->[-1]->[1] = 1; # tainted
4355     } else {
4356     !!!cp ('t72');
4357     $self->{open_elements}->[-1]->[0]->append_child ($child);
4358     }
4359 wakaba 1.1 }; # $insert_to_foster
4360    
4361 wakaba 1.126 B: while (1) {
4362 wakaba 1.55 if ($token->{type} == DOCTYPE_TOKEN) {
4363 wakaba 1.79 !!!cp ('t73');
4364 wakaba 1.153 !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
4365 wakaba 1.52 ## Ignore the token
4366     ## Stay in the phase
4367     !!!next-token;
4368 wakaba 1.126 next B;
4369 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN and
4370 wakaba 1.52 $token->{tag_name} eq 'html') {
4371 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4372 wakaba 1.79 !!!cp ('t79');
4373 wakaba 1.153 !!!parse-error (type => 'after html', text => 'html', token => $token);
4374 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
4375     } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4376 wakaba 1.79 !!!cp ('t80');
4377 wakaba 1.153 !!!parse-error (type => 'after html', text => 'html', token => $token);
4378 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4379 wakaba 1.79 } else {
4380     !!!cp ('t81');
4381 wakaba 1.52 }
4382    
4383 wakaba 1.84 !!!cp ('t82');
4384 wakaba 1.113 !!!parse-error (type => 'not first start tag', token => $token);
4385 wakaba 1.52 my $top_el = $self->{open_elements}->[0]->[0];
4386     for my $attr_name (keys %{$token->{attributes}}) {
4387     unless ($top_el->has_attribute_ns (undef, $attr_name)) {
4388 wakaba 1.79 !!!cp ('t84');
4389 wakaba 1.52 $top_el->set_attribute_ns
4390     (undef, [undef, $attr_name],
4391     $token->{attributes}->{$attr_name}->{value});
4392     }
4393     }
4394 wakaba 1.125 !!!nack ('t84.1');
4395 wakaba 1.52 !!!next-token;
4396 wakaba 1.126 next B;
4397 wakaba 1.55 } elsif ($token->{type} == COMMENT_TOKEN) {
4398 wakaba 1.52 my $comment = $self->{document}->create_comment ($token->{data});
4399 wakaba 1.56 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
4400 wakaba 1.79 !!!cp ('t85');
4401 wakaba 1.52 $self->{document}->append_child ($comment);
4402 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
4403 wakaba 1.79 !!!cp ('t86');
4404 wakaba 1.52 $self->{open_elements}->[0]->[0]->append_child ($comment);
4405     } else {
4406 wakaba 1.79 !!!cp ('t87');
4407 wakaba 1.52 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4408     }
4409     !!!next-token;
4410 wakaba 1.126 next B;
4411     } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
4412     if ($token->{type} == CHARACTER_TOKEN) {
4413     !!!cp ('t87.1');
4414     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4415     !!!next-token;
4416     next B;
4417     } elsif ($token->{type} == START_TAG_TOKEN) {
4418 wakaba 1.129 if ((not {mglyph => 1, malignmark => 1}->{$token->{tag_name}} and
4419     $self->{open_elements}->[-1]->[1] & FOREIGN_FLOW_CONTENT_EL) or
4420 wakaba 1.126 not ($self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
4421     ($token->{tag_name} eq 'svg' and
4422     $self->{open_elements}->[-1]->[1] & MML_AXML_EL)) {
4423     ## NOTE: "using the rules for secondary insertion mode" then "continue"
4424     !!!cp ('t87.2');
4425     #
4426     } elsif ({
4427 wakaba 1.130 b => 1, big => 1, blockquote => 1, body => 1, br => 1,
4428 wakaba 1.146 center => 1, code => 1, dd => 1, div => 1, dl => 1, dt => 1,
4429     em => 1, embed => 1, font => 1, h1 => 1, h2 => 1, h3 => 1,
4430     h4 => 1, h5 => 1, h6 => 1, head => 1, hr => 1, i => 1,
4431     img => 1, li => 1, listing => 1, menu => 1, meta => 1,
4432     nobr => 1, ol => 1, p => 1, pre => 1, ruby => 1, s => 1,
4433     small => 1, span => 1, strong => 1, strike => 1, sub => 1,
4434     sup => 1, table => 1, tt => 1, u => 1, ul => 1, var => 1,
4435 wakaba 1.126 }->{$token->{tag_name}}) {
4436     !!!cp ('t87.2');
4437     !!!parse-error (type => 'not closed',
4438 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
4439 wakaba 1.126 ->manakai_local_name,
4440     token => $token);
4441    
4442     pop @{$self->{open_elements}}
4443     while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4444    
4445 wakaba 1.130 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4446 wakaba 1.126 ## Reprocess.
4447     next B;
4448     } else {
4449 wakaba 1.131 my $nsuri = $self->{open_elements}->[-1]->[0]->namespace_uri;
4450     my $tag_name = $token->{tag_name};
4451     if ($nsuri eq $SVG_NS) {
4452     $tag_name = {
4453     altglyph => 'altGlyph',
4454     altglyphdef => 'altGlyphDef',
4455     altglyphitem => 'altGlyphItem',
4456     animatecolor => 'animateColor',
4457     animatemotion => 'animateMotion',
4458     animatetransform => 'animateTransform',
4459     clippath => 'clipPath',
4460     feblend => 'feBlend',
4461     fecolormatrix => 'feColorMatrix',
4462     fecomponenttransfer => 'feComponentTransfer',
4463     fecomposite => 'feComposite',
4464     feconvolvematrix => 'feConvolveMatrix',
4465     fediffuselighting => 'feDiffuseLighting',
4466     fedisplacementmap => 'feDisplacementMap',
4467     fedistantlight => 'feDistantLight',
4468     feflood => 'feFlood',
4469     fefunca => 'feFuncA',
4470     fefuncb => 'feFuncB',
4471     fefuncg => 'feFuncG',
4472     fefuncr => 'feFuncR',
4473     fegaussianblur => 'feGaussianBlur',
4474     feimage => 'feImage',
4475     femerge => 'feMerge',
4476     femergenode => 'feMergeNode',
4477     femorphology => 'feMorphology',
4478     feoffset => 'feOffset',
4479     fepointlight => 'fePointLight',
4480     fespecularlighting => 'feSpecularLighting',
4481     fespotlight => 'feSpotLight',
4482     fetile => 'feTile',
4483     feturbulence => 'feTurbulence',
4484     foreignobject => 'foreignObject',
4485     glyphref => 'glyphRef',
4486     lineargradient => 'linearGradient',
4487     radialgradient => 'radialGradient',
4488     #solidcolor => 'solidColor', ## NOTE: Commented in spec (SVG1.2)
4489     textpath => 'textPath',
4490     }->{$tag_name} || $tag_name;
4491     }
4492    
4493     ## "adjust SVG attributes" (SVG only) - done in insert-element-f
4494    
4495     ## "adjust foreign attributes" - done in insert-element-f
4496 wakaba 1.126
4497 wakaba 1.131 !!!insert-element-f ($nsuri, $tag_name, $token->{attributes}, $token);
4498 wakaba 1.126
4499     if ($self->{self_closing}) {
4500     pop @{$self->{open_elements}};
4501     !!!ack ('t87.3');
4502     } else {
4503     !!!cp ('t87.4');
4504     }
4505    
4506     !!!next-token;
4507     next B;
4508     }
4509     } elsif ($token->{type} == END_TAG_TOKEN) {
4510     ## NOTE: "using the rules for secondary insertion mode" then "continue"
4511     !!!cp ('t87.5');
4512     #
4513     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4514     !!!cp ('t87.6');
4515 wakaba 1.146 !!!parse-error (type => 'not closed',
4516 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
4517 wakaba 1.146 ->manakai_local_name,
4518     token => $token);
4519    
4520     pop @{$self->{open_elements}}
4521     while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4522    
4523     $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4524     ## Reprocess.
4525     next B;
4526 wakaba 1.126 } else {
4527     die "$0: $token->{type}: Unknown token type";
4528     }
4529     }
4530    
4531     if ($self->{insertion_mode} & HEAD_IMS) {
4532 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
4533 wakaba 1.188 if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
4534 wakaba 1.99 unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4535     !!!cp ('t88.2');
4536     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4537 wakaba 1.177 #
4538 wakaba 1.99 } else {
4539     !!!cp ('t88.1');
4540     ## Ignore the token.
4541 wakaba 1.177 #
4542 wakaba 1.99 }
4543 wakaba 1.52 unless (length $token->{data}) {
4544 wakaba 1.79 !!!cp ('t88');
4545 wakaba 1.52 !!!next-token;
4546 wakaba 1.126 next B;
4547 wakaba 1.1 }
4548 wakaba 1.177 ## TODO: set $token->{column} appropriately
4549 wakaba 1.1 }
4550 wakaba 1.52
4551 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4552 wakaba 1.79 !!!cp ('t89');
4553 wakaba 1.52 ## As if <head>
4554 wakaba 1.126 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4555 wakaba 1.52 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4556 wakaba 1.123 push @{$self->{open_elements}},
4557     [$self->{head_element}, $el_category->{head}];
4558 wakaba 1.52
4559     ## Reprocess in the "in head" insertion mode...
4560     pop @{$self->{open_elements}};
4561    
4562     ## Reprocess in the "after head" insertion mode...
4563 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4564 wakaba 1.79 !!!cp ('t90');
4565 wakaba 1.52 ## As if </noscript>
4566     pop @{$self->{open_elements}};
4567 wakaba 1.153 !!!parse-error (type => 'in noscript:#text', token => $token);
4568 wakaba 1.1
4569 wakaba 1.52 ## Reprocess in the "in head" insertion mode...
4570     ## As if </head>
4571     pop @{$self->{open_elements}};
4572    
4573     ## Reprocess in the "after head" insertion mode...
4574 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4575 wakaba 1.79 !!!cp ('t91');
4576 wakaba 1.52 pop @{$self->{open_elements}};
4577    
4578     ## Reprocess in the "after head" insertion mode...
4579 wakaba 1.79 } else {
4580     !!!cp ('t92');
4581 wakaba 1.1 }
4582 wakaba 1.52
4583 wakaba 1.123 ## "after head" insertion mode
4584     ## As if <body>
4585     !!!insert-element ('body',, $token);
4586     $self->{insertion_mode} = IN_BODY_IM;
4587     ## reprocess
4588 wakaba 1.126 next B;
4589 wakaba 1.123 } elsif ($token->{type} == START_TAG_TOKEN) {
4590     if ($token->{tag_name} eq 'head') {
4591     if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4592     !!!cp ('t93');
4593 wakaba 1.126 !!!create-element ($self->{head_element}, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
4594 wakaba 1.123 $self->{open_elements}->[-1]->[0]->append_child
4595     ($self->{head_element});
4596     push @{$self->{open_elements}},
4597     [$self->{head_element}, $el_category->{head}];
4598     $self->{insertion_mode} = IN_HEAD_IM;
4599 wakaba 1.125 !!!nack ('t93.1');
4600 wakaba 1.123 !!!next-token;
4601 wakaba 1.126 next B;
4602 wakaba 1.125 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4603 wakaba 1.139 !!!cp ('t93.2');
4604 wakaba 1.153 !!!parse-error (type => 'after head', text => 'head',
4605     token => $token);
4606 wakaba 1.139 ## Ignore the token
4607     !!!nack ('t93.3');
4608     !!!next-token;
4609     next B;
4610 wakaba 1.125 } else {
4611     !!!cp ('t95');
4612 wakaba 1.153 !!!parse-error (type => 'in head:head',
4613     token => $token); # or in head noscript
4614 wakaba 1.125 ## Ignore the token
4615     !!!nack ('t95.1');
4616     !!!next-token;
4617 wakaba 1.126 next B;
4618 wakaba 1.125 }
4619     } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4620 wakaba 1.126 !!!cp ('t96');
4621     ## As if <head>
4622     !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4623     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4624     push @{$self->{open_elements}},
4625     [$self->{head_element}, $el_category->{head}];
4626 wakaba 1.52
4627 wakaba 1.126 $self->{insertion_mode} = IN_HEAD_IM;
4628     ## Reprocess in the "in head" insertion mode...
4629     } else {
4630     !!!cp ('t97');
4631     }
4632 wakaba 1.52
4633 wakaba 1.49 if ($token->{tag_name} eq 'base') {
4634 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4635 wakaba 1.79 !!!cp ('t98');
4636 wakaba 1.49 ## As if </noscript>
4637     pop @{$self->{open_elements}};
4638 wakaba 1.153 !!!parse-error (type => 'in noscript', text => 'base',
4639     token => $token);
4640 wakaba 1.49
4641 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
4642 wakaba 1.49 ## Reprocess in the "in head" insertion mode...
4643 wakaba 1.79 } else {
4644     !!!cp ('t99');
4645 wakaba 1.49 }
4646    
4647     ## NOTE: There is a "as if in head" code clone.
4648 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4649 wakaba 1.79 !!!cp ('t100');
4650 wakaba 1.153 !!!parse-error (type => 'after head',
4651     text => $token->{tag_name}, token => $token);
4652 wakaba 1.123 push @{$self->{open_elements}},
4653     [$self->{head_element}, $el_category->{head}];
4654 wakaba 1.79 } else {
4655     !!!cp ('t101');
4656 wakaba 1.49 }
4657 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4658 wakaba 1.49 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4659 wakaba 1.100 pop @{$self->{open_elements}} # <head>
4660 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
4661 wakaba 1.125 !!!nack ('t101.1');
4662 wakaba 1.49 !!!next-token;
4663 wakaba 1.126 next B;
4664 wakaba 1.49 } elsif ($token->{tag_name} eq 'link') {
4665 wakaba 1.25 ## NOTE: There is a "as if in head" code clone.
4666 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4667 wakaba 1.79 !!!cp ('t102');
4668 wakaba 1.153 !!!parse-error (type => 'after head',
4669     text => $token->{tag_name}, token => $token);
4670 wakaba 1.123 push @{$self->{open_elements}},
4671     [$self->{head_element}, $el_category->{head}];
4672 wakaba 1.79 } else {
4673     !!!cp ('t103');
4674 wakaba 1.25 }
4675 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4676 wakaba 1.25 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4677 wakaba 1.100 pop @{$self->{open_elements}} # <head>
4678 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
4679 wakaba 1.125 !!!ack ('t103.1');
4680 wakaba 1.1 !!!next-token;
4681 wakaba 1.126 next B;
4682 wakaba 1.34 } elsif ($token->{tag_name} eq 'meta') {
4683     ## NOTE: There is a "as if in head" code clone.
4684 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4685 wakaba 1.79 !!!cp ('t104');
4686 wakaba 1.153 !!!parse-error (type => 'after head',
4687     text => $token->{tag_name}, token => $token);
4688 wakaba 1.123 push @{$self->{open_elements}},
4689     [$self->{head_element}, $el_category->{head}];
4690 wakaba 1.79 } else {
4691     !!!cp ('t105');
4692 wakaba 1.34 }
4693 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4694 wakaba 1.66 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4695 wakaba 1.34
4696     unless ($self->{confident}) {
4697 wakaba 1.134 if ($token->{attributes}->{charset}) {
4698 wakaba 1.79 !!!cp ('t106');
4699 wakaba 1.134 ## NOTE: Whether the encoding is supported or not is handled
4700     ## in the {change_encoding} callback.
4701 wakaba 1.63 $self->{change_encoding}
4702 wakaba 1.114 ->($self, $token->{attributes}->{charset}->{value},
4703     $token);
4704 wakaba 1.66
4705     $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4706     ->set_user_data (manakai_has_reference =>
4707     $token->{attributes}->{charset}
4708     ->{has_reference});
4709 wakaba 1.63 } elsif ($token->{attributes}->{content}) {
4710     if ($token->{attributes}->{content}->{value}
4711 wakaba 1.144 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4712 wakaba 1.186 [\x09\x0A\x0C\x0D\x20]*=
4713     [\x09\x0A\x0C\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4714     ([^"'\x09\x0A\x0C\x0D\x20]
4715     [^\x09\x0A\x0C\x0D\x20\x3B]*))/x) {
4716 wakaba 1.79 !!!cp ('t107');
4717 wakaba 1.134 ## NOTE: Whether the encoding is supported or not is handled
4718     ## in the {change_encoding} callback.
4719 wakaba 1.63 $self->{change_encoding}
4720 wakaba 1.114 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
4721     $token);
4722 wakaba 1.68 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4723     ->set_user_data (manakai_has_reference =>
4724     $token->{attributes}->{content}
4725     ->{has_reference});
4726 wakaba 1.79 } else {
4727     !!!cp ('t108');
4728 wakaba 1.63 }
4729 wakaba 1.34 }
4730 wakaba 1.66 } else {
4731     if ($token->{attributes}->{charset}) {
4732 wakaba 1.79 !!!cp ('t109');
4733 wakaba 1.66 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4734     ->set_user_data (manakai_has_reference =>
4735     $token->{attributes}->{charset}
4736     ->{has_reference});
4737     }
4738 wakaba 1.68 if ($token->{attributes}->{content}) {
4739 wakaba 1.79 !!!cp ('t110');
4740 wakaba 1.68 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4741     ->set_user_data (manakai_has_reference =>
4742     $token->{attributes}->{content}
4743     ->{has_reference});
4744     }
4745 wakaba 1.34 }
4746    
4747 wakaba 1.100 pop @{$self->{open_elements}} # <head>
4748 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
4749 wakaba 1.125 !!!ack ('t110.1');
4750 wakaba 1.34 !!!next-token;
4751 wakaba 1.126 next B;
4752 wakaba 1.49 } elsif ($token->{tag_name} eq 'title') {
4753 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4754 wakaba 1.79 !!!cp ('t111');
4755 wakaba 1.49 ## As if </noscript>
4756     pop @{$self->{open_elements}};
4757 wakaba 1.153 !!!parse-error (type => 'in noscript', text => 'title',
4758     token => $token);
4759 wakaba 1.49
4760 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
4761 wakaba 1.49 ## Reprocess in the "in head" insertion mode...
4762 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4763 wakaba 1.79 !!!cp ('t112');
4764 wakaba 1.153 !!!parse-error (type => 'after head',
4765     text => $token->{tag_name}, token => $token);
4766 wakaba 1.123 push @{$self->{open_elements}},
4767     [$self->{head_element}, $el_category->{head}];
4768 wakaba 1.79 } else {
4769     !!!cp ('t113');
4770 wakaba 1.25 }
4771 wakaba 1.49
4772     ## NOTE: There is a "as if in head" code clone.
4773 wakaba 1.31 my $parent = defined $self->{head_element} ? $self->{head_element}
4774     : $self->{open_elements}->[-1]->[0];
4775 wakaba 1.96 $parse_rcdata->(RCDATA_CONTENT_MODEL);
4776 wakaba 1.100 pop @{$self->{open_elements}} # <head>
4777 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
4778 wakaba 1.126 next B;
4779 wakaba 1.148 } elsif ($token->{tag_name} eq 'style' or
4780     $token->{tag_name} eq 'noframes') {
4781 wakaba 1.25 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
4782 wakaba 1.54 ## insertion mode IN_HEAD_IM)
4783 wakaba 1.25 ## NOTE: There is a "as if in head" code clone.
4784 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4785 wakaba 1.79 !!!cp ('t114');
4786 wakaba 1.153 !!!parse-error (type => 'after head',
4787     text => $token->{tag_name}, token => $token);
4788 wakaba 1.123 push @{$self->{open_elements}},
4789     [$self->{head_element}, $el_category->{head}];
4790 wakaba 1.79 } else {
4791     !!!cp ('t115');
4792 wakaba 1.25 }
4793 wakaba 1.96 $parse_rcdata->(CDATA_CONTENT_MODEL);
4794 wakaba 1.100 pop @{$self->{open_elements}} # <head>
4795 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
4796 wakaba 1.126 next B;
4797 wakaba 1.25 } elsif ($token->{tag_name} eq 'noscript') {
4798 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_IM) {
4799 wakaba 1.79 !!!cp ('t116');
4800 wakaba 1.25 ## NOTE: and scripting is disabled
4801 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4802 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
4803 wakaba 1.125 !!!nack ('t116.1');
4804 wakaba 1.1 !!!next-token;
4805 wakaba 1.126 next B;
4806 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4807 wakaba 1.79 !!!cp ('t117');
4808 wakaba 1.153 !!!parse-error (type => 'in noscript', text => 'noscript',
4809     token => $token);
4810 wakaba 1.1 ## Ignore the token
4811 wakaba 1.125 !!!nack ('t117.1');
4812 wakaba 1.41 !!!next-token;
4813 wakaba 1.126 next B;
4814 wakaba 1.1 } else {
4815 wakaba 1.79 !!!cp ('t118');
4816 wakaba 1.25 #
4817 wakaba 1.1 }
4818 wakaba 1.49 } elsif ($token->{tag_name} eq 'script') {
4819 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4820 wakaba 1.79 !!!cp ('t119');
4821 wakaba 1.49 ## As if </noscript>
4822     pop @{$self->{open_elements}};
4823 wakaba 1.153 !!!parse-error (type => 'in noscript', text => 'script',
4824     token => $token);
4825 wakaba 1.49
4826 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
4827 wakaba 1.49 ## Reprocess in the "in head" insertion mode...
4828 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4829 wakaba 1.79 !!!cp ('t120');
4830 wakaba 1.153 !!!parse-error (type => 'after head',
4831     text => $token->{tag_name}, token => $token);
4832 wakaba 1.123 push @{$self->{open_elements}},
4833     [$self->{head_element}, $el_category->{head}];
4834 wakaba 1.79 } else {
4835     !!!cp ('t121');
4836 wakaba 1.25 }
4837 wakaba 1.49
4838 wakaba 1.25 ## NOTE: There is a "as if in head" code clone.
4839 wakaba 1.100 $script_start_tag->();
4840     pop @{$self->{open_elements}} # <head>
4841 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
4842 wakaba 1.126 next B;
4843 wakaba 1.49 } elsif ($token->{tag_name} eq 'body' or
4844 wakaba 1.25 $token->{tag_name} eq 'frameset') {
4845 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4846 wakaba 1.79 !!!cp ('t122');
4847 wakaba 1.49 ## As if </noscript>
4848     pop @{$self->{open_elements}};
4849 wakaba 1.153 !!!parse-error (type => 'in noscript',
4850     text => $token->{tag_name}, token => $token);
4851 wakaba 1.49
4852     ## Reprocess in the "in head" insertion mode...
4853     ## As if </head>
4854     pop @{$self->{open_elements}};
4855    
4856     ## Reprocess in the "after head" insertion mode...
4857 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4858 wakaba 1.79 !!!cp ('t124');
4859 wakaba 1.49 pop @{$self->{open_elements}};
4860    
4861     ## Reprocess in the "after head" insertion mode...
4862 wakaba 1.79 } else {
4863     !!!cp ('t125');
4864 wakaba 1.49 }
4865    
4866     ## "after head" insertion mode
4867 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4868 wakaba 1.54 if ($token->{tag_name} eq 'body') {
4869 wakaba 1.79 !!!cp ('t126');
4870 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
4871     } elsif ($token->{tag_name} eq 'frameset') {
4872 wakaba 1.79 !!!cp ('t127');
4873 wakaba 1.54 $self->{insertion_mode} = IN_FRAMESET_IM;
4874     } else {
4875     die "$0: tag name: $self->{tag_name}";
4876     }
4877 wakaba 1.125 !!!nack ('t127.1');
4878 wakaba 1.1 !!!next-token;
4879 wakaba 1.126 next B;
4880 wakaba 1.1 } else {
4881 wakaba 1.79 !!!cp ('t128');
4882 wakaba 1.1 #
4883     }
4884 wakaba 1.49
4885 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4886 wakaba 1.79 !!!cp ('t129');
4887 wakaba 1.49 ## As if </noscript>
4888     pop @{$self->{open_elements}};
4889 wakaba 1.153 !!!parse-error (type => 'in noscript:/',
4890     text => $token->{tag_name}, token => $token);
4891 wakaba 1.49
4892     ## Reprocess in the "in head" insertion mode...
4893     ## As if </head>
4894 wakaba 1.25 pop @{$self->{open_elements}};
4895 wakaba 1.49
4896     ## Reprocess in the "after head" insertion mode...
4897 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4898 wakaba 1.79 !!!cp ('t130');
4899 wakaba 1.49 ## As if </head>
4900 wakaba 1.25 pop @{$self->{open_elements}};
4901 wakaba 1.49
4902     ## Reprocess in the "after head" insertion mode...
4903 wakaba 1.79 } else {
4904     !!!cp ('t131');
4905 wakaba 1.49 }
4906    
4907     ## "after head" insertion mode
4908     ## As if <body>
4909 wakaba 1.116 !!!insert-element ('body',, $token);
4910 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
4911 wakaba 1.49 ## reprocess
4912 wakaba 1.125 !!!ack-later;
4913 wakaba 1.126 next B;
4914 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
4915 wakaba 1.49 if ($token->{tag_name} eq 'head') {
4916 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4917 wakaba 1.79 !!!cp ('t132');
4918 wakaba 1.50 ## As if <head>
4919 wakaba 1.126 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4920 wakaba 1.50 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4921 wakaba 1.123 push @{$self->{open_elements}},
4922     [$self->{head_element}, $el_category->{head}];
4923 wakaba 1.50
4924     ## Reprocess in the "in head" insertion mode...
4925     pop @{$self->{open_elements}};
4926 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
4927 wakaba 1.50 !!!next-token;
4928 wakaba 1.126 next B;
4929 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4930 wakaba 1.79 !!!cp ('t133');
4931 wakaba 1.49 ## As if </noscript>
4932     pop @{$self->{open_elements}};
4933 wakaba 1.153 !!!parse-error (type => 'in noscript:/',
4934     text => 'head', token => $token);
4935 wakaba 1.49
4936     ## Reprocess in the "in head" insertion mode...
4937 wakaba 1.50 pop @{$self->{open_elements}};
4938 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
4939 wakaba 1.50 !!!next-token;
4940 wakaba 1.126 next B;
4941 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4942 wakaba 1.79 !!!cp ('t134');
4943 wakaba 1.49 pop @{$self->{open_elements}};
4944 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
4945 wakaba 1.49 !!!next-token;
4946 wakaba 1.126 next B;
4947 wakaba 1.139 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4948     !!!cp ('t134.1');
4949 wakaba 1.153 !!!parse-error (type => 'unmatched end tag', text => 'head',
4950     token => $token);
4951 wakaba 1.139 ## Ignore the token
4952     !!!next-token;
4953     next B;
4954 wakaba 1.49 } else {
4955 wakaba 1.139 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4956 wakaba 1.49 }
4957     } elsif ($token->{tag_name} eq 'noscript') {
4958 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4959 wakaba 1.79 !!!cp ('t136');
4960 wakaba 1.49 pop @{$self->{open_elements}};
4961 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
4962 wakaba 1.49 !!!next-token;
4963 wakaba 1.126 next B;
4964 wakaba 1.139 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM or
4965     $self->{insertion_mode} == AFTER_HEAD_IM) {
4966 wakaba 1.79 !!!cp ('t137');
4967 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
4968     text => 'noscript', token => $token);
4969 wakaba 1.50 ## Ignore the token ## ISSUE: An issue in the spec.
4970     !!!next-token;
4971 wakaba 1.126 next B;
4972 wakaba 1.49 } else {
4973 wakaba 1.79 !!!cp ('t138');
4974 wakaba 1.49 #
4975     }
4976     } elsif ({
4977 wakaba 1.31 body => 1, html => 1,
4978     }->{$token->{tag_name}}) {
4979 wakaba 1.139 if ($self->{insertion_mode} == BEFORE_HEAD_IM or
4980     $self->{insertion_mode} == IN_HEAD_IM or
4981     $self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4982 wakaba 1.79 !!!cp ('t140');
4983 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
4984     text => $token->{tag_name}, token => $token);
4985 wakaba 1.49 ## Ignore the token
4986     !!!next-token;
4987 wakaba 1.126 next B;
4988 wakaba 1.139 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4989     !!!cp ('t140.1');
4990 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
4991     text => $token->{tag_name}, token => $token);
4992 wakaba 1.139 ## Ignore the token
4993     !!!next-token;
4994     next B;
4995 wakaba 1.79 } else {
4996 wakaba 1.139 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4997 wakaba 1.49 }
4998 wakaba 1.139 } elsif ($token->{tag_name} eq 'p') {
4999     !!!cp ('t142');
5000 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5001     text => $token->{tag_name}, token => $token);
5002 wakaba 1.139 ## Ignore the token
5003     !!!next-token;
5004     next B;
5005     } elsif ($token->{tag_name} eq 'br') {
5006 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
5007 wakaba 1.139 !!!cp ('t142.2');
5008     ## (before head) as if <head>, (in head) as if </head>
5009 wakaba 1.126 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
5010 wakaba 1.50 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
5011 wakaba 1.139 $self->{insertion_mode} = AFTER_HEAD_IM;
5012    
5013     ## Reprocess in the "after head" insertion mode...
5014     } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
5015     !!!cp ('t143.2');
5016     ## As if </head>
5017     pop @{$self->{open_elements}};
5018     $self->{insertion_mode} = AFTER_HEAD_IM;
5019    
5020     ## Reprocess in the "after head" insertion mode...
5021     } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
5022     !!!cp ('t143.3');
5023     ## ISSUE: Two parse errors for <head><noscript></br>
5024 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5025     text => 'br', token => $token);
5026 wakaba 1.139 ## As if </noscript>
5027     pop @{$self->{open_elements}};
5028     $self->{insertion_mode} = IN_HEAD_IM;
5029 wakaba 1.50
5030     ## Reprocess in the "in head" insertion mode...
5031 wakaba 1.139 ## As if </head>
5032     pop @{$self->{open_elements}};
5033     $self->{insertion_mode} = AFTER_HEAD_IM;
5034    
5035     ## Reprocess in the "after head" insertion mode...
5036     } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
5037     !!!cp ('t143.4');
5038     #
5039 wakaba 1.79 } else {
5040 wakaba 1.139 die "$0: $self->{insertion_mode}: Unknown insertion mode";
5041 wakaba 1.50 }
5042    
5043 wakaba 1.139 ## ISSUE: does not agree with IE7 - it doesn't ignore </br>.
5044 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5045     text => 'br', token => $token);
5046 wakaba 1.139 ## Ignore the token
5047     !!!next-token;
5048     next B;
5049 wakaba 1.25 } else {
5050 wakaba 1.139 !!!cp ('t145');
5051 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5052     text => $token->{tag_name}, token => $token);
5053 wakaba 1.139 ## Ignore the token
5054     !!!next-token;
5055     next B;
5056 wakaba 1.49 }
5057    
5058 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
5059 wakaba 1.79 !!!cp ('t146');
5060 wakaba 1.49 ## As if </noscript>
5061     pop @{$self->{open_elements}};
5062 wakaba 1.153 !!!parse-error (type => 'in noscript:/',
5063     text => $token->{tag_name}, token => $token);
5064 wakaba 1.49
5065     ## Reprocess in the "in head" insertion mode...
5066     ## As if </head>
5067     pop @{$self->{open_elements}};
5068    
5069     ## Reprocess in the "after head" insertion mode...
5070 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
5071 wakaba 1.79 !!!cp ('t147');
5072 wakaba 1.49 ## As if </head>
5073     pop @{$self->{open_elements}};
5074    
5075     ## Reprocess in the "after head" insertion mode...
5076 wakaba 1.54 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
5077 wakaba 1.82 ## ISSUE: This case cannot be reached?
5078 wakaba 1.79 !!!cp ('t148');
5079 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5080     text => $token->{tag_name}, token => $token);
5081 wakaba 1.50 ## Ignore the token ## ISSUE: An issue in the spec.
5082     !!!next-token;
5083 wakaba 1.126 next B;
5084 wakaba 1.79 } else {
5085     !!!cp ('t149');
5086 wakaba 1.1 }
5087    
5088 wakaba 1.49 ## "after head" insertion mode
5089     ## As if <body>
5090 wakaba 1.116 !!!insert-element ('body',, $token);
5091 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
5092 wakaba 1.52 ## reprocess
5093 wakaba 1.126 next B;
5094 wakaba 1.104 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5095     if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
5096     !!!cp ('t149.1');
5097    
5098     ## NOTE: As if <head>
5099 wakaba 1.126 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
5100 wakaba 1.104 $self->{open_elements}->[-1]->[0]->append_child
5101     ($self->{head_element});
5102 wakaba 1.123 #push @{$self->{open_elements}},
5103     # [$self->{head_element}, $el_category->{head}];
5104 wakaba 1.104 #$self->{insertion_mode} = IN_HEAD_IM;
5105     ## NOTE: Reprocess.
5106    
5107     ## NOTE: As if </head>
5108     #pop @{$self->{open_elements}};
5109     #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
5110     ## NOTE: Reprocess.
5111    
5112     #
5113     } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
5114     !!!cp ('t149.2');
5115    
5116     ## NOTE: As if </head>
5117     pop @{$self->{open_elements}};
5118     #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
5119     ## NOTE: Reprocess.
5120    
5121     #
5122     } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
5123     !!!cp ('t149.3');
5124    
5125 wakaba 1.113 !!!parse-error (type => 'in noscript:#eof', token => $token);
5126 wakaba 1.104
5127     ## As if </noscript>
5128     pop @{$self->{open_elements}};
5129     #$self->{insertion_mode} = IN_HEAD_IM;
5130     ## NOTE: Reprocess.
5131    
5132     ## NOTE: As if </head>
5133     pop @{$self->{open_elements}};
5134     #$self->{insertion_mode} = IN_AFTER_HEAD_IM;
5135     ## NOTE: Reprocess.
5136    
5137     #
5138     } else {
5139     !!!cp ('t149.4');
5140     #
5141     }
5142    
5143     ## NOTE: As if <body>
5144 wakaba 1.116 !!!insert-element ('body',, $token);
5145 wakaba 1.104 $self->{insertion_mode} = IN_BODY_IM;
5146     ## NOTE: Reprocess.
5147 wakaba 1.126 next B;
5148 wakaba 1.104 } else {
5149     die "$0: $token->{type}: Unknown token type";
5150     }
5151 wakaba 1.52
5152     ## ISSUE: An issue in the spec.
5153 wakaba 1.56 } elsif ($self->{insertion_mode} & BODY_IMS) {
5154 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
5155 wakaba 1.79 !!!cp ('t150');
5156 wakaba 1.52 ## NOTE: There is a code clone of "character in body".
5157     $reconstruct_active_formatting_elements->($insert_to_current);
5158    
5159     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5160    
5161     !!!next-token;
5162 wakaba 1.126 next B;
5163 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
5164 wakaba 1.52 if ({
5165     caption => 1, col => 1, colgroup => 1, tbody => 1,
5166     td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
5167     }->{$token->{tag_name}}) {
5168 wakaba 1.54 if ($self->{insertion_mode} == IN_CELL_IM) {
5169 wakaba 1.52 ## have an element in table scope
5170 wakaba 1.108 for (reverse 0..$#{$self->{open_elements}}) {
5171 wakaba 1.52 my $node = $self->{open_elements}->[$_];
5172 wakaba 1.123 if ($node->[1] & TABLE_CELL_EL) {
5173 wakaba 1.79 !!!cp ('t151');
5174 wakaba 1.108
5175     ## Close the cell
5176 wakaba 1.125 !!!back-token; # <x>
5177 wakaba 1.122 $token = {type => END_TAG_TOKEN,
5178     tag_name => $node->[0]->manakai_local_name,
5179 wakaba 1.114 line => $token->{line},
5180     column => $token->{column}};
5181 wakaba 1.126 next B;
5182 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5183 wakaba 1.79 !!!cp ('t152');
5184 wakaba 1.108 ## ISSUE: This case can never be reached, maybe.
5185     last;
5186 wakaba 1.52 }
5187 wakaba 1.108 }
5188    
5189     !!!cp ('t153');
5190     !!!parse-error (type => 'start tag not allowed',
5191 wakaba 1.153 text => $token->{tag_name}, token => $token);
5192 wakaba 1.108 ## Ignore the token
5193 wakaba 1.125 !!!nack ('t153.1');
5194 wakaba 1.108 !!!next-token;
5195 wakaba 1.126 next B;
5196 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
5197 wakaba 1.153 !!!parse-error (type => 'not closed', text => 'caption',
5198     token => $token);
5199 wakaba 1.52
5200 wakaba 1.108 ## NOTE: As if </caption>.
5201 wakaba 1.52 ## have a table element in table scope
5202     my $i;
5203 wakaba 1.108 INSCOPE: {
5204     for (reverse 0..$#{$self->{open_elements}}) {
5205     my $node = $self->{open_elements}->[$_];
5206 wakaba 1.123 if ($node->[1] & CAPTION_EL) {
5207 wakaba 1.108 !!!cp ('t155');
5208     $i = $_;
5209     last INSCOPE;
5210 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5211 wakaba 1.108 !!!cp ('t156');
5212     last;
5213     }
5214 wakaba 1.52 }
5215 wakaba 1.108
5216     !!!cp ('t157');
5217     !!!parse-error (type => 'start tag not allowed',
5218 wakaba 1.153 text => $token->{tag_name}, token => $token);
5219 wakaba 1.108 ## Ignore the token
5220 wakaba 1.125 !!!nack ('t157.1');
5221 wakaba 1.108 !!!next-token;
5222 wakaba 1.126 next B;
5223 wakaba 1.52 } # INSCOPE
5224    
5225     ## generate implied end tags
5226 wakaba 1.123 while ($self->{open_elements}->[-1]->[1]
5227     & END_TAG_OPTIONAL_EL) {
5228 wakaba 1.79 !!!cp ('t158');
5229 wakaba 1.86 pop @{$self->{open_elements}};
5230 wakaba 1.52 }
5231    
5232 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5233 wakaba 1.79 !!!cp ('t159');
5234 wakaba 1.122 !!!parse-error (type => 'not closed',
5235 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
5236 wakaba 1.122 ->manakai_local_name,
5237     token => $token);
5238 wakaba 1.79 } else {
5239     !!!cp ('t160');
5240 wakaba 1.52 }
5241    
5242     splice @{$self->{open_elements}}, $i;
5243    
5244     $clear_up_to_marker->();
5245    
5246 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
5247 wakaba 1.52
5248     ## reprocess
5249 wakaba 1.125 !!!ack-later;
5250 wakaba 1.126 next B;
5251 wakaba 1.52 } else {
5252 wakaba 1.79 !!!cp ('t161');
5253 wakaba 1.52 #
5254     }
5255     } else {
5256 wakaba 1.79 !!!cp ('t162');
5257 wakaba 1.52 #
5258     }
5259 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
5260 wakaba 1.52 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
5261 wakaba 1.54 if ($self->{insertion_mode} == IN_CELL_IM) {
5262 wakaba 1.43 ## have an element in table scope
5263 wakaba 1.52 my $i;
5264 wakaba 1.43 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5265     my $node = $self->{open_elements}->[$_];
5266 wakaba 1.123 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5267 wakaba 1.79 !!!cp ('t163');
5268 wakaba 1.52 $i = $_;
5269 wakaba 1.43 last INSCOPE;
5270 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5271 wakaba 1.79 !!!cp ('t164');
5272 wakaba 1.43 last INSCOPE;
5273     }
5274     } # INSCOPE
5275 wakaba 1.52 unless (defined $i) {
5276 wakaba 1.79 !!!cp ('t165');
5277 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5278     text => $token->{tag_name},
5279     token => $token);
5280 wakaba 1.43 ## Ignore the token
5281     !!!next-token;
5282 wakaba 1.126 next B;
5283 wakaba 1.43 }
5284    
5285 wakaba 1.52 ## generate implied end tags
5286 wakaba 1.123 while ($self->{open_elements}->[-1]->[1]
5287     & END_TAG_OPTIONAL_EL) {
5288 wakaba 1.79 !!!cp ('t166');
5289 wakaba 1.86 pop @{$self->{open_elements}};
5290 wakaba 1.52 }
5291 wakaba 1.86
5292 wakaba 1.123 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
5293     ne $token->{tag_name}) {
5294 wakaba 1.79 !!!cp ('t167');
5295 wakaba 1.122 !!!parse-error (type => 'not closed',
5296 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
5297 wakaba 1.122 ->manakai_local_name,
5298     token => $token);
5299 wakaba 1.79 } else {
5300     !!!cp ('t168');
5301 wakaba 1.52 }
5302    
5303     splice @{$self->{open_elements}}, $i;
5304    
5305     $clear_up_to_marker->();
5306    
5307 wakaba 1.54 $self->{insertion_mode} = IN_ROW_IM;
5308 wakaba 1.52
5309     !!!next-token;
5310 wakaba 1.126 next B;
5311 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
5312 wakaba 1.79 !!!cp ('t169');
5313 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5314     text => $token->{tag_name}, token => $token);
5315 wakaba 1.52 ## Ignore the token
5316     !!!next-token;
5317 wakaba 1.126 next B;
5318 wakaba 1.52 } else {
5319 wakaba 1.79 !!!cp ('t170');
5320 wakaba 1.52 #
5321     }
5322     } elsif ($token->{tag_name} eq 'caption') {
5323 wakaba 1.54 if ($self->{insertion_mode} == IN_CAPTION_IM) {
5324 wakaba 1.43 ## have a table element in table scope
5325     my $i;
5326 wakaba 1.108 INSCOPE: {
5327     for (reverse 0..$#{$self->{open_elements}}) {
5328     my $node = $self->{open_elements}->[$_];
5329 wakaba 1.123 if ($node->[1] & CAPTION_EL) {
5330 wakaba 1.108 !!!cp ('t171');
5331     $i = $_;
5332     last INSCOPE;
5333 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5334 wakaba 1.108 !!!cp ('t172');
5335     last;
5336     }
5337 wakaba 1.43 }
5338 wakaba 1.108
5339     !!!cp ('t173');
5340     !!!parse-error (type => 'unmatched end tag',
5341 wakaba 1.153 text => $token->{tag_name}, token => $token);
5342 wakaba 1.108 ## Ignore the token
5343     !!!next-token;
5344 wakaba 1.126 next B;
5345 wakaba 1.43 } # INSCOPE
5346    
5347     ## generate implied end tags
5348 wakaba 1.123 while ($self->{open_elements}->[-1]->[1]
5349     & END_TAG_OPTIONAL_EL) {
5350 wakaba 1.79 !!!cp ('t174');
5351 wakaba 1.86 pop @{$self->{open_elements}};
5352 wakaba 1.43 }
5353 wakaba 1.52
5354 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5355 wakaba 1.79 !!!cp ('t175');
5356 wakaba 1.122 !!!parse-error (type => 'not closed',
5357 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
5358 wakaba 1.122 ->manakai_local_name,
5359     token => $token);
5360 wakaba 1.79 } else {
5361     !!!cp ('t176');
5362 wakaba 1.52 }
5363    
5364     splice @{$self->{open_elements}}, $i;
5365    
5366     $clear_up_to_marker->();
5367    
5368 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
5369 wakaba 1.52
5370     !!!next-token;
5371 wakaba 1.126 next B;
5372 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
5373 wakaba 1.79 !!!cp ('t177');
5374 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5375     text => $token->{tag_name}, token => $token);
5376 wakaba 1.52 ## Ignore the token
5377     !!!next-token;
5378 wakaba 1.126 next B;
5379 wakaba 1.52 } else {
5380 wakaba 1.79 !!!cp ('t178');
5381 wakaba 1.52 #
5382     }
5383     } elsif ({
5384     table => 1, tbody => 1, tfoot => 1,
5385     thead => 1, tr => 1,
5386     }->{$token->{tag_name}} and
5387 wakaba 1.54 $self->{insertion_mode} == IN_CELL_IM) {
5388 wakaba 1.52 ## have an element in table scope
5389     my $i;
5390     my $tn;
5391 wakaba 1.108 INSCOPE: {
5392     for (reverse 0..$#{$self->{open_elements}}) {
5393     my $node = $self->{open_elements}->[$_];
5394 wakaba 1.123 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5395 wakaba 1.108 !!!cp ('t179');
5396     $i = $_;
5397    
5398     ## Close the cell
5399 wakaba 1.125 !!!back-token; # </x>
5400 wakaba 1.114 $token = {type => END_TAG_TOKEN, tag_name => $tn,
5401     line => $token->{line},
5402     column => $token->{column}};
5403 wakaba 1.126 next B;
5404 wakaba 1.123 } elsif ($node->[1] & TABLE_CELL_EL) {
5405 wakaba 1.108 !!!cp ('t180');
5406 wakaba 1.123 $tn = $node->[0]->manakai_local_name;
5407 wakaba 1.108 ## NOTE: There is exactly one |td| or |th| element
5408     ## in scope in the stack of open elements by definition.
5409 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5410 wakaba 1.108 ## ISSUE: Can this be reached?
5411     !!!cp ('t181');
5412     last;
5413     }
5414 wakaba 1.52 }
5415 wakaba 1.108
5416 wakaba 1.79 !!!cp ('t182');
5417 wakaba 1.108 !!!parse-error (type => 'unmatched end tag',
5418 wakaba 1.153 text => $token->{tag_name}, token => $token);
5419 wakaba 1.52 ## Ignore the token
5420     !!!next-token;
5421 wakaba 1.126 next B;
5422 wakaba 1.108 } # INSCOPE
5423 wakaba 1.52 } elsif ($token->{tag_name} eq 'table' and
5424 wakaba 1.54 $self->{insertion_mode} == IN_CAPTION_IM) {
5425 wakaba 1.153 !!!parse-error (type => 'not closed', text => 'caption',
5426     token => $token);
5427 wakaba 1.52
5428     ## As if </caption>
5429     ## have a table element in table scope
5430     my $i;
5431     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5432     my $node = $self->{open_elements}->[$_];
5433 wakaba 1.123 if ($node->[1] & CAPTION_EL) {
5434 wakaba 1.79 !!!cp ('t184');
5435 wakaba 1.52 $i = $_;
5436     last INSCOPE;
5437 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5438 wakaba 1.79 !!!cp ('t185');
5439 wakaba 1.52 last INSCOPE;
5440     }
5441     } # INSCOPE
5442     unless (defined $i) {
5443 wakaba 1.79 !!!cp ('t186');
5444 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5445     text => 'caption', token => $token);
5446 wakaba 1.52 ## Ignore the token
5447     !!!next-token;
5448 wakaba 1.126 next B;
5449 wakaba 1.52 }
5450    
5451     ## generate implied end tags
5452 wakaba 1.123 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5453 wakaba 1.79 !!!cp ('t187');
5454 wakaba 1.86 pop @{$self->{open_elements}};
5455 wakaba 1.52 }
5456    
5457 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5458 wakaba 1.79 !!!cp ('t188');
5459 wakaba 1.122 !!!parse-error (type => 'not closed',
5460 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
5461 wakaba 1.122 ->manakai_local_name,
5462     token => $token);
5463 wakaba 1.79 } else {
5464     !!!cp ('t189');
5465 wakaba 1.52 }
5466    
5467     splice @{$self->{open_elements}}, $i;
5468    
5469     $clear_up_to_marker->();
5470    
5471 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
5472 wakaba 1.52
5473     ## reprocess
5474 wakaba 1.126 next B;
5475 wakaba 1.52 } elsif ({
5476     body => 1, col => 1, colgroup => 1, html => 1,
5477     }->{$token->{tag_name}}) {
5478 wakaba 1.56 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
5479 wakaba 1.79 !!!cp ('t190');
5480 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5481     text => $token->{tag_name}, token => $token);
5482 wakaba 1.52 ## Ignore the token
5483     !!!next-token;
5484 wakaba 1.126 next B;
5485 wakaba 1.52 } else {
5486 wakaba 1.79 !!!cp ('t191');
5487 wakaba 1.52 #
5488     }
5489     } elsif ({
5490     tbody => 1, tfoot => 1,
5491     thead => 1, tr => 1,
5492     }->{$token->{tag_name}} and
5493 wakaba 1.54 $self->{insertion_mode} == IN_CAPTION_IM) {
5494 wakaba 1.79 !!!cp ('t192');
5495 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5496     text => $token->{tag_name}, token => $token);
5497 wakaba 1.52 ## Ignore the token
5498     !!!next-token;
5499 wakaba 1.126 next B;
5500 wakaba 1.52 } else {
5501 wakaba 1.79 !!!cp ('t193');
5502 wakaba 1.52 #
5503     }
5504 wakaba 1.104 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5505     for my $entry (@{$self->{open_elements}}) {
5506 wakaba 1.123 unless ($entry->[1] & ALL_END_TAG_OPTIONAL_EL) {
5507 wakaba 1.104 !!!cp ('t75');
5508 wakaba 1.113 !!!parse-error (type => 'in body:#eof', token => $token);
5509 wakaba 1.104 last;
5510     }
5511     }
5512    
5513     ## Stop parsing.
5514     last B;
5515 wakaba 1.52 } else {
5516     die "$0: $token->{type}: Unknown token type";
5517     }
5518    
5519     $insert = $insert_to_current;
5520     #
5521 wakaba 1.56 } elsif ($self->{insertion_mode} & TABLE_IMS) {
5522 wakaba 1.58 if ($token->{type} == CHARACTER_TOKEN) {
5523 wakaba 1.95 if (not $open_tables->[-1]->[1] and # tainted
5524 wakaba 1.188 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
5525 wakaba 1.95 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5526 wakaba 1.52
5527 wakaba 1.95 unless (length $token->{data}) {
5528     !!!cp ('t194');
5529     !!!next-token;
5530 wakaba 1.126 next B;
5531 wakaba 1.95 } else {
5532     !!!cp ('t195');
5533     }
5534     }
5535 wakaba 1.52
5536 wakaba 1.153 !!!parse-error (type => 'in table:#text', token => $token);
5537 wakaba 1.52
5538     ## As if in body, but insert into foster parent element
5539     ## ISSUE: Spec says that "whenever a node would be inserted
5540     ## into the current node" while characters might not
5541     ## result in a new Text node.
5542     $reconstruct_active_formatting_elements->($insert_to_foster);
5543    
5544 wakaba 1.123 if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
5545 wakaba 1.52 # MUST
5546     my $foster_parent_element;
5547     my $next_sibling;
5548     my $prev_sibling;
5549     OE: for (reverse 0..$#{$self->{open_elements}}) {
5550 wakaba 1.123 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
5551 wakaba 1.52 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
5552     if (defined $parent and $parent->node_type == 1) {
5553 wakaba 1.79 !!!cp ('t196');
5554 wakaba 1.52 $foster_parent_element = $parent;
5555     $next_sibling = $self->{open_elements}->[$_]->[0];
5556     $prev_sibling = $next_sibling->previous_sibling;
5557     } else {
5558 wakaba 1.79 !!!cp ('t197');
5559 wakaba 1.52 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
5560     $prev_sibling = $foster_parent_element->last_child;
5561     }
5562     last OE;
5563     }
5564     } # OE
5565     $foster_parent_element = $self->{open_elements}->[0]->[0] and
5566     $prev_sibling = $foster_parent_element->last_child
5567     unless defined $foster_parent_element;
5568     if (defined $prev_sibling and
5569     $prev_sibling->node_type == 3) {
5570 wakaba 1.79 !!!cp ('t198');
5571 wakaba 1.52 $prev_sibling->manakai_append_text ($token->{data});
5572     } else {
5573 wakaba 1.79 !!!cp ('t199');
5574 wakaba 1.52 $foster_parent_element->insert_before
5575     ($self->{document}->create_text_node ($token->{data}),
5576     $next_sibling);
5577     }
5578 wakaba 1.95 $open_tables->[-1]->[1] = 1; # tainted
5579     } else {
5580     !!!cp ('t200');
5581     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5582     }
5583 wakaba 1.52
5584 wakaba 1.95 !!!next-token;
5585 wakaba 1.126 next B;
5586 wakaba 1.58 } elsif ($token->{type} == START_TAG_TOKEN) {
5587 wakaba 1.153 if ({
5588     tr => ($self->{insertion_mode} != IN_ROW_IM),
5589     th => 1, td => 1,
5590     }->{$token->{tag_name}}) {
5591     if ($self->{insertion_mode} == IN_TABLE_IM) {
5592     ## Clear back to table context
5593     while (not ($self->{open_elements}->[-1]->[1]
5594     & TABLE_SCOPING_EL)) {
5595     !!!cp ('t201');
5596     pop @{$self->{open_elements}};
5597     }
5598    
5599     !!!insert-element ('tbody',, $token);
5600     $self->{insertion_mode} = IN_TABLE_BODY_IM;
5601     ## reprocess in the "in table body" insertion mode...
5602     }
5603    
5604     if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5605     unless ($token->{tag_name} eq 'tr') {
5606     !!!cp ('t202');
5607     !!!parse-error (type => 'missing start tag:tr', token => $token);
5608     }
5609 wakaba 1.43
5610 wakaba 1.153 ## Clear back to table body context
5611     while (not ($self->{open_elements}->[-1]->[1]
5612     & TABLE_ROWS_SCOPING_EL)) {
5613     !!!cp ('t203');
5614     ## ISSUE: Can this case be reached?
5615     pop @{$self->{open_elements}};
5616     }
5617 wakaba 1.43
5618 wakaba 1.54 $self->{insertion_mode} = IN_ROW_IM;
5619 wakaba 1.52 if ($token->{tag_name} eq 'tr') {
5620 wakaba 1.79 !!!cp ('t204');
5621 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5622 wakaba 1.125 !!!nack ('t204');
5623 wakaba 1.52 !!!next-token;
5624 wakaba 1.126 next B;
5625 wakaba 1.52 } else {
5626 wakaba 1.79 !!!cp ('t205');
5627 wakaba 1.116 !!!insert-element ('tr',, $token);
5628 wakaba 1.52 ## reprocess in the "in row" insertion mode
5629     }
5630 wakaba 1.79 } else {
5631     !!!cp ('t206');
5632 wakaba 1.52 }
5633    
5634     ## Clear back to table row context
5635 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
5636     & TABLE_ROW_SCOPING_EL)) {
5637 wakaba 1.79 !!!cp ('t207');
5638 wakaba 1.52 pop @{$self->{open_elements}};
5639 wakaba 1.43 }
5640 wakaba 1.52
5641 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5642 wakaba 1.54 $self->{insertion_mode} = IN_CELL_IM;
5643 wakaba 1.52
5644     push @$active_formatting_elements, ['#marker', ''];
5645    
5646 wakaba 1.125 !!!nack ('t207.1');
5647 wakaba 1.52 !!!next-token;
5648 wakaba 1.126 next B;
5649 wakaba 1.52 } elsif ({
5650     caption => 1, col => 1, colgroup => 1,
5651     tbody => 1, tfoot => 1, thead => 1,
5652 wakaba 1.54 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5653 wakaba 1.52 }->{$token->{tag_name}}) {
5654 wakaba 1.54 if ($self->{insertion_mode} == IN_ROW_IM) {
5655 wakaba 1.52 ## As if </tr>
5656 wakaba 1.43 ## have an element in table scope
5657     my $i;
5658     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5659     my $node = $self->{open_elements}->[$_];
5660 wakaba 1.123 if ($node->[1] & TABLE_ROW_EL) {
5661 wakaba 1.79 !!!cp ('t208');
5662 wakaba 1.43 $i = $_;
5663     last INSCOPE;
5664 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5665 wakaba 1.79 !!!cp ('t209');
5666 wakaba 1.43 last INSCOPE;
5667     }
5668     } # INSCOPE
5669 wakaba 1.79 unless (defined $i) {
5670 wakaba 1.125 !!!cp ('t210');
5671 wakaba 1.83 ## TODO: This type is wrong.
5672 wakaba 1.153 !!!parse-error (type => 'unmacthed end tag',
5673     text => $token->{tag_name}, token => $token);
5674 wakaba 1.52 ## Ignore the token
5675 wakaba 1.125 !!!nack ('t210.1');
5676 wakaba 1.52 !!!next-token;
5677 wakaba 1.126 next B;
5678 wakaba 1.43 }
5679    
5680 wakaba 1.52 ## Clear back to table row context
5681 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
5682     & TABLE_ROW_SCOPING_EL)) {
5683 wakaba 1.79 !!!cp ('t211');
5684 wakaba 1.83 ## ISSUE: Can this case be reached?
5685 wakaba 1.52 pop @{$self->{open_elements}};
5686 wakaba 1.1 }
5687 wakaba 1.43
5688 wakaba 1.52 pop @{$self->{open_elements}}; # tr
5689 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5690 wakaba 1.52 if ($token->{tag_name} eq 'tr') {
5691 wakaba 1.79 !!!cp ('t212');
5692 wakaba 1.52 ## reprocess
5693 wakaba 1.125 !!!ack-later;
5694 wakaba 1.126 next B;
5695 wakaba 1.52 } else {
5696 wakaba 1.79 !!!cp ('t213');
5697 wakaba 1.52 ## reprocess in the "in table body" insertion mode...
5698     }
5699 wakaba 1.1 }
5700 wakaba 1.52
5701 wakaba 1.54 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5702 wakaba 1.52 ## have an element in table scope
5703 wakaba 1.43 my $i;
5704     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5705     my $node = $self->{open_elements}->[$_];
5706 wakaba 1.123 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5707 wakaba 1.79 !!!cp ('t214');
5708 wakaba 1.43 $i = $_;
5709     last INSCOPE;
5710 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5711 wakaba 1.79 !!!cp ('t215');
5712 wakaba 1.43 last INSCOPE;
5713     }
5714     } # INSCOPE
5715 wakaba 1.52 unless (defined $i) {
5716 wakaba 1.79 !!!cp ('t216');
5717 wakaba 1.153 ## TODO: This error type is wrong.
5718     !!!parse-error (type => 'unmatched end tag',
5719     text => $token->{tag_name}, token => $token);
5720 wakaba 1.52 ## Ignore the token
5721 wakaba 1.125 !!!nack ('t216.1');
5722 wakaba 1.52 !!!next-token;
5723 wakaba 1.126 next B;
5724 wakaba 1.43 }
5725 wakaba 1.52
5726     ## Clear back to table body context
5727 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
5728     & TABLE_ROWS_SCOPING_EL)) {
5729 wakaba 1.79 !!!cp ('t217');
5730 wakaba 1.83 ## ISSUE: Can this state be reached?
5731 wakaba 1.52 pop @{$self->{open_elements}};
5732 wakaba 1.43 }
5733    
5734 wakaba 1.52 ## As if <{current node}>
5735     ## have an element in table scope
5736     ## true by definition
5737 wakaba 1.43
5738 wakaba 1.52 ## Clear back to table body context
5739     ## nop by definition
5740 wakaba 1.43
5741 wakaba 1.52 pop @{$self->{open_elements}};
5742 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
5743 wakaba 1.52 ## reprocess in "in table" insertion mode...
5744 wakaba 1.79 } else {
5745     !!!cp ('t218');
5746 wakaba 1.52 }
5747    
5748     if ($token->{tag_name} eq 'col') {
5749     ## Clear back to table context
5750 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
5751     & TABLE_SCOPING_EL)) {
5752 wakaba 1.79 !!!cp ('t219');
5753 wakaba 1.83 ## ISSUE: Can this state be reached?
5754 wakaba 1.52 pop @{$self->{open_elements}};
5755     }
5756 wakaba 1.43
5757 wakaba 1.116 !!!insert-element ('colgroup',, $token);
5758 wakaba 1.54 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
5759 wakaba 1.52 ## reprocess
5760 wakaba 1.125 !!!ack-later;
5761 wakaba 1.126 next B;
5762 wakaba 1.52 } elsif ({
5763     caption => 1,
5764     colgroup => 1,
5765     tbody => 1, tfoot => 1, thead => 1,
5766     }->{$token->{tag_name}}) {
5767     ## Clear back to table context
5768 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
5769     & TABLE_SCOPING_EL)) {
5770 wakaba 1.79 !!!cp ('t220');
5771 wakaba 1.83 ## ISSUE: Can this state be reached?
5772 wakaba 1.52 pop @{$self->{open_elements}};
5773 wakaba 1.1 }
5774 wakaba 1.52
5775     push @$active_formatting_elements, ['#marker', '']
5776     if $token->{tag_name} eq 'caption';
5777    
5778 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5779 wakaba 1.52 $self->{insertion_mode} = {
5780 wakaba 1.54 caption => IN_CAPTION_IM,
5781     colgroup => IN_COLUMN_GROUP_IM,
5782     tbody => IN_TABLE_BODY_IM,
5783     tfoot => IN_TABLE_BODY_IM,
5784     thead => IN_TABLE_BODY_IM,
5785 wakaba 1.52 }->{$token->{tag_name}};
5786 wakaba 1.1 !!!next-token;
5787 wakaba 1.125 !!!nack ('t220.1');
5788 wakaba 1.126 next B;
5789 wakaba 1.52 } else {
5790     die "$0: in table: <>: $token->{tag_name}";
5791 wakaba 1.1 }
5792 wakaba 1.52 } elsif ($token->{tag_name} eq 'table') {
5793 wakaba 1.122 !!!parse-error (type => 'not closed',
5794 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
5795 wakaba 1.122 ->manakai_local_name,
5796     token => $token);
5797 wakaba 1.1
5798 wakaba 1.52 ## As if </table>
5799 wakaba 1.1 ## have a table element in table scope
5800     my $i;
5801 wakaba 1.3 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5802     my $node = $self->{open_elements}->[$_];
5803 wakaba 1.123 if ($node->[1] & TABLE_EL) {
5804 wakaba 1.79 !!!cp ('t221');
5805 wakaba 1.1 $i = $_;
5806     last INSCOPE;
5807 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5808 wakaba 1.79 !!!cp ('t222');
5809 wakaba 1.1 last INSCOPE;
5810     }
5811     } # INSCOPE
5812     unless (defined $i) {
5813 wakaba 1.79 !!!cp ('t223');
5814 wakaba 1.83 ## TODO: The following is wrong, maybe.
5815 wakaba 1.153 !!!parse-error (type => 'unmatched end tag', text => 'table',
5816     token => $token);
5817 wakaba 1.52 ## Ignore tokens </table><table>
5818 wakaba 1.125 !!!nack ('t223.1');
5819 wakaba 1.1 !!!next-token;
5820 wakaba 1.126 next B;
5821 wakaba 1.1 }
5822    
5823 wakaba 1.151 ## TODO: The following steps have been removed from the latest spec.
5824 wakaba 1.1 ## generate implied end tags
5825 wakaba 1.123 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5826 wakaba 1.79 !!!cp ('t224');
5827 wakaba 1.86 pop @{$self->{open_elements}};
5828 wakaba 1.1 }
5829    
5830 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & TABLE_EL) {
5831 wakaba 1.79 !!!cp ('t225');
5832 wakaba 1.122 ## NOTE: |<table><tr><table>|
5833     !!!parse-error (type => 'not closed',
5834 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
5835 wakaba 1.122 ->manakai_local_name,
5836     token => $token);
5837 wakaba 1.79 } else {
5838     !!!cp ('t226');
5839 wakaba 1.1 }
5840    
5841 wakaba 1.3 splice @{$self->{open_elements}}, $i;
5842 wakaba 1.95 pop @{$open_tables};
5843 wakaba 1.1
5844 wakaba 1.52 $self->_reset_insertion_mode;
5845 wakaba 1.1
5846 wakaba 1.125 ## reprocess
5847     !!!ack-later;
5848 wakaba 1.126 next B;
5849 wakaba 1.100 } elsif ($token->{tag_name} eq 'style') {
5850     if (not $open_tables->[-1]->[1]) { # tainted
5851     !!!cp ('t227.8');
5852     ## NOTE: This is a "as if in head" code clone.
5853     $parse_rcdata->(CDATA_CONTENT_MODEL);
5854 wakaba 1.126 next B;
5855 wakaba 1.100 } else {
5856     !!!cp ('t227.7');
5857     #
5858     }
5859     } elsif ($token->{tag_name} eq 'script') {
5860     if (not $open_tables->[-1]->[1]) { # tainted
5861     !!!cp ('t227.6');
5862     ## NOTE: This is a "as if in head" code clone.
5863     $script_start_tag->();
5864 wakaba 1.126 next B;
5865 wakaba 1.100 } else {
5866     !!!cp ('t227.5');
5867     #
5868     }
5869 wakaba 1.98 } elsif ($token->{tag_name} eq 'input') {
5870     if (not $open_tables->[-1]->[1]) { # tainted
5871     if ($token->{attributes}->{type}) { ## TODO: case
5872     my $type = lc $token->{attributes}->{type}->{value};
5873     if ($type eq 'hidden') {
5874     !!!cp ('t227.3');
5875 wakaba 1.153 !!!parse-error (type => 'in table',
5876     text => $token->{tag_name}, token => $token);
5877 wakaba 1.98
5878 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5879 wakaba 1.98
5880     ## TODO: form element pointer
5881    
5882     pop @{$self->{open_elements}};
5883    
5884     !!!next-token;
5885 wakaba 1.125 !!!ack ('t227.2.1');
5886 wakaba 1.126 next B;
5887 wakaba 1.98 } else {
5888     !!!cp ('t227.2');
5889     #
5890     }
5891     } else {
5892     !!!cp ('t227.1');
5893     #
5894     }
5895     } else {
5896     !!!cp ('t227.4');
5897     #
5898     }
5899 wakaba 1.58 } else {
5900 wakaba 1.79 !!!cp ('t227');
5901 wakaba 1.58 #
5902     }
5903 wakaba 1.98
5904 wakaba 1.153 !!!parse-error (type => 'in table', text => $token->{tag_name},
5905     token => $token);
5906 wakaba 1.98
5907     $insert = $insert_to_foster;
5908     #
5909 wakaba 1.58 } elsif ($token->{type} == END_TAG_TOKEN) {
5910 wakaba 1.52 if ($token->{tag_name} eq 'tr' and
5911 wakaba 1.54 $self->{insertion_mode} == IN_ROW_IM) {
5912 wakaba 1.52 ## have an element in table scope
5913     my $i;
5914     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5915     my $node = $self->{open_elements}->[$_];
5916 wakaba 1.123 if ($node->[1] & TABLE_ROW_EL) {
5917 wakaba 1.79 !!!cp ('t228');
5918 wakaba 1.52 $i = $_;
5919     last INSCOPE;
5920 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5921 wakaba 1.79 !!!cp ('t229');
5922 wakaba 1.52 last INSCOPE;
5923     }
5924     } # INSCOPE
5925     unless (defined $i) {
5926 wakaba 1.79 !!!cp ('t230');
5927 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5928     text => $token->{tag_name}, token => $token);
5929 wakaba 1.52 ## Ignore the token
5930 wakaba 1.125 !!!nack ('t230.1');
5931 wakaba 1.42 !!!next-token;
5932 wakaba 1.126 next B;
5933 wakaba 1.79 } else {
5934     !!!cp ('t232');
5935 wakaba 1.42 }
5936    
5937 wakaba 1.52 ## Clear back to table row context
5938 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
5939     & TABLE_ROW_SCOPING_EL)) {
5940 wakaba 1.79 !!!cp ('t231');
5941 wakaba 1.83 ## ISSUE: Can this state be reached?
5942 wakaba 1.52 pop @{$self->{open_elements}};
5943     }
5944 wakaba 1.42
5945 wakaba 1.52 pop @{$self->{open_elements}}; # tr
5946 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5947 wakaba 1.52 !!!next-token;
5948 wakaba 1.125 !!!nack ('t231.1');
5949 wakaba 1.126 next B;
5950 wakaba 1.52 } elsif ($token->{tag_name} eq 'table') {
5951 wakaba 1.54 if ($self->{insertion_mode} == IN_ROW_IM) {
5952 wakaba 1.52 ## As if </tr>
5953     ## have an element in table scope
5954     my $i;
5955     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5956     my $node = $self->{open_elements}->[$_];
5957 wakaba 1.123 if ($node->[1] & TABLE_ROW_EL) {
5958 wakaba 1.79 !!!cp ('t233');
5959 wakaba 1.52 $i = $_;
5960     last INSCOPE;
5961 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5962 wakaba 1.79 !!!cp ('t234');
5963 wakaba 1.52 last INSCOPE;
5964 wakaba 1.42 }
5965 wakaba 1.52 } # INSCOPE
5966     unless (defined $i) {
5967 wakaba 1.79 !!!cp ('t235');
5968 wakaba 1.83 ## TODO: The following is wrong.
5969 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5970     text => $token->{type}, token => $token);
5971 wakaba 1.52 ## Ignore the token
5972 wakaba 1.125 !!!nack ('t236.1');
5973 wakaba 1.52 !!!next-token;
5974 wakaba 1.126 next B;
5975 wakaba 1.42 }
5976 wakaba 1.52
5977     ## Clear back to table row context
5978 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
5979     & TABLE_ROW_SCOPING_EL)) {
5980 wakaba 1.79 !!!cp ('t236');
5981 wakaba 1.83 ## ISSUE: Can this state be reached?
5982 wakaba 1.46 pop @{$self->{open_elements}};
5983 wakaba 1.1 }
5984 wakaba 1.46
5985 wakaba 1.52 pop @{$self->{open_elements}}; # tr
5986 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5987 wakaba 1.46 ## reprocess in the "in table body" insertion mode...
5988 wakaba 1.1 }
5989    
5990 wakaba 1.54 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5991 wakaba 1.52 ## have an element in table scope
5992     my $i;
5993     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5994     my $node = $self->{open_elements}->[$_];
5995 wakaba 1.123 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5996 wakaba 1.79 !!!cp ('t237');
5997 wakaba 1.52 $i = $_;
5998     last INSCOPE;
5999 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6000 wakaba 1.79 !!!cp ('t238');
6001 wakaba 1.52 last INSCOPE;
6002     }
6003     } # INSCOPE
6004     unless (defined $i) {
6005 wakaba 1.79 !!!cp ('t239');
6006 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6007     text => $token->{tag_name}, token => $token);
6008 wakaba 1.52 ## Ignore the token
6009 wakaba 1.125 !!!nack ('t239.1');
6010 wakaba 1.52 !!!next-token;
6011 wakaba 1.126 next B;
6012 wakaba 1.47 }
6013    
6014     ## Clear back to table body context
6015 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
6016     & TABLE_ROWS_SCOPING_EL)) {
6017 wakaba 1.79 !!!cp ('t240');
6018 wakaba 1.47 pop @{$self->{open_elements}};
6019     }
6020    
6021 wakaba 1.52 ## As if <{current node}>
6022     ## have an element in table scope
6023     ## true by definition
6024    
6025     ## Clear back to table body context
6026     ## nop by definition
6027    
6028     pop @{$self->{open_elements}};
6029 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
6030 wakaba 1.52 ## reprocess in the "in table" insertion mode...
6031     }
6032    
6033 wakaba 1.94 ## NOTE: </table> in the "in table" insertion mode.
6034     ## When you edit the code fragment below, please ensure that
6035     ## the code for <table> in the "in table" insertion mode
6036     ## is synced with it.
6037    
6038 wakaba 1.52 ## have a table element in table scope
6039     my $i;
6040     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6041     my $node = $self->{open_elements}->[$_];
6042 wakaba 1.123 if ($node->[1] & TABLE_EL) {
6043 wakaba 1.79 !!!cp ('t241');
6044 wakaba 1.52 $i = $_;
6045     last INSCOPE;
6046 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6047 wakaba 1.79 !!!cp ('t242');
6048 wakaba 1.52 last INSCOPE;
6049 wakaba 1.47 }
6050 wakaba 1.52 } # INSCOPE
6051     unless (defined $i) {
6052 wakaba 1.79 !!!cp ('t243');
6053 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6054     text => $token->{tag_name}, token => $token);
6055 wakaba 1.52 ## Ignore the token
6056 wakaba 1.125 !!!nack ('t243.1');
6057 wakaba 1.52 !!!next-token;
6058 wakaba 1.126 next B;
6059 wakaba 1.3 }
6060 wakaba 1.52
6061     splice @{$self->{open_elements}}, $i;
6062 wakaba 1.95 pop @{$open_tables};
6063 wakaba 1.1
6064 wakaba 1.52 $self->_reset_insertion_mode;
6065 wakaba 1.47
6066     !!!next-token;
6067 wakaba 1.126 next B;
6068 wakaba 1.47 } elsif ({
6069 wakaba 1.48 tbody => 1, tfoot => 1, thead => 1,
6070 wakaba 1.52 }->{$token->{tag_name}} and
6071 wakaba 1.56 $self->{insertion_mode} & ROW_IMS) {
6072 wakaba 1.54 if ($self->{insertion_mode} == IN_ROW_IM) {
6073 wakaba 1.52 ## have an element in table scope
6074     my $i;
6075     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6076     my $node = $self->{open_elements}->[$_];
6077 wakaba 1.123 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6078 wakaba 1.79 !!!cp ('t247');
6079 wakaba 1.52 $i = $_;
6080     last INSCOPE;
6081 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6082 wakaba 1.79 !!!cp ('t248');
6083 wakaba 1.52 last INSCOPE;
6084     }
6085     } # INSCOPE
6086     unless (defined $i) {
6087 wakaba 1.79 !!!cp ('t249');
6088 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6089     text => $token->{tag_name}, token => $token);
6090 wakaba 1.52 ## Ignore the token
6091 wakaba 1.125 !!!nack ('t249.1');
6092 wakaba 1.52 !!!next-token;
6093 wakaba 1.126 next B;
6094 wakaba 1.52 }
6095    
6096 wakaba 1.48 ## As if </tr>
6097     ## have an element in table scope
6098     my $i;
6099     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6100     my $node = $self->{open_elements}->[$_];
6101 wakaba 1.123 if ($node->[1] & TABLE_ROW_EL) {
6102 wakaba 1.79 !!!cp ('t250');
6103 wakaba 1.48 $i = $_;
6104     last INSCOPE;
6105 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6106 wakaba 1.79 !!!cp ('t251');
6107 wakaba 1.48 last INSCOPE;
6108     }
6109     } # INSCOPE
6110 wakaba 1.52 unless (defined $i) {
6111 wakaba 1.79 !!!cp ('t252');
6112 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6113     text => 'tr', token => $token);
6114 wakaba 1.52 ## Ignore the token
6115 wakaba 1.125 !!!nack ('t252.1');
6116 wakaba 1.52 !!!next-token;
6117 wakaba 1.126 next B;
6118 wakaba 1.52 }
6119 wakaba 1.48
6120     ## Clear back to table row context
6121 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
6122     & TABLE_ROW_SCOPING_EL)) {
6123 wakaba 1.79 !!!cp ('t253');
6124 wakaba 1.83 ## ISSUE: Can this case be reached?
6125 wakaba 1.48 pop @{$self->{open_elements}};
6126     }
6127    
6128     pop @{$self->{open_elements}}; # tr
6129 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
6130 wakaba 1.52 ## reprocess in the "in table body" insertion mode...
6131     }
6132    
6133     ## have an element in table scope
6134     my $i;
6135     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6136     my $node = $self->{open_elements}->[$_];
6137 wakaba 1.123 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6138 wakaba 1.79 !!!cp ('t254');
6139 wakaba 1.52 $i = $_;
6140     last INSCOPE;
6141 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6142 wakaba 1.79 !!!cp ('t255');
6143 wakaba 1.52 last INSCOPE;
6144     }
6145     } # INSCOPE
6146     unless (defined $i) {
6147 wakaba 1.79 !!!cp ('t256');
6148 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6149     text => $token->{tag_name}, token => $token);
6150 wakaba 1.52 ## Ignore the token
6151 wakaba 1.125 !!!nack ('t256.1');
6152 wakaba 1.52 !!!next-token;
6153 wakaba 1.126 next B;
6154 wakaba 1.52 }
6155    
6156     ## Clear back to table body context
6157 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
6158     & TABLE_ROWS_SCOPING_EL)) {
6159 wakaba 1.79 !!!cp ('t257');
6160 wakaba 1.83 ## ISSUE: Can this case be reached?
6161 wakaba 1.52 pop @{$self->{open_elements}};
6162     }
6163    
6164     pop @{$self->{open_elements}};
6165 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
6166 wakaba 1.125 !!!nack ('t257.1');
6167 wakaba 1.52 !!!next-token;
6168 wakaba 1.126 next B;
6169 wakaba 1.52 } elsif ({
6170     body => 1, caption => 1, col => 1, colgroup => 1,
6171     html => 1, td => 1, th => 1,
6172 wakaba 1.54 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
6173     tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
6174 wakaba 1.52 }->{$token->{tag_name}}) {
6175 wakaba 1.125 !!!cp ('t258');
6176 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6177     text => $token->{tag_name}, token => $token);
6178 wakaba 1.125 ## Ignore the token
6179     !!!nack ('t258.1');
6180     !!!next-token;
6181 wakaba 1.126 next B;
6182 wakaba 1.58 } else {
6183 wakaba 1.79 !!!cp ('t259');
6184 wakaba 1.153 !!!parse-error (type => 'in table:/',
6185     text => $token->{tag_name}, token => $token);
6186 wakaba 1.52
6187 wakaba 1.58 $insert = $insert_to_foster;
6188     #
6189     }
6190 wakaba 1.104 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6191 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6192 wakaba 1.104 @{$self->{open_elements}} == 1) { # redundant, maybe
6193 wakaba 1.113 !!!parse-error (type => 'in body:#eof', token => $token);
6194 wakaba 1.104 !!!cp ('t259.1');
6195 wakaba 1.105 #
6196 wakaba 1.104 } else {
6197     !!!cp ('t259.2');
6198 wakaba 1.105 #
6199 wakaba 1.104 }
6200    
6201     ## Stop parsing
6202     last B;
6203 wakaba 1.58 } else {
6204     die "$0: $token->{type}: Unknown token type";
6205     }
6206 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
6207 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
6208 wakaba 1.188 if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
6209 wakaba 1.52 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6210     unless (length $token->{data}) {
6211 wakaba 1.79 !!!cp ('t260');
6212 wakaba 1.52 !!!next-token;
6213 wakaba 1.126 next B;
6214 wakaba 1.52 }
6215     }
6216    
6217 wakaba 1.79 !!!cp ('t261');
6218 wakaba 1.52 #
6219 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
6220 wakaba 1.52 if ($token->{tag_name} eq 'col') {
6221 wakaba 1.79 !!!cp ('t262');
6222 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6223 wakaba 1.52 pop @{$self->{open_elements}};
6224 wakaba 1.125 !!!ack ('t262.1');
6225 wakaba 1.52 !!!next-token;
6226 wakaba 1.126 next B;
6227 wakaba 1.52 } else {
6228 wakaba 1.79 !!!cp ('t263');
6229 wakaba 1.52 #
6230     }
6231 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
6232 wakaba 1.52 if ($token->{tag_name} eq 'colgroup') {
6233 wakaba 1.123 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
6234 wakaba 1.79 !!!cp ('t264');
6235 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6236     text => 'colgroup', token => $token);
6237 wakaba 1.52 ## Ignore the token
6238     !!!next-token;
6239 wakaba 1.126 next B;
6240 wakaba 1.52 } else {
6241 wakaba 1.79 !!!cp ('t265');
6242 wakaba 1.52 pop @{$self->{open_elements}}; # colgroup
6243 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
6244 wakaba 1.52 !!!next-token;
6245 wakaba 1.126 next B;
6246 wakaba 1.52 }
6247     } elsif ($token->{tag_name} eq 'col') {
6248 wakaba 1.79 !!!cp ('t266');
6249 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6250     text => 'col', token => $token);
6251 wakaba 1.52 ## Ignore the token
6252     !!!next-token;
6253 wakaba 1.126 next B;
6254 wakaba 1.52 } else {
6255 wakaba 1.79 !!!cp ('t267');
6256 wakaba 1.52 #
6257     }
6258 wakaba 1.104 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6259 wakaba 1.123 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
6260 wakaba 1.104 @{$self->{open_elements}} == 1) { # redundant, maybe
6261     !!!cp ('t270.2');
6262     ## Stop parsing.
6263     last B;
6264     } else {
6265     ## NOTE: As if </colgroup>.
6266     !!!cp ('t270.1');
6267     pop @{$self->{open_elements}}; # colgroup
6268     $self->{insertion_mode} = IN_TABLE_IM;
6269     ## Reprocess.
6270 wakaba 1.126 next B;
6271 wakaba 1.104 }
6272     } else {
6273     die "$0: $token->{type}: Unknown token type";
6274     }
6275 wakaba 1.52
6276     ## As if </colgroup>
6277 wakaba 1.123 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
6278 wakaba 1.79 !!!cp ('t269');
6279 wakaba 1.104 ## TODO: Wrong error type?
6280 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6281     text => 'colgroup', token => $token);
6282 wakaba 1.52 ## Ignore the token
6283 wakaba 1.125 !!!nack ('t269.1');
6284 wakaba 1.52 !!!next-token;
6285 wakaba 1.126 next B;
6286 wakaba 1.52 } else {
6287 wakaba 1.79 !!!cp ('t270');
6288 wakaba 1.52 pop @{$self->{open_elements}}; # colgroup
6289 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
6290 wakaba 1.125 !!!ack-later;
6291 wakaba 1.52 ## reprocess
6292 wakaba 1.126 next B;
6293 wakaba 1.52 }
6294 wakaba 1.101 } elsif ($self->{insertion_mode} & SELECT_IMS) {
6295 wakaba 1.58 if ($token->{type} == CHARACTER_TOKEN) {
6296 wakaba 1.79 !!!cp ('t271');
6297 wakaba 1.58 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
6298     !!!next-token;
6299 wakaba 1.126 next B;
6300 wakaba 1.58 } elsif ($token->{type} == START_TAG_TOKEN) {
6301 wakaba 1.123 if ($token->{tag_name} eq 'option') {
6302     if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6303     !!!cp ('t272');
6304     ## As if </option>
6305     pop @{$self->{open_elements}};
6306     } else {
6307     !!!cp ('t273');
6308     }
6309 wakaba 1.52
6310 wakaba 1.123 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6311 wakaba 1.125 !!!nack ('t273.1');
6312 wakaba 1.123 !!!next-token;
6313 wakaba 1.126 next B;
6314 wakaba 1.123 } elsif ($token->{tag_name} eq 'optgroup') {
6315     if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6316     !!!cp ('t274');
6317     ## As if </option>
6318     pop @{$self->{open_elements}};
6319     } else {
6320     !!!cp ('t275');
6321     }
6322 wakaba 1.52
6323 wakaba 1.123 if ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
6324     !!!cp ('t276');
6325     ## As if </optgroup>
6326     pop @{$self->{open_elements}};
6327     } else {
6328     !!!cp ('t277');
6329     }
6330 wakaba 1.52
6331 wakaba 1.123 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6332 wakaba 1.125 !!!nack ('t277.1');
6333 wakaba 1.123 !!!next-token;
6334 wakaba 1.126 next B;
6335 wakaba 1.146 } elsif ({
6336     select => 1, input => 1, textarea => 1,
6337     }->{$token->{tag_name}} or
6338 wakaba 1.101 ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
6339     {
6340     caption => 1, table => 1,
6341     tbody => 1, tfoot => 1, thead => 1,
6342     tr => 1, td => 1, th => 1,
6343     }->{$token->{tag_name}})) {
6344     ## TODO: The type below is not good - <select> is replaced by </select>
6345 wakaba 1.153 !!!parse-error (type => 'not closed', text => 'select',
6346     token => $token);
6347 wakaba 1.101 ## NOTE: As if the token were </select> (<select> case) or
6348     ## as if there were </select> (otherwise).
6349 wakaba 1.123 ## have an element in table scope
6350     my $i;
6351     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6352     my $node = $self->{open_elements}->[$_];
6353     if ($node->[1] & SELECT_EL) {
6354     !!!cp ('t278');
6355     $i = $_;
6356     last INSCOPE;
6357     } elsif ($node->[1] & TABLE_SCOPING_EL) {
6358     !!!cp ('t279');
6359     last INSCOPE;
6360     }
6361     } # INSCOPE
6362     unless (defined $i) {
6363     !!!cp ('t280');
6364 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6365     text => 'select', token => $token);
6366 wakaba 1.123 ## Ignore the token
6367 wakaba 1.125 !!!nack ('t280.1');
6368 wakaba 1.123 !!!next-token;
6369 wakaba 1.126 next B;
6370 wakaba 1.123 }
6371 wakaba 1.52
6372 wakaba 1.123 !!!cp ('t281');
6373     splice @{$self->{open_elements}}, $i;
6374 wakaba 1.52
6375 wakaba 1.123 $self->_reset_insertion_mode;
6376 wakaba 1.47
6377 wakaba 1.101 if ($token->{tag_name} eq 'select') {
6378 wakaba 1.125 !!!nack ('t281.2');
6379 wakaba 1.101 !!!next-token;
6380 wakaba 1.126 next B;
6381 wakaba 1.101 } else {
6382     !!!cp ('t281.1');
6383 wakaba 1.125 !!!ack-later;
6384 wakaba 1.101 ## Reprocess the token.
6385 wakaba 1.126 next B;
6386 wakaba 1.101 }
6387 wakaba 1.58 } else {
6388 wakaba 1.79 !!!cp ('t282');
6389 wakaba 1.153 !!!parse-error (type => 'in select',
6390     text => $token->{tag_name}, token => $token);
6391 wakaba 1.58 ## Ignore the token
6392 wakaba 1.125 !!!nack ('t282.1');
6393 wakaba 1.58 !!!next-token;
6394 wakaba 1.126 next B;
6395 wakaba 1.58 }
6396     } elsif ($token->{type} == END_TAG_TOKEN) {
6397 wakaba 1.123 if ($token->{tag_name} eq 'optgroup') {
6398     if ($self->{open_elements}->[-1]->[1] & OPTION_EL and
6399     $self->{open_elements}->[-2]->[1] & OPTGROUP_EL) {
6400     !!!cp ('t283');
6401     ## As if </option>
6402     splice @{$self->{open_elements}}, -2;
6403     } elsif ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
6404     !!!cp ('t284');
6405     pop @{$self->{open_elements}};
6406     } else {
6407     !!!cp ('t285');
6408 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6409     text => $token->{tag_name}, token => $token);
6410 wakaba 1.123 ## Ignore the token
6411     }
6412 wakaba 1.125 !!!nack ('t285.1');
6413 wakaba 1.123 !!!next-token;
6414 wakaba 1.126 next B;
6415 wakaba 1.123 } elsif ($token->{tag_name} eq 'option') {
6416     if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6417     !!!cp ('t286');
6418     pop @{$self->{open_elements}};
6419     } else {
6420     !!!cp ('t287');
6421 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6422     text => $token->{tag_name}, token => $token);
6423 wakaba 1.123 ## Ignore the token
6424     }
6425 wakaba 1.125 !!!nack ('t287.1');
6426 wakaba 1.123 !!!next-token;
6427 wakaba 1.126 next B;
6428 wakaba 1.123 } elsif ($token->{tag_name} eq 'select') {
6429     ## have an element in table scope
6430     my $i;
6431     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6432     my $node = $self->{open_elements}->[$_];
6433     if ($node->[1] & SELECT_EL) {
6434     !!!cp ('t288');
6435     $i = $_;
6436     last INSCOPE;
6437     } elsif ($node->[1] & TABLE_SCOPING_EL) {
6438     !!!cp ('t289');
6439     last INSCOPE;
6440     }
6441     } # INSCOPE
6442     unless (defined $i) {
6443     !!!cp ('t290');
6444 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6445     text => $token->{tag_name}, token => $token);
6446 wakaba 1.123 ## Ignore the token
6447 wakaba 1.125 !!!nack ('t290.1');
6448 wakaba 1.123 !!!next-token;
6449 wakaba 1.126 next B;
6450 wakaba 1.123 }
6451 wakaba 1.52
6452 wakaba 1.123 !!!cp ('t291');
6453     splice @{$self->{open_elements}}, $i;
6454 wakaba 1.52
6455 wakaba 1.123 $self->_reset_insertion_mode;
6456 wakaba 1.52
6457 wakaba 1.125 !!!nack ('t291.1');
6458 wakaba 1.123 !!!next-token;
6459 wakaba 1.126 next B;
6460 wakaba 1.101 } elsif ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
6461     {
6462     caption => 1, table => 1, tbody => 1,
6463     tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
6464     }->{$token->{tag_name}}) {
6465 wakaba 1.83 ## TODO: The following is wrong?
6466 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6467     text => $token->{tag_name}, token => $token);
6468 wakaba 1.52
6469 wakaba 1.123 ## have an element in table scope
6470     my $i;
6471     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6472     my $node = $self->{open_elements}->[$_];
6473     if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6474     !!!cp ('t292');
6475     $i = $_;
6476     last INSCOPE;
6477     } elsif ($node->[1] & TABLE_SCOPING_EL) {
6478     !!!cp ('t293');
6479     last INSCOPE;
6480     }
6481     } # INSCOPE
6482     unless (defined $i) {
6483     !!!cp ('t294');
6484     ## Ignore the token
6485 wakaba 1.125 !!!nack ('t294.1');
6486 wakaba 1.123 !!!next-token;
6487 wakaba 1.126 next B;
6488 wakaba 1.123 }
6489 wakaba 1.52
6490 wakaba 1.123 ## As if </select>
6491     ## have an element in table scope
6492     undef $i;
6493     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6494     my $node = $self->{open_elements}->[$_];
6495     if ($node->[1] & SELECT_EL) {
6496     !!!cp ('t295');
6497     $i = $_;
6498     last INSCOPE;
6499     } elsif ($node->[1] & TABLE_SCOPING_EL) {
6500 wakaba 1.83 ## ISSUE: Can this state be reached?
6501 wakaba 1.123 !!!cp ('t296');
6502     last INSCOPE;
6503     }
6504     } # INSCOPE
6505     unless (defined $i) {
6506     !!!cp ('t297');
6507 wakaba 1.83 ## TODO: The following error type is correct?
6508 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6509     text => 'select', token => $token);
6510 wakaba 1.123 ## Ignore the </select> token
6511 wakaba 1.125 !!!nack ('t297.1');
6512 wakaba 1.123 !!!next-token; ## TODO: ok?
6513 wakaba 1.126 next B;
6514 wakaba 1.123 }
6515 wakaba 1.52
6516 wakaba 1.123 !!!cp ('t298');
6517     splice @{$self->{open_elements}}, $i;
6518 wakaba 1.52
6519 wakaba 1.123 $self->_reset_insertion_mode;
6520 wakaba 1.52
6521 wakaba 1.125 !!!ack-later;
6522 wakaba 1.123 ## reprocess
6523 wakaba 1.126 next B;
6524 wakaba 1.58 } else {
6525 wakaba 1.79 !!!cp ('t299');
6526 wakaba 1.153 !!!parse-error (type => 'in select:/',
6527     text => $token->{tag_name}, token => $token);
6528 wakaba 1.52 ## Ignore the token
6529 wakaba 1.125 !!!nack ('t299.3');
6530 wakaba 1.52 !!!next-token;
6531 wakaba 1.126 next B;
6532 wakaba 1.58 }
6533 wakaba 1.104 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6534 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6535 wakaba 1.104 @{$self->{open_elements}} == 1) { # redundant, maybe
6536     !!!cp ('t299.1');
6537 wakaba 1.113 !!!parse-error (type => 'in body:#eof', token => $token);
6538 wakaba 1.104 } else {
6539     !!!cp ('t299.2');
6540     }
6541    
6542     ## Stop parsing.
6543     last B;
6544 wakaba 1.58 } else {
6545     die "$0: $token->{type}: Unknown token type";
6546     }
6547 wakaba 1.56 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
6548 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
6549 wakaba 1.188 if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
6550 wakaba 1.52 my $data = $1;
6551     ## As if in body
6552     $reconstruct_active_formatting_elements->($insert_to_current);
6553    
6554     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6555    
6556     unless (length $token->{data}) {
6557 wakaba 1.79 !!!cp ('t300');
6558 wakaba 1.52 !!!next-token;
6559 wakaba 1.126 next B;
6560 wakaba 1.52 }
6561     }
6562    
6563 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6564 wakaba 1.79 !!!cp ('t301');
6565 wakaba 1.153 !!!parse-error (type => 'after html:#text', token => $token);
6566 wakaba 1.188 #
6567 wakaba 1.79 } else {
6568     !!!cp ('t302');
6569 wakaba 1.188 ## "after body" insertion mode
6570     !!!parse-error (type => 'after body:#text', token => $token);
6571     #
6572 wakaba 1.52 }
6573    
6574 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
6575 wakaba 1.52 ## reprocess
6576 wakaba 1.126 next B;
6577 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
6578 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6579 wakaba 1.79 !!!cp ('t303');
6580 wakaba 1.153 !!!parse-error (type => 'after html',
6581     text => $token->{tag_name}, token => $token);
6582 wakaba 1.188 #
6583 wakaba 1.79 } else {
6584     !!!cp ('t304');
6585 wakaba 1.188 ## "after body" insertion mode
6586     !!!parse-error (type => 'after body',
6587     text => $token->{tag_name}, token => $token);
6588     #
6589 wakaba 1.52 }
6590    
6591 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
6592 wakaba 1.125 !!!ack-later;
6593 wakaba 1.52 ## reprocess
6594 wakaba 1.126 next B;
6595 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
6596 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6597 wakaba 1.79 !!!cp ('t305');
6598 wakaba 1.153 !!!parse-error (type => 'after html:/',
6599     text => $token->{tag_name}, token => $token);
6600 wakaba 1.52
6601 wakaba 1.188 $self->{insertion_mode} = IN_BODY_IM;
6602     ## Reprocess.
6603     next B;
6604 wakaba 1.79 } else {
6605     !!!cp ('t306');
6606 wakaba 1.52 }
6607    
6608     ## "after body" insertion mode
6609     if ($token->{tag_name} eq 'html') {
6610     if (defined $self->{inner_html_node}) {
6611 wakaba 1.79 !!!cp ('t307');
6612 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6613     text => 'html', token => $token);
6614 wakaba 1.52 ## Ignore the token
6615     !!!next-token;
6616 wakaba 1.126 next B;
6617 wakaba 1.52 } else {
6618 wakaba 1.79 !!!cp ('t308');
6619 wakaba 1.54 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
6620 wakaba 1.52 !!!next-token;
6621 wakaba 1.126 next B;
6622 wakaba 1.52 }
6623     } else {
6624 wakaba 1.79 !!!cp ('t309');
6625 wakaba 1.153 !!!parse-error (type => 'after body:/',
6626     text => $token->{tag_name}, token => $token);
6627 wakaba 1.52
6628 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
6629 wakaba 1.52 ## reprocess
6630 wakaba 1.126 next B;
6631 wakaba 1.52 }
6632 wakaba 1.104 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6633     !!!cp ('t309.2');
6634     ## Stop parsing
6635     last B;
6636 wakaba 1.52 } else {
6637     die "$0: $token->{type}: Unknown token type";
6638     }
6639 wakaba 1.56 } elsif ($self->{insertion_mode} & FRAME_IMS) {
6640 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
6641 wakaba 1.188 if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
6642 wakaba 1.52 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6643    
6644     unless (length $token->{data}) {
6645 wakaba 1.79 !!!cp ('t310');
6646 wakaba 1.52 !!!next-token;
6647 wakaba 1.126 next B;
6648 wakaba 1.52 }
6649     }
6650    
6651 wakaba 1.188 if ($token->{data} =~ s/^[^\x09\x0A\x0C\x20]+//) {
6652 wakaba 1.54 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6653 wakaba 1.79 !!!cp ('t311');
6654 wakaba 1.153 !!!parse-error (type => 'in frameset:#text', token => $token);
6655 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6656 wakaba 1.79 !!!cp ('t312');
6657 wakaba 1.153 !!!parse-error (type => 'after frameset:#text', token => $token);
6658 wakaba 1.158 } else { # "after after frameset"
6659 wakaba 1.79 !!!cp ('t313');
6660 wakaba 1.153 !!!parse-error (type => 'after html:#text', token => $token);
6661 wakaba 1.52 }
6662    
6663     ## Ignore the token.
6664     if (length $token->{data}) {
6665 wakaba 1.79 !!!cp ('t314');
6666 wakaba 1.52 ## reprocess the rest of characters
6667     } else {
6668 wakaba 1.79 !!!cp ('t315');
6669 wakaba 1.52 !!!next-token;
6670     }
6671 wakaba 1.126 next B;
6672 wakaba 1.52 }
6673    
6674     die qq[$0: Character "$token->{data}"];
6675 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
6676 wakaba 1.52 if ($token->{tag_name} eq 'frameset' and
6677 wakaba 1.54 $self->{insertion_mode} == IN_FRAMESET_IM) {
6678 wakaba 1.79 !!!cp ('t318');
6679 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6680 wakaba 1.125 !!!nack ('t318.1');
6681 wakaba 1.52 !!!next-token;
6682 wakaba 1.126 next B;
6683 wakaba 1.52 } elsif ($token->{tag_name} eq 'frame' and
6684 wakaba 1.54 $self->{insertion_mode} == IN_FRAMESET_IM) {
6685 wakaba 1.79 !!!cp ('t319');
6686 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6687 wakaba 1.52 pop @{$self->{open_elements}};
6688 wakaba 1.125 !!!ack ('t319.1');
6689 wakaba 1.52 !!!next-token;
6690 wakaba 1.126 next B;
6691 wakaba 1.52 } elsif ($token->{tag_name} eq 'noframes') {
6692 wakaba 1.79 !!!cp ('t320');
6693 wakaba 1.148 ## NOTE: As if in head.
6694 wakaba 1.96 $parse_rcdata->(CDATA_CONTENT_MODEL);
6695 wakaba 1.126 next B;
6696 wakaba 1.158
6697     ## NOTE: |<!DOCTYPE HTML><frameset></frameset></html><noframes></noframes>|
6698     ## has no parse error.
6699 wakaba 1.52 } else {
6700 wakaba 1.54 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6701 wakaba 1.79 !!!cp ('t321');
6702 wakaba 1.153 !!!parse-error (type => 'in frameset',
6703     text => $token->{tag_name}, token => $token);
6704 wakaba 1.158 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6705 wakaba 1.79 !!!cp ('t322');
6706 wakaba 1.153 !!!parse-error (type => 'after frameset',
6707     text => $token->{tag_name}, token => $token);
6708 wakaba 1.158 } else { # "after after frameset"
6709     !!!cp ('t322.2');
6710     !!!parse-error (type => 'after after frameset',
6711     text => $token->{tag_name}, token => $token);
6712 wakaba 1.52 }
6713     ## Ignore the token
6714 wakaba 1.125 !!!nack ('t322.1');
6715 wakaba 1.52 !!!next-token;
6716 wakaba 1.126 next B;
6717 wakaba 1.52 }
6718 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
6719 wakaba 1.52 if ($token->{tag_name} eq 'frameset' and
6720 wakaba 1.54 $self->{insertion_mode} == IN_FRAMESET_IM) {
6721 wakaba 1.123 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
6722 wakaba 1.52 @{$self->{open_elements}} == 1) {
6723 wakaba 1.79 !!!cp ('t325');
6724 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6725     text => $token->{tag_name}, token => $token);
6726 wakaba 1.52 ## Ignore the token
6727     !!!next-token;
6728     } else {
6729 wakaba 1.79 !!!cp ('t326');
6730 wakaba 1.52 pop @{$self->{open_elements}};
6731     !!!next-token;
6732     }
6733 wakaba 1.47
6734 wakaba 1.52 if (not defined $self->{inner_html_node} and
6735 wakaba 1.123 not ($self->{open_elements}->[-1]->[1] & FRAMESET_EL)) {
6736 wakaba 1.79 !!!cp ('t327');
6737 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6738 wakaba 1.79 } else {
6739     !!!cp ('t328');
6740 wakaba 1.52 }
6741 wakaba 1.126 next B;
6742 wakaba 1.52 } elsif ($token->{tag_name} eq 'html' and
6743 wakaba 1.54 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
6744 wakaba 1.79 !!!cp ('t329');
6745 wakaba 1.54 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
6746 wakaba 1.52 !!!next-token;
6747 wakaba 1.126 next B;
6748 wakaba 1.52 } else {
6749 wakaba 1.54 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6750 wakaba 1.79 !!!cp ('t330');
6751 wakaba 1.153 !!!parse-error (type => 'in frameset:/',
6752     text => $token->{tag_name}, token => $token);
6753 wakaba 1.158 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6754     !!!cp ('t330.1');
6755     !!!parse-error (type => 'after frameset:/',
6756     text => $token->{tag_name}, token => $token);
6757     } else { # "after after frameset"
6758 wakaba 1.79 !!!cp ('t331');
6759 wakaba 1.158 !!!parse-error (type => 'after after frameset:/',
6760 wakaba 1.153 text => $token->{tag_name}, token => $token);
6761 wakaba 1.52 }
6762     ## Ignore the token
6763     !!!next-token;
6764 wakaba 1.126 next B;
6765 wakaba 1.52 }
6766 wakaba 1.104 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6767 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6768 wakaba 1.104 @{$self->{open_elements}} == 1) { # redundant, maybe
6769     !!!cp ('t331.1');
6770 wakaba 1.113 !!!parse-error (type => 'in body:#eof', token => $token);
6771 wakaba 1.104 } else {
6772     !!!cp ('t331.2');
6773     }
6774    
6775     ## Stop parsing
6776     last B;
6777 wakaba 1.52 } else {
6778     die "$0: $token->{type}: Unknown token type";
6779     }
6780 wakaba 1.47
6781 wakaba 1.52 ## ISSUE: An issue in spec here
6782     } else {
6783     die "$0: $self->{insertion_mode}: Unknown insertion mode";
6784     }
6785 wakaba 1.47
6786 wakaba 1.52 ## "in body" insertion mode
6787 wakaba 1.55 if ($token->{type} == START_TAG_TOKEN) {
6788 wakaba 1.52 if ($token->{tag_name} eq 'script') {
6789 wakaba 1.79 !!!cp ('t332');
6790 wakaba 1.52 ## NOTE: This is an "as if in head" code clone
6791 wakaba 1.100 $script_start_tag->();
6792 wakaba 1.126 next B;
6793 wakaba 1.52 } elsif ($token->{tag_name} eq 'style') {
6794 wakaba 1.79 !!!cp ('t333');
6795 wakaba 1.52 ## NOTE: This is an "as if in head" code clone
6796 wakaba 1.96 $parse_rcdata->(CDATA_CONTENT_MODEL);
6797 wakaba 1.126 next B;
6798 wakaba 1.52 } elsif ({
6799     base => 1, link => 1,
6800     }->{$token->{tag_name}}) {
6801 wakaba 1.79 !!!cp ('t334');
6802 wakaba 1.52 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6803 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6804 wakaba 1.52 pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6805 wakaba 1.125 !!!ack ('t334.1');
6806 wakaba 1.52 !!!next-token;
6807 wakaba 1.126 next B;
6808 wakaba 1.52 } elsif ($token->{tag_name} eq 'meta') {
6809     ## NOTE: This is an "as if in head" code clone, only "-t" differs
6810 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6811 wakaba 1.66 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
6812 wakaba 1.46
6813 wakaba 1.52 unless ($self->{confident}) {
6814 wakaba 1.134 if ($token->{attributes}->{charset}) {
6815 wakaba 1.79 !!!cp ('t335');
6816 wakaba 1.134 ## NOTE: Whether the encoding is supported or not is handled
6817     ## in the {change_encoding} callback.
6818 wakaba 1.63 $self->{change_encoding}
6819 wakaba 1.114 ->($self, $token->{attributes}->{charset}->{value}, $token);
6820 wakaba 1.66
6821     $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6822     ->set_user_data (manakai_has_reference =>
6823     $token->{attributes}->{charset}
6824     ->{has_reference});
6825 wakaba 1.63 } elsif ($token->{attributes}->{content}) {
6826     if ($token->{attributes}->{content}->{value}
6827 wakaba 1.144 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
6828 wakaba 1.189 [\x09\x0A\x0C\x0D\x20]*=
6829     [\x09\x0A\x0C\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
6830     ([^"'\x09\x0A\x0C\x0D\x20][^\x09\x0A\x0C\x0D\x20\x3B]*))
6831     /x) {
6832 wakaba 1.79 !!!cp ('t336');
6833 wakaba 1.134 ## NOTE: Whether the encoding is supported or not is handled
6834     ## in the {change_encoding} callback.
6835 wakaba 1.63 $self->{change_encoding}
6836 wakaba 1.114 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
6837 wakaba 1.68 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6838     ->set_user_data (manakai_has_reference =>
6839     $token->{attributes}->{content}
6840     ->{has_reference});
6841 wakaba 1.63 }
6842 wakaba 1.52 }
6843 wakaba 1.66 } else {
6844     if ($token->{attributes}->{charset}) {
6845 wakaba 1.79 !!!cp ('t337');
6846 wakaba 1.66 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6847     ->set_user_data (manakai_has_reference =>
6848     $token->{attributes}->{charset}
6849     ->{has_reference});
6850     }
6851 wakaba 1.68 if ($token->{attributes}->{content}) {
6852 wakaba 1.79 !!!cp ('t338');
6853 wakaba 1.68 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6854     ->set_user_data (manakai_has_reference =>
6855     $token->{attributes}->{content}
6856     ->{has_reference});
6857     }
6858 wakaba 1.52 }
6859 wakaba 1.1
6860 wakaba 1.125 !!!ack ('t338.1');
6861 wakaba 1.52 !!!next-token;
6862 wakaba 1.126 next B;
6863 wakaba 1.52 } elsif ($token->{tag_name} eq 'title') {
6864 wakaba 1.79 !!!cp ('t341');
6865 wakaba 1.52 ## NOTE: This is an "as if in head" code clone
6866 wakaba 1.96 $parse_rcdata->(RCDATA_CONTENT_MODEL);
6867 wakaba 1.126 next B;
6868 wakaba 1.52 } elsif ($token->{tag_name} eq 'body') {
6869 wakaba 1.153 !!!parse-error (type => 'in body', text => 'body', token => $token);
6870 wakaba 1.46
6871 wakaba 1.52 if (@{$self->{open_elements}} == 1 or
6872 wakaba 1.123 not ($self->{open_elements}->[1]->[1] & BODY_EL)) {
6873 wakaba 1.79 !!!cp ('t342');
6874 wakaba 1.52 ## Ignore the token
6875     } else {
6876     my $body_el = $self->{open_elements}->[1]->[0];
6877     for my $attr_name (keys %{$token->{attributes}}) {
6878     unless ($body_el->has_attribute_ns (undef, $attr_name)) {
6879 wakaba 1.79 !!!cp ('t343');
6880 wakaba 1.52 $body_el->set_attribute_ns
6881     (undef, [undef, $attr_name],
6882     $token->{attributes}->{$attr_name}->{value});
6883     }
6884     }
6885     }
6886 wakaba 1.125 !!!nack ('t343.1');
6887 wakaba 1.52 !!!next-token;
6888 wakaba 1.126 next B;
6889 wakaba 1.52 } elsif ({
6890     address => 1, blockquote => 1, center => 1, dir => 1,
6891 wakaba 1.85 div => 1, dl => 1, fieldset => 1,
6892     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
6893 wakaba 1.97 menu => 1, ol => 1, p => 1, ul => 1,
6894     pre => 1, listing => 1,
6895 wakaba 1.109 form => 1,
6896     table => 1,
6897     hr => 1,
6898 wakaba 1.52 }->{$token->{tag_name}}) {
6899 wakaba 1.109 if ($token->{tag_name} eq 'form' and defined $self->{form_element}) {
6900     !!!cp ('t350');
6901 wakaba 1.113 !!!parse-error (type => 'in form:form', token => $token);
6902 wakaba 1.109 ## Ignore the token
6903 wakaba 1.125 !!!nack ('t350.1');
6904 wakaba 1.109 !!!next-token;
6905 wakaba 1.126 next B;
6906 wakaba 1.109 }
6907    
6908 wakaba 1.52 ## has a p element in scope
6909     INSCOPE: for (reverse @{$self->{open_elements}}) {
6910 wakaba 1.123 if ($_->[1] & P_EL) {
6911 wakaba 1.79 !!!cp ('t344');
6912 wakaba 1.125 !!!back-token; # <form>
6913 wakaba 1.114 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6914     line => $token->{line}, column => $token->{column}};
6915 wakaba 1.126 next B;
6916 wakaba 1.123 } elsif ($_->[1] & SCOPING_EL) {
6917 wakaba 1.79 !!!cp ('t345');
6918 wakaba 1.52 last INSCOPE;
6919     }
6920     } # INSCOPE
6921    
6922 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6923 wakaba 1.97 if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') {
6924 wakaba 1.125 !!!nack ('t346.1');
6925 wakaba 1.52 !!!next-token;
6926 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
6927 wakaba 1.52 $token->{data} =~ s/^\x0A//;
6928     unless (length $token->{data}) {
6929 wakaba 1.79 !!!cp ('t346');
6930 wakaba 1.1 !!!next-token;
6931 wakaba 1.79 } else {
6932     !!!cp ('t349');
6933 wakaba 1.52 }
6934 wakaba 1.79 } else {
6935     !!!cp ('t348');
6936 wakaba 1.52 }
6937 wakaba 1.109 } elsif ($token->{tag_name} eq 'form') {
6938     !!!cp ('t347.1');
6939     $self->{form_element} = $self->{open_elements}->[-1]->[0];
6940    
6941 wakaba 1.125 !!!nack ('t347.2');
6942 wakaba 1.109 !!!next-token;
6943     } elsif ($token->{tag_name} eq 'table') {
6944     !!!cp ('t382');
6945     push @{$open_tables}, [$self->{open_elements}->[-1]->[0]];
6946    
6947     $self->{insertion_mode} = IN_TABLE_IM;
6948    
6949 wakaba 1.125 !!!nack ('t382.1');
6950 wakaba 1.109 !!!next-token;
6951     } elsif ($token->{tag_name} eq 'hr') {
6952     !!!cp ('t386');
6953     pop @{$self->{open_elements}};
6954    
6955 wakaba 1.125 !!!nack ('t386.1');
6956 wakaba 1.109 !!!next-token;
6957 wakaba 1.52 } else {
6958 wakaba 1.125 !!!nack ('t347.1');
6959 wakaba 1.52 !!!next-token;
6960     }
6961 wakaba 1.126 next B;
6962 wakaba 1.109 } elsif ({li => 1, dt => 1, dd => 1}->{$token->{tag_name}}) {
6963 wakaba 1.52 ## has a p element in scope
6964     INSCOPE: for (reverse @{$self->{open_elements}}) {
6965 wakaba 1.123 if ($_->[1] & P_EL) {
6966 wakaba 1.79 !!!cp ('t353');
6967 wakaba 1.125 !!!back-token; # <x>
6968 wakaba 1.114 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6969     line => $token->{line}, column => $token->{column}};
6970 wakaba 1.126 next B;
6971 wakaba 1.123 } elsif ($_->[1] & SCOPING_EL) {
6972 wakaba 1.79 !!!cp ('t354');
6973 wakaba 1.52 last INSCOPE;
6974     }
6975     } # INSCOPE
6976 wakaba 1.193
6977     ## NOTE: Special, Scope (<li><foo><li> == <li><foo><li/></foo></li>)
6978     ## Interpreted as <li><foo/></li><li/> (non-conforming)
6979     ## blockquote (O9.27), center (O), dd (Fx3, O, S3.1.2, IE7),
6980     ## dt (Fx, O, S, IE), dl (O), fieldset (O, S, IE), form (Fx, O, S),
6981     ## hn (O), pre (O), applet (O, S), button (O, S), marquee (Fx, O, S),
6982     ## object (Fx)
6983     ## Generate non-tree (non-conforming)
6984     ## basefont (IE7 (where basefont is non-void)), center (IE),
6985     ## form (IE), hn (IE)
6986     ## address, div, p (<li><foo><li> == <li><foo/></li><li/>)
6987     ## Interpreted as <li><foo><li/></foo></li> (non-conforming)
6988     ## div (Fx, S)
6989 wakaba 1.52
6990     ## Step 1
6991     my $i = -1;
6992     my $node = $self->{open_elements}->[$i];
6993 wakaba 1.109 my $li_or_dtdd = {li => {li => 1},
6994     dt => {dt => 1, dd => 1},
6995     dd => {dt => 1, dd => 1}}->{$token->{tag_name}};
6996 wakaba 1.52 LI: {
6997     ## Step 2
6998 wakaba 1.123 if ($li_or_dtdd->{$node->[0]->manakai_local_name}) {
6999 wakaba 1.52 if ($i != -1) {
7000 wakaba 1.79 !!!cp ('t355');
7001 wakaba 1.122 !!!parse-error (type => 'not closed',
7002 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
7003 wakaba 1.122 ->manakai_local_name,
7004     token => $token);
7005 wakaba 1.79 } else {
7006     !!!cp ('t356');
7007 wakaba 1.52 }
7008     splice @{$self->{open_elements}}, $i;
7009     last LI;
7010 wakaba 1.79 } else {
7011     !!!cp ('t357');
7012 wakaba 1.52 }
7013    
7014     ## Step 3
7015 wakaba 1.123 if (not ($node->[1] & FORMATTING_EL) and
7016 wakaba 1.52 #not $phrasing_category->{$node->[1]} and
7017 wakaba 1.123 ($node->[1] & SPECIAL_EL or
7018     $node->[1] & SCOPING_EL) and
7019     not ($node->[1] & ADDRESS_EL) and
7020     not ($node->[1] & DIV_EL)) {
7021 wakaba 1.79 !!!cp ('t358');
7022 wakaba 1.52 last LI;
7023     }
7024    
7025 wakaba 1.79 !!!cp ('t359');
7026 wakaba 1.52 ## Step 4
7027     $i--;
7028     $node = $self->{open_elements}->[$i];
7029     redo LI;
7030     } # LI
7031    
7032 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7033 wakaba 1.125 !!!nack ('t359.1');
7034 wakaba 1.52 !!!next-token;
7035 wakaba 1.126 next B;
7036 wakaba 1.52 } elsif ($token->{tag_name} eq 'plaintext') {
7037     ## has a p element in scope
7038     INSCOPE: for (reverse @{$self->{open_elements}}) {
7039 wakaba 1.123 if ($_->[1] & P_EL) {
7040 wakaba 1.79 !!!cp ('t367');
7041 wakaba 1.125 !!!back-token; # <plaintext>
7042 wakaba 1.114 $token = {type => END_TAG_TOKEN, tag_name => 'p',
7043     line => $token->{line}, column => $token->{column}};
7044 wakaba 1.126 next B;
7045 wakaba 1.123 } elsif ($_->[1] & SCOPING_EL) {
7046 wakaba 1.79 !!!cp ('t368');
7047 wakaba 1.52 last INSCOPE;
7048 wakaba 1.46 }
7049 wakaba 1.52 } # INSCOPE
7050    
7051 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7052 wakaba 1.52
7053     $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
7054    
7055 wakaba 1.125 !!!nack ('t368.1');
7056 wakaba 1.52 !!!next-token;
7057 wakaba 1.126 next B;
7058 wakaba 1.52 } elsif ($token->{tag_name} eq 'a') {
7059     AFE: for my $i (reverse 0..$#$active_formatting_elements) {
7060     my $node = $active_formatting_elements->[$i];
7061 wakaba 1.123 if ($node->[1] & A_EL) {
7062 wakaba 1.79 !!!cp ('t371');
7063 wakaba 1.113 !!!parse-error (type => 'in a:a', token => $token);
7064 wakaba 1.52
7065 wakaba 1.125 !!!back-token; # <a>
7066 wakaba 1.114 $token = {type => END_TAG_TOKEN, tag_name => 'a',
7067     line => $token->{line}, column => $token->{column}};
7068 wakaba 1.113 $formatting_end_tag->($token);
7069 wakaba 1.52
7070     AFE2: for (reverse 0..$#$active_formatting_elements) {
7071     if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
7072 wakaba 1.79 !!!cp ('t372');
7073 wakaba 1.52 splice @$active_formatting_elements, $_, 1;
7074     last AFE2;
7075 wakaba 1.1 }
7076 wakaba 1.52 } # AFE2
7077     OE: for (reverse 0..$#{$self->{open_elements}}) {
7078     if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
7079 wakaba 1.79 !!!cp ('t373');
7080 wakaba 1.52 splice @{$self->{open_elements}}, $_, 1;
7081     last OE;
7082 wakaba 1.1 }
7083 wakaba 1.52 } # OE
7084     last AFE;
7085     } elsif ($node->[0] eq '#marker') {
7086 wakaba 1.79 !!!cp ('t374');
7087 wakaba 1.52 last AFE;
7088     }
7089     } # AFE
7090    
7091     $reconstruct_active_formatting_elements->($insert_to_current);
7092 wakaba 1.1
7093 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7094 wakaba 1.52 push @$active_formatting_elements, $self->{open_elements}->[-1];
7095 wakaba 1.1
7096 wakaba 1.125 !!!nack ('t374.1');
7097 wakaba 1.52 !!!next-token;
7098 wakaba 1.126 next B;
7099 wakaba 1.52 } elsif ($token->{tag_name} eq 'nobr') {
7100     $reconstruct_active_formatting_elements->($insert_to_current);
7101 wakaba 1.1
7102 wakaba 1.52 ## has a |nobr| element in scope
7103     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7104     my $node = $self->{open_elements}->[$_];
7105 wakaba 1.123 if ($node->[1] & NOBR_EL) {
7106 wakaba 1.79 !!!cp ('t376');
7107 wakaba 1.113 !!!parse-error (type => 'in nobr:nobr', token => $token);
7108 wakaba 1.125 !!!back-token; # <nobr>
7109 wakaba 1.114 $token = {type => END_TAG_TOKEN, tag_name => 'nobr',
7110     line => $token->{line}, column => $token->{column}};
7111 wakaba 1.126 next B;
7112 wakaba 1.123 } elsif ($node->[1] & SCOPING_EL) {
7113 wakaba 1.79 !!!cp ('t377');
7114 wakaba 1.52 last INSCOPE;
7115     }
7116     } # INSCOPE
7117    
7118 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7119 wakaba 1.52 push @$active_formatting_elements, $self->{open_elements}->[-1];
7120    
7121 wakaba 1.125 !!!nack ('t377.1');
7122 wakaba 1.52 !!!next-token;
7123 wakaba 1.126 next B;
7124 wakaba 1.52 } elsif ($token->{tag_name} eq 'button') {
7125     ## has a button element in scope
7126     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7127     my $node = $self->{open_elements}->[$_];
7128 wakaba 1.123 if ($node->[1] & BUTTON_EL) {
7129 wakaba 1.79 !!!cp ('t378');
7130 wakaba 1.113 !!!parse-error (type => 'in button:button', token => $token);
7131 wakaba 1.125 !!!back-token; # <button>
7132 wakaba 1.114 $token = {type => END_TAG_TOKEN, tag_name => 'button',
7133     line => $token->{line}, column => $token->{column}};
7134 wakaba 1.126 next B;
7135 wakaba 1.123 } elsif ($node->[1] & SCOPING_EL) {
7136 wakaba 1.79 !!!cp ('t379');
7137 wakaba 1.52 last INSCOPE;
7138     }
7139     } # INSCOPE
7140    
7141     $reconstruct_active_formatting_elements->($insert_to_current);
7142    
7143 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7144 wakaba 1.85
7145     ## TODO: associate with $self->{form_element} if defined
7146    
7147 wakaba 1.52 push @$active_formatting_elements, ['#marker', ''];
7148 wakaba 1.1
7149 wakaba 1.125 !!!nack ('t379.1');
7150 wakaba 1.52 !!!next-token;
7151 wakaba 1.126 next B;
7152 wakaba 1.103 } elsif ({
7153 wakaba 1.109 xmp => 1,
7154     iframe => 1,
7155     noembed => 1,
7156 wakaba 1.148 noframes => 1, ## NOTE: This is an "as if in head" code clone.
7157 wakaba 1.109 noscript => 0, ## TODO: 1 if scripting is enabled
7158 wakaba 1.103 }->{$token->{tag_name}}) {
7159 wakaba 1.109 if ($token->{tag_name} eq 'xmp') {
7160     !!!cp ('t381');
7161     $reconstruct_active_formatting_elements->($insert_to_current);
7162     } else {
7163     !!!cp ('t399');
7164     }
7165     ## NOTE: There is an "as if in body" code clone.
7166 wakaba 1.96 $parse_rcdata->(CDATA_CONTENT_MODEL);
7167 wakaba 1.126 next B;
7168 wakaba 1.52 } elsif ($token->{tag_name} eq 'isindex') {
7169 wakaba 1.113 !!!parse-error (type => 'isindex', token => $token);
7170 wakaba 1.52
7171     if (defined $self->{form_element}) {
7172 wakaba 1.79 !!!cp ('t389');
7173 wakaba 1.52 ## Ignore the token
7174 wakaba 1.125 !!!nack ('t389'); ## NOTE: Not acknowledged.
7175 wakaba 1.52 !!!next-token;
7176 wakaba 1.126 next B;
7177 wakaba 1.52 } else {
7178 wakaba 1.147 !!!ack ('t391.1');
7179    
7180 wakaba 1.52 my $at = $token->{attributes};
7181     my $form_attrs;
7182     $form_attrs->{action} = $at->{action} if $at->{action};
7183     my $prompt_attr = $at->{prompt};
7184     $at->{name} = {name => 'name', value => 'isindex'};
7185     delete $at->{action};
7186     delete $at->{prompt};
7187     my @tokens = (
7188 wakaba 1.55 {type => START_TAG_TOKEN, tag_name => 'form',
7189 wakaba 1.114 attributes => $form_attrs,
7190     line => $token->{line}, column => $token->{column}},
7191     {type => START_TAG_TOKEN, tag_name => 'hr',
7192     line => $token->{line}, column => $token->{column}},
7193     {type => START_TAG_TOKEN, tag_name => 'p',
7194     line => $token->{line}, column => $token->{column}},
7195     {type => START_TAG_TOKEN, tag_name => 'label',
7196     line => $token->{line}, column => $token->{column}},
7197 wakaba 1.52 );
7198     if ($prompt_attr) {
7199 wakaba 1.79 !!!cp ('t390');
7200 wakaba 1.114 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value},
7201 wakaba 1.118 #line => $token->{line}, column => $token->{column},
7202     };
7203 wakaba 1.1 } else {
7204 wakaba 1.79 !!!cp ('t391');
7205 wakaba 1.55 push @tokens, {type => CHARACTER_TOKEN,
7206 wakaba 1.114 data => 'This is a searchable index. Insert your search keywords here: ',
7207 wakaba 1.118 #line => $token->{line}, column => $token->{column},
7208     }; # SHOULD
7209 wakaba 1.52 ## TODO: make this configurable
7210 wakaba 1.1 }
7211 wakaba 1.52 push @tokens,
7212 wakaba 1.114 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at,
7213     line => $token->{line}, column => $token->{column}},
7214 wakaba 1.55 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
7215 wakaba 1.114 {type => END_TAG_TOKEN, tag_name => 'label',
7216     line => $token->{line}, column => $token->{column}},
7217     {type => END_TAG_TOKEN, tag_name => 'p',
7218     line => $token->{line}, column => $token->{column}},
7219     {type => START_TAG_TOKEN, tag_name => 'hr',
7220     line => $token->{line}, column => $token->{column}},
7221     {type => END_TAG_TOKEN, tag_name => 'form',
7222     line => $token->{line}, column => $token->{column}};
7223 wakaba 1.52 !!!back-token (@tokens);
7224 wakaba 1.125 !!!next-token;
7225 wakaba 1.126 next B;
7226 wakaba 1.52 }
7227     } elsif ($token->{tag_name} eq 'textarea') {
7228     my $tag_name = $token->{tag_name};
7229     my $el;
7230 wakaba 1.126 !!!create-element ($el, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
7231 wakaba 1.52
7232     ## TODO: $self->{form_element} if defined
7233     $self->{content_model} = RCDATA_CONTENT_MODEL;
7234     delete $self->{escape}; # MUST
7235    
7236     $insert->($el);
7237    
7238     my $text = '';
7239 wakaba 1.125 !!!nack ('t392.1');
7240 wakaba 1.52 !!!next-token;
7241 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
7242 wakaba 1.52 $token->{data} =~ s/^\x0A//;
7243 wakaba 1.51 unless (length $token->{data}) {
7244 wakaba 1.79 !!!cp ('t392');
7245 wakaba 1.51 !!!next-token;
7246 wakaba 1.79 } else {
7247     !!!cp ('t393');
7248 wakaba 1.51 }
7249 wakaba 1.79 } else {
7250     !!!cp ('t394');
7251 wakaba 1.51 }
7252 wakaba 1.55 while ($token->{type} == CHARACTER_TOKEN) {
7253 wakaba 1.79 !!!cp ('t395');
7254 wakaba 1.52 $text .= $token->{data};
7255     !!!next-token;
7256     }
7257     if (length $text) {
7258 wakaba 1.79 !!!cp ('t396');
7259 wakaba 1.52 $el->manakai_append_text ($text);
7260     }
7261    
7262     $self->{content_model} = PCDATA_CONTENT_MODEL;
7263 wakaba 1.51
7264 wakaba 1.55 if ($token->{type} == END_TAG_TOKEN and
7265 wakaba 1.52 $token->{tag_name} eq $tag_name) {
7266 wakaba 1.79 !!!cp ('t397');
7267 wakaba 1.52 ## Ignore the token
7268     } else {
7269 wakaba 1.79 !!!cp ('t398');
7270 wakaba 1.153 !!!parse-error (type => 'in RCDATA:#eof', token => $token);
7271 wakaba 1.51 }
7272 wakaba 1.52 !!!next-token;
7273 wakaba 1.126 next B;
7274 wakaba 1.151 } elsif ($token->{tag_name} eq 'rt' or
7275     $token->{tag_name} eq 'rp') {
7276     ## has a |ruby| element in scope
7277     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7278     my $node = $self->{open_elements}->[$_];
7279     if ($node->[1] & RUBY_EL) {
7280     !!!cp ('t398.1');
7281     ## generate implied end tags
7282     while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7283     !!!cp ('t398.2');
7284     pop @{$self->{open_elements}};
7285     }
7286     unless ($self->{open_elements}->[-1]->[1] & RUBY_EL) {
7287     !!!cp ('t398.3');
7288     !!!parse-error (type => 'not closed',
7289 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
7290 wakaba 1.151 ->manakai_local_name,
7291     token => $token);
7292     pop @{$self->{open_elements}}
7293     while not $self->{open_elements}->[-1]->[1] & RUBY_EL;
7294     }
7295     last INSCOPE;
7296     } elsif ($node->[1] & SCOPING_EL) {
7297     !!!cp ('t398.4');
7298     last INSCOPE;
7299     }
7300     } # INSCOPE
7301    
7302     !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7303    
7304     !!!nack ('t398.5');
7305     !!!next-token;
7306     redo B;
7307 wakaba 1.126 } elsif ($token->{tag_name} eq 'math' or
7308     $token->{tag_name} eq 'svg') {
7309     $reconstruct_active_formatting_elements->($insert_to_current);
7310 wakaba 1.131
7311 wakaba 1.155 ## "Adjust MathML attributes" ('math' only) - done in insert-element-f
7312    
7313 wakaba 1.131 ## "adjust SVG attributes" ('svg' only) - done in insert-element-f
7314    
7315     ## "adjust foreign attributes" - done in insert-element-f
7316 wakaba 1.126
7317 wakaba 1.131 !!!insert-element-f ($token->{tag_name} eq 'math' ? $MML_NS : $SVG_NS, $token->{tag_name}, $token->{attributes}, $token);
7318 wakaba 1.126
7319     if ($self->{self_closing}) {
7320     pop @{$self->{open_elements}};
7321     !!!ack ('t398.1');
7322     } else {
7323     !!!cp ('t398.2');
7324     $self->{insertion_mode} |= IN_FOREIGN_CONTENT_IM;
7325     ## NOTE: |<body><math><mi><svg>| -> "in foreign content" insertion
7326     ## mode, "in body" (not "in foreign content") secondary insertion
7327     ## mode, maybe.
7328     }
7329    
7330     !!!next-token;
7331     next B;
7332 wakaba 1.52 } elsif ({
7333     caption => 1, col => 1, colgroup => 1, frame => 1,
7334     frameset => 1, head => 1, option => 1, optgroup => 1,
7335     tbody => 1, td => 1, tfoot => 1, th => 1,
7336     thead => 1, tr => 1,
7337     }->{$token->{tag_name}}) {
7338 wakaba 1.79 !!!cp ('t401');
7339 wakaba 1.153 !!!parse-error (type => 'in body',
7340     text => $token->{tag_name}, token => $token);
7341 wakaba 1.52 ## Ignore the token
7342 wakaba 1.125 !!!nack ('t401.1'); ## NOTE: |<col/>| or |<frame/>| here is an error.
7343 wakaba 1.52 !!!next-token;
7344 wakaba 1.126 next B;
7345 wakaba 1.52
7346     ## ISSUE: An issue on HTML5 new elements in the spec.
7347     } else {
7348 wakaba 1.110 if ($token->{tag_name} eq 'image') {
7349     !!!cp ('t384');
7350 wakaba 1.113 !!!parse-error (type => 'image', token => $token);
7351 wakaba 1.110 $token->{tag_name} = 'img';
7352     } else {
7353     !!!cp ('t385');
7354     }
7355    
7356     ## NOTE: There is an "as if <br>" code clone.
7357 wakaba 1.52 $reconstruct_active_formatting_elements->($insert_to_current);
7358    
7359 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7360 wakaba 1.109
7361 wakaba 1.110 if ({
7362     applet => 1, marquee => 1, object => 1,
7363     }->{$token->{tag_name}}) {
7364     !!!cp ('t380');
7365     push @$active_formatting_elements, ['#marker', ''];
7366 wakaba 1.125 !!!nack ('t380.1');
7367 wakaba 1.110 } elsif ({
7368     b => 1, big => 1, em => 1, font => 1, i => 1,
7369 wakaba 1.193 s => 1, small => 1, strike => 1,
7370 wakaba 1.110 strong => 1, tt => 1, u => 1,
7371     }->{$token->{tag_name}}) {
7372     !!!cp ('t375');
7373     push @$active_formatting_elements, $self->{open_elements}->[-1];
7374 wakaba 1.125 !!!nack ('t375.1');
7375 wakaba 1.110 } elsif ($token->{tag_name} eq 'input') {
7376     !!!cp ('t388');
7377     ## TODO: associate with $self->{form_element} if defined
7378     pop @{$self->{open_elements}};
7379 wakaba 1.125 !!!ack ('t388.2');
7380 wakaba 1.110 } elsif ({
7381     area => 1, basefont => 1, bgsound => 1, br => 1,
7382     embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
7383     #image => 1,
7384     }->{$token->{tag_name}}) {
7385     !!!cp ('t388.1');
7386     pop @{$self->{open_elements}};
7387 wakaba 1.125 !!!ack ('t388.3');
7388 wakaba 1.110 } elsif ($token->{tag_name} eq 'select') {
7389 wakaba 1.109 ## TODO: associate with $self->{form_element} if defined
7390    
7391     if ($self->{insertion_mode} & TABLE_IMS or
7392     $self->{insertion_mode} & BODY_TABLE_IMS or
7393     $self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
7394     !!!cp ('t400.1');
7395     $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM;
7396     } else {
7397     !!!cp ('t400.2');
7398     $self->{insertion_mode} = IN_SELECT_IM;
7399     }
7400 wakaba 1.125 !!!nack ('t400.3');
7401 wakaba 1.110 } else {
7402 wakaba 1.125 !!!nack ('t402');
7403 wakaba 1.109 }
7404 wakaba 1.51
7405 wakaba 1.52 !!!next-token;
7406 wakaba 1.126 next B;
7407 wakaba 1.52 }
7408 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
7409 wakaba 1.52 if ($token->{tag_name} eq 'body') {
7410 wakaba 1.107 ## has a |body| element in scope
7411     my $i;
7412 wakaba 1.111 INSCOPE: {
7413     for (reverse @{$self->{open_elements}}) {
7414 wakaba 1.123 if ($_->[1] & BODY_EL) {
7415 wakaba 1.111 !!!cp ('t405');
7416     $i = $_;
7417     last INSCOPE;
7418 wakaba 1.123 } elsif ($_->[1] & SCOPING_EL) {
7419 wakaba 1.111 !!!cp ('t405.1');
7420     last;
7421     }
7422 wakaba 1.52 }
7423 wakaba 1.111
7424     !!!parse-error (type => 'start tag not allowed',
7425 wakaba 1.153 text => $token->{tag_name}, token => $token);
7426 wakaba 1.107 ## NOTE: Ignore the token.
7427 wakaba 1.52 !!!next-token;
7428 wakaba 1.126 next B;
7429 wakaba 1.111 } # INSCOPE
7430 wakaba 1.107
7431     for (@{$self->{open_elements}}) {
7432 wakaba 1.123 unless ($_->[1] & ALL_END_TAG_OPTIONAL_EL) {
7433 wakaba 1.107 !!!cp ('t403');
7434 wakaba 1.122 !!!parse-error (type => 'not closed',
7435 wakaba 1.153 text => $_->[0]->manakai_local_name,
7436 wakaba 1.122 token => $token);
7437 wakaba 1.107 last;
7438     } else {
7439     !!!cp ('t404');
7440     }
7441     }
7442    
7443     $self->{insertion_mode} = AFTER_BODY_IM;
7444     !!!next-token;
7445 wakaba 1.126 next B;
7446 wakaba 1.52 } elsif ($token->{tag_name} eq 'html') {
7447 wakaba 1.122 ## TODO: Update this code. It seems that the code below is not
7448     ## up-to-date, though it has same effect as speced.
7449 wakaba 1.123 if (@{$self->{open_elements}} > 1 and
7450     $self->{open_elements}->[1]->[1] & BODY_EL) {
7451 wakaba 1.52 ## ISSUE: There is an issue in the spec.
7452 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & BODY_EL) {
7453 wakaba 1.79 !!!cp ('t406');
7454 wakaba 1.122 !!!parse-error (type => 'not closed',
7455 wakaba 1.153 text => $self->{open_elements}->[1]->[0]
7456 wakaba 1.122 ->manakai_local_name,
7457     token => $token);
7458 wakaba 1.79 } else {
7459     !!!cp ('t407');
7460 wakaba 1.1 }
7461 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
7462 wakaba 1.52 ## reprocess
7463 wakaba 1.126 next B;
7464 wakaba 1.51 } else {
7465 wakaba 1.79 !!!cp ('t408');
7466 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7467     text => $token->{tag_name}, token => $token);
7468 wakaba 1.52 ## Ignore the token
7469     !!!next-token;
7470 wakaba 1.126 next B;
7471 wakaba 1.51 }
7472 wakaba 1.52 } elsif ({
7473     address => 1, blockquote => 1, center => 1, dir => 1,
7474     div => 1, dl => 1, fieldset => 1, listing => 1,
7475     menu => 1, ol => 1, pre => 1, ul => 1,
7476     dd => 1, dt => 1, li => 1,
7477 wakaba 1.103 applet => 1, button => 1, marquee => 1, object => 1,
7478 wakaba 1.52 }->{$token->{tag_name}}) {
7479     ## has an element in scope
7480     my $i;
7481     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7482     my $node = $self->{open_elements}->[$_];
7483 wakaba 1.123 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7484 wakaba 1.79 !!!cp ('t410');
7485 wakaba 1.52 $i = $_;
7486 wakaba 1.87 last INSCOPE;
7487 wakaba 1.123 } elsif ($node->[1] & SCOPING_EL) {
7488 wakaba 1.79 !!!cp ('t411');
7489 wakaba 1.52 last INSCOPE;
7490 wakaba 1.51 }
7491 wakaba 1.52 } # INSCOPE
7492 wakaba 1.89
7493     unless (defined $i) { # has an element in scope
7494     !!!cp ('t413');
7495 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7496     text => $token->{tag_name}, token => $token);
7497 wakaba 1.157 ## NOTE: Ignore the token.
7498 wakaba 1.89 } else {
7499     ## Step 1. generate implied end tags
7500     while ({
7501 wakaba 1.151 ## END_TAG_OPTIONAL_EL
7502 wakaba 1.89 dd => ($token->{tag_name} ne 'dd'),
7503     dt => ($token->{tag_name} ne 'dt'),
7504     li => ($token->{tag_name} ne 'li'),
7505     p => 1,
7506 wakaba 1.151 rt => 1,
7507     rp => 1,
7508 wakaba 1.123 }->{$self->{open_elements}->[-1]->[0]->manakai_local_name}) {
7509 wakaba 1.89 !!!cp ('t409');
7510     pop @{$self->{open_elements}};
7511     }
7512    
7513     ## Step 2.
7514 wakaba 1.123 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7515     ne $token->{tag_name}) {
7516 wakaba 1.79 !!!cp ('t412');
7517 wakaba 1.122 !!!parse-error (type => 'not closed',
7518 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
7519 wakaba 1.122 ->manakai_local_name,
7520     token => $token);
7521 wakaba 1.51 } else {
7522 wakaba 1.89 !!!cp ('t414');
7523 wakaba 1.51 }
7524 wakaba 1.89
7525     ## Step 3.
7526 wakaba 1.52 splice @{$self->{open_elements}}, $i;
7527 wakaba 1.89
7528     ## Step 4.
7529     $clear_up_to_marker->()
7530     if {
7531 wakaba 1.103 applet => 1, button => 1, marquee => 1, object => 1,
7532 wakaba 1.89 }->{$token->{tag_name}};
7533 wakaba 1.51 }
7534 wakaba 1.52 !!!next-token;
7535 wakaba 1.126 next B;
7536 wakaba 1.52 } elsif ($token->{tag_name} eq 'form') {
7537 wakaba 1.92 undef $self->{form_element};
7538    
7539 wakaba 1.52 ## has an element in scope
7540 wakaba 1.92 my $i;
7541 wakaba 1.52 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7542     my $node = $self->{open_elements}->[$_];
7543 wakaba 1.123 if ($node->[1] & FORM_EL) {
7544 wakaba 1.79 !!!cp ('t418');
7545 wakaba 1.92 $i = $_;
7546 wakaba 1.52 last INSCOPE;
7547 wakaba 1.123 } elsif ($node->[1] & SCOPING_EL) {
7548 wakaba 1.79 !!!cp ('t419');
7549 wakaba 1.52 last INSCOPE;
7550     }
7551     } # INSCOPE
7552 wakaba 1.92
7553     unless (defined $i) { # has an element in scope
7554 wakaba 1.79 !!!cp ('t421');
7555 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7556     text => $token->{tag_name}, token => $token);
7557 wakaba 1.157 ## NOTE: Ignore the token.
7558 wakaba 1.92 } else {
7559     ## Step 1. generate implied end tags
7560 wakaba 1.123 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7561 wakaba 1.92 !!!cp ('t417');
7562     pop @{$self->{open_elements}};
7563     }
7564    
7565     ## Step 2.
7566 wakaba 1.123 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7567     ne $token->{tag_name}) {
7568 wakaba 1.92 !!!cp ('t417.1');
7569 wakaba 1.122 !!!parse-error (type => 'not closed',
7570 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
7571 wakaba 1.122 ->manakai_local_name,
7572     token => $token);
7573 wakaba 1.92 } else {
7574     !!!cp ('t420');
7575     }
7576    
7577     ## Step 3.
7578     splice @{$self->{open_elements}}, $i;
7579 wakaba 1.52 }
7580    
7581     !!!next-token;
7582 wakaba 1.126 next B;
7583 wakaba 1.52 } elsif ({
7584     h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
7585     }->{$token->{tag_name}}) {
7586     ## has an element in scope
7587     my $i;
7588     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7589     my $node = $self->{open_elements}->[$_];
7590 wakaba 1.123 if ($node->[1] & HEADING_EL) {
7591 wakaba 1.79 !!!cp ('t423');
7592 wakaba 1.52 $i = $_;
7593     last INSCOPE;
7594 wakaba 1.123 } elsif ($node->[1] & SCOPING_EL) {
7595 wakaba 1.79 !!!cp ('t424');
7596 wakaba 1.52 last INSCOPE;
7597 wakaba 1.51 }
7598 wakaba 1.52 } # INSCOPE
7599 wakaba 1.93
7600     unless (defined $i) { # has an element in scope
7601     !!!cp ('t425.1');
7602 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7603     text => $token->{tag_name}, token => $token);
7604 wakaba 1.157 ## NOTE: Ignore the token.
7605 wakaba 1.79 } else {
7606 wakaba 1.93 ## Step 1. generate implied end tags
7607 wakaba 1.123 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7608 wakaba 1.93 !!!cp ('t422');
7609     pop @{$self->{open_elements}};
7610     }
7611    
7612     ## Step 2.
7613 wakaba 1.123 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7614     ne $token->{tag_name}) {
7615 wakaba 1.93 !!!cp ('t425');
7616 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7617     text => $token->{tag_name}, token => $token);
7618 wakaba 1.93 } else {
7619     !!!cp ('t426');
7620     }
7621    
7622     ## Step 3.
7623     splice @{$self->{open_elements}}, $i;
7624 wakaba 1.36 }
7625 wakaba 1.52
7626     !!!next-token;
7627 wakaba 1.126 next B;
7628 wakaba 1.87 } elsif ($token->{tag_name} eq 'p') {
7629     ## has an element in scope
7630     my $i;
7631     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7632     my $node = $self->{open_elements}->[$_];
7633 wakaba 1.123 if ($node->[1] & P_EL) {
7634 wakaba 1.87 !!!cp ('t410.1');
7635     $i = $_;
7636 wakaba 1.88 last INSCOPE;
7637 wakaba 1.123 } elsif ($node->[1] & SCOPING_EL) {
7638 wakaba 1.87 !!!cp ('t411.1');
7639     last INSCOPE;
7640     }
7641     } # INSCOPE
7642 wakaba 1.91
7643     if (defined $i) {
7644 wakaba 1.123 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7645     ne $token->{tag_name}) {
7646 wakaba 1.87 !!!cp ('t412.1');
7647 wakaba 1.122 !!!parse-error (type => 'not closed',
7648 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
7649 wakaba 1.122 ->manakai_local_name,
7650     token => $token);
7651 wakaba 1.87 } else {
7652 wakaba 1.91 !!!cp ('t414.1');
7653 wakaba 1.87 }
7654 wakaba 1.91
7655 wakaba 1.87 splice @{$self->{open_elements}}, $i;
7656     } else {
7657 wakaba 1.91 !!!cp ('t413.1');
7658 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7659     text => $token->{tag_name}, token => $token);
7660 wakaba 1.91
7661 wakaba 1.87 !!!cp ('t415.1');
7662     ## As if <p>, then reprocess the current token
7663     my $el;
7664 wakaba 1.126 !!!create-element ($el, $HTML_NS, 'p',, $token);
7665 wakaba 1.87 $insert->($el);
7666 wakaba 1.91 ## NOTE: Not inserted into |$self->{open_elements}|.
7667 wakaba 1.87 }
7668 wakaba 1.91
7669 wakaba 1.87 !!!next-token;
7670 wakaba 1.126 next B;
7671 wakaba 1.52 } elsif ({
7672     a => 1,
7673     b => 1, big => 1, em => 1, font => 1, i => 1,
7674 wakaba 1.193 nobr => 1, s => 1, small => 1, strike => 1,
7675 wakaba 1.52 strong => 1, tt => 1, u => 1,
7676     }->{$token->{tag_name}}) {
7677 wakaba 1.79 !!!cp ('t427');
7678 wakaba 1.113 $formatting_end_tag->($token);
7679 wakaba 1.126 next B;
7680 wakaba 1.52 } elsif ($token->{tag_name} eq 'br') {
7681 wakaba 1.79 !!!cp ('t428');
7682 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7683     text => 'br', token => $token);
7684 wakaba 1.52
7685     ## As if <br>
7686     $reconstruct_active_formatting_elements->($insert_to_current);
7687    
7688     my $el;
7689 wakaba 1.126 !!!create-element ($el, $HTML_NS, 'br',, $token);
7690 wakaba 1.52 $insert->($el);
7691    
7692     ## Ignore the token.
7693     !!!next-token;
7694 wakaba 1.126 next B;
7695 wakaba 1.52 } elsif ({
7696     caption => 1, col => 1, colgroup => 1, frame => 1,
7697     frameset => 1, head => 1, option => 1, optgroup => 1,
7698     tbody => 1, td => 1, tfoot => 1, th => 1,
7699     thead => 1, tr => 1,
7700     area => 1, basefont => 1, bgsound => 1,
7701     embed => 1, hr => 1, iframe => 1, image => 1,
7702     img => 1, input => 1, isindex => 1, noembed => 1,
7703     noframes => 1, param => 1, select => 1, spacer => 1,
7704     table => 1, textarea => 1, wbr => 1,
7705     noscript => 0, ## TODO: if scripting is enabled
7706     }->{$token->{tag_name}}) {
7707 wakaba 1.79 !!!cp ('t429');
7708 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7709     text => $token->{tag_name}, token => $token);
7710 wakaba 1.52 ## Ignore the token
7711     !!!next-token;
7712 wakaba 1.126 next B;
7713 wakaba 1.52
7714     ## ISSUE: Issue on HTML5 new elements in spec
7715    
7716     } else {
7717     ## Step 1
7718     my $node_i = -1;
7719     my $node = $self->{open_elements}->[$node_i];
7720 wakaba 1.51
7721 wakaba 1.52 ## Step 2
7722     S2: {
7723 wakaba 1.123 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7724 wakaba 1.52 ## Step 1
7725     ## generate implied end tags
7726 wakaba 1.123 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7727 wakaba 1.79 !!!cp ('t430');
7728 wakaba 1.151 ## NOTE: |<ruby><rt></ruby>|.
7729     ## ISSUE: <ruby><rt></rt> will also take this code path,
7730     ## which seems wrong.
7731 wakaba 1.86 pop @{$self->{open_elements}};
7732 wakaba 1.151 $node_i++;
7733 wakaba 1.52 }
7734    
7735     ## Step 2
7736 wakaba 1.123 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7737     ne $token->{tag_name}) {
7738 wakaba 1.79 !!!cp ('t431');
7739 wakaba 1.58 ## NOTE: <x><y></x>
7740 wakaba 1.122 !!!parse-error (type => 'not closed',
7741 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
7742 wakaba 1.122 ->manakai_local_name,
7743     token => $token);
7744 wakaba 1.79 } else {
7745     !!!cp ('t432');
7746 wakaba 1.52 }
7747    
7748     ## Step 3
7749 wakaba 1.151 splice @{$self->{open_elements}}, $node_i if $node_i < 0;
7750 wakaba 1.51
7751 wakaba 1.1 !!!next-token;
7752 wakaba 1.52 last S2;
7753 wakaba 1.1 } else {
7754 wakaba 1.52 ## Step 3
7755 wakaba 1.123 if (not ($node->[1] & FORMATTING_EL) and
7756 wakaba 1.52 #not $phrasing_category->{$node->[1]} and
7757 wakaba 1.123 ($node->[1] & SPECIAL_EL or
7758     $node->[1] & SCOPING_EL)) {
7759 wakaba 1.79 !!!cp ('t433');
7760 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7761     text => $token->{tag_name}, token => $token);
7762 wakaba 1.52 ## Ignore the token
7763     !!!next-token;
7764     last S2;
7765 wakaba 1.193
7766     ## NOTE: |<span><dd></span>a|: In Safari 3.1.2 and Opera
7767     ## 9.27, "a" is a child of <dd> (conforming). In
7768     ## Firefox 3.0.2, "a" is a child of <body>. In WinIE 7,
7769     ## "a" is a child of both <body> and <dd>.
7770 wakaba 1.52 }
7771 wakaba 1.193
7772 wakaba 1.79 !!!cp ('t434');
7773 wakaba 1.1 }
7774 wakaba 1.52
7775     ## Step 4
7776     $node_i--;
7777     $node = $self->{open_elements}->[$node_i];
7778    
7779     ## Step 5;
7780     redo S2;
7781     } # S2
7782 wakaba 1.126 next B;
7783 wakaba 1.1 }
7784     }
7785 wakaba 1.126 next B;
7786     } continue { # B
7787     if ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
7788     ## NOTE: The code below is executed in cases where it does not have
7789     ## to be, but it is harmless even in those cases.
7790     ## has an element in scope
7791     INSCOPE: {
7792     for (reverse 0..$#{$self->{open_elements}}) {
7793     my $node = $self->{open_elements}->[$_];
7794     if ($node->[1] & FOREIGN_EL) {
7795     last INSCOPE;
7796     } elsif ($node->[1] & SCOPING_EL) {
7797     last;
7798     }
7799     }
7800    
7801     ## NOTE: No foreign element in scope.
7802     $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
7803     } # INSCOPE
7804     }
7805 wakaba 1.1 } # B
7806    
7807     ## Stop parsing # MUST
7808    
7809     ## TODO: script stuffs
7810 wakaba 1.3 } # _tree_construct_main
7811    
## set_inner_html ($class, $node, $string, $onerror, $get_wrapper)
##
## Replaces the children of |$node| with the result of parsing
## |$string| as HTML.
##
##   $node        - A node whose |node_type| is 9 (document) or 1
##                  (element).  Any other node type makes the method
##                  |die|.
##   $string      - The character string to parse.  It is accessed
##                  through |$_[0]| so that no copy is made.
##   $onerror     - Optional code reference invoked with named
##                  arguments (|type|, |line|, |column|, ...) for each
##                  parse error.  For the element case, errors are
##                  |warn|ed if it is omitted.
##   $get_wrapper - Optional code reference that receives the input
##                  handle and returns the handle actually read from.
##                  Defaults to the identity function.
##
## For a document node the children are removed and the string is
## handed to |parse_char_string|.  For an element node the string is
## parsed by a new parser instance into a scratch document whose root
## is an |html| element, and the resulting children of that root are
## then adopted into the element's owner document and appended to
## |$node|.
sub set_inner_html ($$$$;$) {
  my $class = shift;
  my $node = shift;
  #my $s = \$_[0];
  my $onerror = $_[1];
  my $get_wrapper = $_[2] || sub ($) { return $_[0] };

  ## ISSUE: Should {confident} be true?

  my $nt = $node->node_type;
  if ($nt == 9) {
    # MUST
    
    ## Step 1 # MUST
    ## TODO: If the document has an active parser, ...
    ## ISSUE: There is an issue in the spec.
    
    ## Step 2 # MUST
    ## NOTE: The child list is copied first, such that removing nodes
    ## does not disturb the iteration.
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }

    ## Step 3, 4, 5 # MUST
    $class->parse_char_string ($_[0] => $node, $onerror, $get_wrapper);
  } elsif ($nt == 1) {
    ## TODO: If non-html element

    ## NOTE: Most of this code is copied from |parse_string|

    ## TODO: Support for $get_wrapper

    ## Step 1 # MUST
    my $this_doc = $node->owner_document;
    my $doc = $this_doc->implementation->create_document;
    $doc->manakai_is_html (1);
    my $p = $class->new;
    $p->{document} = $doc;

    ## Step 8 # MUST
    $p->{line_prev} = $p->{line} = 1;
    $p->{column_prev} = $p->{column} = 0;
    require Whatpm::Charset::DecodeHandle;
    my $input = Whatpm::Charset::DecodeHandle::CharString->new (\($_[0]));
    $input = $get_wrapper->($input);

    ## Reads the next input character and stores its code point in
    ## |$self->{nc}| (-1 at the end of the input).  CR and CRLF are
    ## reported as a single LF; U+0000 is reported as U+FFFD with a
    ## parse error.  Line and column counters are updated.
    ## NOTE: |$self| is the invocant given by the caller of this
    ## closure; it is presumably the same object as |$p|, as both are
    ## used for the position bookkeeping below.
    $p->{set_nc} = sub {
      my $self = shift;

      my $char = '';
      if (defined $self->{next_nc}) {
        ## A character has been read ahead while handling a CR.
        $char = $self->{next_nc};
        delete $self->{next_nc};
        $self->{nc} = ord $char;
      } else {
        $self->{char_buffer} = '';
        $self->{char_buffer_pos} = 0;
        
        ## Try to fetch a run of characters that need no special
        ## handling (i.e. anything but U+0000, LF, and CR).
        my $count = $input->manakai_read_until
            ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/,
             $self->{char_buffer_pos});
        if ($count) {
          $self->{line_prev} = $self->{line};
          $self->{column_prev} = $self->{column};
          $self->{column}++;
          $self->{nc}
              = ord substr ($self->{char_buffer},
                            $self->{char_buffer_pos}++, 1);
          return;
        }
        
        if ($input->read ($char, 1)) {
          $self->{nc} = ord $char;
        } else {
          ## End of the input.
          $self->{nc} = -1;
          return;
        }
      }

      ($p->{line_prev}, $p->{column_prev}) = ($p->{line}, $p->{column});
      $p->{column}++;

      if ($self->{nc} == 0x000A) { # LF
        $p->{line}++;
        $p->{column} = 0;
        !!!cp ('i1');
      } elsif ($self->{nc} == 0x000D) { # CR
        ## TODO: support for abort/streaming
        ## Peek the next character: an LF following the CR is dropped,
        ## anything else is kept for the next invocation.
        my $next = '';
        if ($input->read ($next, 1) and $next ne "\x0A") {
          $self->{next_nc} = $next;
        }
        $self->{nc} = 0x000A; # LF # MUST
        $p->{line}++;
        $p->{column} = 0;
        !!!cp ('i2');
      } elsif ($self->{nc} == 0x0000) { # NULL
        !!!cp ('i4');
        !!!parse-error (type => 'NULL');
        $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
      }
    };

    ## Bulk reader: stores into |$_[0]|, at offset |$_[2]|, a run of
    ## characters that are neither in the character class body |$_[1]|
    ## nor U+0000/LF/CR, and returns the number of characters stored
    ## (0 if none).  Characters left in |$p->{char_buffer}| are
    ## consumed before the input handle is read.
    $p->{read_until} = sub {
      #my ($scalar, $specials_range, $offset) = @_;
      ## A read-ahead character is pending; it has to be processed by
      ## |set_nc| first.
      return 0 if defined $p->{next_nc};

      my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
      my $offset = $_[2] || 0;

      if ($p->{char_buffer_pos} < length $p->{char_buffer}) {
        pos ($p->{char_buffer}) = $p->{char_buffer_pos};
        if ($p->{char_buffer} =~ /\G(?>$pattern)+/) {
          substr ($_[0], $offset)
              = substr ($p->{char_buffer}, $-[0], $+[0] - $-[0]);
          my $count = $+[0] - $-[0];
          if ($count) {
            $p->{column} += $count;
            $p->{char_buffer_pos} += $count;
            $p->{line_prev} = $p->{line};
            $p->{column_prev} = $p->{column} - 1;
            $p->{nc} = -1;
          }
          return $count;
        } else {
          return 0;
        }
      } else {
        my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
        if ($count) {
          $p->{column} += $count;
          $p->{column_prev} += $count;
          $p->{nc} = -1;
        }
        return $count;
      }
    }; # $p->{read_until}

    ## Default error handler: report by |warn|, preferring the position
    ## of the token (if any) to the current position of the parser.
    my $ponerror = $onerror || sub {
      my (%opt) = @_;
      my $line = $opt{line};
      my $column = $opt{column};
      if (defined $opt{token} and defined $opt{token}->{line}) {
        $line = $opt{token}->{line};
        $column = $opt{token}->{column};
      }
      warn "Parse error ($opt{type}) at line $line column $column\n";
    };
    $p->{parse_error} = sub {
      $ponerror->(line => $p->{line}, column => $p->{column}, @_);
    };

    ## Errors raised by the input handle are forwarded to the same
    ## handler, marked with |layer => 'encode'|.
    my $char_onerror = sub {
      my (undef, $type, %opt) = @_;
      $ponerror->(layer => 'encode',
                  line => $p->{line}, column => $p->{column} + 1,
                  %opt, type => $type);
    }; # $char_onerror
    $input->onerror ($char_onerror);

    $p->_initialize_tokenizer;
    $p->_initialize_tree_constructor;

    ## Step 2
    my $node_ln = $node->manakai_local_name;
    $p->{content_model} = {
      title => RCDATA_CONTENT_MODEL,
      textarea => RCDATA_CONTENT_MODEL,
      style => CDATA_CONTENT_MODEL,
      script => CDATA_CONTENT_MODEL,
      xmp => CDATA_CONTENT_MODEL,
      iframe => CDATA_CONTENT_MODEL,
      noembed => CDATA_CONTENT_MODEL,
      noframes => CDATA_CONTENT_MODEL,
      noscript => CDATA_CONTENT_MODEL,
      plaintext => PLAINTEXT_CONTENT_MODEL,
    }->{$node_ln};
    $p->{content_model} = PCDATA_CONTENT_MODEL
        unless defined $p->{content_model};
        ## ISSUE: What is "the name of the element"? local name?

    $p->{inner_html_node} = [$node, $el_category->{$node_ln}];
      ## TODO: Foreign element OK?

    ## Step 3
    my $root = $doc->create_element_ns
      ('http://www.w3.org/1999/xhtml', [undef, 'html']);

    ## Step 4 # MUST
    $doc->append_child ($root);

    ## Step 5 # MUST
    push @{$p->{open_elements}}, [$root, $el_category->{html}];

    undef $p->{head_element};

    ## Step 6 # MUST
    $p->_reset_insertion_mode;

    ## Step 7 # MUST
    ## Find the nearest |form| element in the XHTML namespace among
    ## |$node| and its ancestors.
    my $anode = $node;
    AN: while (defined $anode) {
      if ($anode->node_type == 1) {
        my $nsuri = $anode->namespace_uri;
        if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
          if ($anode->manakai_local_name eq 'form') {
            !!!cp ('i5');
            $p->{form_element} = $anode;
            last AN;
          }
        }
      }
      $anode = $anode->parent_node;
    } # AN
    
    ## Step 9 # MUST
    {
      ## NOTE: The macro below refers to the parser as |$self|.
      my $self = $p;
      !!!next-token;
    }
    $p->_tree_construction_main;

    ## Step 10 # MUST
    my @cn = @{$node->child_nodes};
    for (@cn) {
      $node->remove_child ($_);
    }
    ## ISSUE: mutation events? read-only?

    ## Step 11 # MUST
    @cn = @{$root->child_nodes};
    for (@cn) {
      $this_doc->adopt_node ($_);
      $node->append_child ($_);
    }
    ## ISSUE: mutation events?

    $p->_terminate_tree_constructor;

    ## Break the reference loops between |$p| and the closures it
    ## holds.  |set_nc| and |read_until| capture |$p| (and |$input|,
    ## whose error handler captures |$p| as well), just as
    ## |parse_error| does; unless all of them are removed, the parser,
    ## the scratch document, and the input handle are never freed.
    delete $p->{parse_error}; # delete loop
    delete $p->{set_nc}; # delete loop
    delete $p->{read_until}; # delete loop
  } else {
    die "$0: |set_inner_html| is not defined for node of type $nt";
  }
} # set_inner_html
8056    
8057     } # tree construction stage
8058 wakaba 1.1
## Whatpm::HTML::RestartParser: a subclass of |Error|.
## NOTE(review): presumably thrown to request that parsing be
## restarted -- confirm against the code that throws it.
package Whatpm::HTML::RestartParser;
our @ISA;
push @ISA, 'Error';

1;
8063 wakaba 1.193 # $Date: 2008/10/02 10:59:04 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24