/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.196 - (hide annotations) (download) (as text)
Sat Oct 4 07:58:58 2008 UTC (16 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.195: +151 -58 lines
File MIME type: application/x-wais-source
++ whatpm/Whatpm/ChangeLog	4 Oct 2008 07:58:26 -0000
	* HTML.pm.src: <li>, <dt>, and <dd> steps reimplemented (HTML5
	revisions 1731 and 1831).

2008-10-04  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.2 package Whatpm::HTML;
2 wakaba 1.1 use strict;
3 wakaba 1.196 our $VERSION=do{my @r=(q$Revision: 1.195 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.63 use Error qw(:try);
5 wakaba 1.1
6 wakaba 1.182 ## NOTE: This module doesn't check all HTML5 parse errors; character
7     ## encoding related parse errors are expected to be handled by relevant
8     ## modules.
9     ## Parse errors for control characters that are not allowed in HTML5
10     ## documents, for surrogate code points, and for noncharacter code
11     ## points, as well as U+FFFD substitutions for characters whose code point
12     ## is higher than U+10FFFF, may be detected by combining the parser with
13     ## the checker implemented by Whatpm::Charset::UnicodeChecker (for its
14     ## usage example, see |t/HTML-tree.t| in the Whatpm package or the
15     ## WebHACC::Language::HTML module in the WebHACC package).
16    
17 wakaba 1.18 ## ISSUE:
18     ## var doc = implementation.createDocument (null, null, null);
19     ## doc.write ('');
20     ## alert (doc.compatMode);
21 wakaba 1.1
22 wakaba 1.139 require IO::Handle;
23    
24 wakaba 1.126 my $HTML_NS = q<http://www.w3.org/1999/xhtml>;
25     my $MML_NS = q<http://www.w3.org/1998/Math/MathML>;
26     my $SVG_NS = q<http://www.w3.org/2000/svg>;
27     my $XLINK_NS = q<http://www.w3.org/1999/xlink>;
28     my $XML_NS = q<http://www.w3.org/XML/1998/namespace>;
29     my $XMLNS_NS = q<http://www.w3.org/2000/xmlns/>;
30    
## Element category bit flags.  Each constant sub below returns a distinct
## single-bit value, so several categories can be OR'ed together into one
## integer (as the composite |*_EL| masks and the |$el_category| table do).
31 wakaba 1.123 sub A_EL () { 0b1 }
32     sub ADDRESS_EL () { 0b10 }
33     sub BODY_EL () { 0b100 }
34     sub BUTTON_EL () { 0b1000 }
35     sub CAPTION_EL () { 0b10000 }
36     sub DD_EL () { 0b100000 }
37     sub DIV_EL () { 0b1000000 }
38     sub DT_EL () { 0b10000000 }
39     sub FORM_EL () { 0b100000000 }
40     sub FORMATTING_EL () { 0b1000000000 }
41     sub FRAMESET_EL () { 0b10000000000 }
42     sub HEADING_EL () { 0b100000000000 }
43     sub HTML_EL () { 0b1000000000000 }
44     sub LI_EL () { 0b10000000000000 }
45     sub NOBR_EL () { 0b100000000000000 }
46     sub OPTION_EL () { 0b1000000000000000 }
47     sub OPTGROUP_EL () { 0b10000000000000000 }
48     sub P_EL () { 0b100000000000000000 }
49     sub SELECT_EL () { 0b1000000000000000000 }
50     sub TABLE_EL () { 0b10000000000000000000 }
51     sub TABLE_CELL_EL () { 0b100000000000000000000 }
52     sub TABLE_ROW_EL () { 0b1000000000000000000000 }
53     sub TABLE_ROW_GROUP_EL () { 0b10000000000000000000000 }
54     sub MISC_SCOPING_EL () { 0b100000000000000000000000 }
55     sub MISC_SPECIAL_EL () { 0b1000000000000000000000000 }
56 wakaba 1.126 sub FOREIGN_EL () { 0b10000000000000000000000000 }
57     sub FOREIGN_FLOW_CONTENT_EL () { 0b100000000000000000000000000 }
58     sub MML_AXML_EL () { 0b1000000000000000000000000000 }
59 wakaba 1.151 sub RUBY_EL () { 0b10000000000000000000000000000 }
60     sub RUBY_COMPONENT_EL () { 0b100000000000000000000000000000 }
61 wakaba 1.123
## Composite mask: the union of the table, table-row and table-row-group
## category flags.
62     sub TABLE_ROWS_EL () {
63     TABLE_EL |
64     TABLE_ROW_EL |
65     TABLE_ROW_GROUP_EL
66     }
67    
68 wakaba 1.151 ## NOTE: Used in the "generate implied end tags" algorithm.
69 wakaba 1.194 ## NOTE: There is code in which a modified version of
70     ## END_TAG_OPTIONAL_EL is used in the "generate implied end tags"
71     ## implementation (search for the algorithm name).
## Composite mask: the union of the dd, dt, li, option, optgroup, p and
## ruby-component (rp/rt) category flags.
72 wakaba 1.123 sub END_TAG_OPTIONAL_EL () {
73     DD_EL |
74     DT_EL |
75     LI_EL |
76 wakaba 1.194 OPTION_EL |
77     OPTGROUP_EL |
78 wakaba 1.151 P_EL |
79     RUBY_COMPONENT_EL
80 wakaba 1.123 }
81    
82 wakaba 1.151 ## NOTE: Used in </body> and EOF algorithms.
## Composite mask: dd, dt, li and p plus body, html, table cell, table row
## and table row group.  Unlike |END_TAG_OPTIONAL_EL| it does not include
## the option, optgroup or ruby-component flags.
83 wakaba 1.123 sub ALL_END_TAG_OPTIONAL_EL () {
84 wakaba 1.151 DD_EL |
85     DT_EL |
86     LI_EL |
87     P_EL |
88    
89 wakaba 1.123 BODY_EL |
90     HTML_EL |
91     TABLE_CELL_EL |
92     TABLE_ROW_EL |
93     TABLE_ROW_GROUP_EL
94     }
95    
## Composite mask: button, caption, html, table, table cell and the
## miscellaneous scoping elements (applet, marquee and object in
## |$el_category|).
## NOTE(review): presumably mirrors the HTML5 "scoping" element category
## - confirm against the tree constructor.
96     sub SCOPING_EL () {
97     BUTTON_EL |
98     CAPTION_EL |
99     HTML_EL |
100     TABLE_EL |
101     TABLE_CELL_EL |
102     MISC_SCOPING_EL
103     }
104    
## Composite mask: html and table.
## NOTE(review): presumably the boundary set for "in table scope" checks
## - confirm against the tree constructor.
105     sub TABLE_SCOPING_EL () {
106     HTML_EL |
107     TABLE_EL
108     }
109    
## Composite mask: html and the table row group elements (tbody, tfoot and
## thead in |$el_category|).
110     sub TABLE_ROWS_SCOPING_EL () {
111     HTML_EL |
112     TABLE_ROW_GROUP_EL
113     }
114    
## Composite mask: html and the table row element (tr in |$el_category|).
115     sub TABLE_ROW_SCOPING_EL () {
116     HTML_EL |
117     TABLE_ROW_EL
118     }
119    
## Composite mask: address, body, div, dd, dt, li, p, form, frameset,
## headings, option, optgroup, select, table row, table row group and the
## miscellaneous special elements.
## NOTE(review): presumably mirrors the HTML5 "special" element category
## - confirm against the tree constructor.
120     sub SPECIAL_EL () {
121     ADDRESS_EL |
122     BODY_EL |
123     DIV_EL |
124 wakaba 1.151
125     DD_EL |
126     DT_EL |
127     LI_EL |
128     P_EL |
129    
130 wakaba 1.123 FORM_EL |
131     FRAMESET_EL |
132     HEADING_EL |
133     OPTION_EL |
134     OPTGROUP_EL |
135     SELECT_EL |
136     TABLE_ROW_EL |
137     TABLE_ROW_GROUP_EL |
138     MISC_SPECIAL_EL
139     }
140    
## Maps an HTML element local name to its category flag(s).  Most entries
## carry a single flag; |a| and |nobr| combine their own flag with
## |FORMATTING_EL|.  Element names that are not listed have no entry.
141     my $el_category = {
142     a => A_EL | FORMATTING_EL,
143     address => ADDRESS_EL,
144     applet => MISC_SCOPING_EL,
145     area => MISC_SPECIAL_EL,
146 wakaba 1.193 article => MISC_SPECIAL_EL,
147     aside => MISC_SPECIAL_EL,
148 wakaba 1.123 b => FORMATTING_EL,
149     base => MISC_SPECIAL_EL,
150     basefont => MISC_SPECIAL_EL,
151     bgsound => MISC_SPECIAL_EL,
152     big => FORMATTING_EL,
153     blockquote => MISC_SPECIAL_EL,
154     body => BODY_EL,
155     br => MISC_SPECIAL_EL,
156     button => BUTTON_EL,
157     caption => CAPTION_EL,
158     center => MISC_SPECIAL_EL,
159     col => MISC_SPECIAL_EL,
160     colgroup => MISC_SPECIAL_EL,
161 wakaba 1.193 command => MISC_SPECIAL_EL,
162     datagrid => MISC_SPECIAL_EL,
163 wakaba 1.123 dd => DD_EL,
164 wakaba 1.193 details => MISC_SPECIAL_EL,
165     dialog => MISC_SPECIAL_EL,
166 wakaba 1.123 dir => MISC_SPECIAL_EL,
167     div => DIV_EL,
168     dl => MISC_SPECIAL_EL,
169     dt => DT_EL,
170     em => FORMATTING_EL,
171     embed => MISC_SPECIAL_EL,
172 wakaba 1.193 eventsource => MISC_SPECIAL_EL,
173 wakaba 1.123 fieldset => MISC_SPECIAL_EL,
174 wakaba 1.193 figure => MISC_SPECIAL_EL,
175 wakaba 1.123 font => FORMATTING_EL,
176 wakaba 1.193 footer => MISC_SPECIAL_EL,
177 wakaba 1.123 form => FORM_EL,
178     frame => MISC_SPECIAL_EL,
179     frameset => FRAMESET_EL,
180     h1 => HEADING_EL,
181     h2 => HEADING_EL,
182     h3 => HEADING_EL,
183     h4 => HEADING_EL,
184     h5 => HEADING_EL,
185     h6 => HEADING_EL,
186     head => MISC_SPECIAL_EL,
187 wakaba 1.193 header => MISC_SPECIAL_EL,
188 wakaba 1.123 hr => MISC_SPECIAL_EL,
189     html => HTML_EL,
190     i => FORMATTING_EL,
191     iframe => MISC_SPECIAL_EL,
192     img => MISC_SPECIAL_EL,
193 wakaba 1.193 #image => MISC_SPECIAL_EL, ## NOTE: Commented out in the spec.
194 wakaba 1.123 input => MISC_SPECIAL_EL,
195     isindex => MISC_SPECIAL_EL,
196     li => LI_EL,
197     link => MISC_SPECIAL_EL,
198     listing => MISC_SPECIAL_EL,
199     marquee => MISC_SCOPING_EL,
200     menu => MISC_SPECIAL_EL,
201     meta => MISC_SPECIAL_EL,
202 wakaba 1.193 nav => MISC_SPECIAL_EL,
203 wakaba 1.123 nobr => NOBR_EL | FORMATTING_EL,
204     noembed => MISC_SPECIAL_EL,
205     noframes => MISC_SPECIAL_EL,
206     noscript => MISC_SPECIAL_EL,
207     object => MISC_SCOPING_EL,
208     ol => MISC_SPECIAL_EL,
209     optgroup => OPTGROUP_EL,
210     option => OPTION_EL,
211     p => P_EL,
212     param => MISC_SPECIAL_EL,
213     plaintext => MISC_SPECIAL_EL,
214     pre => MISC_SPECIAL_EL,
215 wakaba 1.151 rp => RUBY_COMPONENT_EL,
216     rt => RUBY_COMPONENT_EL,
217     ruby => RUBY_EL,
218 wakaba 1.123 s => FORMATTING_EL,
219     script => MISC_SPECIAL_EL,
220     select => SELECT_EL,
221 wakaba 1.193 section => MISC_SPECIAL_EL,
222 wakaba 1.123 small => FORMATTING_EL,
223     spacer => MISC_SPECIAL_EL,
224     strike => FORMATTING_EL,
225     strong => FORMATTING_EL,
226     style => MISC_SPECIAL_EL,
227     table => TABLE_EL,
228     tbody => TABLE_ROW_GROUP_EL,
229     td => TABLE_CELL_EL,
230     textarea => MISC_SPECIAL_EL,
231     tfoot => TABLE_ROW_GROUP_EL,
232     th => TABLE_CELL_EL,
233     thead => TABLE_ROW_GROUP_EL,
234     title => MISC_SPECIAL_EL,
235     tr => TABLE_ROW_EL,
236     tt => FORMATTING_EL,
237     u => FORMATTING_EL,
238     ul => MISC_SPECIAL_EL,
239     wbr => MISC_SPECIAL_EL,
240     };
241    
## Category flags for foreign (non-HTML) elements, keyed first by namespace
## URI (MathML or SVG) and then by element local name.
242 wakaba 1.126 my $el_category_f = {
243     $MML_NS => {
244     'annotation-xml' => MML_AXML_EL,
245     mi => FOREIGN_FLOW_CONTENT_EL,
246     mo => FOREIGN_FLOW_CONTENT_EL,
247     mn => FOREIGN_FLOW_CONTENT_EL,
248     ms => FOREIGN_FLOW_CONTENT_EL,
249     mtext => FOREIGN_FLOW_CONTENT_EL,
250     },
251     $SVG_NS => {
252 wakaba 1.131 foreignObject => FOREIGN_FLOW_CONTENT_EL,
253 wakaba 1.126 desc => FOREIGN_FLOW_CONTENT_EL,
254     title => FOREIGN_FLOW_CONTENT_EL,
255     },
256     ## NOTE: In addition, FOREIGN_EL is set to non-HTML elements.
257     };
258    
## Maps an all-lowercase SVG attribute name to its mixed-case spelling.
## NOTE(review): presumably used to restore the case of attributes on SVG
## elements after the tokenizer has lowercased them - confirm at the use
## site.
259 wakaba 1.131 my $svg_attr_name = {
260 wakaba 1.146 attributename => 'attributeName',
261 wakaba 1.131 attributetype => 'attributeType',
262     basefrequency => 'baseFrequency',
263     baseprofile => 'baseProfile',
264     calcmode => 'calcMode',
265     clippathunits => 'clipPathUnits',
266     contentscripttype => 'contentScriptType',
267     contentstyletype => 'contentStyleType',
268     diffuseconstant => 'diffuseConstant',
269     edgemode => 'edgeMode',
270     externalresourcesrequired => 'externalResourcesRequired',
271     filterres => 'filterRes',
272     filterunits => 'filterUnits',
273     glyphref => 'glyphRef',
274     gradienttransform => 'gradientTransform',
275     gradientunits => 'gradientUnits',
276     kernelmatrix => 'kernelMatrix',
277     kernelunitlength => 'kernelUnitLength',
278     keypoints => 'keyPoints',
279     keysplines => 'keySplines',
280     keytimes => 'keyTimes',
281     lengthadjust => 'lengthAdjust',
282     limitingconeangle => 'limitingConeAngle',
283     markerheight => 'markerHeight',
284     markerunits => 'markerUnits',
285     markerwidth => 'markerWidth',
286     maskcontentunits => 'maskContentUnits',
287     maskunits => 'maskUnits',
288     numoctaves => 'numOctaves',
289     pathlength => 'pathLength',
290     patterncontentunits => 'patternContentUnits',
291     patterntransform => 'patternTransform',
292     patternunits => 'patternUnits',
293     pointsatx => 'pointsAtX',
294     pointsaty => 'pointsAtY',
295     pointsatz => 'pointsAtZ',
296     preservealpha => 'preserveAlpha',
297     preserveaspectratio => 'preserveAspectRatio',
298     primitiveunits => 'primitiveUnits',
299     refx => 'refX',
300     refy => 'refY',
301     repeatcount => 'repeatCount',
302     repeatdur => 'repeatDur',
303     requiredextensions => 'requiredExtensions',
304 wakaba 1.146 requiredfeatures => 'requiredFeatures',
305 wakaba 1.131 specularconstant => 'specularConstant',
306     specularexponent => 'specularExponent',
307     spreadmethod => 'spreadMethod',
308     startoffset => 'startOffset',
309     stddeviation => 'stdDeviation',
310     stitchtiles => 'stitchTiles',
311     surfacescale => 'surfaceScale',
312     systemlanguage => 'systemLanguage',
313     tablevalues => 'tableValues',
314     targetx => 'targetX',
315     targety => 'targetY',
316     textlength => 'textLength',
317     viewbox => 'viewBox',
318     viewtarget => 'viewTarget',
319     xchannelselector => 'xChannelSelector',
320     ychannelselector => 'yChannelSelector',
321     zoomandpan => 'zoomAndPan',
322     };
323    
## Maps a qualified attribute name as written in the source to
## |[namespace URI, [prefix, local name]]|.  The bare |xmlns| entry has an
## undefined prefix.
324     my $foreign_attr_xname = {
325     'xlink:actuate' => [$XLINK_NS, ['xlink', 'actuate']],
326     'xlink:arcrole' => [$XLINK_NS, ['xlink', 'arcrole']],
327     'xlink:href' => [$XLINK_NS, ['xlink', 'href']],
328     'xlink:role' => [$XLINK_NS, ['xlink', 'role']],
329     'xlink:show' => [$XLINK_NS, ['xlink', 'show']],
330     'xlink:title' => [$XLINK_NS, ['xlink', 'title']],
331     'xlink:type' => [$XLINK_NS, ['xlink', 'type']],
332     'xml:base' => [$XML_NS, ['xml', 'base']],
333     'xml:lang' => [$XML_NS, ['xml', 'lang']],
334     'xml:space' => [$XML_NS, ['xml', 'space']],
335     'xmlns' => [$XMLNS_NS, [undef, 'xmlns']],
336     'xmlns:xlink' => [$XMLNS_NS, ['xmlns', 'xlink']],
337     };
338    
339     ## ISSUE: xmlns:xlink="non-xlink-ns" is not an error.
340    
## Code point replacement table: maps a code point number to the code point
## that should be used instead.  CR becomes LF and 0x80..0x9F are mapped
## individually (some of them to U+FFFD).  The trailing |for| statement
## then maps the listed control characters, the surrogate range and the
## listed noncharacters to U+FFFD.
## NOTE(review): presumably applied to numeric character references -
## confirm at the use site.
341 wakaba 1.191 my $charref_map = {
342     0x0D => 0x000A,
343 wakaba 1.10 0x80 => 0x20AC,
344     0x81 => 0xFFFD,
345     0x82 => 0x201A,
346     0x83 => 0x0192,
347     0x84 => 0x201E,
348     0x85 => 0x2026,
349     0x86 => 0x2020,
350     0x87 => 0x2021,
351     0x88 => 0x02C6,
352     0x89 => 0x2030,
353     0x8A => 0x0160,
354     0x8B => 0x2039,
355     0x8C => 0x0152,
356     0x8D => 0xFFFD,
357     0x8E => 0x017D,
358     0x8F => 0xFFFD,
359     0x90 => 0xFFFD,
360     0x91 => 0x2018,
361     0x92 => 0x2019,
362     0x93 => 0x201C,
363     0x94 => 0x201D,
364     0x95 => 0x2022,
365     0x96 => 0x2013,
366     0x97 => 0x2014,
367     0x98 => 0x02DC,
368     0x99 => 0x2122,
369     0x9A => 0x0161,
370     0x9B => 0x203A,
371     0x9C => 0x0153,
372     0x9D => 0xFFFD,
373     0x9E => 0x017E,
374     0x9F => 0x0178,
375 wakaba 1.191 }; # $charref_map
## NOTE(review): the range below stops at 0xFDDF, whereas the Unicode
## noncharacter block runs through U+FDEF (see the ISSUE marker on that
## line) - confirm whether 0xFDE0..0xFDEF should be included.
376     $charref_map->{$_} = 0xFFFD
377     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
378     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
379     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
380     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
381     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
382     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
383     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
384 wakaba 1.1
385 wakaba 1.192 ## TODO: Invoke the reset algorithm when a resettable element is
386     ## created (cf. HTML5 revision 2259).
387    
## Parses an HTML document supplied as a string of bytes.
## Arguments after the invocant: the charset name, the byte string (or a
## reference to it), and then whatever |parse_byte_stream| accepts after
## the stream, which is forwarded unchanged.
## Returns the value returned by |parse_byte_stream|.
388 wakaba 1.63 sub parse_byte_string ($$$$;$) {
389 wakaba 1.138 my $self = shift;
390     my $charset_name = shift;
## Open an in-memory read handle on the string: a plain scalar is wrapped
## in a reference, an existing reference is used as is.
## NOTE(review): the result of |open| is not checked - confirm that a
## failure here is impossible or acceptable.
391     open my $input, '<', ref $_[0] ? $_[0] : \($_[0]);
392     return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]);
393     } # parse_byte_string
394    
## Parses an HTML document read from a byte stream handle.
## The encoding is chosen by the SNIFFING block (caller-supplied charset
## name, BOM, detector, then a windows-1252 default), a decoding handle is
## obtained for it, and the work is delegated to |parse_char_stream|.
## If |Whatpm::HTML::RestartParser| is thrown (by |{change_encoding}|),
## |parse_char_stream| is invoked a second time with the decoding handle
## that |{change_encoding}| installed.
## May be called as a class method, in which case a new parser is created.
## Returns the value returned by |parse_char_stream|.
395 wakaba 1.162 sub parse_byte_stream ($$$$;$$) {
396     # my ($self, $charset_name, $byte_stream, $doc, $onerror, $get_wrapper) = @_;
397 wakaba 1.63 my $self = ref $_[0] ? shift : shift->new;
398 wakaba 1.133 my $charset_name = shift;
399 wakaba 1.138 my $byte_stream = $_[0];
400 wakaba 1.133
## Default error handler: print a warning that carries only the error type.
401 wakaba 1.134 my $onerror = $_[2] || sub {
402     my (%opt) = @_;
403     warn "Parse error ($opt{type})\n";
404     };
405     $self->{parse_error} = $onerror; # updated later by parse_char_string
406    
## Default wrapper: hand the decoding handle through unchanged.
407 wakaba 1.162 my $get_wrapper = $_[3] || sub ($) {
408     return $_[0]; # $_[0] = byte stream handle, returned = arg to char handle
409     };
410    
411 wakaba 1.133 ## HTML5 encoding sniffing algorithm
412     require Message::Charset::Info;
413     my $charset;
414 wakaba 1.136 my $buffer;
415     my ($char_stream, $e_status);
416 wakaba 1.133
417     SNIFFING: {
418 wakaba 1.160 ## NOTE: By setting |allow_fallback| option true when the
419     ## |get_decode_handle| method is invoked, we ignore what the HTML5
420     ## spec requires, i.e. unsupported encoding should be ignored.
421     ## TODO: We should not do this unless the parser is invoked
422     ## in the conformance checking mode, in which this behavior
423     ## would be useful.
424 wakaba 1.133
425     ## Step 1
426     if (defined $charset_name) {
427 wakaba 1.161 $charset = Message::Charset::Info->get_by_html_name ($charset_name);
428     ## TODO: Is this ok? Transfer protocol's parameter should be
429     ## interpreted in its semantics?
430 wakaba 1.133
431 wakaba 1.136 ($char_stream, $e_status) = $charset->get_decode_handle
432     ($byte_stream, allow_error_reporting => 1,
433 wakaba 1.133 allow_fallback => 1);
434 wakaba 1.136 if ($char_stream) {
435 wakaba 1.133 $self->{confident} = 1;
436     last SNIFFING;
437 wakaba 1.136 } else {
438 wakaba 1.190 !!!parse-error (type => 'charset:not supported',
439     layer => 'encode',
440     line => 1, column => 1,
441     value => $charset_name,
442     level => $self->{level}->{uncertain});
443 wakaba 1.133 }
444     }
445    
446     ## Step 2
## Read up to 1024 bytes from the stream into |$byte_buffer| for sniffing.
447 wakaba 1.136 my $byte_buffer = '';
448     for (1..1024) {
449     my $char = $byte_stream->getc;
450     last unless defined $char;
451     $byte_buffer .= $char;
452     } ## TODO: timeout
453 wakaba 1.133
454     ## Step 3
## A leading byte order mark selects UTF-16BE, UTF-16LE or UTF-8.
455 wakaba 1.136 if ($byte_buffer =~ /^\xFE\xFF/) {
456 wakaba 1.161 $charset = Message::Charset::Info->get_by_html_name ('utf-16be');
457 wakaba 1.136 ($char_stream, $e_status) = $charset->get_decode_handle
458     ($byte_stream, allow_error_reporting => 1,
459     allow_fallback => 1, byte_buffer => \$byte_buffer);
460 wakaba 1.133 $self->{confident} = 1;
461     last SNIFFING;
462 wakaba 1.136 } elsif ($byte_buffer =~ /^\xFF\xFE/) {
463 wakaba 1.161 $charset = Message::Charset::Info->get_by_html_name ('utf-16le');
464 wakaba 1.136 ($char_stream, $e_status) = $charset->get_decode_handle
465     ($byte_stream, allow_error_reporting => 1,
466     allow_fallback => 1, byte_buffer => \$byte_buffer);
467 wakaba 1.133 $self->{confident} = 1;
468     last SNIFFING;
469 wakaba 1.136 } elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) {
470 wakaba 1.161 $charset = Message::Charset::Info->get_by_html_name ('utf-8');
471 wakaba 1.136 ($char_stream, $e_status) = $charset->get_decode_handle
472     ($byte_stream, allow_error_reporting => 1,
473     allow_fallback => 1, byte_buffer => \$byte_buffer);
474 wakaba 1.133 $self->{confident} = 1;
475     last SNIFFING;
476     }
477    
478     ## Step 4
479     ## TODO: <meta charset>
480    
481     ## Step 5
482     ## TODO: from history
483    
484     ## Step 6
## Ask the detector for a charset name based on the bytes read so far.
485 wakaba 1.65 require Whatpm::Charset::UniversalCharDet;
486 wakaba 1.133 $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string
487 wakaba 1.136 ($byte_buffer);
488 wakaba 1.133 if (defined $charset_name) {
489 wakaba 1.161 $charset = Message::Charset::Info->get_by_html_name ($charset_name);
490 wakaba 1.133
491     ## ISSUE: Unsupported encoding is not ignored according to the spec.
492 wakaba 1.136 require Whatpm::Charset::DecodeHandle;
493     $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
494     ($byte_stream);
495     ($char_stream, $e_status) = $charset->get_decode_handle
496     ($buffer, allow_error_reporting => 1,
497     allow_fallback => 1, byte_buffer => \$byte_buffer);
498     if ($char_stream) {
499     $buffer->{buffer} = $byte_buffer;
500 wakaba 1.153 !!!parse-error (type => 'sniffing:chardet',
501     text => $charset_name,
502     level => $self->{level}->{info},
503     layer => 'encode',
504 wakaba 1.134 line => 1, column => 1);
505 wakaba 1.133 $self->{confident} = 0;
506     last SNIFFING;
507     }
508     }
509    
510     ## Step 7: default
511     ## TODO: Make this configurable.
512 wakaba 1.161 $charset = Message::Charset::Info->get_by_html_name ('windows-1252');
513 wakaba 1.133 ## NOTE: We choose |windows-1252| here, since |utf-8| should be
514     ## detectable in the step 6.
515 wakaba 1.136 require Whatpm::Charset::DecodeHandle;
516     $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
517     ($byte_stream);
518     ($char_stream, $e_status)
519     = $charset->get_decode_handle ($buffer,
520     allow_error_reporting => 1,
521     allow_fallback => 1,
522     byte_buffer => \$byte_buffer);
523     $buffer->{buffer} = $byte_buffer;
524 wakaba 1.153 !!!parse-error (type => 'sniffing:default',
525     text => 'windows-1252',
526     level => $self->{level}->{info},
527     line => 1, column => 1,
528     layer => 'encode');
529 wakaba 1.63 $self->{confident} = 0;
530 wakaba 1.133 } # SNIFFING
531    
## Record the chosen encoding name and report when the decoder is a
## fallback implementation or one that cannot report decoding errors.
532     if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
533 wakaba 1.160 $self->{input_encoding} = $charset->get_iana_name; ## TODO: Should we set actual charset decoder's encoding name?
534 wakaba 1.153 !!!parse-error (type => 'chardecode:fallback',
535 wakaba 1.160 #text => $self->{input_encoding},
536 wakaba 1.153 level => $self->{level}->{uncertain},
537     line => 1, column => 1,
538     layer => 'encode');
539 wakaba 1.133 } elsif (not ($e_status &
540 wakaba 1.178 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL ())) {
541 wakaba 1.160 $self->{input_encoding} = $charset->get_iana_name;
542 wakaba 1.153 !!!parse-error (type => 'chardecode:no error',
543     text => $self->{input_encoding},
544     level => $self->{level}->{uncertain},
545     line => 1, column => 1,
546     layer => 'encode');
547 wakaba 1.160 } else {
548     $self->{input_encoding} = $charset->get_iana_name;
549 wakaba 1.63 }
550    
## Callback taking (parser, new charset name, token).  It assigns to the
## enclosing |$charset_name|, |$charset|, |$char_stream| and |$e_status|,
## which the |catch| block below then uses for the second parse.
## NOTE(review): |$buffer| is assigned only in steps 6 and 7 above; when
## sniffing ended in step 1 or step 3 it is still undefined here - confirm
## what |byte_buffer| is expected to hold in that case.
551     $self->{change_encoding} = sub {
552     my $self = shift;
553 wakaba 1.134 $charset_name = shift;
554 wakaba 1.114 my $token = shift;
555 wakaba 1.63
556 wakaba 1.161 $charset = Message::Charset::Info->get_by_html_name ($charset_name);
557 wakaba 1.136 ($char_stream, $e_status) = $charset->get_decode_handle
558     ($byte_stream, allow_error_reporting => 1, allow_fallback => 1,
559     byte_buffer => \ $buffer->{buffer});
560 wakaba 1.134
561 wakaba 1.136 if ($char_stream) { # if supported
562 wakaba 1.134 ## "Change the encoding" algorithm:
563 wakaba 1.63
564 wakaba 1.134 ## Step 1
565 wakaba 1.149 if ($charset->{category} &
566     Message::Charset::Info::CHARSET_CATEGORY_UTF16 ()) {
567 wakaba 1.161 $charset = Message::Charset::Info->get_by_html_name ('utf-8');
568 wakaba 1.136 ($char_stream, $e_status) = $charset->get_decode_handle
569     ($byte_stream,
570     byte_buffer => \ $buffer->{buffer});
571 wakaba 1.134 }
572     $charset_name = $charset->get_iana_name;
573    
574     ## Step 2
575     if (defined $self->{input_encoding} and
576     $self->{input_encoding} eq $charset_name) {
577 wakaba 1.153 !!!parse-error (type => 'charset label:matching',
578     text => $charset_name,
579     level => $self->{level}->{info});
580 wakaba 1.134 $self->{confident} = 1;
581     return;
582     }
583 wakaba 1.63
584 wakaba 1.153 !!!parse-error (type => 'charset label detected',
585     text => $self->{input_encoding},
586     value => $charset_name,
587     level => $self->{level}->{warn},
588     token => $token);
589 wakaba 1.134
590     ## Step 3
591     # if (can) {
592     ## change the encoding on the fly.
593     #$self->{confident} = 1;
594     #return;
595     # }
596    
597     ## Step 4
598     throw Whatpm::HTML::RestartParser ();
599 wakaba 1.63 }
600     }; # $self->{change_encoding}
601    
## Error callback installed on the (wrapped) decoding handle: reports the
## error at the current parser position and, when the handler is given an
## |octets| reference, overwrites its target with U+FFFD.
602 wakaba 1.136 my $char_onerror = sub {
603     my (undef, $type, %opt) = @_;
604 wakaba 1.153 !!!parse-error (layer => 'encode',
605 wakaba 1.174 line => $self->{line}, column => $self->{column} + 1,
606     %opt, type => $type);
607 wakaba 1.136 if ($opt{octets}) {
608     ${$opt{octets}} = "\x{FFFD}"; # replacement character
609     }
610     };
611 wakaba 1.162
612     my $wrapped_char_stream = $get_wrapper->($char_stream);
613     $wrapped_char_stream->onerror ($char_onerror);
614 wakaba 1.136
615 wakaba 1.182 my @args = ($_[1], $_[2]); # $doc, $onerror - $get_wrapper = undef;
616 wakaba 1.63 my $return;
617     try {
618 wakaba 1.162 $return = $self->parse_char_stream ($wrapped_char_stream, @args);
619 wakaba 1.63 } catch Whatpm::HTML::RestartParser with {
620 wakaba 1.134 ## NOTE: Invoked after {change_encoding}.
621    
622     if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
623 wakaba 1.160 $self->{input_encoding} = $charset->get_iana_name; ## TODO: Should we set actual charset decoder's encoding name?
624 wakaba 1.153 !!!parse-error (type => 'chardecode:fallback',
625     level => $self->{level}->{uncertain},
626 wakaba 1.160 #text => $self->{input_encoding},
627 wakaba 1.153 line => 1, column => 1,
628     layer => 'encode');
629 wakaba 1.134 } elsif (not ($e_status &
630 wakaba 1.178 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL ())) {
631 wakaba 1.160 $self->{input_encoding} = $charset->get_iana_name;
632 wakaba 1.153 !!!parse-error (type => 'chardecode:no error',
633     text => $self->{input_encoding},
634     level => $self->{level}->{uncertain},
635     line => 1, column => 1,
636     layer => 'encode');
637 wakaba 1.160 } else {
638     $self->{input_encoding} = $charset->get_iana_name;
639 wakaba 1.134 }
640 wakaba 1.63 $self->{confident} = 1;
641 wakaba 1.162
## Wrap the decoding handle installed by |{change_encoding}| and parse
## again.
642     $wrapped_char_stream = $get_wrapper->($char_stream);
643     $wrapped_char_stream->onerror ($char_onerror);
644    
645     $return = $self->parse_char_stream ($wrapped_char_stream, @args);
646 wakaba 1.63 };
647     return $return;
648 wakaba 1.138 } # parse_byte_stream
649 wakaba 1.63
650 wakaba 1.71 ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
651     ## and the HTML layer MUST ignore it. However, we do strip the BOM in
652     ## the encoding layer and the HTML layer does not ignore any U+FEFF,
653     ## because the core part of our HTML parser expects a string of characters,
654     ## not a string of bytes or code units or anything which might contain a BOM.
655     ## Therefore, any parser interface that accepts a string of bytes,
656     ## such as |parse_byte_string| in this module, must ensure that it does
657     ## strip the BOM and never strip any ZWNBSP.
658    
## Parses an HTML document supplied as a string of characters.
## The string (or a reference to it) is wrapped in a
## |Whatpm::Charset::DecodeHandle::CharString| handle and the work is
## delegated to |parse_char_stream|; the remaining arguments are forwarded
## unchanged.  Returns the value returned by |parse_char_stream|.
659 wakaba 1.162 sub parse_char_string ($$$;$$) {
660     #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
661 wakaba 1.135 my $self = shift;
## Accept either a scalar reference or a plain string (referenced here).
662 wakaba 1.139 my $s = ref $_[0] ? $_[0] : \($_[0]);
663 wakaba 1.171 require Whatpm::Charset::DecodeHandle;
664     my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);
665 wakaba 1.135 return $self->parse_char_stream ($input, @_[1..$#_]);
666     } # parse_char_string
667 wakaba 1.162 *parse_string = \&parse_char_string; ## NOTE: Alias for backward compatibility.
668 wakaba 1.63
## Parses an HTML document read from a character stream handle into the
## given document object and returns that document.
## Arguments after the invocant: the character stream, the document, an
## optional error callback and an optional wrapper callback for the stream.
## May be called as a class method, in which case a new parser is created.
## The existing children of the document are removed before parsing.
669 wakaba 1.182 sub parse_char_stream ($$$;$$) {
670 wakaba 1.63 my $self = ref $_[0] ? shift : shift->new;
671 wakaba 1.135 my $input = $_[0];
672 wakaba 1.1 $self->{document} = $_[1];
673 wakaba 1.63 @{$self->{document}->child_nodes} = ();
674 wakaba 1.1
675 wakaba 1.3 ## NOTE: |set_inner_html| copies most of this method's code
676    
677 wakaba 1.63 $self->{confident} = 1 unless exists $self->{confident};
678 wakaba 1.64 $self->{document}->input_encoding ($self->{input_encoding})
679     if defined $self->{input_encoding};
680 wakaba 1.178 ## TODO: |{input_encoding}| is needless?
681 wakaba 1.63
682 wakaba 1.112 $self->{line_prev} = $self->{line} = 1;
683 wakaba 1.179 $self->{column_prev} = -1;
684     $self->{column} = 0;
## |{set_nc}|: stores the next input character's code point in |{nc}|
## (-1 at end of input) and updates the line/column counters.  CR and
## CR LF are turned into a single LF and U+0000 is replaced by U+FFFD
## with a parse error.  The parser is passed as the first argument.
685 wakaba 1.183 $self->{set_nc} = sub {
686 wakaba 1.1 my $self = shift;
687 wakaba 1.13
688 wakaba 1.178 my $char = '';
## A character read ahead while handling a CR is consumed first.
689 wakaba 1.183 if (defined $self->{next_nc}) {
690     $char = $self->{next_nc};
691     delete $self->{next_nc};
692     $self->{nc} = ord $char;
693 wakaba 1.139 } else {
694 wakaba 1.179 $self->{char_buffer} = '';
695     $self->{char_buffer_pos} = 0;
696    
## Fast path: refill |{char_buffer}| with characters other than NULL, LF
## and CR; if any were read, return the first one without the special
## handling below.
697     my $count = $input->manakai_read_until
698 wakaba 1.182 ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
699 wakaba 1.179 if ($count) {
700     $self->{line_prev} = $self->{line};
701     $self->{column_prev} = $self->{column};
702     $self->{column}++;
703 wakaba 1.183 $self->{nc}
704 wakaba 1.179 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
705     return;
706     }
707    
708 wakaba 1.178 if ($input->read ($char, 1)) {
709 wakaba 1.183 $self->{nc} = ord $char;
710 wakaba 1.178 } else {
711 wakaba 1.183 $self->{nc} = -1;
712 wakaba 1.178 return;
713     }
714 wakaba 1.139 }
715 wakaba 1.112
716     ($self->{line_prev}, $self->{column_prev})
717     = ($self->{line}, $self->{column});
718     $self->{column}++;
719 wakaba 1.1
720 wakaba 1.183 if ($self->{nc} == 0x000A) { # LF
721 wakaba 1.132 !!!cp ('j1');
722 wakaba 1.112 $self->{line}++;
723     $self->{column} = 0;
724 wakaba 1.183 } elsif ($self->{nc} == 0x000D) { # CR
725 wakaba 1.132 !!!cp ('j2');
726 wakaba 1.170 ## TODO: support for abort/streaming
## Read one character ahead: an LF directly after the CR is dropped, any
## other character is kept in |{next_nc}| for the next call.
727 wakaba 1.178 my $next = '';
728     if ($input->read ($next, 1) and $next ne "\x0A") {
729 wakaba 1.183 $self->{next_nc} = $next;
730 wakaba 1.135 }
731 wakaba 1.183 $self->{nc} = 0x000A; # LF # MUST
732 wakaba 1.112 $self->{line}++;
733     $self->{column} = 0;
734 wakaba 1.183 } elsif ($self->{nc} == 0x0000) { # NULL
735 wakaba 1.132 !!!cp ('j4');
736 wakaba 1.8 !!!parse-error (type => 'NULL');
737 wakaba 1.183 $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
738 wakaba 1.1 }
739     };
740    
## |{read_until}|: bulk-reads a run of characters that are none of NULL,
## LF, CR or the caller's specials into |$_[0]| at the given offset and
## returns the number of characters read.  It returns 0 at once while a
## read-ahead character is pending in |{next_nc}|.
## NOTE(review): unlike |{set_nc}| this closure captures the enclosing
## |$self|, and only |{parse_error}| is deleted at the end of this method
## - confirm whether the resulting reference loop is broken elsewhere.
741 wakaba 1.172 $self->{read_until} = sub {
742     #my ($scalar, $specials_range, $offset) = @_;
743 wakaba 1.183 return 0 if defined $self->{next_nc};
744 wakaba 1.180
745 wakaba 1.182 my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
746 wakaba 1.180 my $offset = $_[2] || 0;
747    
## Serve from |{char_buffer}| first if it still holds unread characters.
748     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
749     pos ($self->{char_buffer}) = $self->{char_buffer_pos};
750     if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
751     substr ($_[0], $offset)
752     = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
753     my $count = $+[0] - $-[0];
754     if ($count) {
755     $self->{column} += $count;
756     $self->{char_buffer_pos} += $count;
757     $self->{line_prev} = $self->{line};
758     $self->{column_prev} = $self->{column} - 1;
759 wakaba 1.183 $self->{nc} = -1;
760 wakaba 1.180 }
761     return $count;
762     } else {
763     return 0;
764     }
765     } else {
766     my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
767     if ($count) {
768     $self->{column} += $count;
769     $self->{line_prev} = $self->{line};
770     $self->{column_prev} = $self->{column} - 1;
771 wakaba 1.183 $self->{nc} = -1;
772 wakaba 1.180 }
773     return $count;
774 wakaba 1.172 }
775     }; # $self->{read_until}
776 wakaba 1.171
## Default error handler: print a warning with the error type and the
## position, taken from the token when one is supplied.
777 wakaba 1.3 my $onerror = $_[2] || sub {
778     my (%opt) = @_;
779 wakaba 1.112 my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
780     my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
781     warn "Parse error ($opt{type}) at line $line column $column\n";
782 wakaba 1.3 };
## |{parse_error}| supplies the current line and column as defaults; later
## arguments with the same keys override them.
783     $self->{parse_error} = sub {
784 wakaba 1.112 $onerror->(line => $self->{line}, column => $self->{column}, @_);
785 wakaba 1.1 };
786    
787 wakaba 1.182 my $char_onerror = sub {
788     my (undef, $type, %opt) = @_;
789     !!!parse-error (layer => 'encode',
790     line => $self->{line}, column => $self->{column} + 1,
791     %opt, type => $type);
792     }; # $char_onerror
793    
## With a wrapper callback, wrap the stream and always install the error
## callback; otherwise install it only if the stream has none yet.
794     if ($_[3]) {
795     $input = $_[3]->($input);
796     $input->onerror ($char_onerror);
797     } else {
798     $input->onerror ($char_onerror) unless defined $input->onerror;
799     }
800    
801 wakaba 1.1 $self->_initialize_tokenizer;
802     $self->_initialize_tree_constructor;
803     $self->_construct_tree;
804     $self->_terminate_tree_constructor;
805    
806 wakaba 1.112 delete $self->{parse_error}; # remove loop
807    
808 wakaba 1.1 return $self->{document};
809 wakaba 1.135 } # parse_char_stream
810 wakaba 1.1
## Constructor.  Returns a new parser object blessed into the given class,
## with the error level table and placeholder callbacks installed.
811     sub new ($) {
812     my $class = shift;
## |{level}| maps level names to the one-letter codes passed as |level| in
## parse error reports.
813 wakaba 1.134 my $self = bless {
814 wakaba 1.153 level => {must => 'm',
815 wakaba 1.159 should => 's',
816 wakaba 1.153 warn => 'w',
817     info => 'i',
818     uncertain => 'u'},
819 wakaba 1.134 }, $class;
## Default |{set_nc}|: always reports end of input (-1).
## NOTE(review): this closure captures |$self| and is stored inside
## |$self|, which forms a reference loop - confirm whether it is replaced
## or removed before the object is discarded.
820 wakaba 1.183 $self->{set_nc} = sub {
821     $self->{nc} = -1;
822 wakaba 1.1 };
## The remaining callbacks are no-ops by default.
823     $self->{parse_error} = sub {
824     #
825     };
826 wakaba 1.63 $self->{change_encoding} = sub {
827     # if ($_[0] is a supported encoding) {
828     # run "change the encoding" algorithm;
829     # throw Whatpm::HTML::RestartParser (charset => $new_encoding);
830     # }
831     };
832 wakaba 1.61 $self->{application_cache_selection} = sub {
833     #
834     };
835 wakaba 1.1 return $self;
836     } # new
837    
838 wakaba 1.40 sub CM_ENTITY () { 0b001 } # bit flag: & markup in data
839     sub CM_LIMITED_MARKUP () { 0b010 } # bit flag: < markup in data (limited)
840     sub CM_FULL_MARKUP () { 0b100 } # bit flag: < markup in data (any)
841    
842     sub PLAINTEXT_CONTENT_MODEL () { 0 } # no CM_* flag set
843     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited "<" markup only
844     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # "&" markup and limited "<" markup
845     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # "&" markup and any "<" markup
846    
847 wakaba 1.57 sub DATA_STATE () { 0 } # tokenizer state numbers, compared against |$self->{state}|; this one is set by |_initialize_tokenizer|
848 wakaba 1.168 #sub ENTITY_DATA_STATE () { 1 } # disabled; see the NOTE before |ENTITY_STATE| below
849 wakaba 1.57 sub TAG_OPEN_STATE () { 2 }
850     sub CLOSE_TAG_OPEN_STATE () { 3 }
851     sub TAG_NAME_STATE () { 4 }
852     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
853     sub ATTRIBUTE_NAME_STATE () { 6 }
854     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
855     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
856     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
857     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
858     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
859 wakaba 1.168 #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 } # disabled; see the NOTE before |ENTITY_STATE| below
860 wakaba 1.57 sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
861     sub COMMENT_START_STATE () { 14 }
862     sub COMMENT_START_DASH_STATE () { 15 }
863     sub COMMENT_STATE () { 16 }
864     sub COMMENT_END_STATE () { 17 }
865     sub COMMENT_END_DASH_STATE () { 18 }
866     sub BOGUS_COMMENT_STATE () { 19 }
867     sub DOCTYPE_STATE () { 20 }
868     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
869     sub DOCTYPE_NAME_STATE () { 22 }
870     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
871     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
872     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
873     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
874     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
875     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
876     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
877     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
878     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
879     sub BOGUS_DOCTYPE_STATE () { 32 }
880 wakaba 1.72 sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
881 wakaba 1.125 sub SELF_CLOSING_START_TAG_STATE () { 34 }
882 wakaba 1.165 sub CDATA_SECTION_STATE () { 35 }
883 wakaba 1.164 sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
884     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
885     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
886 wakaba 1.185 sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
887 wakaba 1.165 sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
888     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
889 wakaba 1.166 sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
890     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
891 wakaba 1.168 ## NOTE: The "entity data state", the "entity in attribute value state",
892     ## and the "consume a character reference" algorithm are jointly
893     ## implemented using the following six states:
894     sub ENTITY_STATE () { 44 }
895     sub ENTITY_HASH_STATE () { 45 }
896     sub NCR_NUM_STATE () { 46 }
897     sub HEXREF_X_STATE () { 47 }
898     sub HEXREF_HEX_STATE () { 48 }
899     sub ENTITY_NAME_STATE () { 49 }
900 wakaba 1.185 sub PCDATA_STATE () { 50 } # "data state" in the spec
901 wakaba 1.57
902 wakaba 1.55 sub DOCTYPE_TOKEN () { 1 } # token type codes, stored in a token's |->{type}|
903     sub COMMENT_TOKEN () { 2 }
904     sub START_TAG_TOKEN () { 3 }
905     sub END_TAG_TOKEN () { 4 }
906     sub END_OF_FILE_TOKEN () { 5 }
907     sub CHARACTER_TOKEN () { 6 }
908    
909 wakaba 1.54 sub AFTER_HTML_IMS () { 0b100 } # single-bit flags; the |*_IM| constants are built by bit-or'ing them
910     sub HEAD_IMS () { 0b1000 }
911     sub BODY_IMS () { 0b10000 }
912 wakaba 1.56 sub BODY_TABLE_IMS () { 0b100000 }
913 wakaba 1.54 sub TABLE_IMS () { 0b1000000 }
914 wakaba 1.56 sub ROW_IMS () { 0b10000000 }
915 wakaba 1.54 sub BODY_AFTER_IMS () { 0b100000000 }
916     sub FRAME_IMS () { 0b1000000000 }
917 wakaba 1.101 sub SELECT_IMS () { 0b10000000000 }
918 wakaba 1.126 sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 } # flag for the "in foreign content" insertion mode
919     ## NOTE: "in foreign content" insertion mode is special; it is combined
920     ## with the secondary insertion mode. In this parser, they are stored
921     ## together in the bit-or'ed form.
922 wakaba 1.54
923 wakaba 1.84 ## NOTE: "initial" and "before html" insertion modes have no constants.
924    
925     ## NOTE: "after after body" insertion mode.
926 wakaba 1.54 sub AFTER_HTML_BODY_IM () { AFTER_HTML_IMS | BODY_AFTER_IMS }
927 wakaba 1.84
928     ## NOTE: "after after frameset" insertion mode.
929 wakaba 1.54 sub AFTER_HTML_FRAMESET_IM () { AFTER_HTML_IMS | FRAME_IMS }
930 wakaba 1.84
931 wakaba 1.54 sub IN_HEAD_IM () { HEAD_IMS | 0b00 } # the low two bits tell apart modes that share the same |*_IMS| flags
932     sub IN_HEAD_NOSCRIPT_IM () { HEAD_IMS | 0b01 }
933     sub AFTER_HEAD_IM () { HEAD_IMS | 0b10 }
934     sub BEFORE_HEAD_IM () { HEAD_IMS | 0b11 }
935     sub IN_BODY_IM () { BODY_IMS }
936 wakaba 1.56 sub IN_CELL_IM () { BODY_IMS | BODY_TABLE_IMS | 0b01 }
937     sub IN_CAPTION_IM () { BODY_IMS | BODY_TABLE_IMS | 0b10 }
938     sub IN_ROW_IM () { TABLE_IMS | ROW_IMS | 0b01 }
939     sub IN_TABLE_BODY_IM () { TABLE_IMS | ROW_IMS | 0b10 }
940 wakaba 1.54 sub IN_TABLE_IM () { TABLE_IMS }
941     sub AFTER_BODY_IM () { BODY_AFTER_IMS }
942     sub IN_FRAMESET_IM () { FRAME_IMS | 0b01 }
943     sub AFTER_FRAMESET_IM () { FRAME_IMS | 0b10 }
944 wakaba 1.101 sub IN_SELECT_IM () { SELECT_IMS | 0b01 }
945     sub IN_SELECT_IN_TABLE_IM () { SELECT_IMS | 0b10 }
946 wakaba 1.54 sub IN_COLUMN_GROUP_IM () { 0b10 } # no |*_IMS| flag set
947    
948 wakaba 1.1 ## Implementations MUST act as if they used the state machine in the spec.
949    
950     sub _initialize_tokenizer ($) { # resets the tokenizer fields of $self before tokenization starts
951     my $self = shift;
952 wakaba 1.57 $self->{state} = DATA_STATE; # MUST
953 wakaba 1.183 #$self->{s_kwd}; # state keyword - initialized when used
954 wakaba 1.169 #$self->{entity__value}; # initialized when used
955     #$self->{entity__match}; # initialized when used
956 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
957 wakaba 1.183 undef $self->{ct}; # current token
958     undef $self->{ca}; # current attribute
959     undef $self->{last_stag_name}; # last emitted start tag name
960 wakaba 1.169 #$self->{prev_state}; # initialized when used
961 wakaba 1.125 delete $self->{self_closing}; # clear the "self-closing flag"
962 wakaba 1.179 $self->{char_buffer} = '';
963     $self->{char_buffer_pos} = 0;
964 wakaba 1.183 $self->{nc} = -1; # next input character
965     #$self->{next_nc}
966 wakaba 1.1 !!!next-input-character;
967     $self->{token} = []; # queue of pending tokens; |_get_next_token| shifts from it before tokenizing
968 wakaba 1.18 # $self->{escape}
969 wakaba 1.1 } # _initialize_tokenizer
970    
971     ## A token has:
972 wakaba 1.55 ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
973     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
974     ## ->{name} (DOCTYPE_TOKEN)
975     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
976 wakaba 1.183 ## ->{pubid} (DOCTYPE_TOKEN)
977     ## ->{sysid} (DOCTYPE_TOKEN)
978 wakaba 1.75 ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
979 wakaba 1.55 ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
980 wakaba 1.66 ## ->{name}
981     ## ->{value}
982     ## ->{has_reference} == 1 or 0
983 wakaba 1.55 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
984 wakaba 1.125 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
985     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
986     ## while the token is pushed back to the stack.
987    
988 wakaba 1.1 ## Emitted token MUST immediately be handled by the tree construction state.
989    
990     ## Before each step, UA MAY check to see if either one of the scripts in
991     ## "list of scripts that will execute as soon as possible" or the first
992     ## script in the "list of scripts that will execute asynchronously",
993     ## has completed loading. If one has, then it MUST be executed
994     ## and removed from the list.
995    
996 wakaba 1.169 ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
997     ## (This requirement was dropped from HTML5 spec, unfortunately.)
998 wakaba 1.59
999 wakaba 1.187 my $is_space = { # lookup table keyed by code point; true for characters the tokenizer treats as white space
1000     0x0009 => 1, # CHARACTER TABULATION (HT)
1001     0x000A => 1, # LINE FEED (LF)
1002     #0x000B => 0, # LINE TABULATION (VT)
1003     0x000C => 1, # FORM FEED (FF)
1004     #0x000D => 1, # CARRIAGE RETURN (CR) -- NOTE(review): disabled; presumably CR never reaches the tokenizer -- confirm
1005     0x0020 => 1, # SPACE (SP)
1006     };
1007    
1008 wakaba 1.1 sub _get_next_token ($) {
1009     my $self = shift;
1010 wakaba 1.125
1011     if ($self->{self_closing}) {
1012 wakaba 1.183 !!!parse-error (type => 'nestc', token => $self->{ct});
1013 wakaba 1.125 ## NOTE: The |self_closing| flag is only set by start tag token.
1014     ## In addition, when a start tag token is emitted, it is always set to
1015 wakaba 1.183 ## |ct|.
1016 wakaba 1.125 delete $self->{self_closing};
1017     }
1018    
1019 wakaba 1.1 if (@{$self->{token}}) {
1020 wakaba 1.125 $self->{self_closing} = $self->{token}->[0]->{self_closing};
1021 wakaba 1.1 return shift @{$self->{token}};
1022     }
1023    
1024     A: {
1025 wakaba 1.185 if ($self->{state} == PCDATA_STATE) {
1026     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
1027    
1028 wakaba 1.183 if ($self->{nc} == 0x0026) { # &
1029 wakaba 1.185 !!!cp (0.1);
1030     ## NOTE: In the spec, the tokenizer is switched to the
1031     ## "entity data state". In this implementation, the tokenizer
1032     ## is switched to the |ENTITY_STATE|, which is an implementation
1033     ## of the "consume a character reference" algorithm.
1034     $self->{entity_add} = -1;
1035     $self->{prev_state} = DATA_STATE;
1036     $self->{state} = ENTITY_STATE;
1037     !!!next-input-character;
1038     redo A;
1039     } elsif ($self->{nc} == 0x003C) { # <
1040     !!!cp (0.2);
1041     $self->{state} = TAG_OPEN_STATE;
1042     !!!next-input-character;
1043     redo A;
1044     } elsif ($self->{nc} == -1) {
1045     !!!cp (0.3);
1046     !!!emit ({type => END_OF_FILE_TOKEN,
1047     line => $self->{line}, column => $self->{column}});
1048     last A; ## TODO: ok?
1049     } else {
1050     !!!cp (0.4);
1051     #
1052     }
1053    
1054     # Anything else
1055     my $token = {type => CHARACTER_TOKEN,
1056     data => chr $self->{nc},
1057     line => $self->{line}, column => $self->{column},
1058     };
1059     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
1060    
1061     ## Stay in the state.
1062     !!!next-input-character;
1063     !!!emit ($token);
1064     redo A;
1065     } elsif ($self->{state} == DATA_STATE) {
1066     $self->{s_kwd} = '' unless defined $self->{s_kwd};
1067     if ($self->{nc} == 0x0026) { # &
1068     $self->{s_kwd} = '';
1069 wakaba 1.72 if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
1070     not $self->{escape}) {
1071 wakaba 1.77 !!!cp (1);
1072 wakaba 1.167 ## NOTE: In the spec, the tokenizer is switched to the
1073     ## "entity data state". In this implementation, the tokenizer
1074     ## is switched to the |ENTITY_STATE|, which is an implementation
1075     ## of the "consume a character reference" algorithm.
1076 wakaba 1.183 $self->{entity_add} = -1;
1077 wakaba 1.169 $self->{prev_state} = DATA_STATE;
1078 wakaba 1.167 $self->{state} = ENTITY_STATE;
1079 wakaba 1.1 !!!next-input-character;
1080     redo A;
1081     } else {
1082 wakaba 1.77 !!!cp (2);
1083 wakaba 1.1 #
1084     }
1085 wakaba 1.183 } elsif ($self->{nc} == 0x002D) { # -
1086 wakaba 1.40 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1087 wakaba 1.185 $self->{s_kwd} .= '-';
1088    
1089 wakaba 1.184 if ($self->{s_kwd} eq '<!--') {
1090     !!!cp (3);
1091     $self->{escape} = 1; # unless $self->{escape};
1092     $self->{s_kwd} = '--';
1093     #
1094     } elsif ($self->{s_kwd} eq '---') {
1095     !!!cp (4);
1096     $self->{s_kwd} = '--';
1097     #
1098 wakaba 1.77 } else {
1099     !!!cp (5);
1100 wakaba 1.184 #
1101 wakaba 1.13 }
1102     }
1103    
1104     #
1105 wakaba 1.184 } elsif ($self->{nc} == 0x0021) { # !
1106 wakaba 1.185 if (length $self->{s_kwd}) {
1107 wakaba 1.184 !!!cp (5.1);
1108     $self->{s_kwd} .= '!';
1109     #
1110     } else {
1111     !!!cp (5.2);
1112 wakaba 1.185 #$self->{s_kwd} = '';
1113 wakaba 1.184 #
1114     }
1115     #
1116 wakaba 1.183 } elsif ($self->{nc} == 0x003C) { # <
1117 wakaba 1.40 if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
1118     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
1119 wakaba 1.13 not $self->{escape})) {
1120 wakaba 1.77 !!!cp (6);
1121 wakaba 1.57 $self->{state} = TAG_OPEN_STATE;
1122 wakaba 1.1 !!!next-input-character;
1123     redo A;
1124     } else {
1125 wakaba 1.77 !!!cp (7);
1126 wakaba 1.185 $self->{s_kwd} = '';
1127 wakaba 1.1 #
1128     }
1129 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
1130 wakaba 1.13 if ($self->{escape} and
1131 wakaba 1.40 ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
1132 wakaba 1.185 if ($self->{s_kwd} eq '--') {
1133 wakaba 1.77 !!!cp (8);
1134 wakaba 1.13 delete $self->{escape};
1135 wakaba 1.77 } else {
1136     !!!cp (9);
1137 wakaba 1.13 }
1138 wakaba 1.77 } else {
1139     !!!cp (10);
1140 wakaba 1.13 }
1141    
1142 wakaba 1.185 $self->{s_kwd} = '';
1143 wakaba 1.13 #
1144 wakaba 1.183 } elsif ($self->{nc} == -1) {
1145 wakaba 1.77 !!!cp (11);
1146 wakaba 1.185 $self->{s_kwd} = '';
1147 wakaba 1.112 !!!emit ({type => END_OF_FILE_TOKEN,
1148     line => $self->{line}, column => $self->{column}});
1149 wakaba 1.1 last A; ## TODO: ok?
1150 wakaba 1.77 } else {
1151     !!!cp (12);
1152 wakaba 1.185 $self->{s_kwd} = '';
1153 wakaba 1.184 #
1154 wakaba 1.1 }
1155 wakaba 1.184
1156 wakaba 1.1 # Anything else
1157 wakaba 1.55 my $token = {type => CHARACTER_TOKEN,
1158 wakaba 1.183 data => chr $self->{nc},
1159 wakaba 1.120 line => $self->{line}, column => $self->{column},
1160 wakaba 1.118 };
1161 wakaba 1.184 if ($self->{read_until}->($token->{data}, q[-!<>&],
1162     length $token->{data})) {
1163 wakaba 1.185 $self->{s_kwd} = '';
1164 wakaba 1.184 }
1165 wakaba 1.171
1166 wakaba 1.185 ## Stay in the data state.
1167     if ($self->{content_model} == PCDATA_CONTENT_MODEL) {
1168     !!!cp (13);
1169     $self->{state} = PCDATA_STATE;
1170     } else {
1171     !!!cp (14);
1172     ## Stay in the state.
1173     }
1174 wakaba 1.1 !!!next-input-character;
1175     !!!emit ($token);
1176     redo A;
1177 wakaba 1.57 } elsif ($self->{state} == TAG_OPEN_STATE) {
1178 wakaba 1.40 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1179 wakaba 1.183 if ($self->{nc} == 0x002F) { # /
1180 wakaba 1.77 !!!cp (15);
1181 wakaba 1.1 !!!next-input-character;
1182 wakaba 1.57 $self->{state} = CLOSE_TAG_OPEN_STATE;
1183 wakaba 1.1 redo A;
1184 wakaba 1.184 } elsif ($self->{nc} == 0x0021) { # !
1185     !!!cp (15.1);
1186     $self->{s_kwd} = '<' unless $self->{escape};
1187     #
1188 wakaba 1.1 } else {
1189 wakaba 1.77 !!!cp (16);
1190 wakaba 1.184 #
1191     }
1192 wakaba 1.1
1193 wakaba 1.184 ## reconsume
1194     $self->{state} = DATA_STATE;
1195     !!!emit ({type => CHARACTER_TOKEN, data => '<',
1196     line => $self->{line_prev},
1197     column => $self->{column_prev},
1198     });
1199     redo A;
1200 wakaba 1.40 } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
1201 wakaba 1.183 if ($self->{nc} == 0x0021) { # !
1202 wakaba 1.77 !!!cp (17);
1203 wakaba 1.57 $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
1204 wakaba 1.1 !!!next-input-character;
1205     redo A;
1206 wakaba 1.183 } elsif ($self->{nc} == 0x002F) { # /
1207 wakaba 1.77 !!!cp (18);
1208 wakaba 1.57 $self->{state} = CLOSE_TAG_OPEN_STATE;
1209 wakaba 1.1 !!!next-input-character;
1210     redo A;
1211 wakaba 1.183 } elsif (0x0041 <= $self->{nc} and
1212     $self->{nc} <= 0x005A) { # A..Z
1213 wakaba 1.77 !!!cp (19);
1214 wakaba 1.183 $self->{ct}
1215 wakaba 1.55 = {type => START_TAG_TOKEN,
1216 wakaba 1.183 tag_name => chr ($self->{nc} + 0x0020),
1217 wakaba 1.112 line => $self->{line_prev},
1218     column => $self->{column_prev}};
1219 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
1220 wakaba 1.1 !!!next-input-character;
1221     redo A;
1222 wakaba 1.183 } elsif (0x0061 <= $self->{nc} and
1223     $self->{nc} <= 0x007A) { # a..z
1224 wakaba 1.77 !!!cp (20);
1225 wakaba 1.183 $self->{ct} = {type => START_TAG_TOKEN,
1226     tag_name => chr ($self->{nc}),
1227 wakaba 1.112 line => $self->{line_prev},
1228     column => $self->{column_prev}};
1229 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
1230 wakaba 1.1 !!!next-input-character;
1231     redo A;
1232 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
1233 wakaba 1.77 !!!cp (21);
1234 wakaba 1.115 !!!parse-error (type => 'empty start tag',
1235     line => $self->{line_prev},
1236     column => $self->{column_prev});
1237 wakaba 1.57 $self->{state} = DATA_STATE;
1238 wakaba 1.1 !!!next-input-character;
1239    
1240 wakaba 1.112 !!!emit ({type => CHARACTER_TOKEN, data => '<>',
1241 wakaba 1.120 line => $self->{line_prev},
1242     column => $self->{column_prev},
1243 wakaba 1.118 });
1244 wakaba 1.1
1245     redo A;
1246 wakaba 1.183 } elsif ($self->{nc} == 0x003F) { # ?
1247 wakaba 1.77 !!!cp (22);
1248 wakaba 1.115 !!!parse-error (type => 'pio',
1249     line => $self->{line_prev},
1250     column => $self->{column_prev});
1251 wakaba 1.57 $self->{state} = BOGUS_COMMENT_STATE;
1252 wakaba 1.183 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1253 wakaba 1.120 line => $self->{line_prev},
1254     column => $self->{column_prev},
1255 wakaba 1.118 };
1256 wakaba 1.183 ## $self->{nc} is intentionally left as is
1257 wakaba 1.1 redo A;
1258     } else {
1259 wakaba 1.77 !!!cp (23);
1260 wakaba 1.136 !!!parse-error (type => 'bare stago',
1261     line => $self->{line_prev},
1262     column => $self->{column_prev});
1263 wakaba 1.57 $self->{state} = DATA_STATE;
1264 wakaba 1.1 ## reconsume
1265    
1266 wakaba 1.112 !!!emit ({type => CHARACTER_TOKEN, data => '<',
1267 wakaba 1.120 line => $self->{line_prev},
1268     column => $self->{column_prev},
1269 wakaba 1.118 });
1270 wakaba 1.1
1271     redo A;
1272     }
1273     } else {
1274 wakaba 1.40 die "$0: $self->{content_model} in tag open";
1275 wakaba 1.1 }
1276 wakaba 1.57 } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
1277 wakaba 1.164 ## NOTE: The "close tag open state" in the spec is implemented as
1278 wakaba 1.185 ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
1279 wakaba 1.164
1280 wakaba 1.113 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
1281 wakaba 1.40 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1282 wakaba 1.183 if (defined $self->{last_stag_name}) {
1283 wakaba 1.185 $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
1284 wakaba 1.183 $self->{s_kwd} = '';
1285 wakaba 1.164 ## Reconsume.
1286     redo A;
1287 wakaba 1.23 } else {
1288     ## No start tag token has ever been emitted
1289 wakaba 1.164 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
1290 wakaba 1.77 !!!cp (28);
1291 wakaba 1.57 $self->{state} = DATA_STATE;
1292 wakaba 1.164 ## Reconsume.
1293 wakaba 1.112 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1294 wakaba 1.120 line => $l, column => $c,
1295 wakaba 1.118 });
1296 wakaba 1.1 redo A;
1297     }
1298     }
1299 wakaba 1.164
1300 wakaba 1.183 if (0x0041 <= $self->{nc} and
1301     $self->{nc} <= 0x005A) { # A..Z
1302 wakaba 1.77 !!!cp (29);
1303 wakaba 1.183 $self->{ct}
1304 wakaba 1.112 = {type => END_TAG_TOKEN,
1305 wakaba 1.183 tag_name => chr ($self->{nc} + 0x0020),
1306 wakaba 1.112 line => $l, column => $c};
1307 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
1308 wakaba 1.1 !!!next-input-character;
1309     redo A;
1310 wakaba 1.183 } elsif (0x0061 <= $self->{nc} and
1311     $self->{nc} <= 0x007A) { # a..z
1312 wakaba 1.77 !!!cp (30);
1313 wakaba 1.183 $self->{ct} = {type => END_TAG_TOKEN,
1314     tag_name => chr ($self->{nc}),
1315 wakaba 1.112 line => $l, column => $c};
1316 wakaba 1.57 $self->{state} = TAG_NAME_STATE;
1317 wakaba 1.1 !!!next-input-character;
1318     redo A;
1319 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
1320 wakaba 1.77 !!!cp (31);
1321 wakaba 1.115 !!!parse-error (type => 'empty end tag',
1322     line => $self->{line_prev}, ## "<" in "</>"
1323     column => $self->{column_prev} - 1);
1324 wakaba 1.57 $self->{state} = DATA_STATE;
1325 wakaba 1.1 !!!next-input-character;
1326     redo A;
1327 wakaba 1.183 } elsif ($self->{nc} == -1) {
1328 wakaba 1.77 !!!cp (32);
1329 wakaba 1.3 !!!parse-error (type => 'bare etago');
1330 wakaba 1.57 $self->{state} = DATA_STATE;
1331 wakaba 1.1 # reconsume
1332    
1333 wakaba 1.112 !!!emit ({type => CHARACTER_TOKEN, data => '</',
1334 wakaba 1.120 line => $l, column => $c,
1335 wakaba 1.118 });
1336 wakaba 1.1
1337     redo A;
1338     } else {
1339 wakaba 1.77 !!!cp (33);
1340 wakaba 1.3 !!!parse-error (type => 'bogus end tag');
1341 wakaba 1.57 $self->{state} = BOGUS_COMMENT_STATE;
1342 wakaba 1.183 $self->{ct} = {type => COMMENT_TOKEN, data => '',
1343 wakaba 1.120 line => $self->{line_prev}, # "<" of "</"
1344     column => $self->{column_prev} - 1,
1345 wakaba 1.118 };
1346 wakaba 1.183 ## NOTE: $self->{nc} is intentionally left as is.
1347 wakaba 1.164 ## Although the "anything else" case of the spec not explicitly
1348     ## states that the next input character is to be reconsumed,
1349     ## it will be included to the |data| of the comment token
1350     ## generated from the bogus end tag, as defined in the
1351     ## "bogus comment state" entry.
1352     redo A;
1353     }
1354 wakaba 1.185 } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
1355 wakaba 1.183 my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
1356 wakaba 1.164 if (length $ch) {
1357     my $CH = $ch;
1358     $ch =~ tr/a-z/A-Z/;
1359 wakaba 1.183 my $nch = chr $self->{nc};
1360 wakaba 1.164 if ($nch eq $ch or $nch eq $CH) {
1361     !!!cp (24);
1362     ## Stay in the state.
1363 wakaba 1.183 $self->{s_kwd} .= $nch;
1364 wakaba 1.164 !!!next-input-character;
1365     redo A;
1366     } else {
1367     !!!cp (25);
1368     $self->{state} = DATA_STATE;
1369     ## Reconsume.
1370     !!!emit ({type => CHARACTER_TOKEN,
1371 wakaba 1.183 data => '</' . $self->{s_kwd},
1372 wakaba 1.164 line => $self->{line_prev},
1373 wakaba 1.183 column => $self->{column_prev} - 1 - length $self->{s_kwd},
1374 wakaba 1.164 });
1375     redo A;
1376     }
1377     } else { # after "<{tag-name}"
1378 wakaba 1.187 unless ($is_space->{$self->{nc}} or
1379     {
1380 wakaba 1.164 0x003E => 1, # >
1381     0x002F => 1, # /
1382     -1 => 1, # EOF
1383 wakaba 1.183 }->{$self->{nc}}) {
1384 wakaba 1.164 !!!cp (26);
1385     ## Reconsume.
1386     $self->{state} = DATA_STATE;
1387     !!!emit ({type => CHARACTER_TOKEN,
1388 wakaba 1.183 data => '</' . $self->{s_kwd},
1389 wakaba 1.164 line => $self->{line_prev},
1390 wakaba 1.183 column => $self->{column_prev} - 1 - length $self->{s_kwd},
1391 wakaba 1.164 });
1392     redo A;
1393     } else {
1394     !!!cp (27);
1395 wakaba 1.183 $self->{ct}
1396 wakaba 1.164 = {type => END_TAG_TOKEN,
1397 wakaba 1.183 tag_name => $self->{last_stag_name},
1398 wakaba 1.164 line => $self->{line_prev},
1399 wakaba 1.183 column => $self->{column_prev} - 1 - length $self->{s_kwd}};
1400 wakaba 1.164 $self->{state} = TAG_NAME_STATE;
1401     ## Reconsume.
1402     redo A;
1403     }
1404 wakaba 1.1 }
1405 wakaba 1.57 } elsif ($self->{state} == TAG_NAME_STATE) {
1406 wakaba 1.187 if ($is_space->{$self->{nc}}) {
1407 wakaba 1.77 !!!cp (34);
1408 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1409 wakaba 1.1 !!!next-input-character;
1410     redo A;
1411 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
1412     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1413 wakaba 1.77 !!!cp (35);
1414 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1415     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1416 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1417 wakaba 1.183 #if ($self->{ct}->{attributes}) {
1418 wakaba 1.78 # ## NOTE: This should never be reached.
1419     # !!! cp (36);
1420     # !!! parse-error (type => 'end tag attribute');
1421     #} else {
1422 wakaba 1.77 !!!cp (37);
1423 wakaba 1.78 #}
1424 wakaba 1.1 } else {
1425 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1426 wakaba 1.1 }
1427 wakaba 1.57 $self->{state} = DATA_STATE;
1428 wakaba 1.1 !!!next-input-character;
1429    
1430 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1431 wakaba 1.1
1432     redo A;
1433 wakaba 1.183 } elsif (0x0041 <= $self->{nc} and
1434     $self->{nc} <= 0x005A) { # A..Z
1435 wakaba 1.77 !!!cp (38);
1436 wakaba 1.183 $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);
1437 wakaba 1.1 # start tag or end tag
1438     ## Stay in this state
1439     !!!next-input-character;
1440     redo A;
1441 wakaba 1.183 } elsif ($self->{nc} == -1) {
1442 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
1443 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1444 wakaba 1.77 !!!cp (39);
1445 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1446     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1447 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1448 wakaba 1.183 #if ($self->{ct}->{attributes}) {
1449 wakaba 1.78 # ## NOTE: This state should never be reached.
1450     # !!! cp (40);
1451     # !!! parse-error (type => 'end tag attribute');
1452     #} else {
1453 wakaba 1.77 !!!cp (41);
1454 wakaba 1.78 #}
1455 wakaba 1.1 } else {
1456 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1457 wakaba 1.1 }
1458 wakaba 1.57 $self->{state} = DATA_STATE;
1459 wakaba 1.1 # reconsume
1460    
1461 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1462 wakaba 1.1
1463     redo A;
1464 wakaba 1.183 } elsif ($self->{nc} == 0x002F) { # /
1465 wakaba 1.125 !!!cp (42);
1466     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1467 wakaba 1.1 !!!next-input-character;
1468     redo A;
1469     } else {
1470 wakaba 1.77 !!!cp (44);
1471 wakaba 1.183 $self->{ct}->{tag_name} .= chr $self->{nc};
1472 wakaba 1.1 # start tag or end tag
1473     ## Stay in the state
1474     !!!next-input-character;
1475     redo A;
1476     }
1477 wakaba 1.57 } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1478 wakaba 1.187 if ($is_space->{$self->{nc}}) {
1479 wakaba 1.77 !!!cp (45);
1480 wakaba 1.1 ## Stay in the state
1481     !!!next-input-character;
1482     redo A;
1483 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
1484     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1485 wakaba 1.77 !!!cp (46);
1486 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1487     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1488 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1489 wakaba 1.183 if ($self->{ct}->{attributes}) {
1490 wakaba 1.77 !!!cp (47);
1491 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1492 wakaba 1.77 } else {
1493     !!!cp (48);
1494 wakaba 1.1 }
1495     } else {
1496 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1497 wakaba 1.1 }
1498 wakaba 1.57 $self->{state} = DATA_STATE;
1499 wakaba 1.1 !!!next-input-character;
1500    
1501 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1502 wakaba 1.1
1503     redo A;
1504 wakaba 1.183 } elsif (0x0041 <= $self->{nc} and
1505     $self->{nc} <= 0x005A) { # A..Z
1506 wakaba 1.77 !!!cp (49);
1507 wakaba 1.183 $self->{ca}
1508     = {name => chr ($self->{nc} + 0x0020),
1509 wakaba 1.119 value => '',
1510     line => $self->{line}, column => $self->{column}};
1511 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
1512 wakaba 1.1 !!!next-input-character;
1513     redo A;
1514 wakaba 1.183 } elsif ($self->{nc} == 0x002F) { # /
1515 wakaba 1.125 !!!cp (50);
1516     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1517 wakaba 1.1 !!!next-input-character;
1518     redo A;
1519 wakaba 1.183 } elsif ($self->{nc} == -1) {
1520 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
1521 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1522 wakaba 1.77 !!!cp (52);
1523 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1524     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1525 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1526 wakaba 1.183 if ($self->{ct}->{attributes}) {
1527 wakaba 1.77 !!!cp (53);
1528 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1529 wakaba 1.77 } else {
1530     !!!cp (54);
1531 wakaba 1.1 }
1532     } else {
1533 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1534 wakaba 1.1 }
1535 wakaba 1.57 $self->{state} = DATA_STATE;
1536 wakaba 1.1 # reconsume
1537    
1538 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1539 wakaba 1.1
1540     redo A;
1541     } else {
1542 wakaba 1.72 if ({
1543     0x0022 => 1, # "
1544     0x0027 => 1, # '
1545     0x003D => 1, # =
1546 wakaba 1.183 }->{$self->{nc}}) {
1547 wakaba 1.77 !!!cp (55);
1548 wakaba 1.72 !!!parse-error (type => 'bad attribute name');
1549 wakaba 1.77 } else {
1550     !!!cp (56);
1551 wakaba 1.72 }
1552 wakaba 1.183 $self->{ca}
1553     = {name => chr ($self->{nc}),
1554 wakaba 1.119 value => '',
1555     line => $self->{line}, column => $self->{column}};
1556 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
1557 wakaba 1.1 !!!next-input-character;
1558     redo A;
1559     }
1560 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1561 wakaba 1.1 my $before_leave = sub {
1562 wakaba 1.183 if (exists $self->{ct}->{attributes} # start tag or end tag
1563     ->{$self->{ca}->{name}}) { # MUST
1564 wakaba 1.77 !!!cp (57);
1565 wakaba 1.183 !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1566     ## Discard $self->{ca} # MUST
1567 wakaba 1.1 } else {
1568 wakaba 1.77 !!!cp (58);
1569 wakaba 1.183 $self->{ct}->{attributes}->{$self->{ca}->{name}}
1570     = $self->{ca};
1571 wakaba 1.1 }
1572     }; # $before_leave
1573    
1574 wakaba 1.187 if ($is_space->{$self->{nc}}) {
1575 wakaba 1.77 !!!cp (59);
1576 wakaba 1.1 $before_leave->();
1577 wakaba 1.57 $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1578 wakaba 1.1 !!!next-input-character;
1579     redo A;
1580 wakaba 1.183 } elsif ($self->{nc} == 0x003D) { # =
1581 wakaba 1.77 !!!cp (60);
1582 wakaba 1.1 $before_leave->();
1583 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1584 wakaba 1.1 !!!next-input-character;
1585     redo A;
1586 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
1587 wakaba 1.1 $before_leave->();
1588 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1589 wakaba 1.77 !!!cp (61);
1590 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1591     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1592 wakaba 1.77 !!!cp (62);
1593 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1594 wakaba 1.183 if ($self->{ct}->{attributes}) {
1595 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1596 wakaba 1.1 }
1597     } else {
1598 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1599 wakaba 1.1 }
1600 wakaba 1.57 $self->{state} = DATA_STATE;
1601 wakaba 1.1 !!!next-input-character;
1602    
1603 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1604 wakaba 1.1
1605     redo A;
1606 wakaba 1.183 } elsif (0x0041 <= $self->{nc} and
1607     $self->{nc} <= 0x005A) { # A..Z
1608 wakaba 1.77 !!!cp (63);
1609 wakaba 1.183 $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);
1610 wakaba 1.1 ## Stay in the state
1611     !!!next-input-character;
1612     redo A;
1613 wakaba 1.183 } elsif ($self->{nc} == 0x002F) { # /
1614 wakaba 1.125 !!!cp (64);
1615 wakaba 1.1 $before_leave->();
1616 wakaba 1.125 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1617 wakaba 1.1 !!!next-input-character;
1618     redo A;
1619 wakaba 1.183 } elsif ($self->{nc} == -1) {
1620 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
1621 wakaba 1.1 $before_leave->();
1622 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1623 wakaba 1.77 !!!cp (66);
1624 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1625     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1626 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1627 wakaba 1.183 if ($self->{ct}->{attributes}) {
1628 wakaba 1.77 !!!cp (67);
1629 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1630 wakaba 1.77 } else {
1631 wakaba 1.78 ## NOTE: This state should never be reached.
1632 wakaba 1.77 !!!cp (68);
1633 wakaba 1.1 }
1634     } else {
1635 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1636 wakaba 1.1 }
1637 wakaba 1.57 $self->{state} = DATA_STATE;
1638 wakaba 1.1 # reconsume
1639    
1640 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1641 wakaba 1.1
1642     redo A;
1643     } else {
1644 wakaba 1.183 if ($self->{nc} == 0x0022 or # "
1645     $self->{nc} == 0x0027) { # '
1646 wakaba 1.77 !!!cp (69);
1647 wakaba 1.72 !!!parse-error (type => 'bad attribute name');
1648 wakaba 1.77 } else {
1649     !!!cp (70);
1650 wakaba 1.72 }
1651 wakaba 1.183 $self->{ca}->{name} .= chr ($self->{nc});
1652 wakaba 1.1 ## Stay in the state
1653     !!!next-input-character;
1654     redo A;
1655     }
1656 wakaba 1.57 } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1657 wakaba 1.187 if ($is_space->{$self->{nc}}) {
1658 wakaba 1.77 !!!cp (71);
1659 wakaba 1.1 ## Stay in the state
1660     !!!next-input-character;
1661     redo A;
1662 wakaba 1.183 } elsif ($self->{nc} == 0x003D) { # =
1663 wakaba 1.77 !!!cp (72);
1664 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1665 wakaba 1.1 !!!next-input-character;
1666     redo A;
1667 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
1668     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1669 wakaba 1.77 !!!cp (73);
1670 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1671     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1672 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1673 wakaba 1.183 if ($self->{ct}->{attributes}) {
1674 wakaba 1.77 !!!cp (74);
1675 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1676 wakaba 1.77 } else {
1677 wakaba 1.78 ## NOTE: This state should never be reached.
1678 wakaba 1.77 !!!cp (75);
1679 wakaba 1.1 }
1680     } else {
1681 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1682 wakaba 1.1 }
1683 wakaba 1.57 $self->{state} = DATA_STATE;
1684 wakaba 1.1 !!!next-input-character;
1685    
1686 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1687 wakaba 1.1
1688     redo A;
1689 wakaba 1.183 } elsif (0x0041 <= $self->{nc} and
1690     $self->{nc} <= 0x005A) { # A..Z
1691 wakaba 1.77 !!!cp (76);
1692 wakaba 1.183 $self->{ca}
1693     = {name => chr ($self->{nc} + 0x0020),
1694 wakaba 1.119 value => '',
1695     line => $self->{line}, column => $self->{column}};
1696 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
1697 wakaba 1.1 !!!next-input-character;
1698     redo A;
1699 wakaba 1.183 } elsif ($self->{nc} == 0x002F) { # /
1700 wakaba 1.125 !!!cp (77);
1701     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1702 wakaba 1.1 !!!next-input-character;
1703     redo A;
1704 wakaba 1.183 } elsif ($self->{nc} == -1) {
1705 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
1706 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1707 wakaba 1.77 !!!cp (79);
1708 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1709     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1710 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1711 wakaba 1.183 if ($self->{ct}->{attributes}) {
1712 wakaba 1.77 !!!cp (80);
1713 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1714 wakaba 1.77 } else {
1715 wakaba 1.78 ## NOTE: This state should never be reached.
1716 wakaba 1.77 !!!cp (81);
1717 wakaba 1.1 }
1718     } else {
1719 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1720 wakaba 1.1 }
1721 wakaba 1.57 $self->{state} = DATA_STATE;
1722 wakaba 1.1 # reconsume
1723    
1724 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1725 wakaba 1.1
1726     redo A;
1727     } else {
1728 wakaba 1.183 if ($self->{nc} == 0x0022 or # "
1729     $self->{nc} == 0x0027) { # '
1730 wakaba 1.156 !!!cp (78);
1731     !!!parse-error (type => 'bad attribute name');
1732     } else {
1733     !!!cp (82);
1734     }
1735 wakaba 1.183 $self->{ca}
1736     = {name => chr ($self->{nc}),
1737 wakaba 1.119 value => '',
1738     line => $self->{line}, column => $self->{column}};
1739 wakaba 1.57 $self->{state} = ATTRIBUTE_NAME_STATE;
1740 wakaba 1.1 !!!next-input-character;
1741     redo A;
1742     }
1743 wakaba 1.57 } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1744 wakaba 1.187 if ($is_space->{$self->{nc}}) {
1745 wakaba 1.77 !!!cp (83);
1746 wakaba 1.1 ## Stay in the state
1747     !!!next-input-character;
1748     redo A;
1749 wakaba 1.183 } elsif ($self->{nc} == 0x0022) { # "
1750 wakaba 1.77 !!!cp (84);
1751 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1752 wakaba 1.1 !!!next-input-character;
1753     redo A;
1754 wakaba 1.183 } elsif ($self->{nc} == 0x0026) { # &
1755 wakaba 1.77 !!!cp (85);
1756 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1757 wakaba 1.1 ## reconsume
1758     redo A;
1759 wakaba 1.183 } elsif ($self->{nc} == 0x0027) { # '
1760 wakaba 1.77 !!!cp (86);
1761 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1762 wakaba 1.1 !!!next-input-character;
1763     redo A;
1764 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
1765 wakaba 1.156 !!!parse-error (type => 'empty unquoted attribute value');
1766 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1767 wakaba 1.77 !!!cp (87);
1768 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1769     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1770 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1771 wakaba 1.183 if ($self->{ct}->{attributes}) {
1772 wakaba 1.77 !!!cp (88);
1773 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1774 wakaba 1.77 } else {
1775 wakaba 1.78 ## NOTE: This state should never be reached.
1776 wakaba 1.77 !!!cp (89);
1777 wakaba 1.1 }
1778     } else {
1779 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1780 wakaba 1.1 }
1781 wakaba 1.57 $self->{state} = DATA_STATE;
1782 wakaba 1.1 !!!next-input-character;
1783    
1784 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1785 wakaba 1.1
1786     redo A;
1787 wakaba 1.183 } elsif ($self->{nc} == -1) {
1788 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
1789 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1790 wakaba 1.77 !!!cp (90);
1791 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1792     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1793 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1794 wakaba 1.183 if ($self->{ct}->{attributes}) {
1795 wakaba 1.77 !!!cp (91);
1796 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1797 wakaba 1.77 } else {
1798 wakaba 1.78 ## NOTE: This state should never be reached.
1799 wakaba 1.77 !!!cp (92);
1800 wakaba 1.1 }
1801     } else {
1802 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1803 wakaba 1.1 }
1804 wakaba 1.57 $self->{state} = DATA_STATE;
1805 wakaba 1.1 ## reconsume
1806    
1807 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1808 wakaba 1.1
1809     redo A;
1810     } else {
1811 wakaba 1.183 if ($self->{nc} == 0x003D) { # =
1812 wakaba 1.77 !!!cp (93);
1813 wakaba 1.72 !!!parse-error (type => 'bad attribute value');
1814 wakaba 1.77 } else {
1815     !!!cp (94);
1816 wakaba 1.72 }
1817 wakaba 1.183 $self->{ca}->{value} .= chr ($self->{nc});
1818 wakaba 1.57 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1819 wakaba 1.1 !!!next-input-character;
1820     redo A;
1821     }
1822 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1823 wakaba 1.183 if ($self->{nc} == 0x0022) { # "
1824 wakaba 1.77 !!!cp (95);
1825 wakaba 1.72 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1826 wakaba 1.1 !!!next-input-character;
1827     redo A;
1828 wakaba 1.183 } elsif ($self->{nc} == 0x0026) { # &
1829 wakaba 1.77 !!!cp (96);
1830 wakaba 1.167 ## NOTE: In the spec, the tokenizer is switched to the
1831     ## "entity in attribute value state". In this implementation, the
1832     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1833     ## implementation of the "consume a character reference" algorithm.
1834 wakaba 1.169 $self->{prev_state} = $self->{state};
1835 wakaba 1.183 $self->{entity_add} = 0x0022; # "
1836 wakaba 1.167 $self->{state} = ENTITY_STATE;
1837 wakaba 1.1 !!!next-input-character;
1838     redo A;
1839 wakaba 1.183 } elsif ($self->{nc} == -1) {
1840 wakaba 1.3 !!!parse-error (type => 'unclosed attribute value');
1841 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1842 wakaba 1.77 !!!cp (97);
1843 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1844     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1845 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1846 wakaba 1.183 if ($self->{ct}->{attributes}) {
1847 wakaba 1.77 !!!cp (98);
1848 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1849 wakaba 1.77 } else {
1850 wakaba 1.78 ## NOTE: This state should never be reached.
1851 wakaba 1.77 !!!cp (99);
1852 wakaba 1.1 }
1853     } else {
1854 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1855 wakaba 1.1 }
1856 wakaba 1.57 $self->{state} = DATA_STATE;
1857 wakaba 1.1 ## reconsume
1858    
1859 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1860 wakaba 1.1
1861     redo A;
1862     } else {
1863 wakaba 1.77 !!!cp (100);
1864 wakaba 1.183 $self->{ca}->{value} .= chr ($self->{nc});
1865     $self->{read_until}->($self->{ca}->{value},
1866 wakaba 1.173 q["&],
1867 wakaba 1.183 length $self->{ca}->{value});
1868 wakaba 1.173
1869 wakaba 1.1 ## Stay in the state
1870     !!!next-input-character;
1871     redo A;
1872     }
1873 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1874 wakaba 1.183 if ($self->{nc} == 0x0027) { # '
1875 wakaba 1.77 !!!cp (101);
1876 wakaba 1.72 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1877 wakaba 1.1 !!!next-input-character;
1878     redo A;
1879 wakaba 1.183 } elsif ($self->{nc} == 0x0026) { # &
1880 wakaba 1.77 !!!cp (102);
1881 wakaba 1.167 ## NOTE: In the spec, the tokenizer is switched to the
1882     ## "entity in attribute value state". In this implementation, the
1883     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1884     ## implementation of the "consume a character reference" algorithm.
1885 wakaba 1.183 $self->{entity_add} = 0x0027; # '
1886 wakaba 1.169 $self->{prev_state} = $self->{state};
1887 wakaba 1.167 $self->{state} = ENTITY_STATE;
1888 wakaba 1.1 !!!next-input-character;
1889     redo A;
1890 wakaba 1.183 } elsif ($self->{nc} == -1) {
1891 wakaba 1.3 !!!parse-error (type => 'unclosed attribute value');
1892 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1893 wakaba 1.77 !!!cp (103);
1894 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1895     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1896 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1897 wakaba 1.183 if ($self->{ct}->{attributes}) {
1898 wakaba 1.77 !!!cp (104);
1899 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1900 wakaba 1.77 } else {
1901 wakaba 1.78 ## NOTE: This state should never be reached.
1902 wakaba 1.77 !!!cp (105);
1903 wakaba 1.1 }
1904     } else {
1905 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1906 wakaba 1.1 }
1907 wakaba 1.57 $self->{state} = DATA_STATE;
1908 wakaba 1.1 ## reconsume
1909    
1910 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1911 wakaba 1.1
1912     redo A;
1913     } else {
1914 wakaba 1.77 !!!cp (106);
1915 wakaba 1.183 $self->{ca}->{value} .= chr ($self->{nc});
1916     $self->{read_until}->($self->{ca}->{value},
1917 wakaba 1.173 q['&],
1918 wakaba 1.183 length $self->{ca}->{value});
1919 wakaba 1.173
1920 wakaba 1.1 ## Stay in the state
1921     !!!next-input-character;
1922     redo A;
1923     }
1924 wakaba 1.57 } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1925 wakaba 1.187 if ($is_space->{$self->{nc}}) {
1926 wakaba 1.77 !!!cp (107);
1927 wakaba 1.57 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1928 wakaba 1.1 !!!next-input-character;
1929     redo A;
1930 wakaba 1.183 } elsif ($self->{nc} == 0x0026) { # &
1931 wakaba 1.77 !!!cp (108);
1932 wakaba 1.167 ## NOTE: In the spec, the tokenizer is switched to the
1933     ## "entity in attribute value state". In this implementation, the
1934     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1935     ## implementation of the "consume a character reference" algorithm.
1936 wakaba 1.183 $self->{entity_add} = -1;
1937 wakaba 1.169 $self->{prev_state} = $self->{state};
1938 wakaba 1.167 $self->{state} = ENTITY_STATE;
1939 wakaba 1.1 !!!next-input-character;
1940     redo A;
1941 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
1942     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1943 wakaba 1.77 !!!cp (109);
1944 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1945     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1946 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1947 wakaba 1.183 if ($self->{ct}->{attributes}) {
1948 wakaba 1.77 !!!cp (110);
1949 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1950 wakaba 1.77 } else {
1951 wakaba 1.78 ## NOTE: This state should never be reached.
1952 wakaba 1.77 !!!cp (111);
1953 wakaba 1.1 }
1954     } else {
1955 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1956 wakaba 1.1 }
1957 wakaba 1.57 $self->{state} = DATA_STATE;
1958 wakaba 1.1 !!!next-input-character;
1959    
1960 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1961 wakaba 1.1
1962     redo A;
1963 wakaba 1.183 } elsif ($self->{nc} == -1) {
1964 wakaba 1.3 !!!parse-error (type => 'unclosed tag');
1965 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1966 wakaba 1.77 !!!cp (112);
1967 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
1968     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1969 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1970 wakaba 1.183 if ($self->{ct}->{attributes}) {
1971 wakaba 1.77 !!!cp (113);
1972 wakaba 1.3 !!!parse-error (type => 'end tag attribute');
1973 wakaba 1.77 } else {
1974 wakaba 1.78 ## NOTE: This state should never be reached.
1975 wakaba 1.77 !!!cp (114);
1976 wakaba 1.1 }
1977     } else {
1978 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
1979 wakaba 1.1 }
1980 wakaba 1.57 $self->{state} = DATA_STATE;
1981 wakaba 1.1 ## reconsume
1982    
1983 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
1984 wakaba 1.1
1985     redo A;
1986     } else {
1987 wakaba 1.72 if ({
1988     0x0022 => 1, # "
1989     0x0027 => 1, # '
1990     0x003D => 1, # =
1991 wakaba 1.183 }->{$self->{nc}}) {
1992 wakaba 1.77 !!!cp (115);
1993 wakaba 1.72 !!!parse-error (type => 'bad attribute value');
1994 wakaba 1.77 } else {
1995     !!!cp (116);
1996 wakaba 1.72 }
1997 wakaba 1.183 $self->{ca}->{value} .= chr ($self->{nc});
1998     $self->{read_until}->($self->{ca}->{value},
1999 wakaba 1.173 q["'=& >],
2000 wakaba 1.183 length $self->{ca}->{value});
2001 wakaba 1.173
2002 wakaba 1.1 ## Stay in the state
2003     !!!next-input-character;
2004     redo A;
2005     }
2006 wakaba 1.72 } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2007 wakaba 1.187 if ($is_space->{$self->{nc}}) {
2008 wakaba 1.77 !!!cp (118);
2009 wakaba 1.72 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2010     !!!next-input-character;
2011     redo A;
2012 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2013     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2014 wakaba 1.77 !!!cp (119);
2015 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
2016     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2017 wakaba 1.72 $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2018 wakaba 1.183 if ($self->{ct}->{attributes}) {
2019 wakaba 1.77 !!!cp (120);
2020 wakaba 1.72 !!!parse-error (type => 'end tag attribute');
2021 wakaba 1.77 } else {
2022 wakaba 1.78 ## NOTE: This state should never be reached.
2023 wakaba 1.77 !!!cp (121);
2024 wakaba 1.72 }
2025     } else {
2026 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
2027 wakaba 1.72 }
2028     $self->{state} = DATA_STATE;
2029     !!!next-input-character;
2030    
2031 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
2032 wakaba 1.72
2033     redo A;
2034 wakaba 1.183 } elsif ($self->{nc} == 0x002F) { # /
2035 wakaba 1.125 !!!cp (122);
2036     $self->{state} = SELF_CLOSING_START_TAG_STATE;
2037 wakaba 1.72 !!!next-input-character;
2038 wakaba 1.125 redo A;
2039 wakaba 1.183 } elsif ($self->{nc} == -1) {
2040 wakaba 1.141 !!!parse-error (type => 'unclosed tag');
2041 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2042 wakaba 1.141 !!!cp (122.3);
2043 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
2044     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2045     if ($self->{ct}->{attributes}) {
2046 wakaba 1.141 !!!cp (122.1);
2047     !!!parse-error (type => 'end tag attribute');
2048     } else {
2049     ## NOTE: This state should never be reached.
2050     !!!cp (122.2);
2051     }
2052     } else {
2053 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
2054 wakaba 1.141 }
2055     $self->{state} = DATA_STATE;
2056     ## Reconsume.
2057 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
2058 wakaba 1.141 redo A;
2059 wakaba 1.125 } else {
2060     !!!cp ('124.1');
2061     !!!parse-error (type => 'no space between attributes');
2062     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2063     ## reconsume
2064     redo A;
2065     }
2066     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2067 wakaba 1.183 if ($self->{nc} == 0x003E) { # >
2068     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2069 wakaba 1.125 !!!cp ('124.2');
2070 wakaba 1.183 !!!parse-error (type => 'nestc', token => $self->{ct});
2071 wakaba 1.125 ## TODO: Different type than slash in start tag
2072     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2073 wakaba 1.183 if ($self->{ct}->{attributes}) {
2074 wakaba 1.125 !!!cp ('124.4');
2075     !!!parse-error (type => 'end tag attribute');
2076     } else {
2077     !!!cp ('124.5');
2078     }
2079     ## TODO: Test |<title></title/>|
2080 wakaba 1.72 } else {
2081 wakaba 1.125 !!!cp ('124.3');
2082     $self->{self_closing} = 1;
2083 wakaba 1.72 }
2084 wakaba 1.125
2085     $self->{state} = DATA_STATE;
2086     !!!next-input-character;
2087    
2088 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
2089 wakaba 1.125
2090 wakaba 1.72 redo A;
2091 wakaba 1.183 } elsif ($self->{nc} == -1) {
2092 wakaba 1.141 !!!parse-error (type => 'unclosed tag');
2093 wakaba 1.183 if ($self->{ct}->{type} == START_TAG_TOKEN) {
2094 wakaba 1.141 !!!cp (124.7);
2095 wakaba 1.183 $self->{last_stag_name} = $self->{ct}->{tag_name};
2096     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2097     if ($self->{ct}->{attributes}) {
2098 wakaba 1.141 !!!cp (124.5);
2099     !!!parse-error (type => 'end tag attribute');
2100     } else {
2101     ## NOTE: This state should never be reached.
2102     !!!cp (124.6);
2103     }
2104     } else {
2105 wakaba 1.183 die "$0: $self->{ct}->{type}: Unknown token type";
2106 wakaba 1.141 }
2107     $self->{state} = DATA_STATE;
2108     ## Reconsume.
2109 wakaba 1.183 !!!emit ($self->{ct}); # start tag or end tag
2110 wakaba 1.141 redo A;
2111 wakaba 1.72 } else {
2112 wakaba 1.125 !!!cp ('124.4');
2113     !!!parse-error (type => 'nestc');
2114     ## TODO: This error type is wrong.
2115 wakaba 1.72 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2116 wakaba 1.125 ## Reconsume.
2117 wakaba 1.72 redo A;
2118     }
2119 wakaba 1.57 } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2120 wakaba 1.1 ## (only happens in the PCDATA state)
2121 wakaba 1.167
2122     ## NOTE: Unlike spec's "bogus comment state", this implementation
2123     ## consumes characters one-by-one basis.
2124 wakaba 1.1
2125 wakaba 1.183 if ($self->{nc} == 0x003E) { # >
2126 wakaba 1.167 !!!cp (124);
2127     $self->{state} = DATA_STATE;
2128     !!!next-input-character;
2129 wakaba 1.1
2130 wakaba 1.183 !!!emit ($self->{ct}); # comment
2131 wakaba 1.167 redo A;
2132 wakaba 1.183 } elsif ($self->{nc} == -1) {
2133 wakaba 1.167 !!!cp (125);
2134     $self->{state} = DATA_STATE;
2135     ## reconsume
2136 wakaba 1.1
2137 wakaba 1.183 !!!emit ($self->{ct}); # comment
2138 wakaba 1.167 redo A;
2139     } else {
2140     !!!cp (126);
2141 wakaba 1.183 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2142     $self->{read_until}->($self->{ct}->{data},
2143 wakaba 1.173 q[>],
2144 wakaba 1.183 length $self->{ct}->{data});
2145 wakaba 1.173
2146 wakaba 1.167 ## Stay in the state.
2147     !!!next-input-character;
2148     redo A;
2149     }
2150 wakaba 1.57 } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2151 wakaba 1.1 ## (only happens in the PCDATA state)
2152    
2153 wakaba 1.183 if ($self->{nc} == 0x002D) { # -
2154 wakaba 1.163 !!!cp (133);
2155     $self->{state} = MD_HYPHEN_STATE;
2156 wakaba 1.1 !!!next-input-character;
2157 wakaba 1.163 redo A;
2158 wakaba 1.183 } elsif ($self->{nc} == 0x0044 or # D
2159     $self->{nc} == 0x0064) { # d
2160 wakaba 1.163 ## ASCII case-insensitive.
2161     !!!cp (130);
2162     $self->{state} = MD_DOCTYPE_STATE;
2163 wakaba 1.183 $self->{s_kwd} = chr $self->{nc};
2164 wakaba 1.1 !!!next-input-character;
2165 wakaba 1.163 redo A;
2166 wakaba 1.127 } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2167     $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
2168 wakaba 1.183 $self->{nc} == 0x005B) { # [
2169 wakaba 1.163 !!!cp (135.4);
2170     $self->{state} = MD_CDATA_STATE;
2171 wakaba 1.183 $self->{s_kwd} = '[';
2172 wakaba 1.127 !!!next-input-character;
2173 wakaba 1.163 redo A;
2174 wakaba 1.77 } else {
2175     !!!cp (136);
2176 wakaba 1.1 }
2177    
2178 wakaba 1.163 !!!parse-error (type => 'bogus comment',
2179     line => $self->{line_prev},
2180     column => $self->{column_prev} - 1);
2181     ## Reconsume.
2182 wakaba 1.57 $self->{state} = BOGUS_COMMENT_STATE;
2183 wakaba 1.183 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2184 wakaba 1.163 line => $self->{line_prev},
2185     column => $self->{column_prev} - 1,
2186 wakaba 1.118 };
2187 wakaba 1.1 redo A;
2188 wakaba 1.163 } elsif ($self->{state} == MD_HYPHEN_STATE) {
2189 wakaba 1.183 if ($self->{nc} == 0x002D) { # -
2190 wakaba 1.163 !!!cp (127);
2191 wakaba 1.183 $self->{ct} = {type => COMMENT_TOKEN, data => '',
2192 wakaba 1.163 line => $self->{line_prev},
2193     column => $self->{column_prev} - 2,
2194     };
2195     $self->{state} = COMMENT_START_STATE;
2196     !!!next-input-character;
2197     redo A;
2198     } else {
2199     !!!cp (128);
2200     !!!parse-error (type => 'bogus comment',
2201     line => $self->{line_prev},
2202     column => $self->{column_prev} - 2);
2203     $self->{state} = BOGUS_COMMENT_STATE;
2204     ## Reconsume.
2205 wakaba 1.183 $self->{ct} = {type => COMMENT_TOKEN,
2206 wakaba 1.163 data => '-',
2207     line => $self->{line_prev},
2208     column => $self->{column_prev} - 2,
2209     };
2210     redo A;
2211     }
2212     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2213     ## ASCII case-insensitive.
2214 wakaba 1.183 if ($self->{nc} == [
2215 wakaba 1.163 undef,
2216     0x004F, # O
2217     0x0043, # C
2218     0x0054, # T
2219     0x0059, # Y
2220     0x0050, # P
2221 wakaba 1.183 ]->[length $self->{s_kwd}] or
2222     $self->{nc} == [
2223 wakaba 1.163 undef,
2224     0x006F, # o
2225     0x0063, # c
2226     0x0074, # t
2227     0x0079, # y
2228     0x0070, # p
2229 wakaba 1.183 ]->[length $self->{s_kwd}]) {
2230 wakaba 1.163 !!!cp (131);
2231     ## Stay in the state.
2232 wakaba 1.183 $self->{s_kwd} .= chr $self->{nc};
2233 wakaba 1.163 !!!next-input-character;
2234     redo A;
2235 wakaba 1.183 } elsif ((length $self->{s_kwd}) == 6 and
2236     ($self->{nc} == 0x0045 or # E
2237     $self->{nc} == 0x0065)) { # e
2238 wakaba 1.163 !!!cp (129);
2239     $self->{state} = DOCTYPE_STATE;
2240 wakaba 1.183 $self->{ct} = {type => DOCTYPE_TOKEN,
2241 wakaba 1.163 quirks => 1,
2242     line => $self->{line_prev},
2243     column => $self->{column_prev} - 7,
2244     };
2245     !!!next-input-character;
2246     redo A;
2247     } else {
2248     !!!cp (132);
2249     !!!parse-error (type => 'bogus comment',
2250     line => $self->{line_prev},
2251 wakaba 1.183 column => $self->{column_prev} - 1 - length $self->{s_kwd});
2252 wakaba 1.163 $self->{state} = BOGUS_COMMENT_STATE;
2253     ## Reconsume.
2254 wakaba 1.183 $self->{ct} = {type => COMMENT_TOKEN,
2255     data => $self->{s_kwd},
2256 wakaba 1.163 line => $self->{line_prev},
2257 wakaba 1.183 column => $self->{column_prev} - 1 - length $self->{s_kwd},
2258 wakaba 1.163 };
2259     redo A;
2260     }
2261     } elsif ($self->{state} == MD_CDATA_STATE) {
2262 wakaba 1.183 if ($self->{nc} == {
2263 wakaba 1.163 '[' => 0x0043, # C
2264     '[C' => 0x0044, # D
2265     '[CD' => 0x0041, # A
2266     '[CDA' => 0x0054, # T
2267     '[CDAT' => 0x0041, # A
2268 wakaba 1.183 }->{$self->{s_kwd}}) {
2269 wakaba 1.163 !!!cp (135.1);
2270     ## Stay in the state.
2271 wakaba 1.183 $self->{s_kwd} .= chr $self->{nc};
2272 wakaba 1.163 !!!next-input-character;
2273     redo A;
2274 wakaba 1.183 } elsif ($self->{s_kwd} eq '[CDATA' and
2275     $self->{nc} == 0x005B) { # [
2276 wakaba 1.163 !!!cp (135.2);
2277 wakaba 1.183 $self->{ct} = {type => CHARACTER_TOKEN,
2278 wakaba 1.165 data => '',
2279     line => $self->{line_prev},
2280     column => $self->{column_prev} - 7};
2281     $self->{state} = CDATA_SECTION_STATE;
2282 wakaba 1.163 !!!next-input-character;
2283     redo A;
2284     } else {
2285     !!!cp (135.3);
2286     !!!parse-error (type => 'bogus comment',
2287     line => $self->{line_prev},
2288 wakaba 1.183 column => $self->{column_prev} - 1 - length $self->{s_kwd});
2289 wakaba 1.163 $self->{state} = BOGUS_COMMENT_STATE;
2290     ## Reconsume.
2291 wakaba 1.183 $self->{ct} = {type => COMMENT_TOKEN,
2292     data => $self->{s_kwd},
2293 wakaba 1.163 line => $self->{line_prev},
2294 wakaba 1.183 column => $self->{column_prev} - 1 - length $self->{s_kwd},
2295 wakaba 1.163 };
2296     redo A;
2297     }
2298 wakaba 1.57 } elsif ($self->{state} == COMMENT_START_STATE) {
2299 wakaba 1.183 if ($self->{nc} == 0x002D) { # -
2300 wakaba 1.77 !!!cp (137);
2301 wakaba 1.57 $self->{state} = COMMENT_START_DASH_STATE;
2302 wakaba 1.23 !!!next-input-character;
2303     redo A;
2304 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2305 wakaba 1.77 !!!cp (138);
2306 wakaba 1.23 !!!parse-error (type => 'bogus comment');
2307 wakaba 1.57 $self->{state} = DATA_STATE;
2308 wakaba 1.23 !!!next-input-character;
2309    
2310 wakaba 1.183 !!!emit ($self->{ct}); # comment
2311 wakaba 1.23
2312     redo A;
2313 wakaba 1.183 } elsif ($self->{nc} == -1) {
2314 wakaba 1.77 !!!cp (139);
2315 wakaba 1.23 !!!parse-error (type => 'unclosed comment');
2316 wakaba 1.57 $self->{state} = DATA_STATE;
2317 wakaba 1.23 ## reconsume
2318    
2319 wakaba 1.183 !!!emit ($self->{ct}); # comment
2320 wakaba 1.23
2321     redo A;
2322     } else {
2323 wakaba 1.77 !!!cp (140);
2324 wakaba 1.183 $self->{ct}->{data} # comment
2325     .= chr ($self->{nc});
2326 wakaba 1.57 $self->{state} = COMMENT_STATE;
2327 wakaba 1.23 !!!next-input-character;
2328     redo A;
2329     }
2330 wakaba 1.57 } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2331 wakaba 1.183 if ($self->{nc} == 0x002D) { # -
2332 wakaba 1.77 !!!cp (141);
2333 wakaba 1.57 $self->{state} = COMMENT_END_STATE;
2334 wakaba 1.23 !!!next-input-character;
2335     redo A;
2336 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2337 wakaba 1.77 !!!cp (142);
2338 wakaba 1.23 !!!parse-error (type => 'bogus comment');
2339 wakaba 1.57 $self->{state} = DATA_STATE;
2340 wakaba 1.23 !!!next-input-character;
2341    
2342 wakaba 1.183 !!!emit ($self->{ct}); # comment
2343 wakaba 1.23
2344     redo A;
2345 wakaba 1.183 } elsif ($self->{nc} == -1) {
2346 wakaba 1.77 !!!cp (143);
2347 wakaba 1.23 !!!parse-error (type => 'unclosed comment');
2348 wakaba 1.57 $self->{state} = DATA_STATE;
2349 wakaba 1.23 ## reconsume
2350    
2351 wakaba 1.183 !!!emit ($self->{ct}); # comment
2352 wakaba 1.23
2353     redo A;
2354     } else {
2355 wakaba 1.77 !!!cp (144);
2356 wakaba 1.183 $self->{ct}->{data} # comment
2357     .= '-' . chr ($self->{nc});
2358 wakaba 1.57 $self->{state} = COMMENT_STATE;
2359 wakaba 1.23 !!!next-input-character;
2360     redo A;
2361     }
2362 wakaba 1.57 } elsif ($self->{state} == COMMENT_STATE) {
2363 wakaba 1.183 if ($self->{nc} == 0x002D) { # -
2364 wakaba 1.77 !!!cp (145);
2365 wakaba 1.57 $self->{state} = COMMENT_END_DASH_STATE;
2366 wakaba 1.1 !!!next-input-character;
2367     redo A;
2368 wakaba 1.183 } elsif ($self->{nc} == -1) {
2369 wakaba 1.77 !!!cp (146);
2370 wakaba 1.3 !!!parse-error (type => 'unclosed comment');
2371 wakaba 1.57 $self->{state} = DATA_STATE;
2372 wakaba 1.1 ## reconsume
2373    
2374 wakaba 1.183 !!!emit ($self->{ct}); # comment
2375 wakaba 1.1
2376     redo A;
2377     } else {
2378 wakaba 1.77 !!!cp (147);
2379 wakaba 1.183 $self->{ct}->{data} .= chr ($self->{nc}); # comment
2380     $self->{read_until}->($self->{ct}->{data},
2381 wakaba 1.173 q[-],
2382 wakaba 1.183 length $self->{ct}->{data});
2383 wakaba 1.173
2384 wakaba 1.1 ## Stay in the state
2385     !!!next-input-character;
2386     redo A;
2387     }
2388 wakaba 1.57 } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2389 wakaba 1.183 if ($self->{nc} == 0x002D) { # -
2390 wakaba 1.77 !!!cp (148);
2391 wakaba 1.57 $self->{state} = COMMENT_END_STATE;
2392 wakaba 1.1 !!!next-input-character;
2393     redo A;
2394 wakaba 1.183 } elsif ($self->{nc} == -1) {
2395 wakaba 1.77 !!!cp (149);
2396 wakaba 1.3 !!!parse-error (type => 'unclosed comment');
2397 wakaba 1.57 $self->{state} = DATA_STATE;
2398 wakaba 1.1 ## reconsume
2399    
2400 wakaba 1.183 !!!emit ($self->{ct}); # comment
2401 wakaba 1.1
2402     redo A;
2403     } else {
2404 wakaba 1.77 !!!cp (150);
2405 wakaba 1.183 $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2406 wakaba 1.57 $self->{state} = COMMENT_STATE;
2407 wakaba 1.1 !!!next-input-character;
2408     redo A;
2409     }
2410 wakaba 1.57 } elsif ($self->{state} == COMMENT_END_STATE) {
2411 wakaba 1.183 if ($self->{nc} == 0x003E) { # >
2412 wakaba 1.77 !!!cp (151);
2413 wakaba 1.57 $self->{state} = DATA_STATE;
2414 wakaba 1.1 !!!next-input-character;
2415    
2416 wakaba 1.183 !!!emit ($self->{ct}); # comment
2417 wakaba 1.1
2418     redo A;
2419 wakaba 1.183 } elsif ($self->{nc} == 0x002D) { # -
2420 wakaba 1.77 !!!cp (152);
2421 wakaba 1.114 !!!parse-error (type => 'dash in comment',
2422     line => $self->{line_prev},
2423     column => $self->{column_prev});
2424 wakaba 1.183 $self->{ct}->{data} .= '-'; # comment
2425 wakaba 1.1 ## Stay in the state
2426     !!!next-input-character;
2427     redo A;
2428 wakaba 1.183 } elsif ($self->{nc} == -1) {
2429 wakaba 1.77 !!!cp (153);
2430 wakaba 1.3 !!!parse-error (type => 'unclosed comment');
2431 wakaba 1.57 $self->{state} = DATA_STATE;
2432 wakaba 1.1 ## reconsume
2433    
2434 wakaba 1.183 !!!emit ($self->{ct}); # comment
2435 wakaba 1.1
2436     redo A;
2437     } else {
2438 wakaba 1.77 !!!cp (154);
2439 wakaba 1.114 !!!parse-error (type => 'dash in comment',
2440     line => $self->{line_prev},
2441     column => $self->{column_prev});
2442 wakaba 1.183 $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2443 wakaba 1.57 $self->{state} = COMMENT_STATE;
2444 wakaba 1.1 !!!next-input-character;
2445     redo A;
2446     }
2447 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_STATE) {
2448 wakaba 1.187 if ($is_space->{$self->{nc}}) {
2449 wakaba 1.77 !!!cp (155);
2450 wakaba 1.57 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2451 wakaba 1.1 !!!next-input-character;
2452     redo A;
2453     } else {
2454 wakaba 1.77 !!!cp (156);
2455 wakaba 1.3 !!!parse-error (type => 'no space before DOCTYPE name');
2456 wakaba 1.57 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2457 wakaba 1.1 ## reconsume
2458     redo A;
2459     }
2460 wakaba 1.57 } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2461 wakaba 1.187 if ($is_space->{$self->{nc}}) {
2462 wakaba 1.77 !!!cp (157);
2463 wakaba 1.1 ## Stay in the state
2464     !!!next-input-character;
2465     redo A;
2466 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2467 wakaba 1.77 !!!cp (158);
2468 wakaba 1.3 !!!parse-error (type => 'no DOCTYPE name');
2469 wakaba 1.57 $self->{state} = DATA_STATE;
2470 wakaba 1.1 !!!next-input-character;
2471    
2472 wakaba 1.183 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2473 wakaba 1.1
2474     redo A;
2475 wakaba 1.183 } elsif ($self->{nc} == -1) {
2476 wakaba 1.77 !!!cp (159);
2477 wakaba 1.3 !!!parse-error (type => 'no DOCTYPE name');
2478 wakaba 1.57 $self->{state} = DATA_STATE;
2479 wakaba 1.1 ## reconsume
2480    
2481 wakaba 1.183 !!!emit ($self->{ct}); # DOCTYPE (quirks)
2482 wakaba 1.1
2483     redo A;
2484     } else {
2485 wakaba 1.77 !!!cp (160);
2486 wakaba 1.183 $self->{ct}->{name} = chr $self->{nc};
2487     delete $self->{ct}->{quirks};
2488 wakaba 1.4 ## ISSUE: "Set the token's name name to the" in the spec
2489 wakaba 1.57 $self->{state} = DOCTYPE_NAME_STATE;
2490 wakaba 1.1 !!!next-input-character;
2491     redo A;
2492     }
2493 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2494 wakaba 1.18 ## ISSUE: Redundant "First," in the spec.
2495 wakaba 1.187 if ($is_space->{$self->{nc}}) {
2496 wakaba 1.77 !!!cp (161);
2497 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2498 wakaba 1.1 !!!next-input-character;
2499     redo A;
2500 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2501 wakaba 1.77 !!!cp (162);
2502 wakaba 1.57 $self->{state} = DATA_STATE;
2503 wakaba 1.1 !!!next-input-character;
2504    
2505 wakaba 1.183 !!!emit ($self->{ct}); # DOCTYPE
2506 wakaba 1.1
2507     redo A;
2508 wakaba 1.183 } elsif ($self->{nc} == -1) {
2509 wakaba 1.77 !!!cp (163);
2510 wakaba 1.3 !!!parse-error (type => 'unclosed DOCTYPE');
2511 wakaba 1.57 $self->{state} = DATA_STATE;
2512 wakaba 1.1 ## reconsume
2513    
2514 wakaba 1.183 $self->{ct}->{quirks} = 1;
2515     !!!emit ($self->{ct}); # DOCTYPE
2516 wakaba 1.1
2517     redo A;
2518     } else {
2519 wakaba 1.77 !!!cp (164);
2520 wakaba 1.183 $self->{ct}->{name}
2521     .= chr ($self->{nc}); # DOCTYPE
2522 wakaba 1.1 ## Stay in the state
2523     !!!next-input-character;
2524     redo A;
2525     }
2526 wakaba 1.57 } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2527 wakaba 1.187 if ($is_space->{$self->{nc}}) {
2528 wakaba 1.77 !!!cp (165);
2529 wakaba 1.1 ## Stay in the state
2530     !!!next-input-character;
2531     redo A;
2532 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2533 wakaba 1.77 !!!cp (166);
2534 wakaba 1.57 $self->{state} = DATA_STATE;
2535 wakaba 1.1 !!!next-input-character;
2536    
2537 wakaba 1.183 !!!emit ($self->{ct}); # DOCTYPE
2538 wakaba 1.1
2539     redo A;
2540 wakaba 1.183 } elsif ($self->{nc} == -1) {
2541 wakaba 1.77 !!!cp (167);
2542 wakaba 1.3 !!!parse-error (type => 'unclosed DOCTYPE');
2543 wakaba 1.57 $self->{state} = DATA_STATE;
2544 wakaba 1.1 ## reconsume
2545    
2546 wakaba 1.183 $self->{ct}->{quirks} = 1;
2547     !!!emit ($self->{ct}); # DOCTYPE
2548 wakaba 1.18
2549     redo A;
2550 wakaba 1.183 } elsif ($self->{nc} == 0x0050 or # P
2551     $self->{nc} == 0x0070) { # p
2552 wakaba 1.166 $self->{state} = PUBLIC_STATE;
2553 wakaba 1.183 $self->{s_kwd} = chr $self->{nc};
2554 wakaba 1.18 !!!next-input-character;
2555 wakaba 1.166 redo A;
2556 wakaba 1.183 } elsif ($self->{nc} == 0x0053 or # S
2557     $self->{nc} == 0x0073) { # s
2558 wakaba 1.166 $self->{state} = SYSTEM_STATE;
2559 wakaba 1.183 $self->{s_kwd} = chr $self->{nc};
2560 wakaba 1.18 !!!next-input-character;
2561 wakaba 1.166 redo A;
2562 wakaba 1.18 } else {
2563 wakaba 1.77 !!!cp (180);
2564 wakaba 1.166 !!!parse-error (type => 'string after DOCTYPE name');
2565 wakaba 1.183 $self->{ct}->{quirks} = 1;
2566 wakaba 1.166
2567     $self->{state} = BOGUS_DOCTYPE_STATE;
2568 wakaba 1.18 !!!next-input-character;
2569 wakaba 1.166 redo A;
2570 wakaba 1.18 }
2571 wakaba 1.166 } elsif ($self->{state} == PUBLIC_STATE) {
2572     ## ASCII case-insensitive
2573 wakaba 1.183 if ($self->{nc} == [
2574 wakaba 1.166 undef,
2575     0x0055, # U
2576     0x0042, # B
2577     0x004C, # L
2578     0x0049, # I
2579 wakaba 1.183 ]->[length $self->{s_kwd}] or
2580     $self->{nc} == [
2581 wakaba 1.166 undef,
2582     0x0075, # u
2583     0x0062, # b
2584     0x006C, # l
2585     0x0069, # i
2586 wakaba 1.183 ]->[length $self->{s_kwd}]) {
2587 wakaba 1.166 !!!cp (175);
2588     ## Stay in the state.
2589 wakaba 1.183 $self->{s_kwd} .= chr $self->{nc};
2590 wakaba 1.166 !!!next-input-character;
2591     redo A;
2592 wakaba 1.183 } elsif ((length $self->{s_kwd}) == 5 and
2593     ($self->{nc} == 0x0043 or # C
2594     $self->{nc} == 0x0063)) { # c
2595 wakaba 1.166 !!!cp (168);
2596     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2597     !!!next-input-character;
2598     redo A;
2599     } else {
2600     !!!cp (169);
2601     !!!parse-error (type => 'string after DOCTYPE name',
2602     line => $self->{line_prev},
2603 wakaba 1.183 column => $self->{column_prev} + 1 - length $self->{s_kwd});
2604     $self->{ct}->{quirks} = 1;
2605 wakaba 1.18
2606 wakaba 1.166 $self->{state} = BOGUS_DOCTYPE_STATE;
2607     ## Reconsume.
2608     redo A;
2609     }
2610     } elsif ($self->{state} == SYSTEM_STATE) {
2611     ## ASCII case-insensitive
2612 wakaba 1.183 if ($self->{nc} == [
2613 wakaba 1.166 undef,
2614     0x0059, # Y
2615     0x0053, # S
2616     0x0054, # T
2617     0x0045, # E
2618 wakaba 1.183 ]->[length $self->{s_kwd}] or
2619     $self->{nc} == [
2620 wakaba 1.166 undef,
2621     0x0079, # y
2622     0x0073, # s
2623     0x0074, # t
2624     0x0065, # e
2625 wakaba 1.183 ]->[length $self->{s_kwd}]) {
2626 wakaba 1.166 !!!cp (170);
2627     ## Stay in the state.
2628 wakaba 1.183 $self->{s_kwd} .= chr $self->{nc};
2629 wakaba 1.166 !!!next-input-character;
2630     redo A;
2631 wakaba 1.183 } elsif ((length $self->{s_kwd}) == 5 and
2632     ($self->{nc} == 0x004D or # M
2633     $self->{nc} == 0x006D)) { # m
2634 wakaba 1.166 !!!cp (171);
2635     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2636     !!!next-input-character;
2637     redo A;
2638     } else {
2639     !!!cp (172);
2640     !!!parse-error (type => 'string after DOCTYPE name',
2641     line => $self->{line_prev},
2642 wakaba 1.183 column => $self->{column_prev} + 1 - length $self->{s_kwd});
2643     $self->{ct}->{quirks} = 1;
2644 wakaba 1.73
2645 wakaba 1.166 $self->{state} = BOGUS_DOCTYPE_STATE;
2646     ## Reconsume.
2647     redo A;
2648     }
2649 wakaba 1.57 } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2650 wakaba 1.187 if ($is_space->{$self->{nc}}) {
2651 wakaba 1.77 !!!cp (181);
2652 wakaba 1.18 ## Stay in the state
2653     !!!next-input-character;
2654     redo A;
2655 wakaba 1.183 } elsif ($self->{nc} eq 0x0022) { # "
2656 wakaba 1.77 !!!cp (182);
2657 wakaba 1.183 $self->{ct}->{pubid} = ''; # DOCTYPE
2658 wakaba 1.57 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2659 wakaba 1.18 !!!next-input-character;
2660     redo A;
2661 wakaba 1.183 } elsif ($self->{nc} eq 0x0027) { # '
2662 wakaba 1.77 !!!cp (183);
2663 wakaba 1.183 $self->{ct}->{pubid} = ''; # DOCTYPE
2664 wakaba 1.57 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2665 wakaba 1.18 !!!next-input-character;
2666     redo A;
2667 wakaba 1.183 } elsif ($self->{nc} eq 0x003E) { # >
2668 wakaba 1.77 !!!cp (184);
2669 wakaba 1.18 !!!parse-error (type => 'no PUBLIC literal');
2670    
2671 wakaba 1.57 $self->{state} = DATA_STATE;
2672 wakaba 1.18 !!!next-input-character;
2673    
2674 wakaba 1.183 $self->{ct}->{quirks} = 1;
2675     !!!emit ($self->{ct}); # DOCTYPE
2676 wakaba 1.18
2677     redo A;
2678 wakaba 1.183 } elsif ($self->{nc} == -1) {
2679 wakaba 1.77 !!!cp (185);
2680 wakaba 1.18 !!!parse-error (type => 'unclosed DOCTYPE');
2681    
2682 wakaba 1.57 $self->{state} = DATA_STATE;
2683 wakaba 1.18 ## reconsume
2684    
2685 wakaba 1.183 $self->{ct}->{quirks} = 1;
2686     !!!emit ($self->{ct}); # DOCTYPE
2687 wakaba 1.18
2688     redo A;
2689     } else {
2690 wakaba 1.77 !!!cp (186);
2691 wakaba 1.18 !!!parse-error (type => 'string after PUBLIC');
2692 wakaba 1.183 $self->{ct}->{quirks} = 1;
2693 wakaba 1.73
2694 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
2695 wakaba 1.18 !!!next-input-character;
2696     redo A;
2697     }
2698 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2699 wakaba 1.183 if ($self->{nc} == 0x0022) { # "
2700 wakaba 1.77 !!!cp (187);
2701 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2702 wakaba 1.18 !!!next-input-character;
2703     redo A;
2704 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2705 wakaba 1.77 !!!cp (188);
2706 wakaba 1.69 !!!parse-error (type => 'unclosed PUBLIC literal');
2707    
2708     $self->{state} = DATA_STATE;
2709     !!!next-input-character;
2710    
2711 wakaba 1.183 $self->{ct}->{quirks} = 1;
2712     !!!emit ($self->{ct}); # DOCTYPE
2713 wakaba 1.69
2714     redo A;
2715 wakaba 1.183 } elsif ($self->{nc} == -1) {
2716 wakaba 1.77 !!!cp (189);
2717 wakaba 1.18 !!!parse-error (type => 'unclosed PUBLIC literal');
2718    
2719 wakaba 1.57 $self->{state} = DATA_STATE;
2720 wakaba 1.18 ## reconsume
2721    
2722 wakaba 1.183 $self->{ct}->{quirks} = 1;
2723     !!!emit ($self->{ct}); # DOCTYPE
2724 wakaba 1.18
2725     redo A;
2726     } else {
2727 wakaba 1.77 !!!cp (190);
2728 wakaba 1.183 $self->{ct}->{pubid} # DOCTYPE
2729     .= chr $self->{nc};
2730     $self->{read_until}->($self->{ct}->{pubid}, q[">],
2731     length $self->{ct}->{pubid});
2732 wakaba 1.173
2733 wakaba 1.18 ## Stay in the state
2734     !!!next-input-character;
2735     redo A;
2736     }
2737 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2738 wakaba 1.183 if ($self->{nc} == 0x0027) { # '
2739 wakaba 1.77 !!!cp (191);
2740 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2741 wakaba 1.18 !!!next-input-character;
2742     redo A;
2743 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2744 wakaba 1.77 !!!cp (192);
2745 wakaba 1.69 !!!parse-error (type => 'unclosed PUBLIC literal');
2746    
2747     $self->{state} = DATA_STATE;
2748     !!!next-input-character;
2749    
2750 wakaba 1.183 $self->{ct}->{quirks} = 1;
2751     !!!emit ($self->{ct}); # DOCTYPE
2752 wakaba 1.69
2753     redo A;
2754 wakaba 1.183 } elsif ($self->{nc} == -1) {
2755 wakaba 1.77 !!!cp (193);
2756 wakaba 1.18 !!!parse-error (type => 'unclosed PUBLIC literal');
2757    
2758 wakaba 1.57 $self->{state} = DATA_STATE;
2759 wakaba 1.18 ## reconsume
2760    
2761 wakaba 1.183 $self->{ct}->{quirks} = 1;
2762     !!!emit ($self->{ct}); # DOCTYPE
2763 wakaba 1.18
2764     redo A;
2765     } else {
2766 wakaba 1.77 !!!cp (194);
2767 wakaba 1.183 $self->{ct}->{pubid} # DOCTYPE
2768     .= chr $self->{nc};
2769     $self->{read_until}->($self->{ct}->{pubid}, q['>],
2770     length $self->{ct}->{pubid});
2771 wakaba 1.173
2772 wakaba 1.18 ## Stay in the state
2773     !!!next-input-character;
2774     redo A;
2775     }
2776 wakaba 1.57 } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2777 wakaba 1.187 if ($is_space->{$self->{nc}}) {
2778 wakaba 1.77 !!!cp (195);
2779 wakaba 1.18 ## Stay in the state
2780     !!!next-input-character;
2781     redo A;
2782 wakaba 1.183 } elsif ($self->{nc} == 0x0022) { # "
2783 wakaba 1.77 !!!cp (196);
2784 wakaba 1.183 $self->{ct}->{sysid} = ''; # DOCTYPE
2785 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2786 wakaba 1.18 !!!next-input-character;
2787     redo A;
2788 wakaba 1.183 } elsif ($self->{nc} == 0x0027) { # '
2789 wakaba 1.77 !!!cp (197);
2790 wakaba 1.183 $self->{ct}->{sysid} = ''; # DOCTYPE
2791 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2792 wakaba 1.18 !!!next-input-character;
2793     redo A;
2794 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2795 wakaba 1.77 !!!cp (198);
2796 wakaba 1.57 $self->{state} = DATA_STATE;
2797 wakaba 1.18 !!!next-input-character;
2798    
2799 wakaba 1.183 !!!emit ($self->{ct}); # DOCTYPE
2800 wakaba 1.18
2801     redo A;
2802 wakaba 1.183 } elsif ($self->{nc} == -1) {
2803 wakaba 1.77 !!!cp (199);
2804 wakaba 1.18 !!!parse-error (type => 'unclosed DOCTYPE');
2805    
2806 wakaba 1.57 $self->{state} = DATA_STATE;
2807 wakaba 1.26 ## reconsume
2808 wakaba 1.18
2809 wakaba 1.183 $self->{ct}->{quirks} = 1;
2810     !!!emit ($self->{ct}); # DOCTYPE
2811 wakaba 1.18
2812     redo A;
2813     } else {
2814 wakaba 1.77 !!!cp (200);
2815 wakaba 1.18 !!!parse-error (type => 'string after PUBLIC literal');
2816 wakaba 1.183 $self->{ct}->{quirks} = 1;
2817 wakaba 1.73
2818 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
2819 wakaba 1.18 !!!next-input-character;
2820     redo A;
2821     }
2822 wakaba 1.57 } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2823 wakaba 1.187 if ($is_space->{$self->{nc}}) {
2824 wakaba 1.77 !!!cp (201);
2825 wakaba 1.18 ## Stay in the state
2826     !!!next-input-character;
2827     redo A;
2828 wakaba 1.183 } elsif ($self->{nc} == 0x0022) { # "
2829 wakaba 1.77 !!!cp (202);
2830 wakaba 1.183 $self->{ct}->{sysid} = ''; # DOCTYPE
2831 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2832 wakaba 1.18 !!!next-input-character;
2833     redo A;
2834 wakaba 1.183 } elsif ($self->{nc} == 0x0027) { # '
2835 wakaba 1.77 !!!cp (203);
2836 wakaba 1.183 $self->{ct}->{sysid} = ''; # DOCTYPE
2837 wakaba 1.57 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2838 wakaba 1.18 !!!next-input-character;
2839     redo A;
2840 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2841 wakaba 1.77 !!!cp (204);
2842 wakaba 1.18 !!!parse-error (type => 'no SYSTEM literal');
2843 wakaba 1.57 $self->{state} = DATA_STATE;
2844 wakaba 1.18 !!!next-input-character;
2845    
2846 wakaba 1.183 $self->{ct}->{quirks} = 1;
2847     !!!emit ($self->{ct}); # DOCTYPE
2848 wakaba 1.18
2849     redo A;
2850 wakaba 1.183 } elsif ($self->{nc} == -1) {
2851 wakaba 1.77 !!!cp (205);
2852 wakaba 1.18 !!!parse-error (type => 'unclosed DOCTYPE');
2853    
2854 wakaba 1.57 $self->{state} = DATA_STATE;
2855 wakaba 1.26 ## reconsume
2856 wakaba 1.18
2857 wakaba 1.183 $self->{ct}->{quirks} = 1;
2858     !!!emit ($self->{ct}); # DOCTYPE
2859 wakaba 1.18
2860     redo A;
2861     } else {
2862 wakaba 1.77 !!!cp (206);
2863 wakaba 1.30 !!!parse-error (type => 'string after SYSTEM');
2864 wakaba 1.183 $self->{ct}->{quirks} = 1;
2865 wakaba 1.73
2866 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
2867 wakaba 1.18 !!!next-input-character;
2868     redo A;
2869     }
2870 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2871 wakaba 1.183 if ($self->{nc} == 0x0022) { # "
2872 wakaba 1.77 !!!cp (207);
2873 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2874 wakaba 1.18 !!!next-input-character;
2875     redo A;
2876 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2877 wakaba 1.77 !!!cp (208);
2878 wakaba 1.153 !!!parse-error (type => 'unclosed SYSTEM literal');
2879 wakaba 1.69
2880     $self->{state} = DATA_STATE;
2881     !!!next-input-character;
2882    
2883 wakaba 1.183 $self->{ct}->{quirks} = 1;
2884     !!!emit ($self->{ct}); # DOCTYPE
2885 wakaba 1.69
2886     redo A;
2887 wakaba 1.183 } elsif ($self->{nc} == -1) {
2888 wakaba 1.77 !!!cp (209);
2889 wakaba 1.18 !!!parse-error (type => 'unclosed SYSTEM literal');
2890    
2891 wakaba 1.57 $self->{state} = DATA_STATE;
2892 wakaba 1.18 ## reconsume
2893    
2894 wakaba 1.183 $self->{ct}->{quirks} = 1;
2895     !!!emit ($self->{ct}); # DOCTYPE
2896 wakaba 1.18
2897     redo A;
2898     } else {
2899 wakaba 1.77 !!!cp (210);
2900 wakaba 1.183 $self->{ct}->{sysid} # DOCTYPE
2901     .= chr $self->{nc};
2902     $self->{read_until}->($self->{ct}->{sysid}, q[">],
2903     length $self->{ct}->{sysid});
2904 wakaba 1.173
2905 wakaba 1.18 ## Stay in the state
2906     !!!next-input-character;
2907     redo A;
2908     }
2909 wakaba 1.57 } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2910 wakaba 1.183 if ($self->{nc} == 0x0027) { # '
2911 wakaba 1.77 !!!cp (211);
2912 wakaba 1.57 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2913 wakaba 1.18 !!!next-input-character;
2914     redo A;
2915 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2916 wakaba 1.77 !!!cp (212);
2917 wakaba 1.153 !!!parse-error (type => 'unclosed SYSTEM literal');
2918 wakaba 1.69
2919     $self->{state} = DATA_STATE;
2920     !!!next-input-character;
2921    
2922 wakaba 1.183 $self->{ct}->{quirks} = 1;
2923     !!!emit ($self->{ct}); # DOCTYPE
2924 wakaba 1.69
2925     redo A;
2926 wakaba 1.183 } elsif ($self->{nc} == -1) {
2927 wakaba 1.77 !!!cp (213);
2928 wakaba 1.18 !!!parse-error (type => 'unclosed SYSTEM literal');
2929    
2930 wakaba 1.57 $self->{state} = DATA_STATE;
2931 wakaba 1.18 ## reconsume
2932    
2933 wakaba 1.183 $self->{ct}->{quirks} = 1;
2934     !!!emit ($self->{ct}); # DOCTYPE
2935 wakaba 1.1
2936     redo A;
2937     } else {
2938 wakaba 1.77 !!!cp (214);
2939 wakaba 1.183 $self->{ct}->{sysid} # DOCTYPE
2940     .= chr $self->{nc};
2941     $self->{read_until}->($self->{ct}->{sysid}, q['>],
2942     length $self->{ct}->{sysid});
2943 wakaba 1.173
2944 wakaba 1.18 ## Stay in the state
2945     !!!next-input-character;
2946     redo A;
2947     }
2948 wakaba 1.57 } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2949 wakaba 1.187 if ($is_space->{$self->{nc}}) {
2950 wakaba 1.77 !!!cp (215);
2951 wakaba 1.18 ## Stay in the state
2952     !!!next-input-character;
2953     redo A;
2954 wakaba 1.183 } elsif ($self->{nc} == 0x003E) { # >
2955 wakaba 1.77 !!!cp (216);
2956 wakaba 1.57 $self->{state} = DATA_STATE;
2957 wakaba 1.18 !!!next-input-character;
2958    
2959 wakaba 1.183 !!!emit ($self->{ct}); # DOCTYPE
2960 wakaba 1.18
2961     redo A;
2962 wakaba 1.183 } elsif ($self->{nc} == -1) {
2963 wakaba 1.77 !!!cp (217);
2964 wakaba 1.150 !!!parse-error (type => 'unclosed DOCTYPE');
2965 wakaba 1.57 $self->{state} = DATA_STATE;
2966 wakaba 1.26 ## reconsume
2967 wakaba 1.18
2968 wakaba 1.183 $self->{ct}->{quirks} = 1;
2969     !!!emit ($self->{ct}); # DOCTYPE
2970 wakaba 1.18
2971     redo A;
2972     } else {
2973 wakaba 1.77 !!!cp (218);
2974 wakaba 1.18 !!!parse-error (type => 'string after SYSTEM literal');
2975 wakaba 1.183 #$self->{ct}->{quirks} = 1;
2976 wakaba 1.73
2977 wakaba 1.57 $self->{state} = BOGUS_DOCTYPE_STATE;
2978 wakaba 1.1 !!!next-input-character;
2979     redo A;
2980     }
2981 wakaba 1.57 } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2982 wakaba 1.183 if ($self->{nc} == 0x003E) { # >
2983 wakaba 1.77 !!!cp (219);
2984 wakaba 1.57 $self->{state} = DATA_STATE;
2985 wakaba 1.1 !!!next-input-character;
2986    
2987 wakaba 1.183 !!!emit ($self->{ct}); # DOCTYPE
2988 wakaba 1.1
2989     redo A;
2990 wakaba 1.183 } elsif ($self->{nc} == -1) {
2991 wakaba 1.77 !!!cp (220);
2992 wakaba 1.3 !!!parse-error (type => 'unclosed DOCTYPE');
2993 wakaba 1.57 $self->{state} = DATA_STATE;
2994 wakaba 1.1 ## reconsume
2995    
2996 wakaba 1.183 !!!emit ($self->{ct}); # DOCTYPE
2997 wakaba 1.1
2998     redo A;
2999     } else {
3000 wakaba 1.77 !!!cp (221);
3001 wakaba 1.173 my $s = '';
3002     $self->{read_until}->($s, q[>], 0);
3003    
3004 wakaba 1.1 ## Stay in the state
3005     !!!next-input-character;
3006     redo A;
3007     }
3008 wakaba 1.165 } elsif ($self->{state} == CDATA_SECTION_STATE) {
3009     ## NOTE: "CDATA section state" in the spec is jointly implemented
3010     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3011     ## and |CDATA_SECTION_MSE2_STATE|.
3012 wakaba 1.127
3013 wakaba 1.183 if ($self->{nc} == 0x005D) { # ]
3014 wakaba 1.165 !!!cp (221.1);
3015     $self->{state} = CDATA_SECTION_MSE1_STATE;
3016     !!!next-input-character;
3017     redo A;
3018 wakaba 1.183 } elsif ($self->{nc} == -1) {
3019 wakaba 1.165 $self->{state} = DATA_STATE;
3020     !!!next-input-character;
3021 wakaba 1.183 if (length $self->{ct}->{data}) { # character
3022 wakaba 1.165 !!!cp (221.2);
3023 wakaba 1.183 !!!emit ($self->{ct}); # character
3024 wakaba 1.165 } else {
3025     !!!cp (221.3);
3026 wakaba 1.183 ## No token to emit. $self->{ct} is discarded.
3027 wakaba 1.165 }
3028     redo A;
3029     } else {
3030     !!!cp (221.4);
3031 wakaba 1.183 $self->{ct}->{data} .= chr $self->{nc};
3032     $self->{read_until}->($self->{ct}->{data},
3033 wakaba 1.173 q<]>,
3034 wakaba 1.183 length $self->{ct}->{data});
3035 wakaba 1.173
3036 wakaba 1.165 ## Stay in the state.
3037     !!!next-input-character;
3038     redo A;
3039     }
3040 wakaba 1.127
3041 wakaba 1.165 ## ISSUE: "text tokens" in spec.
3042     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3043 wakaba 1.183 if ($self->{nc} == 0x005D) { # ]
3044 wakaba 1.165 !!!cp (221.5);
3045     $self->{state} = CDATA_SECTION_MSE2_STATE;
3046     !!!next-input-character;
3047     redo A;
3048     } else {
3049     !!!cp (221.6);
3050 wakaba 1.183 $self->{ct}->{data} .= ']';
3051 wakaba 1.165 $self->{state} = CDATA_SECTION_STATE;
3052     ## Reconsume.
3053     redo A;
3054     }
3055     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3056 wakaba 1.183 if ($self->{nc} == 0x003E) { # >
3057 wakaba 1.165 $self->{state} = DATA_STATE;
3058     !!!next-input-character;
3059 wakaba 1.183 if (length $self->{ct}->{data}) { # character
3060 wakaba 1.165 !!!cp (221.7);
3061 wakaba 1.183 !!!emit ($self->{ct}); # character
3062 wakaba 1.127 } else {
3063 wakaba 1.165 !!!cp (221.8);
3064 wakaba 1.183 ## No token to emit. $self->{ct} is discarded.
3065 wakaba 1.127 }
3066 wakaba 1.165 redo A;
3067 wakaba 1.183 } elsif ($self->{nc} == 0x005D) { # ]
3068 wakaba 1.165 !!!cp (221.9); # character
3069 wakaba 1.183 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3070 wakaba 1.165 ## Stay in the state.
3071 wakaba 1.127 !!!next-input-character;
3072 wakaba 1.165 redo A;
3073 wakaba 1.127 } else {
3074 wakaba 1.165 !!!cp (221.11);
3075 wakaba 1.183 $self->{ct}->{data} .= ']]'; # character
3076 wakaba 1.165 $self->{state} = CDATA_SECTION_STATE;
3077     ## Reconsume.
3078     redo A;
3079 wakaba 1.127 }
3080 wakaba 1.167 } elsif ($self->{state} == ENTITY_STATE) {
3081 wakaba 1.187 if ($is_space->{$self->{nc}} or
3082     {
3083     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3084     $self->{entity_add} => 1,
3085     }->{$self->{nc}}) {
3086 wakaba 1.168 !!!cp (1001);
3087     ## Don't consume
3088     ## No error
3089     ## Return nothing.
3090     #
3091 wakaba 1.183 } elsif ($self->{nc} == 0x0023) { # #
3092 wakaba 1.170 !!!cp (999);
3093 wakaba 1.168 $self->{state} = ENTITY_HASH_STATE;
3094 wakaba 1.183 $self->{s_kwd} = '#';
3095 wakaba 1.168 !!!next-input-character;
3096     redo A;
3097 wakaba 1.183 } elsif ((0x0041 <= $self->{nc} and
3098     $self->{nc} <= 0x005A) or # A..Z
3099     (0x0061 <= $self->{nc} and
3100     $self->{nc} <= 0x007A)) { # a..z
3101 wakaba 1.170 !!!cp (998);
3102 wakaba 1.168 require Whatpm::_NamedEntityList;
3103     $self->{state} = ENTITY_NAME_STATE;
3104 wakaba 1.183 $self->{s_kwd} = chr $self->{nc};
3105     $self->{entity__value} = $self->{s_kwd};
3106 wakaba 1.168 $self->{entity__match} = 0;
3107     !!!next-input-character;
3108     redo A;
3109     } else {
3110     !!!cp (1027);
3111     !!!parse-error (type => 'bare ero');
3112     ## Return nothing.
3113     #
3114     }
3115 wakaba 1.20
3116 wakaba 1.168 ## NOTE: No character is consumed by the "consume a character
3117     ## reference" algorithm. In other words, there is an "&" character
3118     ## that does not introduce a character reference, which would be
3119     ## appended to the parent element or the attribute value in later
3120     ## process of the tokenizer.
3121 wakaba 1.112
3122 wakaba 1.169 if ($self->{prev_state} == DATA_STATE) {
3123 wakaba 1.170 !!!cp (997);
3124 wakaba 1.169 $self->{state} = $self->{prev_state};
3125 wakaba 1.168 ## Reconsume.
3126     !!!emit ({type => CHARACTER_TOKEN, data => '&',
3127     line => $self->{line_prev},
3128     column => $self->{column_prev},
3129     });
3130     redo A;
3131 wakaba 1.169 } else {
3132 wakaba 1.170 !!!cp (996);
3133 wakaba 1.183 $self->{ca}->{value} .= '&';
3134 wakaba 1.169 $self->{state} = $self->{prev_state};
3135     ## Reconsume.
3136     redo A;
3137 wakaba 1.168 }
3138     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3139 wakaba 1.183 if ($self->{nc} == 0x0078 or # x
3140     $self->{nc} == 0x0058) { # X
3141 wakaba 1.170 !!!cp (995);
3142 wakaba 1.168 $self->{state} = HEXREF_X_STATE;
3143 wakaba 1.183 $self->{s_kwd} .= chr $self->{nc};
3144 wakaba 1.168 !!!next-input-character;
3145     redo A;
3146 wakaba 1.183 } elsif (0x0030 <= $self->{nc} and
3147     $self->{nc} <= 0x0039) { # 0..9
3148 wakaba 1.170 !!!cp (994);
3149 wakaba 1.168 $self->{state} = NCR_NUM_STATE;
3150 wakaba 1.183 $self->{s_kwd} = $self->{nc} - 0x0030;
3151 wakaba 1.168 !!!next-input-character;
3152     redo A;
3153     } else {
3154     !!!parse-error (type => 'bare nero',
3155     line => $self->{line_prev},
3156     column => $self->{column_prev} - 1);
3157    
3158     ## NOTE: According to the spec algorithm, nothing is returned,
3159     ## and then "&#" is appended to the parent element or the attribute
3160     ## value in the later processing.
3161    
3162 wakaba 1.169 if ($self->{prev_state} == DATA_STATE) {
3163 wakaba 1.170 !!!cp (1019);
3164 wakaba 1.169 $self->{state} = $self->{prev_state};
3165 wakaba 1.168 ## Reconsume.
3166     !!!emit ({type => CHARACTER_TOKEN,
3167     data => '&#',
3168     line => $self->{line_prev},
3169     column => $self->{column_prev} - 1,
3170     });
3171     redo A;
3172 wakaba 1.169 } else {
3173 wakaba 1.170 !!!cp (993);
3174 wakaba 1.183 $self->{ca}->{value} .= '&#';
3175 wakaba 1.169 $self->{state} = $self->{prev_state};
3176     ## Reconsume.
3177     redo A;
3178 wakaba 1.1 }
3179 wakaba 1.168 }
3180     } elsif ($self->{state} == NCR_NUM_STATE) {
3181 wakaba 1.183 if (0x0030 <= $self->{nc} and
3182     $self->{nc} <= 0x0039) { # 0..9
3183 wakaba 1.78 !!!cp (1012);
3184 wakaba 1.183 $self->{s_kwd} *= 10;
3185     $self->{s_kwd} += $self->{nc} - 0x0030;
3186 wakaba 1.1
3187 wakaba 1.168 ## Stay in the state.
3188 wakaba 1.1 !!!next-input-character;
3189 wakaba 1.168 redo A;
3190 wakaba 1.183 } elsif ($self->{nc} == 0x003B) { # ;
3191 wakaba 1.78 !!!cp (1013);
3192 wakaba 1.1 !!!next-input-character;
3193 wakaba 1.168 #
3194 wakaba 1.1 } else {
3195 wakaba 1.78 !!!cp (1014);
3196 wakaba 1.168 !!!parse-error (type => 'no refc');
3197     ## Reconsume.
3198     #
3199 wakaba 1.1 }
3200    
3201 wakaba 1.183 my $code = $self->{s_kwd};
3202 wakaba 1.168 my $l = $self->{line_prev};
3203     my $c = $self->{column_prev};
3204 wakaba 1.191 if ($charref_map->{$code}) {
3205 wakaba 1.78 !!!cp (1015);
3206 wakaba 1.153 !!!parse-error (type => 'invalid character reference',
3207     text => (sprintf 'U+%04X', $code),
3208     line => $l, column => $c);
3209 wakaba 1.191 $code = $charref_map->{$code};
3210 wakaba 1.26 } elsif ($code > 0x10FFFF) {
3211 wakaba 1.78 !!!cp (1016);
3212 wakaba 1.153 !!!parse-error (type => 'invalid character reference',
3213     text => (sprintf 'U-%08X', $code),
3214     line => $l, column => $c);
3215 wakaba 1.26 $code = 0xFFFD;
3216 wakaba 1.1 }
3217 wakaba 1.168
3218 wakaba 1.169 if ($self->{prev_state} == DATA_STATE) {
3219 wakaba 1.170 !!!cp (992);
3220 wakaba 1.169 $self->{state} = $self->{prev_state};
3221 wakaba 1.168 ## Reconsume.
3222 wakaba 1.169 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3223     line => $l, column => $c,
3224     });
3225 wakaba 1.168 redo A;
3226     } else {
3227 wakaba 1.170 !!!cp (991);
3228 wakaba 1.183 $self->{ca}->{value} .= chr $code;
3229     $self->{ca}->{has_reference} = 1;
3230 wakaba 1.169 $self->{state} = $self->{prev_state};
3231 wakaba 1.168 ## Reconsume.
3232     redo A;
3233     }
3234     } elsif ($self->{state} == HEXREF_X_STATE) {
3235 wakaba 1.183 if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3236     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3237     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3238 wakaba 1.168 # 0..9, A..F, a..f
3239 wakaba 1.170 !!!cp (990);
3240 wakaba 1.168 $self->{state} = HEXREF_HEX_STATE;
3241 wakaba 1.183 $self->{s_kwd} = 0;
3242 wakaba 1.168 ## Reconsume.
3243     redo A;
3244     } else {
3245     !!!parse-error (type => 'bare hcro',
3246     line => $self->{line_prev},
3247     column => $self->{column_prev} - 2);
3248    
3249     ## NOTE: According to the spec algorithm, nothing is returned,
3250     ## and then "&#" followed by "X" or "x" is appended to the parent
3251     ## element or the attribute value in the later processing.
3252    
3253 wakaba 1.169 if ($self->{prev_state} == DATA_STATE) {
3254 wakaba 1.170 !!!cp (1005);
3255 wakaba 1.169 $self->{state} = $self->{prev_state};
3256 wakaba 1.168 ## Reconsume.
3257     !!!emit ({type => CHARACTER_TOKEN,
3258 wakaba 1.183 data => '&' . $self->{s_kwd},
3259 wakaba 1.168 line => $self->{line_prev},
3260 wakaba 1.183 column => $self->{column_prev} - length $self->{s_kwd},
3261 wakaba 1.168 });
3262     redo A;
3263 wakaba 1.169 } else {
3264 wakaba 1.170 !!!cp (989);
3265 wakaba 1.183 $self->{ca}->{value} .= '&' . $self->{s_kwd};
3266 wakaba 1.169 $self->{state} = $self->{prev_state};
3267     ## Reconsume.
3268     redo A;
3269 wakaba 1.168 }
3270     }
3271     } elsif ($self->{state} == HEXREF_HEX_STATE) {
3272 wakaba 1.183 if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3273 wakaba 1.168 # 0..9
3274     !!!cp (1002);
3275 wakaba 1.183 $self->{s_kwd} *= 0x10;
3276     $self->{s_kwd} += $self->{nc} - 0x0030;
3277 wakaba 1.168 ## Stay in the state.
3278     !!!next-input-character;
3279     redo A;
3280 wakaba 1.183 } elsif (0x0061 <= $self->{nc} and
3281     $self->{nc} <= 0x0066) { # a..f
3282 wakaba 1.168 !!!cp (1003);
3283 wakaba 1.183 $self->{s_kwd} *= 0x10;
3284     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
3285 wakaba 1.168 ## Stay in the state.
3286     !!!next-input-character;
3287     redo A;
3288 wakaba 1.183 } elsif (0x0041 <= $self->{nc} and
3289     $self->{nc} <= 0x0046) { # A..F
3290 wakaba 1.168 !!!cp (1004);
3291 wakaba 1.183 $self->{s_kwd} *= 0x10;
3292     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
3293 wakaba 1.168 ## Stay in the state.
3294     !!!next-input-character;
3295     redo A;
3296 wakaba 1.183 } elsif ($self->{nc} == 0x003B) { # ;
3297 wakaba 1.168 !!!cp (1006);
3298     !!!next-input-character;
3299     #
3300     } else {
3301     !!!cp (1007);
3302     !!!parse-error (type => 'no refc',
3303     line => $self->{line},
3304     column => $self->{column});
3305     ## Reconsume.
3306     #
3307     }
3308    
3309 wakaba 1.183 my $code = $self->{s_kwd};
3310 wakaba 1.168 my $l = $self->{line_prev};
3311     my $c = $self->{column_prev};
3312 wakaba 1.191 if ($charref_map->{$code}) {
3313 wakaba 1.168 !!!cp (1008);
3314     !!!parse-error (type => 'invalid character reference',
3315     text => (sprintf 'U+%04X', $code),
3316     line => $l, column => $c);
3317 wakaba 1.191 $code = $charref_map->{$code};
3318 wakaba 1.168 } elsif ($code > 0x10FFFF) {
3319     !!!cp (1009);
3320     !!!parse-error (type => 'invalid character reference',
3321     text => (sprintf 'U-%08X', $code),
3322     line => $l, column => $c);
3323     $code = 0xFFFD;
3324     }
3325    
3326 wakaba 1.169 if ($self->{prev_state} == DATA_STATE) {
3327 wakaba 1.170 !!!cp (988);
3328 wakaba 1.169 $self->{state} = $self->{prev_state};
3329 wakaba 1.168 ## Reconsume.
3330 wakaba 1.169 !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3331     line => $l, column => $c,
3332     });
3333 wakaba 1.168 redo A;
3334     } else {
3335 wakaba 1.170 !!!cp (987);
3336 wakaba 1.183 $self->{ca}->{value} .= chr $code;
3337     $self->{ca}->{has_reference} = 1;
3338 wakaba 1.169 $self->{state} = $self->{prev_state};
3339 wakaba 1.168 ## Reconsume.
3340     redo A;
3341     }
3342     } elsif ($self->{state} == ENTITY_NAME_STATE) {
3343 wakaba 1.183 if (length $self->{s_kwd} < 30 and
3344 wakaba 1.168 ## NOTE: Some number greater than the maximum length of entity name
3345 wakaba 1.183 ((0x0041 <= $self->{nc} and # a
3346     $self->{nc} <= 0x005A) or # x
3347     (0x0061 <= $self->{nc} and # a
3348     $self->{nc} <= 0x007A) or # z
3349     (0x0030 <= $self->{nc} and # 0
3350     $self->{nc} <= 0x0039) or # 9
3351     $self->{nc} == 0x003B)) { # ;
3352 wakaba 1.168 our $EntityChar;
3353 wakaba 1.183 $self->{s_kwd} .= chr $self->{nc};
3354     if (defined $EntityChar->{$self->{s_kwd}}) {
3355     if ($self->{nc} == 0x003B) { # ;
3356 wakaba 1.168 !!!cp (1020);
3357 wakaba 1.183 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
3358 wakaba 1.168 $self->{entity__match} = 1;
3359     !!!next-input-character;
3360     #
3361     } else {
3362     !!!cp (1021);
3363 wakaba 1.183 $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
3364 wakaba 1.168 $self->{entity__match} = -1;
3365     ## Stay in the state.
3366     !!!next-input-character;
3367     redo A;
3368     }
3369     } else {
3370     !!!cp (1022);
3371 wakaba 1.183 $self->{entity__value} .= chr $self->{nc};
3372 wakaba 1.168 $self->{entity__match} *= 2;
3373     ## Stay in the state.
3374 wakaba 1.16 !!!next-input-character;
3375 wakaba 1.168 redo A;
3376     }
3377     }
3378    
3379     my $data;
3380     my $has_ref;
3381     if ($self->{entity__match} > 0) {
3382     !!!cp (1023);
3383     $data = $self->{entity__value};
3384     $has_ref = 1;
3385     #
3386     } elsif ($self->{entity__match} < 0) {
3387     !!!parse-error (type => 'no refc');
3388 wakaba 1.169 if ($self->{prev_state} != DATA_STATE and # in attribute
3389     $self->{entity__match} < -1) {
3390 wakaba 1.168 !!!cp (1024);
3391 wakaba 1.183 $data = '&' . $self->{s_kwd};
3392 wakaba 1.168 #
3393 wakaba 1.37 } else {
3394 wakaba 1.168 !!!cp (1025);
3395     $data = $self->{entity__value};
3396     $has_ref = 1;
3397     #
3398 wakaba 1.16 }
3399 wakaba 1.1 } else {
3400 wakaba 1.168 !!!cp (1026);
3401     !!!parse-error (type => 'bare ero',
3402     line => $self->{line_prev},
3403 wakaba 1.183 column => $self->{column_prev} - length $self->{s_kwd});
3404     $data = '&' . $self->{s_kwd};
3405 wakaba 1.168 #
3406 wakaba 1.1 }
3407 wakaba 1.168
3408     ## NOTE: In these cases, when a character reference is found,
3409     ## it is consumed and a character token is returned, or, otherwise,
3410     ## nothing is consumed and returned, according to the spec algorithm.
3411     ## In this implementation, anything that has been examined by the
3412     ## tokenizer is appended to the parent element or the attribute value
3413     ## as string, either literal string when no character reference or
3414     ## entity-replaced string otherwise, in this stage, since any characters
3415     ## that would not be consumed are appended in the data state or in an
3416     ## appropriate attribute value state anyway.
3417    
3418 wakaba 1.169 if ($self->{prev_state} == DATA_STATE) {
3419 wakaba 1.170 !!!cp (986);
3420 wakaba 1.169 $self->{state} = $self->{prev_state};
3421 wakaba 1.168 ## Reconsume.
3422     !!!emit ({type => CHARACTER_TOKEN,
3423 wakaba 1.169 data => $data,
3424 wakaba 1.168 line => $self->{line_prev},
3425 wakaba 1.183 column => $self->{column_prev} + 1 - length $self->{s_kwd},
3426 wakaba 1.168 });
3427 wakaba 1.167 redo A;
3428 wakaba 1.169 } else {
3429 wakaba 1.170 !!!cp (985);
3430 wakaba 1.183 $self->{ca}->{value} .= $data;
3431     $self->{ca}->{has_reference} = 1 if $has_ref;
3432 wakaba 1.169 $self->{state} = $self->{prev_state};
3433     ## Reconsume.
3434     redo A;
3435 wakaba 1.37 }
3436 wakaba 1.1 } else {
3437 wakaba 1.167 die "$0: $self->{state}: Unknown state";
3438     }
3439     } # A
3440    
3441     die "$0: _get_next_token: unexpected case";
3442     } # _get_next_token
3443 wakaba 1.1
## Prepare |$self->{document}| for the tree construction stage:
## strict error checking is switched off, the document is flagged as
## an HTML document, and the |manakai_source_line| and
## |manakai_source_column| user data are both set to 1.
##
## NOTE: |$self->{document}| MUST be specified before this method is
## called.
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  $doc->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $doc->manakai_is_html (1); # MUST

  $doc->set_user_data (manakai_source_line => 1);
  $doc->set_user_data (manakai_source_column => 1);
} # _initialize_tree_constructor
3454    
## Finish the tree construction stage: switch strict error checking
## of |$self->{document}| back on.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  ## TODO: Turn mutation events on
  $self->{document}->strict_error_checking (1);
} # _terminate_tree_constructor
3460    
3461     ## ISSUE: Should append_child (for example) in script executed in tree construction stage fire mutation events?
3462    
3463 wakaba 1.3 { # tree construction stage
3464     my $token;
3465    
## Entry point of the tree construction stage: fetch the first token,
## reset the per-parse state kept in |$self|, run the "initial" and
## "before html" insertion modes, and then run
## |_tree_construction_main| starting in the "before head" insertion
## mode.
sub _construct_tree ($) {
  my $self = shift;

  ## NOTE: It is not defined when an interactive UA renders
  ## |$self->{document}| available to the user, or when it begins
  ## accepting user input.

  ## NOTE: Append a character: collect it and all subsequent
  ## consecutive characters and insert one Text node whose data is
  ## the concatenation of all those characters. # MUST

  !!!next-token;

  ## Reset the per-parse state.
  $self->{form_element} = undef;
  $self->{head_element} = undef;
  $self->{open_elements} = [];
  $self->{inner_html_node} = undef;

  ## NOTE: The "initial" insertion mode.
  $self->_tree_construction_initial; # MUST

  ## NOTE: The "before html" insertion mode.
  $self->_tree_construction_root_element;

  ## NOTE: The "before head" insertion mode and so on.
  $self->{insertion_mode} = BEFORE_HEAD_IM;
  $self->_tree_construction_main;
} # _construct_tree
3494    
## The "initial" insertion mode.
##
## Processes tokens (read from the shared |$token|) up to and
## including the DOCTYPE, if any:
##
##   - A DOCTYPE token is checked against the HTML5 DOCTYPE ("not
##     HTML5" and "XSLT-compat" parse errors), turned into a document
##     type definition node that is appended to |$self->{document}|,
##     and used to select the compatibility mode of the document
##     ("quirks", "limited quirks", or unchanged).  The next token is
##     then fetched and the method returns.
##   - Comment tokens are appended to the document and leading white
##     space of character tokens is dropped; in both cases the parser
##     stays in this insertion mode.
##   - Any other token raises a "no DOCTYPE" parse error and puts the
##     document into quirks mode.  The token is left in |$token| so
##     that it is reprocessed by the caller.
##
## The method dies on a token of unknown type.
sub _tree_construction_initial ($) {
  my $self = shift;

  ## NOTE: "initial" insertion mode

  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
      ## error, switch to a conformance checking mode for another
      ## language.
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      $doctype_name =~ tr/a-z/A-Z/; # ASCII case-insensitive
      if (not defined $token->{name} or # <!DOCTYPE>
          defined $token->{sysid}) {
        !!!cp ('t1');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif ($doctype_name ne 'HTML') {
        !!!cp ('t2');
        !!!parse-error (type => 'not HTML5', token => $token);
      } elsif (defined $token->{pubid}) {
        if ($token->{pubid} eq 'XSLT-compat') {
          !!!cp ('t1.2');
          !!!parse-error (type => 'XSLT-compat', token => $token,
                          level => $self->{level}->{should});
        } else {
          !!!parse-error (type => 'not HTML5', token => $token);
        }
      } else {
        !!!cp ('t3');
        #
      }

      my $doctype = $self->{document}->create_document_type_definition
          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
      ## NOTE: Default value for both |public_id| and |system_id| attributes
      ## are empty strings, so that we don't set any value in missing cases.
      $doctype->public_id ($token->{pubid}) if defined $token->{pubid};
      $doctype->system_id ($token->{sysid}) if defined $token->{sysid};
      ## NOTE: Other DocumentType attributes are null or empty lists.
      ## ISSUE: internalSubset = null??
      $self->{document}->append_child ($doctype);

      if ($token->{quirks} or $doctype_name ne 'HTML') {
        !!!cp ('t4');
        $self->{document}->manakai_compat_mode ('quirks');
      } elsif (defined $token->{pubid}) {
        ## The public identifier is compared ASCII case-insensitively,
        ## so it is upper-cased to match the upper-case literals below.
        my $pubid = $token->{pubid};
        $pubid =~ tr/a-z/A-Z/;
        ## Public identifier prefixes that select the quirks mode.
        my $prefix = [
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//",
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//IETF//DTD HTML 2.0 LEVEL 1//",
          "-//IETF//DTD HTML 2.0 LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT//",
          "-//IETF//DTD HTML 2.0//",
          "-//IETF//DTD HTML 2.1E//",
          "-//IETF//DTD HTML 3.0//",
          "-//IETF//DTD HTML 3.2 FINAL//",
          "-//IETF//DTD HTML 3.2//",
          "-//IETF//DTD HTML 3//",
          "-//IETF//DTD HTML LEVEL 0//",
          "-//IETF//DTD HTML LEVEL 1//",
          "-//IETF//DTD HTML LEVEL 2//",
          "-//IETF//DTD HTML LEVEL 3//",
          "-//IETF//DTD HTML STRICT LEVEL 0//",
          "-//IETF//DTD HTML STRICT LEVEL 1//",
          "-//IETF//DTD HTML STRICT LEVEL 2//",
          "-//IETF//DTD HTML STRICT LEVEL 3//",
          "-//IETF//DTD HTML STRICT//",
          "-//IETF//DTD HTML//",
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//",
          "-//NETSCAPE COMM. CORP.//DTD HTML//",
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//",
          "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//",
          "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//",
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//",
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//",
          "-//W3C//DTD HTML 3 1995-03-24//",
          "-//W3C//DTD HTML 3.2 DRAFT//",
          "-//W3C//DTD HTML 3.2 FINAL//",
          "-//W3C//DTD HTML 3.2//",
          "-//W3C//DTD HTML 3.2S DRAFT//",
          "-//W3C//DTD HTML 4.0 FRAMESET//",
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//",
          "-//W3C//DTD HTML EXPERIMETNAL 19960712//",
          "-//W3C//DTD HTML EXPERIMENTAL 970421//",
          "-//W3C//DTD W3 HTML//",
          "-//W3O//DTD W3 HTML 3.0//",
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//",
          "-//WEBTECHS//DTD MOZILLA HTML//",
        ]; # $prefix
        my $match;
        for (@$prefix) {
          ## NOTE: It is the beginning of the public identifier
          ## (|$pubid|), not of the |$prefix| list, that has to be
          ## compared with each listed prefix.
          if (substr ($pubid, 0, length $_) eq $_) {
            $match = 1;
            last;
          }
        }
        if ($match or
            $pubid eq "-//W3O//DTD W3 HTML STRICT 3.0//EN//" or
            $pubid eq "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" or
            $pubid eq "HTML") {
          !!!cp ('t5');
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif ($pubid =~ m[^-//W3C//DTD HTML 4.01 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD HTML 4.01 TRANSITIONAL//]) {
          if (defined $token->{sysid}) {
            !!!cp ('t6');
            $self->{document}->manakai_compat_mode ('quirks');
          } else {
            !!!cp ('t7');
            $self->{document}->manakai_compat_mode ('limited quirks');
          }
        } elsif ($pubid =~ m[^-//W3C//DTD XHTML 1.0 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD XHTML 1.0 TRANSITIONAL//]) {
          !!!cp ('t8');
          $self->{document}->manakai_compat_mode ('limited quirks');
        } else {
          !!!cp ('t9');
        }
      } else {
        !!!cp ('t10');
      }
      if (defined $token->{sysid}) {
        ## The system identifier is compared ASCII case-insensitively,
        ## so it is lower-cased to match the lower-case literal below.
        my $sysid = $token->{sysid};
        $sysid =~ tr/A-Z/a-z/;
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          ## NOTE: Ensure that |PUBLIC "(limited quirks)" "(quirks)"| is
          ## marked as quirks.
          $self->{document}->manakai_compat_mode ('quirks');
          !!!cp ('t11');
        } else {
          !!!cp ('t12');
        }
      } else {
        !!!cp ('t13');
      }

      ## Go to the "before html" insertion mode.
      !!!next-token;
      return;
    } elsif ({
              START_TAG_TOKEN, 1,
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t14');
      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      !!!ack-later;
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
        ## Ignore the token

        unless (length $token->{data}) {
          !!!cp ('t15');
          ## Stay in the insertion mode.
          !!!next-token;
          redo INITIAL;
        } else {
          !!!cp ('t16');
        }
      } else {
        !!!cp ('t17');
      }

      !!!parse-error (type => 'no DOCTYPE', token => $token);
      $self->{document}->manakai_compat_mode ('quirks');
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t18');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);

      ## Stay in the insertion mode.
      !!!next-token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL

  die "$0: _tree_construction_initial: This should be never reached";
} # _tree_construction_initial
3699    
## The "before html" insertion mode.
##
## Processes tokens (read from the shared |$token|) until the root
## |html| element has been created, appended to |$self->{document}|
## and pushed onto |$self->{open_elements}|:
##
##   - DOCTYPE tokens raise a parse error and are ignored; comment
##     tokens are appended to the document; leading white space of
##     character tokens is dropped.  In all these cases the parser
##     stays in this insertion mode.
##   - An |html| start tag creates the root element with the
##     attributes of the token; the next token is then fetched.
##   - Any other token makes the root element be created without
##     attributes; the token is left in |$token| so that it is
##     reprocessed by the caller.
##
## |$self->{application_cache_selection}| is invoked exactly once per
## created root element: with the value of the |manifest| attribute
## if the |html| start tag has one, with |undef| otherwise.
##
## The method dies on a token of unknown type.
sub _tree_construction_root_element ($) {
  my $self = shift;

  ## NOTE: "before html" insertion mode.

  B: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      !!!cp ('t19');
      !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
      ## Ignore the token
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      !!!cp ('t20');
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      ## Stay in the insertion mode.
      !!!next-token;
      redo B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
        ## Ignore the token.

        unless (length $token->{data}) {
          !!!cp ('t21');
          ## Stay in the insertion mode.
          !!!next-token;
          redo B;
        } else {
          !!!cp ('t22');
        }
      } else {
        !!!cp ('t23');
      }

      ## NOTE: The application cache selection callback is not
      ## invoked here; it is invoked once, after the root element has
      ## been created at the end of this block.

      #
    } elsif ($token->{type} == START_TAG_TOKEN) {
      if ($token->{tag_name} eq 'html') {
        my $root_element;
        !!!create-element ($root_element, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}},
            [$root_element, $el_category->{html}];

        if ($token->{attributes}->{manifest}) {
          !!!cp ('t24');
          $self->{application_cache_selection}
              ->($token->{attributes}->{manifest}->{value});
          ## ISSUE: Spec is unclear on relative references.
          ## According to Hixie (#whatwg 2008-03-19), it should be
          ## resolved against the base URI of the document in HTML
          ## or xml:base of the element in XHTML.
        } else {
          !!!cp ('t25');
          $self->{application_cache_selection}->(undef);
        }

        !!!nack ('t25c');

        !!!next-token;
        return; ## Go to the "before head" insertion mode.
      } else {
        !!!cp ('t25.1');
        #
      }
    } elsif ({
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      !!!cp ('t26');
      #
    } else {
      die "$0: $token->{type}: Unknown token type";
    }

    ## Anything else: create the root element implicitly.
    my $root_element;
    !!!create-element ($root_element, $HTML_NS, 'html',, $token);
    $self->{document}->append_child ($root_element);
    push @{$self->{open_elements}}, [$root_element, $el_category->{html}];

    $self->{application_cache_selection}->(undef);

    ## NOTE: Reprocess the token.
    !!!ack-later;
    return; ## Go to the "before head" insertion mode.

    ## ISSUE: There is an issue in the spec
  } # B

  die "$0: _tree_construction_root_element: This should never be reached";
} # _tree_construction_root_element
3794    
## "Reset the insertion mode appropriately".
##
## Walks |$self->{open_elements}| from its last entry towards its
## first entry and sets |$self->{insertion_mode}| according to the
## first entry that determines it.  When the first entry of the stack
## is reached, |$self->{inner_html_node}| is examined in its place;
## the method dies if |$self->{inner_html_node}| is not defined at
## that point.
sub _reset_insertion_mode ($) {
  my ($self) = @_;

  ## Insertion modes selected by the local name of the examined node.
  ## NOTE: |option| and |optgroup| do not set insertion mode to
  ## "in select" by themselves.
  my $mode_by_local_name = {
    select => IN_SELECT_IM,
    tr => IN_ROW_IM,
    tbody => IN_TABLE_BODY_IM,
    thead => IN_TABLE_BODY_IM,
    tfoot => IN_TABLE_BODY_IM,
    caption => IN_CAPTION_IM,
    colgroup => IN_COLUMN_GROUP_IM,
    table => IN_TABLE_IM,
    head => IN_BODY_IM, # not in head!
    body => IN_BODY_IM,
    frameset => IN_FRAMESET_IM,
  };

  ## Step 1
  my $last;

  ## Step 2
  my $depth = -1;
  my $node = $self->{open_elements}->[$depth];

  ## Step 3
  S3: {
    if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
      $last = 1;
      die "_reset_insertion_mode: t27"
          unless defined $self->{inner_html_node};
      !!!cp ('t28');
      $node = $self->{inner_html_node};
    }

    ## Step 4..14
    my $new_mode;
    if ($node->[1] & FOREIGN_EL) {
      !!!cp ('t28.1');
      ## NOTE: Strictly speaking, the line below only applies to MathML
      ## and SVG elements.  Currently the HTML syntax supports only
      ## MathML and SVG elements as foreigners.
      $new_mode = IN_BODY_IM | IN_FOREIGN_CONTENT_IM;
    } elsif ($node->[1] & TABLE_CELL_EL) {
      unless ($last) {
        !!!cp ('t28.3');
        $new_mode = IN_CELL_IM;
      } else {
        !!!cp ('t28.2');
        #
      }
    } else {
      !!!cp ('t28.4');
      $new_mode = $mode_by_local_name->{$node->[0]->manakai_local_name};
    }
    if (defined $new_mode) {
      $self->{insertion_mode} = $new_mode;
      ## NOTE: The method only returns here if the selected mode is a
      ## true value.
      return if $new_mode;
    }

    ## Step 15
    if ($node->[1] & HTML_EL) {
      if (defined $self->{head_element}) {
        ## ISSUE: Can this state be reached?
        !!!cp ('t30');
        $self->{insertion_mode} = AFTER_HEAD_IM;
      } else {
        !!!cp ('t29');
        $self->{insertion_mode} = BEFORE_HEAD_IM;
      }
      return;
    } else {
      !!!cp ('t31');
    }

    ## Step 16
    $self->{insertion_mode} = IN_BODY_IM and return if $last;

    ## Step 17
    $node = $self->{open_elements}->[--$depth];

    ## Step 18
    redo S3;
  } # S3

  die "$0: _reset_insertion_mode: This line should never be reached";
} # _reset_insertion_mode
3881    
3882     sub _tree_construction_main ($) {
3883     my $self = shift;
3884    
3885 wakaba 1.1 my $active_formatting_elements = [];
3886    
3887     my $reconstruct_active_formatting_elements = sub { # MUST
3888     my $insert = shift;
3889    
3890     ## Step 1
3891     return unless @$active_formatting_elements;
3892    
3893     ## Step 3
3894     my $i = -1;
3895     my $entry = $active_formatting_elements->[$i];
3896    
3897     ## Step 2
3898     return if $entry->[0] eq '#marker';
3899 wakaba 1.3 for (@{$self->{open_elements}}) {
3900 wakaba 1.1 if ($entry->[0] eq $_->[0]) {
3901 wakaba 1.79 !!!cp ('t32');
3902 wakaba 1.1 return;
3903     }
3904     }
3905    
3906     S4: {
3907     ## Step 4
3908     last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0];
3909    
3910     ## Step 5
3911     $i--;
3912     $entry = $active_formatting_elements->[$i];
3913    
3914     ## Step 6
3915     if ($entry->[0] eq '#marker') {
3916 wakaba 1.81 !!!cp ('t33_1');
3917 wakaba 1.1 #
3918     } else {
3919     my $in_open_elements;
3920 wakaba 1.3 OE: for (@{$self->{open_elements}}) {
3921 wakaba 1.1 if ($entry->[0] eq $_->[0]) {
3922 wakaba 1.79 !!!cp ('t33');
3923 wakaba 1.1 $in_open_elements = 1;
3924     last OE;
3925     }
3926     }
3927     if ($in_open_elements) {
3928 wakaba 1.79 !!!cp ('t34');
3929 wakaba 1.1 #
3930     } else {
3931 wakaba 1.81 ## NOTE: <!DOCTYPE HTML><p><b><i><u></p> <p>X
3932 wakaba 1.79 !!!cp ('t35');
3933 wakaba 1.1 redo S4;
3934     }
3935     }
3936    
3937     ## Step 7
3938     $i++;
3939     $entry = $active_formatting_elements->[$i];
3940     } # S4
3941    
3942     S7: {
3943     ## Step 8
3944     my $clone = [$entry->[0]->clone_node (0), $entry->[1]];
3945    
3946     ## Step 9
3947     $insert->($clone->[0]);
3948 wakaba 1.3 push @{$self->{open_elements}}, $clone;
3949 wakaba 1.1
3950     ## Step 10
3951 wakaba 1.3 $active_formatting_elements->[$i] = $self->{open_elements}->[-1];
3952 wakaba 1.1
3953     ## Step 11
3954     unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) {
3955 wakaba 1.79 !!!cp ('t36');
3956 wakaba 1.1 ## Step 7'
3957     $i++;
3958     $entry = $active_formatting_elements->[$i];
3959    
3960     redo S7;
3961     }
3962 wakaba 1.79
3963     !!!cp ('t37');
3964 wakaba 1.1 } # S7
3965     }; # $reconstruct_active_formatting_elements
3966    
3967     my $clear_up_to_marker = sub {
3968     for (reverse 0..$#$active_formatting_elements) {
3969     if ($active_formatting_elements->[$_]->[0] eq '#marker') {
3970 wakaba 1.79 !!!cp ('t38');
3971 wakaba 1.1 splice @$active_formatting_elements, $_;
3972     return;
3973     }
3974     }
3975 wakaba 1.79
3976     !!!cp ('t39');
3977 wakaba 1.1 }; # $clear_up_to_marker
3978    
3979 wakaba 1.96 my $insert;
3980    
3981     my $parse_rcdata = sub ($) {
3982     my ($content_model_flag) = @_;
3983 wakaba 1.25
3984     ## Step 1
3985     my $start_tag_name = $token->{tag_name};
3986     my $el;
3987 wakaba 1.126 !!!create-element ($el, $HTML_NS, $start_tag_name, $token->{attributes}, $token);
3988 wakaba 1.25
3989     ## Step 2
3990 wakaba 1.96 $insert->($el);
3991 wakaba 1.25
3992     ## Step 3
3993 wakaba 1.40 $self->{content_model} = $content_model_flag; # CDATA or RCDATA
3994 wakaba 1.13 delete $self->{escape}; # MUST
3995 wakaba 1.25
3996     ## Step 4
3997 wakaba 1.1 my $text = '';
3998 wakaba 1.125 !!!nack ('t40.1');
3999 wakaba 1.1 !!!next-token;
4000 wakaba 1.55 while ($token->{type} == CHARACTER_TOKEN) { # or until stop tokenizing
4001 wakaba 1.79 !!!cp ('t40');
4002 wakaba 1.1 $text .= $token->{data};
4003     !!!next-token;
4004 wakaba 1.25 }
4005    
4006     ## Step 5
4007 wakaba 1.1 if (length $text) {
4008 wakaba 1.79 !!!cp ('t41');
4009 wakaba 1.25 my $text = $self->{document}->create_text_node ($text);
4010     $el->append_child ($text);
4011 wakaba 1.1 }
4012 wakaba 1.25
4013     ## Step 6
4014 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL;
4015 wakaba 1.25
4016     ## Step 7
4017 wakaba 1.79 if ($token->{type} == END_TAG_TOKEN and
4018     $token->{tag_name} eq $start_tag_name) {
4019     !!!cp ('t42');
4020 wakaba 1.1 ## Ignore the token
4021     } else {
4022 wakaba 1.96 ## NOTE: An end-of-file token.
4023     if ($content_model_flag == CDATA_CONTENT_MODEL) {
4024     !!!cp ('t43');
4025 wakaba 1.153 !!!parse-error (type => 'in CDATA:#eof', token => $token);
4026 wakaba 1.96 } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
4027     !!!cp ('t44');
4028 wakaba 1.153 !!!parse-error (type => 'in RCDATA:#eof', token => $token);
4029 wakaba 1.96 } else {
4030     die "$0: $content_model_flag in parse_rcdata";
4031     }
4032 wakaba 1.1 }
4033     !!!next-token;
4034 wakaba 1.25 }; # $parse_rcdata
4035 wakaba 1.1
4036 wakaba 1.96 my $script_start_tag = sub () {
4037 wakaba 1.1 my $script_el;
4038 wakaba 1.126 !!!create-element ($script_el, $HTML_NS, 'script', $token->{attributes}, $token);
4039 wakaba 1.1 ## TODO: mark as "parser-inserted"
4040    
4041 wakaba 1.40 $self->{content_model} = CDATA_CONTENT_MODEL;
4042 wakaba 1.13 delete $self->{escape}; # MUST
4043 wakaba 1.1
4044     my $text = '';
4045 wakaba 1.125 !!!nack ('t45.1');
4046 wakaba 1.1 !!!next-token;
4047 wakaba 1.55 while ($token->{type} == CHARACTER_TOKEN) {
4048 wakaba 1.79 !!!cp ('t45');
4049 wakaba 1.1 $text .= $token->{data};
4050     !!!next-token;
4051     } # stop if non-character token or tokenizer stops tokenising
4052     if (length $text) {
4053 wakaba 1.79 !!!cp ('t46');
4054 wakaba 1.1 $script_el->manakai_append_text ($text);
4055     }
4056    
4057 wakaba 1.40 $self->{content_model} = PCDATA_CONTENT_MODEL;
4058 wakaba 1.1
4059 wakaba 1.55 if ($token->{type} == END_TAG_TOKEN and
4060 wakaba 1.1 $token->{tag_name} eq 'script') {
4061 wakaba 1.79 !!!cp ('t47');
4062 wakaba 1.1 ## Ignore the token
4063     } else {
4064 wakaba 1.79 !!!cp ('t48');
4065 wakaba 1.153 !!!parse-error (type => 'in CDATA:#eof', token => $token);
4066 wakaba 1.1 ## ISSUE: And ignore?
4067     ## TODO: mark as "already executed"
4068     }
4069    
4070 wakaba 1.3 if (defined $self->{inner_html_node}) {
4071 wakaba 1.79 !!!cp ('t49');
4072 wakaba 1.3 ## TODO: mark as "already executed"
4073     } else {
4074 wakaba 1.79 !!!cp ('t50');
4075 wakaba 1.1 ## TODO: $old_insertion_point = current insertion point
4076     ## TODO: insertion point = just before the next input character
4077 wakaba 1.25
4078     $insert->($script_el);
4079 wakaba 1.1
4080     ## TODO: insertion point = $old_insertion_point (might be "undefined")
4081    
4082     ## TODO: if there is a script that will execute as soon as the parser resume, then...
4083     }
4084    
4085     !!!next-token;
4086     }; # $script_start_tag
4087    
4088 wakaba 1.102 ## NOTE: $open_tables->[-1]->[0] is the "current table" element node.
4089     ## NOTE: $open_tables->[-1]->[1] is the "tainted" flag.
4090     my $open_tables = [[$self->{open_elements}->[0]->[0]]];
4091    
## $formatting_end_tag->($end_tag_token)
##
## Runs the adoption agency algorithm (AAA) for the end tag of a
## formatting element.
##
## Argument:
##   $end_tag_token - The end tag token (a hash reference); its
##                    |tag_name| selects the formatting element.
##
## The closure reads and modifies |$self->{open_elements}|,
## |$active_formatting_elements| and |$open_tables| of the enclosing
## scope and moves nodes in the tree.  Every |return| path below is
## preceded by |!!!next-token|; the only other way out of a pass is
## the |redo FET| at Step 14, which runs the algorithm again until one
## of the returning conditions is met.
my $formatting_end_tag = sub {
  my $end_tag_token = shift;
  my $tag_name = $end_tag_token->{tag_name};

  ## NOTE: The adoption agency algorithm (AAA).

  FET: {
    ## Step 1
    ## Find the last entry in the list of active formatting elements
    ## (searching no further back than the last marker) whose local
    ## name is the tag name of the token.
    my $formatting_element;
    my $formatting_element_i_in_active;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq '#marker') {
        !!!cp ('t52');
        last AFE;
      } elsif ($active_formatting_elements->[$_]->[0]->manakai_local_name
                   eq $tag_name) {
        !!!cp ('t51');
        $formatting_element = $active_formatting_elements->[$_];
        $formatting_element_i_in_active = $_;
        last AFE;
      }
    } # AFE
    unless (defined $formatting_element) {
      !!!cp ('t53');
      !!!parse-error (type => 'unmatched end tag', text => $tag_name, token => $end_tag_token);
      ## Ignore the token
      !!!next-token;
      return;
    }
    ## has an element in scope
    my $in_scope = 1;
    my $formatting_element_i_in_open;
    INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if ($node->[0] eq $formatting_element->[0]) {
        if ($in_scope) {
          !!!cp ('t54');
          $formatting_element_i_in_open = $_;
          last INSCOPE;
        } else { # in open elements but not in scope
          !!!cp ('t55');
          ## NOTE: Report the tag name of the end tag token given as
          ## the argument, as the first parse error above does.
          !!!parse-error (type => 'unmatched end tag',
                          text => $tag_name,
                          token => $end_tag_token);
          ## Ignore the token
          !!!next-token;
          return;
        }
      } elsif ($node->[1] & SCOPING_EL) {
        !!!cp ('t56');
        $in_scope = 0;
      }
    } # INSCOPE
    unless (defined $formatting_element_i_in_open) {
      !!!cp ('t57');
      !!!parse-error (type => 'unmatched end tag',
                      text => $tag_name,
                      token => $end_tag_token);
      ## Remove the formatting element itself from the list.  It is
      ## not necessarily the last entry, so |pop| is not appropriate.
      splice @$active_formatting_elements,
          $formatting_element_i_in_active, 1;
      !!!next-token; ## TODO: ok?
      return;
    }
    if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) {
      !!!cp ('t58');
      !!!parse-error (type => 'not closed',
                      text => $self->{open_elements}->[-1]->[0]
                          ->manakai_local_name,
                      token => $end_tag_token);
    }

    ## Step 2
    ## Walk from the current node down to the formatting element.  The
    ## assignment is overwritten on each match, so the entry kept is
    ## the matching one nearest to the formatting element.
    my $furthest_block;
    my $furthest_block_i_in_open;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      my $node = $self->{open_elements}->[$_];
      if (not ($node->[1] & FORMATTING_EL) and
          #not $phrasing_category->{$node->[1]} and
          ($node->[1] & SPECIAL_EL or
           $node->[1] & SCOPING_EL)) { ## Scoping is redundant, maybe
        !!!cp ('t59');
        $furthest_block = $node;
        $furthest_block_i_in_open = $_;
      } elsif ($node->[0] eq $formatting_element->[0]) {
        !!!cp ('t60');
        last OE;
      }
    } # OE

    ## Step 3
    unless (defined $furthest_block) { # MUST
      !!!cp ('t61');
      ## Pop the formatting element and everything above it.
      splice @{$self->{open_elements}}, $formatting_element_i_in_open;
      splice @$active_formatting_elements, $formatting_element_i_in_active, 1;
      !!!next-token;
      return;
    }

    ## Step 4
    my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1];

    ## Step 5
    my $furthest_block_parent = $furthest_block->[0]->parent_node;
    if (defined $furthest_block_parent) {
      !!!cp ('t62');
      $furthest_block_parent->remove_child ($furthest_block->[0]);
    }

    ## Step 6
    ## The bookmark is kept as the node of the entry that precedes the
    ## position at which the clone is to be inserted in Step 12.
    my $bookmark_prev_el
        = $active_formatting_elements->[$formatting_element_i_in_active - 1]
            ->[0];

    ## Step 7
    my $node = $furthest_block;
    my $node_i_in_open = $furthest_block_i_in_open;
    my $last_node = $furthest_block;
    S7: {
      ## Step 1
      $node_i_in_open--;
      $node = $self->{open_elements}->[$node_i_in_open];

      ## Step 2
      ## If the node is not in the list of active formatting elements,
      ## drop it from the stack of open elements and try the next one.
      my $node_i_in_active;
      S7S2: {
        for (reverse 0..$#$active_formatting_elements) {
          if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
            !!!cp ('t63');
            $node_i_in_active = $_;
            last S7S2;
          }
        }
        splice @{$self->{open_elements}}, $node_i_in_open, 1;
        redo S7;
      } # S7S2

      ## Step 3
      last S7 if $node->[0] eq $formatting_element->[0];

      ## Step 4
      if ($last_node->[0] eq $furthest_block->[0]) {
        !!!cp ('t64');
        $bookmark_prev_el = $node->[0];
      }

      ## Step 5
      if ($node->[0]->has_child_nodes ()) {
        !!!cp ('t65');
        my $clone = [$node->[0]->clone_node (0), $node->[1]];
        $active_formatting_elements->[$node_i_in_active] = $clone;
        $self->{open_elements}->[$node_i_in_open] = $clone;
        $node = $clone;
      }

      ## Step 6
      $node->[0]->append_child ($last_node->[0]);

      ## Step 7
      $last_node = $node;

      ## Step 8
      redo S7;
    } # S7

    ## Step 8
    if ($common_ancestor_node->[1] & TABLE_ROWS_EL) {
      ## Foster parenting: insert before the last table in the stack
      ## of open elements if its parent is an element, otherwise
      ## append to the element before that table in the stack.
      my $foster_parent_element;
      my $next_sibling;
      OE: for (reverse 0..$#{$self->{open_elements}}) {
        if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
          my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
          if (defined $parent and $parent->node_type == 1) {
            !!!cp ('t65.1');
            $foster_parent_element = $parent;
            $next_sibling = $self->{open_elements}->[$_]->[0];
          } else {
            !!!cp ('t65.2');
            $foster_parent_element
                = $self->{open_elements}->[$_ - 1]->[0];
          }
          last OE;
        }
      } # OE
      $foster_parent_element = $self->{open_elements}->[0]->[0]
          unless defined $foster_parent_element;
      $foster_parent_element->insert_before ($last_node->[0], $next_sibling);
      $open_tables->[-1]->[1] = 1; # tainted
    } else {
      !!!cp ('t65.3');
      $common_ancestor_node->[0]->append_child ($last_node->[0]);
    }

    ## Step 9
    my $clone = [$formatting_element->[0]->clone_node (0),
                 $formatting_element->[1]];

    ## Step 10
    ## Copy the child list first, since it is modified while the
    ## children are being moved.
    my @cn = @{$furthest_block->[0]->child_nodes};
    $clone->[0]->append_child ($_) for @cn;

    ## Step 11
    $furthest_block->[0]->append_child ($clone->[0]);

    ## Step 12
    ## Remove the formatting element from the list and insert the
    ## clone just after the bookmarked entry.  If the removed entry
    ## precedes the bookmarked one, the bookmark index shifts by one.
    my $i;
    AFE: for (reverse 0..$#$active_formatting_elements) {
      if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t66');
        splice @$active_formatting_elements, $_, 1;
        $i-- and last AFE if defined $i;
      } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) {
        !!!cp ('t67');
        $i = $_;
      }
    } # AFE
    splice @$active_formatting_elements, $i + 1, 0, $clone;

    ## Step 13
    ## Likewise for the stack of open elements: remove the formatting
    ## element and insert the clone immediately after the furthest
    ## block.
    undef $i;
    OE: for (reverse 0..$#{$self->{open_elements}}) {
      if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) {
        !!!cp ('t68');
        splice @{$self->{open_elements}}, $_, 1;
        $i-- and last OE if defined $i;
      } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) {
        !!!cp ('t69');
        $i = $_;
      }
    } # OE
    ## NOTE: The length argument is 0: the clone is inserted, and no
    ## existing entry after the furthest block is replaced.
    splice @{$self->{open_elements}}, $i + 1, 0, $clone;

    ## Step 14
    redo FET;
  } # FET
}; # $formatting_end_tag
4326    
## Default insertion routine: appends the given node to the current
## node, i.e. the node of the last entry in the stack of open elements.
## The same code reference is also stored in |$insert|.
$insert = my $insert_to_current = sub {
  my $current_node = $self->{open_elements}->[-1]->[0];
  $current_node->append_child ($_[0]);
}; # $insert_to_current
4330    
## Insertion routine with foster parenting.
##
## Argument: the node to insert.
##
## Unless the current node is a |TABLE_ROWS_EL| element, the node is
## simply appended to the current node.  Otherwise it is inserted
## before the last table in the stack of open elements (when that
## table's parent is an element), or appended to the element that
## precedes the table in the stack, or to the first element of the
## stack when there is no table at all; the current entry of
## |$open_tables| is then flagged as tainted.
my $insert_to_foster = sub {
  my $child = shift;
  my $stack = $self->{open_elements};

  unless ($stack->[-1]->[1] & TABLE_ROWS_EL) {
    !!!cp ('t72');
    return $stack->[-1]->[0]->append_child ($child);
  }

  # MUST
  my ($foster_parent_element, $next_sibling);
  TABLE: for my $index (reverse 0..$#$stack) {
    next TABLE unless $stack->[$index]->[1] & TABLE_EL;

    my $table_el = $stack->[$index]->[0];
    my $parent = $table_el->parent_node;
    if (defined $parent and $parent->node_type == 1) {
      !!!cp ('t70');
      $foster_parent_element = $parent;
      $next_sibling = $table_el;
    } else {
      !!!cp ('t71');
      $foster_parent_element = $stack->[$index - 1]->[0];
    }
    last TABLE;
  } # TABLE

  unless (defined $foster_parent_element) {
    $foster_parent_element = $stack->[0]->[0];
  }
  $foster_parent_element->insert_before ($child, $next_sibling);
  $open_tables->[-1]->[1] = 1; # tainted
}; # $insert_to_foster
4362    
4363 wakaba 1.126 B: while (1) {
4364 wakaba 1.55 if ($token->{type} == DOCTYPE_TOKEN) {
4365 wakaba 1.79 !!!cp ('t73');
4366 wakaba 1.153 !!!parse-error (type => 'in html:#DOCTYPE', token => $token);
4367 wakaba 1.52 ## Ignore the token
4368     ## Stay in the phase
4369     !!!next-token;
4370 wakaba 1.126 next B;
4371 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN and
4372 wakaba 1.52 $token->{tag_name} eq 'html') {
4373 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
4374 wakaba 1.79 !!!cp ('t79');
4375 wakaba 1.153 !!!parse-error (type => 'after html', text => 'html', token => $token);
4376 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
4377     } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
4378 wakaba 1.79 !!!cp ('t80');
4379 wakaba 1.153 !!!parse-error (type => 'after html', text => 'html', token => $token);
4380 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
4381 wakaba 1.79 } else {
4382     !!!cp ('t81');
4383 wakaba 1.52 }
4384    
4385 wakaba 1.84 !!!cp ('t82');
4386 wakaba 1.113 !!!parse-error (type => 'not first start tag', token => $token);
4387 wakaba 1.52 my $top_el = $self->{open_elements}->[0]->[0];
4388     for my $attr_name (keys %{$token->{attributes}}) {
4389     unless ($top_el->has_attribute_ns (undef, $attr_name)) {
4390 wakaba 1.79 !!!cp ('t84');
4391 wakaba 1.52 $top_el->set_attribute_ns
4392     (undef, [undef, $attr_name],
4393     $token->{attributes}->{$attr_name}->{value});
4394     }
4395     }
4396 wakaba 1.125 !!!nack ('t84.1');
4397 wakaba 1.52 !!!next-token;
4398 wakaba 1.126 next B;
4399 wakaba 1.55 } elsif ($token->{type} == COMMENT_TOKEN) {
4400 wakaba 1.52 my $comment = $self->{document}->create_comment ($token->{data});
4401 wakaba 1.56 if ($self->{insertion_mode} & AFTER_HTML_IMS) {
4402 wakaba 1.79 !!!cp ('t85');
4403 wakaba 1.52 $self->{document}->append_child ($comment);
4404 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
4405 wakaba 1.79 !!!cp ('t86');
4406 wakaba 1.52 $self->{open_elements}->[0]->[0]->append_child ($comment);
4407     } else {
4408 wakaba 1.79 !!!cp ('t87');
4409 wakaba 1.52 $self->{open_elements}->[-1]->[0]->append_child ($comment);
4410     }
4411     !!!next-token;
4412 wakaba 1.126 next B;
4413     } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
4414     if ($token->{type} == CHARACTER_TOKEN) {
4415     !!!cp ('t87.1');
4416     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4417     !!!next-token;
4418     next B;
4419     } elsif ($token->{type} == START_TAG_TOKEN) {
4420 wakaba 1.129 if ((not {mglyph => 1, malignmark => 1}->{$token->{tag_name}} and
4421     $self->{open_elements}->[-1]->[1] & FOREIGN_FLOW_CONTENT_EL) or
4422 wakaba 1.126 not ($self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
4423     ($token->{tag_name} eq 'svg' and
4424     $self->{open_elements}->[-1]->[1] & MML_AXML_EL)) {
4425     ## NOTE: "using the rules for secondary insertion mode"then"continue"
4426     !!!cp ('t87.2');
4427     #
4428     } elsif ({
4429 wakaba 1.130 b => 1, big => 1, blockquote => 1, body => 1, br => 1,
4430 wakaba 1.146 center => 1, code => 1, dd => 1, div => 1, dl => 1, dt => 1,
4431     em => 1, embed => 1, font => 1, h1 => 1, h2 => 1, h3 => 1,
4432     h4 => 1, h5 => 1, h6 => 1, head => 1, hr => 1, i => 1,
4433     img => 1, li => 1, listing => 1, menu => 1, meta => 1,
4434     nobr => 1, ol => 1, p => 1, pre => 1, ruby => 1, s => 1,
4435     small => 1, span => 1, strong => 1, strike => 1, sub => 1,
4436     sup => 1, table => 1, tt => 1, u => 1, ul => 1, var => 1,
4437 wakaba 1.126 }->{$token->{tag_name}}) {
4438     !!!cp ('t87.2');
4439     !!!parse-error (type => 'not closed',
4440 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
4441 wakaba 1.126 ->manakai_local_name,
4442     token => $token);
4443    
4444     pop @{$self->{open_elements}}
4445     while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4446    
4447 wakaba 1.130 $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4448 wakaba 1.126 ## Reprocess.
4449     next B;
4450     } else {
4451 wakaba 1.131 my $nsuri = $self->{open_elements}->[-1]->[0]->namespace_uri;
4452     my $tag_name = $token->{tag_name};
4453     if ($nsuri eq $SVG_NS) {
4454     $tag_name = {
4455     altglyph => 'altGlyph',
4456     altglyphdef => 'altGlyphDef',
4457     altglyphitem => 'altGlyphItem',
4458     animatecolor => 'animateColor',
4459     animatemotion => 'animateMotion',
4460     animatetransform => 'animateTransform',
4461     clippath => 'clipPath',
4462     feblend => 'feBlend',
4463     fecolormatrix => 'feColorMatrix',
4464     fecomponenttransfer => 'feComponentTransfer',
4465     fecomposite => 'feComposite',
4466     feconvolvematrix => 'feConvolveMatrix',
4467     fediffuselighting => 'feDiffuseLighting',
4468     fedisplacementmap => 'feDisplacementMap',
4469     fedistantlight => 'feDistantLight',
4470     feflood => 'feFlood',
4471     fefunca => 'feFuncA',
4472     fefuncb => 'feFuncB',
4473     fefuncg => 'feFuncG',
4474     fefuncr => 'feFuncR',
4475     fegaussianblur => 'feGaussianBlur',
4476     feimage => 'feImage',
4477     femerge => 'feMerge',
4478     femergenode => 'feMergeNode',
4479     femorphology => 'feMorphology',
4480     feoffset => 'feOffset',
4481     fepointlight => 'fePointLight',
4482     fespecularlighting => 'feSpecularLighting',
4483     fespotlight => 'feSpotLight',
4484     fetile => 'feTile',
4485     feturbulence => 'feTurbulence',
4486     foreignobject => 'foreignObject',
4487     glyphref => 'glyphRef',
4488     lineargradient => 'linearGradient',
4489     radialgradient => 'radialGradient',
4490     #solidcolor => 'solidColor', ## NOTE: Commented in spec (SVG1.2)
4491     textpath => 'textPath',
4492     }->{$tag_name} || $tag_name;
4493     }
4494    
4495     ## "adjust SVG attributes" (SVG only) - done in insert-element-f
4496    
4497     ## "adjust foreign attributes" - done in insert-element-f
4498 wakaba 1.126
4499 wakaba 1.131 !!!insert-element-f ($nsuri, $tag_name, $token->{attributes}, $token);
4500 wakaba 1.126
4501     if ($self->{self_closing}) {
4502     pop @{$self->{open_elements}};
4503     !!!ack ('t87.3');
4504     } else {
4505     !!!cp ('t87.4');
4506     }
4507    
4508     !!!next-token;
4509     next B;
4510     }
4511     } elsif ($token->{type} == END_TAG_TOKEN) {
4512     ## NOTE: "using the rules for secondary insertion mode" then "continue"
4513     !!!cp ('t87.5');
4514     #
4515     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
4516     !!!cp ('t87.6');
4517 wakaba 1.146 !!!parse-error (type => 'not closed',
4518 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
4519 wakaba 1.146 ->manakai_local_name,
4520     token => $token);
4521    
4522     pop @{$self->{open_elements}}
4523     while $self->{open_elements}->[-1]->[1] & FOREIGN_EL;
4524    
4525     $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
4526     ## Reprocess.
4527     next B;
4528 wakaba 1.126 } else {
4529     die "$0: $token->{type}: Unknown token type";
4530     }
4531     }
4532    
4533     if ($self->{insertion_mode} & HEAD_IMS) {
4534 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
4535 wakaba 1.188 if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
4536 wakaba 1.99 unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4537     !!!cp ('t88.2');
4538     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4539 wakaba 1.177 #
4540 wakaba 1.99 } else {
4541     !!!cp ('t88.1');
4542     ## Ignore the token.
4543 wakaba 1.177 #
4544 wakaba 1.99 }
4545 wakaba 1.52 unless (length $token->{data}) {
4546 wakaba 1.79 !!!cp ('t88');
4547 wakaba 1.52 !!!next-token;
4548 wakaba 1.126 next B;
4549 wakaba 1.1 }
4550 wakaba 1.177 ## TODO: set $token->{column} appropriately
4551 wakaba 1.1 }
4552 wakaba 1.52
4553 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4554 wakaba 1.79 !!!cp ('t89');
4555 wakaba 1.52 ## As if <head>
4556 wakaba 1.126 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4557 wakaba 1.52 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4558 wakaba 1.123 push @{$self->{open_elements}},
4559     [$self->{head_element}, $el_category->{head}];
4560 wakaba 1.52
4561     ## Reprocess in the "in head" insertion mode...
4562     pop @{$self->{open_elements}};
4563    
4564     ## Reprocess in the "after head" insertion mode...
4565 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4566 wakaba 1.79 !!!cp ('t90');
4567 wakaba 1.52 ## As if </noscript>
4568     pop @{$self->{open_elements}};
4569 wakaba 1.153 !!!parse-error (type => 'in noscript:#text', token => $token);
4570 wakaba 1.1
4571 wakaba 1.52 ## Reprocess in the "in head" insertion mode...
4572     ## As if </head>
4573     pop @{$self->{open_elements}};
4574    
4575     ## Reprocess in the "after head" insertion mode...
4576 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4577 wakaba 1.79 !!!cp ('t91');
4578 wakaba 1.52 pop @{$self->{open_elements}};
4579    
4580     ## Reprocess in the "after head" insertion mode...
4581 wakaba 1.79 } else {
4582     !!!cp ('t92');
4583 wakaba 1.1 }
4584 wakaba 1.52
4585 wakaba 1.123 ## "after head" insertion mode
4586     ## As if <body>
4587     !!!insert-element ('body',, $token);
4588     $self->{insertion_mode} = IN_BODY_IM;
4589     ## reprocess
4590 wakaba 1.126 next B;
4591 wakaba 1.123 } elsif ($token->{type} == START_TAG_TOKEN) {
4592     if ($token->{tag_name} eq 'head') {
4593     if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4594     !!!cp ('t93');
4595 wakaba 1.126 !!!create-element ($self->{head_element}, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
4596 wakaba 1.123 $self->{open_elements}->[-1]->[0]->append_child
4597     ($self->{head_element});
4598     push @{$self->{open_elements}},
4599     [$self->{head_element}, $el_category->{head}];
4600     $self->{insertion_mode} = IN_HEAD_IM;
4601 wakaba 1.125 !!!nack ('t93.1');
4602 wakaba 1.123 !!!next-token;
4603 wakaba 1.126 next B;
4604 wakaba 1.125 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4605 wakaba 1.139 !!!cp ('t93.2');
4606 wakaba 1.153 !!!parse-error (type => 'after head', text => 'head',
4607     token => $token);
4608 wakaba 1.139 ## Ignore the token
4609     !!!nack ('t93.3');
4610     !!!next-token;
4611     next B;
4612 wakaba 1.125 } else {
4613     !!!cp ('t95');
4614 wakaba 1.153 !!!parse-error (type => 'in head:head',
4615     token => $token); # or in head noscript
4616 wakaba 1.125 ## Ignore the token
4617     !!!nack ('t95.1');
4618     !!!next-token;
4619 wakaba 1.126 next B;
4620 wakaba 1.125 }
4621     } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4622 wakaba 1.126 !!!cp ('t96');
4623     ## As if <head>
4624     !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4625     $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4626     push @{$self->{open_elements}},
4627     [$self->{head_element}, $el_category->{head}];
4628 wakaba 1.52
4629 wakaba 1.126 $self->{insertion_mode} = IN_HEAD_IM;
4630     ## Reprocess in the "in head" insertion mode...
4631     } else {
4632     !!!cp ('t97');
4633     }
4634 wakaba 1.52
4635 wakaba 1.49 if ($token->{tag_name} eq 'base') {
4636 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4637 wakaba 1.79 !!!cp ('t98');
4638 wakaba 1.49 ## As if </noscript>
4639     pop @{$self->{open_elements}};
4640 wakaba 1.153 !!!parse-error (type => 'in noscript', text => 'base',
4641     token => $token);
4642 wakaba 1.49
4643 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
4644 wakaba 1.49 ## Reprocess in the "in head" insertion mode...
4645 wakaba 1.79 } else {
4646     !!!cp ('t99');
4647 wakaba 1.49 }
4648    
4649     ## NOTE: There is a "as if in head" code clone.
4650 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4651 wakaba 1.79 !!!cp ('t100');
4652 wakaba 1.153 !!!parse-error (type => 'after head',
4653     text => $token->{tag_name}, token => $token);
4654 wakaba 1.123 push @{$self->{open_elements}},
4655     [$self->{head_element}, $el_category->{head}];
4656 wakaba 1.79 } else {
4657     !!!cp ('t101');
4658 wakaba 1.49 }
4659 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4660 wakaba 1.194 pop @{$self->{open_elements}};
4661 wakaba 1.100 pop @{$self->{open_elements}} # <head>
4662 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
4663 wakaba 1.125 !!!nack ('t101.1');
4664 wakaba 1.49 !!!next-token;
4665 wakaba 1.126 next B;
4666 wakaba 1.194 } elsif ($token->{tag_name} eq 'link') {
4667     ## NOTE: There is a "as if in head" code clone.
4668     if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4669     !!!cp ('t102');
4670     !!!parse-error (type => 'after head',
4671     text => $token->{tag_name}, token => $token);
4672     push @{$self->{open_elements}},
4673     [$self->{head_element}, $el_category->{head}];
4674     } else {
4675     !!!cp ('t103');
4676     }
4677     !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4678     pop @{$self->{open_elements}};
4679     pop @{$self->{open_elements}} # <head>
4680     if $self->{insertion_mode} == AFTER_HEAD_IM;
4681     !!!ack ('t103.1');
4682     !!!next-token;
4683     next B;
4684     } elsif ($token->{tag_name} eq 'command' or
4685     $token->{tag_name} eq 'eventsource') {
4686     if ($self->{insertion_mode} == IN_HEAD_IM) {
4687     ## NOTE: If the insertion mode at the time of the emission
4688     ## of the token was "before head", $self->{insertion_mode}
4689     ## is already changed to |IN_HEAD_IM|.
4690    
4691     ## NOTE: There is a "as if in head" code clone.
4692     !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4693     pop @{$self->{open_elements}};
4694     pop @{$self->{open_elements}} # <head>
4695     if $self->{insertion_mode} == AFTER_HEAD_IM;
4696     !!!ack ('t103.2');
4697     !!!next-token;
4698     next B;
4699     } else {
4700     ## NOTE: "in head noscript" or "after head" insertion mode
4701     ## - in these cases, these tags are treated as same as
4702     ## normal in-body tags.
4703     !!!cp ('t103.3');
4704     #
4705     }
4706 wakaba 1.34 } elsif ($token->{tag_name} eq 'meta') {
4707     ## NOTE: There is a "as if in head" code clone.
4708 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4709 wakaba 1.79 !!!cp ('t104');
4710 wakaba 1.153 !!!parse-error (type => 'after head',
4711     text => $token->{tag_name}, token => $token);
4712 wakaba 1.123 push @{$self->{open_elements}},
4713     [$self->{head_element}, $el_category->{head}];
4714 wakaba 1.79 } else {
4715     !!!cp ('t105');
4716 wakaba 1.34 }
4717 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4718 wakaba 1.66 my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4719 wakaba 1.34
4720     unless ($self->{confident}) {
4721 wakaba 1.134 if ($token->{attributes}->{charset}) {
4722 wakaba 1.79 !!!cp ('t106');
4723 wakaba 1.134 ## NOTE: Whether the encoding is supported or not is handled
4724     ## in the {change_encoding} callback.
4725 wakaba 1.63 $self->{change_encoding}
4726 wakaba 1.114 ->($self, $token->{attributes}->{charset}->{value},
4727     $token);
4728 wakaba 1.66
4729     $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4730     ->set_user_data (manakai_has_reference =>
4731     $token->{attributes}->{charset}
4732     ->{has_reference});
4733 wakaba 1.63 } elsif ($token->{attributes}->{content}) {
4734     if ($token->{attributes}->{content}->{value}
4735 wakaba 1.144 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4736 wakaba 1.186 [\x09\x0A\x0C\x0D\x20]*=
4737     [\x09\x0A\x0C\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4738     ([^"'\x09\x0A\x0C\x0D\x20]
4739     [^\x09\x0A\x0C\x0D\x20\x3B]*))/x) {
4740 wakaba 1.79 !!!cp ('t107');
4741 wakaba 1.134 ## NOTE: Whether the encoding is supported or not is handled
4742     ## in the {change_encoding} callback.
4743 wakaba 1.63 $self->{change_encoding}
4744 wakaba 1.114 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
4745     $token);
4746 wakaba 1.68 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4747     ->set_user_data (manakai_has_reference =>
4748     $token->{attributes}->{content}
4749     ->{has_reference});
4750 wakaba 1.79 } else {
4751     !!!cp ('t108');
4752 wakaba 1.63 }
4753 wakaba 1.34 }
4754 wakaba 1.66 } else {
4755     if ($token->{attributes}->{charset}) {
4756 wakaba 1.79 !!!cp ('t109');
4757 wakaba 1.66 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4758     ->set_user_data (manakai_has_reference =>
4759     $token->{attributes}->{charset}
4760     ->{has_reference});
4761     }
4762 wakaba 1.68 if ($token->{attributes}->{content}) {
4763 wakaba 1.79 !!!cp ('t110');
4764 wakaba 1.68 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4765     ->set_user_data (manakai_has_reference =>
4766     $token->{attributes}->{content}
4767     ->{has_reference});
4768     }
4769 wakaba 1.34 }
4770    
4771 wakaba 1.100 pop @{$self->{open_elements}} # <head>
4772 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
4773 wakaba 1.125 !!!ack ('t110.1');
4774 wakaba 1.34 !!!next-token;
4775 wakaba 1.126 next B;
4776 wakaba 1.49 } elsif ($token->{tag_name} eq 'title') {
4777 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4778 wakaba 1.79 !!!cp ('t111');
4779 wakaba 1.49 ## As if </noscript>
4780     pop @{$self->{open_elements}};
4781 wakaba 1.153 !!!parse-error (type => 'in noscript', text => 'title',
4782     token => $token);
4783 wakaba 1.49
4784 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
4785 wakaba 1.49 ## Reprocess in the "in head" insertion mode...
4786 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4787 wakaba 1.79 !!!cp ('t112');
4788 wakaba 1.153 !!!parse-error (type => 'after head',
4789     text => $token->{tag_name}, token => $token);
4790 wakaba 1.123 push @{$self->{open_elements}},
4791     [$self->{head_element}, $el_category->{head}];
4792 wakaba 1.79 } else {
4793     !!!cp ('t113');
4794 wakaba 1.25 }
4795 wakaba 1.49
4796     ## NOTE: There is a "as if in head" code clone.
4797 wakaba 1.31 my $parent = defined $self->{head_element} ? $self->{head_element}
4798     : $self->{open_elements}->[-1]->[0];
4799 wakaba 1.96 $parse_rcdata->(RCDATA_CONTENT_MODEL);
4800 wakaba 1.100 pop @{$self->{open_elements}} # <head>
4801 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
4802 wakaba 1.126 next B;
4803 wakaba 1.148 } elsif ($token->{tag_name} eq 'style' or
4804     $token->{tag_name} eq 'noframes') {
4805 wakaba 1.25 ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
4806 wakaba 1.54 ## insertion mode IN_HEAD_IM)
4807 wakaba 1.25 ## NOTE: There is a "as if in head" code clone.
4808 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4809 wakaba 1.79 !!!cp ('t114');
4810 wakaba 1.153 !!!parse-error (type => 'after head',
4811     text => $token->{tag_name}, token => $token);
4812 wakaba 1.123 push @{$self->{open_elements}},
4813     [$self->{head_element}, $el_category->{head}];
4814 wakaba 1.79 } else {
4815     !!!cp ('t115');
4816 wakaba 1.25 }
4817 wakaba 1.96 $parse_rcdata->(CDATA_CONTENT_MODEL);
4818 wakaba 1.100 pop @{$self->{open_elements}} # <head>
4819 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
4820 wakaba 1.126 next B;
4821 wakaba 1.25 } elsif ($token->{tag_name} eq 'noscript') {
4822 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_IM) {
4823 wakaba 1.79 !!!cp ('t116');
4824 wakaba 1.25 ## NOTE: and scripting is disalbed
4825 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4826 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM;
4827 wakaba 1.125 !!!nack ('t116.1');
4828 wakaba 1.1 !!!next-token;
4829 wakaba 1.126 next B;
4830 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4831 wakaba 1.79 !!!cp ('t117');
4832 wakaba 1.153 !!!parse-error (type => 'in noscript', text => 'noscript',
4833     token => $token);
4834 wakaba 1.1 ## Ignore the token
4835 wakaba 1.125 !!!nack ('t117.1');
4836 wakaba 1.41 !!!next-token;
4837 wakaba 1.126 next B;
4838 wakaba 1.1 } else {
4839 wakaba 1.79 !!!cp ('t118');
4840 wakaba 1.25 #
4841 wakaba 1.1 }
4842 wakaba 1.49 } elsif ($token->{tag_name} eq 'script') {
4843 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4844 wakaba 1.79 !!!cp ('t119');
4845 wakaba 1.49 ## As if </noscript>
4846     pop @{$self->{open_elements}};
4847 wakaba 1.153 !!!parse-error (type => 'in noscript', text => 'script',
4848     token => $token);
4849 wakaba 1.49
4850 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
4851 wakaba 1.49 ## Reprocess in the "in head" insertion mode...
4852 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4853 wakaba 1.79 !!!cp ('t120');
4854 wakaba 1.153 !!!parse-error (type => 'after head',
4855     text => $token->{tag_name}, token => $token);
4856 wakaba 1.123 push @{$self->{open_elements}},
4857     [$self->{head_element}, $el_category->{head}];
4858 wakaba 1.79 } else {
4859     !!!cp ('t121');
4860 wakaba 1.25 }
4861 wakaba 1.49
4862 wakaba 1.25 ## NOTE: There is a "as if in head" code clone.
4863 wakaba 1.100 $script_start_tag->();
4864     pop @{$self->{open_elements}} # <head>
4865 wakaba 1.54 if $self->{insertion_mode} == AFTER_HEAD_IM;
4866 wakaba 1.126 next B;
4867 wakaba 1.49 } elsif ($token->{tag_name} eq 'body' or
4868 wakaba 1.25 $token->{tag_name} eq 'frameset') {
4869 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4870 wakaba 1.79 !!!cp ('t122');
4871 wakaba 1.49 ## As if </noscript>
4872     pop @{$self->{open_elements}};
4873 wakaba 1.153 !!!parse-error (type => 'in noscript',
4874     text => $token->{tag_name}, token => $token);
4875 wakaba 1.49
4876     ## Reprocess in the "in head" insertion mode...
4877     ## As if </head>
4878     pop @{$self->{open_elements}};
4879    
4880     ## Reprocess in the "after head" insertion mode...
4881 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4882 wakaba 1.79 !!!cp ('t124');
4883 wakaba 1.49 pop @{$self->{open_elements}};
4884    
4885     ## Reprocess in the "after head" insertion mode...
4886 wakaba 1.79 } else {
4887     !!!cp ('t125');
4888 wakaba 1.49 }
4889    
4890     ## "after head" insertion mode
4891 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4892 wakaba 1.54 if ($token->{tag_name} eq 'body') {
4893 wakaba 1.79 !!!cp ('t126');
4894 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
4895     } elsif ($token->{tag_name} eq 'frameset') {
4896 wakaba 1.79 !!!cp ('t127');
4897 wakaba 1.54 $self->{insertion_mode} = IN_FRAMESET_IM;
4898     } else {
4899     die "$0: tag name: $self->{tag_name}";
4900     }
4901 wakaba 1.125 !!!nack ('t127.1');
4902 wakaba 1.1 !!!next-token;
4903 wakaba 1.126 next B;
4904 wakaba 1.1 } else {
4905 wakaba 1.79 !!!cp ('t128');
4906 wakaba 1.1 #
4907     }
4908 wakaba 1.49
4909 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4910 wakaba 1.79 !!!cp ('t129');
4911 wakaba 1.49 ## As if </noscript>
4912     pop @{$self->{open_elements}};
4913 wakaba 1.153 !!!parse-error (type => 'in noscript:/',
4914     text => $token->{tag_name}, token => $token);
4915 wakaba 1.49
4916     ## Reprocess in the "in head" insertion mode...
4917     ## As if </head>
4918 wakaba 1.25 pop @{$self->{open_elements}};
4919 wakaba 1.49
4920     ## Reprocess in the "after head" insertion mode...
4921 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4922 wakaba 1.79 !!!cp ('t130');
4923 wakaba 1.49 ## As if </head>
4924 wakaba 1.25 pop @{$self->{open_elements}};
4925 wakaba 1.49
4926     ## Reprocess in the "after head" insertion mode...
4927 wakaba 1.79 } else {
4928     !!!cp ('t131');
4929 wakaba 1.49 }
4930    
4931     ## "after head" insertion mode
4932     ## As if <body>
4933 wakaba 1.116 !!!insert-element ('body',, $token);
4934 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
4935 wakaba 1.49 ## reprocess
4936 wakaba 1.125 !!!ack-later;
4937 wakaba 1.126 next B;
4938 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
4939 wakaba 1.49 if ($token->{tag_name} eq 'head') {
4940 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4941 wakaba 1.79 !!!cp ('t132');
4942 wakaba 1.50 ## As if <head>
4943 wakaba 1.126 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
4944 wakaba 1.50 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
4945 wakaba 1.123 push @{$self->{open_elements}},
4946     [$self->{head_element}, $el_category->{head}];
4947 wakaba 1.50
4948     ## Reprocess in the "in head" insertion mode...
4949     pop @{$self->{open_elements}};
4950 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
4951 wakaba 1.50 !!!next-token;
4952 wakaba 1.126 next B;
4953 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4954 wakaba 1.79 !!!cp ('t133');
4955 wakaba 1.49 ## As if </noscript>
4956     pop @{$self->{open_elements}};
4957 wakaba 1.153 !!!parse-error (type => 'in noscript:/',
4958     text => 'head', token => $token);
4959 wakaba 1.49
4960     ## Reprocess in the "in head" insertion mode...
4961 wakaba 1.50 pop @{$self->{open_elements}};
4962 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
4963 wakaba 1.50 !!!next-token;
4964 wakaba 1.126 next B;
4965 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
4966 wakaba 1.79 !!!cp ('t134');
4967 wakaba 1.49 pop @{$self->{open_elements}};
4968 wakaba 1.54 $self->{insertion_mode} = AFTER_HEAD_IM;
4969 wakaba 1.49 !!!next-token;
4970 wakaba 1.126 next B;
4971 wakaba 1.139 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
4972     !!!cp ('t134.1');
4973 wakaba 1.153 !!!parse-error (type => 'unmatched end tag', text => 'head',
4974     token => $token);
4975 wakaba 1.139 ## Ignore the token
4976     !!!next-token;
4977     next B;
4978 wakaba 1.49 } else {
4979 wakaba 1.139 die "$0: $self->{insertion_mode}: Unknown insertion mode";
4980 wakaba 1.49 }
4981     } elsif ($token->{tag_name} eq 'noscript') {
4982 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
4983 wakaba 1.79 !!!cp ('t136');
4984 wakaba 1.49 pop @{$self->{open_elements}};
4985 wakaba 1.54 $self->{insertion_mode} = IN_HEAD_IM;
4986 wakaba 1.49 !!!next-token;
4987 wakaba 1.126 next B;
4988 wakaba 1.139 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM or
4989     $self->{insertion_mode} == AFTER_HEAD_IM) {
4990 wakaba 1.79 !!!cp ('t137');
4991 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
4992     text => 'noscript', token => $token);
4993 wakaba 1.50 ## Ignore the token ## ISSUE: An issue in the spec.
4994     !!!next-token;
4995 wakaba 1.126 next B;
4996 wakaba 1.49 } else {
4997 wakaba 1.79 !!!cp ('t138');
4998 wakaba 1.49 #
4999     }
5000     } elsif ({
5001 wakaba 1.31 body => 1, html => 1,
5002     }->{$token->{tag_name}}) {
5003 wakaba 1.139 if ($self->{insertion_mode} == BEFORE_HEAD_IM or
5004     $self->{insertion_mode} == IN_HEAD_IM or
5005     $self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
5006 wakaba 1.79 !!!cp ('t140');
5007 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5008     text => $token->{tag_name}, token => $token);
5009 wakaba 1.49 ## Ignore the token
5010     !!!next-token;
5011 wakaba 1.126 next B;
5012 wakaba 1.139 } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
5013     !!!cp ('t140.1');
5014 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5015     text => $token->{tag_name}, token => $token);
5016 wakaba 1.139 ## Ignore the token
5017     !!!next-token;
5018     next B;
5019 wakaba 1.79 } else {
5020 wakaba 1.139 die "$0: $self->{insertion_mode}: Unknown insertion mode";
5021 wakaba 1.49 }
5022 wakaba 1.139 } elsif ($token->{tag_name} eq 'p') {
5023     !!!cp ('t142');
5024 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5025     text => $token->{tag_name}, token => $token);
5026 wakaba 1.139 ## Ignore the token
5027     !!!next-token;
5028     next B;
5029     } elsif ($token->{tag_name} eq 'br') {
5030 wakaba 1.54 if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
5031 wakaba 1.139 !!!cp ('t142.2');
5032     ## (before head) as if <head>, (in head) as if </head>
5033 wakaba 1.126 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
5034 wakaba 1.50 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
5035 wakaba 1.139 $self->{insertion_mode} = AFTER_HEAD_IM;
5036    
5037     ## Reprocess in the "after head" insertion mode...
5038     } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
5039     !!!cp ('t143.2');
5040     ## As if </head>
5041     pop @{$self->{open_elements}};
5042     $self->{insertion_mode} = AFTER_HEAD_IM;
5043    
5044     ## Reprocess in the "after head" insertion mode...
5045     } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
5046     !!!cp ('t143.3');
5047     ## ISSUE: Two parse errors for <head><noscript></br>
5048 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5049     text => 'br', token => $token);
5050 wakaba 1.139 ## As if </noscript>
5051     pop @{$self->{open_elements}};
5052     $self->{insertion_mode} = IN_HEAD_IM;
5053 wakaba 1.50
5054     ## Reprocess in the "in head" insertion mode...
5055 wakaba 1.139 ## As if </head>
5056     pop @{$self->{open_elements}};
5057     $self->{insertion_mode} = AFTER_HEAD_IM;
5058    
5059     ## Reprocess in the "after head" insertion mode...
5060     } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
5061     !!!cp ('t143.4');
5062     #
5063 wakaba 1.79 } else {
5064 wakaba 1.139 die "$0: $self->{insertion_mode}: Unknown insertion mode";
5065 wakaba 1.50 }
5066    
5067 wakaba 1.139 ## ISSUE: does not agree with IE7 - it doesn't ignore </br>.
5068 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5069     text => 'br', token => $token);
5070 wakaba 1.139 ## Ignore the token
5071     !!!next-token;
5072     next B;
5073 wakaba 1.25 } else {
5074 wakaba 1.139 !!!cp ('t145');
5075 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5076     text => $token->{tag_name}, token => $token);
5077 wakaba 1.139 ## Ignore the token
5078     !!!next-token;
5079     next B;
5080 wakaba 1.49 }
5081    
5082 wakaba 1.54 if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
5083 wakaba 1.79 !!!cp ('t146');
5084 wakaba 1.49 ## As if </noscript>
5085     pop @{$self->{open_elements}};
5086 wakaba 1.153 !!!parse-error (type => 'in noscript:/',
5087     text => $token->{tag_name}, token => $token);
5088 wakaba 1.49
5089     ## Reprocess in the "in head" insertion mode...
5090     ## As if </head>
5091     pop @{$self->{open_elements}};
5092    
5093     ## Reprocess in the "after head" insertion mode...
5094 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
5095 wakaba 1.79 !!!cp ('t147');
5096 wakaba 1.49 ## As if </head>
5097     pop @{$self->{open_elements}};
5098    
5099     ## Reprocess in the "after head" insertion mode...
5100 wakaba 1.54 } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
5101 wakaba 1.82 ## ISSUE: This case cannot be reached?
5102 wakaba 1.79 !!!cp ('t148');
5103 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5104     text => $token->{tag_name}, token => $token);
5105 wakaba 1.50 ## Ignore the token ## ISSUE: An issue in the spec.
5106     !!!next-token;
5107 wakaba 1.126 next B;
5108 wakaba 1.79 } else {
5109     !!!cp ('t149');
5110 wakaba 1.1 }
5111    
5112 wakaba 1.49 ## "after head" insertion mode
5113     ## As if <body>
5114 wakaba 1.116 !!!insert-element ('body',, $token);
5115 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
5116 wakaba 1.52 ## reprocess
5117 wakaba 1.126 next B;
5118 wakaba 1.104 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5119     if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
5120     !!!cp ('t149.1');
5121    
5122     ## NOTE: As if <head>
5123 wakaba 1.126 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
5124 wakaba 1.104 $self->{open_elements}->[-1]->[0]->append_child
5125     ($self->{head_element});
5126 wakaba 1.123 #push @{$self->{open_elements}},
5127     # [$self->{head_element}, $el_category->{head}];
5128 wakaba 1.104 #$self->{insertion_mode} = IN_HEAD_IM;
5129     ## NOTE: Reprocess.
5130    
5131     ## NOTE: As if </head>
5132     #pop @{$self->{open_elements}};
5133     #$self->{insertion_mode} = AFTER_HEAD_IM;
5134     ## NOTE: Reprocess.
5135    
5136     #
5137     } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
5138     !!!cp ('t149.2');
5139    
5140     ## NOTE: As if </head>
5141     pop @{$self->{open_elements}};
5142     #$self->{insertion_mode} = AFTER_HEAD_IM;
5143     ## NOTE: Reprocess.
5144    
5145     #
5146     } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
5147     !!!cp ('t149.3');
5148    
5149 wakaba 1.113 !!!parse-error (type => 'in noscript:#eof', token => $token);
5150 wakaba 1.104
5151     ## As if </noscript>
5152     pop @{$self->{open_elements}};
5153     #$self->{insertion_mode} = IN_HEAD_IM;
5154     ## NOTE: Reprocess.
5155    
5156     ## NOTE: As if </head>
5157     pop @{$self->{open_elements}};
5158     #$self->{insertion_mode} = AFTER_HEAD_IM;
5159     ## NOTE: Reprocess.
5160    
5161     #
5162     } else {
5163     !!!cp ('t149.4');
5164     #
5165     }
5166    
5167     ## NOTE: As if <body>
5168 wakaba 1.116 !!!insert-element ('body',, $token);
5169 wakaba 1.104 $self->{insertion_mode} = IN_BODY_IM;
5170     ## NOTE: Reprocess.
5171 wakaba 1.126 next B;
5172 wakaba 1.104 } else {
5173     die "$0: $token->{type}: Unknown token type";
5174     }
5175 wakaba 1.52
5176     ## ISSUE: An issue in the spec.
5177 wakaba 1.56 } elsif ($self->{insertion_mode} & BODY_IMS) {
5178 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
5179 wakaba 1.79 !!!cp ('t150');
5180 wakaba 1.52 ## NOTE: There is a code clone of "character in body".
5181     $reconstruct_active_formatting_elements->($insert_to_current);
5182    
5183     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5184    
5185     !!!next-token;
5186 wakaba 1.126 next B;
5187 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
5188 wakaba 1.52 if ({
5189     caption => 1, col => 1, colgroup => 1, tbody => 1,
5190     td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
5191     }->{$token->{tag_name}}) {
5192 wakaba 1.54 if ($self->{insertion_mode} == IN_CELL_IM) {
5193 wakaba 1.52 ## have an element in table scope
5194 wakaba 1.108 for (reverse 0..$#{$self->{open_elements}}) {
5195 wakaba 1.52 my $node = $self->{open_elements}->[$_];
5196 wakaba 1.123 if ($node->[1] & TABLE_CELL_EL) {
5197 wakaba 1.79 !!!cp ('t151');
5198 wakaba 1.108
5199     ## Close the cell
5200 wakaba 1.125 !!!back-token; # <x>
5201 wakaba 1.122 $token = {type => END_TAG_TOKEN,
5202     tag_name => $node->[0]->manakai_local_name,
5203 wakaba 1.114 line => $token->{line},
5204     column => $token->{column}};
5205 wakaba 1.126 next B;
5206 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5207 wakaba 1.79 !!!cp ('t152');
5208 wakaba 1.108 ## ISSUE: This case can never be reached, maybe.
5209     last;
5210 wakaba 1.52 }
5211 wakaba 1.108 }
5212    
5213     !!!cp ('t153');
5214     !!!parse-error (type => 'start tag not allowed',
5215 wakaba 1.153 text => $token->{tag_name}, token => $token);
5216 wakaba 1.108 ## Ignore the token
5217 wakaba 1.125 !!!nack ('t153.1');
5218 wakaba 1.108 !!!next-token;
5219 wakaba 1.126 next B;
5220 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
5221 wakaba 1.153 !!!parse-error (type => 'not closed', text => 'caption',
5222     token => $token);
5223 wakaba 1.52
5224 wakaba 1.108 ## NOTE: As if </caption>.
5225 wakaba 1.52 ## have a table element in table scope
5226     my $i;
5227 wakaba 1.108 INSCOPE: {
5228     for (reverse 0..$#{$self->{open_elements}}) {
5229     my $node = $self->{open_elements}->[$_];
5230 wakaba 1.123 if ($node->[1] & CAPTION_EL) {
5231 wakaba 1.108 !!!cp ('t155');
5232     $i = $_;
5233     last INSCOPE;
5234 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5235 wakaba 1.108 !!!cp ('t156');
5236     last;
5237     }
5238 wakaba 1.52 }
5239 wakaba 1.108
5240     !!!cp ('t157');
5241     !!!parse-error (type => 'start tag not allowed',
5242 wakaba 1.153 text => $token->{tag_name}, token => $token);
5243 wakaba 1.108 ## Ignore the token
5244 wakaba 1.125 !!!nack ('t157.1');
5245 wakaba 1.108 !!!next-token;
5246 wakaba 1.126 next B;
5247 wakaba 1.52 } # INSCOPE
5248    
5249     ## generate implied end tags
5250 wakaba 1.123 while ($self->{open_elements}->[-1]->[1]
5251     & END_TAG_OPTIONAL_EL) {
5252 wakaba 1.79 !!!cp ('t158');
5253 wakaba 1.86 pop @{$self->{open_elements}};
5254 wakaba 1.52 }
5255    
5256 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5257 wakaba 1.79 !!!cp ('t159');
5258 wakaba 1.122 !!!parse-error (type => 'not closed',
5259 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
5260 wakaba 1.122 ->manakai_local_name,
5261     token => $token);
5262 wakaba 1.79 } else {
5263     !!!cp ('t160');
5264 wakaba 1.52 }
5265    
5266     splice @{$self->{open_elements}}, $i;
5267    
5268     $clear_up_to_marker->();
5269    
5270 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
5271 wakaba 1.52
5272     ## reprocess
5273 wakaba 1.125 !!!ack-later;
5274 wakaba 1.126 next B;
5275 wakaba 1.52 } else {
5276 wakaba 1.79 !!!cp ('t161');
5277 wakaba 1.52 #
5278     }
5279     } else {
5280 wakaba 1.79 !!!cp ('t162');
5281 wakaba 1.52 #
5282     }
5283 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
5284 wakaba 1.52 if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') {
5285 wakaba 1.54 if ($self->{insertion_mode} == IN_CELL_IM) {
5286 wakaba 1.43 ## have an element in table scope
5287 wakaba 1.52 my $i;
5288 wakaba 1.43 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5289     my $node = $self->{open_elements}->[$_];
5290 wakaba 1.123 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5291 wakaba 1.79 !!!cp ('t163');
5292 wakaba 1.52 $i = $_;
5293 wakaba 1.43 last INSCOPE;
5294 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5295 wakaba 1.79 !!!cp ('t164');
5296 wakaba 1.43 last INSCOPE;
5297     }
5298     } # INSCOPE
5299 wakaba 1.52 unless (defined $i) {
5300 wakaba 1.79 !!!cp ('t165');
5301 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5302     text => $token->{tag_name},
5303     token => $token);
5304 wakaba 1.43 ## Ignore the token
5305     !!!next-token;
5306 wakaba 1.126 next B;
5307 wakaba 1.43 }
5308    
5309 wakaba 1.52 ## generate implied end tags
5310 wakaba 1.123 while ($self->{open_elements}->[-1]->[1]
5311     & END_TAG_OPTIONAL_EL) {
5312 wakaba 1.79 !!!cp ('t166');
5313 wakaba 1.86 pop @{$self->{open_elements}};
5314 wakaba 1.52 }
5315 wakaba 1.86
5316 wakaba 1.123 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
5317     ne $token->{tag_name}) {
5318 wakaba 1.79 !!!cp ('t167');
5319 wakaba 1.122 !!!parse-error (type => 'not closed',
5320 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
5321 wakaba 1.122 ->manakai_local_name,
5322     token => $token);
5323 wakaba 1.79 } else {
5324     !!!cp ('t168');
5325 wakaba 1.52 }
5326    
5327     splice @{$self->{open_elements}}, $i;
5328    
5329     $clear_up_to_marker->();
5330    
5331 wakaba 1.54 $self->{insertion_mode} = IN_ROW_IM;
5332 wakaba 1.52
5333     !!!next-token;
5334 wakaba 1.126 next B;
5335 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_CAPTION_IM) {
5336 wakaba 1.79 !!!cp ('t169');
5337 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5338     text => $token->{tag_name}, token => $token);
5339 wakaba 1.52 ## Ignore the token
5340     !!!next-token;
5341 wakaba 1.126 next B;
5342 wakaba 1.52 } else {
5343 wakaba 1.79 !!!cp ('t170');
5344 wakaba 1.52 #
5345     }
5346     } elsif ($token->{tag_name} eq 'caption') {
5347 wakaba 1.54 if ($self->{insertion_mode} == IN_CAPTION_IM) {
5348 wakaba 1.43 ## have a table element in table scope
5349     my $i;
5350 wakaba 1.108 INSCOPE: {
5351     for (reverse 0..$#{$self->{open_elements}}) {
5352     my $node = $self->{open_elements}->[$_];
5353 wakaba 1.123 if ($node->[1] & CAPTION_EL) {
5354 wakaba 1.108 !!!cp ('t171');
5355     $i = $_;
5356     last INSCOPE;
5357 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5358 wakaba 1.108 !!!cp ('t172');
5359     last;
5360     }
5361 wakaba 1.43 }
5362 wakaba 1.108
5363     !!!cp ('t173');
5364     !!!parse-error (type => 'unmatched end tag',
5365 wakaba 1.153 text => $token->{tag_name}, token => $token);
5366 wakaba 1.108 ## Ignore the token
5367     !!!next-token;
5368 wakaba 1.126 next B;
5369 wakaba 1.43 } # INSCOPE
5370    
5371     ## generate implied end tags
5372 wakaba 1.123 while ($self->{open_elements}->[-1]->[1]
5373     & END_TAG_OPTIONAL_EL) {
5374 wakaba 1.79 !!!cp ('t174');
5375 wakaba 1.86 pop @{$self->{open_elements}};
5376 wakaba 1.43 }
5377 wakaba 1.52
5378 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5379 wakaba 1.79 !!!cp ('t175');
5380 wakaba 1.122 !!!parse-error (type => 'not closed',
5381 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
5382 wakaba 1.122 ->manakai_local_name,
5383     token => $token);
5384 wakaba 1.79 } else {
5385     !!!cp ('t176');
5386 wakaba 1.52 }
5387    
5388     splice @{$self->{open_elements}}, $i;
5389    
5390     $clear_up_to_marker->();
5391    
5392 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
5393 wakaba 1.52
5394     !!!next-token;
5395 wakaba 1.126 next B;
5396 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_CELL_IM) {
5397 wakaba 1.79 !!!cp ('t177');
5398 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5399     text => $token->{tag_name}, token => $token);
5400 wakaba 1.52 ## Ignore the token
5401     !!!next-token;
5402 wakaba 1.126 next B;
5403 wakaba 1.52 } else {
5404 wakaba 1.79 !!!cp ('t178');
5405 wakaba 1.52 #
5406     }
5407     } elsif ({
5408     table => 1, tbody => 1, tfoot => 1,
5409     thead => 1, tr => 1,
5410     }->{$token->{tag_name}} and
5411 wakaba 1.54 $self->{insertion_mode} == IN_CELL_IM) {
5412 wakaba 1.52 ## have an element in table scope
5413     my $i;
5414     my $tn;
5415 wakaba 1.108 INSCOPE: {
5416     for (reverse 0..$#{$self->{open_elements}}) {
5417     my $node = $self->{open_elements}->[$_];
5418 wakaba 1.123 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
5419 wakaba 1.108 !!!cp ('t179');
5420     $i = $_;
5421    
5422     ## Close the cell
5423 wakaba 1.125 !!!back-token; # </x>
5424 wakaba 1.114 $token = {type => END_TAG_TOKEN, tag_name => $tn,
5425     line => $token->{line},
5426     column => $token->{column}};
5427 wakaba 1.126 next B;
5428 wakaba 1.123 } elsif ($node->[1] & TABLE_CELL_EL) {
5429 wakaba 1.108 !!!cp ('t180');
5430 wakaba 1.123 $tn = $node->[0]->manakai_local_name;
5431 wakaba 1.108 ## NOTE: There is exactly one |td| or |th| element
5432     ## in scope in the stack of open elements by definition.
5433 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5434 wakaba 1.108 ## ISSUE: Can this be reached?
5435     !!!cp ('t181');
5436     last;
5437     }
5438 wakaba 1.52 }
5439 wakaba 1.108
5440 wakaba 1.79 !!!cp ('t182');
5441 wakaba 1.108 !!!parse-error (type => 'unmatched end tag',
5442 wakaba 1.153 text => $token->{tag_name}, token => $token);
5443 wakaba 1.52 ## Ignore the token
5444     !!!next-token;
5445 wakaba 1.126 next B;
5446 wakaba 1.108 } # INSCOPE
5447 wakaba 1.52 } elsif ($token->{tag_name} eq 'table' and
5448 wakaba 1.54 $self->{insertion_mode} == IN_CAPTION_IM) {
5449 wakaba 1.153 !!!parse-error (type => 'not closed', text => 'caption',
5450     token => $token);
5451 wakaba 1.52
5452     ## As if </caption>
5453     ## have a table element in table scope
5454     my $i;
5455     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5456     my $node = $self->{open_elements}->[$_];
5457 wakaba 1.123 if ($node->[1] & CAPTION_EL) {
5458 wakaba 1.79 !!!cp ('t184');
5459 wakaba 1.52 $i = $_;
5460     last INSCOPE;
5461 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5462 wakaba 1.79 !!!cp ('t185');
5463 wakaba 1.52 last INSCOPE;
5464     }
5465     } # INSCOPE
5466     unless (defined $i) {
5467 wakaba 1.79 !!!cp ('t186');
5468 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5469     text => 'caption', token => $token);
5470 wakaba 1.52 ## Ignore the token
5471     !!!next-token;
5472 wakaba 1.126 next B;
5473 wakaba 1.52 }
5474    
5475     ## generate implied end tags
5476 wakaba 1.123 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5477 wakaba 1.79 !!!cp ('t187');
5478 wakaba 1.86 pop @{$self->{open_elements}};
5479 wakaba 1.52 }
5480    
5481 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & CAPTION_EL) {
5482 wakaba 1.79 !!!cp ('t188');
5483 wakaba 1.122 !!!parse-error (type => 'not closed',
5484 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
5485 wakaba 1.122 ->manakai_local_name,
5486     token => $token);
5487 wakaba 1.79 } else {
5488     !!!cp ('t189');
5489 wakaba 1.52 }
5490    
5491     splice @{$self->{open_elements}}, $i;
5492    
5493     $clear_up_to_marker->();
5494    
5495 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
5496 wakaba 1.52
5497     ## reprocess
5498 wakaba 1.126 next B;
5499 wakaba 1.52 } elsif ({
5500     body => 1, col => 1, colgroup => 1, html => 1,
5501     }->{$token->{tag_name}}) {
5502 wakaba 1.56 if ($self->{insertion_mode} & BODY_TABLE_IMS) {
5503 wakaba 1.79 !!!cp ('t190');
5504 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5505     text => $token->{tag_name}, token => $token);
5506 wakaba 1.52 ## Ignore the token
5507     !!!next-token;
5508 wakaba 1.126 next B;
5509 wakaba 1.52 } else {
5510 wakaba 1.79 !!!cp ('t191');
5511 wakaba 1.52 #
5512     }
5513     } elsif ({
5514     tbody => 1, tfoot => 1,
5515     thead => 1, tr => 1,
5516     }->{$token->{tag_name}} and
5517 wakaba 1.54 $self->{insertion_mode} == IN_CAPTION_IM) {
5518 wakaba 1.79 !!!cp ('t192');
5519 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5520     text => $token->{tag_name}, token => $token);
5521 wakaba 1.52 ## Ignore the token
5522     !!!next-token;
5523 wakaba 1.126 next B;
5524 wakaba 1.52 } else {
5525 wakaba 1.79 !!!cp ('t193');
5526 wakaba 1.52 #
5527     }
5528 wakaba 1.104 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
5529     for my $entry (@{$self->{open_elements}}) {
5530 wakaba 1.123 unless ($entry->[1] & ALL_END_TAG_OPTIONAL_EL) {
5531 wakaba 1.104 !!!cp ('t75');
5532 wakaba 1.113 !!!parse-error (type => 'in body:#eof', token => $token);
5533 wakaba 1.104 last;
5534     }
5535     }
5536    
5537     ## Stop parsing.
5538     last B;
5539 wakaba 1.52 } else {
5540     die "$0: $token->{type}: Unknown token type";
5541     }
5542    
5543     $insert = $insert_to_current;
5544     #
5545 wakaba 1.56 } elsif ($self->{insertion_mode} & TABLE_IMS) {
5546 wakaba 1.58 if ($token->{type} == CHARACTER_TOKEN) {
5547 wakaba 1.95 if (not $open_tables->[-1]->[1] and # tainted
5548 wakaba 1.188 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
5549 wakaba 1.95 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5550 wakaba 1.52
5551 wakaba 1.95 unless (length $token->{data}) {
5552     !!!cp ('t194');
5553     !!!next-token;
5554 wakaba 1.126 next B;
5555 wakaba 1.95 } else {
5556     !!!cp ('t195');
5557     }
5558     }
5559 wakaba 1.52
5560 wakaba 1.153 !!!parse-error (type => 'in table:#text', token => $token);
5561 wakaba 1.52
5562     ## As if in body, but insert into foster parent element
5563     ## ISSUE: Spec says that "whenever a node would be inserted
5564     ## into the current node" while characters might not
5565     ## result in a new Text node.
5566     $reconstruct_active_formatting_elements->($insert_to_foster);
5567    
5568 wakaba 1.123 if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) {
5569 wakaba 1.52 # MUST
5570     my $foster_parent_element;
5571     my $next_sibling;
5572     my $prev_sibling;
5573     OE: for (reverse 0..$#{$self->{open_elements}}) {
5574 wakaba 1.123 if ($self->{open_elements}->[$_]->[1] & TABLE_EL) {
5575 wakaba 1.52 my $parent = $self->{open_elements}->[$_]->[0]->parent_node;
5576     if (defined $parent and $parent->node_type == 1) {
5577 wakaba 1.79 !!!cp ('t196');
5578 wakaba 1.52 $foster_parent_element = $parent;
5579     $next_sibling = $self->{open_elements}->[$_]->[0];
5580     $prev_sibling = $next_sibling->previous_sibling;
5581     } else {
5582 wakaba 1.79 !!!cp ('t197');
5583 wakaba 1.52 $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0];
5584     $prev_sibling = $foster_parent_element->last_child;
5585     }
5586     last OE;
5587     }
5588     } # OE
5589     $foster_parent_element = $self->{open_elements}->[0]->[0] and
5590     $prev_sibling = $foster_parent_element->last_child
5591     unless defined $foster_parent_element;
5592     if (defined $prev_sibling and
5593     $prev_sibling->node_type == 3) {
5594 wakaba 1.79 !!!cp ('t198');
5595 wakaba 1.52 $prev_sibling->manakai_append_text ($token->{data});
5596     } else {
5597 wakaba 1.79 !!!cp ('t199');
5598 wakaba 1.52 $foster_parent_element->insert_before
5599     ($self->{document}->create_text_node ($token->{data}),
5600     $next_sibling);
5601     }
5602 wakaba 1.95 $open_tables->[-1]->[1] = 1; # tainted
5603     } else {
5604     !!!cp ('t200');
5605     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
5606     }
5607 wakaba 1.52
5608 wakaba 1.95 !!!next-token;
5609 wakaba 1.126 next B;
5610 wakaba 1.58 } elsif ($token->{type} == START_TAG_TOKEN) {
5611 wakaba 1.153 if ({
5612     tr => ($self->{insertion_mode} != IN_ROW_IM),
5613     th => 1, td => 1,
5614     }->{$token->{tag_name}}) {
5615     if ($self->{insertion_mode} == IN_TABLE_IM) {
5616     ## Clear back to table context
5617     while (not ($self->{open_elements}->[-1]->[1]
5618     & TABLE_SCOPING_EL)) {
5619     !!!cp ('t201');
5620     pop @{$self->{open_elements}};
5621     }
5622    
5623     !!!insert-element ('tbody',, $token);
5624     $self->{insertion_mode} = IN_TABLE_BODY_IM;
5625     ## reprocess in the "in table body" insertion mode...
5626     }
5627    
5628     if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5629     unless ($token->{tag_name} eq 'tr') {
5630     !!!cp ('t202');
5631     !!!parse-error (type => 'missing start tag:tr', token => $token);
5632     }
5633 wakaba 1.43
5634 wakaba 1.153 ## Clear back to table body context
5635     while (not ($self->{open_elements}->[-1]->[1]
5636     & TABLE_ROWS_SCOPING_EL)) {
5637     !!!cp ('t203');
5638     ## ISSUE: Can this case be reached?
5639     pop @{$self->{open_elements}};
5640     }
5641 wakaba 1.43
5642 wakaba 1.54 $self->{insertion_mode} = IN_ROW_IM;
5643 wakaba 1.52 if ($token->{tag_name} eq 'tr') {
5644 wakaba 1.79 !!!cp ('t204');
5645 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5646 wakaba 1.125 !!!nack ('t204');
5647 wakaba 1.52 !!!next-token;
5648 wakaba 1.126 next B;
5649 wakaba 1.52 } else {
5650 wakaba 1.79 !!!cp ('t205');
5651 wakaba 1.116 !!!insert-element ('tr',, $token);
5652 wakaba 1.52 ## reprocess in the "in row" insertion mode
5653     }
5654 wakaba 1.79 } else {
5655     !!!cp ('t206');
5656 wakaba 1.52 }
5657    
5658     ## Clear back to table row context
5659 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
5660     & TABLE_ROW_SCOPING_EL)) {
5661 wakaba 1.79 !!!cp ('t207');
5662 wakaba 1.52 pop @{$self->{open_elements}};
5663 wakaba 1.43 }
5664 wakaba 1.52
5665 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5666 wakaba 1.54 $self->{insertion_mode} = IN_CELL_IM;
5667 wakaba 1.52
5668     push @$active_formatting_elements, ['#marker', ''];
5669    
5670 wakaba 1.125 !!!nack ('t207.1');
5671 wakaba 1.52 !!!next-token;
5672 wakaba 1.126 next B;
5673 wakaba 1.52 } elsif ({
5674     caption => 1, col => 1, colgroup => 1,
5675     tbody => 1, tfoot => 1, thead => 1,
5676 wakaba 1.54 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
5677 wakaba 1.52 }->{$token->{tag_name}}) {
5678 wakaba 1.54 if ($self->{insertion_mode} == IN_ROW_IM) {
5679 wakaba 1.52 ## As if </tr>
5680 wakaba 1.43 ## have an element in table scope
5681     my $i;
5682     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5683     my $node = $self->{open_elements}->[$_];
5684 wakaba 1.123 if ($node->[1] & TABLE_ROW_EL) {
5685 wakaba 1.79 !!!cp ('t208');
5686 wakaba 1.43 $i = $_;
5687     last INSCOPE;
5688 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5689 wakaba 1.79 !!!cp ('t209');
5690 wakaba 1.43 last INSCOPE;
5691     }
5692     } # INSCOPE
5693 wakaba 1.79 unless (defined $i) {
5694 wakaba 1.125 !!!cp ('t210');
5695 wakaba 1.83 ## TODO: This type is wrong; it is also misspelled ('unmacthed') below.
5696 wakaba 1.153 !!!parse-error (type => 'unmacthed end tag',
5697     text => $token->{tag_name}, token => $token);
5698 wakaba 1.52 ## Ignore the token
5699 wakaba 1.125 !!!nack ('t210.1');
5700 wakaba 1.52 !!!next-token;
5701 wakaba 1.126 next B;
5702 wakaba 1.43 }
5703    
5704 wakaba 1.52 ## Clear back to table row context
5705 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
5706     & TABLE_ROW_SCOPING_EL)) {
5707 wakaba 1.79 !!!cp ('t211');
5708 wakaba 1.83 ## ISSUE: Can this case be reached?
5709 wakaba 1.52 pop @{$self->{open_elements}};
5710 wakaba 1.1 }
5711 wakaba 1.43
5712 wakaba 1.52 pop @{$self->{open_elements}}; # tr
5713 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5714 wakaba 1.52 if ($token->{tag_name} eq 'tr') {
5715 wakaba 1.79 !!!cp ('t212');
5716 wakaba 1.52 ## reprocess
5717 wakaba 1.125 !!!ack-later;
5718 wakaba 1.126 next B;
5719 wakaba 1.52 } else {
5720 wakaba 1.79 !!!cp ('t213');
5721 wakaba 1.52 ## reprocess in the "in table body" insertion mode...
5722     }
5723 wakaba 1.1 }
5724 wakaba 1.52
5725 wakaba 1.54 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
5726 wakaba 1.52 ## have an element in table scope
5727 wakaba 1.43 my $i;
5728     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5729     my $node = $self->{open_elements}->[$_];
5730 wakaba 1.123 if ($node->[1] & TABLE_ROW_GROUP_EL) {
5731 wakaba 1.79 !!!cp ('t214');
5732 wakaba 1.43 $i = $_;
5733     last INSCOPE;
5734 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5735 wakaba 1.79 !!!cp ('t215');
5736 wakaba 1.43 last INSCOPE;
5737     }
5738     } # INSCOPE
5739 wakaba 1.52 unless (defined $i) {
5740 wakaba 1.79 !!!cp ('t216');
5741 wakaba 1.153 ## TODO: This error type is wrong.
5742     !!!parse-error (type => 'unmatched end tag',
5743     text => $token->{tag_name}, token => $token);
5744 wakaba 1.52 ## Ignore the token
5745 wakaba 1.125 !!!nack ('t216.1');
5746 wakaba 1.52 !!!next-token;
5747 wakaba 1.126 next B;
5748 wakaba 1.43 }
5749 wakaba 1.52
5750     ## Clear back to table body context
5751 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
5752     & TABLE_ROWS_SCOPING_EL)) {
5753 wakaba 1.79 !!!cp ('t217');
5754 wakaba 1.83 ## ISSUE: Can this state be reached?
5755 wakaba 1.52 pop @{$self->{open_elements}};
5756 wakaba 1.43 }
5757    
5758 wakaba 1.52 ## As if <{current node}>
5759     ## have an element in table scope
5760     ## true by definition
5761 wakaba 1.43
5762 wakaba 1.52 ## Clear back to table body context
5763     ## nop by definition
5764 wakaba 1.43
5765 wakaba 1.52 pop @{$self->{open_elements}};
5766 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
5767 wakaba 1.52 ## reprocess in "in table" insertion mode...
5768 wakaba 1.79 } else {
5769     !!!cp ('t218');
5770 wakaba 1.52 }
5771    
5772     if ($token->{tag_name} eq 'col') {
5773     ## Clear back to table context
5774 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
5775     & TABLE_SCOPING_EL)) {
5776 wakaba 1.79 !!!cp ('t219');
5777 wakaba 1.83 ## ISSUE: Can this state be reached?
5778 wakaba 1.52 pop @{$self->{open_elements}};
5779     }
5780 wakaba 1.43
5781 wakaba 1.116 !!!insert-element ('colgroup',, $token);
5782 wakaba 1.54 $self->{insertion_mode} = IN_COLUMN_GROUP_IM;
5783 wakaba 1.52 ## reprocess
5784 wakaba 1.125 !!!ack-later;
5785 wakaba 1.126 next B;
5786 wakaba 1.52 } elsif ({
5787     caption => 1,
5788     colgroup => 1,
5789     tbody => 1, tfoot => 1, thead => 1,
5790     }->{$token->{tag_name}}) {
5791     ## Clear back to table context
5792 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
5793     & TABLE_SCOPING_EL)) {
5794 wakaba 1.79 !!!cp ('t220');
5795 wakaba 1.83 ## ISSUE: Can this state be reached?
5796 wakaba 1.52 pop @{$self->{open_elements}};
5797 wakaba 1.1 }
5798 wakaba 1.52
5799     push @$active_formatting_elements, ['#marker', '']
5800     if $token->{tag_name} eq 'caption';
5801    
5802 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5803 wakaba 1.52 $self->{insertion_mode} = {
5804 wakaba 1.54 caption => IN_CAPTION_IM,
5805     colgroup => IN_COLUMN_GROUP_IM,
5806     tbody => IN_TABLE_BODY_IM,
5807     tfoot => IN_TABLE_BODY_IM,
5808     thead => IN_TABLE_BODY_IM,
5809 wakaba 1.52 }->{$token->{tag_name}};
5810 wakaba 1.1 !!!next-token;
5811 wakaba 1.125 !!!nack ('t220.1');
5812 wakaba 1.126 next B;
5813 wakaba 1.52 } else {
5814     die "$0: in table: <>: $token->{tag_name}";
5815 wakaba 1.1 }
5816 wakaba 1.52 } elsif ($token->{tag_name} eq 'table') {
5817 wakaba 1.122 !!!parse-error (type => 'not closed',
5818 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
5819 wakaba 1.122 ->manakai_local_name,
5820     token => $token);
5821 wakaba 1.1
5822 wakaba 1.52 ## As if </table>
5823 wakaba 1.1 ## have a table element in table scope
5824     my $i;
5825 wakaba 1.3 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5826     my $node = $self->{open_elements}->[$_];
5827 wakaba 1.123 if ($node->[1] & TABLE_EL) {
5828 wakaba 1.79 !!!cp ('t221');
5829 wakaba 1.1 $i = $_;
5830     last INSCOPE;
5831 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5832 wakaba 1.79 !!!cp ('t222');
5833 wakaba 1.1 last INSCOPE;
5834     }
5835     } # INSCOPE
5836     unless (defined $i) {
5837 wakaba 1.79 !!!cp ('t223');
5838 wakaba 1.83 ## TODO: The following is wrong, maybe.
5839 wakaba 1.153 !!!parse-error (type => 'unmatched end tag', text => 'table',
5840     token => $token);
5841 wakaba 1.52 ## Ignore tokens </table><table>
5842 wakaba 1.125 !!!nack ('t223.1');
5843 wakaba 1.1 !!!next-token;
5844 wakaba 1.126 next B;
5845 wakaba 1.1 }
5846    
5847 wakaba 1.151 ## TODO: The following steps have been removed from the latest spec.
5848 wakaba 1.1 ## generate implied end tags
5849 wakaba 1.123 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
5850 wakaba 1.79 !!!cp ('t224');
5851 wakaba 1.86 pop @{$self->{open_elements}};
5852 wakaba 1.1 }
5853    
5854 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & TABLE_EL) {
5855 wakaba 1.79 !!!cp ('t225');
5856 wakaba 1.122 ## NOTE: |<table><tr><table>|
5857     !!!parse-error (type => 'not closed',
5858 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
5859 wakaba 1.122 ->manakai_local_name,
5860     token => $token);
5861 wakaba 1.79 } else {
5862     !!!cp ('t226');
5863 wakaba 1.1 }
5864    
5865 wakaba 1.3 splice @{$self->{open_elements}}, $i;
5866 wakaba 1.95 pop @{$open_tables};
5867 wakaba 1.1
5868 wakaba 1.52 $self->_reset_insertion_mode;
5869 wakaba 1.1
5870 wakaba 1.125 ## reprocess
5871     !!!ack-later;
5872 wakaba 1.126 next B;
5873 wakaba 1.100 } elsif ($token->{tag_name} eq 'style') {
5874     if (not $open_tables->[-1]->[1]) { # tainted
5875     !!!cp ('t227.8');
5876     ## NOTE: This is a "as if in head" code clone.
5877     $parse_rcdata->(CDATA_CONTENT_MODEL);
5878 wakaba 1.126 next B;
5879 wakaba 1.100 } else {
5880     !!!cp ('t227.7');
5881     #
5882     }
5883     } elsif ($token->{tag_name} eq 'script') {
5884     if (not $open_tables->[-1]->[1]) { # tainted
5885     !!!cp ('t227.6');
5886     ## NOTE: This is a "as if in head" code clone.
5887     $script_start_tag->();
5888 wakaba 1.126 next B;
5889 wakaba 1.100 } else {
5890     !!!cp ('t227.5');
5891     #
5892     }
5893 wakaba 1.98 } elsif ($token->{tag_name} eq 'input') {
5894     if (not $open_tables->[-1]->[1]) { # tainted
5895     if ($token->{attributes}->{type}) { ## TODO: case
5896     my $type = lc $token->{attributes}->{type}->{value};
5897     if ($type eq 'hidden') {
5898     !!!cp ('t227.3');
5899 wakaba 1.153 !!!parse-error (type => 'in table',
5900     text => $token->{tag_name}, token => $token);
5901 wakaba 1.98
5902 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
5903 wakaba 1.98
5904     ## TODO: form element pointer
5905    
5906     pop @{$self->{open_elements}};
5907    
5908     !!!next-token;
5909 wakaba 1.125 !!!ack ('t227.2.1');
5910 wakaba 1.126 next B;
5911 wakaba 1.98 } else {
5912     !!!cp ('t227.2');
5913     #
5914     }
5915     } else {
5916     !!!cp ('t227.1');
5917     #
5918     }
5919     } else {
5920     !!!cp ('t227.4');
5921     #
5922     }
5923 wakaba 1.58 } else {
5924 wakaba 1.79 !!!cp ('t227');
5925 wakaba 1.58 #
5926     }
5927 wakaba 1.98
5928 wakaba 1.153 !!!parse-error (type => 'in table', text => $token->{tag_name},
5929     token => $token);
5930 wakaba 1.98
5931     $insert = $insert_to_foster;
5932     #
5933 wakaba 1.58 } elsif ($token->{type} == END_TAG_TOKEN) {
5934 wakaba 1.52 if ($token->{tag_name} eq 'tr' and
5935 wakaba 1.54 $self->{insertion_mode} == IN_ROW_IM) {
5936 wakaba 1.52 ## have an element in table scope
5937     my $i;
5938     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5939     my $node = $self->{open_elements}->[$_];
5940 wakaba 1.123 if ($node->[1] & TABLE_ROW_EL) {
5941 wakaba 1.79 !!!cp ('t228');
5942 wakaba 1.52 $i = $_;
5943     last INSCOPE;
5944 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5945 wakaba 1.79 !!!cp ('t229');
5946 wakaba 1.52 last INSCOPE;
5947     }
5948     } # INSCOPE
5949     unless (defined $i) {
5950 wakaba 1.79 !!!cp ('t230');
5951 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5952     text => $token->{tag_name}, token => $token);
5953 wakaba 1.52 ## Ignore the token
5954 wakaba 1.125 !!!nack ('t230.1');
5955 wakaba 1.42 !!!next-token;
5956 wakaba 1.126 next B;
5957 wakaba 1.79 } else {
5958     !!!cp ('t232');
5959 wakaba 1.42 }
5960    
5961 wakaba 1.52 ## Clear back to table row context
5962 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
5963     & TABLE_ROW_SCOPING_EL)) {
5964 wakaba 1.79 !!!cp ('t231');
5965 wakaba 1.83 ## ISSUE: Can this state be reached?
5966 wakaba 1.52 pop @{$self->{open_elements}};
5967     }
5968 wakaba 1.42
5969 wakaba 1.52 pop @{$self->{open_elements}}; # tr
5970 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
5971 wakaba 1.52 !!!next-token;
5972 wakaba 1.125 !!!nack ('t231.1');
5973 wakaba 1.126 next B;
5974 wakaba 1.52 } elsif ($token->{tag_name} eq 'table') {
5975 wakaba 1.54 if ($self->{insertion_mode} == IN_ROW_IM) {
5976 wakaba 1.52 ## As if </tr>
5977     ## have an element in table scope
5978     my $i;
5979     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
5980     my $node = $self->{open_elements}->[$_];
5981 wakaba 1.123 if ($node->[1] & TABLE_ROW_EL) {
5982 wakaba 1.79 !!!cp ('t233');
5983 wakaba 1.52 $i = $_;
5984     last INSCOPE;
5985 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
5986 wakaba 1.79 !!!cp ('t234');
5987 wakaba 1.52 last INSCOPE;
5988 wakaba 1.42 }
5989 wakaba 1.52 } # INSCOPE
5990     unless (defined $i) {
5991 wakaba 1.79 !!!cp ('t235');
5992 wakaba 1.83 ## TODO: The following is wrong.
5993 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
5994     text => $token->{type}, token => $token);
5995 wakaba 1.52 ## Ignore the token
5996 wakaba 1.125 !!!nack ('t236.1');
5997 wakaba 1.52 !!!next-token;
5998 wakaba 1.126 next B;
5999 wakaba 1.42 }
6000 wakaba 1.52
6001     ## Clear back to table row context
6002 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
6003     & TABLE_ROW_SCOPING_EL)) {
6004 wakaba 1.79 !!!cp ('t236');
6005 wakaba 1.83 ## ISSUE: Can this state be reached?
6006 wakaba 1.46 pop @{$self->{open_elements}};
6007 wakaba 1.1 }
6008 wakaba 1.46
6009 wakaba 1.52 pop @{$self->{open_elements}}; # tr
6010 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
6011 wakaba 1.46 ## reprocess in the "in table body" insertion mode...
6012 wakaba 1.1 }
6013    
6014 wakaba 1.54 if ($self->{insertion_mode} == IN_TABLE_BODY_IM) {
6015 wakaba 1.52 ## have an element in table scope
6016     my $i;
6017     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6018     my $node = $self->{open_elements}->[$_];
6019 wakaba 1.123 if ($node->[1] & TABLE_ROW_GROUP_EL) {
6020 wakaba 1.79 !!!cp ('t237');
6021 wakaba 1.52 $i = $_;
6022     last INSCOPE;
6023 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6024 wakaba 1.79 !!!cp ('t238');
6025 wakaba 1.52 last INSCOPE;
6026     }
6027     } # INSCOPE
6028     unless (defined $i) {
6029 wakaba 1.79 !!!cp ('t239');
6030 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6031     text => $token->{tag_name}, token => $token);
6032 wakaba 1.52 ## Ignore the token
6033 wakaba 1.125 !!!nack ('t239.1');
6034 wakaba 1.52 !!!next-token;
6035 wakaba 1.126 next B;
6036 wakaba 1.47 }
6037    
6038     ## Clear back to table body context
6039 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
6040     & TABLE_ROWS_SCOPING_EL)) {
6041 wakaba 1.79 !!!cp ('t240');
6042 wakaba 1.47 pop @{$self->{open_elements}};
6043     }
6044    
6045 wakaba 1.52 ## As if <{current node}>
6046     ## have an element in table scope
6047     ## true by definition
6048    
6049     ## Clear back to table body context
6050     ## nop by definition
6051    
6052     pop @{$self->{open_elements}};
6053 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
6054 wakaba 1.52 ## reprocess in the "in table" insertion mode...
6055     }
6056    
6057 wakaba 1.94 ## NOTE: </table> in the "in table" insertion mode.
6058     ## When you edit the code fragment below, please ensure that
6059     ## the code for <table> in the "in table" insertion mode
6060     ## is synced with it.
6061    
6062 wakaba 1.52 ## have a table element in table scope
6063     my $i;
6064     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6065     my $node = $self->{open_elements}->[$_];
6066 wakaba 1.123 if ($node->[1] & TABLE_EL) {
6067 wakaba 1.79 !!!cp ('t241');
6068 wakaba 1.52 $i = $_;
6069     last INSCOPE;
6070 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6071 wakaba 1.79 !!!cp ('t242');
6072 wakaba 1.52 last INSCOPE;
6073 wakaba 1.47 }
6074 wakaba 1.52 } # INSCOPE
6075     unless (defined $i) {
6076 wakaba 1.79 !!!cp ('t243');
6077 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6078     text => $token->{tag_name}, token => $token);
6079 wakaba 1.52 ## Ignore the token
6080 wakaba 1.125 !!!nack ('t243.1');
6081 wakaba 1.52 !!!next-token;
6082 wakaba 1.126 next B;
6083 wakaba 1.3 }
6084 wakaba 1.52
6085     splice @{$self->{open_elements}}, $i;
6086 wakaba 1.95 pop @{$open_tables};
6087 wakaba 1.1
6088 wakaba 1.52 $self->_reset_insertion_mode;
6089 wakaba 1.47
6090     !!!next-token;
6091 wakaba 1.126 next B;
6092 wakaba 1.47 } elsif ({
6093 wakaba 1.48 tbody => 1, tfoot => 1, thead => 1,
6094 wakaba 1.52 }->{$token->{tag_name}} and
6095 wakaba 1.56 $self->{insertion_mode} & ROW_IMS) {
6096 wakaba 1.54 if ($self->{insertion_mode} == IN_ROW_IM) {
6097 wakaba 1.52 ## have an element in table scope
6098     my $i;
6099     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6100     my $node = $self->{open_elements}->[$_];
6101 wakaba 1.123 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6102 wakaba 1.79 !!!cp ('t247');
6103 wakaba 1.52 $i = $_;
6104     last INSCOPE;
6105 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6106 wakaba 1.79 !!!cp ('t248');
6107 wakaba 1.52 last INSCOPE;
6108     }
6109     } # INSCOPE
6110     unless (defined $i) {
6111 wakaba 1.79 !!!cp ('t249');
6112 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6113     text => $token->{tag_name}, token => $token);
6114 wakaba 1.52 ## Ignore the token
6115 wakaba 1.125 !!!nack ('t249.1');
6116 wakaba 1.52 !!!next-token;
6117 wakaba 1.126 next B;
6118 wakaba 1.52 }
6119    
6120 wakaba 1.48 ## As if </tr>
6121     ## have an element in table scope
6122     my $i;
6123     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6124     my $node = $self->{open_elements}->[$_];
6125 wakaba 1.123 if ($node->[1] & TABLE_ROW_EL) {
6126 wakaba 1.79 !!!cp ('t250');
6127 wakaba 1.48 $i = $_;
6128     last INSCOPE;
6129 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6130 wakaba 1.79 !!!cp ('t251');
6131 wakaba 1.48 last INSCOPE;
6132     }
6133     } # INSCOPE
6134 wakaba 1.52 unless (defined $i) {
6135 wakaba 1.79 !!!cp ('t252');
6136 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6137     text => 'tr', token => $token);
6138 wakaba 1.52 ## Ignore the token
6139 wakaba 1.125 !!!nack ('t252.1');
6140 wakaba 1.52 !!!next-token;
6141 wakaba 1.126 next B;
6142 wakaba 1.52 }
6143 wakaba 1.48
6144     ## Clear back to table row context
6145 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
6146     & TABLE_ROW_SCOPING_EL)) {
6147 wakaba 1.79 !!!cp ('t253');
6148 wakaba 1.83 ## ISSUE: Can this case be reached?
6149 wakaba 1.48 pop @{$self->{open_elements}};
6150     }
6151    
6152     pop @{$self->{open_elements}}; # tr
6153 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_BODY_IM;
6154 wakaba 1.52 ## reprocess in the "in table body" insertion mode...
6155     }
6156    
6157     ## have an element in table scope
6158     my $i;
6159     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6160     my $node = $self->{open_elements}->[$_];
6161 wakaba 1.123 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6162 wakaba 1.79 !!!cp ('t254');
6163 wakaba 1.52 $i = $_;
6164     last INSCOPE;
6165 wakaba 1.123 } elsif ($node->[1] & TABLE_SCOPING_EL) {
6166 wakaba 1.79 !!!cp ('t255');
6167 wakaba 1.52 last INSCOPE;
6168     }
6169     } # INSCOPE
6170     unless (defined $i) {
6171 wakaba 1.79 !!!cp ('t256');
6172 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6173     text => $token->{tag_name}, token => $token);
6174 wakaba 1.52 ## Ignore the token
6175 wakaba 1.125 !!!nack ('t256.1');
6176 wakaba 1.52 !!!next-token;
6177 wakaba 1.126 next B;
6178 wakaba 1.52 }
6179    
6180     ## Clear back to table body context
6181 wakaba 1.123 while (not ($self->{open_elements}->[-1]->[1]
6182     & TABLE_ROWS_SCOPING_EL)) {
6183 wakaba 1.79 !!!cp ('t257');
6184 wakaba 1.83 ## ISSUE: Can this case be reached?
6185 wakaba 1.52 pop @{$self->{open_elements}};
6186     }
6187    
6188     pop @{$self->{open_elements}};
6189 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
6190 wakaba 1.125 !!!nack ('t257.1');
6191 wakaba 1.52 !!!next-token;
6192 wakaba 1.126 next B;
6193 wakaba 1.52 } elsif ({
6194     body => 1, caption => 1, col => 1, colgroup => 1,
6195     html => 1, td => 1, th => 1,
6196 wakaba 1.54 tr => 1, # $self->{insertion_mode} == IN_ROW_IM
6197     tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM
6198 wakaba 1.52 }->{$token->{tag_name}}) {
6199 wakaba 1.125 !!!cp ('t258');
6200 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6201     text => $token->{tag_name}, token => $token);
6202 wakaba 1.125 ## Ignore the token
6203     !!!nack ('t258.1');
6204     !!!next-token;
6205 wakaba 1.126 next B;
6206 wakaba 1.58 } else {
6207 wakaba 1.79 !!!cp ('t259');
6208 wakaba 1.153 !!!parse-error (type => 'in table:/',
6209     text => $token->{tag_name}, token => $token);
6210 wakaba 1.52
6211 wakaba 1.58 $insert = $insert_to_foster;
6212     #
6213     }
6214 wakaba 1.104 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6215 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6216 wakaba 1.104 @{$self->{open_elements}} == 1) { # redundant, maybe
6217 wakaba 1.113 !!!parse-error (type => 'in body:#eof', token => $token);
6218 wakaba 1.104 !!!cp ('t259.1');
6219 wakaba 1.105 #
6220 wakaba 1.104 } else {
6221     !!!cp ('t259.2');
6222 wakaba 1.105 #
6223 wakaba 1.104 }
6224    
6225     ## Stop parsing
6226     last B;
6227 wakaba 1.58 } else {
6228     die "$0: $token->{type}: Unknown token type";
6229     }
6230 wakaba 1.54 } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
6231 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
6232 wakaba 1.188 if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
6233 wakaba 1.52 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6234     unless (length $token->{data}) {
6235 wakaba 1.79 !!!cp ('t260');
6236 wakaba 1.52 !!!next-token;
6237 wakaba 1.126 next B;
6238 wakaba 1.52 }
6239     }
6240    
6241 wakaba 1.79 !!!cp ('t261');
6242 wakaba 1.52 #
6243 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
6244 wakaba 1.52 if ($token->{tag_name} eq 'col') {
6245 wakaba 1.79 !!!cp ('t262');
6246 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6247 wakaba 1.52 pop @{$self->{open_elements}};
6248 wakaba 1.125 !!!ack ('t262.1');
6249 wakaba 1.52 !!!next-token;
6250 wakaba 1.126 next B;
6251 wakaba 1.52 } else {
6252 wakaba 1.79 !!!cp ('t263');
6253 wakaba 1.52 #
6254     }
6255 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
6256 wakaba 1.52 if ($token->{tag_name} eq 'colgroup') {
6257 wakaba 1.123 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
6258 wakaba 1.79 !!!cp ('t264');
6259 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6260     text => 'colgroup', token => $token);
6261 wakaba 1.52 ## Ignore the token
6262     !!!next-token;
6263 wakaba 1.126 next B;
6264 wakaba 1.52 } else {
6265 wakaba 1.79 !!!cp ('t265');
6266 wakaba 1.52 pop @{$self->{open_elements}}; # colgroup
6267 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
6268 wakaba 1.52 !!!next-token;
6269 wakaba 1.126 next B;
6270 wakaba 1.52 }
6271     } elsif ($token->{tag_name} eq 'col') {
6272 wakaba 1.79 !!!cp ('t266');
6273 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6274     text => 'col', token => $token);
6275 wakaba 1.52 ## Ignore the token
6276     !!!next-token;
6277 wakaba 1.126 next B;
6278 wakaba 1.52 } else {
6279 wakaba 1.79 !!!cp ('t267');
6280 wakaba 1.52 #
6281     }
6282 wakaba 1.104 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6283 wakaba 1.123 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
6284 wakaba 1.104 @{$self->{open_elements}} == 1) { # redundant, maybe
6285     !!!cp ('t270.2');
6286     ## Stop parsing.
6287     last B;
6288     } else {
6289     ## NOTE: As if </colgroup>.
6290     !!!cp ('t270.1');
6291     pop @{$self->{open_elements}}; # colgroup
6292     $self->{insertion_mode} = IN_TABLE_IM;
6293     ## Reprocess.
6294 wakaba 1.126 next B;
6295 wakaba 1.104 }
6296     } else {
6297     die "$0: $token->{type}: Unknown token type";
6298     }
6299 wakaba 1.52
6300     ## As if </colgroup>
6301 wakaba 1.123 if ($self->{open_elements}->[-1]->[1] & HTML_EL) {
6302 wakaba 1.79 !!!cp ('t269');
6303 wakaba 1.104 ## TODO: Wrong error type?
6304 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6305     text => 'colgroup', token => $token);
6306 wakaba 1.52 ## Ignore the token
6307 wakaba 1.125 !!!nack ('t269.1');
6308 wakaba 1.52 !!!next-token;
6309 wakaba 1.126 next B;
6310 wakaba 1.52 } else {
6311 wakaba 1.79 !!!cp ('t270');
6312 wakaba 1.52 pop @{$self->{open_elements}}; # colgroup
6313 wakaba 1.54 $self->{insertion_mode} = IN_TABLE_IM;
6314 wakaba 1.125 !!!ack-later;
6315 wakaba 1.52 ## reprocess
6316 wakaba 1.126 next B;
6317 wakaba 1.52 }
6318 wakaba 1.101 } elsif ($self->{insertion_mode} & SELECT_IMS) {
6319 wakaba 1.58 if ($token->{type} == CHARACTER_TOKEN) {
6320 wakaba 1.79 !!!cp ('t271');
6321 wakaba 1.58 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
6322     !!!next-token;
6323 wakaba 1.126 next B;
6324 wakaba 1.58 } elsif ($token->{type} == START_TAG_TOKEN) {
6325 wakaba 1.123 if ($token->{tag_name} eq 'option') {
6326     if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6327     !!!cp ('t272');
6328     ## As if </option>
6329     pop @{$self->{open_elements}};
6330     } else {
6331     !!!cp ('t273');
6332     }
6333 wakaba 1.52
6334 wakaba 1.123 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6335 wakaba 1.125 !!!nack ('t273.1');
6336 wakaba 1.123 !!!next-token;
6337 wakaba 1.126 next B;
6338 wakaba 1.123 } elsif ($token->{tag_name} eq 'optgroup') {
6339     if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6340     !!!cp ('t274');
6341     ## As if </option>
6342     pop @{$self->{open_elements}};
6343     } else {
6344     !!!cp ('t275');
6345     }
6346 wakaba 1.52
6347 wakaba 1.123 if ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
6348     !!!cp ('t276');
6349     ## As if </optgroup>
6350     pop @{$self->{open_elements}};
6351     } else {
6352     !!!cp ('t277');
6353     }
6354 wakaba 1.52
6355 wakaba 1.123 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6356 wakaba 1.125 !!!nack ('t277.1');
6357 wakaba 1.123 !!!next-token;
6358 wakaba 1.126 next B;
6359 wakaba 1.146 } elsif ({
6360     select => 1, input => 1, textarea => 1,
6361     }->{$token->{tag_name}} or
6362 wakaba 1.101 ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
6363     {
6364     caption => 1, table => 1,
6365     tbody => 1, tfoot => 1, thead => 1,
6366     tr => 1, td => 1, th => 1,
6367     }->{$token->{tag_name}})) {
6368     ## TODO: The type below is not good - <select> is replaced by </select>
6369 wakaba 1.153 !!!parse-error (type => 'not closed', text => 'select',
6370     token => $token);
6371 wakaba 1.101 ## NOTE: As if the token were </select> (<select> case) or
6372     ## as if there were </select> (otherwise).
6373 wakaba 1.123 ## have an element in table scope
6374     my $i;
6375     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6376     my $node = $self->{open_elements}->[$_];
6377     if ($node->[1] & SELECT_EL) {
6378     !!!cp ('t278');
6379     $i = $_;
6380     last INSCOPE;
6381     } elsif ($node->[1] & TABLE_SCOPING_EL) {
6382     !!!cp ('t279');
6383     last INSCOPE;
6384     }
6385     } # INSCOPE
6386     unless (defined $i) {
6387     !!!cp ('t280');
6388 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6389     text => 'select', token => $token);
6390 wakaba 1.123 ## Ignore the token
6391 wakaba 1.125 !!!nack ('t280.1');
6392 wakaba 1.123 !!!next-token;
6393 wakaba 1.126 next B;
6394 wakaba 1.123 }
6395 wakaba 1.52
6396 wakaba 1.123 !!!cp ('t281');
6397     splice @{$self->{open_elements}}, $i;
6398 wakaba 1.52
6399 wakaba 1.123 $self->_reset_insertion_mode;
6400 wakaba 1.47
6401 wakaba 1.101 if ($token->{tag_name} eq 'select') {
6402 wakaba 1.125 !!!nack ('t281.2');
6403 wakaba 1.101 !!!next-token;
6404 wakaba 1.126 next B;
6405 wakaba 1.101 } else {
6406     !!!cp ('t281.1');
6407 wakaba 1.125 !!!ack-later;
6408 wakaba 1.101 ## Reprocess the token.
6409 wakaba 1.126 next B;
6410 wakaba 1.101 }
6411 wakaba 1.58 } else {
6412 wakaba 1.79 !!!cp ('t282');
6413 wakaba 1.153 !!!parse-error (type => 'in select',
6414     text => $token->{tag_name}, token => $token);
6415 wakaba 1.58 ## Ignore the token
6416 wakaba 1.125 !!!nack ('t282.1');
6417 wakaba 1.58 !!!next-token;
6418 wakaba 1.126 next B;
6419 wakaba 1.58 }
6420     } elsif ($token->{type} == END_TAG_TOKEN) {
6421 wakaba 1.123 if ($token->{tag_name} eq 'optgroup') {
6422     if ($self->{open_elements}->[-1]->[1] & OPTION_EL and
6423     $self->{open_elements}->[-2]->[1] & OPTGROUP_EL) {
6424     !!!cp ('t283');
6425     ## As if </option>
6426     splice @{$self->{open_elements}}, -2;
6427     } elsif ($self->{open_elements}->[-1]->[1] & OPTGROUP_EL) {
6428     !!!cp ('t284');
6429     pop @{$self->{open_elements}};
6430     } else {
6431     !!!cp ('t285');
6432 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6433     text => $token->{tag_name}, token => $token);
6434 wakaba 1.123 ## Ignore the token
6435     }
6436 wakaba 1.125 !!!nack ('t285.1');
6437 wakaba 1.123 !!!next-token;
6438 wakaba 1.126 next B;
6439 wakaba 1.123 } elsif ($token->{tag_name} eq 'option') {
6440     if ($self->{open_elements}->[-1]->[1] & OPTION_EL) {
6441     !!!cp ('t286');
6442     pop @{$self->{open_elements}};
6443     } else {
6444     !!!cp ('t287');
6445 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6446     text => $token->{tag_name}, token => $token);
6447 wakaba 1.123 ## Ignore the token
6448     }
6449 wakaba 1.125 !!!nack ('t287.1');
6450 wakaba 1.123 !!!next-token;
6451 wakaba 1.126 next B;
6452 wakaba 1.123 } elsif ($token->{tag_name} eq 'select') {
6453     ## have an element in table scope
6454     my $i;
6455     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6456     my $node = $self->{open_elements}->[$_];
6457     if ($node->[1] & SELECT_EL) {
6458     !!!cp ('t288');
6459     $i = $_;
6460     last INSCOPE;
6461     } elsif ($node->[1] & TABLE_SCOPING_EL) {
6462     !!!cp ('t289');
6463     last INSCOPE;
6464     }
6465     } # INSCOPE
6466     unless (defined $i) {
6467     !!!cp ('t290');
6468 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6469     text => $token->{tag_name}, token => $token);
6470 wakaba 1.123 ## Ignore the token
6471 wakaba 1.125 !!!nack ('t290.1');
6472 wakaba 1.123 !!!next-token;
6473 wakaba 1.126 next B;
6474 wakaba 1.123 }
6475 wakaba 1.52
6476 wakaba 1.123 !!!cp ('t291');
6477     splice @{$self->{open_elements}}, $i;
6478 wakaba 1.52
6479 wakaba 1.123 $self->_reset_insertion_mode;
6480 wakaba 1.52
6481 wakaba 1.125 !!!nack ('t291.1');
6482 wakaba 1.123 !!!next-token;
6483 wakaba 1.126 next B;
6484 wakaba 1.101 } elsif ($self->{insertion_mode} == IN_SELECT_IN_TABLE_IM and
6485     {
6486     caption => 1, table => 1, tbody => 1,
6487     tfoot => 1, thead => 1, tr => 1, td => 1, th => 1,
6488     }->{$token->{tag_name}}) {
6489 wakaba 1.83 ## TODO: The following is wrong?
6490 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6491     text => $token->{tag_name}, token => $token);
6492 wakaba 1.52
6493 wakaba 1.123 ## have an element in table scope
6494     my $i;
6495     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6496     my $node = $self->{open_elements}->[$_];
6497     if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
6498     !!!cp ('t292');
6499     $i = $_;
6500     last INSCOPE;
6501     } elsif ($node->[1] & TABLE_SCOPING_EL) {
6502     !!!cp ('t293');
6503     last INSCOPE;
6504     }
6505     } # INSCOPE
6506     unless (defined $i) {
6507     !!!cp ('t294');
6508     ## Ignore the token
6509 wakaba 1.125 !!!nack ('t294.1');
6510 wakaba 1.123 !!!next-token;
6511 wakaba 1.126 next B;
6512 wakaba 1.123 }
6513 wakaba 1.52
6514 wakaba 1.123 ## As if </select>
6515     ## have an element in table scope
6516     undef $i;
6517     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
6518     my $node = $self->{open_elements}->[$_];
6519     if ($node->[1] & SELECT_EL) {
6520     !!!cp ('t295');
6521     $i = $_;
6522     last INSCOPE;
6523     } elsif ($node->[1] & TABLE_SCOPING_EL) {
6524 wakaba 1.83 ## ISSUE: Can this state be reached?
6525 wakaba 1.123 !!!cp ('t296');
6526     last INSCOPE;
6527     }
6528     } # INSCOPE
6529     unless (defined $i) {
6530     !!!cp ('t297');
6531 wakaba 1.83 ## TODO: Is the following error type correct?
6532 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6533     text => 'select', token => $token);
6534 wakaba 1.123 ## Ignore the </select> token
6535 wakaba 1.125 !!!nack ('t297.1');
6536 wakaba 1.123 !!!next-token; ## TODO: ok?
6537 wakaba 1.126 next B;
6538 wakaba 1.123 }
6539 wakaba 1.52
6540 wakaba 1.123 !!!cp ('t298');
6541     splice @{$self->{open_elements}}, $i;
6542 wakaba 1.52
6543 wakaba 1.123 $self->_reset_insertion_mode;
6544 wakaba 1.52
6545 wakaba 1.125 !!!ack-later;
6546 wakaba 1.123 ## reprocess
6547 wakaba 1.126 next B;
6548 wakaba 1.58 } else {
6549 wakaba 1.79 !!!cp ('t299');
6550 wakaba 1.153 !!!parse-error (type => 'in select:/',
6551     text => $token->{tag_name}, token => $token);
6552 wakaba 1.52 ## Ignore the token
6553 wakaba 1.125 !!!nack ('t299.3');
6554 wakaba 1.52 !!!next-token;
6555 wakaba 1.126 next B;
6556 wakaba 1.58 }
6557 wakaba 1.104 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6558 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6559 wakaba 1.104 @{$self->{open_elements}} == 1) { # redundant, maybe
6560     !!!cp ('t299.1');
6561 wakaba 1.113 !!!parse-error (type => 'in body:#eof', token => $token);
6562 wakaba 1.104 } else {
6563     !!!cp ('t299.2');
6564     }
6565    
6566     ## Stop parsing.
6567     last B;
6568 wakaba 1.58 } else {
6569     die "$0: $token->{type}: Unknown token type";
6570     }
6571 wakaba 1.56 } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
6572 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
6573 wakaba 1.188 if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
6574 wakaba 1.52 my $data = $1;
6575     ## As if in body
6576     $reconstruct_active_formatting_elements->($insert_to_current);
6577    
6578     $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6579    
6580     unless (length $token->{data}) {
6581 wakaba 1.79 !!!cp ('t300');
6582 wakaba 1.52 !!!next-token;
6583 wakaba 1.126 next B;
6584 wakaba 1.52 }
6585     }
6586    
6587 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6588 wakaba 1.79 !!!cp ('t301');
6589 wakaba 1.153 !!!parse-error (type => 'after html:#text', token => $token);
6590 wakaba 1.188 #
6591 wakaba 1.79 } else {
6592     !!!cp ('t302');
6593 wakaba 1.188 ## "after body" insertion mode
6594     !!!parse-error (type => 'after body:#text', token => $token);
6595     #
6596 wakaba 1.52 }
6597    
6598 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
6599 wakaba 1.52 ## reprocess
6600 wakaba 1.126 next B;
6601 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
6602 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6603 wakaba 1.79 !!!cp ('t303');
6604 wakaba 1.153 !!!parse-error (type => 'after html',
6605     text => $token->{tag_name}, token => $token);
6606 wakaba 1.188 #
6607 wakaba 1.79 } else {
6608     !!!cp ('t304');
6609 wakaba 1.188 ## "after body" insertion mode
6610     !!!parse-error (type => 'after body',
6611     text => $token->{tag_name}, token => $token);
6612     #
6613 wakaba 1.52 }
6614    
6615 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
6616 wakaba 1.125 !!!ack-later;
6617 wakaba 1.52 ## reprocess
6618 wakaba 1.126 next B;
6619 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
6620 wakaba 1.54 if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6621 wakaba 1.79 !!!cp ('t305');
6622 wakaba 1.153 !!!parse-error (type => 'after html:/',
6623     text => $token->{tag_name}, token => $token);
6624 wakaba 1.52
6625 wakaba 1.188 $self->{insertion_mode} = IN_BODY_IM;
6626     ## Reprocess.
6627     next B;
6628 wakaba 1.79 } else {
6629     !!!cp ('t306');
6630 wakaba 1.52 }
6631    
6632     ## "after body" insertion mode
6633     if ($token->{tag_name} eq 'html') {
6634     if (defined $self->{inner_html_node}) {
6635 wakaba 1.79 !!!cp ('t307');
6636 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6637     text => 'html', token => $token);
6638 wakaba 1.52 ## Ignore the token
6639     !!!next-token;
6640 wakaba 1.126 next B;
6641 wakaba 1.52 } else {
6642 wakaba 1.79 !!!cp ('t308');
6643 wakaba 1.54 $self->{insertion_mode} = AFTER_HTML_BODY_IM;
6644 wakaba 1.52 !!!next-token;
6645 wakaba 1.126 next B;
6646 wakaba 1.52 }
6647     } else {
6648 wakaba 1.79 !!!cp ('t309');
6649 wakaba 1.153 !!!parse-error (type => 'after body:/',
6650     text => $token->{tag_name}, token => $token);
6651 wakaba 1.52
6652 wakaba 1.54 $self->{insertion_mode} = IN_BODY_IM;
6653 wakaba 1.52 ## reprocess
6654 wakaba 1.126 next B;
6655 wakaba 1.52 }
6656 wakaba 1.104 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6657     !!!cp ('t309.2');
6658     ## Stop parsing
6659     last B;
6660 wakaba 1.52 } else {
6661     die "$0: $token->{type}: Unknown token type";
6662     }
6663 wakaba 1.56 } elsif ($self->{insertion_mode} & FRAME_IMS) {
6664 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
6665 wakaba 1.188 if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
6666 wakaba 1.52 $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6667    
6668     unless (length $token->{data}) {
6669 wakaba 1.79 !!!cp ('t310');
6670 wakaba 1.52 !!!next-token;
6671 wakaba 1.126 next B;
6672 wakaba 1.52 }
6673     }
6674    
6675 wakaba 1.188 if ($token->{data} =~ s/^[^\x09\x0A\x0C\x20]+//) {
6676 wakaba 1.54 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6677 wakaba 1.79 !!!cp ('t311');
6678 wakaba 1.153 !!!parse-error (type => 'in frameset:#text', token => $token);
6679 wakaba 1.54 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6680 wakaba 1.79 !!!cp ('t312');
6681 wakaba 1.153 !!!parse-error (type => 'after frameset:#text', token => $token);
6682 wakaba 1.158 } else { # "after after frameset"
6683 wakaba 1.79 !!!cp ('t313');
6684 wakaba 1.153 !!!parse-error (type => 'after html:#text', token => $token);
6685 wakaba 1.52 }
6686    
6687     ## Ignore the token.
6688     if (length $token->{data}) {
6689 wakaba 1.79 !!!cp ('t314');
6690 wakaba 1.52 ## reprocess the rest of characters
6691     } else {
6692 wakaba 1.79 !!!cp ('t315');
6693 wakaba 1.52 !!!next-token;
6694     }
6695 wakaba 1.126 next B;
6696 wakaba 1.52 }
6697    
6698     die qq[$0: Character "$token->{data}"];
6699 wakaba 1.55 } elsif ($token->{type} == START_TAG_TOKEN) {
6700 wakaba 1.52 if ($token->{tag_name} eq 'frameset' and
6701 wakaba 1.54 $self->{insertion_mode} == IN_FRAMESET_IM) {
6702 wakaba 1.79 !!!cp ('t318');
6703 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6704 wakaba 1.125 !!!nack ('t318.1');
6705 wakaba 1.52 !!!next-token;
6706 wakaba 1.126 next B;
6707 wakaba 1.52 } elsif ($token->{tag_name} eq 'frame' and
6708 wakaba 1.54 $self->{insertion_mode} == IN_FRAMESET_IM) {
6709 wakaba 1.79 !!!cp ('t319');
6710 wakaba 1.116 !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
6711 wakaba 1.52 pop @{$self->{open_elements}};
6712 wakaba 1.125 !!!ack ('t319.1');
6713 wakaba 1.52 !!!next-token;
6714 wakaba 1.126 next B;
6715 wakaba 1.52 } elsif ($token->{tag_name} eq 'noframes') {
6716 wakaba 1.79 !!!cp ('t320');
6717 wakaba 1.148 ## NOTE: As if in head.
6718 wakaba 1.96 $parse_rcdata->(CDATA_CONTENT_MODEL);
6719 wakaba 1.126 next B;
6720 wakaba 1.158
6721     ## NOTE: |<!DOCTYPE HTML><frameset></frameset></html><noframes></noframes>|
6722     ## has no parse error.
6723 wakaba 1.52 } else {
6724 wakaba 1.54 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6725 wakaba 1.79 !!!cp ('t321');
6726 wakaba 1.153 !!!parse-error (type => 'in frameset',
6727     text => $token->{tag_name}, token => $token);
6728 wakaba 1.158 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6729 wakaba 1.79 !!!cp ('t322');
6730 wakaba 1.153 !!!parse-error (type => 'after frameset',
6731     text => $token->{tag_name}, token => $token);
6732 wakaba 1.158 } else { # "after after frameset"
6733     !!!cp ('t322.2');
6734     !!!parse-error (type => 'after after frameset',
6735     text => $token->{tag_name}, token => $token);
6736 wakaba 1.52 }
6737     ## Ignore the token
6738 wakaba 1.125 !!!nack ('t322.1');
6739 wakaba 1.52 !!!next-token;
6740 wakaba 1.126 next B;
6741 wakaba 1.52 }
6742 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
6743 wakaba 1.52 if ($token->{tag_name} eq 'frameset' and
6744 wakaba 1.54 $self->{insertion_mode} == IN_FRAMESET_IM) {
6745 wakaba 1.123 if ($self->{open_elements}->[-1]->[1] & HTML_EL and
6746 wakaba 1.52 @{$self->{open_elements}} == 1) {
6747 wakaba 1.79 !!!cp ('t325');
6748 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
6749     text => $token->{tag_name}, token => $token);
6750 wakaba 1.52 ## Ignore the token
6751     !!!next-token;
6752     } else {
6753 wakaba 1.79 !!!cp ('t326');
6754 wakaba 1.52 pop @{$self->{open_elements}};
6755     !!!next-token;
6756     }
6757 wakaba 1.47
6758 wakaba 1.52 if (not defined $self->{inner_html_node} and
6759 wakaba 1.123 not ($self->{open_elements}->[-1]->[1] & FRAMESET_EL)) {
6760 wakaba 1.79 !!!cp ('t327');
6761 wakaba 1.54 $self->{insertion_mode} = AFTER_FRAMESET_IM;
6762 wakaba 1.79 } else {
6763     !!!cp ('t328');
6764 wakaba 1.52 }
6765 wakaba 1.126 next B;
6766 wakaba 1.52 } elsif ($token->{tag_name} eq 'html' and
6767 wakaba 1.54 $self->{insertion_mode} == AFTER_FRAMESET_IM) {
6768 wakaba 1.79 !!!cp ('t329');
6769 wakaba 1.54 $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM;
6770 wakaba 1.52 !!!next-token;
6771 wakaba 1.126 next B;
6772 wakaba 1.52 } else {
6773 wakaba 1.54 if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6774 wakaba 1.79 !!!cp ('t330');
6775 wakaba 1.153 !!!parse-error (type => 'in frameset:/',
6776     text => $token->{tag_name}, token => $token);
6777 wakaba 1.158 } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) {
6778     !!!cp ('t330.1');
6779     !!!parse-error (type => 'after frameset:/',
6780     text => $token->{tag_name}, token => $token);
6781     } else { # "after after frameset"
6782 wakaba 1.79 !!!cp ('t331');
6783 wakaba 1.158 !!!parse-error (type => 'after after frameset:/',
6784 wakaba 1.153 text => $token->{tag_name}, token => $token);
6785 wakaba 1.52 }
6786     ## Ignore the token
6787     !!!next-token;
6788 wakaba 1.126 next B;
6789 wakaba 1.52 }
6790 wakaba 1.104 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
6791 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & HTML_EL and
6792 wakaba 1.104 @{$self->{open_elements}} == 1) { # redundant, maybe
6793     !!!cp ('t331.1');
6794 wakaba 1.113 !!!parse-error (type => 'in body:#eof', token => $token);
6795 wakaba 1.104 } else {
6796     !!!cp ('t331.2');
6797     }
6798    
6799     ## Stop parsing
6800     last B;
6801 wakaba 1.52 } else {
6802     die "$0: $token->{type}: Unknown token type";
6803     }
6804 wakaba 1.47
6805 wakaba 1.52 ## ISSUE: An issue in spec here
6806     } else {
6807     die "$0: $self->{insertion_mode}: Unknown insertion mode";
6808     }
6809 wakaba 1.47
6810 wakaba 1.52 ## "in body" insertion mode
6811 wakaba 1.55 if ($token->{type} == START_TAG_TOKEN) {
6812 wakaba 1.52 if ($token->{tag_name} eq 'script') {
6813 wakaba 1.79 !!!cp ('t332');
6814 wakaba 1.52 ## NOTE: This is an "as if in head" code clone
6815 wakaba 1.100 $script_start_tag->();
6816 wakaba 1.126 next B;
6817 wakaba 1.52 } elsif ($token->{tag_name} eq 'style') {
6818 wakaba 1.79 !!!cp ('t333');
6819 wakaba 1.52 ## NOTE: This is an "as if in head" code clone
6820 wakaba 1.96 $parse_rcdata->(CDATA_CONTENT_MODEL);
6821 wakaba 1.126 next B;
6822 wakaba 1.52 } elsif ({
6823 wakaba 1.194 base => 1, command => 1, eventsource => 1, link => 1,
6824 wakaba 1.52 }->{$token->{tag_name}}) {
6825 wakaba 1.79 !!!cp ('t334');
6826 wakaba 1.52 ## NOTE: This is an "as if in head" code clone, only "-t" differs
6827 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6828 wakaba 1.194 pop @{$self->{open_elements}};
6829 wakaba 1.125 !!!ack ('t334.1');
6830 wakaba 1.52 !!!next-token;
6831 wakaba 1.126 next B;
6832 wakaba 1.52 } elsif ($token->{tag_name} eq 'meta') {
6833     ## NOTE: This is an "as if in head" code clone, only "-t" differs
6834 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6835 wakaba 1.194 my $meta_el = pop @{$self->{open_elements}};
6836 wakaba 1.46
6837 wakaba 1.52 unless ($self->{confident}) {
6838 wakaba 1.134 if ($token->{attributes}->{charset}) {
6839 wakaba 1.79 !!!cp ('t335');
6840 wakaba 1.134 ## NOTE: Whether the encoding is supported or not is handled
6841     ## in the {change_encoding} callback.
6842 wakaba 1.63 $self->{change_encoding}
6843 wakaba 1.114 ->($self, $token->{attributes}->{charset}->{value}, $token);
6844 wakaba 1.66
6845     $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6846     ->set_user_data (manakai_has_reference =>
6847     $token->{attributes}->{charset}
6848     ->{has_reference});
6849 wakaba 1.63 } elsif ($token->{attributes}->{content}) {
6850     if ($token->{attributes}->{content}->{value}
6851 wakaba 1.144 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
6852 wakaba 1.189 [\x09\x0A\x0C\x0D\x20]*=
6853     [\x09\x0A\x0C\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
6854     ([^"'\x09\x0A\x0C\x0D\x20][^\x09\x0A\x0C\x0D\x20\x3B]*))
6855     /x) {
6856 wakaba 1.79 !!!cp ('t336');
6857 wakaba 1.134 ## NOTE: Whether the encoding is supported or not is handled
6858     ## in the {change_encoding} callback.
6859 wakaba 1.63 $self->{change_encoding}
6860 wakaba 1.114 ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
6861 wakaba 1.68 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6862     ->set_user_data (manakai_has_reference =>
6863     $token->{attributes}->{content}
6864     ->{has_reference});
6865 wakaba 1.63 }
6866 wakaba 1.52 }
6867 wakaba 1.66 } else {
6868     if ($token->{attributes}->{charset}) {
6869 wakaba 1.79 !!!cp ('t337');
6870 wakaba 1.66 $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
6871     ->set_user_data (manakai_has_reference =>
6872     $token->{attributes}->{charset}
6873     ->{has_reference});
6874     }
6875 wakaba 1.68 if ($token->{attributes}->{content}) {
6876 wakaba 1.79 !!!cp ('t338');
6877 wakaba 1.68 $meta_el->[0]->get_attribute_node_ns (undef, 'content')
6878     ->set_user_data (manakai_has_reference =>
6879     $token->{attributes}->{content}
6880     ->{has_reference});
6881     }
6882 wakaba 1.52 }
6883 wakaba 1.1
6884 wakaba 1.125 !!!ack ('t338.1');
6885 wakaba 1.52 !!!next-token;
6886 wakaba 1.126 next B;
6887 wakaba 1.52 } elsif ($token->{tag_name} eq 'title') {
6888 wakaba 1.79 !!!cp ('t341');
6889 wakaba 1.52 ## NOTE: This is an "as if in head" code clone
6890 wakaba 1.96 $parse_rcdata->(RCDATA_CONTENT_MODEL);
6891 wakaba 1.126 next B;
6892 wakaba 1.52 } elsif ($token->{tag_name} eq 'body') {
6893 wakaba 1.153 !!!parse-error (type => 'in body', text => 'body', token => $token);
6894 wakaba 1.46
6895 wakaba 1.52 if (@{$self->{open_elements}} == 1 or
6896 wakaba 1.123 not ($self->{open_elements}->[1]->[1] & BODY_EL)) {
6897 wakaba 1.79 !!!cp ('t342');
6898 wakaba 1.52 ## Ignore the token
6899     } else {
6900     my $body_el = $self->{open_elements}->[1]->[0];
6901     for my $attr_name (keys %{$token->{attributes}}) {
6902     unless ($body_el->has_attribute_ns (undef, $attr_name)) {
6903 wakaba 1.79 !!!cp ('t343');
6904 wakaba 1.52 $body_el->set_attribute_ns
6905     (undef, [undef, $attr_name],
6906     $token->{attributes}->{$attr_name}->{value});
6907     }
6908     }
6909     }
6910 wakaba 1.125 !!!nack ('t343.1');
6911 wakaba 1.52 !!!next-token;
6912 wakaba 1.126 next B;
6913 wakaba 1.52 } elsif ({
6914 wakaba 1.195 ## NOTE: Start tags for non-phrasing flow content elements
6915    
6916     ## NOTE: The normal one
6917     address => 1, article => 1, aside => 1, blockquote => 1,
6918     center => 1, datagrid => 1, details => 1, dialog => 1,
6919     dir => 1, div => 1, dl => 1, fieldset => 1, figure => 1,
6920     footer => 1, h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1,
6921     h6 => 1, header => 1, menu => 1, nav => 1, ol => 1, p => 1,
6922     section => 1, ul => 1,
6923     ## NOTE: As normal, but drops leading newline
6924 wakaba 1.97 pre => 1, listing => 1,
6925 wakaba 1.195 ## NOTE: As normal, but interacts with the form element pointer
6926 wakaba 1.109 form => 1,
6927 wakaba 1.195
6928 wakaba 1.109 table => 1,
6929     hr => 1,
6930 wakaba 1.52 }->{$token->{tag_name}}) {
6931 wakaba 1.109 if ($token->{tag_name} eq 'form' and defined $self->{form_element}) {
6932     !!!cp ('t350');
6933 wakaba 1.113 !!!parse-error (type => 'in form:form', token => $token);
6934 wakaba 1.109 ## Ignore the token
6935 wakaba 1.125 !!!nack ('t350.1');
6936 wakaba 1.109 !!!next-token;
6937 wakaba 1.126 next B;
6938 wakaba 1.109 }
6939    
6940 wakaba 1.52 ## has a p element in scope
6941     INSCOPE: for (reverse @{$self->{open_elements}}) {
6942 wakaba 1.123 if ($_->[1] & P_EL) {
6943 wakaba 1.79 !!!cp ('t344');
6944 wakaba 1.125 !!!back-token; # <form>
6945 wakaba 1.114 $token = {type => END_TAG_TOKEN, tag_name => 'p',
6946     line => $token->{line}, column => $token->{column}};
6947 wakaba 1.126 next B;
6948 wakaba 1.123 } elsif ($_->[1] & SCOPING_EL) {
6949 wakaba 1.79 !!!cp ('t345');
6950 wakaba 1.52 last INSCOPE;
6951     }
6952     } # INSCOPE
6953    
6954 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6955 wakaba 1.97 if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') {
6956 wakaba 1.125 !!!nack ('t346.1');
6957 wakaba 1.52 !!!next-token;
6958 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
6959 wakaba 1.52 $token->{data} =~ s/^\x0A//;
6960     unless (length $token->{data}) {
6961 wakaba 1.79 !!!cp ('t346');
6962 wakaba 1.1 !!!next-token;
6963 wakaba 1.79 } else {
6964     !!!cp ('t349');
6965 wakaba 1.52 }
6966 wakaba 1.79 } else {
6967     !!!cp ('t348');
6968 wakaba 1.52 }
6969 wakaba 1.109 } elsif ($token->{tag_name} eq 'form') {
6970     !!!cp ('t347.1');
6971     $self->{form_element} = $self->{open_elements}->[-1]->[0];
6972    
6973 wakaba 1.125 !!!nack ('t347.2');
6974 wakaba 1.109 !!!next-token;
6975     } elsif ($token->{tag_name} eq 'table') {
6976     !!!cp ('t382');
6977     push @{$open_tables}, [$self->{open_elements}->[-1]->[0]];
6978    
6979     $self->{insertion_mode} = IN_TABLE_IM;
6980    
6981 wakaba 1.125 !!!nack ('t382.1');
6982 wakaba 1.109 !!!next-token;
6983     } elsif ($token->{tag_name} eq 'hr') {
6984     !!!cp ('t386');
6985     pop @{$self->{open_elements}};
6986    
6987 wakaba 1.125 !!!nack ('t386.1');
6988 wakaba 1.109 !!!next-token;
6989 wakaba 1.52 } else {
6990 wakaba 1.125 !!!nack ('t347.1');
6991 wakaba 1.52 !!!next-token;
6992     }
6993 wakaba 1.126 next B;
6994 wakaba 1.196 } elsif ($token->{tag_name} eq 'li') {
6995     ## NOTE: As normal, but imply </li> when there's another <li> ...
6996 wakaba 1.193
6997     ## NOTE: Special, Scope (<li><foo><li> == <li><foo><li/></foo></li>)
6998     ## Interpreted as <li><foo/></li><li/> (non-conforming)
6999     ## blockquote (O9.27), center (O), dd (Fx3, O, S3.1.2, IE7),
7000     ## dt (Fx, O, S, IE), dl (O), fieldset (O, S, IE), form (Fx, O, S),
7001     ## hn (O), pre (O), applet (O, S), button (O, S), marquee (Fx, O, S),
7002     ## object (Fx)
7003     ## Generate non-tree (non-conforming)
7004     ## basefont (IE7 (where basefont is non-void)), center (IE),
7005     ## form (IE), hn (IE)
7006     ## address, div, p (<li><foo><li> == <li><foo/></li><li/>)
7007     ## Interpreted as <li><foo><li/></foo></li> (non-conforming)
7008     ## div (Fx, S)
7009 wakaba 1.196
7010     my $non_optional;
7011 wakaba 1.52 my $i = -1;
7012 wakaba 1.196
7013     ## 1.
7014     for my $node (reverse @{$self->{open_elements}}) {
7015     if ($node->[1] & LI_EL) {
7016     ## 2. (a) As if </li>
7017     {
7018     ## If no </li> - not applied
7019     #
7020    
7021     ## Otherwise
7022    
7023     ## 1. generate implied end tags, except for </li>
7024     #
7025    
7026     ## 2. If current node != "li", parse error
7027     if ($non_optional) {
7028     !!!parse-error (type => 'not closed',
7029     text => $non_optional->[0]->manakai_local_name,
7030     token => $token);
7031     !!!cp ('t355');
7032     } else {
7033     !!!cp ('t356');
7034     }
7035    
7036     ## 3. Pop
7037     splice @{$self->{open_elements}}, $i;
7038 wakaba 1.52 }
7039 wakaba 1.196
7040     last; ## 2. (b) goto 5.
7041     } elsif (
7042     ## NOTE: not "formatting" and not "phrasing"
7043     ($node->[1] & SPECIAL_EL or
7044     $node->[1] & SCOPING_EL) and
7045     ## NOTE: "li", "dt", and "dd" are in |SPECIAL_EL|.
7046    
7047     (not $node->[1] & ADDRESS_EL) &
7048     (not $node->[1] & DIV_EL) &
7049     (not $node->[1] & P_EL)) {
7050     ## 3.
7051 wakaba 1.79 !!!cp ('t357');
7052 wakaba 1.196 last; ## goto 5.
7053     } elsif ($node->[1] & END_TAG_OPTIONAL_EL) {
7054 wakaba 1.79 !!!cp ('t358');
7055 wakaba 1.196 #
7056     } else {
7057     !!!cp ('t359');
7058     $non_optional ||= $node;
7059     #
7060 wakaba 1.52 }
7061 wakaba 1.196 ## 4.
7062     ## goto 2.
7063 wakaba 1.52 $i--;
7064 wakaba 1.196 }
7065    
7066     ## 5. (a) has a |p| element in scope
7067     ## ISSUE: Is this step really necessary?
7068     INSCOPE: for (reverse @{$self->{open_elements}}) {
7069     if ($_->[1] & P_EL) {
7070     !!!cp ('t353');
7071     !!!back-token; # <x>
7072     $token = {type => END_TAG_TOKEN, tag_name => 'p',
7073     line => $token->{line}, column => $token->{column}};
7074     next B;
7075     } elsif ($_->[1] & SCOPING_EL) {
7076     !!!cp ('t354');
7077     last INSCOPE;
7078     }
7079     } # INSCOPE
7080    
7081     ## 5. (b) insert
7082 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7083 wakaba 1.125 !!!nack ('t359.1');
7084 wakaba 1.52 !!!next-token;
7085 wakaba 1.126 next B;
7086 wakaba 1.196 } elsif ($token->{tag_name} eq 'dt' or
7087     $token->{tag_name} eq 'dd') {
7088     ## NOTE: As normal, but imply </dt> or </dd> when ...
7089    
7090     my $non_optional;
7091     my $i = -1;
7092    
7093     ## 1.
7094     for my $node (reverse @{$self->{open_elements}}) {
7095     if ($node->[1] & DT_EL or $node->[1] & DD_EL) {
7096     ## 2. (a) As if </dt> or </dd>
7097     {
7098     ## If no </dt> or </dd> - not applied
7099     #
7100    
7101     ## Otherwise
7102    
7103     ## 1. generate implied end tags, except for </dt> or </dd>
7104     #
7105    
7106     ## 2. If current node != "dt"|"dd", parse error
7107     if ($non_optional) {
7108     !!!parse-error (type => 'not closed',
7109     text => $non_optional->[0]->manakai_local_name,
7110     token => $token);
7111     !!!cp ('t355.1');
7112     } else {
7113     !!!cp ('t356.1');
7114     }
7115    
7116     ## 3. Pop
7117     splice @{$self->{open_elements}}, $i;
7118     }
7119    
7120     last; ## 2. (b) goto 5.
7121     } elsif (
7122     ## NOTE: not "formatting" and not "phrasing"
7123     ($node->[1] & SPECIAL_EL or
7124     $node->[1] & SCOPING_EL) and
7125     ## NOTE: "li", "dt", and "dd" are in |SPECIAL_EL|.
7126    
7127     (not $node->[1] & ADDRESS_EL) &
7128     (not $node->[1] & DIV_EL) &
7129     (not $node->[1] & P_EL)) {
7130     ## 3.
7131     !!!cp ('t357.1');
7132     last; ## goto 5.
7133     } elsif ($node->[1] & END_TAG_OPTIONAL_EL) {
7134     !!!cp ('t358.1');
7135     #
7136     } else {
7137     !!!cp ('t359.1');
7138     $non_optional ||= $node;
7139     #
7140     }
7141     ## 4.
7142     ## goto 2.
7143     $i--;
7144     }
7145    
7146     ## 5. (a) has a |p| element in scope
7147     ## ISSUE: Is this step really necessary?
7148     INSCOPE: for (reverse @{$self->{open_elements}}) {
7149     if ($_->[1] & P_EL) {
7150     !!!cp ('t353.1');
7151     !!!back-token; # <x>
7152     $token = {type => END_TAG_TOKEN, tag_name => 'p',
7153     line => $token->{line}, column => $token->{column}};
7154     next B;
7155     } elsif ($_->[1] & SCOPING_EL) {
7156     !!!cp ('t354.1');
7157     last INSCOPE;
7158     }
7159     } # INSCOPE
7160    
7161     ## 5. (b) insert
7162     !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7163     !!!nack ('t359.2');
7164     !!!next-token;
7165     next B;
7166 wakaba 1.52 } elsif ($token->{tag_name} eq 'plaintext') {
7167 wakaba 1.195 ## NOTE: As normal, but effectively ends parsing
7168    
7169 wakaba 1.52 ## has a p element in scope
7170     INSCOPE: for (reverse @{$self->{open_elements}}) {
7171 wakaba 1.123 if ($_->[1] & P_EL) {
7172 wakaba 1.79 !!!cp ('t367');
7173 wakaba 1.125 !!!back-token; # <plaintext>
7174 wakaba 1.114 $token = {type => END_TAG_TOKEN, tag_name => 'p',
7175     line => $token->{line}, column => $token->{column}};
7176 wakaba 1.126 next B;
7177 wakaba 1.123 } elsif ($_->[1] & SCOPING_EL) {
7178 wakaba 1.79 !!!cp ('t368');
7179 wakaba 1.52 last INSCOPE;
7180 wakaba 1.46 }
7181 wakaba 1.52 } # INSCOPE
7182    
7183 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7184 wakaba 1.52
7185     $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
7186    
7187 wakaba 1.125 !!!nack ('t368.1');
7188 wakaba 1.52 !!!next-token;
7189 wakaba 1.126 next B;
7190 wakaba 1.52 } elsif ($token->{tag_name} eq 'a') {
7191     AFE: for my $i (reverse 0..$#$active_formatting_elements) {
7192     my $node = $active_formatting_elements->[$i];
7193 wakaba 1.123 if ($node->[1] & A_EL) {
7194 wakaba 1.79 !!!cp ('t371');
7195 wakaba 1.113 !!!parse-error (type => 'in a:a', token => $token);
7196 wakaba 1.52
7197 wakaba 1.125 !!!back-token; # <a>
7198 wakaba 1.114 $token = {type => END_TAG_TOKEN, tag_name => 'a',
7199     line => $token->{line}, column => $token->{column}};
7200 wakaba 1.113 $formatting_end_tag->($token);
7201 wakaba 1.52
7202     AFE2: for (reverse 0..$#$active_formatting_elements) {
7203     if ($active_formatting_elements->[$_]->[0] eq $node->[0]) {
7204 wakaba 1.79 !!!cp ('t372');
7205 wakaba 1.52 splice @$active_formatting_elements, $_, 1;
7206     last AFE2;
7207 wakaba 1.1 }
7208 wakaba 1.52 } # AFE2
7209     OE: for (reverse 0..$#{$self->{open_elements}}) {
7210     if ($self->{open_elements}->[$_]->[0] eq $node->[0]) {
7211 wakaba 1.79 !!!cp ('t373');
7212 wakaba 1.52 splice @{$self->{open_elements}}, $_, 1;
7213     last OE;
7214 wakaba 1.1 }
7215 wakaba 1.52 } # OE
7216     last AFE;
7217     } elsif ($node->[0] eq '#marker') {
7218 wakaba 1.79 !!!cp ('t374');
7219 wakaba 1.52 last AFE;
7220     }
7221     } # AFE
7222    
7223     $reconstruct_active_formatting_elements->($insert_to_current);
7224 wakaba 1.1
7225 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7226 wakaba 1.52 push @$active_formatting_elements, $self->{open_elements}->[-1];
7227 wakaba 1.1
7228 wakaba 1.125 !!!nack ('t374.1');
7229 wakaba 1.52 !!!next-token;
7230 wakaba 1.126 next B;
7231 wakaba 1.52 } elsif ($token->{tag_name} eq 'nobr') {
7232     $reconstruct_active_formatting_elements->($insert_to_current);
7233 wakaba 1.1
7234 wakaba 1.52 ## has a |nobr| element in scope
7235     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7236     my $node = $self->{open_elements}->[$_];
7237 wakaba 1.123 if ($node->[1] & NOBR_EL) {
7238 wakaba 1.79 !!!cp ('t376');
7239 wakaba 1.113 !!!parse-error (type => 'in nobr:nobr', token => $token);
7240 wakaba 1.125 !!!back-token; # <nobr>
7241 wakaba 1.114 $token = {type => END_TAG_TOKEN, tag_name => 'nobr',
7242     line => $token->{line}, column => $token->{column}};
7243 wakaba 1.126 next B;
7244 wakaba 1.123 } elsif ($node->[1] & SCOPING_EL) {
7245 wakaba 1.79 !!!cp ('t377');
7246 wakaba 1.52 last INSCOPE;
7247     }
7248     } # INSCOPE
7249    
7250 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7251 wakaba 1.52 push @$active_formatting_elements, $self->{open_elements}->[-1];
7252    
7253 wakaba 1.125 !!!nack ('t377.1');
7254 wakaba 1.52 !!!next-token;
7255 wakaba 1.126 next B;
7256 wakaba 1.52 } elsif ($token->{tag_name} eq 'button') {
7257     ## has a button element in scope
7258     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7259     my $node = $self->{open_elements}->[$_];
7260 wakaba 1.123 if ($node->[1] & BUTTON_EL) {
7261 wakaba 1.79 !!!cp ('t378');
7262 wakaba 1.113 !!!parse-error (type => 'in button:button', token => $token);
7263 wakaba 1.125 !!!back-token; # <button>
7264 wakaba 1.114 $token = {type => END_TAG_TOKEN, tag_name => 'button',
7265     line => $token->{line}, column => $token->{column}};
7266 wakaba 1.126 next B;
7267 wakaba 1.123 } elsif ($node->[1] & SCOPING_EL) {
7268 wakaba 1.79 !!!cp ('t379');
7269 wakaba 1.52 last INSCOPE;
7270     }
7271     } # INSCOPE
7272    
7273     $reconstruct_active_formatting_elements->($insert_to_current);
7274    
7275 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7276 wakaba 1.85
7277     ## TODO: associate with $self->{form_element} if defined
7278    
7279 wakaba 1.52 push @$active_formatting_elements, ['#marker', ''];
7280 wakaba 1.1
7281 wakaba 1.125 !!!nack ('t379.1');
7282 wakaba 1.52 !!!next-token;
7283 wakaba 1.126 next B;
7284 wakaba 1.103 } elsif ({
7285 wakaba 1.109 xmp => 1,
7286     iframe => 1,
7287     noembed => 1,
7288 wakaba 1.148 noframes => 1, ## NOTE: This is an "as if in head" code clone.
7289 wakaba 1.109 noscript => 0, ## TODO: 1 if scripting is enabled
7290 wakaba 1.103 }->{$token->{tag_name}}) {
7291 wakaba 1.109 if ($token->{tag_name} eq 'xmp') {
7292     !!!cp ('t381');
7293     $reconstruct_active_formatting_elements->($insert_to_current);
7294     } else {
7295     !!!cp ('t399');
7296     }
7297     ## NOTE: There is an "as if in body" code clone.
7298 wakaba 1.96 $parse_rcdata->(CDATA_CONTENT_MODEL);
7299 wakaba 1.126 next B;
7300 wakaba 1.52 } elsif ($token->{tag_name} eq 'isindex') {
7301 wakaba 1.113 !!!parse-error (type => 'isindex', token => $token);
7302 wakaba 1.52
7303     if (defined $self->{form_element}) {
7304 wakaba 1.79 !!!cp ('t389');
7305 wakaba 1.52 ## Ignore the token
7306 wakaba 1.125 !!!nack ('t389'); ## NOTE: Not acknowledged.
7307 wakaba 1.52 !!!next-token;
7308 wakaba 1.126 next B;
7309 wakaba 1.52 } else {
7310 wakaba 1.147 !!!ack ('t391.1');
7311    
7312 wakaba 1.52 my $at = $token->{attributes};
7313     my $form_attrs;
7314     $form_attrs->{action} = $at->{action} if $at->{action};
7315     my $prompt_attr = $at->{prompt};
7316     $at->{name} = {name => 'name', value => 'isindex'};
7317     delete $at->{action};
7318     delete $at->{prompt};
7319     my @tokens = (
7320 wakaba 1.55 {type => START_TAG_TOKEN, tag_name => 'form',
7321 wakaba 1.114 attributes => $form_attrs,
7322     line => $token->{line}, column => $token->{column}},
7323     {type => START_TAG_TOKEN, tag_name => 'hr',
7324     line => $token->{line}, column => $token->{column}},
7325     {type => START_TAG_TOKEN, tag_name => 'p',
7326     line => $token->{line}, column => $token->{column}},
7327     {type => START_TAG_TOKEN, tag_name => 'label',
7328     line => $token->{line}, column => $token->{column}},
7329 wakaba 1.52 );
7330     if ($prompt_attr) {
7331 wakaba 1.79 !!!cp ('t390');
7332 wakaba 1.114 push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value},
7333 wakaba 1.118 #line => $token->{line}, column => $token->{column},
7334     };
7335 wakaba 1.1 } else {
7336 wakaba 1.79 !!!cp ('t391');
7337 wakaba 1.55 push @tokens, {type => CHARACTER_TOKEN,
7338 wakaba 1.114 data => 'This is a searchable index. Insert your search keywords here: ',
7339 wakaba 1.118 #line => $token->{line}, column => $token->{column},
7340     }; # SHOULD
7341 wakaba 1.52 ## TODO: make this configurable
7342 wakaba 1.1 }
7343 wakaba 1.52 push @tokens,
7344 wakaba 1.114 {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at,
7345     line => $token->{line}, column => $token->{column}},
7346 wakaba 1.55 #{type => CHARACTER_TOKEN, data => ''}, # SHOULD
7347 wakaba 1.114 {type => END_TAG_TOKEN, tag_name => 'label',
7348     line => $token->{line}, column => $token->{column}},
7349     {type => END_TAG_TOKEN, tag_name => 'p',
7350     line => $token->{line}, column => $token->{column}},
7351     {type => START_TAG_TOKEN, tag_name => 'hr',
7352     line => $token->{line}, column => $token->{column}},
7353     {type => END_TAG_TOKEN, tag_name => 'form',
7354     line => $token->{line}, column => $token->{column}};
7355 wakaba 1.52 !!!back-token (@tokens);
7356 wakaba 1.125 !!!next-token;
7357 wakaba 1.126 next B;
7358 wakaba 1.52 }
7359     } elsif ($token->{tag_name} eq 'textarea') {
7360     my $tag_name = $token->{tag_name};
7361     my $el;
7362 wakaba 1.126 !!!create-element ($el, $HTML_NS, $token->{tag_name}, $token->{attributes}, $token);
7363 wakaba 1.52
7364     ## TODO: $self->{form_element} if defined
7365     $self->{content_model} = RCDATA_CONTENT_MODEL;
7366     delete $self->{escape}; # MUST
7367    
7368     $insert->($el);
7369    
7370     my $text = '';
7371 wakaba 1.125 !!!nack ('t392.1');
7372 wakaba 1.52 !!!next-token;
7373 wakaba 1.55 if ($token->{type} == CHARACTER_TOKEN) {
7374 wakaba 1.52 $token->{data} =~ s/^\x0A//;
7375 wakaba 1.51 unless (length $token->{data}) {
7376 wakaba 1.79 !!!cp ('t392');
7377 wakaba 1.51 !!!next-token;
7378 wakaba 1.79 } else {
7379     !!!cp ('t393');
7380 wakaba 1.51 }
7381 wakaba 1.79 } else {
7382     !!!cp ('t394');
7383 wakaba 1.51 }
7384 wakaba 1.55 while ($token->{type} == CHARACTER_TOKEN) {
7385 wakaba 1.79 !!!cp ('t395');
7386 wakaba 1.52 $text .= $token->{data};
7387     !!!next-token;
7388     }
7389     if (length $text) {
7390 wakaba 1.79 !!!cp ('t396');
7391 wakaba 1.52 $el->manakai_append_text ($text);
7392     }
7393    
7394     $self->{content_model} = PCDATA_CONTENT_MODEL;
7395 wakaba 1.51
7396 wakaba 1.55 if ($token->{type} == END_TAG_TOKEN and
7397 wakaba 1.52 $token->{tag_name} eq $tag_name) {
7398 wakaba 1.79 !!!cp ('t397');
7399 wakaba 1.52 ## Ignore the token
7400     } else {
7401 wakaba 1.79 !!!cp ('t398');
7402 wakaba 1.153 !!!parse-error (type => 'in RCDATA:#eof', token => $token);
7403 wakaba 1.51 }
7404 wakaba 1.52 !!!next-token;
7405 wakaba 1.126 next B;
7406 wakaba 1.151 } elsif ($token->{tag_name} eq 'rt' or
7407     $token->{tag_name} eq 'rp') {
7408     ## has a |ruby| element in scope
7409     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7410     my $node = $self->{open_elements}->[$_];
7411     if ($node->[1] & RUBY_EL) {
7412     !!!cp ('t398.1');
7413     ## generate implied end tags
7414     while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7415     !!!cp ('t398.2');
7416     pop @{$self->{open_elements}};
7417     }
7418     unless ($self->{open_elements}->[-1]->[1] & RUBY_EL) {
7419     !!!cp ('t398.3');
7420     !!!parse-error (type => 'not closed',
7421 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
7422 wakaba 1.151 ->manakai_local_name,
7423     token => $token);
7424     pop @{$self->{open_elements}}
7425     while not $self->{open_elements}->[-1]->[1] & RUBY_EL;
7426     }
7427     last INSCOPE;
7428     } elsif ($node->[1] & SCOPING_EL) {
7429     !!!cp ('t398.4');
7430     last INSCOPE;
7431     }
7432     } # INSCOPE
7433    
7434     !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7435    
7436     !!!nack ('t398.5');
7437     !!!next-token;
7438     redo B;
7439 wakaba 1.126 } elsif ($token->{tag_name} eq 'math' or
7440     $token->{tag_name} eq 'svg') {
7441     $reconstruct_active_formatting_elements->($insert_to_current);
7442 wakaba 1.131
7443 wakaba 1.155 ## "Adjust MathML attributes" ('math' only) - done in insert-element-f
7444    
7445 wakaba 1.131 ## "adjust SVG attributes" ('svg' only) - done in insert-element-f
7446    
7447     ## "adjust foreign attributes" - done in insert-element-f
7448 wakaba 1.126
7449 wakaba 1.131 !!!insert-element-f ($token->{tag_name} eq 'math' ? $MML_NS : $SVG_NS, $token->{tag_name}, $token->{attributes}, $token);
7450 wakaba 1.126
7451     if ($self->{self_closing}) {
7452     pop @{$self->{open_elements}};
7453     !!!ack ('t398.1');
7454     } else {
7455     !!!cp ('t398.2');
7456     $self->{insertion_mode} |= IN_FOREIGN_CONTENT_IM;
7457     ## NOTE: |<body><math><mi><svg>| -> "in foreign content" insertion
7458     ## mode, "in body" (not "in foreign content") secondary insertion
7459     ## mode, maybe.
7460     }
7461    
7462     !!!next-token;
7463     next B;
7464 wakaba 1.52 } elsif ({
7465     caption => 1, col => 1, colgroup => 1, frame => 1,
7466     frameset => 1, head => 1, option => 1, optgroup => 1,
7467     tbody => 1, td => 1, tfoot => 1, th => 1,
7468     thead => 1, tr => 1,
7469     }->{$token->{tag_name}}) {
7470 wakaba 1.79 !!!cp ('t401');
7471 wakaba 1.153 !!!parse-error (type => 'in body',
7472     text => $token->{tag_name}, token => $token);
7473 wakaba 1.52 ## Ignore the token
7474 wakaba 1.125 !!!nack ('t401.1'); ## NOTE: |<col/>| or |<frame/>| here is an error.
7475 wakaba 1.52 !!!next-token;
7476 wakaba 1.126 next B;
7477 wakaba 1.52
7478     ## ISSUE: An issue on HTML5 new elements in the spec.
7479     } else {
7480 wakaba 1.110 if ($token->{tag_name} eq 'image') {
7481     !!!cp ('t384');
7482 wakaba 1.113 !!!parse-error (type => 'image', token => $token);
7483 wakaba 1.110 $token->{tag_name} = 'img';
7484     } else {
7485     !!!cp ('t385');
7486     }
7487    
7488     ## NOTE: There is an "as if <br>" code clone.
7489 wakaba 1.52 $reconstruct_active_formatting_elements->($insert_to_current);
7490    
7491 wakaba 1.116 !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
7492 wakaba 1.109
7493 wakaba 1.110 if ({
7494     applet => 1, marquee => 1, object => 1,
7495     }->{$token->{tag_name}}) {
7496     !!!cp ('t380');
7497     push @$active_formatting_elements, ['#marker', ''];
7498 wakaba 1.125 !!!nack ('t380.1');
7499 wakaba 1.110 } elsif ({
7500     b => 1, big => 1, em => 1, font => 1, i => 1,
7501 wakaba 1.193 s => 1, small => 1, strike => 1,
7502 wakaba 1.110 strong => 1, tt => 1, u => 1,
7503     }->{$token->{tag_name}}) {
7504     !!!cp ('t375');
7505     push @$active_formatting_elements, $self->{open_elements}->[-1];
7506 wakaba 1.125 !!!nack ('t375.1');
7507 wakaba 1.110 } elsif ($token->{tag_name} eq 'input') {
7508     !!!cp ('t388');
7509     ## TODO: associate with $self->{form_element} if defined
7510     pop @{$self->{open_elements}};
7511 wakaba 1.125 !!!ack ('t388.2');
7512 wakaba 1.110 } elsif ({
7513     area => 1, basefont => 1, bgsound => 1, br => 1,
7514     embed => 1, img => 1, param => 1, spacer => 1, wbr => 1,
7515     #image => 1,
7516     }->{$token->{tag_name}}) {
7517     !!!cp ('t388.1');
7518     pop @{$self->{open_elements}};
7519 wakaba 1.125 !!!ack ('t388.3');
7520 wakaba 1.110 } elsif ($token->{tag_name} eq 'select') {
7521 wakaba 1.109 ## TODO: associate with $self->{form_element} if defined
7522    
7523     if ($self->{insertion_mode} & TABLE_IMS or
7524     $self->{insertion_mode} & BODY_TABLE_IMS or
7525     $self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
7526     !!!cp ('t400.1');
7527     $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM;
7528     } else {
7529     !!!cp ('t400.2');
7530     $self->{insertion_mode} = IN_SELECT_IM;
7531     }
7532 wakaba 1.125 !!!nack ('t400.3');
7533 wakaba 1.110 } else {
7534 wakaba 1.125 !!!nack ('t402');
7535 wakaba 1.109 }
7536 wakaba 1.51
7537 wakaba 1.52 !!!next-token;
7538 wakaba 1.126 next B;
7539 wakaba 1.52 }
7540 wakaba 1.55 } elsif ($token->{type} == END_TAG_TOKEN) {
7541 wakaba 1.52 if ($token->{tag_name} eq 'body') {
7542 wakaba 1.107 ## has a |body| element in scope
7543     my $i;
7544 wakaba 1.111 INSCOPE: {
7545     for (reverse @{$self->{open_elements}}) {
7546 wakaba 1.123 if ($_->[1] & BODY_EL) {
7547 wakaba 1.111 !!!cp ('t405');
7548     $i = $_;
7549     last INSCOPE;
7550 wakaba 1.123 } elsif ($_->[1] & SCOPING_EL) {
7551 wakaba 1.111 !!!cp ('t405.1');
7552     last;
7553     }
7554 wakaba 1.52 }
7555 wakaba 1.111
7556     !!!parse-error (type => 'start tag not allowed',
7557 wakaba 1.153 text => $token->{tag_name}, token => $token);
7558 wakaba 1.107 ## NOTE: Ignore the token.
7559 wakaba 1.52 !!!next-token;
7560 wakaba 1.126 next B;
7561 wakaba 1.111 } # INSCOPE
7562 wakaba 1.107
7563     for (@{$self->{open_elements}}) {
7564 wakaba 1.123 unless ($_->[1] & ALL_END_TAG_OPTIONAL_EL) {
7565 wakaba 1.107 !!!cp ('t403');
7566 wakaba 1.122 !!!parse-error (type => 'not closed',
7567 wakaba 1.153 text => $_->[0]->manakai_local_name,
7568 wakaba 1.122 token => $token);
7569 wakaba 1.107 last;
7570     } else {
7571     !!!cp ('t404');
7572     }
7573     }
7574    
7575     $self->{insertion_mode} = AFTER_BODY_IM;
7576     !!!next-token;
7577 wakaba 1.126 next B;
7578 wakaba 1.52 } elsif ($token->{tag_name} eq 'html') {
7579 wakaba 1.122 ## TODO: Update this code. It seems that the code below is not
7580     ## up-to-date, though it has same effect as speced.
7581 wakaba 1.123 if (@{$self->{open_elements}} > 1 and
7582     $self->{open_elements}->[1]->[1] & BODY_EL) {
7583 wakaba 1.52 ## ISSUE: There is an issue in the spec.
7584 wakaba 1.123 unless ($self->{open_elements}->[-1]->[1] & BODY_EL) {
7585 wakaba 1.79 !!!cp ('t406');
7586 wakaba 1.122 !!!parse-error (type => 'not closed',
7587 wakaba 1.153 text => $self->{open_elements}->[1]->[0]
7588 wakaba 1.122 ->manakai_local_name,
7589     token => $token);
7590 wakaba 1.79 } else {
7591     !!!cp ('t407');
7592 wakaba 1.1 }
7593 wakaba 1.54 $self->{insertion_mode} = AFTER_BODY_IM;
7594 wakaba 1.52 ## reprocess
7595 wakaba 1.126 next B;
7596 wakaba 1.51 } else {
7597 wakaba 1.79 !!!cp ('t408');
7598 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7599     text => $token->{tag_name}, token => $token);
7600 wakaba 1.52 ## Ignore the token
7601     !!!next-token;
7602 wakaba 1.126 next B;
7603 wakaba 1.51 }
7604 wakaba 1.52 } elsif ({
7605 wakaba 1.195 ## NOTE: End tags for non-phrasing flow content elements
7606    
7607     ## NOTE: The normal ones
7608     address => 1, article => 1, aside => 1, blockquote => 1,
7609     center => 1, datagrid => 1, details => 1, dialog => 1,
7610     dir => 1, div => 1, dl => 1, fieldset => 1, figure => 1,
7611     footer => 1, header => 1, listing => 1, menu => 1, nav => 1,
7612     ol => 1, pre => 1, section => 1, ul => 1,
7613    
7614     ## NOTE: As normal, but ... optional tags
7615 wakaba 1.52 dd => 1, dt => 1, li => 1,
7616 wakaba 1.195
7617 wakaba 1.103 applet => 1, button => 1, marquee => 1, object => 1,
7618 wakaba 1.52 }->{$token->{tag_name}}) {
7619     ## has an element in scope
7620     my $i;
7621     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7622     my $node = $self->{open_elements}->[$_];
7623 wakaba 1.123 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7624 wakaba 1.79 !!!cp ('t410');
7625 wakaba 1.52 $i = $_;
7626 wakaba 1.87 last INSCOPE;
7627 wakaba 1.123 } elsif ($node->[1] & SCOPING_EL) {
7628 wakaba 1.79 !!!cp ('t411');
7629 wakaba 1.52 last INSCOPE;
7630 wakaba 1.51 }
7631 wakaba 1.52 } # INSCOPE
7632 wakaba 1.89
7633     unless (defined $i) { # has an element in scope
7634     !!!cp ('t413');
7635 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7636     text => $token->{tag_name}, token => $token);
7637 wakaba 1.157 ## NOTE: Ignore the token.
7638 wakaba 1.89 } else {
7639     ## Step 1. generate implied end tags
7640     while ({
7641 wakaba 1.151 ## END_TAG_OPTIONAL_EL
7642 wakaba 1.89 dd => ($token->{tag_name} ne 'dd'),
7643     dt => ($token->{tag_name} ne 'dt'),
7644     li => ($token->{tag_name} ne 'li'),
7645 wakaba 1.194 option => 1,
7646     optgroup => 1,
7647 wakaba 1.89 p => 1,
7648 wakaba 1.151 rt => 1,
7649     rp => 1,
7650 wakaba 1.123 }->{$self->{open_elements}->[-1]->[0]->manakai_local_name}) {
7651 wakaba 1.89 !!!cp ('t409');
7652     pop @{$self->{open_elements}};
7653     }
7654    
7655     ## Step 2.
7656 wakaba 1.123 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7657     ne $token->{tag_name}) {
7658 wakaba 1.79 !!!cp ('t412');
7659 wakaba 1.122 !!!parse-error (type => 'not closed',
7660 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
7661 wakaba 1.122 ->manakai_local_name,
7662     token => $token);
7663 wakaba 1.51 } else {
7664 wakaba 1.89 !!!cp ('t414');
7665 wakaba 1.51 }
7666 wakaba 1.89
7667     ## Step 3.
7668 wakaba 1.52 splice @{$self->{open_elements}}, $i;
7669 wakaba 1.89
7670     ## Step 4.
7671     $clear_up_to_marker->()
7672     if {
7673 wakaba 1.103 applet => 1, button => 1, marquee => 1, object => 1,
7674 wakaba 1.89 }->{$token->{tag_name}};
7675 wakaba 1.51 }
7676 wakaba 1.52 !!!next-token;
7677 wakaba 1.126 next B;
7678 wakaba 1.52 } elsif ($token->{tag_name} eq 'form') {
7679 wakaba 1.195 ## NOTE: As normal, but interacts with the form element pointer
7680    
7681 wakaba 1.92 undef $self->{form_element};
7682    
7683 wakaba 1.52 ## has an element in scope
7684 wakaba 1.92 my $i;
7685 wakaba 1.52 INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7686     my $node = $self->{open_elements}->[$_];
7687 wakaba 1.123 if ($node->[1] & FORM_EL) {
7688 wakaba 1.79 !!!cp ('t418');
7689 wakaba 1.92 $i = $_;
7690 wakaba 1.52 last INSCOPE;
7691 wakaba 1.123 } elsif ($node->[1] & SCOPING_EL) {
7692 wakaba 1.79 !!!cp ('t419');
7693 wakaba 1.52 last INSCOPE;
7694     }
7695     } # INSCOPE
7696 wakaba 1.92
7697     unless (defined $i) { # has an element in scope
7698 wakaba 1.79 !!!cp ('t421');
7699 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7700     text => $token->{tag_name}, token => $token);
7701 wakaba 1.157 ## NOTE: Ignore the token.
7702 wakaba 1.92 } else {
7703     ## Step 1. generate implied end tags
7704 wakaba 1.123 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7705 wakaba 1.92 !!!cp ('t417');
7706     pop @{$self->{open_elements}};
7707     }
7708    
7709     ## Step 2.
7710 wakaba 1.123 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7711     ne $token->{tag_name}) {
7712 wakaba 1.92 !!!cp ('t417.1');
7713 wakaba 1.122 !!!parse-error (type => 'not closed',
7714 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
7715 wakaba 1.122 ->manakai_local_name,
7716     token => $token);
7717 wakaba 1.92 } else {
7718     !!!cp ('t420');
7719     }
7720    
7721     ## Step 3.
7722     splice @{$self->{open_elements}}, $i;
7723 wakaba 1.52 }
7724    
7725     !!!next-token;
7726 wakaba 1.126 next B;
7727 wakaba 1.52 } elsif ({
7728 wakaba 1.195 ## NOTE: As normal, except acts as a closer for any ...
7729 wakaba 1.52 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
7730     }->{$token->{tag_name}}) {
7731     ## has an element in scope
7732     my $i;
7733     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7734     my $node = $self->{open_elements}->[$_];
7735 wakaba 1.123 if ($node->[1] & HEADING_EL) {
7736 wakaba 1.79 !!!cp ('t423');
7737 wakaba 1.52 $i = $_;
7738     last INSCOPE;
7739 wakaba 1.123 } elsif ($node->[1] & SCOPING_EL) {
7740 wakaba 1.79 !!!cp ('t424');
7741 wakaba 1.52 last INSCOPE;
7742 wakaba 1.51 }
7743 wakaba 1.52 } # INSCOPE
7744 wakaba 1.93
7745     unless (defined $i) { # has an element in scope
7746     !!!cp ('t425.1');
7747 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7748     text => $token->{tag_name}, token => $token);
7749 wakaba 1.157 ## NOTE: Ignore the token.
7750 wakaba 1.79 } else {
7751 wakaba 1.93 ## Step 1. generate implied end tags
7752 wakaba 1.123 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7753 wakaba 1.93 !!!cp ('t422');
7754     pop @{$self->{open_elements}};
7755     }
7756    
7757     ## Step 2.
7758 wakaba 1.123 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7759     ne $token->{tag_name}) {
7760 wakaba 1.93 !!!cp ('t425');
7761 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7762     text => $token->{tag_name}, token => $token);
7763 wakaba 1.93 } else {
7764     !!!cp ('t426');
7765     }
7766    
7767     ## Step 3.
7768     splice @{$self->{open_elements}}, $i;
7769 wakaba 1.36 }
7770 wakaba 1.52
7771     !!!next-token;
7772 wakaba 1.126 next B;
7773 wakaba 1.87 } elsif ($token->{tag_name} eq 'p') {
7774 wakaba 1.195 ## NOTE: As normal, except </p> implies <p> and ...
7775    
7776 wakaba 1.87 ## has an element in scope
7777     my $i;
7778     INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
7779     my $node = $self->{open_elements}->[$_];
7780 wakaba 1.123 if ($node->[1] & P_EL) {
7781 wakaba 1.87 !!!cp ('t410.1');
7782     $i = $_;
7783 wakaba 1.88 last INSCOPE;
7784 wakaba 1.123 } elsif ($node->[1] & SCOPING_EL) {
7785 wakaba 1.87 !!!cp ('t411.1');
7786     last INSCOPE;
7787     }
7788     } # INSCOPE
7789 wakaba 1.91
7790     if (defined $i) {
7791 wakaba 1.123 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7792     ne $token->{tag_name}) {
7793 wakaba 1.87 !!!cp ('t412.1');
7794 wakaba 1.122 !!!parse-error (type => 'not closed',
7795 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
7796 wakaba 1.122 ->manakai_local_name,
7797     token => $token);
7798 wakaba 1.87 } else {
7799 wakaba 1.91 !!!cp ('t414.1');
7800 wakaba 1.87 }
7801 wakaba 1.91
7802 wakaba 1.87 splice @{$self->{open_elements}}, $i;
7803     } else {
7804 wakaba 1.91 !!!cp ('t413.1');
7805 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7806     text => $token->{tag_name}, token => $token);
7807 wakaba 1.91
7808 wakaba 1.87 !!!cp ('t415.1');
7809     ## As if <p>, then reprocess the current token
7810     my $el;
7811 wakaba 1.126 !!!create-element ($el, $HTML_NS, 'p',, $token);
7812 wakaba 1.87 $insert->($el);
7813 wakaba 1.91 ## NOTE: Not inserted into |$self->{open_elements}|.
7814 wakaba 1.87 }
7815 wakaba 1.91
7816 wakaba 1.87 !!!next-token;
7817 wakaba 1.126 next B;
7818 wakaba 1.52 } elsif ({
7819     a => 1,
7820     b => 1, big => 1, em => 1, font => 1, i => 1,
7821 wakaba 1.193 nobr => 1, s => 1, small => 1, strike => 1,
7822 wakaba 1.52 strong => 1, tt => 1, u => 1,
7823     }->{$token->{tag_name}}) {
7824 wakaba 1.79 !!!cp ('t427');
7825 wakaba 1.113 $formatting_end_tag->($token);
7826 wakaba 1.126 next B;
7827 wakaba 1.52 } elsif ($token->{tag_name} eq 'br') {
7828 wakaba 1.79 !!!cp ('t428');
7829 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7830     text => 'br', token => $token);
7831 wakaba 1.52
7832     ## As if <br>
7833     $reconstruct_active_formatting_elements->($insert_to_current);
7834    
7835     my $el;
7836 wakaba 1.126 !!!create-element ($el, $HTML_NS, 'br',, $token);
7837 wakaba 1.52 $insert->($el);
7838    
7839     ## Ignore the token.
7840     !!!next-token;
7841 wakaba 1.126 next B;
7842 wakaba 1.52 } elsif ({
7843     caption => 1, col => 1, colgroup => 1, frame => 1,
7844     frameset => 1, head => 1, option => 1, optgroup => 1,
7845     tbody => 1, td => 1, tfoot => 1, th => 1,
7846     thead => 1, tr => 1,
7847     area => 1, basefont => 1, bgsound => 1,
7848     embed => 1, hr => 1, iframe => 1, image => 1,
7849     img => 1, input => 1, isindex => 1, noembed => 1,
7850     noframes => 1, param => 1, select => 1, spacer => 1,
7851     table => 1, textarea => 1, wbr => 1,
7852     noscript => 0, ## TODO: if scripting is enabled
7853     }->{$token->{tag_name}}) {
7854 wakaba 1.79 !!!cp ('t429');
7855 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7856     text => $token->{tag_name}, token => $token);
7857 wakaba 1.52 ## Ignore the token
7858     !!!next-token;
7859 wakaba 1.126 next B;
7860 wakaba 1.52 } else {
7861 wakaba 1.195 if ($token->{tag_name} eq 'sarcasm') {
7862     sleep 0.001; # take a deep breath
7863     }
7864    
7865 wakaba 1.52 ## Step 1
7866     my $node_i = -1;
7867     my $node = $self->{open_elements}->[$node_i];
7868 wakaba 1.51
7869 wakaba 1.52 ## Step 2
7870     S2: {
7871 wakaba 1.123 if ($node->[0]->manakai_local_name eq $token->{tag_name}) {
7872 wakaba 1.52 ## Step 1
7873     ## generate implied end tags
7874 wakaba 1.123 while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) {
7875 wakaba 1.79 !!!cp ('t430');
7876 wakaba 1.151 ## NOTE: |<ruby><rt></ruby>|.
7877     ## ISSUE: <ruby><rt></rt> will also take this code path,
7878     ## which seems wrong.
7879 wakaba 1.86 pop @{$self->{open_elements}};
7880 wakaba 1.151 $node_i++;
7881 wakaba 1.52 }
7882    
7883     ## Step 2
7884 wakaba 1.123 if ($self->{open_elements}->[-1]->[0]->manakai_local_name
7885     ne $token->{tag_name}) {
7886 wakaba 1.79 !!!cp ('t431');
7887 wakaba 1.58 ## NOTE: <x><y></x>
7888 wakaba 1.122 !!!parse-error (type => 'not closed',
7889 wakaba 1.153 text => $self->{open_elements}->[-1]->[0]
7890 wakaba 1.122 ->manakai_local_name,
7891     token => $token);
7892 wakaba 1.79 } else {
7893     !!!cp ('t432');
7894 wakaba 1.52 }
7895    
7896     ## Step 3
7897 wakaba 1.151 splice @{$self->{open_elements}}, $node_i if $node_i < 0;
7898 wakaba 1.51
7899 wakaba 1.1 !!!next-token;
7900 wakaba 1.52 last S2;
7901 wakaba 1.1 } else {
7902 wakaba 1.52 ## Step 3
7903 wakaba 1.123 if (not ($node->[1] & FORMATTING_EL) and
7904 wakaba 1.52 #not $phrasing_category->{$node->[1]} and
7905 wakaba 1.123 ($node->[1] & SPECIAL_EL or
7906     $node->[1] & SCOPING_EL)) {
7907 wakaba 1.79 !!!cp ('t433');
7908 wakaba 1.153 !!!parse-error (type => 'unmatched end tag',
7909     text => $token->{tag_name}, token => $token);
7910 wakaba 1.52 ## Ignore the token
7911     !!!next-token;
7912     last S2;
7913 wakaba 1.193
7914     ## NOTE: |<span><dd></span>a|: In Safari 3.1.2 and Opera
7915     ## 9.27, "a" is a child of <dd> (conforming). In
7916     ## Firefox 3.0.2, "a" is a child of <body>. In WinIE 7,
7917     ## "a" is a child of both <body> and <dd>.
7918 wakaba 1.52 }
7919 wakaba 1.193
7920 wakaba 1.79 !!!cp ('t434');
7921 wakaba 1.1 }
7922 wakaba 1.52
7923     ## Step 4
7924     $node_i--;
7925     $node = $self->{open_elements}->[$node_i];
7926    
7927     ## Step 5;
7928     redo S2;
7929     } # S2
7930 wakaba 1.126 next B;
7931 wakaba 1.1 }
7932     }
7933 wakaba 1.126 next B;
7934     } continue { # B
7935     if ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM) {
7936     ## NOTE: The code below is executed in cases where it does not have
7937     ## to be, but it is harmless even in those cases.
7938     ## has an element in scope
7939     INSCOPE: {
7940     for (reverse 0..$#{$self->{open_elements}}) {
7941     my $node = $self->{open_elements}->[$_];
7942     if ($node->[1] & FOREIGN_EL) {
7943     last INSCOPE;
7944     } elsif ($node->[1] & SCOPING_EL) {
7945     last;
7946     }
7947     }
7948    
7949     ## NOTE: No foreign element in scope.
7950     $self->{insertion_mode} &= ~ IN_FOREIGN_CONTENT_IM;
7951     } # INSCOPE
7952     }
7953 wakaba 1.1 } # B
7954    
7955     ## Stop parsing # MUST
7956    
7957     ## TODO: script stuffs
7958 wakaba 1.3 } # _tree_construct_main
7959    
## Whatpm::HTML->set_inner_html ($node, $string, $onerror, $get_wrapper)
##
## Replaces the children of |$node| with the result of parsing the
## string argument with this HTML parser.
##
## Arguments (as seen after the invocant and |$node| are shifted off):
##   $_[0] - The string to parse.  It is never copied into a lexical;
##           it is used as |$_[0]| or by reference.
##   $_[1] - Optional error handler.  The document branch forwards it
##           unchanged; the element branch falls back to a handler
##           that |warn|s when it is false.
##   $_[2] - Optional code reference.  The element branch calls it with
##           the input handle and reads from whatever it returns; when
##           false it defaults to a sub returning its argument.
##
## Dispatches on |$node->node_type|: 9 is the whole-document branch
## (children removed, then |parse_char_string| is called on the node),
## 1 is the element branch (parse into a scratch document, then move
## the result under |$node|); any other value makes the method |die|.
## There is no explicit |return| statement.
7960 wakaba 1.177 sub set_inner_html ($$$$;$) {
7961 wakaba 1.3 my $class = shift;
7962     my $node = shift;
7963 wakaba 1.177 #my $s = \$_[0];
7964 wakaba 1.3 my $onerror = $_[1];
7965 wakaba 1.162 my $get_wrapper = $_[2] || sub ($) { return $_[0] };
7966 wakaba 1.3
7967 wakaba 1.63 ## ISSUE: Should {confident} be true?
7968    
7969 wakaba 1.3 my $nt = $node->node_type;
7970     if ($nt == 9) {
7971     # MUST
7972    
7973     ## Step 1 # MUST
7974     ## TODO: If the document has an active parser, ...
7975     ## ISSUE: There is an issue in the spec.
7976    
7977     ## Step 2 # MUST
## Iterate over a copy of the child list, since the loop removes
## every node it visits.
7978     my @cn = @{$node->child_nodes};
7979     for (@cn) {
7980     $node->remove_child ($_);
7981     }
7982    
7983     ## Step 3, 4, 5 # MUST
7984 wakaba 1.177 $class->parse_char_string ($_[0] => $node, $onerror, $get_wrapper);
7985 wakaba 1.3 } elsif ($nt == 1) {
7986     ## TODO: If non-html element
7987    
7988     ## NOTE: Most of this code is copied from |parse_string|
7989    
7990 wakaba 1.162 ## TODO: Support for $get_wrapper
## NOTE(review): |$get_wrapper| is in fact applied to |$input| a few
## lines below; this TODO may be stale or may refer to something
## else - confirm.
7991    
7992 wakaba 1.3 ## Step 1 # MUST
## A scratch document, created through the owner document's
## implementation and flagged as HTML, receives the parse result.
7993 wakaba 1.14 my $this_doc = $node->owner_document;
7994     my $doc = $this_doc->implementation->create_document;
7995 wakaba 1.18 $doc->manakai_is_html (1);
7996 wakaba 1.3 my $p = $class->new;
7997     $p->{document} = $doc;
7998    
7999 wakaba 1.84 ## Step 8 # MUST
## NOTE(review): |$i| is not referenced again in the visible text of
## this sub (macro expansions are not visible here) - possibly unused.
8000 wakaba 1.3 my $i = 0;
8001 wakaba 1.121 $p->{line_prev} = $p->{line} = 1;
8002     $p->{column_prev} = $p->{column} = 0;
8003 wakaba 1.177 require Whatpm::Charset::DecodeHandle;
8004     my $input = Whatpm::Charset::DecodeHandle::CharString->new (\($_[0]));
8005     $input = $get_wrapper->($input);
## |set_nc| stores the code point of the next input character in
## |$self->{nc}| (-1 once |$input->read| yields nothing) and keeps the
## line/column counters up to date.
## NOTE(review): |$self| is whatever the caller passes as the first
## argument - presumably |$p| itself, since the counters are updated
## through both names; confirm at the call sites.
8006 wakaba 1.183 $p->{set_nc} = sub {
8007 wakaba 1.3 my $self = shift;
8008 wakaba 1.14
8009 wakaba 1.178 my $char = '';
## A character held back by the CR handling below is consumed first.
8010 wakaba 1.183 if (defined $self->{next_nc}) {
8011     $char = $self->{next_nc};
8012     delete $self->{next_nc};
8013     $self->{nc} = ord $char;
8014 wakaba 1.177 } else {
## Start a fresh buffer and ask the input handle for characters that
## are none of NUL, LF and CR.
## NOTE(review): |manakai_read_until| is defined outside this file
## section; presumably it appends such a run to the buffer and returns
## its length - confirm.
8015 wakaba 1.180 $self->{char_buffer} = '';
8016     $self->{char_buffer_pos} = 0;
8017    
8018     my $count = $input->manakai_read_until
8019 wakaba 1.182 ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/,
8020     $self->{char_buffer_pos});
## Buffered path: take the first buffered character and return right
## away, skipping the LF/CR/NUL handling further down.
8021 wakaba 1.180 if ($count) {
8022     $self->{line_prev} = $self->{line};
8023     $self->{column_prev} = $self->{column};
8024     $self->{column}++;
8025 wakaba 1.183 $self->{nc}
8026 wakaba 1.180 = ord substr ($self->{char_buffer},
8027     $self->{char_buffer_pos}++, 1);
8028     return;
8029     }
8030    
## Nothing was buffered: read exactly one character, or report end of
## input as -1.
8031 wakaba 1.178 if ($input->read ($char, 1)) {
8032 wakaba 1.183 $self->{nc} = ord $char;
8033 wakaba 1.178 } else {
8034 wakaba 1.183 $self->{nc} = -1;
8035 wakaba 1.178 return;
8036     }
8037 wakaba 1.177 }
8038 wakaba 1.121
8039     ($p->{line_prev}, $p->{column_prev}) = ($p->{line}, $p->{column});
8040     $p->{column}++;
8041 wakaba 1.4
8042 wakaba 1.183 if ($self->{nc} == 0x000A) { # LF
8043 wakaba 1.121 $p->{line}++;
8044     $p->{column} = 0;
8045 wakaba 1.79 !!!cp ('i1');
8046 wakaba 1.183 } elsif ($self->{nc} == 0x000D) { # CR
## A CR is reported as LF.  One more character is read as lookahead:
## an LF is dropped (so CR LF yields a single LF), anything else is
## saved in |next_nc| for the next call.
8047 wakaba 1.177 ## TODO: support for abort/streaming
8048 wakaba 1.178 my $next = '';
8049     if ($input->read ($next, 1) and $next ne "\x0A") {
8050 wakaba 1.183 $self->{next_nc} = $next;
8051 wakaba 1.177 }
8052 wakaba 1.183 $self->{nc} = 0x000A; # LF # MUST
8053 wakaba 1.121 $p->{line}++;
8054     $p->{column} = 0;
8055 wakaba 1.79 !!!cp ('i2');
8056 wakaba 1.183 } elsif ($self->{nc} == 0x0000) { # NULL
8057 wakaba 1.79 !!!cp ('i4');
8058 wakaba 1.14 !!!parse-error (type => 'NULL');
8059 wakaba 1.183 $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
8060 wakaba 1.3 }
8061     };
8062 wakaba 1.171
## |read_until| ($scalar, $specials_range, $offset) copies a run of
## characters that are neither in the character-class body given as
## the second argument nor NUL, LF or CR into the first argument, and
## returns the number of characters copied (0 when there are none).
8063 wakaba 1.172 $p->{read_until} = sub {
8064 wakaba 1.177 #my ($scalar, $specials_range, $offset) = @_;
## A held-back character has to be delivered by |set_nc| first.
8065 wakaba 1.183 return 0 if defined $p->{next_nc};
8066 wakaba 1.180
8067 wakaba 1.182 my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
8068 wakaba 1.180 my $offset = $_[2] || 0;
8069    
## Characters still unread in the buffer filled by |set_nc| are served
## before the input handle is touched again.
8070     if ($p->{char_buffer_pos} < length $p->{char_buffer}) {
8071     pos ($p->{char_buffer}) = $p->{char_buffer_pos};
8072     if ($p->{char_buffer} =~ /\G(?>$pattern)+/) {
8073     substr ($_[0], $offset)
8074     = substr ($p->{char_buffer}, $-[0], $+[0] - $-[0]);
8075     my $count = $+[0] - $-[0];
8076     if ($count) {
8077     $p->{column} += $count;
8078     $p->{char_buffer_pos} += $count;
8079     $p->{line_prev} = $p->{line};
8080     $p->{column_prev} = $p->{column} - 1;
8081 wakaba 1.183 $p->{nc} = -1;
8082 wakaba 1.180 }
8083     return $count;
8084     } else {
8085     return 0;
8086     }
8087     } else {
## NOTE(review): this branch passes |$_[2]| where the buffered branch
## uses |$offset| (|$_[2] || 0|), and it advances |column_prev| by
## |$count| instead of setting it to |column - 1| - confirm that both
## differences are intended.
8088     my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
8089     if ($count) {
8090     $p->{column} += $count;
8091     $p->{column_prev} += $count;
8092 wakaba 1.183 $p->{nc} = -1;
8093 wakaba 1.180 }
8094     return $count;
8095 wakaba 1.177 }
8096     }; # $p->{read_until}
8097 wakaba 1.171
## Fallback handler used when no |$onerror| was given: it warns with
## the token's position when the token carries a line, otherwise with
## the |line|/|column| values passed in.
8098 wakaba 1.3 my $ponerror = $onerror || sub {
8099     my (%opt) = @_;
8100 wakaba 1.121 my $line = $opt{line};
8101     my $column = $opt{column};
8102     if (defined $opt{token} and defined $opt{token}->{line}) {
8103     $line = $opt{token}->{line};
8104     $column = $opt{token}->{column};
8105     }
8106     warn "Parse error ($opt{type}) at line $line column $column\n";
8107 wakaba 1.3 };
## The parser's error hook supplies the current position ahead of the
## caller's own key/value pairs.
8108     $p->{parse_error} = sub {
8109 wakaba 1.121 $ponerror->(line => $p->{line}, column => $p->{column}, @_);
8110 wakaba 1.3 };
8111    
## Handler installed on the input handle: it ignores its first
## argument, takes the second as the error type and forwards the error
## tagged |layer => 'encode'|, one column past the current one.
8112 wakaba 1.178 my $char_onerror = sub {
8113     my (undef, $type, %opt) = @_;
8114     $ponerror->(layer => 'encode',
8115     line => $p->{line}, column => $p->{column} + 1,
8116     %opt, type => $type);
8117     }; # $char_onerror
8118     $input->onerror ($char_onerror);
8119    
8120 wakaba 1.3 $p->_initialize_tokenizer;
8121     $p->_initialize_tree_constructor;
8122    
8123     ## Step 2
## The tokenizer's content model is looked up by the context element's
## local name; names not listed in the table get PCDATA.
8124 wakaba 1.71 my $node_ln = $node->manakai_local_name;
8125 wakaba 1.40 $p->{content_model} = {
8126     title => RCDATA_CONTENT_MODEL,
8127     textarea => RCDATA_CONTENT_MODEL,
8128     style => CDATA_CONTENT_MODEL,
8129     script => CDATA_CONTENT_MODEL,
8130     xmp => CDATA_CONTENT_MODEL,
8131     iframe => CDATA_CONTENT_MODEL,
8132     noembed => CDATA_CONTENT_MODEL,
8133     noframes => CDATA_CONTENT_MODEL,
8134     noscript => CDATA_CONTENT_MODEL,
8135     plaintext => PLAINTEXT_CONTENT_MODEL,
8136     }->{$node_ln};
8137     $p->{content_model} = PCDATA_CONTENT_MODEL
8138     unless defined $p->{content_model};
8139     ## ISSUE: What is "the name of the element"? local name?
8140 wakaba 1.3
## The context node is recorded together with its entry in
## |$el_category|, a table defined outside this sub.
8141 wakaba 1.123 $p->{inner_html_node} = [$node, $el_category->{$node_ln}];
8142     ## TODO: Foreign element OK?
8143 wakaba 1.3
8144 wakaba 1.84 ## Step 3
8145 wakaba 1.3 my $root = $doc->create_element_ns
8146     ('http://www.w3.org/1999/xhtml', [undef, 'html']);
8147    
8148 wakaba 1.84 ## Step 4 # MUST
8149 wakaba 1.3 $doc->append_child ($root);
8150    
8151 wakaba 1.84 ## Step 5 # MUST
8152 wakaba 1.123 push @{$p->{open_elements}}, [$root, $el_category->{html}];
8153 wakaba 1.3
8154     undef $p->{head_element};
8155    
8156 wakaba 1.84 ## Step 6 # MUST
8157 wakaba 1.3 $p->_reset_insertion_mode;
8158    
8159 wakaba 1.84 ## Step 7 # MUST
## Walk from the context node up through |parent_node| and store the
## nearest element in the XHTML namespace whose local name is |form|
## in |$p->{form_element}|.
8160 wakaba 1.3 my $anode = $node;
8161     AN: while (defined $anode) {
8162     if ($anode->node_type == 1) {
8163     my $nsuri = $anode->namespace_uri;
8164     if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
8165 wakaba 1.71 if ($anode->manakai_local_name eq 'form') {
8166 wakaba 1.79 !!!cp ('i5');
8167 wakaba 1.3 $p->{form_element} = $anode;
8168     last AN;
8169     }
8170     }
8171     }
8172     $anode = $anode->parent_node;
8173     } # AN
8174    
8175 wakaba 1.84 ## Step 9 # MUST
## NOTE(review): |$self| is aliased to |$p| presumably because the
## |!!!next-token| macro expands to code that refers to |$self| -
## confirm against the macro definition.
8176 wakaba 1.3 {
8177     my $self = $p;
8178     !!!next-token;
8179     }
8180     $p->_tree_construction_main;
8181    
8182 wakaba 1.84 ## Step 10 # MUST
8183 wakaba 1.3 my @cn = @{$node->child_nodes};
8184     for (@cn) {
8185     $node->remove_child ($_);
8186     }
8187     ## ISSUE: mutation events? read-only?
8188    
8189 wakaba 1.84 ## Step 11 # MUST
## Each child of the scratch root is adopted into the context node's
## owner document and then appended to the context node.
8190 wakaba 1.3 @cn = @{$root->child_nodes};
8191     for (@cn) {
8192 wakaba 1.14 $this_doc->adopt_node ($_);
8193 wakaba 1.3 $node->append_child ($_);
8194     }
8195 wakaba 1.14 ## ISSUE: mutation events?
8196 wakaba 1.3
8197     $p->_terminate_tree_constructor;
8198 wakaba 1.121
## NOTE(review): the |set_nc| and |read_until| closures stored on |$p|
## also refer to |$p|; only |parse_error| is removed here.  Whether
## something else clears the other two is not visible in this sub -
## confirm.
8199     delete $p->{parse_error}; # delete loop
8200 wakaba 1.3 } else {
8201     die "$0: |set_inner_html| is not defined for node of type $nt";
8202     }
8203     } # set_inner_html
8204    
8205     } # tree construction stage
8206 wakaba 1.1
## Whatpm::HTML::RestartParser: a package with no methods of its own
## that inherits from |Error|, the class loaded near the top of this
## file with |use Error qw(:try)|.
## NOTE(review): presumably thrown to make the parser start over; the
## code that throws or catches it is not in this part of the file -
## confirm.
8207 wakaba 1.63 package Whatpm::HTML::RestartParser;
8208     push our @ISA, 'Error';
8209    
## True value returned to |require|/|use| when the module is loaded.
8210 wakaba 1.1 1;
8211 wakaba 1.196 # $Date: 2008/10/04 06:30:34 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24